forked from mirrors/kingfisher
- Improved error message when self-update cannot find the current binary
- Optimized memory usage via string interning and extensive data sharing - Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance - Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets - Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB - Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files - Increased max-file-size default to 64 MB (up from 25 MB)
This commit is contained in:
parent
52b2c02ee9
commit
8d15c8eabf
7 changed files with 172 additions and 74 deletions
|
|
@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
|
|||
## [Unreleased]
|
||||
- Improved error message when self-update cannot find the current binary
|
||||
- Optimized memory usage via string interning and extensive data sharing
|
||||
- Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance
|
||||
- Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets
|
||||
- Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB
|
||||
- Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files
|
||||
- Increased max-file-size default to 64 MB (up from 25 MB)
|
||||
|
||||
## [1.47.0]
|
||||
- MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them
|
||||
|
|
|
|||
|
|
@ -65,7 +65,9 @@ smallvec = { version = "1", features = [
|
|||
tracing = "0.1.41"
|
||||
indicatif = { version = "0.17", features = ["improved_unicode"] }
|
||||
rayon = "1.10"
|
||||
sha1 = { version = "0.10.6", features = ["asm"] }
|
||||
# sha1 = { version = "0.10.6", features = ["asm"] }
|
||||
|
||||
sha1 = "0.10.6"
|
||||
hex = "0.4.3"
|
||||
vectorscan-rs = "0.0.5"
|
||||
regex = "1.11.1"
|
||||
|
|
|
|||
|
|
@ -294,9 +294,15 @@ impl BlobId {
|
|||
/// Create a new BlobId computed from the given input.
|
||||
#[inline]
|
||||
pub fn new(input: &[u8]) -> Self {
|
||||
const CHUNK: usize = 64 * 1024; // 64KB from start and end
|
||||
let mut hasher = Sha1::new();
|
||||
write!(&mut hasher, "blob {}\0", input.len()).unwrap();
|
||||
hasher.update(input);
|
||||
if input.len() <= CHUNK * 2 {
|
||||
hasher.update(input);
|
||||
} else {
|
||||
hasher.update(&input[..CHUNK]);
|
||||
hasher.update(&input[input.len() - CHUNK..]);
|
||||
}
|
||||
BlobId(hasher.finalize().as_slice().try_into().expect("SHA-1 output size mismatch"))
|
||||
}
|
||||
|
||||
|
|
@ -406,5 +412,4 @@ impl BlobMetadata {
|
|||
pub fn mime_essence(&self) -> Option<&str> {
|
||||
self.mime_essence.as_deref()
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -169,7 +169,7 @@ pub struct InputSpecifierArgs {
|
|||
#[derive(Args, Debug, Clone)]
|
||||
pub struct ContentFilteringArgs {
|
||||
/// Ignore files larger than the given size in MB
|
||||
#[arg(long("max-file-size"), default_value_t = 25.0)]
|
||||
#[arg(long("max-file-size"), default_value_t = 64.0)]
|
||||
pub max_file_size_mb: f64,
|
||||
|
||||
// /// Use custom path-based ignore rules from the given file(s)
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
use core::ops::Range;
|
||||
use std::cell::RefCell;
|
||||
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
|
@ -87,24 +88,35 @@ impl std::fmt::Display for SourceSpan {
|
|||
}
|
||||
|
||||
/// Records newline byte-offsets so that byte offsets can be mapped to (line, column).
|
||||
pub struct LocationMapping {
|
||||
newline_offsets: Vec<usize>,
|
||||
pub struct LocationMapping<'a> {
|
||||
bytes: &'a [u8],
|
||||
newline_offsets: RefCell<Vec<usize>>,
|
||||
}
|
||||
|
||||
impl LocationMapping {
|
||||
/// Scan once for all `\n` positions.
|
||||
pub fn new(input: &[u8]) -> Self {
|
||||
let newline_offsets = memchr::memchr_iter(b'\n', input).collect();
|
||||
LocationMapping { newline_offsets }
|
||||
impl<'a> LocationMapping<'a> {
|
||||
/// Create a new mapping without pre-scanning the entire input.
|
||||
pub fn new(input: &'a [u8]) -> Self {
|
||||
LocationMapping { bytes: input, newline_offsets: RefCell::new(Vec::new()) }
|
||||
}
|
||||
|
||||
/// Map a byte offset to a `SourcePoint`.
|
||||
pub fn get_source_point(&self, offset: usize) -> SourcePoint {
|
||||
let line = match self.newline_offsets.binary_search(&offset) {
|
||||
Ok(idx) => idx + 2, // exact newline -- next line
|
||||
fn ensure_offsets_up_to(&self, offset: usize) {
|
||||
let mut offsets = self.newline_offsets.borrow_mut();
|
||||
let start = offsets.last().map_or(0, |&last| last + 1);
|
||||
if offset < start {
|
||||
return;
|
||||
}
|
||||
let end = offset.min(self.bytes.len());
|
||||
for nl in memchr::memchr_iter(b'\n', &self.bytes[start..end]) {
|
||||
offsets.push(start + nl);
|
||||
}
|
||||
}
|
||||
|
||||
fn source_point_from_offsets(offsets: &[usize], offset: usize) -> SourcePoint {
|
||||
let line = match offsets.binary_search(&offset) {
|
||||
Ok(idx) => idx + 2,
|
||||
Err(idx) => idx + 1,
|
||||
};
|
||||
let column = if let Some(&last) = self.newline_offsets.get(line.saturating_sub(2)) {
|
||||
let column = if let Some(&last) = offsets.get(line.saturating_sub(2)) {
|
||||
offset.saturating_sub(last + 1)
|
||||
} else {
|
||||
offset
|
||||
|
|
@ -112,10 +124,19 @@ impl LocationMapping {
|
|||
SourcePoint { line, column }
|
||||
}
|
||||
|
||||
/// Map a byte offset to a `SourcePoint`.
|
||||
pub fn get_source_point(&self, offset: usize) -> SourcePoint {
|
||||
self.ensure_offsets_up_to(offset);
|
||||
let offsets = self.newline_offsets.borrow();
|
||||
Self::source_point_from_offsets(&offsets, offset)
|
||||
}
|
||||
|
||||
/// Map an `OffsetSpan` to a `SourceSpan` (closed interval).
|
||||
pub fn get_source_span(&self, span: &OffsetSpan) -> SourceSpan {
|
||||
let start = self.get_source_point(span.start);
|
||||
let end = self.get_source_point(span.end.saturating_sub(1));
|
||||
self.ensure_offsets_up_to(span.end.saturating_sub(1));
|
||||
let offsets = self.newline_offsets.borrow();
|
||||
let start = Self::source_point_from_offsets(&offsets, span.start);
|
||||
let end = Self::source_point_from_offsets(&offsets, span.end.saturating_sub(1));
|
||||
SourceSpan { start, end }
|
||||
}
|
||||
}
|
||||
|
|
|
|||
166
src/matcher.rs
166
src/matcher.rs
|
|
@ -10,7 +10,6 @@ use anyhow::Result;
|
|||
use base64::{engine::general_purpose, Engine};
|
||||
use bstr::BString;
|
||||
use http::StatusCode;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::bytes::Regex;
|
||||
use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
|
||||
use schemars::{
|
||||
|
|
@ -27,7 +26,7 @@ use xxhash_rust::xxh3::xxh3_64;
|
|||
use crate::{
|
||||
blob::{Blob, BlobId, BlobIdMap},
|
||||
entropy::calculate_shannon_entropy,
|
||||
location::{Location, LocationMapping, OffsetSpan},
|
||||
location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
|
||||
origin::OriginSet,
|
||||
parser,
|
||||
parser::{Checker, Language},
|
||||
|
|
@ -40,6 +39,10 @@ use crate::{
|
|||
util::{intern, redact_value},
|
||||
};
|
||||
|
||||
const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
|
||||
const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
|
||||
const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// RawMatch
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
|
@ -255,17 +258,28 @@ impl<'a> Matcher<'a> {
|
|||
self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64));
|
||||
|
||||
self.user_data.input_len = input.len() as u64;
|
||||
// self.vs_scanner.scan(input, |rid, from, to, _flags| {
|
||||
self.scanner_pool.with(|scanner| {
|
||||
scanner.scan(input, |rule_id, from, to, _flags| {
|
||||
self.user_data.raw_matches_scratch.push(RawMatch {
|
||||
rule_id,
|
||||
start_idx: from,
|
||||
end_idx: to,
|
||||
});
|
||||
vectorscan_rs::Scan::Continue
|
||||
})
|
||||
})?;
|
||||
|
||||
let mut offset: usize = 0;
|
||||
while offset < input.len() {
|
||||
let end = (offset + MAX_CHUNK_SIZE).min(input.len());
|
||||
let slice = &input[offset..end];
|
||||
let base = offset as u64;
|
||||
self.scanner_pool.with(|scanner| {
|
||||
scanner.scan(slice, |rule_id, from, to, _flags| {
|
||||
self.user_data.raw_matches_scratch.push(RawMatch {
|
||||
rule_id,
|
||||
start_idx: from + base,
|
||||
end_idx: to + base,
|
||||
});
|
||||
vectorscan_rs::Scan::Continue
|
||||
})
|
||||
})?;
|
||||
|
||||
if end == input.len() {
|
||||
break;
|
||||
}
|
||||
offset = end.saturating_sub(CHUNK_OVERLAP);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -302,7 +316,11 @@ impl<'a> Matcher<'a> {
|
|||
// Opportunistically look for standalone Base64 blobs. If neither
|
||||
// the raw scan nor this check yields anything, we can return early
|
||||
// before doing any heavier work.
|
||||
let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) };
|
||||
let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
|
||||
Vec::new()
|
||||
} else {
|
||||
get_base64_strings(blob.bytes())
|
||||
};
|
||||
|
||||
if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() {
|
||||
return Ok(ScanResult::New(Vec::new()));
|
||||
|
|
@ -310,7 +328,7 @@ impl<'a> Matcher<'a> {
|
|||
|
||||
let rules_db = self.rules_db;
|
||||
let mut seen_matches = FxHashSet::default();
|
||||
let mut previous_matches = Vec::new();
|
||||
let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
|
||||
let tree_sitter_result = if self.user_data.raw_matches_scratch.is_empty() {
|
||||
None
|
||||
} else {
|
||||
|
|
@ -343,27 +361,19 @@ impl<'a> Matcher<'a> {
|
|||
})
|
||||
.collect::<Vec<_>>()
|
||||
});
|
||||
let mut previous_raw_matches: Vec<(usize, OffsetSpan)> = Vec::new();
|
||||
let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
|
||||
for &RawMatch { rule_id, start_idx, end_idx } in
|
||||
self.user_data.raw_matches_scratch.iter().rev()
|
||||
{
|
||||
let rule_id_usize: usize = rule_id as usize;
|
||||
// let rule = &rules_db.rules[rule_id_usize];
|
||||
let rule = Arc::clone(&rules_db.rules[rule_id_usize]);
|
||||
let re = &rules_db.anchored_regexes[rule_id_usize];
|
||||
let start_idx_usize = start_idx as usize;
|
||||
let end_idx_usize = end_idx as usize;
|
||||
let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
|
||||
// Skip if fully contained in a previous match
|
||||
if previous_raw_matches.iter().any(|(prev_id, prev_span): &(usize, OffsetSpan)| {
|
||||
*prev_id == rule_id_usize
|
||||
&& (prev_span.fully_contains(&current_span)
|
||||
|| current_span.fully_contains(prev_span))
|
||||
}) {
|
||||
if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
|
||||
continue;
|
||||
}
|
||||
let matching_input_offset_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
|
||||
previous_raw_matches.push((rule_id_usize, matching_input_offset_span));
|
||||
filter_match(
|
||||
blob,
|
||||
rule,
|
||||
|
|
@ -487,6 +497,39 @@ fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -
|
|||
end.hash(&mut hasher);
|
||||
hasher.finish()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
|
||||
let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
|
||||
if idx > 0 {
|
||||
if spans[idx - 1].fully_contains(&span) {
|
||||
return false;
|
||||
}
|
||||
if span.fully_contains(&spans[idx - 1]) {
|
||||
spans.remove(idx - 1);
|
||||
idx -= 1;
|
||||
}
|
||||
}
|
||||
if idx < spans.len() {
|
||||
if spans[idx].fully_contains(&span) {
|
||||
return false;
|
||||
}
|
||||
if span.fully_contains(&spans[idx]) {
|
||||
spans.remove(idx);
|
||||
}
|
||||
}
|
||||
spans.insert(idx, span);
|
||||
true
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_match(
|
||||
map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
|
||||
rule_id: usize,
|
||||
span: OffsetSpan,
|
||||
) -> bool {
|
||||
insert_span(map.entry(rule_id).or_default(), span)
|
||||
}
|
||||
fn filter_match<'b>(
|
||||
blob: &'b Blob,
|
||||
// rule: &'b Rule,
|
||||
|
|
@ -495,7 +538,7 @@ fn filter_match<'b>(
|
|||
start: usize,
|
||||
end: usize,
|
||||
matches: &mut Vec<BlobMatch<'b>>,
|
||||
previous_matches: &mut Vec<(usize, OffsetSpan)>,
|
||||
previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
|
||||
rule_id: usize,
|
||||
seen_matches: &mut FxHashSet<u64>,
|
||||
_origin: &OriginSet,
|
||||
|
|
@ -544,11 +587,7 @@ fn filter_match<'b>(
|
|||
if !seen_matches.insert(match_key) {
|
||||
continue;
|
||||
}
|
||||
if previous_matches.iter().any(|(prev_rule_id, prev_loc)| {
|
||||
*prev_rule_id == rule_id
|
||||
&& (prev_loc.fully_contains(&matching_input_offset_span)
|
||||
|| matching_input_offset_span.fully_contains(prev_loc))
|
||||
}) {
|
||||
if !record_match(previous_matches, rule_id, matching_input_offset_span) {
|
||||
continue;
|
||||
}
|
||||
let only_matching_input =
|
||||
|
|
@ -567,7 +606,6 @@ fn filter_match<'b>(
|
|||
calculated_entropy,
|
||||
is_base64,
|
||||
});
|
||||
previous_matches.push((rule_id, matching_input_offset_span));
|
||||
}
|
||||
if let Some(t) = timer.take() {
|
||||
let new_count = (matches.len() - initial_len) as u64;
|
||||
|
|
@ -773,7 +811,7 @@ pub struct Match {
|
|||
impl Match {
|
||||
#[inline]
|
||||
pub fn convert_owned_blobmatch_to_match<'a>(
|
||||
loc_mapping: &'a LocationMapping,
|
||||
loc_mapping: Option<&'a LocationMapping<'a>>,
|
||||
owned_blob_match: &'a OwnedBlobMatch,
|
||||
origin_type: &'a str,
|
||||
) -> Self {
|
||||
|
|
@ -791,7 +829,11 @@ impl Match {
|
|||
// The fingerprint will be based on the content of the secret.
|
||||
let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
|
||||
|
||||
let source_span = loc_mapping.get_source_span(&offset_span);
|
||||
let source_span =
|
||||
loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
|
||||
start: SourcePoint { line: 0, column: 0 },
|
||||
end: SourcePoint { line: 0, column: 0 },
|
||||
});
|
||||
let offset_start: u64 =
|
||||
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
|
||||
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
|
||||
|
|
@ -866,33 +908,49 @@ pub struct DecodedData {
|
|||
pub pos_start: usize,
|
||||
pub pos_end: usize,
|
||||
}
|
||||
/// Returns `true` when `b` belongs to the standard Base64 alphabet
/// (`A-Z`, `a-z`, `0-9`, `+`, `/`). The padding byte `=` is not included.
#[inline]
fn is_base64_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'+' || b == b'/'
}
|
||||
|
||||
pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
|
||||
lazy_static! {
|
||||
// Require a reasonably long run of valid Base64 characters to reduce
|
||||
// noise. 32 bytes corresponds to 24 decoded bytes.
|
||||
static ref RE_BASE64: Regex = Regex::new(r"[A-Za-z0-9+/]{32,}={0,2}").unwrap();
|
||||
}
|
||||
let mut results = Vec::new();
|
||||
for m in RE_BASE64.find_iter(input) {
|
||||
let base64_string = m.as_bytes();
|
||||
// Skip candidates whose length isn't a multiple of four – they cannot
|
||||
// be valid Base64.
|
||||
if base64_string.len() % 4 != 0 {
|
||||
continue;
|
||||
let mut i = 0;
|
||||
while i < input.len() {
|
||||
while i < input.len() && !is_base64_byte(input[i]) {
|
||||
i += 1;
|
||||
}
|
||||
if let Ok(decoded) = general_purpose::STANDARD.decode(base64_string) {
|
||||
if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
|
||||
if decoded_str.is_ascii() {
|
||||
results.push(DecodedData {
|
||||
original: String::from_utf8_lossy(base64_string).into_owned(),
|
||||
decoded: decoded_str.to_string(),
|
||||
pos_start: m.start(),
|
||||
pos_end: m.end(),
|
||||
});
|
||||
let start = i;
|
||||
while i < input.len() && is_base64_byte(input[i]) {
|
||||
i += 1;
|
||||
}
|
||||
|
||||
let mut end = i;
|
||||
let mut eq_count = 0;
|
||||
while i < input.len() && input[i] == b'=' && eq_count < 2 {
|
||||
i += 1;
|
||||
eq_count += 1;
|
||||
}
|
||||
end = i;
|
||||
|
||||
let len = end - start;
|
||||
if len >= 32 && len % 4 == 0 {
|
||||
let base64_slice = &input[start..end];
|
||||
if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) {
|
||||
if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
|
||||
if decoded_str.is_ascii() {
|
||||
results.push(DecodedData {
|
||||
original: String::from_utf8_lossy(base64_slice).into_owned(),
|
||||
decoded: decoded_str.to_string(),
|
||||
pos_start: start,
|
||||
pos_end: end,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@ use crate::{
|
|||
Path,
|
||||
};
|
||||
|
||||
const LOCATION_LIMIT_BYTES: usize = 256 * 1024 * 1024;
|
||||
|
||||
/// A matcher along with parameters that remain constant during a single
|
||||
/// `scan` run
|
||||
pub struct BlobProcessor<'a> {
|
||||
|
|
@ -81,12 +83,17 @@ impl<'a> BlobProcessor<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
let loc_mapping = LocationMapping::new(&blob.bytes());
|
||||
let bytes = blob.bytes();
|
||||
let loc_mapping = if bytes.len() <= LOCATION_LIMIT_BYTES {
|
||||
Some(LocationMapping::new(bytes))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let converted_matches: Vec<(Option<f64>, Match)> = matches
|
||||
.into_iter()
|
||||
.map(|m| {
|
||||
let converted_match = Match::convert_owned_blobmatch_to_match(
|
||||
&loc_mapping,
|
||||
loc_mapping.as_ref(),
|
||||
&OwnedBlobMatch::from_blob_match(m),
|
||||
origin_type,
|
||||
);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue