- Improved error message when self-update cannot find the current binary

- Optimized memory usage via string interning and extensive data sharing
- Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance
- Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets
- Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB
- Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files
- Increased max-file-size default to 64 MB (up from 25 MB)
This commit is contained in:
Mick Grove 2025-09-04 21:51:24 -07:00
commit 8d15c8eabf
7 changed files with 172 additions and 74 deletions

View file

@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
## [Unreleased]
- Improved error message when self-update cannot find the current binary
- Optimized memory usage via string interning and extensive data sharing
- Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance
- Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets
- Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB
- Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files
- Increased max-file-size default to 64 MB (up from 25 MB)
## [1.47.0]
- MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them

View file

@ -65,7 +65,9 @@ smallvec = { version = "1", features = [
tracing = "0.1.41"
indicatif = { version = "0.17", features = ["improved_unicode"] }
rayon = "1.10"
sha1 = { version = "0.10.6", features = ["asm"] }
# sha1 = { version = "0.10.6", features = ["asm"] }
sha1 = "0.10.6"
hex = "0.4.3"
vectorscan-rs = "0.0.5"
regex = "1.11.1"

View file

@ -294,9 +294,15 @@ impl BlobId {
/// Create a new BlobId computed from the given input.
#[inline]
pub fn new(input: &[u8]) -> Self {
const CHUNK: usize = 64 * 1024; // 64KB from start and end
let mut hasher = Sha1::new();
write!(&mut hasher, "blob {}\0", input.len()).unwrap();
hasher.update(input);
if input.len() <= CHUNK * 2 {
hasher.update(input);
} else {
hasher.update(&input[..CHUNK]);
hasher.update(&input[input.len() - CHUNK..]);
}
BlobId(hasher.finalize().as_slice().try_into().expect("SHA-1 output size mismatch"))
}
@ -406,5 +412,4 @@ impl BlobMetadata {
pub fn mime_essence(&self) -> Option<&str> {
self.mime_essence.as_deref()
}
}

View file

@ -169,7 +169,7 @@ pub struct InputSpecifierArgs {
#[derive(Args, Debug, Clone)]
pub struct ContentFilteringArgs {
/// Ignore files larger than the given size in MB
#[arg(long("max-file-size"), default_value_t = 25.0)]
#[arg(long("max-file-size"), default_value_t = 64.0)]
pub max_file_size_mb: f64,
// /// Use custom path-based ignore rules from the given file(s)

View file

@ -1,4 +1,5 @@
use core::ops::Range;
use std::cell::RefCell;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
@ -87,24 +88,35 @@ impl std::fmt::Display for SourceSpan {
}
/// Records newline byte offsets to map offsets → (line, column).
pub struct LocationMapping {
newline_offsets: Vec<usize>,
pub struct LocationMapping<'a> {
bytes: &'a [u8],
newline_offsets: RefCell<Vec<usize>>,
}
impl LocationMapping {
/// Scan once for all `\n` positions.
pub fn new(input: &[u8]) -> Self {
let newline_offsets = memchr::memchr_iter(b'\n', input).collect();
LocationMapping { newline_offsets }
impl<'a> LocationMapping<'a> {
/// Create a new mapping without pre-scanning the entire input.
pub fn new(input: &'a [u8]) -> Self {
LocationMapping { bytes: input, newline_offsets: RefCell::new(Vec::new()) }
}
/// Map a byte offset to a `SourcePoint`.
pub fn get_source_point(&self, offset: usize) -> SourcePoint {
let line = match self.newline_offsets.binary_search(&offset) {
Ok(idx) => idx + 2, // exact newline -- next line
/// Extend the cached newline table so that it holds every `\n` located at or
/// before `offset`.
///
/// Scanning resumes one byte past the last newline already recorded, so bytes
/// before that point are never re-examined. The range is inclusive of `offset`
/// itself: `source_point_from_offsets` treats an offset that sits exactly on a
/// newline differently (`Ok(idx)` from the binary search) from one that does
/// not, so the newline at `offset` must be present in the table for the lookup
/// to give the same answer regardless of which offsets were queried earlier.
///
/// NOTE(review): progress is tracked only through the last recorded newline.
/// If no newline lies between that point and `offset`, nothing is recorded and
/// a later call scans the same stretch again. A separate "scanned up to"
/// cursor on the struct would avoid that.
fn ensure_offsets_up_to(&self, offset: usize) {
    let mut offsets = self.newline_offsets.borrow_mut();
    let start = offsets.last().map_or(0, |&last| last + 1);
    if offset < start {
        // Everything up to `offset` has already been indexed.
        return;
    }
    // Inclusive upper bound, clamped to the input length; `saturating_add`
    // guards against `offset == usize::MAX`.
    let end = offset.saturating_add(1).min(self.bytes.len());
    offsets.extend(memchr::memchr_iter(b'\n', &self.bytes[start..end]).map(|nl| start + nl));
}
fn source_point_from_offsets(offsets: &[usize], offset: usize) -> SourcePoint {
let line = match offsets.binary_search(&offset) {
Ok(idx) => idx + 2,
Err(idx) => idx + 1,
};
let column = if let Some(&last) = self.newline_offsets.get(line.saturating_sub(2)) {
let column = if let Some(&last) = offsets.get(line.saturating_sub(2)) {
offset.saturating_sub(last + 1)
} else {
offset
@ -112,10 +124,19 @@ impl LocationMapping {
SourcePoint { line, column }
}
/// Translate a byte `offset` into its `SourcePoint` (line and column).
///
/// The newline table is first extended far enough to cover `offset`, then the
/// lookup is delegated to `source_point_from_offsets`.
pub fn get_source_point(&self, offset: usize) -> SourcePoint {
    self.ensure_offsets_up_to(offset);
    Self::source_point_from_offsets(&self.newline_offsets.borrow(), offset)
}
/// Map an `OffsetSpan` to a `SourceSpan` (closed interval).
pub fn get_source_span(&self, span: &OffsetSpan) -> SourceSpan {
let start = self.get_source_point(span.start);
let end = self.get_source_point(span.end.saturating_sub(1));
self.ensure_offsets_up_to(span.end.saturating_sub(1));
let offsets = self.newline_offsets.borrow();
let start = Self::source_point_from_offsets(&offsets, span.start);
let end = Self::source_point_from_offsets(&offsets, span.end.saturating_sub(1));
SourceSpan { start, end }
}
}

View file

@ -10,7 +10,6 @@ use anyhow::Result;
use base64::{engine::general_purpose, Engine};
use bstr::BString;
use http::StatusCode;
use lazy_static::lazy_static;
use regex::bytes::Regex;
use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
use schemars::{
@ -27,7 +26,7 @@ use xxhash_rust::xxh3::xxh3_64;
use crate::{
blob::{Blob, BlobId, BlobIdMap},
entropy::calculate_shannon_entropy,
location::{Location, LocationMapping, OffsetSpan},
location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
origin::OriginSet,
parser,
parser::{Checker, Language},
@ -40,6 +39,10 @@ use crate::{
util::{intern, redact_value},
};
const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
// -------------------------------------------------------------------------------------------------
// RawMatch
// -------------------------------------------------------------------------------------------------
@ -255,17 +258,28 @@ impl<'a> Matcher<'a> {
self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64));
self.user_data.input_len = input.len() as u64;
// self.vs_scanner.scan(input, |rid, from, to, _flags| {
self.scanner_pool.with(|scanner| {
scanner.scan(input, |rule_id, from, to, _flags| {
self.user_data.raw_matches_scratch.push(RawMatch {
rule_id,
start_idx: from,
end_idx: to,
});
vectorscan_rs::Scan::Continue
})
})?;
let mut offset: usize = 0;
while offset < input.len() {
let end = (offset + MAX_CHUNK_SIZE).min(input.len());
let slice = &input[offset..end];
let base = offset as u64;
self.scanner_pool.with(|scanner| {
scanner.scan(slice, |rule_id, from, to, _flags| {
self.user_data.raw_matches_scratch.push(RawMatch {
rule_id,
start_idx: from + base,
end_idx: to + base,
});
vectorscan_rs::Scan::Continue
})
})?;
if end == input.len() {
break;
}
offset = end.saturating_sub(CHUNK_OVERLAP);
}
Ok(())
}
@ -302,7 +316,11 @@ impl<'a> Matcher<'a> {
// Opportunistically look for standalone Base64 blobs. If neither
// the raw scan nor this check yields anything, we can return early
// before doing any heavier work.
let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) };
let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
Vec::new()
} else {
get_base64_strings(blob.bytes())
};
if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() {
return Ok(ScanResult::New(Vec::new()));
@ -310,7 +328,7 @@ impl<'a> Matcher<'a> {
let rules_db = self.rules_db;
let mut seen_matches = FxHashSet::default();
let mut previous_matches = Vec::new();
let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
let tree_sitter_result = if self.user_data.raw_matches_scratch.is_empty() {
None
} else {
@ -343,27 +361,19 @@ impl<'a> Matcher<'a> {
})
.collect::<Vec<_>>()
});
let mut previous_raw_matches: Vec<(usize, OffsetSpan)> = Vec::new();
let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
for &RawMatch { rule_id, start_idx, end_idx } in
self.user_data.raw_matches_scratch.iter().rev()
{
let rule_id_usize: usize = rule_id as usize;
// let rule = &rules_db.rules[rule_id_usize];
let rule = Arc::clone(&rules_db.rules[rule_id_usize]);
let re = &rules_db.anchored_regexes[rule_id_usize];
let start_idx_usize = start_idx as usize;
let end_idx_usize = end_idx as usize;
let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
// Skip if fully contained in a previous match
if previous_raw_matches.iter().any(|(prev_id, prev_span): &(usize, OffsetSpan)| {
*prev_id == rule_id_usize
&& (prev_span.fully_contains(&current_span)
|| current_span.fully_contains(prev_span))
}) {
if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
continue;
}
let matching_input_offset_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
previous_raw_matches.push((rule_id_usize, matching_input_offset_span));
filter_match(
blob,
rule,
@ -487,6 +497,39 @@ fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -
end.hash(&mut hasher);
hasher.finish()
}
/// Insert `span` into `spans`, which is kept sorted by `start` and free of any
/// span that fully contains another.
///
/// Returns `false` (leaving `spans` untouched) when an existing span already
/// fully contains `span`. Otherwise every existing span that `span` fully
/// contains is removed, `span` is inserted in sorted position, and `true` is
/// returned.
///
/// Because no stored span contains another, sorting by `start` also orders the
/// spans by `end`. That is what makes the two neighbour checks below
/// sufficient, and why the spans swallowed by `span` form one contiguous run.
/// Removing only a single neighbour would break that invariant and let later
/// contained spans slip through.
///
/// NOTE(review): assumes `fully_contains` means
/// `self.start <= other.start && other.end <= self.end` — confirm against
/// `OffsetSpan`.
#[inline]
fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
    // First position whose start is not before the new span's start.
    let lo = spans.partition_point(|existing| existing.start < span.start);

    // The predecessor has the largest end among spans starting earlier, so it
    // is the only earlier-starting candidate that could contain `span`.
    if lo > 0 && spans[lo - 1].fully_contains(&span) {
        return false;
    }
    // A span with the same start (at most one can exist) may also contain it.
    if lo < spans.len() && spans[lo].fully_contains(&span) {
        return false;
    }

    // Drop the whole run of spans that the new span swallows, not just one.
    let swallowed = spans[lo..]
        .iter()
        .take_while(|existing| span.fully_contains(*existing))
        .count();
    spans.drain(lo..lo + swallowed);

    spans.insert(lo, span);
    true
}
/// Register `span` under `rule_id`, creating that rule's span list on first use.
///
/// Returns whatever `insert_span` reports for the rule's list: `true` when the
/// span was recorded, `false` when it was rejected.
#[inline]
fn record_match(
    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
    rule_id: usize,
    span: OffsetSpan,
) -> bool {
    let spans_for_rule = map.entry(rule_id).or_default();
    insert_span(spans_for_rule, span)
}
fn filter_match<'b>(
blob: &'b Blob,
// rule: &'b Rule,
@ -495,7 +538,7 @@ fn filter_match<'b>(
start: usize,
end: usize,
matches: &mut Vec<BlobMatch<'b>>,
previous_matches: &mut Vec<(usize, OffsetSpan)>,
previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
rule_id: usize,
seen_matches: &mut FxHashSet<u64>,
_origin: &OriginSet,
@ -544,11 +587,7 @@ fn filter_match<'b>(
if !seen_matches.insert(match_key) {
continue;
}
if previous_matches.iter().any(|(prev_rule_id, prev_loc)| {
*prev_rule_id == rule_id
&& (prev_loc.fully_contains(&matching_input_offset_span)
|| matching_input_offset_span.fully_contains(prev_loc))
}) {
if !record_match(previous_matches, rule_id, matching_input_offset_span) {
continue;
}
let only_matching_input =
@ -567,7 +606,6 @@ fn filter_match<'b>(
calculated_entropy,
is_base64,
});
previous_matches.push((rule_id, matching_input_offset_span));
}
if let Some(t) = timer.take() {
let new_count = (matches.len() - initial_len) as u64;
@ -773,7 +811,7 @@ pub struct Match {
impl Match {
#[inline]
pub fn convert_owned_blobmatch_to_match<'a>(
loc_mapping: &'a LocationMapping,
loc_mapping: Option<&'a LocationMapping<'a>>,
owned_blob_match: &'a OwnedBlobMatch,
origin_type: &'a str,
) -> Self {
@ -791,7 +829,11 @@ impl Match {
// The fingerprint will be based on the content of the secret.
let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
let source_span = loc_mapping.get_source_span(&offset_span);
let source_span =
loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
start: SourcePoint { line: 0, column: 0 },
end: SourcePoint { line: 0, column: 0 },
});
let offset_start: u64 =
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
@ -866,33 +908,49 @@ pub struct DecodedData {
pub pos_start: usize,
pub pos_end: usize,
}
/// Report whether `b` belongs to the standard Base64 alphabet
/// (`A-Z`, `a-z`, `0-9`, `+`, `/`). The padding byte `=` is not included.
#[inline]
fn is_base64_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'+' || b == b'/'
}
pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
lazy_static! {
// Require a reasonably long run of valid Base64 characters to reduce
// noise. 32 bytes corresponds to 24 decoded bytes.
static ref RE_BASE64: Regex = Regex::new(r"[A-Za-z0-9+/]{32,}={0,2}").unwrap();
}
let mut results = Vec::new();
for m in RE_BASE64.find_iter(input) {
let base64_string = m.as_bytes();
// Skip candidates whose length isn't a multiple of four they cannot
// be valid Base64.
if base64_string.len() % 4 != 0 {
continue;
let mut i = 0;
while i < input.len() {
while i < input.len() && !is_base64_byte(input[i]) {
i += 1;
}
if let Ok(decoded) = general_purpose::STANDARD.decode(base64_string) {
if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
if decoded_str.is_ascii() {
results.push(DecodedData {
original: String::from_utf8_lossy(base64_string).into_owned(),
decoded: decoded_str.to_string(),
pos_start: m.start(),
pos_end: m.end(),
});
let start = i;
while i < input.len() && is_base64_byte(input[i]) {
i += 1;
}
let mut end = i;
let mut eq_count = 0;
while i < input.len() && input[i] == b'=' && eq_count < 2 {
i += 1;
eq_count += 1;
}
end = i;
let len = end - start;
if len >= 32 && len % 4 == 0 {
let base64_slice = &input[start..end];
if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) {
if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
if decoded_str.is_ascii() {
results.push(DecodedData {
original: String::from_utf8_lossy(base64_slice).into_owned(),
decoded: decoded_str.to_string(),
pos_start: start,
pos_end: end,
});
}
}
}
}
}
results
}

View file

@ -12,6 +12,8 @@ use crate::{
Path,
};
const LOCATION_LIMIT_BYTES: usize = 256 * 1024 * 1024;
/// A matcher along with parameters that remain constant during a single
/// `scan` run
pub struct BlobProcessor<'a> {
@ -81,12 +83,17 @@ impl<'a> BlobProcessor<'a> {
}
}
let loc_mapping = LocationMapping::new(&blob.bytes());
let bytes = blob.bytes();
let loc_mapping = if bytes.len() <= LOCATION_LIMIT_BYTES {
Some(LocationMapping::new(bytes))
} else {
None
};
let converted_matches: Vec<(Option<f64>, Match)> = matches
.into_iter()
.map(|m| {
let converted_match = Match::convert_owned_blobmatch_to_match(
&loc_mapping,
loc_mapping.as_ref(),
&OwnedBlobMatch::from_blob_match(m),
origin_type,
);