diff --git a/CHANGELOG.md b/CHANGELOG.md index 2eae1b7..7941fcb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file. ## [Unreleased] - Improved error message when self-update cannot find the current binary - Optimized memory usage via string interning and extensive data sharing +- Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance +- Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets +- Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB +- Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files +- Increased max-file-size default to 64 MB (up from 25 MB) ## [1.47.0] - MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them diff --git a/Cargo.toml b/Cargo.toml index ac3b6a6..5404491 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,7 +65,9 @@ smallvec = { version = "1", features = [ tracing = "0.1.41" indicatif = { version = "0.17", features = ["improved_unicode"] } rayon = "1.10" -sha1 = { version = "0.10.6", features = ["asm"] } +# sha1 = { version = "0.10.6", features = ["asm"] } + +sha1 = "0.10.6" hex = "0.4.3" vectorscan-rs = "0.0.5" regex = "1.11.1" diff --git a/src/blob.rs b/src/blob.rs index 9530882..9b1aff1 100644 --- a/src/blob.rs +++ b/src/blob.rs @@ -294,9 +294,12 @@ impl BlobId { /// Create a new BlobId computed from the given input. 
+ /// + /// Hashes the `blob <len>\0` header followed by every byte of `input`; sampling only the + /// head and tail would give distinct same-length blobs with equal ends the same id. #[inline] pub fn new(input: &[u8]) -> Self { let mut hasher = Sha1::new(); write!(&mut hasher, "blob {}\0", input.len()).unwrap(); hasher.update(input); BlobId(hasher.finalize().as_slice().try_into().expect("SHA-1 output size mismatch")) } @@ -406,5 +409,4 @@ impl BlobMetadata { pub fn mime_essence(&self) -> Option<&str> { self.mime_essence.as_deref() } - } diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs index f0b8fa8..4f2dc5b 100644 --- a/src/cli/commands/inputs.rs +++ b/src/cli/commands/inputs.rs @@ -169,7 +169,7 @@ pub struct InputSpecifierArgs { #[derive(Args, Debug, Clone)] pub struct ContentFilteringArgs { /// Ignore files larger than the given size in MB - #[arg(long("max-file-size"), default_value_t = 25.0)] + #[arg(long("max-file-size"), default_value_t = 64.0)] pub max_file_size_mb: f64, // /// Use custom path-based ignore rules from the given file(s) diff --git a/src/location.rs b/src/location.rs index 05a7586..a50602a 100644 --- a/src/location.rs +++ b/src/location.rs @@ -1,4 +1,5 @@ use core::ops::Range; +use std::cell::RefCell; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -87,24 +88,35 @@ impl std::fmt::Display for SourceSpan { } /// Records newline byte‑offsets to map offsets -- (line, column). -pub struct LocationMapping { - newline_offsets: Vec<usize>, +pub struct LocationMapping<'a> { + bytes: &'a [u8], + newline_offsets: RefCell<Vec<usize>>, } -impl LocationMapping { - /// Scan once for all `\n` positions. - pub fn new(input: &[u8]) -> Self { - let newline_offsets = memchr::memchr_iter(b'\n', input).collect(); - LocationMapping { newline_offsets } +impl<'a> LocationMapping<'a> { + /// Create a new mapping without pre-scanning the entire input. 
+ pub fn new(input: &'a [u8]) -> Self { + LocationMapping { bytes: input, newline_offsets: RefCell::new(Vec::new()) } } - /// Map a byte offset to a `SourcePoint`. - pub fn get_source_point(&self, offset: usize) -> SourcePoint { - let line = match self.newline_offsets.binary_search(&offset) { - Ok(idx) => idx + 2, // exact newline -- next line + fn ensure_offsets_up_to(&self, offset: usize) { + let mut offsets = self.newline_offsets.borrow_mut(); + let start = offsets.last().map_or(0, |&last| last + 1); + if offset < start { + return; + } + let end = offset.saturating_add(1).min(self.bytes.len()); /* inclusive: also index a newline sitting exactly at `offset`, otherwise the answer depends on which offsets were queried earlier */ + for nl in memchr::memchr_iter(b'\n', &self.bytes[start..end]) { + offsets.push(start + nl); + } + } + + fn source_point_from_offsets(offsets: &[usize], offset: usize) -> SourcePoint { + let line = match offsets.binary_search(&offset) { + Ok(idx) => idx + 2, Err(idx) => idx + 1, }; - let column = if let Some(&last) = self.newline_offsets.get(line.saturating_sub(2)) { + let column = if let Some(&last) = offsets.get(line.saturating_sub(2)) { offset.saturating_sub(last + 1) } else { offset @@ -112,10 +124,19 @@ impl LocationMapping { SourcePoint { line, column } } + /// Map a byte offset to a `SourcePoint`. + pub fn get_source_point(&self, offset: usize) -> SourcePoint { + self.ensure_offsets_up_to(offset); + let offsets = self.newline_offsets.borrow(); + Self::source_point_from_offsets(&offsets, offset) + } + /// Map an `OffsetSpan` -- `SourceSpan` (closed interval). 
pub fn get_source_span(&self, span: &OffsetSpan) -> SourceSpan { - let start = self.get_source_point(span.start); - let end = self.get_source_point(span.end.saturating_sub(1)); + self.ensure_offsets_up_to(span.end.saturating_sub(1)); + let offsets = self.newline_offsets.borrow(); + let start = Self::source_point_from_offsets(&offsets, span.start); + let end = Self::source_point_from_offsets(&offsets, span.end.saturating_sub(1)); SourceSpan { start, end } } } diff --git a/src/matcher.rs b/src/matcher.rs index 07f53d3..1211768 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -10,7 +10,6 @@ use anyhow::Result; use base64::{engine::general_purpose, Engine}; use bstr::BString; use http::StatusCode; -use lazy_static::lazy_static; use regex::bytes::Regex; use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; use schemars::{ @@ -27,7 +26,7 @@ use xxhash_rust::xxh3::xxh3_64; use crate::{ blob::{Blob, BlobId, BlobIdMap}, entropy::calculate_shannon_entropy, - location::{Location, LocationMapping, OffsetSpan}, + location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, origin::OriginSet, parser, parser::{Checker, Language}, @@ -40,6 +39,10 @@ use crate::{ util::{intern, redact_value}, }; +const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment +const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches +const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs + // ------------------------------------------------------------------------------------------------- // RawMatch // ------------------------------------------------------------------------------------------------- @@ -255,17 +258,28 @@ impl<'a> Matcher<'a> { self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64)); self.user_data.input_len = input.len() as u64; - // self.vs_scanner.scan(input, |rid, from, to, _flags| { - self.scanner_pool.with(|scanner| { - scanner.scan(input, |rule_id, from, to, _flags| { - 
self.user_data.raw_matches_scratch.push(RawMatch { - rule_id, - start_idx: from, - end_idx: to, - }); - vectorscan_rs::Scan::Continue - }) - })?; + + let mut offset: usize = 0; + while offset < input.len() { + let end = (offset + MAX_CHUNK_SIZE).min(input.len()); + let slice = &input[offset..end]; + let base = offset as u64; + self.scanner_pool.with(|scanner| { + scanner.scan(slice, |rule_id, from, to, _flags| { + self.user_data.raw_matches_scratch.push(RawMatch { + rule_id, + start_idx: from + base, + end_idx: to + base, + }); + vectorscan_rs::Scan::Continue + }) + })?; + + if end == input.len() { + break; + } + offset = end.saturating_sub(CHUNK_OVERLAP); + } Ok(()) } @@ -302,7 +316,11 @@ impl<'a> Matcher<'a> { // Opportunistically look for standalone Base64 blobs. If neither // the raw scan nor this check yields anything, we can return early // before doing any heavier work. - let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) }; + let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT { + Vec::new() + } else { + get_base64_strings(blob.bytes()) + }; if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() { return Ok(ScanResult::New(Vec::new())); @@ -310,7 +328,7 @@ impl<'a> Matcher<'a> { let rules_db = self.rules_db; let mut seen_matches = FxHashSet::default(); - let mut previous_matches = Vec::new(); + let mut previous_matches: FxHashMap> = FxHashMap::default(); let tree_sitter_result = if self.user_data.raw_matches_scratch.is_empty() { None } else { @@ -343,27 +361,19 @@ impl<'a> Matcher<'a> { }) .collect::>() }); - let mut previous_raw_matches: Vec<(usize, OffsetSpan)> = Vec::new(); + let mut previous_raw_matches: FxHashMap> = FxHashMap::default(); for &RawMatch { rule_id, start_idx, end_idx } in self.user_data.raw_matches_scratch.iter().rev() { let rule_id_usize: usize = rule_id as usize; - // let rule = &rules_db.rules[rule_id_usize]; let rule = Arc::clone(&rules_db.rules[rule_id_usize]); let 
re = &rules_db.anchored_regexes[rule_id_usize]; let start_idx_usize = start_idx as usize; let end_idx_usize = end_idx as usize; let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize); - // Skip if fully contained in a previous match - if previous_raw_matches.iter().any(|(prev_id, prev_span): &(usize, OffsetSpan)| { - *prev_id == rule_id_usize - && (prev_span.fully_contains(&current_span) - || current_span.fully_contains(prev_span)) - }) { + if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) { continue; } - let matching_input_offset_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize); - previous_raw_matches.push((rule_id_usize, matching_input_offset_span)); filter_match( blob, rule, @@ -487,6 +497,39 @@ fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) - end.hash(&mut hasher); hasher.finish() } + +#[inline] +fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool { + let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i); + if idx > 0 { + if spans[idx - 1].fully_contains(&span) { + return false; + } + if span.fully_contains(&spans[idx - 1]) { + spans.remove(idx - 1); + idx -= 1; + } + } + if idx < spans.len() { + if spans[idx].fully_contains(&span) { + return false; + } + while idx < spans.len() && span.fully_contains(&spans[idx]) { /* drop every span the new one swallows, not just the first, so no contained span is left behind */ + spans.remove(idx); + } + } + spans.insert(idx, span); + true +} + +#[inline] +fn record_match( + map: &mut FxHashMap<usize, Vec<OffsetSpan>>, + rule_id: usize, + span: OffsetSpan, +) -> bool { + insert_span(map.entry(rule_id).or_default(), span) +} fn filter_match<'b>( blob: &'b Blob, // rule: &'b Rule, @@ -495,7 +538,7 @@ fn filter_match<'b>( start: usize, end: usize, matches: &mut Vec>, - previous_matches: &mut Vec<(usize, OffsetSpan)>, + previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>, rule_id: usize, seen_matches: &mut FxHashSet, _origin: &OriginSet, @@ -544,11 +587,7 @@ fn filter_match<'b>( if !seen_matches.insert(match_key) { continue; } - if 
previous_matches.iter().any(|(prev_rule_id, prev_loc)| { - *prev_rule_id == rule_id - && (prev_loc.fully_contains(&matching_input_offset_span) - || matching_input_offset_span.fully_contains(prev_loc)) - }) { + if !record_match(previous_matches, rule_id, matching_input_offset_span) { continue; } let only_matching_input = @@ -567,7 +606,6 @@ fn filter_match<'b>( calculated_entropy, is_base64, }); - previous_matches.push((rule_id, matching_input_offset_span)); } if let Some(t) = timer.take() { let new_count = (matches.len() - initial_len) as u64; @@ -773,7 +811,7 @@ pub struct Match { impl Match { #[inline] pub fn convert_owned_blobmatch_to_match<'a>( - loc_mapping: &'a LocationMapping, + loc_mapping: Option<&'a LocationMapping<'a>>, owned_blob_match: &'a OwnedBlobMatch, origin_type: &'a str, ) -> Self { @@ -791,7 +829,11 @@ impl Match { // The fingerprint will be based on the content of the secret. let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or(""); - let source_span = loc_mapping.get_source_span(&offset_span); + let source_span = + loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan { + start: SourcePoint { line: 0, column: 0 }, + end: SourcePoint { line: 0, column: 0 }, + }); let offset_start: u64 = owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); @@ -866,33 +908,49 @@ pub struct DecodedData { pub pos_start: usize, pub pos_end: usize, } +#[inline] +fn is_base64_byte(b: u8) -> bool { + matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/') +} + pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> { - lazy_static! { - // Require a reasonably long run of valid Base64 characters to reduce - // noise. 32 bytes corresponds to 24 decoded bytes. 
- static ref RE_BASE64: Regex = Regex::new(r"[A-Za-z0-9+/]{32,}={0,2}").unwrap(); - } let mut results = Vec::new(); - for m in RE_BASE64.find_iter(input) { - let base64_string = m.as_bytes(); - // Skip candidates whose length isn't a multiple of four – they cannot - // be valid Base64. - if base64_string.len() % 4 != 0 { - continue; + let mut i = 0; + while i < input.len() { + while i < input.len() && !is_base64_byte(input[i]) { + i += 1; } - if let Ok(decoded) = general_purpose::STANDARD.decode(base64_string) { - if let Ok(decoded_str) = std::str::from_utf8(&decoded) { - if decoded_str.is_ascii() { - results.push(DecodedData { - original: String::from_utf8_lossy(base64_string).into_owned(), - decoded: decoded_str.to_string(), - pos_start: m.start(), - pos_end: m.end(), - }); + let start = i; + while i < input.len() && is_base64_byte(input[i]) { + i += 1; + } + + let run_len = i - start; /* Base64 alphabet bytes only: the 32-byte minimum must not count `=` padding, matching the old `{32,}={0,2}` pattern */ + let mut eq_count = 0; + while i < input.len() && input[i] == b'=' && eq_count < 2 { + i += 1; + eq_count += 1; + } + let end = i; + + let len = end - start; + if run_len >= 32 && len % 4 == 0 { + let base64_slice = &input[start..end]; + if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) { + if let Ok(decoded_str) = std::str::from_utf8(&decoded) { + if decoded_str.is_ascii() { + results.push(DecodedData { + original: String::from_utf8_lossy(base64_slice).into_owned(), + decoded: decoded_str.to_string(), + pos_start: start, + pos_end: end, + }); + } } } } } + results } diff --git a/src/scanner/processing.rs b/src/scanner/processing.rs index 410233c..b3535d7 100644 --- a/src/scanner/processing.rs +++ b/src/scanner/processing.rs @@ -12,6 +12,8 @@ use crate::{ Path, }; +const LOCATION_LIMIT_BYTES: usize = 256 * 1024 * 1024; + /// A matcher along with parameters that remain constant during a single /// `scan` run pub struct BlobProcessor<'a> { @@ -81,12 +83,17 @@ impl<'a> BlobProcessor<'a> { } } - let loc_mapping = LocationMapping::new(&blob.bytes()); + let bytes = blob.bytes(); + let 
loc_mapping = if bytes.len() <= LOCATION_LIMIT_BYTES { + Some(LocationMapping::new(bytes)) + } else { + None + }; let converted_matches: Vec<(Option, Match)> = matches .into_iter() .map(|m| { let converted_match = Match::convert_owned_blobmatch_to_match( - &loc_mapping, + loc_mapping.as_ref(), &OwnedBlobMatch::from_blob_match(m), origin_type, );