diff --git a/src/matcher.rs.orig b/src/matcher.rs.orig deleted file mode 100644 index 2688875..0000000 --- a/src/matcher.rs.orig +++ /dev/null @@ -1,1317 +0,0 @@ -use std::{ - hash::{Hash, Hasher}, - str, - sync::{Arc, Mutex}, -}; - -use anyhow::Result; -use base64::{engine::general_purpose, Engine}; -use bstr::BString; -use http::StatusCode; -use regex::bytes::Regex; -use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; -use schemars::{ - gen::SchemaGenerator, - schema::{ArrayValidation, InstanceType, Schema}, - JsonSchema, -}; -use serde::{Deserialize, Serialize}; -use smallvec::SmallVec; -use tracing::debug; -use xxhash_rust::xxh3::xxh3_64; - -use crate::{ - blob::{Blob, BlobId, BlobIdMap}, - entropy::calculate_shannon_entropy, - inline_ignore::InlineIgnoreConfig, - location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, - origin::OriginSet, - parser, - parser::{Checker, Language}, - rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, - rules::rule::Rule, - rules_database::RulesDatabase, - safe_list::{is_safe_match, is_user_match}, - scanner_pool::ScannerPool, - snippet::Base64BString, - util::{intern, redact_value}, -}; - -const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment -const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches -const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs -const TREE_SITTER_MAX_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs <= 64 KiB -const TREE_SITTER_MIN_LIMIT: usize = 1 * 1024; // only run tree-sitter on blobs >= 1 KiB - -// ------------------------------------------------------------------------------------------------- -// RawMatch -// ------------------------------------------------------------------------------------------------- -/// A raw match, as recorded by a callback to Vectorscan. -/// -/// When matching with Vectorscan, we simply collect all matches into a -/// preallocated `Vec`, and then go through them all after scanning is complete. -#[derive(PartialEq, Eq, Debug, Clone)] -struct RawMatch { - rule_id: u32, - start_idx: u64, - end_idx: u64, -} -#[derive(Clone)] -pub struct OwnedBlobMatch { - pub rule: Arc, - pub blob_id: BlobId, - /// The unique content-based identifier of this match - pub finding_fingerprint: u64, - pub matching_input_offset_span: OffsetSpan, - pub captures: SerializableCaptures, - pub validation_response_body: String, - pub validation_response_status: StatusCode, - pub validation_success: bool, - pub calculated_entropy: f32, - pub is_base64: bool, -} -impl<'a> Matcher<'a> { - pub fn get_profiling_report(&self) -> Option> { - self.profiler.as_ref().map(|p| p.generate_report()) - } -} -impl OwnedBlobMatch { - pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc) -> OwnedBlobMatch { - OwnedBlobMatch { - rule, - blob_id: m.blob_id, - finding_fingerprint: m.finding_fingerprint, - // matching_input: m.snippet.matching.0.to_vec(), - matching_input_offset_span: m.location.offset_span.clone(), - captures: m.groups.clone(), - validation_response_body: m.validation_response_body.clone(), - validation_response_status: StatusCode::from_u16(m.validation_response_status) - .unwrap_or(StatusCode::CONTINUE), - validation_success: m.validation_success, - calculated_entropy: m.calculated_entropy, - is_base64: m.is_base64, - } - } - - pub fn from_blob_match(blob_match: BlobMatch) -> Self { - // Get the matching value from capture group 1 (or 0 if not available) - let matching_finding = blob_match - .captures - .captures - .get(1) - .or_else(|| blob_match.captures.captures.get(0)) - .map(|capture| capture.value.as_bytes().to_vec()) - .unwrap_or_else(Vec::new); - - let mut owned_blob_match = OwnedBlobMatch { - rule: blob_match.rule, - blob_id: blob_match.blob_id.clone(), - matching_input_offset_span: blob_match.matching_input_offset_span, - captures: blob_match.captures.clone(), - validation_response_body: blob_match.validation_response_body, - validation_response_status: blob_match.validation_response_status, - validation_success: blob_match.validation_success, - calculated_entropy: blob_match.calculated_entropy, - finding_fingerprint: 0, //default - is_base64: blob_match.is_base64, - }; - - // Convert matching_finding to a &str (using lossy conversion if needed) - let finding_value = std::str::from_utf8(&matching_finding).unwrap_or(""); - // Use blob_id as the file/commit identifier - let file_or_commit = &blob_match.blob_id.to_string(); - - let offset_start: u64 = - owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); - let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); - - owned_blob_match.finding_fingerprint = - compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end); - - owned_blob_match - } -} -// ------------------------------------------------------------------------------------------------- -// BlobMatch -// ------------------------------------------------------------------------------------------------- -/// A `BlobMatch` is the result type from `Matcher::scan_blob`. -/// -/// It is mostly made up of references and small data. -/// For a representation that is more friendly for human consumption, see -/// `Match`. -pub struct BlobMatch<'a> { - /// The rule that was matched - pub rule: Arc, // Changed from `&'a Rule` to `Arc, - - pub validation_response_body: String, - pub validation_response_status: StatusCode, - - pub validation_success: bool, - pub calculated_entropy: f32, - pub is_base64: bool, -} -#[derive(Clone)] -struct UserData { - /// A scratch vector for raw matches from Vectorscan, to minimize allocation - raw_matches_scratch: Vec, - - /// The length of the input being scanned - input_len: u64, -} -// ------------------------------------------------------------------------------------------------- -// Matcher -// ------------------------------------------------------------------------------------------------- -/// A `Matcher` is able to scan inputs for matches from rules in a -/// `RulesDatabase`. -/// -/// If doing multi-threaded scanning, use a separate `Matcher` for each thread. -#[derive(Clone)] -pub struct Matcher<'a> { - /// Thread-local pool that hands out a &mut BlockScanner - scanner_pool: std::sync::Arc, - - /// The rules database used for matching - rules_db: &'a RulesDatabase, - - /// Local statistics for this `Matcher` - local_stats: MatcherStats, - - /// Global statistics, updated with the local statsistics when this - /// `Matcher` is dropped - global_stats: Option<&'a Mutex>, - - /// The set of blobs that have been seen - seen_blobs: &'a BlobIdMap, - - /// Data passed to the Vectorscan callback - user_data: UserData, - - /// Rule profiler for measuring performance of individual rules - profiler: Option>, - - /// Configuration that controls inline ignore directives - inline_ignore_config: InlineIgnoreConfig, -} -/// This `Drop` implementation updates the `global_stats` with the local stats -impl<'a> Drop for Matcher<'a> { - fn drop(&mut self) { - if let Some(global_stats) = self.global_stats { - let mut global_stats = global_stats.lock().unwrap(); - global_stats.update(&self.local_stats); - } - } -} -pub enum ScanResult<'a> { - SeenWithMatches, - SeenSansMatches, - New(Vec>), -} -impl<'a> Matcher<'a> { - /// Create a new `Matcher` from the given `RulesDatabase`. - /// - /// If `global_stats` is provided, it will be updated with the local stats - /// from this `Matcher` when it is dropped. - pub fn new( - rules_db: &'a RulesDatabase, - scanner_pool: Arc, - seen_blobs: &'a BlobIdMap, - global_stats: Option<&'a Mutex>, - enable_profiling: bool, - shared_profiler: Option>, - include_external_ignore_syntax: bool, - ) -> Result { - // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec - let raw_matches_scratch = Vec::new(); - let user_data = UserData { raw_matches_scratch, input_len: 0 }; - // let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?; - // pool is created once per scan run (see Scanner section below) - let profiler = shared_profiler.or_else(|| { - if enable_profiling { - Some(Arc::new(ConcurrentRuleProfiler::new())) - } else { - None - } - }); - Ok(Matcher { - scanner_pool, - rules_db, - local_stats: MatcherStats::default(), - global_stats, - seen_blobs, - user_data, - profiler, - inline_ignore_config: InlineIgnoreConfig::new(include_external_ignore_syntax), - }) - } - - fn scan_bytes_raw(&mut self, input: &[u8], _filename: &str) -> Result<()> { - // Remember previous peak automatically - let prev_capacity = self.user_data.raw_matches_scratch.capacity(); - self.user_data.raw_matches_scratch.clear(); - self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64)); - - self.user_data.input_len = input.len() as u64; - - let mut offset: usize = 0; - while offset < input.len() { - let end = (offset + MAX_CHUNK_SIZE).min(input.len()); - let slice = &input[offset..end]; - let base = offset as u64; - self.scanner_pool.with(|scanner| { - scanner.scan(slice, |rule_id, from, to, _flags| { - self.user_data.raw_matches_scratch.push(RawMatch { - rule_id, - start_idx: from + base, - end_idx: to + base, - }); - vectorscan_rs::Scan::Continue - }) - })?; - - if end == input.len() { - break; - } - offset = end.saturating_sub(CHUNK_OVERLAP); - } - - Ok(()) - } - - pub fn scan_blob<'b>( - &mut self, - blob: &'b Blob, - origin: &OriginSet, - lang: Option, - redact: bool, - no_dedup: bool, - no_base64: bool, - ) -> Result> - where - 'a: 'b, - { - // Update local stats - self.local_stats.blobs_seen += 1; - self.local_stats.bytes_seen += blob.bytes().len() as u64; - self.local_stats.blobs_scanned += 1; - self.local_stats.bytes_scanned += blob.bytes().len() as u64; - - // Extract filename from origin - let filename = origin - .first() - .blob_path() - .and_then(|path| path.file_name()) - .and_then(|name| name.to_str()) - .unwrap_or("unknown_file") - .to_string(); - // Perform the scan - self.scan_bytes_raw(&blob.bytes(), &filename)?; - - // Opportunistically look for standalone Base64 blobs. If neither - // the raw scan nor this check yields anything, we can return early - // before doing any heavier work. - let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT { - Vec::new() - } else { - get_base64_strings(blob.bytes()) - }; - - let lang_hint = lang.as_deref(); - let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty(); - let has_base64_items = !b64_items.is_empty(); - - if !has_raw_matches && !has_base64_items { - return Ok(ScanResult::New(Vec::new())); - } - - let rules_db = self.rules_db; - let mut seen_matches = FxHashSet::default(); - let mut previous_matches: FxHashMap> = FxHashMap::default(); - - let blob_len = blob.len(); - - let should_run_tree_sitter = blob_len > 0 - && blob_len <= TREE_SITTER_MAX_LIMIT - && blob_len >= TREE_SITTER_MIN_LIMIT - && has_raw_matches - && lang_hint.is_some() - && !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled - - let tree_sitter_result = if should_run_tree_sitter { - lang_hint.and_then(|lang_str| { - get_language_and_queries(lang_str).and_then(|(language, queries)| { - let checker = Checker { language, rules: queries }; - match checker.check(&blob.bytes()) { - Ok(results) => Some(results), - Err(e) => { - println!("Error in checker.check: {}", e); - None - } - } - }) - }) - } else { - None - }; - // Process matches - let mut matches = Vec::new(); - let owned_ts_results = tree_sitter_result.map(|ts_results| { - ts_results - .into_iter() - .filter(|match_result| match_result.is_base64_decoded) - .map(|match_result| { - ( - match_result.range, - match_result.text, - match_result.is_base64_decoded, - match_result.original_base64, - ) - }) - .collect::>() - }); - let mut previous_raw_matches: FxHashMap> = FxHashMap::default(); - for &RawMatch { rule_id, start_idx, end_idx } in - self.user_data.raw_matches_scratch.iter().rev() - { - let rule_id_usize: usize = rule_id as usize; - let rule = Arc::clone(&rules_db.rules[rule_id_usize]); - let re = &rules_db.anchored_regexes[rule_id_usize]; - let start_idx_usize = start_idx as usize; - let end_idx_usize = end_idx as usize; - let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize); - if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) { - continue; - } - filter_match( - blob, - rule, - re, - start_idx_usize, - end_idx_usize, - &mut matches, - &mut previous_matches, - rule_id_usize, - &mut seen_matches, - origin, - None, - false, - redact, - &filename, - self.profiler.as_ref(), - &self.inline_ignore_config, - ); - } - // If tree-sitter produced base64-decoded matches, try them against all rules - if let Some(ref ts_results) = owned_ts_results { - for (ts_range, ts_match, is_base64_decoded, _original_base64) in ts_results.iter() { - if *is_base64_decoded { - for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { - let re = &rules_db.anchored_regexes[rule_id_usize]; - filter_match( - blob, - rule.clone(), - re, - ts_range.start, - ts_range.end, - &mut matches, - &mut previous_matches, - rule_id_usize, - &mut seen_matches, - origin, - Some(ts_match.as_bytes()), - *is_base64_decoded, - redact, - &filename, - self.profiler.as_ref(), - &self.inline_ignore_config, - ); - } - } - } - } - - if !no_base64 { - // If the blob contains standalone Base64 blobs, decode and scan them as well - const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep - let mut b64_stack: Vec<(DecodedData, usize)> = - b64_items.drain(..).map(|d| (d, 0)).collect(); - while let Some((item, depth)) = b64_stack.pop() { - for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { - let re = &rules_db.anchored_regexes[rule_id_usize]; - filter_match( - blob, - rule.clone(), - re, - item.pos_start, - item.pos_end, - &mut matches, - &mut previous_matches, - rule_id_usize, - &mut seen_matches, - origin, - Some(item.decoded.as_bytes()), - true, - redact, - &filename, - self.profiler.as_ref(), - &self.inline_ignore_config, - ); - } - if depth + 1 < MAX_B64_DEPTH { - for nested in get_base64_strings(item.decoded.as_bytes()) { - b64_stack.push(( - DecodedData { - original: nested.original, - decoded: nested.decoded, - pos_start: item.pos_start, - pos_end: item.pos_end, - }, - depth + 1, - )); - } - } - } - } - // Finalize - if !no_dedup && !matches.is_empty() { - let blob_id = blob.id(); - if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) { - return Ok(if had_matches { - ScanResult::SeenWithMatches - } else { - ScanResult::SeenSansMatches - }); - } - } - - // --- opportunistic capacity cap --------------------------------- - if self.user_data.raw_matches_scratch.capacity() - > self.user_data.raw_matches_scratch.len() * 4 - { - // Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob - // so the cost is hidden off the hot path. - self.user_data.raw_matches_scratch.shrink_to_fit(); - } - - Ok(ScanResult::New(matches)) - // Ok(result) - } -} - -#[inline] -fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 { - let mut hasher = FxHasher::default(); - // Hash each component directly without allocation - content.hash(&mut hasher); - rule_id.hash(&mut hasher); - start.hash(&mut hasher); - end.hash(&mut hasher); - hasher.finish() -} - -#[inline] -fn insert_span(spans: &mut Vec, span: OffsetSpan) -> bool { - let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i); - if idx > 0 { - if spans[idx - 1].fully_contains(&span) { - return false; - } - if span.fully_contains(&spans[idx - 1]) { - spans.remove(idx - 1); - idx -= 1; - } - } - if idx < spans.len() { - if spans[idx].fully_contains(&span) { - return false; - } - if span.fully_contains(&spans[idx]) { - spans.remove(idx); - } - } - spans.insert(idx, span); - true -} - -#[inline] -fn record_match( - map: &mut FxHashMap>, - rule_id: usize, - span: OffsetSpan, -) -> bool { - insert_span(map.entry(rule_id).or_default(), span) -} -fn filter_match<'b>( - blob: &'b Blob, - // rule: &'b Rule, - rule: Arc, - re: &Regex, - start: usize, - end: usize, - matches: &mut Vec>, - previous_matches: &mut FxHashMap>, - rule_id: usize, - seen_matches: &mut FxHashSet, - _origin: &OriginSet, - ts_match: Option<&[u8]>, - is_base64: bool, - redact: bool, - filename: &str, - profiler: Option<&Arc>, - inline_ignore_config: &InlineIgnoreConfig, -) { - let mut timer = - profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename)); - - let initial_len = matches.len(); - - let blob_bytes = blob.bytes(); - let default_slice = &blob_bytes[start..end]; - let haystack = ts_match.unwrap_or(default_slice); - - for captures in re.captures_iter(haystack) { - let full_capture = captures.get(0).unwrap(); - let matching_input = captures.get(1).unwrap_or(full_capture); - let min_entropy = rule.min_entropy(); - let mi_bytes = matching_input.as_bytes(); - let full_bytes = full_capture.as_bytes(); - let calculated_entropy = calculate_shannon_entropy(mi_bytes); - if calculated_entropy <= min_entropy - || is_safe_match(mi_bytes) - || is_user_match(mi_bytes, full_bytes) - { - debug!( - "Skipping match with entropy {} <= {} or safe match", - calculated_entropy, min_entropy - ); - continue; - } - let matching_input_offset_span = OffsetSpan::from_range( - (start + matching_input.start())..(start + matching_input.end()), - ); - if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) { - debug!("Skipping match due to inline ignore directive"); - continue; - } - let match_key = compute_match_key( - matching_input.as_bytes(), - rule.id().as_bytes(), - matching_input_offset_span.start, - matching_input_offset_span.end, - ); - if !seen_matches.insert(match_key) { - continue; - } - if !record_match(previous_matches, rule_id, matching_input_offset_span) { - continue; - } - let only_matching_input = - &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end]; - let groups = SerializableCaptures::from_captures(&captures, haystack, re, redact); - matches.push(BlobMatch { - rule: Arc::clone(&rule), - blob_id: blob.id_ref(), - matching_input: only_matching_input, - matching_input_offset_span, - captures: groups, - validation_response_body: String::new(), - validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE), - validation_success: false, - calculated_entropy, - is_base64, - }); - } - if let Some(t) = timer.take() { - let new_count = (matches.len() - initial_len) as u64; - t.end(new_count > 0, new_count, 0); - } -} -fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap)> { - match lang.to_lowercase().as_str() { - "bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())), - "c" => Some((Language::C, parser::queries::c::get_c_queries())), - "c#" | "csharp" => Some((Language::CSharp, parser::queries::csharp::get_csharp_queries())), - "c++" | "cpp" => Some((Language::Cpp, parser::queries::cpp::get_cpp_queries())), - "css" => Some((Language::Css, parser::queries::css::get_css_queries())), - "go" => Some((Language::Go, parser::queries::go::get_go_queries())), - "html" => Some((Language::Html, parser::queries::html::get_html_queries())), - "java" => Some((Language::Java, parser::queries::java::get_java_queries())), - "javascript" | "js" => { - Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries())) - } - // "kotlin" => Some(( - // Language::Kotlin, - // parser::queries::kotlin::get_kotlin_queries(), - // )), - "php" => Some((Language::Php, parser::queries::php::get_php_queries())), - "python" | "py" | "starlark" => { - Some((Language::Python, parser::queries::python::get_python_queries())) - } - "ruby" => Some((Language::Ruby, parser::queries::ruby::get_ruby_queries())), - "rust" => Some((Language::Rust, parser::queries::rust::get_rust_queries())), - "toml" => Some((Language::Toml, parser::queries::toml::get_toml_queries())), - "typescript" | "ts" => { - Some((Language::TypeScript, parser::queries::typescript::get_typescript_queries())) - } - "yaml" => Some((Language::Yaml, parser::queries::yaml::get_yaml_queries())), - _ => None, - } -} -// ------------------------------------------------------------------------------------------------- -// MatchStats -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Default, Clone)] -pub struct MatcherStats { - pub blobs_seen: u64, - pub blobs_scanned: u64, - pub bytes_seen: u64, - pub bytes_scanned: u64, - // #[cfg(feature = "rule_profiling")] - // pub rule_stats: crate::rule_profiling::RuleProfile, -} -impl MatcherStats { - pub fn update(&mut self, other: &Self) { - self.blobs_seen += other.blobs_seen; - self.blobs_scanned += other.blobs_scanned; - self.bytes_seen += other.bytes_seen; - self.bytes_scanned += other.bytes_scanned; - - // #[cfg(feature = "rule_profiling")] - // self.rule_stats.update(&other.rule_stats); - } -} -// ------------------------------------------------------------------------------------------------- -// Group -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)] -pub struct Group(pub Base64BString); -impl Group { - pub fn new(m: regex::bytes::Match<'_>) -> Self { - Self(Base64BString(BString::from(m.as_bytes()))) - } -} -// ------------------------------------------------------------------------------------------------- -// Groups -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub struct Groups(pub SmallVec<[Group; 1]>); -impl JsonSchema for Groups { - fn schema_name() -> String { - "Groups".to_string() - } - - fn json_schema(gen: &mut SchemaGenerator) -> Schema { - let group_schema = gen.subschema_for::(); - Schema::Object(schemars::schema::SchemaObject { - instance_type: Some(InstanceType::Array.into()), - array: Some(Box::new(ArrayValidation { - items: Some(group_schema.into()), - ..Default::default() - })), - ..Default::default() - }) - } -} -// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] -// pub struct SerializableCapture { -// pub name: Option, // Named group (if available) -// pub match_number: i32, -// pub start: usize, // Start position of the match -// pub end: usize, // End position of the match -// pub value: String, // The actual captured value -// } -#[derive(Debug, Clone, Serialize, JsonSchema)] -pub struct SerializableCapture { - pub name: Option, - pub match_number: i32, - pub start: usize, - pub end: usize, - /// Interned value of the capture. - pub value: &'static str, -} - -#[derive(Debug, Clone, Serialize, JsonSchema)] -pub struct SerializableCaptures { - #[schemars(with = "Vec")] - pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed) -} -impl SerializableCaptures { - pub fn from_captures( - captures: ®ex::bytes::Captures, - _input: &[u8], - re: &Regex, - redact: bool, - ) -> Self { - let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new(); - // Process named captures - for name in re.capture_names().flatten() { - if let Some(capture) = captures.name(name) { - let value = if redact { - redact_value(&String::from_utf8_lossy(capture.as_bytes())) - } else { - String::from_utf8_lossy(capture.as_bytes()).to_string() - }; - serialized_captures.push(SerializableCapture { - name: Some(name.to_string()), - match_number: -1, - start: capture.start(), - end: capture.end(), - value: intern(&value), - }); - } - } - // Process unnamed captures (numbered groups) - for i in 0..captures.len() { - if let Some(capture) = captures.get(i) { - let value = if redact { - redact_value(&String::from_utf8_lossy(capture.as_bytes())) - } else { - String::from_utf8_lossy(capture.as_bytes()).to_string() - }; - serialized_captures.push(SerializableCapture { - name: None, - match_number: i32::try_from(i).unwrap_or(0), - start: capture.start(), - end: capture.end(), - value: intern(&value), - }); - } - } - SerializableCaptures { captures: serialized_captures } - } -} -// ------------------------------------------------------------------------------------------------- -// Match -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Clone, Serialize, JsonSchema)] -pub struct Match { - /// The location of the entire matching content - pub location: Location, - - /// The capture groups - pub groups: SerializableCaptures, // Store serialized captures - - /// unique identifier of file / blob where this match was found - pub blob_id: BlobId, - - /// The unique content-based identifier of this match - pub finding_fingerprint: u64, - - /// The rule that produced this match - #[serde(skip_serializing)] - #[schemars(skip)] - pub rule: Arc, - - /// Validation Body - pub validation_response_body: String, - - /// Validation Status Code - pub validation_response_status: u16, - - /// Validation Success - pub validation_success: bool, - - /// Validation Success - pub calculated_entropy: f32, - - pub visible: bool, - #[serde(default)] - pub is_base64: bool, -} -impl Match { - #[inline] - pub fn convert_owned_blobmatch_to_match<'a>( - loc_mapping: Option<&'a LocationMapping<'a>>, - owned_blob_match: &'a OwnedBlobMatch, - origin_type: &'a str, - ) -> Self { - let offset_span = owned_blob_match.matching_input_offset_span; - // Extract the matched secret content. Use capture group 1 if it exists, otherwise fall back - // to group 0. - let matching_finding_bytes = owned_blob_match - .captures - .captures - .get(1) - .or_else(|| owned_blob_match.captures.captures.get(0)) - .map(|capture| capture.value.as_bytes()) - .unwrap_or_default(); - - // The fingerprint will be based on the content of the secret. - let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or(""); - - let source_span = - loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan { - start: SourcePoint { line: 0, column: 0 }, - end: SourcePoint { line: 0, column: 0 }, - }); - let offset_start: u64 = - owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); - let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); - - let finding_fingerprint = compute_finding_fingerprint( - finding_value_for_fp, - origin_type, // file_or_commit, - offset_start, - offset_end, - ); - - // matching_snippet - Match { - rule: owned_blob_match.rule.clone(), - visible: owned_blob_match.rule.visible().to_owned(), - location: Location { offset_span, source_span: source_span.clone() }, - groups: owned_blob_match.captures.clone(), - blob_id: owned_blob_match.blob_id, - finding_fingerprint, - validation_response_body: owned_blob_match.validation_response_body.clone(), - validation_response_status: owned_blob_match.validation_response_status.as_u16(), - validation_success: owned_blob_match.validation_success, - calculated_entropy: owned_blob_match.calculated_entropy, - is_base64: owned_blob_match.is_base64, - } - } - - /// Returns the `blob_id` of the match. - pub fn get_blob_id(&self) -> BlobId { - self.blob_id.clone() - } - - pub fn finding_id(&self) -> String { - let mut buffer = Vec::with_capacity(128); - buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes()); - buffer.push(0); - serde_json::to_writer(&mut buffer, &self.groups) - .expect("should be able to serialize groups as JSON"); - let mut num = xxh3_64(&buffer); - // Ensure the number is positive and within i64 range - num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive - // Convert to string - num.to_string() - } -} -#[derive(Debug, Clone)] -pub struct DecodedData { - pub original: String, - pub decoded: String, - pub pos_start: usize, - pub pos_end: usize, -} -#[inline] -fn is_base64_byte(b: u8) -> bool { - matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/') -} - -pub fn get_base64_strings(input: &[u8]) -> Vec { - let mut results = Vec::new(); - let mut i = 0; - while i < input.len() { - while i < input.len() && !is_base64_byte(input[i]) { - i += 1; - } - let start = i; - while i < input.len() && is_base64_byte(input[i]) { - i += 1; - } - - let mut eq_count = 0; - while i < input.len() && input[i] == b'=' && eq_count < 2 { - i += 1; - eq_count += 1; - } - let end = i; - - let len = end - start; - if len >= 32 && len % 4 == 0 { - let base64_slice = &input[start..end]; - if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) { - if let Ok(decoded_str) = std::str::from_utf8(&decoded) { - if decoded_str.is_ascii() { - results.push(DecodedData { - original: String::from_utf8_lossy(base64_slice).into_owned(), - decoded: decoded_str.to_string(), - pos_start: start, - pos_end: end, - }); - } - } - } - } - } - - results -} - -pub fn compute_finding_fingerprint( - finding_value: &str, - file_or_commit: &str, - offset_start: u64, - offset_end: u64, -) -> u64 { - // Combine all into a byte buffer and hash it directly: - let mut buf = Vec::with_capacity( - finding_value.len() + file_or_commit.len() + 2 * std::mem::size_of::(), - ); - buf.extend_from_slice(finding_value.as_bytes()); - buf.extend_from_slice(file_or_commit.as_bytes()); - buf.extend_from_slice(&offset_start.to_le_bytes()); - buf.extend_from_slice(&offset_end.to_le_bytes()); - - xxh3_64(&buf) -} - -// ------------------------------------------------------------------------------------------------- -// test -// ------------------------------------------------------------------------------------------------- -#[cfg(test)] -mod test { - use std::{collections::BTreeMap, path::PathBuf}; - - use pretty_assertions::assert_eq; - // --------------------------------------------------------------------- - // proptest: raw-match dedup + entropy gate - // --------------------------------------------------------------------- - use proptest::prelude::*; - - use super::*; - use crate::{ - blob::{Blob, BlobIdMap}, - origin::{Origin, OriginSet}, - rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}, - }; - - proptest! { - #[test] - fn prop_no_dupes_and_entropy( - // random ASCII up to 300 bytes - mut noise in proptest::collection::vec(any::().prop_filter("ascii", |b| b.is_ascii()), 0..300), - // 0-4 random insertion points - inserts in proptest::collection::vec(0usize..300, 0..5) - ) { - // Constant high-entropy secret token that matches the rule below - const TOKEN: &[u8] = b"secret_abcd1234"; - - // Splice the token at the requested offsets - for &idx in &inserts { - let pos = idx.min(noise.len()); - noise.splice(pos..pos, TOKEN.iter().copied()); - } - - // ── build a single test rule ────────────────────────────────── - use crate::rules::rule::{RuleSyntax, Validation, Confidence}; - - let rule = Rule::new(RuleSyntax { - id: "prop.secret".into(), - name: "prop secret".into(), - pattern: "secret_[a-z]{4}[0-9]{4}".into(), - confidence: Confidence::Low, - min_entropy: 3.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, // no HTTP validation needed - depends_on_rule: vec![], - }); - - let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap(); - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false).unwrap(); - - // ── run the scan ────────────────────────────────────────────── - m.scan_bytes_raw(&noise, "buf").unwrap(); - - // ── property 1: dedup – each (rule,start,end) is unique ────── - - let mut coords = FxHashSet::default(); - for RawMatch{rule_id, start_idx, end_idx} in &m.user_data.raw_matches_scratch { - assert!( - coords.insert((*rule_id, *start_idx, *end_idx)), - "duplicate raw-match detected for coords ({rule_id},{start_idx},{end_idx})" - ); - - // ── property 2: entropy gate held ──────────────────────── - let slice = &noise[*start_idx as usize .. *end_idx as usize]; - let ent = calculate_shannon_entropy(slice); - assert!(ent > 3.0, "entropy {ent} ≤ min_entropy, gate failed"); - } - } - } - - #[test] - pub fn test_simple() -> Result<()> { - let rules = vec![Rule::new(RuleSyntax { - id: "test.1".to_string(), - name: "test".to_string(), - pattern: "test".to_string(), - confidence: crate::rules::rule::Confidence::Medium, - min_entropy: 1.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: Some(Validation::Http(HttpValidation { - request: HttpRequest { - method: "GET".to_string(), - url: "https://example.com".to_string(), - headers: BTreeMap::new(), - body: None, - response_matcher: Some(vec![]), - multipart: None, - response_is_html: false, - }, - multipart: None, - })), - depends_on_rule: vec![ - Some(DependsOnRule { - rule_id: "d8f3c34b-015f-4cd6-b411-b1366493104c".to_string(), - variable: "email".to_string(), - }), - Some(DependsOnRule { - rule_id: "8910f364-7718-4a27-a435-d2da13e6ba9e".to_string(), - variable: "domain".to_string(), - }), - ], - })]; - let rules_db = RulesDatabase::from_rules(rules)?; - let input = "some test data for vectorscan"; - let seen_blobs: BlobIdMap = BlobIdMap::new(); - let enable_rule_profiling = true; - // let mut matcher = Matcher::new(&rules_db, &seen_blobs, None, - // enable_rule_profiling)?; - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new( - &rules_db, - scanner_pool, - &seen_blobs, - None, - enable_rule_profiling, - None, // Pass the shared profiler - false, - )?; - matcher.scan_bytes_raw(input.as_bytes(), "fname")?; - assert_eq!( - matcher.user_data.raw_matches_scratch, - vec![RawMatch { rule_id: 0, start_idx: 0, end_idx: 9 },] - ); - Ok(()) - } - - // --------------------------------------------------------------------- - // additional deterministic unit-tests - // --------------------------------------------------------------------- - - /// `get_base64_strings` should recognise a well-formed token, decode it, - /// and report correct byte-offsets. - #[test] - fn test_get_base64_strings_basic() { - let raw = b"foo MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY= bar"; - // decodes to "0123456789abcdef0123456789abcdef" - let hits = get_base64_strings(raw); - assert_eq!(hits.len(), 1); - let item = &hits[0]; - assert_eq!(item.decoded, "0123456789abcdef0123456789abcdef"); - assert_eq!(item.original, "MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY="); - // "foo␠" is 4 bytes, so the start offset is 4 - assert_eq!((item.pos_start, item.pos_end), (4, 4 + item.original.len())); - } - - /// `compute_finding_fingerprint` must be stable (same input ⇒ same output) - /// and sensitive to any input component. - #[test] - fn test_finding_fingerprint_stability_and_uniqueness() { - let a = compute_finding_fingerprint("secret", "fileA", 0, 6); - let b = compute_finding_fingerprint("secret", "fileA", 0, 6); - assert_eq!(a, b, "fingerprint should be deterministic"); - - // changing any parameter should perturb the hash - let c = compute_finding_fingerprint("secret", "fileA", 1, 7); // offsets differ - let d = compute_finding_fingerprint("secret", "fileB", 0, 6); // file id differs - let e = compute_finding_fingerprint("different", "fileA", 0, 6); // content differs - assert_ne!(a, c); - assert_ne!(a, d); - assert_ne!(a, e); - } - - /// The (private) `compute_match_key` helper is the linchpin of the raw-dedup - /// path. It should return identical keys for identical inputs and different - /// keys as soon as *anything* changes. - #[test] - fn test_compute_match_key_uniqueness() { - use super::compute_match_key; - - let k1 = compute_match_key(b"abc", b"rule-1", 0, 3); - let k2 = compute_match_key(b"abc", b"rule-1", 0, 3); - assert_eq!(k1, k2); - - // mutate each component in turn - let diff_content = compute_match_key(b"abcd", b"rule-1", 0, 4); - let diff_rule = compute_match_key(b"abc", b"rule-2", 0, 3); - let diff_span = compute_match_key(b"abc", b"rule-1", 1, 4); - assert_ne!(k1, diff_content); - assert_ne!(k1, diff_rule); - assert_ne!(k1, diff_span); - } - - /// Running `scan_bytes_raw` twice over the *same* input should never record - /// duplicate entries in `raw_matches_scratch`. - #[test] - fn test_scan_bytes_raw_no_duplicate_raw_matches() -> Result<()> { - // simple rule: literal "dup" - let rule = Rule::new(RuleSyntax { - id: "dup.check".into(), - name: "dup".into(), - pattern: "dup".into(), - confidence: crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - - let rules_db = RulesDatabase::from_rules(vec![rule])?; - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - - let buf = b"dup dup"; // two literal hits, same rule - - // first scan - m.scan_bytes_raw(buf, "buf1")?; - let first_len = m.user_data.raw_matches_scratch.len(); - - // second scan over the same buffer - m.scan_bytes_raw(buf, "buf1")?; - let second_len = m.user_data.raw_matches_scratch.len(); - - // we should still only have two unique raw matches recorded - assert_eq!(first_len, 2); - assert_eq!(second_len, 2); - Ok(()) - } - - #[test] - fn inline_comment_skips_match() -> Result<()> { - let rule = Rule::new(RuleSyntax { - id: "inline.ignore".into(), - name: "inline".into(), - pattern: "secret_token".into(), - confidence: crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - let rules_db = RulesDatabase::from_rules(vec![rule])?; - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - - let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec()); - let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt"))); - - match matcher.scan_blob(&blob, &origin, None, false, false, false)? { - ScanResult::New(matches) => assert!(matches.is_empty()), - _ => panic!("unexpected scan result"), - } - - Ok(()) - } - - #[test] - fn inline_comment_after_multiline_secret_skips_match() -> Result<()> { - let rule = Rule::new(RuleSyntax { - id: "inline.multiline".into(), - name: "inline multiline".into(), - pattern: "line1\\s+line2".into(), - confidence: crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - let rules_db = RulesDatabase::from_rules(vec![rule])?; - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - - let blob = Blob::from_bytes( - br#"let data = """ -line1 -line2 -""" -# kingfisher:ignore -"# - .to_vec(), - ); - let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt"))); - - match matcher.scan_blob(&blob, &origin, None, false, false, false)? { - ScanResult::New(matches) => assert!(matches.is_empty()), - _ => panic!("unexpected scan result"), - } - - Ok(()) - } - - #[test] - fn compat_flag_controls_external_directives() -> Result<()> { - let rule = Rule::new(RuleSyntax { - id: "inline.compat".into(), - name: "inline compat".into(), - pattern: "supersecret123".into(), - confidence: crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - let rules_db = RulesDatabase::from_rules(vec![rule])?; - - let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec()); - let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt"))); - - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - let matches_without_compat = - match matcher.scan_blob(&blob, &origin, None, false, false, false)? { - ScanResult::New(matches) => matches.len(), - _ => panic!("unexpected scan result"), - }; - assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag"); - - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true)?; - match matcher.scan_blob(&blob, &origin, None, false, false, false)? { - ScanResult::New(matches) => assert!(matches.is_empty()), - _ => panic!("unexpected scan result"), - } - - Ok(()) - } -} diff --git a/src/matcher.rs.rej b/src/matcher.rs.rej deleted file mode 100644 index dc61305..0000000 --- a/src/matcher.rs.rej +++ /dev/null @@ -1,52 +0,0 @@ -@@ -1,50 +1,51 @@ - use std::{ - hash::{Hash, Hasher}, - str, - sync::{Arc, Mutex}, - }; - - use anyhow::Result; - use base64::{engine::general_purpose, Engine}; - use bstr::BString; - use http::StatusCode; - use regex::bytes::Regex; - use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; - use schemars::{ - gen::SchemaGenerator, - schema::{ArrayValidation, InstanceType, Schema}, - JsonSchema, - }; - use serde::{Deserialize, Serialize}; - use smallvec::SmallVec; - use tracing::debug; - use xxhash_rust::xxh3::xxh3_64; - - use crate::{ - blob::{Blob, BlobId, BlobIdMap}, - entropy::calculate_shannon_entropy, -+ inline_ignore::InlineIgnoreConfig, - location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, - origin::OriginSet, - parser, - parser::{Checker, Language}, - rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, - rules::rule::Rule, - rules_database::RulesDatabase, - safe_list::{is_safe_match, is_user_match}, - scanner_pool::ScannerPool, - snippet::Base64BString, - util::{intern, redact_value}, - }; - - const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment - const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches - const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs - const TREE_SITTER_SCAN_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs ≤64 KiB - - // ------------------------------------------------------------------------------------------------- - // RawMatch - // ------------------------------------------------------------------------------------------------- - /// A raw match, as recorded by a callback to Vectorscan. - /// - /// When matching with Vectorscan, we simply collect all matches into a - /// preallocated `Vec`, and then go through them all after scanning is complete.