kingfisher/src/matcher/mod.rs
Mick Grove db67105221 v1.88.0
2026-03-11 21:36:30 -07:00

1247 lines
46 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

mod base64_decode;
mod captures;
mod conversion;
mod dedup;
mod filter;
mod fingerprint;
// Re-export public API
pub use base64_decode::{get_base64_strings, DecodedData};
pub use captures::{Group, Groups, SerializableCapture, SerializableCaptures};
pub use conversion::{Match, MatcherStats, OwnedBlobMatch};
pub use fingerprint::compute_finding_fingerprint;
use std::sync::{Arc, Mutex};
use anyhow::Result;
use http::StatusCode;
use rustc_hash::{FxHashMap, FxHashSet};
use tracing::debug;
use crate::{
blob::{Blob, BlobId, BlobIdMap},
inline_ignore::InlineIgnoreConfig,
location::OffsetSpan,
origin::OriginSet,
parser,
parser::{Checker, Language},
rule_profiling::{ConcurrentRuleProfiler, RuleStats},
rules::rule::Rule,
rules_database::{RuleDetectionProfileKind, RulesDatabase},
scanner_pool::ScannerPool,
validation_body::ValidationResponseBody,
};
use kingfisher_scanner::primitives::find_secret_capture;
use self::{
base64_decode::get_base64_strings as get_b64_strings, dedup::record_match, filter::filter_match,
};
/// Upper bound on the number of bytes handed to the scanner in a single
/// `scan` call; larger inputs are split into several segments.
const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
/// Number of bytes by which consecutive scan segments overlap.
const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
/// Blobs longer than this skip the standalone-Base64 discovery pass.
const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
/// Largest blob length (inclusive) for which tree-sitter is attempted.
const TREE_SITTER_MAX_LIMIT: usize = 128 * 1024; // only run tree-sitter on blobs <= 128 KiB
/// Smallest blob length (inclusive) for which tree-sitter is attempted.
const TREE_SITTER_MIN_LIMIT: usize = 0; // allow tree-sitter starting at 0 bytes
/// Reports whether a blob of `blob_len` bytes falls inside the inclusive
/// `[TREE_SITTER_MIN_LIMIT, TREE_SITTER_MAX_LIMIT]` size window in which
/// tree-sitter parsing is attempted.
#[inline]
pub(crate) fn should_attempt_tree_sitter(blob_len: usize) -> bool {
    (TREE_SITTER_MIN_LIMIT..=TREE_SITTER_MAX_LIMIT).contains(&blob_len)
}
// -------------------------------------------------------------------------------------------------
// RawMatch
// -------------------------------------------------------------------------------------------------
/// A raw match, as recorded by a callback to Vectorscan.
///
/// When matching with Vectorscan, we simply collect all matches into a
/// preallocated `Vec`, and then go through them all after scanning is complete.
#[derive(PartialEq, Eq, Debug, Clone)]
struct RawMatch {
/// Index of the matching rule; used to index `RulesDatabase::rules()` and
/// `RulesDatabase::anchored_regexes()`.
rule_id: u32,
/// Start offset reported by the scanner, rebased onto the whole input
/// (the segment's base offset has already been added).
start_idx: u64,
/// End offset reported by the scanner, rebased onto the whole input.
end_idx: u64,
}
// -------------------------------------------------------------------------------------------------
// BlobMatch
// -------------------------------------------------------------------------------------------------
/// A `BlobMatch` is the result type from `Matcher::scan_blob`.
///
/// It is mostly made up of references and small data.
/// For a representation that is more friendly for human consumption, see
/// `Match`.
pub struct BlobMatch<'a> {
/// The rule that was matched
pub rule: Arc<Rule>,
/// The blob that was matched
pub blob_id: &'a BlobId,
/// The matching input in `blob.input`
pub matching_input: &'a [u8],
/// The location of the matching input in `blob.input`
pub matching_input_offset_span: OffsetSpan,
/// The capture groups from the match
pub captures: SerializableCaptures,
/// Response body associated with validating this match.
/// NOTE(review): not populated anywhere in this file — confirm where it is set.
pub validation_response_body: ValidationResponseBody,
/// HTTP status associated with validating this match.
/// NOTE(review): not populated anywhere in this file — confirm where it is set.
pub validation_response_status: StatusCode,
/// Whether validation of this match succeeded.
/// NOTE(review): not populated anywhere in this file — confirm where it is set.
pub validation_success: bool,
/// Entropy computed for the match.
/// NOTE(review): presumably Shannon entropy of the secret bytes — confirm in `filter`.
pub calculated_entropy: f32,
/// Marks matches that come from Base64-decoded content; such matches are
/// excluded from tree-sitter verification.
pub is_base64: bool,
}
/// Per-`Matcher` scratch state that is filled in while a raw scan runs.
#[derive(Clone)]
struct UserData {
/// A scratch vector for raw matches from Vectorscan, to minimize allocation
raw_matches_scratch: Vec<RawMatch>,
/// The length of the input being scanned
input_len: u64,
}
// -------------------------------------------------------------------------------------------------
// Matcher
// -------------------------------------------------------------------------------------------------
/// A `Matcher` is able to scan inputs for matches from rules in a
/// `RulesDatabase`.
///
/// If doing multi-threaded scanning, use a separate `Matcher` for each thread.
#[derive(Clone)]
pub struct Matcher<'a> {
/// Thread-local pool that hands out a &mut BlockScanner
scanner_pool: std::sync::Arc<crate::scanner_pool::ScannerPool>,
/// The rules database used for matching
rules_db: &'a RulesDatabase,
/// Local statistics for this `Matcher`
local_stats: MatcherStats,
/// Global statistics, updated with the local statistics when this
/// `Matcher` is dropped
global_stats: Option<&'a Mutex<MatcherStats>>,
/// The set of blobs that have been seen
seen_blobs: &'a BlobIdMap<bool>,
/// Data passed to the Vectorscan callback
user_data: UserData,
/// Rule profiler for measuring performance of individual rules
profiler: Option<Arc<ConcurrentRuleProfiler>>,
/// Configuration that controls inline ignore directives
inline_ignore_config: InlineIgnoreConfig,
/// Whether matches should honour `ignore_if_contains` requirements.
respect_ignore_if_contains: bool,
}
/// This `Drop` implementation merges the local stats into `global_stats`
/// (when a global accumulator was supplied at construction time).
impl<'a> Drop for Matcher<'a> {
    fn drop(&mut self) {
        let Some(global_stats) = self.global_stats else {
            return;
        };
        // A poisoned mutex only tells us that some other thread panicked while
        // holding the lock. Panicking here as well would abort the whole
        // process whenever this drop runs during unwinding, so recover the
        // guard and merge the statistics on a best-effort basis instead.
        let mut guard = global_stats.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
        guard.update(&self.local_stats);
    }
}
/// Outcome of `Matcher::scan_blob`.
pub enum ScanResult<'a> {
/// The scan produced matches, but the blob id was already recorded in
/// `seen_blobs` with the value `true`.
SeenWithMatches,
/// The scan produced matches, but the blob id was already recorded in
/// `seen_blobs` with the value `false`.
SeenSansMatches,
/// The matches produced by this scan (possibly empty).
New(Vec<BlobMatch<'a>>),
}
impl<'a> Matcher<'a> {
    /// Returns the report generated by the attached rule profiler, or `None`
    /// when this `Matcher` has no profiler.
    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
        let profiler = self.profiler.as_ref()?;
        Some(profiler.generate_report())
    }
}
impl<'a> Matcher<'a> {
/// Build a `Matcher` that scans with the rules in `rules_db`.
///
/// When `global_stats` is `Some`, the local statistics gathered by this
/// `Matcher` are merged into it when the `Matcher` is dropped.
///
/// A `shared_profiler` takes precedence; without one, a fresh profiler is
/// created only if `enable_profiling` is set. Inline ignore handling is
/// switched off entirely by `disable_inline_ignores`, otherwise it is
/// configured with `extra_ignore_directives`.
pub fn new(
    rules_db: &'a RulesDatabase,
    scanner_pool: Arc<ScannerPool>,
    seen_blobs: &'a BlobIdMap<bool>,
    global_stats: Option<&'a Mutex<MatcherStats>>,
    enable_profiling: bool,
    shared_profiler: Option<Arc<ConcurrentRuleProfiler>>,
    extra_ignore_directives: &[String],
    disable_inline_ignores: bool,
    respect_ignore_if_contains: bool,
) -> Result<Self> {
    // Start with an empty scratch buffer; it grows on demand while scanning.
    let user_data = UserData { raw_matches_scratch: Vec::new(), input_len: 0 };

    let profiler = shared_profiler
        .or_else(|| enable_profiling.then(|| Arc::new(ConcurrentRuleProfiler::new())));

    let inline_ignore_config = match disable_inline_ignores {
        true => InlineIgnoreConfig::disabled(),
        false => InlineIgnoreConfig::new(extra_ignore_directives),
    };

    Ok(Matcher {
        scanner_pool,
        rules_db,
        local_stats: MatcherStats::default(),
        global_stats,
        seen_blobs,
        user_data,
        profiler,
        inline_ignore_config,
        respect_ignore_if_contains,
    })
}
/// Run the raw Vectorscan pass over `input`, replacing the contents of
/// `user_data.raw_matches_scratch` with the matches that were reported.
///
/// Inputs longer than `MAX_CHUNK_SIZE` are scanned in segments that overlap
/// by `CHUNK_OVERLAP` bytes; reported offsets are rebased so they are
/// relative to the start of `input`. `_filename` is currently unused.
///
/// # Errors
/// Propagates the error of the first segment scan that fails.
fn scan_bytes_raw(&mut self, input: &[u8], _filename: &str) -> Result<()> {
    let total_len = input.len();

    // Reset the scratch buffer while holding on to (at least) the capacity
    // reached by earlier scans.
    let carried_capacity = self.user_data.raw_matches_scratch.capacity();
    self.user_data.raw_matches_scratch.clear();
    self.user_data.raw_matches_scratch.reserve(carried_capacity.max(64));
    self.user_data.input_len = total_len as u64;

    let mut segment_start: usize = 0;
    while segment_start < total_len {
        let segment_end = total_len.min(segment_start + MAX_CHUNK_SIZE);
        let segment = &input[segment_start..segment_end];
        let segment_base = segment_start as u64;

        self.scanner_pool.with(|scanner| {
            scanner.scan(segment, |rule_id, from, to, _flags| {
                // Offsets arrive relative to the segment; store them relative
                // to the whole input.
                self.user_data.raw_matches_scratch.push(RawMatch {
                    rule_id,
                    start_idx: from + segment_base,
                    end_idx: to + segment_base,
                });
                vectorscan_rs::Scan::Continue
            })
        })?;

        if segment_end == total_len {
            break;
        }
        // Step back so the next segment re-covers the tail of this one.
        segment_start = segment_end.saturating_sub(CHUNK_OVERLAP);
    }
    Ok(())
}
/// Scan `blob` for rule matches and return them as `BlobMatch` values.
///
/// The scan runs in stages: a raw Vectorscan pass over the blob bytes, an
/// optional search for standalone Base64 strings (decoded at most two levels
/// deep and checked against every rule), filtering of each candidate through
/// `filter_match`, and finally tree-sitter verification via
/// `maybe_apply_tree_sitter_verification`.
///
/// * `origin` - used to derive the file name and forwarded to `filter_match`.
/// * `lang` - optional language hint for the tree-sitter stage.
/// * `redact` - forwarded to `filter_match`.
/// * `no_dedup` - when `true`, `seen_blobs` is neither consulted nor updated.
/// * `no_base64` - when `true`, Base64 discovery and decoding are skipped.
///
/// Returns `ScanResult::New` with the surviving matches, or one of the
/// `Seen*` variants when matches were found but the blob id was already
/// present in `seen_blobs`.
///
/// # Errors
/// Propagates any error returned by the raw scan.
pub fn scan_blob<'b>(
&mut self,
blob: &'b Blob,
origin: &OriginSet,
lang: Option<String>,
redact: bool,
no_dedup: bool,
no_base64: bool,
) -> Result<ScanResult<'b>>
where
'a: 'b,
{
// Update local stats
// Every blob handed to this method is counted as both seen and scanned.
self.local_stats.blobs_seen += 1;
self.local_stats.bytes_seen += blob.bytes().len() as u64;
self.local_stats.blobs_scanned += 1;
self.local_stats.bytes_scanned += blob.bytes().len() as u64;
// Extract filename from origin
// Falls back to "unknown_file" when the first origin has no usable path.
let filename = origin
.first()
.blob_path()
.and_then(|path| path.file_name())
.and_then(|name| name.to_str())
.unwrap_or("unknown_file")
.to_string();
// Perform the scan
self.scan_bytes_raw(&blob.bytes(), &filename)?;
// Opportunistically look for standalone Base64 blobs. If neither
// the raw scan nor this check yields anything, we can return early
// before doing any heavier work.
let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
Vec::new()
} else {
get_b64_strings(blob.bytes())
};
let lang_hint = lang.as_deref();
let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty();
let has_base64_items = !b64_items.is_empty();
if !has_raw_matches && !has_base64_items {
return Ok(ScanResult::New(Vec::new()));
}
let rules_db = self.rules_db;
let mut seen_matches = FxHashSet::default();
let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
// `match_rule_indices[i]` is the rule index that produced `matches[i]`;
// the two vectors are grown in lock-step below.
let mut match_rule_indices: Vec<usize> = Vec::new();
let blob_len = blob.len();
let mut matches = Vec::new();
let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
// Raw matches are visited last-to-first.
// NOTE(review): presumably the reverse order matters to `record_match`'s
// span bookkeeping — confirm in `dedup`.
for &RawMatch { rule_id, start_idx, end_idx } in
self.user_data.raw_matches_scratch.iter().rev()
{
let rule_id_usize: usize = rule_id as usize;
let rule = Arc::clone(&rules_db.rules()[rule_id_usize]);
let re = &rules_db.anchored_regexes()[rule_id_usize];
let start_idx_usize = start_idx as usize;
let end_idx_usize = end_idx as usize;
let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
// Skip raw spans that `record_match` rejects for this rule.
if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
continue;
}
let before_len = matches.len();
filter_match(
blob,
rule,
re,
start_idx_usize,
end_idx_usize,
&mut matches,
&mut previous_matches,
rule_id_usize,
&mut seen_matches,
origin,
None,
false,
redact,
&filename,
self.profiler.as_ref(),
self.respect_ignore_if_contains,
&self.inline_ignore_config,
);
// Record the rule index once for every match `filter_match` appended.
match_rule_indices
.extend(std::iter::repeat_n(rule_id_usize, matches.len() - before_len));
}
if !no_base64 {
// If the blob contains standalone Base64 blobs, decode and scan them as well
const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep
// Work stack of (decoded item, nesting depth); top-level items start at 0.
let mut b64_stack: Vec<(DecodedData, usize)> =
b64_items.drain(..).map(|d| (d, 0)).collect();
while let Some((item, depth)) = b64_stack.pop() {
// Decoded content is not covered by the Vectorscan pass, so every
// rule's anchored regex is tried against it.
for (rule_id_usize, rule) in rules_db.rules().iter().enumerate() {
let re = &rules_db.anchored_regexes()[rule_id_usize];
let before_len = matches.len();
filter_match(
blob,
rule.clone(),
re,
item.pos_start,
item.pos_end,
&mut matches,
&mut previous_matches,
rule_id_usize,
&mut seen_matches,
origin,
Some(item.decoded.as_slice()),
true,
redact,
&filename,
self.profiler.as_ref(),
self.respect_ignore_if_contains,
&self.inline_ignore_config,
);
match_rule_indices
.extend(std::iter::repeat_n(rule_id_usize, matches.len() - before_len));
}
if depth + 1 < MAX_B64_DEPTH {
for nested in get_b64_strings(item.decoded.as_slice()) {
// Nested items keep the position of the outermost Base64 string,
// not their offsets inside the decoded buffer.
b64_stack.push((
DecodedData {
decoded: nested.decoded,
pos_start: item.pos_start,
pos_end: item.pos_end,
},
depth + 1,
));
}
}
}
}
maybe_apply_tree_sitter_verification(
rules_db,
blob,
lang_hint,
blob_len,
&mut matches,
&match_rule_indices,
);
// Finalize
// The blob is only recorded in `seen_blobs` when this scan produced matches.
// NOTE(review): `insert` appears to return the previously stored value, if
// any — confirm against `BlobIdMap`.
if !no_dedup && !matches.is_empty() {
let blob_id = blob.id();
if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
return Ok(if had_matches {
ScanResult::SeenWithMatches
} else {
ScanResult::SeenSansMatches
});
}
}
// --- opportunistic capacity cap ---------------------------------
// Only reached on the normal return path; the early returns above skip it.
if self.user_data.raw_matches_scratch.capacity()
> self.user_data.raw_matches_scratch.len() * 4
{
// Vec::shrink_to_fit may re-allocate, but we're about to leave scan_blob
// so the cost is hidden off the hot path.
self.user_data.raw_matches_scratch.shrink_to_fit();
}
Ok(ScanResult::New(matches))
}
}
/// Drop context-dependent matches that tree-sitter cannot confirm.
///
/// `match_rule_indices[i]` must be the rule index that produced `matches[i]`.
/// Only non-Base64 matches whose rule profile is
/// `RuleDetectionProfileKind::ContextDependent` are candidates. When parser
/// output is unavailable every match is kept: tree-sitter is an optional
/// precision layer and the original regex match remains the fallback.
fn maybe_apply_tree_sitter_verification<'a>(
    rules_db: &RulesDatabase,
    blob: &'a Blob,
    lang_hint: Option<&str>,
    blob_len: usize,
    matches: &mut Vec<BlobMatch<'a>>,
    match_rule_indices: &[usize],
) {
    if matches.is_empty() {
        return;
    }

    // Collect (match index, rule index) pairs that need verification.
    let profiles = rules_db.rule_match_profiles();
    let candidates: Vec<(usize, usize)> = matches
        .iter()
        .enumerate()
        .filter(|(_, found)| !found.is_base64)
        .filter_map(|(match_idx, _)| {
            let rule_idx = *match_rule_indices.get(match_idx)?;
            (profiles[rule_idx].kind == RuleDetectionProfileKind::ContextDependent)
                .then_some((match_idx, rule_idx))
        })
        .collect();
    if candidates.is_empty() {
        return;
    }

    // No parser context: fall back to the plain regex results.
    let Some(parsed_texts) = load_tree_sitter_results(blob, lang_hint, blob_len) else {
        return;
    };

    let anchored = rules_db.anchored_regexes();
    let mut keep = vec![true; matches.len()];
    for (match_idx, rule_idx) in candidates {
        let secret = matches[match_idx].matching_input;
        let re = &anchored[rule_idx];
        keep[match_idx] = parsed_texts
            .iter()
            .any(|text| verify_match_in_tree_sitter_text(re, secret, text.as_bytes()));
    }

    if keep.iter().all(|&flag| flag) {
        return;
    }

    // `retain` visits elements in order exactly once, so the flags line up
    // with the matches one-to-one.
    let mut flags = keep.into_iter();
    matches.retain(|_| flags.next().unwrap_or(true));
}
/// Run the tree-sitter checker over `blob` and return the text of each result.
///
/// Yields `None` when the blob size is outside the tree-sitter window, when
/// no language hint is given, when the hint maps to no supported language,
/// or when the checker fails (the failure is logged at debug level).
fn load_tree_sitter_results(
    blob: &Blob,
    lang_hint: Option<&str>,
    blob_len: usize,
) -> Option<Vec<String>> {
    if !should_attempt_tree_sitter(blob_len) {
        return None;
    }
    let (language, queries) = get_language_and_queries(lang_hint?)?;
    let checker = Checker { language, rules: queries };

    let outcome = checker.check(&blob.bytes());
    if let Err(e) = &outcome {
        debug!("tree-sitter verification unavailable: {e}");
    }
    outcome.ok().map(|found| found.into_iter().map(|hit| hit.text).collect())
}
/// Returns `true` if any match of `re` inside `text` yields, through
/// `find_secret_capture`, exactly the bytes of `expected_secret`.
fn verify_match_in_tree_sitter_text(
    re: &regex::bytes::Regex,
    expected_secret: &[u8],
    text: &[u8],
) -> bool {
    for captures in re.captures_iter(text) {
        if find_secret_capture(re, &captures).as_bytes() == expected_secret {
            return true;
        }
    }
    false
}
fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, String>)> {
match lang.to_lowercase().as_str() {
"bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())),
"c" => Some((Language::C, parser::queries::c::get_c_queries())),
"c#" | "csharp" => Some((Language::CSharp, parser::queries::csharp::get_csharp_queries())),
"c++" | "cpp" => Some((Language::Cpp, parser::queries::cpp::get_cpp_queries())),
"css" => Some((Language::Css, parser::queries::css::get_css_queries())),
"go" => Some((Language::Go, parser::queries::go::get_go_queries())),
"html" => Some((Language::Html, parser::queries::html::get_html_queries())),
"java" => Some((Language::Java, parser::queries::java::get_java_queries())),
"javascript" | "js" => {
Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries()))
}
"php" => Some((Language::Php, parser::queries::php::get_php_queries())),
"python" | "py" | "starlark" => {
Some((Language::Python, parser::queries::python::get_python_queries()))
}
"ruby" => Some((Language::Ruby, parser::queries::ruby::get_ruby_queries())),
"rust" => Some((Language::Rust, parser::queries::rust::get_rust_queries())),
"toml" => Some((Language::Toml, parser::queries::toml::get_toml_queries())),
"typescript" | "ts" => {
Some((Language::TypeScript, parser::queries::typescript::get_typescript_queries()))
}
"yaml" => Some((Language::Yaml, parser::queries::yaml::get_yaml_queries())),
_ => None,
}
}
// -------------------------------------------------------------------------------------------------
// test
// -------------------------------------------------------------------------------------------------
#[cfg(test)]
mod test {
use std::{collections::BTreeMap, path::PathBuf};
use pretty_assertions::assert_eq;
// ---------------------------------------------------------------------
// proptest: raw-match dedup + entropy gate
// ---------------------------------------------------------------------
use proptest::prelude::*;
use super::*;
use crate::{
blob::{Blob, BlobIdMap},
entropy::calculate_shannon_entropy,
origin::{Origin, OriginSet},
rules::rule::{
DependsOnRule, HttpRequest, HttpValidation, PatternRequirements, RuleSyntax, Validation,
},
};
// Property test: random ASCII noise gets a fixed token spliced in at up to
// four offsets, is scanned with a single-rule matcher through
// `scan_bytes_raw`, and the recorded raw matches are then checked.
// NOTE(review): the assertions inspect `raw_matches_scratch` directly, i.e.
// the scanner output before `filter_match` runs. Whether `min_entropy` is
// enforced at that stage is not visible here; the entropy assertion
// recomputes the value over the reported span.
proptest! {
#[test]
fn prop_no_dupes_and_entropy(
// random ASCII up to 300 bytes
mut noise in proptest::collection::vec(any::<u8>().prop_filter("ascii", |b| b.is_ascii()), 0..300),
// 0-4 random insertion points
inserts in proptest::collection::vec(0usize..300, 0..5)
) {
// Constant high-entropy secret token that matches the rule below
const TOKEN: &[u8] = b"secret_abcd1234";
// Splice the token at the requested offsets
// (offsets beyond the current length are clamped to the end)
for &idx in &inserts {
let pos = idx.min(noise.len());
noise.splice(pos..pos, TOKEN.iter().copied());
}
// ── build a single test rule ──────────────────────────────────
use crate::rules::rule::{RuleSyntax, Validation, Confidence};
let rule = Rule::new(RuleSyntax {
id: "prop.secret".into(),
name: "prop secret".into(),
pattern: "secret_[a-z]{4}[0-9]{4}".into(),
confidence: Confidence::Low,
min_entropy: 3.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None::<Validation>, // no HTTP validation needed
revocation: None,
depends_on_rule: vec![],
pattern_requirements: None,
tls_mode: None,
});
let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap();
let seen = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
let mut m = Matcher::new(
&rules_db,
scanner_pool,
&seen,
None,
false,
None,
&[],
false,
true,
)
.unwrap();
// ── run the scan ──────────────────────────────────────────────
m.scan_bytes_raw(&noise, "buf").unwrap();
// ── property 1: dedup each (rule,start,end) is unique ──────
let mut coords = FxHashSet::default();
for RawMatch{rule_id, start_idx, end_idx} in &m.user_data.raw_matches_scratch {
// `insert` returns false when the triple was already present
assert!(
coords.insert((*rule_id, *start_idx, *end_idx)),
"duplicate raw-match detected for coords ({rule_id},{start_idx},{end_idx})"
);
// ── property 2: entropy gate held ────────────────────────
let slice = &noise[*start_idx as usize .. *end_idx as usize];
let ent = calculate_shannon_entropy(slice);
assert!(ent > 3.0, "entropy {ent} ≤ min_entropy, gate failed");
}
}
}
/// Scans a short string with a single literal rule (`test`) and checks that
/// exactly one raw match is recorded. The rule also carries HTTP validation
/// and `depends_on_rule` entries, which the raw scan does not exercise.
#[test]
pub fn test_simple() -> Result<()> {
let rules = vec![Rule::new(RuleSyntax {
id: "test.1".to_string(),
name: "test".to_string(),
pattern: "test".to_string(),
confidence: crate::rules::rule::Confidence::Medium,
min_entropy: 1.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: Some(Validation::Http(HttpValidation {
request: HttpRequest {
method: "GET".to_string(),
url: "https://example.com".to_string(),
headers: BTreeMap::new(),
body: None,
response_matcher: Some(vec![]),
multipart: None,
response_is_html: false,
},
multipart: None,
})),
revocation: None,
depends_on_rule: vec![
Some(DependsOnRule {
rule_id: "d8f3c34b-015f-4cd6-b411-b1366493104c".to_string(),
variable: "email".to_string(),
}),
Some(DependsOnRule {
rule_id: "8910f364-7718-4a27-a435-d2da13e6ba9e".to_string(),
variable: "domain".to_string(),
}),
],
pattern_requirements: None,
tls_mode: None,
})];
let rules_db = RulesDatabase::from_rules(rules)?;
let input = "some test data for vectorscan";
let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
let enable_rule_profiling = true;
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
let mut matcher = Matcher::new(
&rules_db,
scanner_pool,
&seen_blobs,
None,
enable_rule_profiling,
None, // Pass the shared profiler
&[],
false,
true,
)?;
matcher.scan_bytes_raw(input.as_bytes(), "fname")?;
// "test" occupies bytes 5..9 of the input, so the end offset is 9.
// NOTE(review): the expected start offset is 0 rather than 5 — presumably
// the scanner does not report start-of-match; confirm against the
// Vectorscan database flags.
assert_eq!(
matcher.user_data.raw_matches_scratch,
vec![RawMatch { rule_id: 0, start_idx: 0, end_idx: 9 },]
);
Ok(())
}
/// With `respect_ignore_if_contains` enabled (the last `Matcher::new`
/// argument), a match containing an `ignore_if_contains` term is dropped
/// while the other match survives.
#[test]
fn test_pattern_requirements_ignore_if_contains_filters_matches() -> Result<()> {
let rules = vec![Rule::new(RuleSyntax {
id: "test.exclude".to_string(),
name: "exclude words".to_string(),
pattern: "(?P<token>prefix[A-Za-z]+)".to_string(),
confidence: crate::rules::rule::Confidence::Medium,
min_entropy: 0.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
revocation: None,
depends_on_rule: vec![],
pattern_requirements: Some(PatternRequirements {
min_digits: None,
min_uppercase: None,
min_lowercase: None,
min_special_chars: None,
special_chars: None,
ignore_if_contains: Some(vec!["TEST".to_string()]),
checksum: None,
}),
tls_mode: None,
})];
let rules_db = RulesDatabase::from_rules(rules)?;
// The rule lists "TEST" while the input contains lowercase "test"; the
// assertions below therefore rely on the filter comparing
// case-insensitively (implemented outside this file).
let input = b"prefixgood prefixtest";
let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
let mut matcher = Matcher::new(
&rules_db,
scanner_pool,
&seen_blobs,
None,
false,
None,
&[],
false,
true,
)?;
let blob = Blob::from_bytes(input.to_vec());
let origin = OriginSet::from(Origin::from_file(PathBuf::from("exclude.txt")));
let matches = match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
ScanResult::New(matches) => matches,
ScanResult::SeenWithMatches => {
panic!("unexpected scan result: blob should not be considered previously seen with matches")
}
ScanResult::SeenSansMatches => {
panic!("unexpected scan result: blob should not be considered previously seen without matches")
}
};
assert_eq!(matches.len(), 1, "ignore_if_contains should drop filtered matches");
assert_eq!(
matches[0].matching_input, b"prefixgood",
"remaining match should be the non-excluded token",
);
Ok(())
}
/// Same rule and input as the `ignore_if_contains` filtering test, but the
/// matcher is built with `respect_ignore_if_contains = false` (the last
/// `Matcher::new` argument), so both tokens must be reported.
#[test]
fn test_pattern_requirements_ignore_if_contains_can_be_disabled_in_matcher() -> Result<()> {
let rules = vec![Rule::new(RuleSyntax {
id: "test.exclude".to_string(),
name: "exclude words".to_string(),
pattern: "(?P<token>prefix[A-Za-z]+)".to_string(),
confidence: crate::rules::rule::Confidence::Medium,
min_entropy: 0.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
revocation: None,
depends_on_rule: vec![],
pattern_requirements: Some(PatternRequirements {
min_digits: None,
min_uppercase: None,
min_lowercase: None,
min_special_chars: None,
special_chars: None,
ignore_if_contains: Some(vec!["TEST".to_string()]),
checksum: None,
}),
tls_mode: None,
})];
let rules_db = RulesDatabase::from_rules(rules)?;
let input = b"prefixgood prefixtest";
let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
let mut matcher = Matcher::new(
&rules_db,
scanner_pool,
&seen_blobs,
None,
false,
None,
&[],
false,
false,
)?;
let blob = Blob::from_bytes(input.to_vec());
let origin = OriginSet::from(Origin::from_file(PathBuf::from("exclude-disabled.txt")));
let matches = match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
ScanResult::New(matches) => matches,
ScanResult::SeenWithMatches => {
panic!(
"unexpected scan result: blob should not be considered previously seen with matches"
)
}
ScanResult::SeenSansMatches => {
panic!(
"unexpected scan result: blob should not be considered previously seen without matches"
)
}
};
assert_eq!(matches.len(), 2, "disabling ignore_if_contains should keep all matches");
Ok(())
}
// ---------------------------------------------------------------------
// additional deterministic unit-tests
// ---------------------------------------------------------------------
/// `get_base64_strings` should recognise a well-formed token, decode it,
/// and report correct byte-offsets.
#[test]
fn test_get_base64_strings_basic() {
let base64_payload = b"MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY=";
// Surround the payload with plain words so the offsets are non-trivial.
let mut raw = b"foo ".to_vec();
raw.extend_from_slice(base64_payload);
raw.extend_from_slice(b" bar");
// decodes to "0123456789abcdef0123456789abcdef"
let hits = get_base64_strings(&raw);
assert_eq!(hits.len(), 1);
let item = &hits[0];
assert_eq!(std::str::from_utf8(&item.decoded).unwrap(), "0123456789abcdef0123456789abcdef");
// "foo␠" is 4 bytes, so the start offset is 4
// and the end offset is exclusive: start plus the payload length
assert_eq!((item.pos_start, item.pos_end), (4, 4 + base64_payload.len()));
}
/// `compute_finding_fingerprint` must be stable (same input => same output)
/// and sensitive to any input component.
#[test]
fn test_finding_fingerprint_stability_and_uniqueness() {
// Two calls with identical arguments form the determinism baseline.
let a = compute_finding_fingerprint("secret", "fileA", 0, 6);
let b = compute_finding_fingerprint("secret", "fileA", 0, 6);
assert_eq!(a, b, "fingerprint should be deterministic");
// changing any parameter should perturb the hash
let c = compute_finding_fingerprint("secret", "fileA", 1, 7); // offsets differ
let d = compute_finding_fingerprint("secret", "fileB", 0, 6); // file id differs
let e = compute_finding_fingerprint("different", "fileA", 0, 6); // content differs
assert_ne!(a, c);
assert_ne!(a, d);
assert_ne!(a, e);
}
/// The (private) `compute_match_key` helper is the linchpin of the raw-dedup
/// path. It should return identical keys for identical inputs and different
/// keys as soon as *anything* changes.
#[test]
fn test_compute_match_key_uniqueness() {
use super::dedup::compute_match_key;
// Identical arguments must yield identical keys.
let k1 = compute_match_key(b"abc", b"rule-1", 0, 3);
let k2 = compute_match_key(b"abc", b"rule-1", 0, 3);
assert_eq!(k1, k2);
// mutate each component in turn
// (the content variant also changes the end offset from 3 to 4)
let diff_content = compute_match_key(b"abcd", b"rule-1", 0, 4);
let diff_rule = compute_match_key(b"abc", b"rule-2", 0, 3);
let diff_span = compute_match_key(b"abc", b"rule-1", 1, 4);
assert_ne!(k1, diff_content);
assert_ne!(k1, diff_rule);
assert_ne!(k1, diff_span);
}
/// Running `scan_bytes_raw` twice over the *same* input should never record
/// duplicate entries in `raw_matches_scratch`.
///
/// `scan_bytes_raw` clears the scratch vector before scanning, so the second
/// scan replaces the results of the first instead of appending to them.
#[test]
fn test_scan_bytes_raw_no_duplicate_raw_matches() -> Result<()> {
// simple rule: literal "dup"
let rule = Rule::new(RuleSyntax {
id: "dup.check".into(),
name: "dup".into(),
pattern: "dup".into(),
confidence: crate::rules::rule::Confidence::Low,
min_entropy: 0.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None::<Validation>,
revocation: None,
depends_on_rule: vec![],
pattern_requirements: None,
tls_mode: None,
});
let rules_db = RulesDatabase::from_rules(vec![rule])?;
let seen = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
let mut m =
Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;
let buf = b"dup dup"; // two literal hits, same rule
// first scan
m.scan_bytes_raw(buf, "buf1")?;
let first_len = m.user_data.raw_matches_scratch.len();
// second scan over the same buffer
m.scan_bytes_raw(buf, "buf1")?;
let second_len = m.user_data.raw_matches_scratch.len();
// we should still only have two unique raw matches recorded
assert_eq!(first_len, 2);
assert_eq!(second_len, 2);
Ok(())
}
/// A `kingfisher:ignore` comment on the same line as the secret must
/// suppress the match when inline ignores are enabled.
#[test]
fn inline_comment_skips_match() -> Result<()> {
let rule = Rule::new(RuleSyntax {
id: "inline.ignore".into(),
name: "inline".into(),
pattern: "secret_token".into(),
confidence: crate::rules::rule::Confidence::Low,
min_entropy: 0.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None::<Validation>,
revocation: None,
depends_on_rule: vec![],
pattern_requirements: None,
tls_mode: None,
});
let rules_db = RulesDatabase::from_rules(vec![rule])?;
let seen = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
// No extra directives (`&[]`) and inline ignores left enabled (`false`).
let mut matcher =
Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;
let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec());
let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt")));
match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
ScanResult::New(matches) => assert!(matches.is_empty()),
_ => panic!("unexpected scan result"),
}
Ok(())
}
/// A `kingfisher:ignore` comment placed after a secret that spans several
/// lines must also suppress the match.
#[test]
fn inline_comment_after_multiline_secret_skips_match() -> Result<()> {
let rule = Rule::new(RuleSyntax {
id: "inline.multiline".into(),
name: "inline multiline".into(),
pattern: "line1\\s+line2".into(),
confidence: crate::rules::rule::Confidence::Low,
min_entropy: 0.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None::<Validation>,
revocation: None,
depends_on_rule: vec![],
pattern_requirements: None,
tls_mode: None,
});
let rules_db = RulesDatabase::from_rules(vec![rule])?;
let seen = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
let mut matcher =
Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;
// The directive sits two lines below the end of the matched text, after
// the closing triple quote.
let blob = Blob::from_bytes(
br#"let data = """
line1
line2
"""
# kingfisher:ignore
"#
.to_vec(),
);
let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt")));
match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
ScanResult::New(matches) => assert!(matches.is_empty()),
_ => panic!("unexpected scan result"),
}
Ok(())
}
/// A third-party directive (`gitleaks:allow`) is honoured only when it is
/// passed to `Matcher::new` through `extra_ignore_directives`.
#[test]
fn compat_flag_controls_external_directives() -> Result<()> {
let rule = Rule::new(RuleSyntax {
id: "inline.compat".into(),
name: "inline compat".into(),
pattern: "supersecret123".into(),
confidence: crate::rules::rule::Confidence::Low,
min_entropy: 0.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None::<Validation>,
revocation: None,
depends_on_rule: vec![],
pattern_requirements: None,
tls_mode: None,
});
let rules_db = RulesDatabase::from_rules(vec![rule])?;
let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec());
let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt")));
// First pass: no extra directives, so the comment has no effect.
let seen = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
let mut matcher =
Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;
let matches_without_compat =
match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
ScanResult::New(matches) => matches.len(),
_ => panic!("unexpected scan result"),
};
assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag");
// Second pass: a fresh seen-map and matcher that registers the directive.
let seen = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
let extra = vec![String::from("gitleaks:allow")];
let mut matcher =
Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &extra, false, true)?;
match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
ScanResult::New(matches) => assert!(matches.is_empty()),
_ => panic!("unexpected scan result"),
}
Ok(())
}
/// `SerializableCaptures::from_captures` must list capture groups in
/// ascending group-number order, carrying the group name where one exists.
#[test]
fn serializes_captures_in_numeric_order() {
use regex::bytes::Regex;
// Group 1 is the unnamed outer group; groups 2 and 3 are named.
let re =
Regex::new(r"(?xi)\b(ghp_(?P<body>[A-Z0-9]{3})(?P<checksum>[A-Z0-9]{2}))").unwrap();
let caps = re.captures(b"ghp_ABC12").expect("expected captures");
let serialized = SerializableCaptures::from_captures(&caps, b"", &re);
let entries: Vec<(Option<&str>, i32, &str)> = serialized
.captures
.iter()
.map(|cap| (cap.name.as_deref(), cap.match_number, cap.value))
.collect();
// Group 0 (the whole match) is not expected among the entries.
assert_eq!(entries.len(), 3);
assert_eq!(entries[0], (None, 1, "ghp_ABC12"));
assert_eq!(entries[1], (Some("body"), 2, "ABC"));
assert_eq!(entries[2], (Some("checksum"), 3, "12"));
}
/// A contextual rule match that sits in a real Python assignment must
/// survive the tree-sitter second pass.
/// NOTE(review): this relies on the rule being classified as
/// `ContextDependent` by the rules database — confirm in `rules_database`.
#[test]
fn parser_second_pass_keeps_verified_contextual_match() -> Result<()> {
let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234";
let rule = Rule::new(RuleSyntax {
id: "kingfisher.auth0.2".into(),
name: "auth0 secret".into(),
pattern: "(?xi)\\bauth0(?:.|[\\n\\r]){0,16}?(?:secret|token)(?:.|[\\n\\r]){0,64}?\\b([a-z0-9_-]{64,})\\b".into(),
confidence: crate::rules::rule::Confidence::Medium,
min_entropy: 0.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None::<Validation>,
revocation: None,
depends_on_rule: vec![],
pattern_requirements: None,
tls_mode: None,
});
let rules_db = RulesDatabase::from_rules(vec![rule])?;
let seen = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
let mut matcher =
Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;
// NOTE(review): the purpose of the 1200-byte padding is not evident from
// this file — confirm why it is needed.
let mut content = "x".repeat(1200);
content.push_str(&format!("\nauth0_client_secret = \"{token}\"\n"));
let blob = Blob::from_bytes(content.into_bytes());
let origin = OriginSet::from(Origin::from_file(PathBuf::from("verified.py")));
// The "python" hint selects the tree-sitter grammar used for verification.
let found = match matcher.scan_blob(
&blob,
&origin,
Some("python".to_string()),
false,
false,
false,
)? {
ScanResult::New(matches) => matches,
_ => panic!("unexpected scan result"),
};
assert_eq!(found.len(), 1);
Ok(())
}
#[test]
fn parser_second_pass_suppresses_unverified_contextual_match() -> Result<()> {
    // Scans a Python blob in which the candidate token only appears inside a
    // `#` comment and expects no findings.
    let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234";

    // Contextual rule: the keyword `auth0` followed by `secret`/`token` and a long value.
    let auth0_rule = Rule::new(RuleSyntax {
        id: "kingfisher.auth0.2".into(),
        name: "auth0 secret".into(),
        pattern: "(?xi)\\bauth0(?:.|[\\n\\r]){0,16}?(?:secret|token)(?:.|[\\n\\r]){0,64}?\\b([a-z0-9_-]{64,})\\b".into(),
        confidence: crate::rules::rule::Confidence::Medium,
        min_entropy: 0.0,
        visible: true,
        examples: vec![],
        negative_examples: vec![],
        references: vec![],
        validation: None::<Validation>,
        revocation: None,
        depends_on_rule: vec![],
        pattern_requirements: None,
        tls_mode: None,
    });

    let db = RulesDatabase::from_rules(vec![auth0_rule])?;
    let seen_blobs = BlobIdMap::new();
    let vectorscan = Arc::new(db.vectorscan_db().clone());
    let pool = Arc::new(ScannerPool::new(vectorscan));
    let mut matcher = Matcher::new(&db, pool, &seen_blobs, None, false, None, &[], false, true)?;

    // 1200 filler bytes precede the comment line.
    // NOTE(review): presumably this pushes the blob past a size threshold for the
    // parser pass; confirm against `scan_blob`.
    let padding = "x".repeat(1200);
    let content = format!("{padding}\n# auth0 secret {token}\n");
    let blob = Blob::from_bytes(content.into_bytes());
    let path = PathBuf::from("comment.py");
    let origin = OriginSet::from(Origin::from_file(path));

    let language = Some("python".to_string());
    let outcome = matcher.scan_blob(&blob, &origin, language, false, false, false)?;
    let ScanResult::New(found) = outcome else {
        panic!("unexpected scan result");
    };

    assert!(
        found.is_empty(),
        "comment-only contextual hits should be suppressed when tree-sitter cannot verify assignment context"
    );
    Ok(())
}
#[test]
fn strict_context_rule_keeps_raw_when_tree_sitter_unavailable() -> Result<()> {
    // Scans a short plain-text blob with no language hint and expects the
    // contextual rule to still report its single regex match.
    let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234";

    // Contextual rule: the keyword `auth0` followed by `secret`/`token` and a long value.
    let auth0_rule = Rule::new(RuleSyntax {
        id: "kingfisher.auth0.2".into(),
        name: "auth0 secret".into(),
        pattern: "(?xi)\\bauth0(?:.|[\\n\\r]){0,16}?(?:secret|token)(?:.|[\\n\\r]){0,64}?\\b([a-z0-9_-]{64,})\\b".into(),
        confidence: crate::rules::rule::Confidence::Medium,
        min_entropy: 0.0,
        visible: true,
        examples: vec![],
        negative_examples: vec![],
        references: vec![],
        validation: None::<Validation>,
        revocation: None,
        depends_on_rule: vec![],
        pattern_requirements: None,
        tls_mode: None,
    });

    let db = RulesDatabase::from_rules(vec![auth0_rule])?;
    let seen_blobs = BlobIdMap::new();
    let vectorscan = Arc::new(db.vectorscan_db().clone());
    let pool = Arc::new(ScannerPool::new(vectorscan));
    let mut matcher = Matcher::new(&db, pool, &seen_blobs, None, false, None, &[], false, true)?;

    // The whole blob is just the keyword context followed by the token.
    let text = format!("auth0 token {token}");
    let blob = Blob::from_bytes(text.into_bytes());
    let path = PathBuf::from("small.txt");
    let origin = OriginSet::from(Origin::from_file(path));

    // No language is supplied for this scan.
    let outcome = matcher.scan_blob(&blob, &origin, None, false, false, false)?;
    let ScanResult::New(found) = outcome else {
        panic!("unexpected scan result");
    };

    assert_eq!(
        found.len(),
        1,
        "strict contextual rules should fall back to raw regex findings when tree-sitter is unavailable"
    );
    Ok(())
}
#[test]
fn self_identifying_rule_remains_hyperscan_only() -> Result<()> {
    // Scans a `token=<value>` blob against a rule whose pattern is anchored on
    // the literal `CCIPAT_` prefix and expects exactly one finding.
    let token = "CCIPAT_FERZRjTN451xnDCy1y9gWn_79fb6ca4d0e5f833612eee17de397a9dca0a9e9f";

    let circleci_rule = Rule::new(RuleSyntax {
        id: "kingfisher.circleci.1".into(),
        name: "circleci pat".into(),
        pattern: "(?x)\\b(CCIPAT_[A-Za-z0-9]{22}_[a-z0-9]{40})\\b".into(),
        confidence: crate::rules::rule::Confidence::Medium,
        min_entropy: 0.0,
        visible: true,
        examples: vec![],
        negative_examples: vec![],
        references: vec![],
        validation: None::<Validation>,
        revocation: None,
        depends_on_rule: vec![],
        pattern_requirements: None,
        tls_mode: None,
    });

    let db = RulesDatabase::from_rules(vec![circleci_rule])?;
    let seen_blobs = BlobIdMap::new();
    let vectorscan = Arc::new(db.vectorscan_db().clone());
    let pool = Arc::new(ScannerPool::new(vectorscan));
    let mut matcher = Matcher::new(&db, pool, &seen_blobs, None, false, None, &[], false, true)?;

    let text = format!("token={token}");
    let blob = Blob::from_bytes(text.into_bytes());
    let path = PathBuf::from("circleci.txt");
    let origin = OriginSet::from(Origin::from_file(path));

    // No language is supplied for this scan.
    let outcome = matcher.scan_blob(&blob, &origin, None, false, false, false)?;
    let ScanResult::New(found) = outcome else {
        panic!("unexpected scan result");
    };

    assert_eq!(found.len(), 1, "self-identifying tokens should remain raw-pass findings");
    Ok(())
}
}