kingfisher/src/matcher.rs
Mick Grove a003b732fa - Added kingfisher:ignore (or kingfisher:allow) to silence a finding inline within a file
- Added: to reuse existing inline directives from other scanners, pass --compat-ignore-comments to also accept NOSONAR, kics-scan ignore,  gitleaks:allow and trufflehog:ignore
2025-10-09 20:11:31 -07:00

1337 lines
50 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use std::{
hash::{Hash, Hasher},
str,
sync::{Arc, Mutex},
};
use anyhow::Result;
use base64::{engine::general_purpose, Engine};
use bstr::BString;
use http::StatusCode;
use regex::bytes::Regex;
use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
use schemars::{
gen::SchemaGenerator,
schema::{ArrayValidation, InstanceType, Schema},
JsonSchema,
};
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use tracing::debug;
use xxhash_rust::xxh3::xxh3_64;
use crate::{
blob::{Blob, BlobId, BlobIdMap},
entropy::calculate_shannon_entropy,
inline_ignore::InlineIgnoreConfig,
location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
origin::OriginSet,
parser,
parser::{Checker, Language},
rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
rules::rule::Rule,
rules_database::RulesDatabase,
safe_list::{is_safe_match, is_user_match},
scanner_pool::ScannerPool,
snippet::Base64BString,
util::{intern, redact_value},
};
/// Largest slice handed to the scanner in a single call: 1 GiB per scan segment.
const MAX_CHUNK_SIZE: usize = 1024 * 1024 * 1024;
/// Bytes shared between consecutive segments (64 KiB) so boundary matches are still caught.
const CHUNK_OVERLAP: usize = 64 << 10;
/// Blobs larger than this (64 MiB) skip the expensive Base64 pass.
const BASE64_SCAN_LIMIT: usize = 64 << 20;
/// Tree-sitter only runs on blobs of at most 64 KiB.
const TREE_SITTER_MAX_LIMIT: usize = 64 << 10;
/// Tree-sitter only runs on blobs of at least 1 KiB.
const TREE_SITTER_MIN_LIMIT: usize = 1 << 10;
// -------------------------------------------------------------------------------------------------
// RawMatch
// -------------------------------------------------------------------------------------------------
/// A raw match, as recorded by a callback to Vectorscan.
///
/// When matching with Vectorscan, we simply collect all matches into a
/// preallocated `Vec`, and then go through them all after scanning is complete.
#[derive(PartialEq, Eq, Debug, Clone)]
struct RawMatch {
/// Identifier reported by the scanner; used as an index into the rules database.
rule_id: u32,
/// Start offset reported by the scanner, rebased onto the whole input.
start_idx: u64,
/// End offset reported by the scanner, rebased onto the whole input.
end_idx: u64,
}
/// An owned counterpart of `BlobMatch`: it stores the `BlobId` by value instead of
/// borrowing it, and carries the finding fingerprint.
#[derive(Clone)]
pub struct OwnedBlobMatch {
/// The rule that was matched
pub rule: Arc<Rule>,
/// Identifier of the blob in which the match was found
pub blob_id: BlobId,
/// The unique content-based identifier of this match
pub finding_fingerprint: u64,
/// Byte-offset span of the matching input
pub matching_input_offset_span: OffsetSpan,
/// The serialized capture groups of the match
pub captures: SerializableCaptures,
/// Body of the validation response
pub validation_response_body: String,
/// Status code of the validation response
pub validation_response_status: StatusCode,
/// Whether validation succeeded
pub validation_success: bool,
/// Shannon entropy calculated for the matching input
pub calculated_entropy: f32,
/// Whether the match was found in Base64-decoded content
pub is_base64: bool,
}
impl<'a> Matcher<'a> {
    /// Returns the report generated by the attached rule profiler, or `None` when this
    /// matcher has no profiler.
    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
        let profiler = self.profiler.as_ref()?;
        Some(profiler.generate_report())
    }
}
impl OwnedBlobMatch {
    /// Rebuilds an `OwnedBlobMatch` from a reporting-level `Match` and the rule it belongs to.
    ///
    /// The fingerprint stored on `m` is carried over unchanged. A stored status code that
    /// `StatusCode::from_u16` rejects is replaced by `StatusCode::CONTINUE`.
    pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
        let validation_response_status =
            StatusCode::from_u16(m.validation_response_status).unwrap_or(StatusCode::CONTINUE);
        Self {
            rule,
            blob_id: m.blob_id,
            finding_fingerprint: m.finding_fingerprint,
            matching_input_offset_span: m.location.offset_span,
            captures: m.groups.clone(),
            validation_response_body: m.validation_response_body.clone(),
            validation_response_status,
            validation_success: m.validation_success,
            calculated_entropy: m.calculated_entropy,
            is_base64: m.is_base64,
        }
    }

    /// Converts a borrowed `BlobMatch` into an owned match and computes its fingerprint.
    ///
    /// The fingerprint is derived from the serialized capture at index 1 (falling back to
    /// index 0, then to the empty string), the blob id rendered as a string, and the start
    /// and end offsets of the matching input.
    pub fn from_blob_match(blob_match: BlobMatch) -> Self {
        let serialized = &blob_match.captures.captures;
        // Capture values are already interned `&'static str`s, so no copy is needed.
        let finding_value: &str =
            serialized.get(1).or_else(|| serialized.get(0)).map_or("", |capture| capture.value);

        let span = blob_match.matching_input_offset_span;
        let offset_start = u64::try_from(span.start).unwrap();
        let offset_end = u64::try_from(span.end).unwrap();

        // Use blob_id as the file/commit identifier
        let file_or_commit = blob_match.blob_id.to_string();
        let finding_fingerprint =
            compute_finding_fingerprint(finding_value, &file_or_commit, offset_start, offset_end);

        Self {
            rule: blob_match.rule,
            blob_id: *blob_match.blob_id,
            finding_fingerprint,
            matching_input_offset_span: span,
            captures: blob_match.captures,
            validation_response_body: blob_match.validation_response_body,
            validation_response_status: blob_match.validation_response_status,
            validation_success: blob_match.validation_success,
            calculated_entropy: blob_match.calculated_entropy,
            is_base64: blob_match.is_base64,
        }
    }
}
// -------------------------------------------------------------------------------------------------
// BlobMatch
// -------------------------------------------------------------------------------------------------
/// A `BlobMatch` is the result type from `Matcher::scan_blob`.
///
/// It is mostly made up of references and small data.
/// For a representation that is more friendly for human consumption, see
/// `Match`.
pub struct BlobMatch<'a> {
/// The rule that was matched
pub rule: Arc<Rule>, // Changed from `&'a Rule` to `Arc<Rule>`
/// The blob that was matched
pub blob_id: &'a BlobId,
/// The matching input in `blob.input`
pub matching_input: &'a [u8],
/// The location of the matching input in `blob.input`
pub matching_input_offset_span: OffsetSpan,
/// The capture groups from the match
pub captures: SerializableCaptures, // regex::bytes::Captures<'a>,
/// Body of the validation response; empty when the match is first created
pub validation_response_body: String,
/// Status code of the validation response
pub validation_response_status: StatusCode,
/// Whether validation succeeded; `false` when the match is first created
pub validation_success: bool,
/// Shannon entropy calculated for the matching input
pub calculated_entropy: f32,
/// Whether the match was found in Base64-decoded content
pub is_base64: bool,
}
/// Scratch state that a `Matcher` fills while scanning one input.
#[derive(Clone)]
struct UserData {
/// A scratch vector for raw matches from Vectorscan, to minimize allocation
raw_matches_scratch: Vec<RawMatch>,
/// The length of the input being scanned
input_len: u64,
}
// -------------------------------------------------------------------------------------------------
// Matcher
// -------------------------------------------------------------------------------------------------
/// A `Matcher` is able to scan inputs for matches from rules in a
/// `RulesDatabase`.
///
/// If doing multi-threaded scanning, use a separate `Matcher` for each thread.
#[derive(Clone)]
pub struct Matcher<'a> {
/// Thread-local pool that hands out a &mut BlockScanner
scanner_pool: std::sync::Arc<crate::scanner_pool::ScannerPool>,
/// The rules database used for matching
rules_db: &'a RulesDatabase,
/// Local statistics for this `Matcher`
local_stats: MatcherStats,
/// Global statistics, updated with the local statistics when this
/// `Matcher` is dropped
global_stats: Option<&'a Mutex<MatcherStats>>,
/// The set of blobs that have been seen
seen_blobs: &'a BlobIdMap<bool>,
/// Data passed to the Vectorscan callback
user_data: UserData,
/// Rule profiler for measuring performance of individual rules
profiler: Option<Arc<ConcurrentRuleProfiler>>,
/// Configuration that controls inline ignore directives
inline_ignore_config: InlineIgnoreConfig,
}
/// This `Drop` implementation updates the `global_stats` with the local stats
impl<'a> Drop for Matcher<'a> {
    /// Merges `local_stats` into `global_stats`, when global stats were provided.
    fn drop(&mut self) {
        if let Some(global_stats) = self.global_stats {
            // A poisoned mutex only means another thread panicked while holding the lock.
            // The stats are plain counters, so recover the guard and keep merging: panicking
            // inside `drop` while the thread is already unwinding would abort the process.
            let mut global_stats =
                global_stats.lock().unwrap_or_else(std::sync::PoisonError::into_inner);
            global_stats.update(&self.local_stats);
        }
    }
}
/// The outcome of `Matcher::scan_blob`.
pub enum ScanResult<'a> {
/// The blob id was already present in the seen-blobs map and its stored flag was `true`.
SeenWithMatches,
/// The blob id was already present in the seen-blobs map and its stored flag was `false`.
SeenSansMatches,
/// The matches produced by this scan (possibly empty).
New(Vec<BlobMatch<'a>>),
}
impl<'a> Matcher<'a> {
/// Create a new `Matcher` from the given `RulesDatabase`.
///
/// If `global_stats` is provided, it will be updated with the local stats
/// from this `Matcher` when it is dropped.
pub fn new(
rules_db: &'a RulesDatabase,
scanner_pool: Arc<ScannerPool>,
seen_blobs: &'a BlobIdMap<bool>,
global_stats: Option<&'a Mutex<MatcherStats>>,
enable_profiling: bool,
shared_profiler: Option<Arc<ConcurrentRuleProfiler>>,
include_external_ignore_syntax: bool,
disable_inline_ignores: bool,
) -> Result<Self> {
// Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec
let raw_matches_scratch = Vec::new();
let user_data = UserData { raw_matches_scratch, input_len: 0 };
// let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?;
// pool is created once per scan run (see Scanner section below)
let profiler = shared_profiler.or_else(|| {
if enable_profiling {
Some(Arc::new(ConcurrentRuleProfiler::new()))
} else {
None
}
});
Ok(Matcher {
scanner_pool,
rules_db,
local_stats: MatcherStats::default(),
global_stats,
seen_blobs,
user_data,
profiler,
inline_ignore_config: if disable_inline_ignores {
InlineIgnoreConfig::disabled()
} else {
InlineIgnoreConfig::new(include_external_ignore_syntax)
},
})
}
/// Runs the scanner over `input` and collects the raw hits in
/// `self.user_data.raw_matches_scratch`.
///
/// The input is scanned in segments of at most `MAX_CHUNK_SIZE` bytes. Every segment after
/// the first begins `CHUNK_OVERLAP` bytes before the end of the previous one, and reported
/// offsets are rebased so they refer to the whole input. `_filename` is not used.
fn scan_bytes_raw(&mut self, input: &[u8], _filename: &str) -> Result<()> {
    let total_len = input.len();

    // Empty the scratch vector but keep at least its previous capacity (minimum 64 slots).
    let previous_capacity = self.user_data.raw_matches_scratch.capacity();
    self.user_data.raw_matches_scratch.clear();
    self.user_data.raw_matches_scratch.reserve(previous_capacity.max(64));
    self.user_data.input_len = total_len as u64;

    let mut segment_start: usize = 0;
    while segment_start < total_len {
        let segment_end = total_len.min(segment_start + MAX_CHUNK_SIZE);
        let segment = &input[segment_start..segment_end];
        let rebase = segment_start as u64;
        self.scanner_pool.with(|scanner| {
            scanner.scan(segment, |rule_id, from, to, _flags| {
                let hit = RawMatch { rule_id, start_idx: from + rebase, end_idx: to + rebase };
                self.user_data.raw_matches_scratch.push(hit);
                vectorscan_rs::Scan::Continue
            })
        })?;
        if segment_end == total_len {
            break;
        }
        // Step back so the next segment re-covers the tail of this one.
        segment_start = segment_end.saturating_sub(CHUNK_OVERLAP);
    }
    Ok(())
}
pub fn scan_blob<'b>(
&mut self,
blob: &'b Blob,
origin: &OriginSet,
lang: Option<String>,
redact: bool,
no_dedup: bool,
no_base64: bool,
) -> Result<ScanResult<'b>>
where
'a: 'b,
{
// Update local stats
self.local_stats.blobs_seen += 1;
self.local_stats.bytes_seen += blob.bytes().len() as u64;
self.local_stats.blobs_scanned += 1;
self.local_stats.bytes_scanned += blob.bytes().len() as u64;
// Extract filename from origin
let filename = origin
.first()
.blob_path()
.and_then(|path| path.file_name())
.and_then(|name| name.to_str())
.unwrap_or("unknown_file")
.to_string();
// Perform the scan
self.scan_bytes_raw(&blob.bytes(), &filename)?;
// Opportunistically look for standalone Base64 blobs. If neither
// the raw scan nor this check yields anything, we can return early
// before doing any heavier work.
let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
Vec::new()
} else {
get_base64_strings(blob.bytes())
};
let lang_hint = lang.as_deref();
let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty();
let has_base64_items = !b64_items.is_empty();
if !has_raw_matches && !has_base64_items {
return Ok(ScanResult::New(Vec::new()));
}
let rules_db = self.rules_db;
let mut seen_matches = FxHashSet::default();
let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
let blob_len = blob.len();
let should_run_tree_sitter = blob_len > 0
&& blob_len <= TREE_SITTER_MAX_LIMIT
&& blob_len >= TREE_SITTER_MIN_LIMIT
&& has_raw_matches
&& lang_hint.is_some()
&& !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled
let tree_sitter_result = if should_run_tree_sitter {
lang_hint.and_then(|lang_str| {
get_language_and_queries(lang_str).and_then(|(language, queries)| {
let checker = Checker { language, rules: queries };
match checker.check(&blob.bytes()) {
Ok(results) => Some(results),
Err(e) => {
println!("Error in checker.check: {}", e);
None
}
}
})
})
} else {
None
};
// Process matches
let mut matches = Vec::new();
let owned_ts_results = tree_sitter_result.map(|ts_results| {
ts_results
.into_iter()
.filter(|match_result| match_result.is_base64_decoded)
.map(|match_result| {
(
match_result.range,
match_result.text,
match_result.is_base64_decoded,
match_result.original_base64,
)
})
.collect::<Vec<_>>()
});
let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
for &RawMatch { rule_id, start_idx, end_idx } in
self.user_data.raw_matches_scratch.iter().rev()
{
let rule_id_usize: usize = rule_id as usize;
let rule = Arc::clone(&rules_db.rules[rule_id_usize]);
let re = &rules_db.anchored_regexes[rule_id_usize];
let start_idx_usize = start_idx as usize;
let end_idx_usize = end_idx as usize;
let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
continue;
}
filter_match(
blob,
rule,
re,
start_idx_usize,
end_idx_usize,
&mut matches,
&mut previous_matches,
rule_id_usize,
&mut seen_matches,
origin,
None,
false,
redact,
&filename,
self.profiler.as_ref(),
&self.inline_ignore_config,
);
}
// If tree-sitter produced base64-decoded matches, try them against all rules
if let Some(ref ts_results) = owned_ts_results {
for (ts_range, ts_match, is_base64_decoded, _original_base64) in ts_results.iter() {
if *is_base64_decoded {
for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() {
let re = &rules_db.anchored_regexes[rule_id_usize];
filter_match(
blob,
rule.clone(),
re,
ts_range.start,
ts_range.end,
&mut matches,
&mut previous_matches,
rule_id_usize,
&mut seen_matches,
origin,
Some(ts_match.as_bytes()),
*is_base64_decoded,
redact,
&filename,
self.profiler.as_ref(),
&self.inline_ignore_config,
);
}
}
}
}
if !no_base64 {
// If the blob contains standalone Base64 blobs, decode and scan them as well
const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep
let mut b64_stack: Vec<(DecodedData, usize)> =
b64_items.drain(..).map(|d| (d, 0)).collect();
while let Some((item, depth)) = b64_stack.pop() {
for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() {
let re = &rules_db.anchored_regexes[rule_id_usize];
filter_match(
blob,
rule.clone(),
re,
item.pos_start,
item.pos_end,
&mut matches,
&mut previous_matches,
rule_id_usize,
&mut seen_matches,
origin,
Some(item.decoded.as_bytes()),
true,
redact,
&filename,
self.profiler.as_ref(),
&self.inline_ignore_config,
);
}
if depth + 1 < MAX_B64_DEPTH {
for nested in get_base64_strings(item.decoded.as_bytes()) {
b64_stack.push((
DecodedData {
original: nested.original,
decoded: nested.decoded,
pos_start: item.pos_start,
pos_end: item.pos_end,
},
depth + 1,
));
}
}
}
}
// Finalize
if !no_dedup && !matches.is_empty() {
let blob_id = blob.id();
if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
return Ok(if had_matches {
ScanResult::SeenWithMatches
} else {
ScanResult::SeenSansMatches
});
}
}
// --- opportunistic capacity cap ---------------------------------
if self.user_data.raw_matches_scratch.capacity()
> self.user_data.raw_matches_scratch.len() * 4
{
// Vec::shrink_to_fit may re-allocate, but were about to leave scan_blob
// so the cost is hidden off the hot path.
self.user_data.raw_matches_scratch.shrink_to_fit();
}
Ok(ScanResult::New(matches))
// Ok(result)
}
}
/// Hashes the matched bytes, the rule id bytes, and the start/end offsets into a single
/// `u64` key, without allocating.
#[inline]
fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
    let mut state = FxHasher::default();
    // Feed the components in a fixed order: both byte slices first, then both offsets.
    for bytes in [content, rule_id] {
        bytes.hash(&mut state);
    }
    for bound in [start, end] {
        bound.hash(&mut state);
    }
    state.finish()
}
/// Inserts `span` into `spans` unless a neighbouring span already fully contains it.
///
/// Returns `true` when `span` was inserted and `false` when it was rejected.
/// Only the entries directly before and at the insertion index are examined; a neighbour
/// that `span` fully contains is removed before the insertion.
// NOTE(review): the binary search assumes `spans` is ordered by `start`; inserting at the
// search index keeps it that way as long as all insertions go through this function.
#[inline]
fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
// Index of an entry with the same start, or the position where `span` would be inserted.
let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
// Compare against the entry just before the insertion point.
if idx > 0 {
if spans[idx - 1].fully_contains(&span) {
return false;
}
if span.fully_contains(&spans[idx - 1]) {
spans.remove(idx - 1);
// The removal shifted later entries left by one.
idx -= 1;
}
}
// Compare against the entry currently at the insertion point.
if idx < spans.len() {
if spans[idx].fully_contains(&span) {
return false;
}
if span.fully_contains(&spans[idx]) {
spans.remove(idx);
}
}
spans.insert(idx, span);
true
}
/// Records `span` in the span list kept for `rule_id`, creating the list on first use.
///
/// Returns whatever `insert_span` returns: `true` if the span was inserted, `false` if it
/// was rejected.
#[inline]
fn record_match(
    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
    rule_id: usize,
    span: OffsetSpan,
) -> bool {
    let spans_for_rule = map.entry(rule_id).or_default();
    insert_span(spans_for_rule, span)
}
/// Re-runs `re` over one candidate region and appends every surviving capture to `matches`.
///
/// The haystack is `ts_match` when provided, otherwise `blob.bytes()[start..end]`. For each
/// regex hit, capture group 1 (or the whole match if there is no group 1) is the matching
/// input. A hit is dropped when:
/// * its Shannon entropy is not above the rule's minimum, or it is a safe / user match;
/// * an inline ignore directive applies to its span;
/// * the same content, rule and span were already seen (`seen_matches`);
/// * `record_match` rejects the span for this rule (`previous_matches`).
///
/// When a profiler is supplied, the call is timed and the number of new matches reported.
fn filter_match<'b>(
    blob: &'b Blob,
    rule: Arc<Rule>,
    re: &Regex,
    start: usize,
    end: usize,
    matches: &mut Vec<BlobMatch<'b>>,
    previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
    rule_id: usize,
    seen_matches: &mut FxHashSet<u64>,
    _origin: &OriginSet,
    ts_match: Option<&[u8]>,
    is_base64: bool,
    redact: bool,
    filename: &str,
    profiler: Option<&Arc<ConcurrentRuleProfiler>>,
    inline_ignore_config: &InlineIgnoreConfig,
) {
    let timer =
        profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
    let count_before = matches.len();
    let blob_bytes = blob.bytes();
    // The blob region is sliced unconditionally, even when decoded text is supplied.
    let blob_region = &blob_bytes[start..end];
    let haystack = ts_match.unwrap_or(blob_region);

    for captures in re.captures_iter(haystack) {
        let whole = captures.get(0).unwrap();
        let secret = captures.get(1).unwrap_or(whole);
        let min_entropy = rule.min_entropy();
        let secret_bytes = secret.as_bytes();
        let whole_bytes = whole.as_bytes();
        let calculated_entropy = calculate_shannon_entropy(secret_bytes);

        let rejected = calculated_entropy <= min_entropy
            || is_safe_match(secret_bytes)
            || is_user_match(secret_bytes, whole_bytes);
        if rejected {
            debug!(
                "Skipping match with entropy {} <= {} or safe match",
                calculated_entropy, min_entropy
            );
            continue;
        }

        // Offsets inside the haystack are shifted by `start` to position them in the blob.
        let matching_input_offset_span =
            OffsetSpan::from_range((start + secret.start())..(start + secret.end()));
        if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
            debug!("Skipping match due to inline ignore directive");
            continue;
        }

        let match_key = compute_match_key(
            secret_bytes,
            rule.id().as_bytes(),
            matching_input_offset_span.start,
            matching_input_offset_span.end,
        );
        // The key is remembered even if the span is rejected by `record_match` afterwards.
        if !seen_matches.insert(match_key)
            || !record_match(previous_matches, rule_id, matching_input_offset_span)
        {
            continue;
        }

        let blob_slice =
            &blob_bytes[matching_input_offset_span.start..matching_input_offset_span.end];
        let serialized = SerializableCaptures::from_captures(&captures, haystack, re, redact);
        matches.push(BlobMatch {
            rule: Arc::clone(&rule),
            blob_id: blob.id_ref(),
            matching_input: blob_slice,
            matching_input_offset_span,
            captures: serialized,
            validation_response_body: String::new(),
            // Placeholder status for a match that has not been validated yet.
            validation_response_status: StatusCode::CONTINUE,
            validation_success: false,
            calculated_entropy,
            is_base64,
        });
    }

    if let Some(active_timer) = timer {
        let new_count = (matches.len() - count_before) as u64;
        active_timer.end(new_count > 0, new_count, 0);
    }
}
fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, String>)> {
match lang.to_lowercase().as_str() {
"bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())),
"c" => Some((Language::C, parser::queries::c::get_c_queries())),
"c#" | "csharp" => Some((Language::CSharp, parser::queries::csharp::get_csharp_queries())),
"c++" | "cpp" => Some((Language::Cpp, parser::queries::cpp::get_cpp_queries())),
"css" => Some((Language::Css, parser::queries::css::get_css_queries())),
"go" => Some((Language::Go, parser::queries::go::get_go_queries())),
"html" => Some((Language::Html, parser::queries::html::get_html_queries())),
"java" => Some((Language::Java, parser::queries::java::get_java_queries())),
"javascript" | "js" => {
Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries()))
}
// "kotlin" => Some((
// Language::Kotlin,
// parser::queries::kotlin::get_kotlin_queries(),
// )),
"php" => Some((Language::Php, parser::queries::php::get_php_queries())),
"python" | "py" | "starlark" => {
Some((Language::Python, parser::queries::python::get_python_queries()))
}
"ruby" => Some((Language::Ruby, parser::queries::ruby::get_ruby_queries())),
"rust" => Some((Language::Rust, parser::queries::rust::get_rust_queries())),
"toml" => Some((Language::Toml, parser::queries::toml::get_toml_queries())),
"typescript" | "ts" => {
Some((Language::TypeScript, parser::queries::typescript::get_typescript_queries()))
}
"yaml" => Some((Language::Yaml, parser::queries::yaml::get_yaml_queries())),
_ => None,
}
}
// -------------------------------------------------------------------------------------------------
// MatcherStats
// -------------------------------------------------------------------------------------------------
/// Counters describing how much input a matcher has processed.
#[derive(Debug, Default, Clone)]
pub struct MatcherStats {
    /// Number of blobs seen
    pub blobs_seen: u64,
    /// Number of blobs scanned
    pub blobs_scanned: u64,
    /// Number of bytes seen
    pub bytes_seen: u64,
    /// Number of bytes scanned
    pub bytes_scanned: u64,
}
impl MatcherStats {
    /// Adds every counter of `other` onto the matching counter of `self`.
    pub fn update(&mut self, other: &Self) {
        // Destructuring without `..` makes the compiler flag any counter added later.
        let Self { blobs_seen, blobs_scanned, bytes_seen, bytes_scanned } = other;
        self.blobs_seen += *blobs_seen;
        self.blobs_scanned += *blobs_scanned;
        self.bytes_seen += *bytes_seen;
        self.bytes_scanned += *bytes_scanned;
    }
}
// -------------------------------------------------------------------------------------------------
// Group
// -------------------------------------------------------------------------------------------------
// A single captured group, stored as a Base64-serializable byte string.
// (Plain `//` comment on purpose: the `JsonSchema` derive reads `///` doc comments.)
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
pub struct Group(pub Base64BString);
impl Group {
    /// Builds a `Group` holding a copy of the bytes matched by `m`.
    pub fn new(m: regex::bytes::Match<'_>) -> Self {
        let matched_bytes = BString::from(m.as_bytes());
        Self(Base64BString(matched_bytes))
    }
}
// -------------------------------------------------------------------------------------------------
// Groups
// -------------------------------------------------------------------------------------------------
/// A list of `Group`s, with inline storage for one element.
#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct Groups(pub SmallVec<[Group; 1]>);
impl JsonSchema for Groups {
    /// The schema name under which `Groups` is registered.
    fn schema_name() -> String {
        String::from("Groups")
    }

    /// Describes `Groups` as a JSON array whose items follow the `Group` schema.
    fn json_schema(gen: &mut SchemaGenerator) -> Schema {
        let item_schema = gen.subschema_for::<Group>();
        let array_rules =
            ArrayValidation { items: Some(item_schema.into()), ..Default::default() };
        let object = schemars::schema::SchemaObject {
            instance_type: Some(InstanceType::Array.into()),
            array: Some(Box::new(array_rules)),
            ..Default::default()
        };
        Schema::Object(object)
    }
}
// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
// pub struct SerializableCapture {
// pub name: Option<String>, // Named group (if available)
// pub match_number: i32,
// pub start: usize, // Start position of the match
// pub end: usize, // End position of the match
// pub value: String, // The actual captured value
// }
// One serialized capture of a regex match.
// New notes here use plain `//` comments because the `JsonSchema` derive reads `///` doc
// comments.
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCapture {
// Group name for named captures, `None` for numbered ones.
pub name: Option<String>,
// Group index for numbered captures; `from_captures` stores -1 for named captures.
pub match_number: i32,
// Start offset of the capture within the haystack the regex ran on.
pub start: usize,
// End offset of the capture within the haystack the regex ran on.
pub end: usize,
/// Interned value of the capture.
pub value: &'static str,
}
// The serialized captures of one regex match.
// `from_captures` pushes the named captures first, followed by every numbered group.
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCaptures {
#[schemars(with = "Vec<SerializableCapture>")]
pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
}
impl SerializableCaptures {
    /// Serializes the captures of one regex match.
    ///
    /// Named groups that participated in the match come first (with `match_number` -1),
    /// followed by every participating numbered group, group 0 included. Values are
    /// converted lossily to UTF-8, redacted when `redact` is set, and interned. `_input` is
    /// not used.
    pub fn from_captures(
        captures: &regex::bytes::Captures,
        _input: &[u8],
        re: &Regex,
        redact: bool,
    ) -> Self {
        // Shared value pipeline: lossy UTF-8 -> optional redaction -> interning.
        let render = |raw: &[u8]| -> &'static str {
            let text = String::from_utf8_lossy(raw);
            let shown = if redact { redact_value(&text) } else { text.to_string() };
            intern(&shown)
        };

        // Named captures
        let named = re.capture_names().flatten().filter_map(|name| {
            let hit = captures.name(name)?;
            Some(SerializableCapture {
                name: Some(name.to_string()),
                match_number: -1,
                start: hit.start(),
                end: hit.end(),
                value: render(hit.as_bytes()),
            })
        });

        // Numbered captures
        let numbered = (0..captures.len()).filter_map(|index| {
            let hit = captures.get(index)?;
            Some(SerializableCapture {
                name: None,
                match_number: i32::try_from(index).unwrap_or(0),
                start: hit.start(),
                end: hit.end(),
                value: render(hit.as_bytes()),
            })
        });

        SerializableCaptures { captures: named.chain(numbered).collect() }
    }
}
// -------------------------------------------------------------------------------------------------
// Match
// -------------------------------------------------------------------------------------------------
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct Match {
    /// The location of the entire matching content
    pub location: Location,
    /// The capture groups
    pub groups: SerializableCaptures, // Store serialized captures
    /// unique identifier of file / blob where this match was found
    pub blob_id: BlobId,
    /// The unique content-based identifier of this match
    pub finding_fingerprint: u64,
    /// The rule that produced this match
    #[serde(skip_serializing)]
    #[schemars(skip)]
    pub rule: Arc<Rule>,
    /// Validation Body
    pub validation_response_body: String,
    /// Validation Status Code
    pub validation_response_status: u16,
    /// Validation Success
    pub validation_success: bool,
    /// Shannon entropy calculated for the matched input
    pub calculated_entropy: f32,
    // Copied from the rule's `visible()` flag when the match is built.
    // (Plain `//` comments below so the generated JSON schema gains no new descriptions.)
    pub visible: bool,
    // True when the match was found in Base64-decoded content.
    #[serde(default)]
    pub is_base64: bool,
}
impl Match {
#[inline]
pub fn convert_owned_blobmatch_to_match<'a>(
loc_mapping: Option<&'a LocationMapping<'a>>,
owned_blob_match: &'a OwnedBlobMatch,
origin_type: &'a str,
) -> Self {
let offset_span = owned_blob_match.matching_input_offset_span;
// Extract the matched secret content. Use capture group 1 if it exists, otherwise fall back
// to group 0.
let matching_finding_bytes = owned_blob_match
.captures
.captures
.get(1)
.or_else(|| owned_blob_match.captures.captures.get(0))
.map(|capture| capture.value.as_bytes())
.unwrap_or_default();
// The fingerprint will be based on the content of the secret.
let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
let source_span =
loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
start: SourcePoint { line: 0, column: 0 },
end: SourcePoint { line: 0, column: 0 },
});
let offset_start: u64 =
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
let finding_fingerprint = compute_finding_fingerprint(
finding_value_for_fp,
origin_type, // file_or_commit,
offset_start,
offset_end,
);
// matching_snippet
Match {
rule: owned_blob_match.rule.clone(),
visible: owned_blob_match.rule.visible().to_owned(),
location: Location { offset_span, source_span: source_span.clone() },
groups: owned_blob_match.captures.clone(),
blob_id: owned_blob_match.blob_id,
finding_fingerprint,
validation_response_body: owned_blob_match.validation_response_body.clone(),
validation_response_status: owned_blob_match.validation_response_status.as_u16(),
validation_success: owned_blob_match.validation_success,
calculated_entropy: owned_blob_match.calculated_entropy,
is_base64: owned_blob_match.is_base64,
}
}
/// Returns the `blob_id` of the match.
pub fn get_blob_id(&self) -> BlobId {
self.blob_id.clone()
}
pub fn finding_id(&self) -> String {
let mut buffer = Vec::with_capacity(128);
buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes());
buffer.push(0);
serde_json::to_writer(&mut buffer, &self.groups)
.expect("should be able to serialize groups as JSON");
let mut num = xxh3_64(&buffer);
// Ensure the number is positive and within i64 range
num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive
// Convert to string
num.to_string()
}
}
/// A Base64 candidate found in an input, together with its decoded text.
#[derive(Debug, Clone)]
pub struct DecodedData {
/// The Base64 text as it appeared in the input
pub original: String,
/// The decoded text
pub decoded: String,
/// Byte offset at which the Base64 text starts
pub pos_start: usize,
/// Byte offset just past the end of the Base64 text (padding included)
pub pos_end: usize,
}
/// Returns `true` for bytes of the standard Base64 alphabet: ASCII letters, ASCII digits,
/// `+` and `/`. The padding byte `=` is not included.
#[inline]
fn is_base64_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'+' || b == b'/'
}
pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
let mut results = Vec::new();
let mut i = 0;
while i < input.len() {
while i < input.len() && !is_base64_byte(input[i]) {
i += 1;
}
let start = i;
while i < input.len() && is_base64_byte(input[i]) {
i += 1;
}
let mut eq_count = 0;
while i < input.len() && input[i] == b'=' && eq_count < 2 {
i += 1;
eq_count += 1;
}
let end = i;
let len = end - start;
if len >= 32 && len % 4 == 0 {
let base64_slice = &input[start..end];
if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) {
if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
if decoded_str.is_ascii() {
results.push(DecodedData {
original: String::from_utf8_lossy(base64_slice).into_owned(),
decoded: decoded_str.to_string(),
pos_start: start,
pos_end: end,
});
}
}
}
}
}
results
}
/// Computes the fingerprint of a finding.
///
/// The hash input is the concatenation of `finding_value`, `file_or_commit`, and the two
/// offsets in little-endian byte order, with no separators; the result is its xxh3 64-bit
/// hash.
pub fn compute_finding_fingerprint(
    finding_value: &str,
    file_or_commit: &str,
    offset_start: u64,
    offset_end: u64,
) -> u64 {
    let start_le = offset_start.to_le_bytes();
    let end_le = offset_end.to_le_bytes();
    let parts: [&[u8]; 4] =
        [finding_value.as_bytes(), file_or_commit.as_bytes(), &start_le, &end_le];
    // `concat` sizes the buffer once from the combined length of the parts.
    xxh3_64(&parts.concat())
}
// -------------------------------------------------------------------------------------------------
// test
// -------------------------------------------------------------------------------------------------
#[cfg(test)]
mod test {
use std::{collections::BTreeMap, path::PathBuf};
use pretty_assertions::assert_eq;
// ---------------------------------------------------------------------
// proptest: raw-match dedup + entropy gate
// ---------------------------------------------------------------------
use proptest::prelude::*;
use super::*;
use crate::{
blob::{Blob, BlobIdMap},
origin::{Origin, OriginSet},
rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation},
};
// Property test: splices a fixed token into random ASCII noise, runs `scan_bytes_raw`
// on the result, and checks the raw matches the scanner collected.
// NOTE(review): only `scan_bytes_raw` is exercised here, so both properties are asserted
// directly on raw scanner output - confirm that this is the intended layer.
proptest! {
#[test]
fn prop_no_dupes_and_entropy(
// random ASCII up to 300 bytes
mut noise in proptest::collection::vec(any::<u8>().prop_filter("ascii", |b| b.is_ascii()), 0..300),
// 0-4 random insertion points
inserts in proptest::collection::vec(0usize..300, 0..5)
) {
// Constant high-entropy secret token that matches the rule below
const TOKEN: &[u8] = b"secret_abcd1234";
// Splice the token at the requested offsets (clamped to the current length)
for &idx in &inserts {
let pos = idx.min(noise.len());
noise.splice(pos..pos, TOKEN.iter().copied());
}
// ── build a single test rule ──────────────────────────────────
use crate::rules::rule::{RuleSyntax, Validation, Confidence};
let rule = Rule::new(RuleSyntax {
id: "prop.secret".into(),
name: "prop secret".into(),
pattern: "secret_[a-z]{4}[0-9]{4}".into(),
confidence: Confidence::Low,
min_entropy: 3.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None::<Validation>, // no HTTP validation needed
depends_on_rule: vec![],
});
let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap();
let seen = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
// No global stats, no profiling, default inline-ignore handling
let mut m = Matcher::new(
&rules_db,
scanner_pool,
&seen,
None,
false,
None,
false,
false,
)
.unwrap();
// ── run the scan ──────────────────────────────────────────────
m.scan_bytes_raw(&noise, "buf").unwrap();
// ── property 1: each (rule,start,end) triple is unique ──────
let mut coords = FxHashSet::default();
for RawMatch{rule_id, start_idx, end_idx} in &m.user_data.raw_matches_scratch {
assert!(
coords.insert((*rule_id, *start_idx, *end_idx)),
"duplicate raw-match detected for coords ({rule_id},{start_idx},{end_idx})"
);
// ── property 2: entropy of noise[start_idx..end_idx] exceeds 3.0 ──
let slice = &noise[*start_idx as usize .. *end_idx as usize];
let ent = calculate_shannon_entropy(slice);
assert!(ent > 3.0, "entropy {ent} ≤ min_entropy, gate failed");
}
}
}
/// Scans a short string with a single literal rule and checks the one raw match that the
/// scanner reports.
#[test]
pub fn test_simple() -> Result<()> {
    let depends_on_rule = vec![
        Some(DependsOnRule {
            rule_id: String::from("d8f3c34b-015f-4cd6-b411-b1366493104c"),
            variable: String::from("email"),
        }),
        Some(DependsOnRule {
            rule_id: String::from("8910f364-7718-4a27-a435-d2da13e6ba9e"),
            variable: String::from("domain"),
        }),
    ];
    let validation = Some(Validation::Http(HttpValidation {
        request: HttpRequest {
            method: String::from("GET"),
            url: String::from("https://example.com"),
            headers: BTreeMap::new(),
            body: None,
            response_matcher: Some(vec![]),
            multipart: None,
            response_is_html: false,
        },
        multipart: None,
    }));
    let rule = Rule::new(RuleSyntax {
        id: String::from("test.1"),
        name: String::from("test"),
        pattern: String::from("test"),
        confidence: crate::rules::rule::Confidence::Medium,
        min_entropy: 1.0,
        visible: true,
        examples: vec![],
        negative_examples: vec![],
        references: vec![],
        validation,
        depends_on_rule,
    });
    let rules_db = RulesDatabase::from_rules(vec![rule])?;
    let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
    let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));

    // Profiling is enabled, but no shared profiler is passed in.
    let enable_rule_profiling = true;
    let mut matcher = Matcher::new(
        &rules_db,
        scanner_pool,
        &seen_blobs,
        None,
        enable_rule_profiling,
        None,
        false,
        false,
    )?;

    let input = "some test data for vectorscan";
    matcher.scan_bytes_raw(input.as_bytes(), "fname")?;

    let expected = vec![RawMatch { rule_id: 0, start_idx: 0, end_idx: 9 }];
    assert_eq!(matcher.user_data.raw_matches_scratch, expected);
    Ok(())
}
// ---------------------------------------------------------------------
// additional deterministic unit-tests
// ---------------------------------------------------------------------
/// A single padded Base64 token surrounded by plain words must be found by
/// `get_base64_strings`, decoded, and reported with its byte span in the input.
#[test]
fn test_get_base64_strings_basic() {
    // The embedded token is the Base64 form of "0123456789abcdef0123456789abcdef".
    let input = b"foo MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY= bar";

    let found = get_base64_strings(input);
    assert_eq!(found.len(), 1);

    let hit = &found[0];
    assert_eq!(hit.decoded, "0123456789abcdef0123456789abcdef");
    assert_eq!(hit.original, "MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY=");

    // The token follows the four bytes "foo " and runs for its own length.
    let expected_span = (4, 4 + hit.original.len());
    assert_eq!((hit.pos_start, hit.pos_end), expected_span);
}
/// `compute_finding_fingerprint` must return the same value for repeated
/// identical calls and a different value when the offsets, the file id, or
/// the content argument changes.
#[test]
fn test_finding_fingerprint_stability_and_uniqueness() {
    let baseline = compute_finding_fingerprint("secret", "fileA", 0, 6);
    let repeated = compute_finding_fingerprint("secret", "fileA", 0, 6);
    assert_eq!(baseline, repeated, "fingerprint should be deterministic");

    // Vary one aspect of the input at a time.
    let shifted_offsets = compute_finding_fingerprint("secret", "fileA", 1, 7);
    let other_file = compute_finding_fingerprint("secret", "fileB", 0, 6);
    let other_content = compute_finding_fingerprint("different", "fileA", 0, 6);

    assert_ne!(baseline, shifted_offsets);
    assert_ne!(baseline, other_file);
    assert_ne!(baseline, other_content);
}
/// `compute_match_key` must produce equal keys for equal arguments and a
/// different key when the content, the rule identifier, or the span differs.
#[test]
fn test_compute_match_key_uniqueness() {
    use super::compute_match_key;

    let reference = compute_match_key(b"abc", b"rule-1", 0, 3);
    let same_again = compute_match_key(b"abc", b"rule-1", 0, 3);
    assert_eq!(reference, same_again);

    // Each variant below departs from the reference call in one respect.
    let other_content = compute_match_key(b"abcd", b"rule-1", 0, 4);
    let other_rule = compute_match_key(b"abc", b"rule-2", 0, 3);
    let other_span = compute_match_key(b"abc", b"rule-1", 1, 4);

    assert_ne!(reference, other_content);
    assert_ne!(reference, other_rule);
    assert_ne!(reference, other_span);
}
/// Scanning the same buffer twice with `scan_bytes_raw` must leave exactly two
/// entries in `raw_matches_scratch` after each pass — one per literal hit —
/// rather than accumulating duplicates on the second pass.
#[test]
fn test_scan_bytes_raw_no_duplicate_raw_matches() -> Result<()> {
    // A rule that matches the literal text "dup" with no entropy floor.
    let syntax = RuleSyntax {
        id: String::from("dup.check"),
        name: String::from("dup"),
        pattern: String::from("dup"),
        confidence: crate::rules::rule::Confidence::Low,
        min_entropy: 0.0,
        visible: true,
        examples: Vec::new(),
        negative_examples: Vec::new(),
        references: Vec::new(),
        validation: None,
        depends_on_rule: Vec::new(),
    };
    let rules_db = RulesDatabase::from_rules(vec![Rule::new(syntax)])?;

    let seen = BlobIdMap::new();
    let vsdb = Arc::new(rules_db.vsdb.clone());
    let scanner_pool = Arc::new(ScannerPool::new(vsdb));
    let mut matcher =
        Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false, false)?;

    // The literal occurs twice in this buffer.
    let input = b"dup dup";

    // Record the scratch-list length after each of two identical scans.
    let mut counts = [0usize; 2];
    for slot in counts.iter_mut() {
        matcher.scan_bytes_raw(input, "buf1")?;
        *slot = matcher.user_data.raw_matches_scratch.len();
    }

    // Both passes must report the same two raw matches.
    assert_eq!(counts[0], 2);
    assert_eq!(counts[1], 2);
    Ok(())
}
/// A secret on a line that ends with a `kingfisher:ignore` comment must not be
/// reported by `scan_blob`.
#[test]
fn inline_comment_skips_match() -> Result<()> {
    let syntax = RuleSyntax {
        id: String::from("inline.ignore"),
        name: String::from("inline"),
        pattern: String::from("secret_token"),
        confidence: crate::rules::rule::Confidence::Low,
        min_entropy: 0.0,
        visible: true,
        examples: Vec::new(),
        negative_examples: Vec::new(),
        references: Vec::new(),
        validation: None,
        depends_on_rule: Vec::new(),
    };
    let rules_db = RulesDatabase::from_rules(vec![Rule::new(syntax)])?;

    let seen = BlobIdMap::new();
    let vsdb = Arc::new(rules_db.vsdb.clone());
    let scanner_pool = Arc::new(ScannerPool::new(vsdb));
    let mut matcher =
        Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false, false)?;

    // The directive shares a line with the text the rule matches.
    let contents = b"let key = \"secret_token\" # kingfisher:ignore".to_vec();
    let blob = Blob::from_bytes(contents);
    let source_path = PathBuf::from("inline.txt");
    let origin = OriginSet::from(Origin::from_file(source_path));

    let outcome = matcher.scan_blob(&blob, &origin, None, false, false, false)?;
    if let ScanResult::New(matches) = outcome {
        assert!(matches.is_empty());
    } else {
        panic!("unexpected scan result");
    }
    Ok(())
}
/// A match that spans two lines must still be suppressed when a
/// `kingfisher:ignore` directive appears on its own line after the closing
/// `"""` delimiter that follows the match.
#[test]
fn inline_comment_after_multiline_secret_skips_match() -> Result<()> {
    let syntax = RuleSyntax {
        id: String::from("inline.multiline"),
        name: String::from("inline multiline"),
        pattern: String::from("line1\\s+line2"),
        confidence: crate::rules::rule::Confidence::Low,
        min_entropy: 0.0,
        visible: true,
        examples: Vec::new(),
        negative_examples: Vec::new(),
        references: Vec::new(),
        validation: None,
        depends_on_rule: Vec::new(),
    };
    let rules_db = RulesDatabase::from_rules(vec![Rule::new(syntax)])?;

    let seen = BlobIdMap::new();
    let vsdb = Arc::new(rules_db.vsdb.clone());
    let scanner_pool = Arc::new(ScannerPool::new(vsdb));
    let mut matcher =
        Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false, false)?;

    // The pattern matches across the `line1` / `line2` line break; the
    // directive comes two lines later.
    let contents = br#"let data = """
line1
line2
"""
# kingfisher:ignore
"#
    .to_vec();
    let blob = Blob::from_bytes(contents);
    let source_path = PathBuf::from("multiline.txt");
    let origin = OriginSet::from(Origin::from_file(source_path));

    let outcome = matcher.scan_blob(&blob, &origin, None, false, false, false)?;
    if let ScanResult::New(matches) = outcome {
        assert!(matches.is_empty());
    } else {
        panic!("unexpected scan result");
    }
    Ok(())
}
/// A `gitleaks:allow` comment must only suppress a finding when the matcher is
/// built with its seventh constructor argument set to `true`; with `false` the
/// finding is still reported.
/// NOTE(review): that argument is presumably the compat-ignore-comments
/// switch — confirm against `Matcher::new`.
#[test]
fn compat_flag_controls_external_directives() -> Result<()> {
    let syntax = RuleSyntax {
        id: String::from("inline.compat"),
        name: String::from("inline compat"),
        pattern: String::from("supersecret123"),
        confidence: crate::rules::rule::Confidence::Low,
        min_entropy: 0.0,
        visible: true,
        examples: Vec::new(),
        negative_examples: Vec::new(),
        references: Vec::new(),
        validation: None,
        depends_on_rule: Vec::new(),
    };
    let rules_db = RulesDatabase::from_rules(vec![Rule::new(syntax)])?;

    let contents = b"token = \"supersecret123\" # gitleaks:allow".to_vec();
    let blob = Blob::from_bytes(contents);
    let source_path = PathBuf::from("compat.txt");
    let origin = OriginSet::from(Origin::from_file(source_path));

    // ── pass 1: flag off, so the foreign directive has no effect ─────
    let strict_seen = BlobIdMap::new();
    let strict_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
    let mut strict_matcher =
        Matcher::new(&rules_db, strict_pool, &strict_seen, None, false, None, false, false)?;
    let strict_outcome = strict_matcher.scan_blob(&blob, &origin, None, false, false, false)?;
    let matches_without_compat = if let ScanResult::New(matches) = strict_outcome {
        matches.len()
    } else {
        panic!("unexpected scan result")
    };
    assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag");

    // ── pass 2: flag on, fresh seen-map, so the finding is suppressed ─
    let compat_seen = BlobIdMap::new();
    let compat_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
    let mut compat_matcher =
        Matcher::new(&rules_db, compat_pool, &compat_seen, None, false, None, true, false)?;
    let compat_outcome = compat_matcher.scan_blob(&blob, &origin, None, false, false, false)?;
    if let ScanResult::New(matches) = compat_outcome {
        assert!(matches.is_empty());
    } else {
        panic!("unexpected scan result");
    }
    Ok(())
}
}