refactored code

This commit is contained in:
Mick Grove 2026-02-14 13:12:26 -08:00
commit d3e659491d
23 changed files with 1868 additions and 1828 deletions

View file

@ -44,5 +44,8 @@ gix = { version = "0.73", default-features = false, features = ["serde"] }
# Console formatting
console = "0.15"
# Language detection for content types
tokei = "14.0.0"
[dev-dependencies]
pretty_assertions = "1.4"

View file

@ -0,0 +1,232 @@
use once_cell::sync::Lazy;
use std::path::Path;
use tokei::LanguageType;
// Lookup table of every shebang prefix known to tokei, paired with its language.
// Built once on first access and ordered longest-prefix-first so that the most
// specific shebang is tried before any shorter prefix of it.
static SHEBANG_PREFIXES: Lazy<Vec<(&'static [u8], LanguageType)>> = Lazy::new(|| {
    let mut table: Vec<(&'static [u8], LanguageType)> = Vec::new();
    for &(lang, shebangs) in LanguageType::list() {
        table.extend(shebangs.iter().map(|&sb| (sb.as_bytes(), lang)));
    }
    // Stable sort, descending by prefix length, so that e.g.
    // "#!/usr/bin/env python3" is matched ahead of "#!/usr/bin/env python".
    table.sort_by_key(|entry| std::cmp::Reverse(entry.0.len()));
    table
});
/// The type of content detected.
///
/// This is the result type of [`ContentInspector::inspect`] and of the free
/// function [`inspect`]; it only distinguishes text from binary data.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ContentType {
/// Unprintable or control-heavy data: too many NUL bytes, or too high a
/// share of control bytes other than `\n`, `\r` and `\t`.
BINARY,
/// Mostly printable text: everything that is not classified as `BINARY`.
TEXT,
}
/// Heuristic thresholds for text vs. binary detection.
///
/// Construct with [`ContentInspector::new`] or [`Default::default`]. The type
/// derives `Debug`, `Clone` and `PartialEq` so it can be logged, copied into
/// worker contexts and compared in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct ContentInspector {
    /// Largest number of NUL (`0x00`) bytes a buffer may contain before it is
    /// classified as binary.
    max_null_bytes: usize,
    /// Largest fraction (0.0 to 1.0) of control bytes, not counting `\n`, `\r`
    /// and `\t`, a buffer may contain before it is classified as binary.
    max_control_ratio: f64,
}
impl Default for ContentInspector {
    /// Default thresholds: at most 4 NUL bytes and a control-byte ratio of at
    /// most 0.3 for a buffer to still count as text.
    fn default() -> Self {
        let max_null_bytes = 4;
        let max_control_ratio = 0.3;
        Self { max_null_bytes, max_control_ratio }
    }
}
impl ContentInspector {
/// Create a new inspector with default thresholds.
#[inline]
pub fn new() -> Self {
Default::default()
}
/// Decides whether `bytes` looks like text or binary data.
///
/// The rules are applied in order:
///
/// 1. More than `max_null_bytes` NUL bytes → [`ContentType::BINARY`].
/// 2. Otherwise, if the share of control bytes (values below 32, not counting
///    `\n`, `\r` and `\t`) exceeds `max_control_ratio` → [`ContentType::BINARY`].
/// 3. Anything else → [`ContentType::TEXT`].
///
/// An empty buffer has a control ratio of `0.0`.
#[inline]
#[must_use]
pub fn inspect(&self, bytes: &[u8]) -> ContentType {
    let null_count = bytes.iter().filter(|byte| **byte == 0).count();
    if null_count > self.max_null_bytes {
        return ContentType::BINARY;
    }

    // Control bytes that are not ordinary text whitespace. NUL is included.
    let is_suspicious = |byte: u8| byte < 0x20 && byte != b'\n' && byte != b'\r' && byte != b'\t';
    let suspicious_count = bytes.iter().filter(|byte| is_suspicious(**byte)).count();

    // Guard the division so an empty buffer yields 0.0 instead of NaN.
    let control_ratio = match bytes.len() {
        0 => 0.0,
        total => suspicious_count as f64 / total as f64,
    };

    if control_ratio > self.max_control_ratio {
        return ContentType::BINARY;
    }
    ContentType::TEXT
}
/// Looks up a MIME type from the extension of `path` (case-insensitive).
///
/// Returns:
/// - `Some(mime)` when the extension is in the small built-in table of text,
///   web, JSON/XML, PDF and image types.
/// - `None` when the path has no extension, the extension is not valid
///   UTF-8, or it is not in the table.
#[inline]
#[must_use]
pub fn guess_mime_type(&self, path: &Path) -> Option<String> {
    // Each row maps a group of lowercase extensions to one MIME type.
    const MIME_BY_EXTENSION: &[(&[&str], &str)] = &[
        (&["txt", "md", "rst"], "text/plain"),
        (&["html", "htm"], "text/html"),
        (&["css"], "text/css"),
        (&["js"], "application/javascript"),
        (&["json"], "application/json"),
        (&["xml"], "application/xml"),
        (&["pdf"], "application/pdf"),
        (&["jpg", "jpeg"], "image/jpeg"),
        (&["png"], "image/png"),
        (&["gif"], "image/gif"),
    ];

    let extension = path.extension().and_then(|raw| raw.to_str())?.to_ascii_lowercase();
    MIME_BY_EXTENSION
        .iter()
        .find(|(extensions, _)| extensions.iter().any(|known| *known == extension))
        .map(|(_, mime)| (*mime).to_string())
}
/// Reports `"UTF-8"` when `bytes` is valid UTF-8, and `None` otherwise.
///
/// Only UTF-8 is recognised; no other charset is ever returned.
#[inline]
#[must_use]
pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
    match std::str::from_utf8(bytes) {
        Ok(_) => Some(String::from("UTF-8")),
        Err(_) => None,
    }
}
/// Guess programming language with broad coverage using `tokei`.
///
/// Strategy (no disk I/O):
/// 1) Try extension via `LanguageType::from_file_extension`.
/// 2) Handle common extensionless filenames (e.g., Makefile, Dockerfile, CMakeLists.txt).
/// 3) Parse an in-memory shebang (first line) against `LanguageType::shebangs`.
/// 4) Minimal content markers as a last resort.
///
/// Returns the canonical `tokei` language name (e.g., `Rust`, `Bash`, `Python`).
#[inline]
#[must_use]
pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option<String> {
// 1) Extension mapping (fast, no I/O).
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) {
return Some(lang.name().to_string());
}
}
// 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure).
if let Some(file) = path.file_name().and_then(|f| f.to_str()) {
match file {
"Makefile" | "makefile" => {
return Some(LanguageType::Makefile.name().to_string());
}
"Dockerfile" | "dockerfile" => {
return Some(LanguageType::Dockerfile.name().to_string());
}
"CMakeLists.txt" => {
return Some(LanguageType::CMake.name().to_string());
}
"Rakefile" | "rakefile" => {
return Some(LanguageType::Rakefile.name().to_string());
}
// Common ecosystem files; map to their primary language where sensible.
"Gemfile" | "gemfile" => {
return Some(LanguageType::Ruby.name().to_string());
}
_ => {}
}
}
// 3) Shebang detection (in-memory): match by longest prefix, byte-wise (no UTF-8 needed).
if let Some(first_line) = content.split(|&b| b == b'\n').next() {
if first_line.starts_with(b"#!") {
for (prefix, lang) in SHEBANG_PREFIXES.iter() {
if first_line.starts_with(prefix) {
return Some(lang.name().to_string());
}
}
}
}
// 4) Lightweight content markers to catch a few ubiquitous cases without I/O.
let s = String::from_utf8_lossy(content);
if s.contains("<?php") {
return Some(LanguageType::Php.name().to_string());
}
if s.contains("package main") {
return Some(LanguageType::Go.name().to_string());
}
if s.contains("public class") {
return Some(LanguageType::Java.name().to_string());
}
if s.contains("#!/usr/bin/env bash") || s.contains("#!/bin/bash") {
return Some(LanguageType::Bash.name().to_string());
}
if s.contains("#!/usr/bin/env python") {
return Some(LanguageType::Python.name().to_string());
}
None
}
}
/// Classifies `bytes` with a [`ContentInspector`] that uses the default thresholds.
#[inline]
#[must_use]
pub fn inspect(bytes: &[u8]) -> ContentType {
    let inspector = ContentInspector::default();
    inspector.inspect(bytes)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Runs `guess_language` and lowercases the answer so that assertions do
    /// not depend on tokei's capitalisation of language names.
    fn guess_lowercase(name: &str, content: &[u8]) -> Option<String> {
        ContentInspector::default()
            .guess_language(&PathBuf::from(name), content)
            .map(|lang| lang.to_ascii_lowercase())
    }

    #[test]
    fn binary_vs_text() {
        let inspector = ContentInspector::default();

        // Five NUL bytes exceed the default limit of four.
        let binary_sample = vec![0, 1, 2, 0, 0, 0, 5];
        assert_eq!(inspector.inspect(&binary_sample), ContentType::BINARY);

        let text_sample = b"Hello\nWorld";
        assert_eq!(inspector.inspect(text_sample), ContentType::TEXT);
    }

    #[test]
    fn mime_guess() {
        let inspector = ContentInspector::default();
        let mime_of = |name: &str| inspector.guess_mime_type(&PathBuf::from(name));

        assert_eq!(mime_of("a.md"), Some("text/plain".into()));
        assert_eq!(mime_of("img.png"), Some("image/png".into()));
        assert_eq!(mime_of("x.xyz"), None);
    }

    #[test]
    fn charset_guess() {
        let inspector = ContentInspector::default();

        assert_eq!(inspector.guess_charset("ok".as_bytes()), Some("UTF-8".into()));
        assert_eq!(inspector.guess_charset(&[0xFF, 0xFE, 0xFD]), None);
    }

    #[test]
    fn language_guess() {
        // By extension.
        assert_eq!(guess_lowercase("main.rs", b""), Some("rust".into()));
        // By content marker.
        assert_eq!(guess_lowercase("x", b"<?php echo; ?>"), Some("php".into()));
        // By shebang.
        assert_eq!(guess_lowercase("run", b"#!/bin/bash\necho hi"), Some("bash".into()));
    }
}

View file

@ -9,6 +9,7 @@
pub mod blob;
pub mod bstring_escape;
pub mod content_type;
pub mod entropy;
pub mod error;
pub mod git_commit_metadata;
@ -20,6 +21,7 @@ pub use blob::{
Blob, BlobAppearance, BlobAppearanceSet, BlobData, BlobId, BlobIdMap, BlobMetadata,
};
pub use bstring_escape::Escaped;
pub use content_type::{ContentInspector, ContentType};
pub use entropy::calculate_shannon_entropy;
pub use error::{Error, Result};
pub use git_commit_metadata::CommitMetadata;

View file

@ -52,6 +52,7 @@
//! - **validation-all**: Enable all validation features
mod finding;
pub mod primitives;
mod scanner;
mod scanner_pool;

View file

@ -0,0 +1,187 @@
//! Shared matching primitives for secret detection.
//!
//! These functions are used by both the high-level `Scanner` API and the
//! binary crate's `Matcher`. Having a single canonical implementation
//! eliminates duplicated logic across the codebase.
use std::hash::{Hash, Hasher};
use base64::{engine::general_purpose, Engine};
use kingfisher_core::OffsetSpan;
use rustc_hash::{FxHashMap, FxHasher};
use xxhash_rust::xxh3::xxh3_64;
// -------------------------------------------------------------------------------------------------
// Base64 detection
// -------------------------------------------------------------------------------------------------
/// Decoded Base64 data with position information.
///
/// Produced by `get_base64_strings`, one value per Base64 run that decoded
/// successfully to ASCII.
#[derive(Debug, Clone)]
pub struct DecodedData {
/// The decoded bytes.
pub decoded: Vec<u8>,
/// Byte offset in the scanned input where the encoded run begins.
pub pos_start: usize,
/// Byte offset one past the end of the encoded run, including any trailing
/// `=` padding that was consumed.
pub pos_end: usize,
}
/// Returns `true` when `b` belongs to the standard or URL-safe Base64 alphabet.
///
/// That is ASCII letters and digits plus `+`, `/` (standard) and `-`, `_`
/// (URL-safe). The padding character `=` is not part of the alphabet here.
#[inline]
pub fn is_base64_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'-' | b'_')
}
/// Scans `input` for standalone Base64 runs and returns the ones that decode.
///
/// A run is a maximal sequence of Base64-alphabet bytes followed by at most
/// two `=` padding bytes. A run is kept only when:
/// - its length (padding included) is at least 32 and a multiple of 4,
/// - it decodes with the STANDARD, URL_SAFE or URL_SAFE_NO_PAD engine
///   (tried in that order), and
/// - the decoded bytes are all ASCII.
///
/// Each result carries the decoded bytes and the run's byte offsets in `input`.
pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
    let mut found = Vec::new();
    let mut cursor = 0;

    while cursor < input.len() {
        // Skip ahead to the next byte that could start a run.
        cursor += input[cursor..].iter().take_while(|&&b| !is_base64_byte(b)).count();
        let run_start = cursor;

        // Consume the alphabet bytes, then up to two padding bytes.
        cursor += input[cursor..].iter().take_while(|&&b| is_base64_byte(b)).count();
        cursor += input[cursor..].iter().take(2).take_while(|&&b| b == b'=').count();
        let run_end = cursor;

        let candidate = &input[run_start..run_end];
        if candidate.len() < 32 || candidate.len() % 4 != 0 {
            continue;
        }

        // Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD.
        let attempt = general_purpose::STANDARD
            .decode(candidate)
            .or_else(|_| general_purpose::URL_SAFE.decode(candidate))
            .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(candidate));

        match attempt {
            Ok(decoded) if decoded.is_ascii() => {
                found.push(DecodedData { decoded, pos_start: run_start, pos_end: run_end });
            }
            _ => {}
        }
    }

    found
}
// -------------------------------------------------------------------------------------------------
// Match deduplication
// -------------------------------------------------------------------------------------------------
/// Derives a 64-bit deduplication key from the matched bytes, the rule ID and
/// the match span.
///
/// The four components are fed to an `FxHasher` in the order `content`,
/// `rule_id`, `start`, `end`, without any intermediate allocation.
#[inline]
pub fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
    let mut state = FxHasher::default();
    // A tuple hashes its fields one after another, in order.
    (content, rule_id, start, end).hash(&mut state);
    state.finish()
}
/// Inserts a span into a list of spans sorted by start offset, handling containment.
///
/// Returns `false`, leaving `spans` untouched, if the new span is already
/// fully contained in a neighbouring existing span (i.e., it's redundant and
/// should be skipped). Otherwise every existing span that the new span fully
/// contains is removed, the new span is inserted at its sorted position, and
/// `true` is returned.
///
/// Only the spans adjacent to the insertion point are examined, so this
/// relies on `spans` being sorted by `start`.
#[inline]
pub fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
    let idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);

    // Do all rejection checks before mutating, so a rejected span never
    // changes the list.
    if idx > 0 && spans[idx - 1].fully_contains(&span) {
        return false;
    }
    if idx < spans.len() && spans[idx].fully_contains(&span) {
        return false;
    }

    // The predecessor can only be swallowed when it starts where `span` starts.
    let mut insert_at = idx;
    if insert_at > 0 && span.fully_contains(&spans[insert_at - 1]) {
        spans.remove(insert_at - 1);
        insert_at -= 1;
    }

    // Remove the whole run of following spans that the new span covers, not
    // just the first one; leaving any behind would keep nested spans in the list.
    let swallowed =
        spans[insert_at..].iter().take_while(|existing| span.fully_contains(*existing)).count();
    spans.drain(insert_at..insert_at + swallowed);

    spans.insert(insert_at, span);
    true
}
/// Registers `span` under `rule_id`, creating the rule's span list on first use.
///
/// Returns `false` when `insert_span` reports the span as redundant (already
/// covered by a span recorded for the same rule), `true` when it was recorded.
#[inline]
pub fn record_match(
    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
    rule_id: usize,
    span: OffsetSpan,
) -> bool {
    let rule_spans = map.entry(rule_id).or_default();
    insert_span(rule_spans, span)
}
// -------------------------------------------------------------------------------------------------
// Finding fingerprint
// -------------------------------------------------------------------------------------------------
/// Computes a stable 64-bit fingerprint for a finding.
///
/// The fingerprint is the XXH3-64 hash of the following bytes, concatenated
/// in this order with no separators:
/// 1. `finding_value` (UTF-8),
/// 2. `file_or_commit` (UTF-8),
/// 3. `offset_start` as 8 little-endian bytes,
/// 4. `offset_end` as 8 little-endian bytes.
pub fn compute_finding_fingerprint(
    finding_value: &str,
    file_or_commit: &str,
    offset_start: u64,
    offset_end: u64,
) -> u64 {
    let start_le = offset_start.to_le_bytes();
    let end_le = offset_end.to_le_bytes();

    // The layout must stay exactly as is: changing it changes every fingerprint.
    let parts: [&[u8]; 4] =
        [finding_value.as_bytes(), file_or_commit.as_bytes(), &start_le, &end_le];
    let payload = parts.concat();

    xxh3_64(&payload)
}
// -------------------------------------------------------------------------------------------------
// Secret capture selection
// -------------------------------------------------------------------------------------------------
/// Picks the capture group that holds the "secret" part of a regex match.
///
/// Priority, highest first:
/// 1. The first matched named group whose name is TOKEN (case-insensitive)
/// 2. The first matched named group of any name
/// 3. The first positional group (group 1)
/// 4. The full match (group 0)
pub fn find_secret_capture<'a>(
    re: &regex::bytes::Regex,
    captures: &regex::bytes::Captures<'a>,
) -> regex::bytes::Match<'a> {
    // One pass over the group names: return as soon as a matched TOKEN group
    // is seen, and remember the first matched named group as the runner-up.
    let mut first_named = None;
    for (idx, name) in re.capture_names().enumerate() {
        if let (Some(name), Some(hit)) = (name, captures.get(idx)) {
            if name.eq_ignore_ascii_case("TOKEN") {
                return hit;
            }
            if first_named.is_none() {
                first_named = Some(hit);
            }
        }
    }

    // Fall back through group 1 to group 0. Group 0 is the overall match, so
    // the final unwrap is only reached with a present value.
    first_named.or_else(|| captures.get(1)).or_else(|| captures.get(0)).unwrap()
}

View file

@ -5,15 +5,13 @@ use std::path::Path;
use std::sync::Arc;
use anyhow::Result;
use base64::{engine::general_purpose, Engine};
use kingfisher_core::{calculate_shannon_entropy, Blob, BlobIdMap, LocationMapping, OffsetSpan};
use kingfisher_rules::RulesDatabase;
use regex::bytes::Regex;
use rustc_hash::{FxHashMap, FxHashSet};
use tracing::debug;
use xxhash_rust::xxh3::xxh3_64;
use crate::finding::{Finding, FindingLocation};
use crate::primitives;
use crate::scanner_pool::ScannerPool;
/// Configuration options for the scanner.
@ -185,21 +183,15 @@ impl Scanner {
let current_span = OffsetSpan::from_range(start..end);
// Check for overlapping spans
if !self.record_span(&mut previous_spans, rule_id, current_span) {
if !primitives::record_match(&mut previous_spans, rule_id, current_span) {
continue;
}
let haystack = &bytes[start..end];
for captures in anchored_regex.captures_iter(haystack) {
let full_capture = match captures.get(0) {
Some(c) => c,
None => continue,
};
// Get the primary secret value
let secret_capture =
self.get_secret_capture(&anchored_regex, &captures, full_capture);
let secret_capture = primitives::find_secret_capture(&anchored_regex, &captures);
let secret_bytes = secret_capture.as_bytes();
// Check entropy
@ -211,7 +203,7 @@ impl Scanner {
}
// Compute match key for dedup
let match_key = self.compute_match_key(
let match_key = primitives::compute_match_key(
secret_bytes,
rule.id().as_bytes(),
start + secret_capture.start(),
@ -242,7 +234,7 @@ impl Scanner {
}
}
let fingerprint = self.compute_fingerprint(
let fingerprint = primitives::compute_finding_fingerprint(
&secret,
&blob.id().to_string(),
offset_span.start as u64,
@ -295,92 +287,6 @@ impl Scanner {
// In a real implementation, you'd want to add a clear method or use a different approach
}
fn get_secret_capture<'a>(
&self,
regex: &Regex,
captures: &regex::bytes::Captures<'a>,
full_capture: regex::bytes::Match<'a>,
) -> regex::bytes::Match<'a> {
// Prefer named capture called TOKEN
for (i, name_opt) in regex.capture_names().enumerate() {
if let Some(name) = name_opt {
if name.eq_ignore_ascii_case("TOKEN") {
if let Some(cap) = captures.get(i) {
return cap;
}
}
}
}
// Otherwise, first named capture
for (i, name_opt) in regex.capture_names().enumerate() {
if name_opt.is_some() {
if let Some(cap) = captures.get(i) {
return cap;
}
}
}
// Otherwise, first positional capture (group 1)
if let Some(cap) = captures.get(1) {
return cap;
}
// Fall back to full match
full_capture
}
fn record_span(
&self,
map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
rule_id: usize,
span: OffsetSpan,
) -> bool {
let spans = map.entry(rule_id).or_default();
// Binary search for insertion point
let idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
// Check if new span is contained in an existing one
if idx > 0 && spans[idx - 1].fully_contains(&span) {
return false;
}
if idx < spans.len() && spans[idx].fully_contains(&span) {
return false;
}
// Remove spans that the new span contains
let remove_idx = idx;
while remove_idx < spans.len() && span.fully_contains(&spans[remove_idx]) {
spans.remove(remove_idx);
}
if idx > 0 && span.fully_contains(&spans[idx - 1]) {
spans.remove(idx - 1);
}
spans.insert(idx.min(spans.len()), span);
true
}
fn compute_match_key(&self, content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
use std::hash::{Hash, Hasher};
let mut hasher = rustc_hash::FxHasher::default();
content.hash(&mut hasher);
rule_id.hash(&mut hasher);
start.hash(&mut hasher);
end.hash(&mut hasher);
hasher.finish()
}
fn compute_fingerprint(&self, value: &str, blob_id: &str, start: u64, end: u64) -> u64 {
let mut buf = Vec::with_capacity(value.len() + blob_id.len() + 16);
buf.extend_from_slice(value.as_bytes());
buf.extend_from_slice(blob_id.as_bytes());
buf.extend_from_slice(&start.to_le_bytes());
buf.extend_from_slice(&end.to_le_bytes());
xxh3_64(&buf)
}
fn redact(&self, bytes: &[u8]) -> String {
let s = String::from_utf8_lossy(bytes);
if s.len() <= 8 {
@ -400,7 +306,7 @@ impl Scanner {
let bytes = blob.bytes();
// Find Base64-encoded strings
let b64_items = self.find_base64_strings(bytes);
let b64_items = primitives::get_base64_strings(bytes);
for item in b64_items {
// Try to match decoded content against all rules
@ -411,12 +317,7 @@ impl Scanner {
};
for captures in regex.captures_iter(&item.decoded) {
let full_capture = match captures.get(0) {
Some(c) => c,
None => continue,
};
let secret_capture = self.get_secret_capture(&regex, &captures, full_capture);
let secret_capture = primitives::find_secret_capture(&regex, &captures);
let secret_bytes = secret_capture.as_bytes();
let min_entropy =
@ -426,7 +327,7 @@ impl Scanner {
continue;
}
let match_key = self.compute_match_key(
let match_key = primitives::compute_match_key(
secret_bytes,
rule.id().as_bytes(),
item.pos_start,
@ -455,7 +356,7 @@ impl Scanner {
}
}
let fingerprint = self.compute_fingerprint(
let fingerprint = primitives::compute_finding_fingerprint(
&secret,
&blob.id().to_string(),
offset_span.start as u64,
@ -488,62 +389,6 @@ impl Scanner {
findings
}
fn find_base64_strings(&self, input: &[u8]) -> Vec<DecodedData> {
let mut results = Vec::new();
let mut i = 0;
while i < input.len() {
// Skip non-base64 characters
while i < input.len() && !Self::is_base64_byte(input[i]) {
i += 1;
}
let start = i;
// Collect base64 characters
while i < input.len() && Self::is_base64_byte(input[i]) {
i += 1;
}
// Handle padding
let mut eq_count = 0;
while i < input.len() && input[i] == b'=' && eq_count < 2 {
i += 1;
eq_count += 1;
}
let end = i;
let len = end - start;
if len >= 32 && len % 4 == 0 {
let base64_slice = &input[start..end];
// Try decoding
let decode_result = general_purpose::STANDARD
.decode(base64_slice)
.or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
.or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));
if let Ok(decoded) = decode_result {
if decoded.is_ascii() {
results.push(DecodedData { decoded, pos_start: start, pos_end: end });
}
}
}
}
results
}
#[inline]
fn is_base64_byte(b: u8) -> bool {
matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_')
}
}
struct DecodedData {
decoded: Vec<u8>,
pos_start: usize,
pos_end: usize,
}
#[cfg(test)]

View file

@ -18,6 +18,22 @@ pub(crate) mod postgres;
mod report;
mod slack;
/// Trait for access map providers that map a single token to an access profile.
///
/// This covers the majority of providers (GitHub, GitLab, Slack, HuggingFace,
/// Gitea, Bitbucket). Providers with more complex credentials (AWS, GCP, Azure,
/// Postgres, MongoDB) use their own custom interfaces.
///
/// Implementations must be `Send + Sync`.
pub trait TokenAccessMapper: Send + Sync {
/// The cloud/platform name for results (e.g., `"github"`, `"slack"`).
///
/// `map_token` passes this to `build_failed_result` when mapping fails.
fn cloud_name(&self) -> &'static str;
/// Maps a single token to an access map result.
///
/// Declared as returning `impl Future + Send` rather than as a plain
/// `async fn` so that the `Send` bound on the returned future is part of
/// the trait contract.
fn map_access_from_token(
&self,
token: &str,
) -> impl std::future::Future<Output = Result<AccessMapResult>> + Send;
}
/// Run the identity mapping workflow for the selected cloud provider.
pub async fn run(args: AccessMapArgs) -> Result<()> {
let result = match args.provider {
@ -244,24 +260,15 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
.unwrap_or_else(|err| build_failed_result("azure_devops", "pat", err)),
fingerprint,
),
AccessMapRequest::Github { token, fingerprint } => (
github::map_access_from_token(&token)
.await
.unwrap_or_else(|err| build_failed_result("github", "token", err)),
fingerprint,
),
AccessMapRequest::Gitlab { token, fingerprint } => (
gitlab::map_access_from_token(&token)
.await
.unwrap_or_else(|err| build_failed_result("gitlab", "token", err)),
fingerprint,
),
AccessMapRequest::Slack { token, fingerprint } => (
slack::map_access_from_token(&token)
.await
.unwrap_or_else(|err| build_failed_result("slack", "token", err)),
fingerprint,
),
AccessMapRequest::Github { token, fingerprint } => {
(map_token(&GithubMapper, &token).await, fingerprint)
}
AccessMapRequest::Gitlab { token, fingerprint } => {
(map_token(&GitlabMapper, &token).await, fingerprint)
}
AccessMapRequest::Slack { token, fingerprint } => {
(map_token(&SlackMapper, &token).await, fingerprint)
}
AccessMapRequest::Postgres { uri, fingerprint } => (
postgres::map_access_from_uri(&uri)
.await
@ -274,24 +281,15 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
.unwrap_or_else(|err| build_failed_result("mongodb", "uri", err)),
fingerprint,
),
AccessMapRequest::HuggingFace { token, fingerprint } => (
huggingface::map_access_from_token(&token)
.await
.unwrap_or_else(|err| build_failed_result("huggingface", "token", err)),
fingerprint,
),
AccessMapRequest::Gitea { token, fingerprint } => (
gitea::map_access_from_token(&token)
.await
.unwrap_or_else(|err| build_failed_result("gitea", "token", err)),
fingerprint,
),
AccessMapRequest::Bitbucket { token, fingerprint } => (
bitbucket::map_access_from_token(&token)
.await
.unwrap_or_else(|err| build_failed_result("bitbucket", "token", err)),
fingerprint,
),
AccessMapRequest::HuggingFace { token, fingerprint } => {
(map_token(&HuggingFaceMapper, &token).await, fingerprint)
}
AccessMapRequest::Gitea { token, fingerprint } => {
(map_token(&GiteaMapper, &token).await, fingerprint)
}
AccessMapRequest::Bitbucket { token, fingerprint } => {
(map_token(&BitbucketMapper, &token).await, fingerprint)
}
};
mapped.fingerprint = Some(fp);
@ -301,12 +299,106 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
results
}
/// Runs `mapper` on `token` and always yields an `AccessMapResult`.
///
/// When the mapper returns an error, the error is converted into a failed
/// result via `build_failed_result`, labelled with the mapper's cloud name
/// and the credential kind `"token"`.
async fn map_token(mapper: &impl TokenAccessMapper, token: &str) -> AccessMapResult {
    match mapper.map_access_from_token(token).await {
        Ok(mapped) => mapped,
        Err(err) => build_failed_result(mapper.cloud_name(), "token", err),
    }
}
/// Write HTML/JSON outputs for a collection of identity map results.
///
/// Calls `report::generate_html_report_multi` with `results` and `html_out`
/// and propagates its error, if any.
///
/// NOTE(review): only the HTML report generator is invoked here; confirm
/// whether JSON output is produced inside it or the summary line is stale.
pub fn write_reports(results: &[AccessMapResult], html_out: &std::path::Path) -> Result<()> {
report::generate_html_report_multi(results, html_out)?;
Ok(())
}
// -------------------------------------------------------------------------------------------------
// TokenAccessMapper implementations
// -------------------------------------------------------------------------------------------------
/// GitHub access mapper.
///
/// Unit struct whose [`TokenAccessMapper`] impl forwards to the `github` module.
pub struct GithubMapper;
impl TokenAccessMapper for GithubMapper {
/// Provider label used for GitHub results: `"github"`.
fn cloud_name(&self) -> &'static str {
"github"
}
/// Delegates to `github::map_access_from_token`.
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
github::map_access_from_token(token).await
}
}
/// GitLab access mapper.
///
/// Unit struct whose [`TokenAccessMapper`] impl forwards to the `gitlab` module.
pub struct GitlabMapper;
impl TokenAccessMapper for GitlabMapper {
/// Provider label used for GitLab results: `"gitlab"`.
fn cloud_name(&self) -> &'static str {
"gitlab"
}
/// Delegates to `gitlab::map_access_from_token`.
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
gitlab::map_access_from_token(token).await
}
}
/// Slack access mapper.
///
/// Unit struct whose [`TokenAccessMapper`] impl forwards to the `slack` module.
pub struct SlackMapper;
impl TokenAccessMapper for SlackMapper {
/// Provider label used for Slack results: `"slack"`.
fn cloud_name(&self) -> &'static str {
"slack"
}
/// Delegates to `slack::map_access_from_token`.
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
slack::map_access_from_token(token).await
}
}
/// HuggingFace access mapper.
///
/// Unit struct whose [`TokenAccessMapper`] impl forwards to the `huggingface` module.
pub struct HuggingFaceMapper;
impl TokenAccessMapper for HuggingFaceMapper {
/// Provider label used for HuggingFace results: `"huggingface"`.
fn cloud_name(&self) -> &'static str {
"huggingface"
}
/// Delegates to `huggingface::map_access_from_token`.
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
huggingface::map_access_from_token(token).await
}
}
/// Gitea access mapper.
///
/// Unit struct whose [`TokenAccessMapper`] impl forwards to the `gitea` module.
pub struct GiteaMapper;
impl TokenAccessMapper for GiteaMapper {
/// Provider label used for Gitea results: `"gitea"`.
fn cloud_name(&self) -> &'static str {
"gitea"
}
/// Delegates to `gitea::map_access_from_token`.
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
gitea::map_access_from_token(token).await
}
}
/// Bitbucket access mapper.
///
/// Unit struct whose [`TokenAccessMapper`] impl forwards to the `bitbucket` module.
pub struct BitbucketMapper;
impl TokenAccessMapper for BitbucketMapper {
/// Provider label used for Bitbucket results: `"bitbucket"`.
fn cloud_name(&self) -> &'static str {
"bitbucket"
}
/// Delegates to `bitbucket::map_access_from_token`.
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
bitbucket::map_access_from_token(token).await
}
}
// -------------------------------------------------------------------------------------------------
// Helper functions
// -------------------------------------------------------------------------------------------------
fn severity_to_str(severity: Severity) -> &'static str {
match severity {
Severity::Low => "low",

View file

@ -13,13 +13,12 @@ use std::{
// let us opt into newer API versions as Microsoft rolls them out.
use anyhow::{anyhow, Context, Result};
use globset::{Glob, GlobSet, GlobSetBuilder};
use indicatif::{ProgressBar, ProgressStyle};
use serde::Deserialize;
use tracing::warn;
use url::{form_urlencoded, Url};
use crate::{findings_store, git_url::GitUrl};
use crate::{findings_store, git_host, git_url::GitUrl};
const API_VERSION: &str = "7.1-preview.1";
@ -55,33 +54,6 @@ impl RepoSpecifiers {
}
}
#[derive(Debug)]
struct ExcludeMatcher {
exact: HashSet<String>,
globs: Option<GlobSet>,
}
impl ExcludeMatcher {
fn matches(&self, name: &str) -> bool {
let candidate = name.to_lowercase();
if self.exact.contains(&candidate) {
return true;
}
if let Some(globs) = &self.globs {
return globs.is_match(&candidate);
}
false
}
fn is_empty(&self) -> bool {
self.exact.is_empty() && self.globs.is_none()
}
}
fn looks_like_glob(pattern: &str) -> bool {
pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
}
fn encode_segment(segment: &str) -> String {
form_urlencoded::byte_serialize(segment.as_bytes()).collect::<String>()
}
@ -129,7 +101,7 @@ fn parse_repo_identifier_from_path(path: &str) -> Option<String> {
let org = org.to_lowercase();
let project_raw = project.to_string();
if looks_like_glob(&project_raw) {
if git_host::looks_like_glob(&project_raw) {
let pattern = format!("{org}/{}/**", project_raw.to_lowercase());
return Some(pattern);
}
@ -197,59 +169,16 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
parse_repo_identifier_from_path(trimmed)
}
fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
let mut exact = HashSet::new();
let mut glob_builder = GlobSetBuilder::new();
let mut has_glob = false;
for raw in exclude_repos {
match parse_excluded_repo(raw) {
Some(name) => {
let normalized = name.to_lowercase();
if looks_like_glob(&normalized) {
match Glob::new(&normalized) {
Ok(glob) => {
glob_builder.add(glob);
has_glob = true;
}
Err(err) => {
warn!("Ignoring invalid Azure exclusion pattern '{raw}': {err}");
exact.insert(normalized);
}
}
} else {
exact.insert(normalized);
}
}
None => {
warn!("Ignoring invalid Azure exclusion '{raw}' (expected organization/project[/repository])");
}
}
}
let globs = if has_glob {
match glob_builder.build() {
Ok(set) => Some(set),
Err(err) => {
warn!("Failed to build Azure exclusion patterns: {err}");
None
}
}
} else {
None
};
ExcludeMatcher { exact, globs }
fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
git_host::build_exclude_matcher(
exclude_repos,
|raw| parse_excluded_repo(raw).map(|name| name.to_lowercase()),
"Azure",
)
}
fn should_exclude_repo(repo_url: &str, excludes: &ExcludeMatcher) -> bool {
if excludes.is_empty() {
return false;
}
if let Some(name) = parse_repo_identifier_from_url(repo_url) {
return excludes.matches(&name);
}
false
fn should_exclude_repo(repo_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
git_host::should_exclude_repo(repo_url, excludes, parse_repo_identifier_from_url)
}
#[derive(Debug, Deserialize, Default)]

View file

@ -6,8 +6,8 @@ use std::{
time::Duration,
};
use crate::git_host;
use anyhow::{Context, Result};
use globset::{Glob, GlobSet, GlobSetBuilder};
use indicatif::{ProgressBar, ProgressStyle};
use reqwest::Url;
use serde::Deserialize;
@ -136,32 +136,6 @@ impl BitbucketKind {
}
}
#[derive(Debug)]
struct ExcludeMatcher {
exact: HashSet<String>,
globs: Option<GlobSet>,
}
impl ExcludeMatcher {
fn matches(&self, name: &str) -> bool {
if self.exact.contains(name) {
return true;
}
if let Some(globs) = &self.globs {
return globs.is_match(name);
}
false
}
fn is_empty(&self) -> bool {
self.exact.is_empty() && self.globs.is_none()
}
}
fn looks_like_glob(pattern: &str) -> bool {
pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
}
fn normalize_repo_identifier(owner: &str, repo: &str) -> Option<String> {
let owner = owner.trim().trim_matches('/');
let repo = repo.trim().trim_matches('/');
@ -207,58 +181,12 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
parse_repo_name_from_path(trimmed)
}
fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
let mut exact = HashSet::new();
let mut glob_builder = GlobSetBuilder::new();
let mut has_glob = false;
for raw in exclude_repos {
match parse_excluded_repo(raw) {
Some(name) => {
if looks_like_glob(&name) {
match Glob::new(&name) {
Ok(glob) => {
glob_builder.add(glob);
has_glob = true;
}
Err(err) => {
warn!("Ignoring invalid Bitbucket exclusion pattern '{raw}': {err}");
exact.insert(name);
}
}
} else {
exact.insert(name);
}
}
None => {
warn!("Ignoring invalid Bitbucket exclusion '{raw}' (expected owner/repo)");
}
}
}
let globs = if has_glob {
match glob_builder.build() {
Ok(set) => Some(set),
Err(err) => {
warn!("Failed to build Bitbucket exclusion patterns: {err}");
None
}
}
} else {
None
};
ExcludeMatcher { exact, globs }
fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
git_host::build_exclude_matcher(exclude_repos, parse_excluded_repo, "Bitbucket")
}
fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
if excludes.is_empty() {
return false;
}
if let Some(name) = parse_repo_name_from_url(clone_url) {
return excludes.matches(&name);
}
false
fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url)
}
fn repo_clone_url_from_links(links: &[CloneLink]) -> Option<String> {
@ -343,7 +271,7 @@ async fn fetch_cloud_repositories(
owner: &str,
auth: &AuthConfig,
repo_filter: RepoType,
excludes: &ExcludeMatcher,
excludes: &git_host::ExcludeMatcher,
results: &mut Vec<String>,
) -> Result<()> {
let mut next = base
@ -387,7 +315,7 @@ async fn fetch_server_repositories(
path: &str,
auth: &AuthConfig,
repo_filter: RepoType,
excludes: &ExcludeMatcher,
excludes: &git_host::ExcludeMatcher,
results: &mut Vec<String>,
) -> Result<()> {
let mut start = 0u64;

View file

@ -1,232 +1,2 @@
use once_cell::sync::Lazy;
use std::path::Path;
use tokei::LanguageType;
// Flattened table of every (shebang prefix bytes, language) pair reported by
// `LanguageType::list()`, built lazily on first access.
// Entries are ordered longest prefix first, so a more specific shebang
// (e.g. "#!/usr/bin/env python3") is tried before a shorter one that is also
// a prefix of it ("#!/usr/bin/env python").
static SHEBANG_PREFIXES: Lazy<Vec<(&'static [u8], LanguageType)>> = Lazy::new(|| {
    let mut table: Vec<(&'static [u8], LanguageType)> = LanguageType::list()
        .iter()
        .flat_map(|&(lang, shebangs)| shebangs.iter().map(move |&sb| (sb.as_bytes(), lang)))
        .collect();
    // Stable sort, descending by prefix length.
    table.sort_by_key(|&(prefix, _)| std::cmp::Reverse(prefix.len()));
    table
});
/// The type of content detected.
///
/// This is the result type of `ContentInspector::inspect` and of the free
/// `inspect` shorthand.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ContentType {
/// Unprintable or control-heavy data: too many NUL bytes, or too large a
/// share of control bytes.
BINARY,
/// Mostly printable text.
TEXT,
}
/// Heuristic thresholds for text vs. binary detection.
pub struct ContentInspector {
/// Input containing more NUL bytes than this is classified as `BINARY`.
max_null_bytes: usize,
/// Input whose fraction of control bytes (below 0x20, not counting `\n`,
/// `\r`, `\t`) exceeds this is classified as `BINARY`.
max_control_ratio: f64,
}
impl Default for ContentInspector {
fn default() -> Self {
Self { max_null_bytes: 4, max_control_ratio: 0.3 }
}
}
impl ContentInspector {
/// Create a new inspector with default thresholds.
#[inline]
pub fn new() -> Self {
Default::default()
}
/// Classify `bytes` as TEXT or BINARY:
///
/// 1. If null-byte count > `max_null_bytes` -- `BINARY`.
/// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` →
/// `BINARY`.
/// 3. Otherwise, `TEXT`.
#[inline]
#[must_use]
pub fn inspect(&self, bytes: &[u8]) -> ContentType {
let nulls = bytes.iter().filter(|&&b| b == 0).count();
if nulls > self.max_null_bytes {
return ContentType::BINARY;
}
let controls =
bytes.iter().filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t')).count();
let ratio = if bytes.is_empty() { 0.0 } else { controls as f64 / bytes.len() as f64 };
if ratio > self.max_control_ratio {
ContentType::BINARY
} else {
ContentType::TEXT
}
}
/// Guess MIME type from `path` extension.
///
/// Returns:
/// - `Some(mime)` if the extension is one of the known text or image types.
/// - `None` if there is no extension or its unrecognized.
#[inline]
#[must_use]
pub fn guess_mime_type(&self, path: &Path) -> Option<String> {
let ext = path.extension()?.to_str()?.to_ascii_lowercase();
let mime = match ext.as_str() {
"txt" | "md" | "rst" => "text/plain",
"html" | "htm" => "text/html",
"css" => "text/css",
"js" => "application/javascript",
"json" => "application/json",
"xml" => "application/xml",
"pdf" => "application/pdf",
"jpg" | "jpeg" => "image/jpeg",
"png" => "image/png",
"gif" => "image/gif",
_ => return None,
};
Some(mime.to_string())
}
/// Detect UTF-8 encoding by attempting a lossless conversion.
#[inline]
#[must_use]
pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
}
/// Guess programming language with broad coverage using `tokei`.
///
/// Strategy (no disk I/O):
/// 1) Try extension via `LanguageType::from_file_extension`.
/// 2) Handle common extensionless filenames (e.g., Makefile, Dockerfile, CMakeLists.txt).
/// 3) Parse an in-memory shebang (first line) against `LanguageType::shebangs`.
/// 4) Minimal content markers as a last resort.
///
/// Returns the canonical `tokei` language name (e.g., `Rust`, `Bash`, `Python`).
#[inline]
#[must_use]
pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option<String> {
// 1) Extension mapping (fast, no I/O).
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) {
return Some(lang.name().to_string());
}
}
// 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure).
if let Some(file) = path.file_name().and_then(|f| f.to_str()) {
match file {
"Makefile" | "makefile" => {
return Some(LanguageType::Makefile.name().to_string());
}
"Dockerfile" | "dockerfile" => {
return Some(LanguageType::Dockerfile.name().to_string());
}
"CMakeLists.txt" => {
return Some(LanguageType::CMake.name().to_string());
}
"Rakefile" | "rakefile" => {
return Some(LanguageType::Rakefile.name().to_string());
}
// Common ecosystem files; map to their primary language where sensible.
"Gemfile" | "gemfile" => {
return Some(LanguageType::Ruby.name().to_string());
}
_ => {}
}
}
// 3) Shebang detection (in-memory): match by longest prefix, byte-wise (no UTF-8 needed).
if let Some(first_line) = content.split(|&b| b == b'\n').next() {
if first_line.starts_with(b"#!") {
for (prefix, lang) in SHEBANG_PREFIXES.iter() {
if first_line.starts_with(prefix) {
return Some(lang.name().to_string());
}
}
}
}
// 4) Lightweight content markers to catch a few ubiquitous cases without I/O.
let s = String::from_utf8_lossy(content);
if s.contains("<?php") {
return Some(LanguageType::Php.name().to_string());
}
if s.contains("package main") {
return Some(LanguageType::Go.name().to_string());
}
if s.contains("public class") {
return Some(LanguageType::Java.name().to_string());
}
if s.contains("#!/usr/bin/env bash") || s.contains("#!/bin/bash") {
return Some(LanguageType::Bash.name().to_string());
}
if s.contains("#!/usr/bin/env python") {
return Some(LanguageType::Python.name().to_string());
}
None
}
}
/// Convenience wrapper: classify `bytes` using an inspector built from the default thresholds.
#[inline]
#[must_use]
pub fn inspect(bytes: &[u8]) -> ContentType {
    let inspector = ContentInspector::default();
    inspector.inspect(bytes)
}
// Unit tests for the content-type heuristics; every test uses the default thresholds.
#[cfg(test)]
mod tests {
use std::path::PathBuf;
use super::*;
// Control-heavy input is BINARY; plain ASCII with a newline is TEXT.
#[test]
fn binary_vs_text() {
let ins = ContentInspector::default();
// Four NUL bytes do not exceed the default limit of 4, so this input is rejected by the
// control-byte ratio check instead (all seven bytes are below 0x20).
let bin = vec![0, 1, 2, 0, 0, 0, 5];
assert_eq!(ins.inspect(&bin), ContentType::BINARY);
let txt = b"Hello\nWorld";
assert_eq!(ins.inspect(txt), ContentType::TEXT);
}
// Known extensions map to a MIME type; unknown ones yield `None`.
#[test]
fn mime_guess() {
let ins = ContentInspector::default();
assert_eq!(ins.guess_mime_type(&PathBuf::from("a.md")), Some("text/plain".into()));
assert_eq!(ins.guess_mime_type(&PathBuf::from("img.png")), Some("image/png".into()));
assert_eq!(ins.guess_mime_type(&PathBuf::from("x.xyz")), None);
}
// Valid UTF-8 is reported as "UTF-8"; invalid byte sequences yield `None`.
#[test]
fn charset_guess() {
let ins = ContentInspector::default();
assert_eq!(ins.guess_charset("ok".as_bytes()), Some("UTF-8".into()));
assert_eq!(ins.guess_charset(&[0xFF, 0xFE, 0xFD]), None);
}
// Covers three detection paths: extension, content marker, and shebang.
#[test]
fn language_guess() {
let ins = ContentInspector::default();
// Compare case-insensitively by lowercasing both sides.
let rust =
ins.guess_language(&PathBuf::from("main.rs"), b"").map(|s| s.to_ascii_lowercase());
assert_eq!(rust, Some("rust".into()));
// No extension and no shebang: detected through the `<?php` content marker.
let php = ins
.guess_language(&PathBuf::from("x"), b"<?php echo; ?>")
.map(|s| s.to_ascii_lowercase());
assert_eq!(php, Some("php".into()));
// No extension: detected from the shebang on the first line.
let bash = ins
.guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi")
.map(|s| s.to_ascii_lowercase());
assert_eq!(bash, Some("bash".into()));
}
}
// Re-export from the canonical implementation in kingfisher-core.
pub use kingfisher_core::content_type::*;

121
src/git_host.rs Normal file
View file

@ -0,0 +1,121 @@
//! Shared utilities for Git hosting platform integrations.
//!
//! This module provides the `ExcludeMatcher` struct and related helpers
//! used across all platform integration modules (GitHub, GitLab, Azure,
//! Bitbucket, Gitea).
use std::collections::HashSet;
use globset::{Glob, GlobSet, GlobSetBuilder};
use tracing::warn;
// -------------------------------------------------------------------------------------------------
// ExcludeMatcher
// -------------------------------------------------------------------------------------------------
/// Matches repository names against exact strings and glob patterns.
///
/// Used by all platform integrations to filter out excluded repositories.
pub struct ExcludeMatcher {
/// Names that must match verbatim.
exact: HashSet<String>,
/// Compiled glob patterns; `None` when no glob was supplied or the set failed to build.
globs: Option<GlobSet>,
}
impl ExcludeMatcher {
    /// `true` when there is nothing to exclude: no exact names and no compiled glob set.
    pub fn is_empty(&self) -> bool {
        self.globs.is_none() && self.exact.is_empty()
    }

    /// Tests `name` against the exact-name set first, then against the glob set if one exists.
    pub fn matches(&self, name: &str) -> bool {
        if self.exact.contains(name) {
            true
        } else {
            match &self.globs {
                Some(set) => set.is_match(name),
                None => false,
            }
        }
    }
}
/// Returns `true` if the pattern contains glob metacharacters.
///
/// Recognised metacharacters are `*`, `?`, `[` (character class) and `{`
/// (alternation such as `{a,b}`), which `globset` interprets specially by
/// default. Patterns without any of them are treated as exact names by
/// callers.
pub fn looks_like_glob(pattern: &str) -> bool {
    pattern.chars().any(|c| matches!(c, '*' | '?' | '[' | '{'))
}
/// Compiles a list of raw exclusion entries into an [`ExcludeMatcher`].
///
/// Every entry is first normalised by `parse_fn` (for example into
/// `"owner/repo"` or `"group/project"`); entries for which it returns `None`
/// are skipped with a warning. A normalised name that looks like a glob is
/// compiled as one; if compilation fails, a warning is logged and the name is
/// kept as an exact match instead. All other names are exact matches.
///
/// `platform_name` only appears in the warning messages (e.g. `"GitHub"`,
/// `"GitLab"`).
pub fn build_exclude_matcher(
    exclude_repos: &[String],
    parse_fn: impl Fn(&str) -> Option<String>,
    platform_name: &str,
) -> ExcludeMatcher {
    let mut exact_names = HashSet::new();
    let mut builder = GlobSetBuilder::new();
    let mut saw_glob = false;

    for raw in exclude_repos {
        let Some(name) = parse_fn(raw) else {
            warn!("Ignoring invalid {platform_name} exclusion '{raw}'");
            continue;
        };

        if !looks_like_glob(&name) {
            exact_names.insert(name);
            continue;
        }

        match Glob::new(&name) {
            Ok(glob) => {
                builder.add(glob);
                saw_glob = true;
            }
            Err(err) => {
                warn!(
                    "Ignoring invalid {platform_name} exclusion pattern '{raw}': {err}"
                );
                exact_names.insert(name);
            }
        }
    }

    // Only build a glob set when at least one pattern was accepted; a build
    // failure is logged and leaves the matcher with exact names only.
    let globs = if saw_glob {
        builder
            .build()
            .map_err(|err| warn!("Failed to build {platform_name} exclusion patterns: {err}"))
            .ok()
    } else {
        None
    };

    ExcludeMatcher { exact: exact_names, globs }
}
/// Decides whether the repository at `repo_url` is excluded.
///
/// `extract_name` turns the URL into the normalised identifier that the
/// matcher was built with (e.g. `"owner/repo"` from
/// `"https://github.com/owner/repo.git"`). It is not called when `excludes`
/// is empty, and a URL it cannot parse is never excluded.
pub fn should_exclude_repo(
    repo_url: &str,
    excludes: &ExcludeMatcher,
    extract_name: impl Fn(&str) -> Option<String>,
) -> bool {
    if excludes.is_empty() {
        return false;
    }
    match extract_name(repo_url) {
        Some(name) => excludes.matches(&name),
        None => false,
    }
}

View file

@ -1,14 +1,13 @@
use std::{collections::HashSet, env, str::FromStr, time::Duration};
use anyhow::{anyhow, Result};
use globset::{Glob, GlobSet, GlobSetBuilder};
use indicatif::{ProgressBar, ProgressStyle};
use reqwest::StatusCode;
use serde::Deserialize;
use tracing::warn;
use url::Url;
use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT};
use crate::{git_host, git_url::GitUrl, validation::GLOBAL_USER_AGENT};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RepoType {
@ -55,31 +54,6 @@ struct GiteaOrganization {
username: String,
}
struct ExcludeMatcher {
exact: HashSet<String>,
globs: Option<GlobSet>,
}
impl ExcludeMatcher {
fn matches(&self, name: &str) -> bool {
if self.exact.contains(name) {
return true;
}
if let Some(globs) = &self.globs {
return globs.is_match(name);
}
false
}
fn is_empty(&self) -> bool {
self.exact.is_empty() && self.globs.is_none()
}
}
fn looks_like_glob(pattern: &str) -> bool {
pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
}
fn normalize_repo_identifier(raw: &str) -> Option<String> {
let trimmed = raw.trim().trim_matches('/');
if trimmed.is_empty() {
@ -113,51 +87,11 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
normalize_repo_identifier(trimmed)
}
fn build_exclude_matcher(excludes: &[String]) -> ExcludeMatcher {
let mut exact = HashSet::new();
let mut glob_builder = GlobSetBuilder::new();
let mut has_glob = false;
for raw in excludes {
match parse_excluded_repo(raw) {
Some(name) => {
if looks_like_glob(&name) {
match Glob::new(&name) {
Ok(glob) => {
glob_builder.add(glob);
has_glob = true;
}
Err(err) => {
warn!("Ignoring invalid Gitea exclusion pattern '{raw}': {err}");
exact.insert(name);
}
}
} else {
exact.insert(name);
}
}
None => {
warn!("Ignoring invalid Gitea exclusion '{raw}' (expected owner/repo)");
}
}
}
let globs = if has_glob {
match glob_builder.build() {
Ok(set) => Some(set),
Err(err) => {
warn!("Failed to build Gitea exclusion patterns: {err}");
None
}
}
} else {
None
};
ExcludeMatcher { exact, globs }
fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
git_host::build_exclude_matcher(exclude_repos, |raw| parse_excluded_repo(raw), "Gitea")
}
fn should_exclude_repo(repo: &GiteaRepository, excludes: &ExcludeMatcher) -> bool {
fn should_exclude_repo(repo: &GiteaRepository, excludes: &git_host::ExcludeMatcher) -> bool {
if excludes.is_empty() {
return false;
}
@ -169,7 +103,7 @@ async fn fetch_paginated_repos(
token: Option<&str>,
mut url: Url,
repo_filter: RepoType,
excludes: &ExcludeMatcher,
excludes: &git_host::ExcludeMatcher,
progress: Option<&ProgressBar>,
) -> Result<Vec<String>> {
let mut page = 1u32;
@ -221,7 +155,7 @@ async fn fetch_user_repos(
api_url: &Url,
username: &str,
repo_filter: RepoType,
excludes: &ExcludeMatcher,
excludes: &git_host::ExcludeMatcher,
progress: Option<&ProgressBar>,
) -> Result<Vec<String>> {
let endpoint = format!("users/{}/repos", username);
@ -235,7 +169,7 @@ async fn fetch_org_repos(
api_url: &Url,
org: &str,
repo_filter: RepoType,
excludes: &ExcludeMatcher,
excludes: &git_host::ExcludeMatcher,
progress: Option<&ProgressBar>,
) -> Result<Vec<String>> {
let endpoint = format!("orgs/{}/repos", org);

View file

@ -7,7 +7,6 @@ use std::{
};
use anyhow::{Context, Result};
use globset::{Glob, GlobSet, GlobSetBuilder};
use indicatif::{ProgressBar, ProgressStyle};
use octorust::{
auth::Credentials,
@ -119,83 +118,14 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
parse_repo_name_from_path(trimmed)
}
struct ExcludeMatcher {
exact: HashSet<String>,
globs: Option<GlobSet>,
use crate::git_host;
fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
git_host::build_exclude_matcher(exclude_repos, |raw| parse_excluded_repo(raw), "GitHub")
}
impl ExcludeMatcher {
fn is_empty(&self) -> bool {
self.exact.is_empty() && self.globs.is_none()
}
fn matches(&self, name: &str) -> bool {
if self.exact.contains(name) {
return true;
}
if let Some(globs) = &self.globs {
return globs.is_match(name);
}
false
}
}
fn looks_like_glob(pattern: &str) -> bool {
pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
}
fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
let mut exact = HashSet::new();
let mut glob_builder = GlobSetBuilder::new();
let mut has_glob = false;
for raw in exclude_repos {
match parse_excluded_repo(raw) {
Some(name) => {
if looks_like_glob(&name) {
match Glob::new(&name) {
Ok(glob) => {
glob_builder.add(glob);
has_glob = true;
}
Err(err) => {
warn!("Ignoring invalid GitHub exclusion pattern '{raw}': {err}");
exact.insert(name);
}
}
} else {
exact.insert(name);
}
}
None => {
warn!("Ignoring invalid GitHub exclusion '{raw}' (expected owner/repo)");
}
}
}
let globs = if has_glob {
match glob_builder.build() {
Ok(set) => Some(set),
Err(err) => {
warn!("Failed to build GitHub exclusion patterns: {err}");
None
}
}
} else {
None
};
ExcludeMatcher { exact, globs }
}
fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
if excludes.is_empty() {
return false;
}
if let Some(name) = parse_repo_name_from_url(clone_url) {
return excludes.matches(&name);
}
false
/// Returns whether the repository behind `clone_url` is excluded.
///
/// Delegates to `git_host::should_exclude_repo`, using
/// `parse_repo_name_from_url` to turn the clone URL into the name that is
/// checked against `excludes`.
fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url)
}
fn create_github_client(github_url: &url::Url, ignore_certs: bool) -> Result<Arc<Client>> {
// Try personal access token

View file

@ -16,7 +16,6 @@ use gitlab::{
},
Gitlab, GitlabBuilder,
};
use globset::{Glob, GlobSet, GlobSetBuilder};
use indicatif::{ProgressBar, ProgressStyle};
use reqwest::StatusCode;
use serde::Deserialize;
@ -25,7 +24,7 @@ use tokio::task;
use tracing::{info, warn};
use url::{form_urlencoded, Url};
use crate::{findings_store, git_url::GitUrl};
use crate::{findings_store, git_host, git_url::GitUrl};
use std::str::FromStr;
#[derive(Deserialize)]
@ -128,83 +127,12 @@ fn parse_excluded_project(raw: &str) -> Option<String> {
parse_project_path(trimmed)
}
struct ExcludeMatcher {
exact: HashSet<String>,
globs: Option<GlobSet>,
fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
git_host::build_exclude_matcher(exclude_repos, |raw| parse_excluded_project(raw), "GitLab")
}
impl ExcludeMatcher {
fn is_empty(&self) -> bool {
self.exact.is_empty() && self.globs.is_none()
}
fn matches(&self, name: &str) -> bool {
if self.exact.contains(name) {
return true;
}
if let Some(globs) = &self.globs {
return globs.is_match(name);
}
false
}
}
fn looks_like_glob(pattern: &str) -> bool {
pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
}
fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
let mut exact = HashSet::new();
let mut glob_builder = GlobSetBuilder::new();
let mut has_glob = false;
for raw in exclude_repos {
match parse_excluded_project(raw) {
Some(name) => {
if looks_like_glob(&name) {
match Glob::new(&name) {
Ok(glob) => {
glob_builder.add(glob);
has_glob = true;
}
Err(err) => {
warn!("Ignoring invalid GitLab exclusion pattern '{raw}': {err}");
exact.insert(name);
}
}
} else {
exact.insert(name);
}
}
None => {
warn!("Ignoring invalid GitLab exclusion '{raw}' (expected group/project)");
}
}
}
let globs = if has_glob {
match glob_builder.build() {
Ok(set) => Some(set),
Err(err) => {
warn!("Failed to build GitLab exclusion patterns: {err}");
None
}
}
} else {
None
};
ExcludeMatcher { exact, globs }
}
fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
if excludes.is_empty() {
return false;
}
if let Some(name) = parse_project_path_from_url(clone_url) {
return excludes.matches(&name);
}
false
/// Returns whether the project behind `clone_url` is excluded.
///
/// Delegates to `git_host::should_exclude_repo`, using
/// `parse_project_path_from_url` to turn the clone URL into the project path
/// that is checked against `excludes`.
fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
git_host::should_exclude_repo(clone_url, excludes, parse_project_path_from_url)
}
fn create_gitlab_client(gitlab_url: &Url, ignore_certs: bool) -> Result<Gitlab> {

View file

@ -24,6 +24,7 @@ pub mod findings_store;
pub mod gcs;
pub mod git_binary;
pub mod git_commit_metadata;
pub mod git_host;
pub mod git_metadata_graph;
mod git_repo_enumerator;
pub mod git_url;

View file

@ -0,0 +1,2 @@
// Re-export from the canonical implementation in kingfisher-scanner.
pub use kingfisher_scanner::primitives::{get_base64_strings, DecodedData};

140
src/matcher/captures.rs Normal file
View file

@ -0,0 +1,140 @@
use bstr::BString;
use regex::bytes::Regex;
use schemars::{
gen::SchemaGenerator,
schema::{ArrayValidation, InstanceType, Schema},
JsonSchema,
};
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use crate::{snippet::Base64BString, util::intern};
// -------------------------------------------------------------------------------------------------
// Group
// -------------------------------------------------------------------------------------------------
/// A single captured byte string, stored as a [`Base64BString`].
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
pub struct Group(pub Base64BString);

impl Group {
    /// Copies the bytes covered by the regex match `m` into a new `Group`.
    pub fn new(m: regex::bytes::Match<'_>) -> Self {
        let matched: &[u8] = m.as_bytes();
        Group(Base64BString(BString::from(matched)))
    }
}
// -------------------------------------------------------------------------------------------------
// Groups
// -------------------------------------------------------------------------------------------------
/// An ordered list of [`Group`]s, held in a `SmallVec` with inline room for one element.
#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct Groups(pub SmallVec<[Group; 1]>);

/// Hand-written schema: `Groups` is described as a JSON array whose items follow the
/// [`Group`] schema.
impl JsonSchema for Groups {
    fn schema_name() -> String {
        String::from("Groups")
    }

    fn json_schema(gen: &mut SchemaGenerator) -> Schema {
        let item_schema = gen.subschema_for::<Group>();
        let array_rules =
            ArrayValidation { items: Some(item_schema.into()), ..Default::default() };
        let object = schemars::schema::SchemaObject {
            instance_type: Some(InstanceType::Array.into()),
            array: Some(Box::new(array_rules)),
            ..Default::default()
        };
        Schema::Object(object)
    }
}
/// One regex capture group in a form that can be serialized.
///
/// `Serialize` is implemented by hand for this type, so the `value` field is written through
/// `display_value()` rather than as the raw interned string.
#[derive(Debug, Clone, JsonSchema)]
pub struct SerializableCapture {
/// Name of the capture group, when the regex gives it one.
pub name: Option<&'static str>,
/// Index of the capture group within the regex match (0 is the whole match).
pub match_number: i32,
/// Start offset of the capture, as reported by the regex match.
pub start: usize,
/// End offset of the capture, as reported by the regex match.
pub end: usize,
/// Interned original (unredacted) value.
#[serde(skip_serializing, skip_deserializing)]
pub value: &'static str,
}
impl SerializableCapture {
/// Returns the original captured value, exactly as stored in `value`.
pub fn raw_value(&self) -> &'static str {
self.value
}
/// Returns the value that should be shown in user-facing output.
///
/// Delegates to `crate::util::display_value`; the transformation itself lives there
/// (NOTE(review): presumably redaction — confirm against `util`).
pub fn display_value(&self) -> std::borrow::Cow<'static, str> {
crate::util::display_value(self.value)
}
}
/// Manual `Serialize`: emits the five fields in declaration order, with `value` taken from
/// `display_value()` instead of the raw stored string.
impl serde::Serialize for SerializableCapture {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeStruct;

        let Self { name, match_number, start, end, .. } = self;
        let mut out = serializer.serialize_struct("SerializableCapture", 5)?;
        out.serialize_field("name", name)?;
        out.serialize_field("match_number", match_number)?;
        out.serialize_field("start", start)?;
        out.serialize_field("end", end)?;
        out.serialize_field("value", &self.display_value())?;
        out.end()
    }
}
/// The serializable captures of one regex match.
///
/// Built by `from_captures`: either every explicit capture group that participated in the
/// match, or just the whole match when the regex has no explicit groups.
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCaptures {
/// Captures in group order; described to schemars as a plain `Vec`.
#[schemars(with = "Vec<SerializableCapture>")]
pub captures: SmallVec<[SerializableCapture; 2]>,
}
impl SerializableCaptures {
    /// Converts regex `captures` into their serializable form.
    ///
    /// - When the regex has explicit groups (`captures.len() > 1`), only groups `1..` that
    ///   actually participated in the match are kept; group 0 is left out.
    /// - When there is only the implicit whole-match group, that group is kept as the fallback.
    ///
    /// Captured bytes are converted to text lossily and interned. `_input` is unused.
    pub fn from_captures(captures: &regex::bytes::Captures, _input: &[u8], re: &Regex) -> Self {
        let names: SmallVec<[Option<&'static str>; 4]> =
            re.capture_names().map(|name| name.map(intern)).collect();

        // Builds the entry for group `index`, or `None` when that group did not participate.
        let build = |index: usize, match_number: i32| -> Option<SerializableCapture> {
            let group = captures.get(index)?;
            let text = String::from_utf8_lossy(group.as_bytes());
            let value = intern(text.as_ref());
            Some(SerializableCapture {
                name: names.get(index).and_then(|name| *name),
                match_number,
                start: group.start(),
                end: group.end(),
                value,
            })
        };

        let group_count = captures.len();
        let mut collected: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
        if group_count > 1 {
            // Explicit groups only; an index that does not fit in i32 is reported as 0.
            collected.extend(
                (1..group_count)
                    .filter_map(|index| build(index, i32::try_from(index).unwrap_or(0))),
            );
        } else if group_count == 1 {
            // Only the whole match exists: use group 0.
            collected.extend(build(0, 0));
        }

        SerializableCaptures { captures: collected }
    }
}

251
src/matcher/conversion.rs Normal file
View file

@ -0,0 +1,251 @@
use std::sync::Arc;
use http::StatusCode;
use schemars::JsonSchema;
use serde::Serialize;
use xxhash_rust::xxh3::xxh3_64;
use crate::{
blob::BlobId,
location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
rules::rule::Rule,
validation_body::{self, ValidationResponseBody},
};
use super::{captures::SerializableCaptures, BlobMatch};
use kingfisher_scanner::primitives::compute_finding_fingerprint;
// -------------------------------------------------------------------------------------------------
// OwnedBlobMatch
// -------------------------------------------------------------------------------------------------
/// An owned record of a single match, convertible to and from [`Match`].
#[derive(Clone)]
pub struct OwnedBlobMatch {
/// The rule that produced this match.
pub rule: Arc<Rule>,
/// Identifier of the blob in which the match was found.
pub blob_id: BlobId,
/// The unique content-based identifier of this match
pub finding_fingerprint: u64,
/// Offset span of the matching input.
pub matching_input_offset_span: OffsetSpan,
/// The serialized capture groups of the match.
pub captures: SerializableCaptures,
/// Body of the validation response.
pub validation_response_body: ValidationResponseBody,
/// HTTP status code of the validation response.
pub validation_response_status: StatusCode,
/// Whether validation succeeded.
pub validation_success: bool,
/// Entropy value calculated for the match.
pub calculated_entropy: f32,
pub is_base64: bool,
/// Variables captured from dependent rules (from depends_on_rule).
/// Maps variable name (uppercase) to captured value.
pub dependent_captures: std::collections::BTreeMap<String, String>,
}
impl OwnedBlobMatch {
    /// Rebuilds an `OwnedBlobMatch` from a [`Match`] and the rule that produced it.
    ///
    /// The stored `u16` status is converted back into a [`StatusCode`]; a value that is not a
    /// valid status code falls back to `StatusCode::CONTINUE`.
    pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
        OwnedBlobMatch {
            rule,
            blob_id: m.blob_id,
            finding_fingerprint: m.finding_fingerprint,
            matching_input_offset_span: m.location.offset_span.clone(),
            captures: m.groups.clone(),
            validation_response_body: m.validation_response_body.clone(),
            validation_response_status: StatusCode::from_u16(m.validation_response_status)
                .unwrap_or(StatusCode::CONTINUE),
            validation_success: m.validation_success,
            calculated_entropy: m.calculated_entropy,
            is_base64: m.is_base64,
            dependent_captures: m.dependent_captures.clone(),
        }
    }

    /// Converts a scanner-side `BlobMatch` into an `OwnedBlobMatch` and computes its
    /// fingerprint from the selected capture value, the blob id, and the match offsets.
    ///
    /// `dependent_captures` starts out empty.
    ///
    /// # Panics
    ///
    /// Panics if a span offset cannot be converted to `u64`.
    pub fn from_blob_match(blob_match: BlobMatch) -> Self {
        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
        //
        // This indexing is intentionally different from the internal `validation_dedup_key()`
        // (which uses get(0)) to maintain stable external fingerprints. Changing this would break:
        // - Historical baselines that rely on fingerprint matching
        // - Dedup entries stored in external systems
        //
        // For rules with nested captures like (?<REGEX>...(ABC)...), this may pick up
        // the inner group, but that behavior is now established and must be preserved.
        //
        // The capture value is already an interned `&'static str`, so it is used directly;
        // an absent capture contributes the empty string.
        let finding_value: &'static str = blob_match
            .captures
            .captures
            .get(1)
            .or_else(|| blob_match.captures.captures.get(0))
            .map(|capture| capture.raw_value())
            .unwrap_or("");

        // Use blob_id as the file/commit identifier
        let file_or_commit = &blob_match.blob_id.to_string();
        let offset_start: u64 = blob_match.matching_input_offset_span.start.try_into().unwrap();
        let offset_end: u64 = blob_match.matching_input_offset_span.end.try_into().unwrap();
        let finding_fingerprint =
            compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end);

        OwnedBlobMatch {
            rule: blob_match.rule,
            blob_id: blob_match.blob_id.clone(),
            finding_fingerprint,
            matching_input_offset_span: blob_match.matching_input_offset_span,
            captures: blob_match.captures.clone(),
            validation_response_body: blob_match.validation_response_body,
            validation_response_status: blob_match.validation_response_status,
            validation_success: blob_match.validation_success,
            calculated_entropy: blob_match.calculated_entropy,
            is_base64: blob_match.is_base64,
            dependent_captures: std::collections::BTreeMap::new(),
        }
    }
}
// -------------------------------------------------------------------------------------------------
// Match
// -------------------------------------------------------------------------------------------------
/// A single reported match: where it was found, what was captured, and the validation outcome.
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct Match {
/// The location of the entire matching content
pub location: Location,
/// The capture groups
pub groups: SerializableCaptures, // Store serialized captures
/// unique identifier of file / blob where this match was found
pub blob_id: BlobId,
/// The unique content-based identifier of this match
pub finding_fingerprint: u64,
/// The rule that produced this match
#[serde(skip_serializing)]
#[schemars(skip)]
pub rule: Arc<Rule>,
/// Validation Body
#[serde(
default,
serialize_with = "validation_body::serialize",
deserialize_with = "validation_body::deserialize"
)]
#[schemars(schema_with = "validation_body::schema")]
pub validation_response_body: ValidationResponseBody,
/// Validation Status Code
pub validation_response_status: u16,
/// Validation Success
pub validation_success: bool,
/// Entropy value calculated for the match
pub calculated_entropy: f32,
/// Copied from the rule's `visible()` setting when the match is built
pub visible: bool,
#[serde(default)]
pub is_base64: bool,
/// Variables captured from dependent rules (from depends_on_rule).
/// Maps variable name (uppercase) to captured value.
#[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
pub dependent_captures: std::collections::BTreeMap<String, String>,
}
impl Match {
    /// Builds a reportable `Match` from an [`OwnedBlobMatch`].
    ///
    /// - `loc_mapping`: when present, used to resolve the offset span into a source span;
    ///   when absent, the source span is all zeros.
    /// - `origin_type`: used as the file/commit component of the fingerprint.
    ///
    /// The fingerprint is recomputed here from the selected capture value, `origin_type`, and
    /// the match offsets; the fingerprint stored on `owned_blob_match` is not reused.
    ///
    /// # Panics
    ///
    /// Panics if a span offset cannot be converted to `u64`.
    #[inline]
    pub fn convert_owned_blobmatch_to_match<'a>(
        loc_mapping: Option<&'a LocationMapping<'a>>,
        owned_blob_match: &'a OwnedBlobMatch,
        origin_type: &'a str,
    ) -> Self {
        let offset_span = owned_blob_match.matching_input_offset_span;

        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
        // See comment in from_blob_match() for why this differs from validation_dedup_key().
        // The fingerprint is based on the content of the secret; the capture value is already
        // a `&str`, and an absent capture contributes the empty string.
        let finding_value_for_fp: &str = owned_blob_match
            .captures
            .captures
            .get(1)
            .or_else(|| owned_blob_match.captures.captures.get(0))
            .map(|capture| capture.raw_value())
            .unwrap_or("");

        let source_span =
            loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
                start: SourcePoint { line: 0, column: 0 },
                end: SourcePoint { line: 0, column: 0 },
            });

        let offset_start: u64 = offset_span.start.try_into().unwrap();
        let offset_end: u64 = offset_span.end.try_into().unwrap();
        let finding_fingerprint = compute_finding_fingerprint(
            finding_value_for_fp,
            origin_type, // file_or_commit,
            offset_start,
            offset_end,
        );

        Match {
            rule: owned_blob_match.rule.clone(),
            visible: owned_blob_match.rule.visible().to_owned(),
            location: Location::with_source_span(offset_span, Some(source_span)),
            groups: owned_blob_match.captures.clone(),
            blob_id: owned_blob_match.blob_id,
            finding_fingerprint,
            validation_response_body: owned_blob_match.validation_response_body.clone(),
            validation_response_status: owned_blob_match.validation_response_status.as_u16(),
            validation_success: owned_blob_match.validation_success,
            calculated_entropy: owned_blob_match.calculated_entropy,
            is_base64: owned_blob_match.is_base64,
            dependent_captures: owned_blob_match.dependent_captures.clone(),
        }
    }

    /// Returns the `blob_id` of the match.
    pub fn get_blob_id(&self) -> BlobId {
        self.blob_id
    }

    /// Returns a decimal string identifier for the finding.
    ///
    /// The id is the xxh3-64 hash of the rule's SHA-1 fingerprint, a NUL separator, and the
    /// JSON serialization of `groups`, with the top bit cleared.
    ///
    /// # Panics
    ///
    /// Panics if `groups` cannot be serialized to JSON.
    pub fn finding_id(&self) -> String {
        let mut buffer = Vec::with_capacity(128);
        buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes());
        buffer.push(0);
        serde_json::to_writer(&mut buffer, &self.groups)
            .expect("should be able to serialize groups as JSON");
        let mut num = xxh3_64(&buffer);
        // Ensure the number is positive and within i64 range
        num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive
        num.to_string()
    }
}
// -------------------------------------------------------------------------------------------------
// MatcherStats
// -------------------------------------------------------------------------------------------------
/// Counters of blobs and bytes seen versus scanned by a matcher.
#[derive(Debug, Default, Clone)]
pub struct MatcherStats {
    pub blobs_seen: u64,
    pub blobs_scanned: u64,
    pub bytes_seen: u64,
    pub bytes_scanned: u64,
}

impl MatcherStats {
    /// Adds every counter of `other` onto the corresponding counter of `self`.
    pub fn update(&mut self, other: &Self) {
        let Self { blobs_seen, blobs_scanned, bytes_seen, bytes_scanned } = other;
        self.blobs_seen += blobs_seen;
        self.blobs_scanned += blobs_scanned;
        self.bytes_seen += bytes_seen;
        self.bytes_scanned += bytes_scanned;
    }
}

2
src/matcher/dedup.rs Normal file
View file

@ -0,0 +1,2 @@
// Re-export from the canonical implementation in kingfisher-scanner.
pub(crate) use kingfisher_scanner::primitives::{compute_match_key, record_match};

274
src/matcher/filter.rs Normal file
View file

@ -0,0 +1,274 @@
use std::sync::Arc;
use http::StatusCode;
use regex::bytes::Regex;
use rustc_hash::{FxHashMap, FxHashSet};
use tracing::debug;
use crate::{
blob::Blob,
entropy::calculate_shannon_entropy,
inline_ignore::InlineIgnoreConfig,
location::OffsetSpan,
origin::OriginSet,
rule_profiling::{ConcurrentRuleProfiler, RuleTimer},
rules::rule::{PatternRequirementContext, PatternValidationResult, Rule, Validation},
safe_list::{is_safe_match, is_user_match},
validation::{is_parseable_mongodb_uri, is_parseable_mysql_uri, is_parseable_postgres_uri},
};
use super::{
captures::SerializableCaptures,
dedup::{compute_match_key, record_match},
BlobMatch,
};
// Re-use the canonical secret capture selection from kingfisher-scanner.
use kingfisher_scanner::primitives::find_secret_capture;
// -------------------------------------------------------------------------------------------------
// Entropy and safe-list check
// -------------------------------------------------------------------------------------------------
/// Applies the entropy threshold and the safe-list checks to a candidate match.
///
/// Returns `Some(entropy)` when the Shannon entropy of `entropy_bytes` is strictly greater
/// than `min_entropy` and neither `is_safe_match` nor `is_user_match` flags the match;
/// returns `None` (after a debug log line) when the match should be skipped.
fn check_entropy_and_safelist(
    entropy_bytes: &[u8],
    full_bytes: &[u8],
    min_entropy: f32,
) -> Option<f32> {
    let entropy = calculate_shannon_entropy(entropy_bytes);

    // Evaluated left to right with short-circuiting: the cheap entropy comparison first.
    let rejected = entropy <= min_entropy
        || is_safe_match(entropy_bytes)
        || is_user_match(entropy_bytes, full_bytes);
    if rejected {
        debug!("Skipping match with entropy {} <= {} or safe match", entropy, min_entropy);
        return None;
    }

    Some(entropy)
}
// -------------------------------------------------------------------------------------------------
// Pattern requirements check
// -------------------------------------------------------------------------------------------------
/// Apply the rule's optional pattern requirements to a match.
///
/// Returns `true` when the rule declares no pattern requirements or when validation passes,
/// and `false` (after logging the reason at debug level) when the match must be dropped.
fn check_pattern_requirements(
    rule: &Rule,
    re: &Regex,
    captures: &regex::bytes::Captures,
    full_bytes: &[u8],
    entropy_bytes: &[u8],
    respect_ignore_if_contains: bool,
) -> bool {
    let char_reqs = match rule.pattern_requirements() {
        Some(requirements) => requirements,
        None => return true,
    };

    // Validate the whole match when the regex has any named group or more than one explicit
    // group (`captures.len()` counts group 0); otherwise validate just the secret bytes.
    let any_named_group = re.capture_names().any(|name| name.is_some());
    let validation_bytes =
        if any_named_group || captures.len() > 2 { full_bytes } else { entropy_bytes };

    let context = PatternRequirementContext { regex: re, captures, full_match: full_bytes };
    match char_reqs.validate(validation_bytes, Some(context), respect_ignore_if_contains) {
        PatternValidationResult::Passed => return true,
        PatternValidationResult::Failed => {
            debug!(
                "Skipping match that does not meet character requirements for rule {}",
                rule.id()
            );
        }
        PatternValidationResult::FailedChecksum { actual_len, expected_len } => {
            debug!(
                "Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})",
                rule.id(),
                actual_len,
                expected_len
            );
        }
        PatternValidationResult::IgnoredBySubstring { matched_term } => {
            debug!(
                "Skipping match for rule {} because it contains ignored term {matched_term}",
                rule.id()
            );
        }
    }
    false
}
// -------------------------------------------------------------------------------------------------
// URI validation
// -------------------------------------------------------------------------------------------------
/// Sanity-check database connection strings for rules whose validation is MongoDB, Postgres
/// or MySQL.
///
/// Returns `false` when the matched bytes are not UTF-8 or do not parse as a URI of the
/// rule's database kind. Rules with no validation, or with any other validation kind, always
/// return `true`.
fn check_uri_validation(rule: &Rule, matching_input_bytes: &[u8]) -> bool {
    match rule.syntax.validation.as_ref() {
        Some(Validation::MongoDB) => match std::str::from_utf8(matching_input_bytes) {
            Ok(uri) if is_parseable_mongodb_uri(uri) => true,
            Ok(_) => {
                debug!("Skipping match for rule {} due to invalid MongoDB URI", rule.id());
                false
            }
            Err(_) => {
                debug!("Skipping match for rule {} due to non-UTF8 MongoDB URI", rule.id());
                false
            }
        },
        Some(Validation::Postgres) => match std::str::from_utf8(matching_input_bytes) {
            Ok(uri) if is_parseable_postgres_uri(uri) => true,
            Ok(_) => {
                debug!("Skipping match for rule {} due to invalid Postgres URI", rule.id());
                false
            }
            Err(_) => {
                debug!("Skipping match for rule {} due to non-UTF8 Postgres URI", rule.id());
                false
            }
        },
        Some(Validation::MySQL) => match std::str::from_utf8(matching_input_bytes) {
            Ok(uri) if is_parseable_mysql_uri(uri) => true,
            Ok(_) => {
                debug!("Skipping match for rule {} due to invalid MySQL URI", rule.id());
                false
            }
            Err(_) => {
                debug!("Skipping match for rule {} due to non-UTF8 MySQL URI", rule.id());
                false
            }
        },
        // No validation configured, or a kind that needs no URI parsing.
        _ => true,
    }
}
// -------------------------------------------------------------------------------------------------
// filter_match — main entry point
// -------------------------------------------------------------------------------------------------
/// Runs `re` over one candidate region and appends every surviving match to `matches`.
///
/// The haystack is `ts_match` when provided, otherwise `blob.bytes()[start..end]`. Each regex
/// hit goes through these filters in order, and the first one that rejects it ends
/// processing of that hit:
///
/// 1. entropy / safe-list check on the capture chosen by `find_secret_capture`,
/// 2. the rule's pattern requirements,
/// 3. inline ignore directives (`inline_ignore_config`),
/// 4. URI validation for MongoDB / Postgres / MySQL rules,
/// 5. de-duplication via `seen_matches` (hashed key) and `previous_matches` (span record).
///
/// `_origin` and `_redact` are accepted but not used by this function. When `profiler` is
/// set, a `RuleTimer` is started up front and ended with the number of matches added here.
///
/// # Panics
///
/// Panics if `start..end` is not a valid range into the blob's bytes, or if a computed match
/// span falls outside the blob's bytes.
// `too_many_arguments` is allowed because the function takes 17 parameters.
#[allow(clippy::too_many_arguments)]
pub(crate) fn filter_match<'b>(
blob: &'b Blob,
rule: Arc<Rule>,
re: &Regex,
start: usize,
end: usize,
matches: &mut Vec<BlobMatch<'b>>,
previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
rule_id: usize,
seen_matches: &mut FxHashSet<u64>,
_origin: &OriginSet,
ts_match: Option<&[u8]>,
is_base64: bool,
_redact: bool,
filename: &str,
profiler: Option<&Arc<ConcurrentRuleProfiler>>,
respect_ignore_if_contains: bool,
inline_ignore_config: &InlineIgnoreConfig,
) {
// Only start a timer when profiling is enabled.
let mut timer =
profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
// Remembered so the number of matches added by this call can be reported to the timer.
let initial_len = matches.len();
let blob_bytes = blob.bytes();
let default_slice = &blob_bytes[start..end];
let haystack = ts_match.unwrap_or(default_slice);
for captures in re.captures_iter(haystack) {
// Group 0 (the whole match) is always present for a successful match.
let full_capture = captures.get(0).unwrap();
let matching_input_for_entropy = find_secret_capture(re, &captures);
let min_entropy = rule.min_entropy();
let entropy_bytes = matching_input_for_entropy.as_bytes();
let full_bytes = full_capture.as_bytes();
// Check entropy and safe-listing
let calculated_entropy =
match check_entropy_and_safelist(entropy_bytes, full_bytes, min_entropy) {
Some(e) => e,
None => continue,
};
// Check pattern requirements
if !check_pattern_requirements(
&rule,
re,
&captures,
full_bytes,
entropy_bytes,
respect_ignore_if_contains,
) {
continue;
}
// Use the `matching_input_for_entropy` as the span/key for the finding.
let matching_input = matching_input_for_entropy;
// Capture offsets are relative to `haystack`; adding `start` turns them into blob offsets.
// NOTE(review): when `ts_match` is supplied this assumes it begins at blob offset
// `start` -- confirm against the callers.
let matching_input_offset_span = OffsetSpan::from_range(
(start + matching_input.start())..(start + matching_input.end()),
);
// Check inline ignore directives
if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
debug!("Skipping match due to inline ignore directive");
continue;
}
// Check URI validation (MongoDB, Postgres, MySQL)
if !check_uri_validation(&rule, matching_input.as_bytes()) {
continue;
}
// Deduplication
let match_key = compute_match_key(
matching_input.as_bytes(),
rule.id().as_bytes(),
matching_input_offset_span.start,
matching_input_offset_span.end,
);
// `insert` returns false when this exact key was already recorded.
if !seen_matches.insert(match_key) {
continue;
}
// `record_match` returning false also drops the hit (span already accounted for).
if !record_match(previous_matches, rule_id, matching_input_offset_span) {
continue;
}
// Re-slice from the blob so the stored bytes borrow from the blob for lifetime `'b`.
let only_matching_input =
&blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
// Pass the *full* capture object to from_captures
let groups = SerializableCaptures::from_captures(&captures, haystack, re);
matches.push(BlobMatch {
rule: Arc::clone(&rule),
blob_id: blob.id_ref(),
matching_input: only_matching_input,
matching_input_offset_span,
captures: groups,
validation_response_body: None,
// NOTE(review): 0 is presumably not a valid HTTP status, so this looks like it always
// resolves to `StatusCode::CONTINUE` as a "not validated yet" placeholder -- confirm.
validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
validation_success: false,
calculated_entropy,
is_base64,
});
}
// Report how many matches this invocation added.
if let Some(t) = timer.take() {
let new_count = (matches.len() - initial_len) as u64;
t.end(new_count > 0, new_count, 0);
}
}

View file

@ -0,0 +1,2 @@
// Re-export from the canonical implementation in kingfisher-scanner.
pub use kingfisher_scanner::primitives::compute_finding_fingerprint;

View file

@ -1,42 +1,38 @@
use std::{
hash::{Hash, Hasher},
str,
sync::{Arc, Mutex},
};
mod base64_decode;
mod captures;
mod conversion;
mod dedup;
mod filter;
mod fingerprint;
// Re-export public API
pub use base64_decode::{get_base64_strings, DecodedData};
pub use captures::{Group, Groups, SerializableCapture, SerializableCaptures};
pub use conversion::{Match, MatcherStats, OwnedBlobMatch};
pub use fingerprint::compute_finding_fingerprint;
use std::sync::{Arc, Mutex};
use anyhow::Result;
use base64::{engine::general_purpose, Engine};
use bstr::BString;
use http::StatusCode;
use regex::bytes::Regex;
use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
use schemars::{
gen::SchemaGenerator,
schema::{ArrayValidation, InstanceType, Schema},
JsonSchema,
};
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use tracing::debug;
use xxhash_rust::xxh3::xxh3_64;
use rustc_hash::{FxHashMap, FxHashSet};
use crate::{
blob::{Blob, BlobId, BlobIdMap},
entropy::calculate_shannon_entropy,
inline_ignore::InlineIgnoreConfig,
location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
location::OffsetSpan,
origin::OriginSet,
parser,
parser::{Checker, Language},
rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
rules::rule::{PatternRequirementContext, PatternValidationResult, Rule, Validation},
rule_profiling::{ConcurrentRuleProfiler, RuleStats},
rules::rule::Rule,
rules_database::RulesDatabase,
safe_list::{is_safe_match, is_user_match},
scanner_pool::ScannerPool,
snippet::Base64BString,
util::intern,
validation::{is_parseable_mongodb_uri, is_parseable_mysql_uri, is_parseable_postgres_uri},
validation_body::{self, ValidationResponseBody},
validation_body::ValidationResponseBody,
};
use self::{
base64_decode::get_base64_strings as get_b64_strings, dedup::record_match, filter::filter_match,
};
const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
@ -58,94 +54,7 @@ struct RawMatch {
start_idx: u64,
end_idx: u64,
}
/// A match that owns its blob id (by value rather than by reference) and carries a
/// precomputed finding fingerprint.
#[derive(Clone)]
pub struct OwnedBlobMatch {
/// The rule that was matched
pub rule: Arc<Rule>,
/// Identifier of the blob that was matched
pub blob_id: BlobId,
/// The unique content-based identifier of this match
pub finding_fingerprint: u64,
/// Offset span of the matching input
pub matching_input_offset_span: OffsetSpan,
/// The capture groups from the match
pub captures: SerializableCaptures,
/// Body of the validation response
pub validation_response_body: ValidationResponseBody,
/// HTTP status of the validation response
pub validation_response_status: StatusCode,
/// Whether validation succeeded
pub validation_success: bool,
/// Entropy value calculated for the match
pub calculated_entropy: f32,
/// Base64 flag carried over from the originating match
pub is_base64: bool,
/// Variables captured from dependent rules (from depends_on_rule).
/// Maps variable name (uppercase) to captured value.
pub dependent_captures: std::collections::BTreeMap<String, String>,
}
impl<'a> Matcher<'a> {
    /// Returns the profiler's report, or `None` when this matcher has no profiler attached.
    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
        let profiler = self.profiler.as_ref()?;
        Some(profiler.generate_report())
    }
}
impl OwnedBlobMatch {
    /// Rebuilds an `OwnedBlobMatch` from a reported `Match` and the rule that produced it.
    ///
    /// All fields are copied or cloned from `m`. The numeric status stored on the `Match` is
    /// converted back into a `StatusCode`, falling back to `StatusCode::CONTINUE` when the
    /// number is not accepted by `StatusCode::from_u16`.
    pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
        OwnedBlobMatch {
            rule,
            blob_id: m.blob_id,
            finding_fingerprint: m.finding_fingerprint,
            matching_input_offset_span: m.location.offset_span,
            captures: m.groups.clone(),
            validation_response_body: m.validation_response_body.clone(),
            validation_response_status: StatusCode::from_u16(m.validation_response_status)
                .unwrap_or(StatusCode::CONTINUE),
            validation_success: m.validation_success,
            calculated_entropy: m.calculated_entropy,
            is_base64: m.is_base64,
            dependent_captures: m.dependent_captures.clone(),
        }
    }

    /// Converts a borrowed-blob `BlobMatch` into an `OwnedBlobMatch`, computing its finding
    /// fingerprint from the captured value, the blob id and the match offsets.
    ///
    /// `dependent_captures` starts out empty.
    pub fn from_blob_match(blob_match: BlobMatch) -> Self {
        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
        //
        // This indexing is intentionally different from the internal `validation_dedup_key()`
        // (which uses get(0)) to maintain stable external fingerprints. Changing this would break:
        // - Historical baselines that rely on fingerprint matching
        // - Dedup entries stored in external systems
        //
        // For rules with nested captures like (?<REGEX>...(ABC)...), this may pick up
        // the inner group, but that behavior is now established and must be preserved.
        //
        // `raw_value()` already yields a `&'static str`, so it is used directly; an empty
        // string stands in when there are no captures at all.
        let finding_value: &'static str = blob_match
            .captures
            .captures
            .get(1)
            .or_else(|| blob_match.captures.captures.get(0))
            .map(|capture| capture.raw_value())
            .unwrap_or("");

        // Use blob_id as the file/commit identifier
        let file_or_commit = blob_match.blob_id.to_string();
        let span = blob_match.matching_input_offset_span;
        let offset_start: u64 = span.start.try_into().expect("match offset must fit in u64");
        let offset_end: u64 = span.end.try_into().expect("match offset must fit in u64");
        let finding_fingerprint =
            compute_finding_fingerprint(finding_value, &file_or_commit, offset_start, offset_end);

        // `blob_match` is owned, so its fields are moved rather than cloned.
        OwnedBlobMatch {
            rule: blob_match.rule,
            blob_id: *blob_match.blob_id,
            finding_fingerprint,
            matching_input_offset_span: span,
            captures: blob_match.captures,
            validation_response_body: blob_match.validation_response_body,
            validation_response_status: blob_match.validation_response_status,
            validation_success: blob_match.validation_success,
            calculated_entropy: blob_match.calculated_entropy,
            is_base64: blob_match.is_base64,
            dependent_captures: std::collections::BTreeMap::new(),
        }
    }
}
// -------------------------------------------------------------------------------------------------
// BlobMatch
// -------------------------------------------------------------------------------------------------
@ -156,7 +65,7 @@ impl OwnedBlobMatch {
/// `Match`.
pub struct BlobMatch<'a> {
/// The rule that was matched
pub rule: Arc<Rule>, // Changed from `&'a Rule` to `Arc<Rule
pub rule: Arc<Rule>,
/// The blob that was matched
pub blob_id: &'a BlobId,
@ -168,7 +77,7 @@ pub struct BlobMatch<'a> {
pub matching_input_offset_span: OffsetSpan,
/// The capture groups from the match
pub captures: SerializableCaptures, // regex::bytes::Captures<'a>,
pub captures: SerializableCaptures,
pub validation_response_body: ValidationResponseBody,
pub validation_response_status: StatusCode,
@ -177,6 +86,7 @@ pub struct BlobMatch<'a> {
pub calculated_entropy: f32,
pub is_base64: bool,
}
#[derive(Clone)]
struct UserData {
/// A scratch vector for raw matches from Vectorscan, to minimize allocation
@ -185,6 +95,7 @@ struct UserData {
/// The length of the input being scanned
input_len: u64,
}
// -------------------------------------------------------------------------------------------------
// Matcher
// -------------------------------------------------------------------------------------------------
@ -222,6 +133,7 @@ pub struct Matcher<'a> {
/// Whether matches should honour `ignore_if_contains` requirements.
respect_ignore_if_contains: bool,
}
/// This `Drop` implementation updates the `global_stats` with the local stats
impl<'a> Drop for Matcher<'a> {
fn drop(&mut self) {
@ -231,11 +143,19 @@ impl<'a> Drop for Matcher<'a> {
}
}
}
/// Outcome of scanning a blob.
// NOTE(review): the meanings of the two `Seen*` variants are inferred from their names only;
// confirm against the code that constructs them.
pub enum ScanResult<'a> {
/// Presumably: the blob was already seen and had matches.
SeenWithMatches,
/// Presumably: the blob was already seen and had no matches.
SeenSansMatches,
/// The matches produced by a scan (this is what `scan_blob` returns on its normal path).
New(Vec<BlobMatch<'a>>),
}
impl<'a> Matcher<'a> {
    /// Returns the profiler's report, or `None` when this matcher has no profiler attached.
    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
        let profiler = self.profiler.as_ref()?;
        Some(profiler.generate_report())
    }
}
impl<'a> Matcher<'a> {
/// Create a new `Matcher` from the given `RulesDatabase`.
///
@ -255,8 +175,6 @@ impl<'a> Matcher<'a> {
// Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec
let raw_matches_scratch = Vec::new();
let user_data = UserData { raw_matches_scratch, input_len: 0 };
// let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?;
// pool is created once per scan run (see Scanner section below)
let profiler = shared_profiler.or_else(|| {
if enable_profiling {
Some(Arc::new(ConcurrentRuleProfiler::new()))
@ -349,7 +267,7 @@ impl<'a> Matcher<'a> {
let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
Vec::new()
} else {
get_base64_strings(blob.bytes())
get_b64_strings(blob.bytes())
};
let lang_hint = lang.as_deref();
@ -530,7 +448,7 @@ impl<'a> Matcher<'a> {
);
}
if depth + 1 < MAX_B64_DEPTH {
for nested in get_base64_strings(item.decoded.as_slice()) {
for nested in get_b64_strings(item.decoded.as_slice()) {
b64_stack.push((
DecodedData {
decoded: nested.decoded,
@ -559,268 +477,12 @@ impl<'a> Matcher<'a> {
if self.user_data.raw_matches_scratch.capacity()
> self.user_data.raw_matches_scratch.len() * 4
{
// Vec::shrink_to_fit may re-allocate, but were about to leave scan_blob
// Vec::shrink_to_fit may re-allocate, but we're about to leave scan_blob
// so the cost is hidden off the hot path.
self.user_data.raw_matches_scratch.shrink_to_fit();
}
Ok(ScanResult::New(matches))
// Ok(result)
}
}
/// Hashes the matched bytes, the rule id and the match offsets into one 64-bit key.
///
/// The four components are fed to a single `FxHasher` in argument order, without building
/// an intermediate buffer.
#[inline]
fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
    let mut state = FxHasher::default();
    Hash::hash(content, &mut state);
    Hash::hash(rule_id, &mut state);
    Hash::hash(&start, &mut state);
    Hash::hash(&end, &mut state);
    Hasher::finish(&state)
}
/// Inserts `span` into `spans` at the position given by a binary search on `start`.
///
/// Only the two neighbours of the insertion point are examined. If either one fully contains
/// `span`, nothing is inserted and `false` is returned. A neighbour that `span` fully
/// contains is removed. Otherwise `span` is inserted and `true` is returned.
#[inline]
fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
    let mut slot = match spans.binary_search_by(|existing| existing.start.cmp(&span.start)) {
        Ok(found) => found,
        Err(insert_at) => insert_at,
    };

    // Left neighbour.
    if slot > 0 {
        let left = slot - 1;
        if spans[left].fully_contains(&span) {
            return false;
        }
        if span.fully_contains(&spans[left]) {
            spans.remove(left);
            // Everything after `left` shifted down by one, including the insertion slot.
            slot = left;
        }
    }

    // Right neighbour (the element currently occupying the insertion slot).
    if slot < spans.len() {
        if spans[slot].fully_contains(&span) {
            return false;
        }
        if span.fully_contains(&spans[slot]) {
            spans.remove(slot);
        }
    }

    spans.insert(slot, span);
    true
}
/// Records `span` in the span list kept for `rule_id`, creating that list on first use.
///
/// Returns whatever `insert_span` returns for that list.
#[inline]
fn record_match(
    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
    rule_id: usize,
    span: OffsetSpan,
) -> bool {
    let spans_for_rule = map.entry(rule_id).or_default();
    insert_span(spans_for_rule, span)
}
/// Runs `re` over one candidate region and appends every surviving match to `matches`.
///
/// The haystack is `ts_match` when provided, otherwise `blob.bytes()[start..end]`. For each
/// regex hit the function picks the "secret" capture, then applies, in order: the entropy and
/// safe-list check, the rule's pattern requirements, inline ignore directives, URI validation
/// (MongoDB / Postgres / MySQL rules) and de-duplication. A hit rejected by any step is
/// skipped.
///
/// `_origin` and `_redact` are accepted but not used by this function. When `profiler` is
/// set, a `RuleTimer` is started up front and ended with the number of matches added here.
///
/// # Panics
///
/// Panics if `start..end` is not a valid range into the blob's bytes, or if a computed match
/// span falls outside the blob's bytes.
// `too_many_arguments` is allowed because the function takes 17 parameters.
#[allow(clippy::too_many_arguments)]
fn filter_match<'b>(
blob: &'b Blob,
rule: Arc<Rule>,
re: &Regex,
start: usize,
end: usize,
matches: &mut Vec<BlobMatch<'b>>,
previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
rule_id: usize,
seen_matches: &mut FxHashSet<u64>,
_origin: &OriginSet,
ts_match: Option<&[u8]>,
is_base64: bool,
_redact: bool,
filename: &str,
profiler: Option<&Arc<ConcurrentRuleProfiler>>,
respect_ignore_if_contains: bool,
inline_ignore_config: &InlineIgnoreConfig,
) {
// Only start a timer when profiling is enabled.
let mut timer =
profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
// Remembered so the number of matches added by this call can be reported to the timer.
let initial_len = matches.len();
let blob_bytes = blob.bytes();
let default_slice = &blob_bytes[start..end];
let haystack = ts_match.unwrap_or(default_slice);
for captures in re.captures_iter(haystack) {
// Group 0 (the whole match) is always present for a successful match.
let full_capture = captures.get(0).unwrap();
// --- LOGIC TO FIND THE "SECRET" FOR ENTROPY/SAFE-LISTING ---
let matching_input_for_entropy = 'block: {
// 1. Prefer a named capture called TOKEN (case-insensitive).
if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
name_opt
.filter(|name| name.eq_ignore_ascii_case("TOKEN"))
.and_then(|_| captures.get(i))
}) {
break 'block token_cap;
}
// 2. Otherwise, prefer the first *matched* named capture.
if let Some(named_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
// Unnamed groups yield `None` here, so only named groups are considered.
name_opt.and_then(|_| captures.get(i))
}) {
break 'block named_cap;
}
// 3. Otherwise, fall back to the first positional capture (group 1).
if let Some(pos_cap) = captures.get(1) {
break 'block pos_cap;
}
// 4. Finally, fall back to the full match (group 0).
break 'block full_capture;
};
// --- END LOGIC ---
let min_entropy = rule.min_entropy();
let entropy_bytes = matching_input_for_entropy.as_bytes();
let full_bytes = full_capture.as_bytes();
let calculated_entropy = calculate_shannon_entropy(entropy_bytes);
// Check entropy and safe-listing against the *selected* secret bytes
if calculated_entropy <= min_entropy
|| is_safe_match(entropy_bytes)
|| is_user_match(entropy_bytes, full_bytes)
{
debug!(
"Skipping match with entropy {} <= {} or safe match",
calculated_entropy, min_entropy
);
continue;
}
// Check character requirements if specified
if let Some(char_reqs) = rule.pattern_requirements() {
let context = PatternRequirementContext {
regex: re,
captures: &captures,
full_match: full_bytes,
};
// Decide which bytes to validate:
// - If there are multiple capture groups OR any named captures -> use full match
// - Otherwise -> use entropy_bytes (the actual secret)
let use_full_match = {
let has_named_captures = re.capture_names().any(|n| n.is_some());
let capture_count = captures.len(); // includes group 0
has_named_captures || capture_count > 2
};
let validation_bytes = if use_full_match { full_bytes } else { entropy_bytes };
match char_reqs.validate(validation_bytes, Some(context), respect_ignore_if_contains) {
PatternValidationResult::Passed => {}
PatternValidationResult::Failed => {
debug!(
"Skipping match that does not meet character requirements for rule {}",
rule.id()
);
continue;
}
PatternValidationResult::FailedChecksum { actual_len, expected_len } => {
debug!(
"Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})",
rule.id(),
actual_len,
expected_len
);
continue;
}
PatternValidationResult::IgnoredBySubstring { matched_term } => {
debug!(
"Skipping match for rule {} because it contains ignored term {matched_term}",
rule.id()
);
continue;
}
}
}
// Use the `matching_input_for_entropy` as the span/key for the finding.
let matching_input = matching_input_for_entropy;
// Capture offsets are relative to `haystack`; adding `start` turns them into blob offsets.
// NOTE(review): when `ts_match` is supplied this assumes it begins at blob offset
// `start` -- confirm against the callers.
let matching_input_offset_span = OffsetSpan::from_range(
(start + matching_input.start())..(start + matching_input.end()),
);
// Honour inline ignore directives.
if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
debug!("Skipping match due to inline ignore directive");
continue;
}
// Database rules: drop matches that are not UTF-8 or do not parse as the expected URI.
if let Some(validation) = rule.syntax.validation.as_ref() {
match validation {
Validation::MongoDB => {
let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
debug!("Skipping match for rule {} due to non-UTF8 MongoDB URI", rule.id());
continue;
};
if !is_parseable_mongodb_uri(uri) {
debug!("Skipping match for rule {} due to invalid MongoDB URI", rule.id());
continue;
}
}
Validation::Postgres => {
let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
debug!(
"Skipping match for rule {} due to non-UTF8 Postgres URI",
rule.id()
);
continue;
};
if !is_parseable_postgres_uri(uri) {
debug!("Skipping match for rule {} due to invalid Postgres URI", rule.id());
continue;
}
}
Validation::MySQL => {
let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
debug!("Skipping match for rule {} due to non-UTF8 MySQL URI", rule.id());
continue;
};
if !is_parseable_mysql_uri(uri) {
debug!("Skipping match for rule {} due to invalid MySQL URI", rule.id());
continue;
}
}
_ => {}
}
}
// De-duplicate: first by hashed key, then by the per-rule span record.
let match_key = compute_match_key(
matching_input.as_bytes(),
rule.id().as_bytes(),
matching_input_offset_span.start,
matching_input_offset_span.end,
);
// `insert` returns false when this exact key was already recorded.
if !seen_matches.insert(match_key) {
continue;
}
if !record_match(previous_matches, rule_id, matching_input_offset_span) {
continue;
}
// Re-slice from the blob so the stored bytes borrow from the blob for lifetime `'b`.
let only_matching_input =
&blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
// Pass the *full* capture object to from_captures
let groups = SerializableCaptures::from_captures(&captures, haystack, re);
matches.push(BlobMatch {
rule: Arc::clone(&rule),
blob_id: blob.id_ref(),
matching_input: only_matching_input,
matching_input_offset_span,
captures: groups,
validation_response_body: None,
// NOTE(review): 0 is presumably not a valid HTTP status, so this looks like it always
// resolves to `StatusCode::CONTINUE` as a "not validated yet" placeholder -- confirm.
validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
validation_success: false,
calculated_entropy,
is_base64,
});
}
// Report how many matches this invocation added.
if let Some(t) = timer.take() {
let new_count = (matches.len() - initial_len) as u64;
t.end(new_count > 0, new_count, 0);
}
}
@ -837,10 +499,6 @@ fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, S
"javascript" | "js" => {
Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries()))
}
// "kotlin" => Some((
// Language::Kotlin,
// parser::queries::kotlin::get_kotlin_queries(),
// )),
"php" => Some((Language::Php, parser::queries::php::get_php_queries())),
"python" | "py" | "starlark" => {
Some((Language::Python, parser::queries::python::get_python_queries()))
@ -855,354 +513,6 @@ fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, S
_ => None,
}
}
// -------------------------------------------------------------------------------------------------
// MatcherStats
// -------------------------------------------------------------------------------------------------
/// Counters describing how many blobs and bytes a matcher has seen and scanned.
#[derive(Debug, Default, Clone)]
pub struct MatcherStats {
    /// Count of blobs seen.
    pub blobs_seen: u64,
    /// Count of blobs scanned.
    pub blobs_scanned: u64,
    /// Count of bytes seen.
    pub bytes_seen: u64,
    /// Count of bytes scanned.
    pub bytes_scanned: u64,
}

impl MatcherStats {
    /// Adds each counter of `other` onto the matching counter of `self`.
    pub fn update(&mut self, other: &Self) {
        // Destructure so that a newly added field cannot be silently forgotten here.
        let Self { blobs_seen, blobs_scanned, bytes_seen, bytes_scanned } = other;
        self.blobs_seen += *blobs_seen;
        self.blobs_scanned += *blobs_scanned;
        self.bytes_seen += *bytes_seen;
        self.bytes_scanned += *bytes_scanned;
    }
}
// -------------------------------------------------------------------------------------------------
// Group
// -------------------------------------------------------------------------------------------------
// A single regex group's bytes, wrapped in `Base64BString`.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
pub struct Group(pub Base64BString);

impl Group {
    /// Copies the bytes of the regex match `m` into a new `Group`.
    pub fn new(m: regex::bytes::Match<'_>) -> Self {
        let owned_bytes = BString::from(m.as_bytes());
        Group(Base64BString(owned_bytes))
    }
}
// -------------------------------------------------------------------------------------------------
// Groups
// -------------------------------------------------------------------------------------------------
/// A list of [`Group`]s stored in a `SmallVec` with one inline slot.
#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct Groups(pub SmallVec<[Group; 1]>);

/// Hand-written schema: `Groups` is described as a JSON array whose items are `Group`s.
impl JsonSchema for Groups {
    fn schema_name() -> String {
        String::from("Groups")
    }

    fn json_schema(gen: &mut SchemaGenerator) -> Schema {
        let item_schema = gen.subschema_for::<Group>();
        let array_rules =
            ArrayValidation { items: Some(item_schema.into()), ..Default::default() };
        let object = schemars::schema::SchemaObject {
            instance_type: Some(InstanceType::Array.into()),
            array: Some(Box::new(array_rules)),
            ..Default::default()
        };
        Schema::Object(object)
    }
}
// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
// pub struct SerializableCapture {
// pub name: Option<String>, // Named group (if available)
// pub match_number: i32,
// pub start: usize, // Start position of the match
// pub end: usize, // End position of the match
// pub value: String, // The actual captured value
// }
// One regex capture group in serializable form: optional group name, group number, byte
// offsets within the searched input, and the interned captured text.
// `Serialize` is implemented by hand for this type rather than derived.
#[derive(Debug, Clone, JsonSchema)]
pub struct SerializableCapture {
// Interned name of the capture group, if it has one.
pub name: Option<&'static str>,
// Index of the capture group in the regex (0 is the whole match).
pub match_number: i32,
// Start offset of the capture, as reported by the regex match.
pub start: usize,
// End offset of the capture, as reported by the regex match.
pub end: usize,
/// Interned original (unredacted) value.
#[serde(skip_serializing, skip_deserializing)]
pub value: &'static str,
}
impl SerializableCapture {
/// Returns the original captured value.
pub fn raw_value(&self) -> &'static str {
self.value
}
/// Returns the value that should be shown in user-facing output.
// Delegates to `crate::util::display_value`; the exact transformation lives there.
pub fn display_value(&self) -> std::borrow::Cow<'static, str> {
crate::util::display_value(self.value)
}
}
/// Manual `Serialize`: emits the five fields in declaration order, but writes
/// `display_value()` under the `"value"` key instead of the raw captured text.
impl serde::Serialize for SerializableCapture {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeStruct;

        let mut record = serializer.serialize_struct("SerializableCapture", 5)?;
        record.serialize_field("name", &self.name)?;
        record.serialize_field("match_number", &self.match_number)?;
        record.serialize_field("start", &self.start)?;
        record.serialize_field("end", &self.end)?;
        // Only the display form is written out, never `self.value` directly.
        let shown = self.display_value();
        record.serialize_field("value", &shown)?;
        record.end()
    }
}
// Serializable snapshot of the capture groups of one regex match.
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCaptures {
    // All captures (named and unnamed)
    #[schemars(with = "Vec<SerializableCapture>")]
    pub captures: SmallVec<[SerializableCapture; 2]>,
}

impl SerializableCaptures {
    /// Builds the serializable capture list for one regex match.
    ///
    /// When the regex has explicit groups, only the groups that participated in the match
    /// (numbers 1 and up) are recorded. When it has none, the whole match (group 0) is
    /// recorded instead. Captured bytes are converted with `String::from_utf8_lossy` and
    /// interned. `_input` is unused.
    pub fn from_captures(captures: &regex::bytes::Captures, _input: &[u8], re: &Regex) -> Self {
        let group_names: SmallVec<[Option<&'static str>; 4]> =
            re.capture_names().map(|name| name.map(intern)).collect();

        // Skip group 0 whenever explicit groups exist; otherwise group 0 is the fallback.
        let group_count = captures.len();
        let first_group = if group_count > 1 { 1 } else { 0 };

        let mut collected: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
        for index in first_group..group_count {
            if let Some(group) = captures.get(index) {
                let text = String::from_utf8_lossy(group.as_bytes());
                let interned_text = intern(text.as_ref());
                collected.push(SerializableCapture {
                    name: group_names.get(index).and_then(|name| *name),
                    match_number: i32::try_from(index).unwrap_or(0),
                    start: group.start(),
                    end: group.end(),
                    value: interned_text,
                });
            }
        }
        SerializableCaptures { captures: collected }
    }
}
// -------------------------------------------------------------------------------------------------
// Match
// -------------------------------------------------------------------------------------------------
// A reportable finding: where it was found, what was captured, which rule produced it, and
// the outcome of validation.
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct Match {
/// The location of the entire matching content
pub location: Location,
/// The capture groups
pub groups: SerializableCaptures, // Store serialized captures
/// unique identifier of file / blob where this match was found
pub blob_id: BlobId,
/// The unique content-based identifier of this match
pub finding_fingerprint: u64,
/// The rule that produced this match
// Excluded from both serialized output and the generated schema.
#[serde(skip_serializing)]
#[schemars(skip)]
pub rule: Arc<Rule>,
/// Validation Body
#[serde(
default,
serialize_with = "validation_body::serialize",
deserialize_with = "validation_body::deserialize"
)]
#[schemars(schema_with = "validation_body::schema")]
pub validation_response_body: ValidationResponseBody,
/// Validation Status Code
pub validation_response_status: u16,
/// Validation Success
pub validation_success: bool,
/// Calculated Shannon entropy of the matched secret
pub calculated_entropy: f32,
// Copied from the rule's `visible()` setting when the match is built.
pub visible: bool,
// Base64 flag carried over from the originating blob match.
#[serde(default)]
pub is_base64: bool,
/// Variables captured from dependent rules (from depends_on_rule).
/// Maps variable name (uppercase) to captured value.
#[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
pub dependent_captures: std::collections::BTreeMap<String, String>,
}
impl Match {
#[inline]
pub fn convert_owned_blobmatch_to_match<'a>(
loc_mapping: Option<&'a LocationMapping<'a>>,
owned_blob_match: &'a OwnedBlobMatch,
origin_type: &'a str,
) -> Self {
let offset_span = owned_blob_match.matching_input_offset_span;
// EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
// See comment in from_blob_match() for why this differs from validation_dedup_key().
let matching_finding_bytes = owned_blob_match
.captures
.captures
.get(1)
.or_else(|| owned_blob_match.captures.captures.get(0))
.map(|capture| capture.raw_value().as_bytes())
.unwrap_or_default();
// The fingerprint will be based on the content of the secret.
let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
let source_span =
loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
start: SourcePoint { line: 0, column: 0 },
end: SourcePoint { line: 0, column: 0 },
});
let offset_start: u64 =
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
let finding_fingerprint = compute_finding_fingerprint(
finding_value_for_fp,
origin_type, // file_or_commit,
offset_start,
offset_end,
);
// matching_snippet
Match {
rule: owned_blob_match.rule.clone(),
visible: owned_blob_match.rule.visible().to_owned(),
location: Location::with_source_span(offset_span, Some(source_span.clone())),
groups: owned_blob_match.captures.clone(),
blob_id: owned_blob_match.blob_id,
finding_fingerprint,
validation_response_body: owned_blob_match.validation_response_body.clone(),
validation_response_status: owned_blob_match.validation_response_status.as_u16(),
validation_success: owned_blob_match.validation_success,
calculated_entropy: owned_blob_match.calculated_entropy,
is_base64: owned_blob_match.is_base64,
dependent_captures: owned_blob_match.dependent_captures.clone(),
}
}
/// Returns the `blob_id` of the match.
pub fn get_blob_id(&self) -> BlobId {
self.blob_id.clone()
}
/// Builds a decimal-string identifier for this finding.
///
/// The hashed payload is, in order: the bytes of the rule's
/// `finding_sha1_fingerprint()`, a single `0` separator byte, and the JSON
/// serialization of the captured groups. The 64-bit hash has its top bit
/// cleared so the value always fits in a non-negative `i64`.
///
/// # Panics
///
/// Panics if the capture groups cannot be serialized as JSON.
pub fn finding_id(&self) -> String {
    let fingerprint = self.rule.finding_sha1_fingerprint();

    let mut payload = Vec::with_capacity(128);
    payload.extend_from_slice(fingerprint.as_bytes());
    payload.push(0);
    serde_json::to_writer(&mut payload, &self.groups)
        .expect("should be able to serialize groups as JSON");

    // Masking off the sign bit keeps the id within 0..=i64::MAX.
    let id = xxh3_64(&payload) & 0x7FFF_FFFF_FFFF_FFFF;
    id.to_string()
}
}
/// A base64-encoded run located in a scanned buffer, together with its decoded bytes.
///
/// Values of this type are produced by `get_base64_strings`.
#[derive(Debug, Clone)]
pub struct DecodedData {
/// The bytes obtained by decoding the run.
pub decoded: Vec<u8>,
/// Byte offset in the scanned input at which the encoded run begins.
pub pos_start: usize,
/// Byte offset in the scanned input just past the encoded run (any consumed `=` padding
/// is included in the run).
pub pos_end: usize,
}
/// Reports whether `b` is a data character of the standard base64 alphabet
/// (`+`, `/`) or of the URL-safe alphabet (`-`, `_`). The padding character
/// `=` is not counted as a data character.
#[inline]
fn is_base64_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'-' | b'_')
}
/// Scans `input` for base64-looking runs and returns every run that decodes to ASCII.
///
/// A run is a maximal sequence of base64 alphabet bytes (standard or URL-safe),
/// optionally followed by at most two `=` padding bytes. Only runs that are at
/// least 32 bytes long and whose length is a multiple of four are decoded.
/// Decoding is attempted with the `STANDARD`, `URL_SAFE` and `URL_SAFE_NO_PAD`
/// engines, in that order; the first success wins. Results whose decoded bytes
/// are not entirely ASCII are discarded.
///
/// The returned entries appear in input order, and their `pos_start`/`pos_end`
/// are byte offsets into `input`.
pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
    let total = input.len();
    let mut found = Vec::new();
    let mut cursor = 0;

    while cursor < total {
        // Jump over everything that cannot be part of an encoded run.
        cursor += input[cursor..].iter().take_while(|&&b| !is_base64_byte(b)).count();
        let start = cursor;

        // Consume the alphabet bytes of the run itself.
        cursor += input[cursor..].iter().take_while(|&&b| is_base64_byte(b)).count();

        // Padding: at most two consecutive '=' bytes belong to the run.
        cursor += input[cursor..].iter().take(2).take_while(|&&b| b == b'=').count();
        let end = cursor;

        let run = &input[start..end];
        if run.len() < 32 || run.len() % 4 != 0 {
            continue;
        }

        // Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD
        let attempt = general_purpose::STANDARD
            .decode(run)
            .or_else(|_| general_purpose::URL_SAFE.decode(run))
            .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(run));

        match attempt {
            Ok(bytes) if bytes.is_ascii() => {
                found.push(DecodedData { decoded: bytes, pos_start: start, pos_end: end });
            }
            _ => {}
        }
    }

    found
}
/// Computes the 64-bit fingerprint of a finding.
///
/// The hash input is the plain concatenation, with no separators, of:
/// the bytes of `finding_value`, the bytes of `file_or_commit`, and the
/// little-endian encodings of `offset_start` and `offset_end`. The byte layout
/// is deliberately left exactly as it is so previously computed fingerprints
/// keep matching.
pub fn compute_finding_fingerprint(
    finding_value: &str,
    file_or_commit: &str,
    offset_start: u64,
    offset_end: u64,
) -> u64 {
    let start_le = offset_start.to_le_bytes();
    let end_le = offset_end.to_le_bytes();

    // `concat` sizes the output buffer once from the combined length of the parts.
    let parts: [&[u8]; 4] =
        [finding_value.as_bytes(), file_or_commit.as_bytes(), &start_le, &end_le];
    xxh3_64(&parts.concat())
}
// -------------------------------------------------------------------------------------------------
// test
@ -1220,6 +530,7 @@ mod test {
use super::*;
use crate::{
blob::{Blob, BlobIdMap},
entropy::calculate_shannon_entropy,
origin::{Origin, OriginSet},
rules::rule::{
DependsOnRule, HttpRequest, HttpValidation, PatternRequirements, RuleSyntax, Validation,
@ -1341,8 +652,6 @@ mod test {
let input = "some test data for vectorscan";
let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
let enable_rule_profiling = true;
// let mut matcher = Matcher::new(&rules_db, &seen_blobs, None,
// enable_rule_profiling)?;
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
let mut matcher = Matcher::new(
&rules_db,
@ -1513,7 +822,7 @@ mod test {
assert_eq!((item.pos_start, item.pos_end), (4, 4 + base64_payload.len()));
}
/// `compute_finding_fingerprint` must be stable (same input same output)
/// `compute_finding_fingerprint` must be stable (same input => same output)
/// and sensitive to any input component.
#[test]
fn test_finding_fingerprint_stability_and_uniqueness() {
@ -1535,7 +844,7 @@ mod test {
/// keys as soon as *anything* changes.
#[test]
fn test_compute_match_key_uniqueness() {
use super::compute_match_key;
use super::dedup::compute_match_key;
let k1 = compute_match_key(b"abc", b"rule-1", 0, 3);
let k2 = compute_match_key(b"abc", b"rule-1", 0, 3);
@ -1721,6 +1030,8 @@ line2
#[test]
fn serializes_captures_in_numeric_order() {
use regex::bytes::Regex;
let re =
Regex::new(r"(?xi)\b(ghp_(?P<body>[A-Z0-9]{3})(?P<checksum>[A-Z0-9]{2}))").unwrap();
let caps = re.captures(b"ghp_ABC12").expect("expected captures");

View file

@ -42,9 +42,18 @@ use crate::{
AccessMapCollector,
},
util::set_redaction_enabled,
validation::CachedResponse,
validation_rate_limit::ValidationRateLimiter,
};
/// Shared validation dependencies: (liquid parser, HTTP clients, validation cache, rate limiter).
///
/// The tuple is wrapped in an `Arc` so one set of dependencies can be handed to several
/// consumers by cloning the pointer. The scan code holds it as `Option<ValidationDeps>`,
/// which is `None` when validation is disabled.
type ValidationDeps = Arc<(
// Liquid template parser.
liquid::Parser,
// HTTP clients used for validation requests.
crate::validation::ValidationClients,
// Cache of validation responses, keyed by `String`.
Arc<SkipMap<String, CachedResponse>>,
// Optional rate limiter for validation requests; `None` when no limit is configured.
Option<Arc<ValidationRateLimiter>>,
)>;
pub async fn run_scan(
global_args: &global::GlobalArgs,
scan_args: &scan::ScanArgs,
@ -64,22 +73,9 @@ pub async fn run_async_scan(
rules_db: &RulesDatabase,
update_status: &crate::update::UpdateStatus,
) -> Result<()> {
// Ensure all provided paths exist before proceeding
for path in &args.input_specifier_args.path_inputs {
if !path.exists() {
error!("Specified input path does not exist: {}", path.display());
bail!("Invalid input: Path does not exist - {}", path.display());
}
}
// Register user-provided allow-list patterns
for pattern in &args.skip_regex {
safe_list::add_user_regex(pattern)
.map_err(|e| anyhow::anyhow!("Invalid skip-regex '{pattern}': {e}"))?;
}
for word in &args.skip_word {
safe_list::add_user_skipword(word);
}
// ── Phase 1: Input validation and environment setup ──────────────────
validate_inputs(args)?;
register_safe_list_patterns(args)?;
let start_time = Instant::now();
let scan_started_at = chrono::Local::now();
@ -90,123 +86,26 @@ pub async fn run_async_scan(
set_redaction_enabled(args.redact);
let mut repo_urls = enumerate_github_repos(args, global_args).await?;
let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
// Combine repository URLs
repo_urls.extend(gitlab_repo_urls);
repo_urls.extend(gitea_repo_urls);
repo_urls.extend(huggingface_repo_urls);
repo_urls.extend(bitbucket_repo_urls);
repo_urls.extend(azure_repo_urls);
// Add wiki repositories for each URL when requested
if args.input_specifier_args.repo_artifacts {
let mut wiki_urls = Vec::new();
for url in &repo_urls {
if let Some(w) = github::wiki_url(url) {
wiki_urls.push(w);
}
if let Some(w) = gitlab::wiki_url(url) {
wiki_urls.push(w);
}
if let Some(w) = gitea::wiki_url(url) {
wiki_urls.push(w);
}
if let Some(w) = bitbucket::wiki_url(url) {
wiki_urls.push(w);
}
if let Some(w) = azure::wiki_url(url) {
wiki_urls.push(w);
}
}
repo_urls.extend(wiki_urls);
}
// just sort and dedup once
repo_urls.sort();
repo_urls.dedup();
// ── Phase 2: Repository enumeration ─────────────────────────────────
let repo_urls = enumerate_all_repos(args, global_args).await?;
let mut input_roots = args.input_specifier_args.path_inputs.clone();
let (repo_tx, repo_rx) = crossbeam_channel::unbounded();
let repo_clone_handle = if repo_urls.is_empty() {
None
} else {
let clone_args = args.clone();
let clone_globals = global_args.clone();
let clone_repo_urls = repo_urls.clone();
let clone_datastore = Arc::clone(&datastore);
let clone_repo_tx = repo_tx.clone();
Some(std::thread::spawn(move || {
if let Err(e) = clone_or_update_git_repos_streaming(
&clone_args,
&clone_globals,
&clone_repo_urls,
&clone_datastore,
|path| {
let _ = clone_repo_tx.send(path);
},
) {
error!("Failed to fetch one or more Git repositories: {e}");
}
}))
};
drop(repo_tx);
let repo_clone_handle =
start_repo_cloning(&repo_urls, args, global_args, &datastore, repo_tx, progress_enabled);
// Fetch issues, gists, and wikis if enabled
let bitbucket_auth = bitbucket::AuthConfig::from_env();
let bitbucket_host =
args.input_specifier_args.bitbucket_api_url.host_str().map(|s| s.to_string());
if args.input_specifier_args.repo_artifacts {
let repo_artifact_dirs = fetch_git_host_artifacts(
&repo_urls,
&args.input_specifier_args.bitbucket_api_url,
&bitbucket_auth,
bitbucket_host.clone(),
global_args,
&datastore,
)
.await?;
input_roots.extend(repo_artifact_dirs);
}
// Fetch Jira issues if requested
let jira_dirs = fetch_jira_issues(args, global_args, &datastore).await?;
input_roots.extend(jira_dirs);
// Fetch Confluence pages if requested
let confluence_dirs = fetch_confluence_pages(args, global_args, &datastore).await?;
input_roots.extend(confluence_dirs);
// Fetch Slack messages if requested
let slack_dirs = fetch_slack_messages(args, global_args, &datastore).await?;
input_roots.extend(slack_dirs);
// Save Docker images if specified
if !args.input_specifier_args.docker_image.is_empty() {
let clone_root = {
let ds = datastore.lock().unwrap();
ds.clone_root()
};
let docker_dirs = save_docker_images(
&args.input_specifier_args.docker_image,
&clone_root,
progress_enabled,
)
.await?;
for (dir, img) in docker_dirs {
{
let mut ds = datastore.lock().unwrap();
ds.register_docker_image(dir.clone(), img);
}
input_roots.push(dir);
}
}
// ── Phase 3: Artifact fetching ──────────────────────────────────────
fetch_all_artifacts(
args,
global_args,
&repo_urls,
&datastore,
&mut input_roots,
progress_enabled,
)
.await?;
// ── Phase 4: Scan configuration ─────────────────────────────────────
let shared_profiler = Arc::new(ConcurrentRuleProfiler::new());
let enable_profiling = args.rule_stats;
let matcher_stats = Arc::new(Mutex::new(MatcherStats::default()));
@ -246,11 +145,257 @@ pub async fn run_async_scan(
.unwrap_or_else(|| std::path::PathBuf::from("baseline-file.yaml")),
);
let mut skip_aws_accounts = args.skip_aws_account.clone();
let skip_aws_accounts = load_skip_aws_accounts(args)?;
crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
let mut access_map_collector =
if args.access_map { Some(AccessMapCollector::default()) } else { None };
let repo_roots = expand_repo_roots(&input_roots)?;
let git_repo_count =
repo_roots.iter().filter(|p| p.join(".git").is_dir()).count() + repo_urls.len();
let use_parallel_repo_scan = git_repo_count > 10;
let validation_rate_limiter =
ValidationRateLimiter::from_cli(args.validation_rps, &args.validation_rps_rule)?
.map(Arc::new);
let validation_deps: Option<ValidationDeps> = if !args.no_validate {
info!("Starting secret validation phase...");
Some(Arc::new((
register_all(liquid::ParserBuilder::with_stdlib()).build()?,
crate::validation::ValidationClients::new(global_args.tls_mode)?,
Arc::new(SkipMap::new()),
validation_rate_limiter.clone(),
)))
} else {
None
};
// ── Phase 5: Scanning ───────────────────────────────────────────────
if !use_parallel_repo_scan {
run_sequential_scan(
args,
global_args,
&datastore,
rules_db,
&mut input_roots,
repo_rx,
repo_clone_handle,
&shared_profiler,
enable_profiling,
&matcher_stats,
&baseline_path,
&validation_deps,
&mut access_map_collector,
progress_enabled,
start_time,
scan_started_at,
update_status,
)
.await?;
return Ok(());
}
run_parallel_scan(
args,
global_args,
&datastore,
rules_db,
&repo_roots,
repo_rx,
repo_clone_handle,
&shared_profiler,
enable_profiling,
&matcher_stats,
&baseline_path,
&validation_deps,
&mut access_map_collector,
progress_enabled,
start_time,
scan_started_at,
update_status,
)
.await
}
// =================================================================================================
// Phase helpers
// =================================================================================================
/// Checks that every path given as a scan input exists on disk.
///
/// # Errors
///
/// Logs and returns an error for the first listed path that does not exist.
fn validate_inputs(args: &scan::ScanArgs) -> Result<()> {
    let first_missing = args.input_specifier_args.path_inputs.iter().find(|p| !p.exists());

    if let Some(path) = first_missing {
        error!("Specified input path does not exist: {}", path.display());
        bail!("Invalid input: Path does not exist - {}", path.display());
    }

    Ok(())
}
/// Registers user-provided allow-list patterns (skip-regex and skip-word).
fn register_safe_list_patterns(args: &scan::ScanArgs) -> Result<()> {
for pattern in &args.skip_regex {
safe_list::add_user_regex(pattern)
.map_err(|e| anyhow::anyhow!("Invalid skip-regex '{pattern}': {e}"))?;
}
for word in &args.skip_word {
safe_list::add_user_skipword(word);
}
Ok(())
}
/// Collects repository URLs from every supported hosting platform into one
/// sorted, duplicate-free list.
///
/// Platforms are queried one after another in the order GitHub, GitLab, Gitea,
/// Hugging Face, Bitbucket, Azure. When `repo_artifacts` is enabled, each
/// collected URL is additionally offered to every platform's `wiki_url` helper
/// and any wiki URL produced is appended before sorting and deduplicating.
///
/// # Errors
///
/// Returns the first error reported by any platform enumeration.
async fn enumerate_all_repos(
    args: &scan::ScanArgs,
    global_args: &global::GlobalArgs,
) -> Result<Vec<crate::git_url::GitUrl>> {
    let mut repo_urls = enumerate_github_repos(args, global_args).await?;
    repo_urls.extend(enumerate_gitlab_repos(args, global_args).await?);
    repo_urls.extend(enumerate_gitea_repos(args, global_args).await?);
    repo_urls.extend(enumerate_huggingface_repos(args, global_args).await?);
    repo_urls.extend(enumerate_bitbucket_repos(args, global_args).await?);
    repo_urls.extend(enumerate_azure_repos(args, global_args).await?);

    // Add wiki repositories for each URL when requested
    if args.input_specifier_args.repo_artifacts {
        // Collected into a separate vector first because `repo_urls` is borrowed while iterating.
        let wiki_urls: Vec<_> = repo_urls
            .iter()
            .flat_map(|url| {
                [
                    github::wiki_url(url),
                    gitlab::wiki_url(url),
                    gitea::wiki_url(url),
                    bitbucket::wiki_url(url),
                    azure::wiki_url(url),
                ]
            })
            .flatten()
            .collect();
        repo_urls.extend(wiki_urls);
    }

    repo_urls.sort();
    repo_urls.dedup();
    Ok(repo_urls)
}
/// Starts cloning/updating `repo_urls` on a background thread and streams each
/// path reported by the clone routine through `repo_tx`.
///
/// Returns `None` without spawning anything when `repo_urls` is empty;
/// otherwise returns the worker's join handle. In both cases this function
/// keeps no sender alive: with no URLs the sender is dropped immediately,
/// and otherwise it is owned solely by the worker thread.
///
/// A failure of the clone routine is logged on the worker thread and not
/// propagated. Failed sends are ignored. `_progress_enabled` is currently unused.
fn start_repo_cloning(
    repo_urls: &[crate::git_url::GitUrl],
    args: &scan::ScanArgs,
    global_args: &global::GlobalArgs,
    datastore: &Arc<Mutex<FindingsStore>>,
    repo_tx: crossbeam_channel::Sender<PathBuf>,
    _progress_enabled: bool,
) -> Option<std::thread::JoinHandle<()>> {
    if repo_urls.is_empty() {
        drop(repo_tx);
        return None;
    }

    // The worker thread needs owned copies of everything it touches.
    let (worker_args, worker_globals) = (args.clone(), global_args.clone());
    let worker_urls = repo_urls.to_vec();
    let worker_store = Arc::clone(datastore);

    Some(std::thread::spawn(move || {
        let outcome = clone_or_update_git_repos_streaming(
            &worker_args,
            &worker_globals,
            &worker_urls,
            &worker_store,
            |path| {
                let _ = repo_tx.send(path);
            },
        );
        if let Err(e) = outcome {
            error!("Failed to fetch one or more Git repositories: {e}");
        }
    }))
}
/// Gathers non-repository scan inputs and appends their directories to `input_roots`.
///
/// In order:
/// 1. when `repo_artifacts` is enabled, artifacts for `repo_urls` via
///    `fetch_git_host_artifacts`;
/// 2. the results of the Jira, Confluence and Slack fetch helpers (always called);
/// 3. when any Docker image is listed, the images saved under the datastore's
///    clone root, each registered in the datastore.
///
/// # Errors
///
/// Returns the first error from any of the fetch/save helpers; roots appended
/// before the failure remain in `input_roots`.
async fn fetch_all_artifacts(
    args: &scan::ScanArgs,
    global_args: &global::GlobalArgs,
    repo_urls: &[crate::git_url::GitUrl],
    datastore: &Arc<Mutex<FindingsStore>>,
    input_roots: &mut Vec<PathBuf>,
    progress_enabled: bool,
) -> Result<()> {
    let input_args = &args.input_specifier_args;

    let bitbucket_auth = bitbucket::AuthConfig::from_env();
    let bitbucket_host = input_args.bitbucket_api_url.host_str().map(|host| host.to_string());

    if input_args.repo_artifacts {
        let artifact_dirs = fetch_git_host_artifacts(
            repo_urls,
            &input_args.bitbucket_api_url,
            &bitbucket_auth,
            bitbucket_host,
            global_args,
            datastore,
        )
        .await?;
        input_roots.extend(artifact_dirs);
    }

    // Fetch Jira issues if requested
    input_roots.extend(fetch_jira_issues(args, global_args, datastore).await?);

    // Fetch Confluence pages if requested
    input_roots.extend(fetch_confluence_pages(args, global_args, datastore).await?);

    // Fetch Slack messages if requested
    input_roots.extend(fetch_slack_messages(args, global_args, datastore).await?);

    // Save Docker images if specified
    if input_args.docker_image.is_empty() {
        return Ok(());
    }

    // The lock guard is a temporary of this statement, so it is released before the await below.
    let clone_root = datastore.lock().unwrap().clone_root();
    let saved_images =
        save_docker_images(&input_args.docker_image, &clone_root, progress_enabled).await?;

    for (dir, img) in saved_images {
        datastore.lock().unwrap().register_docker_image(dir.clone(), img);
        input_roots.push(dir);
    }

    Ok(())
}
/// Loads AWS account IDs to skip from CLI args and optional file.
fn load_skip_aws_accounts(args: &scan::ScanArgs) -> Result<Vec<String>> {
let mut skip_aws_accounts = args.skip_aws_account.clone();
if let Some(path) = args.skip_aws_account_file.as_ref() {
let contents = fs::read_to_string(path).with_context(|| {
format!("Failed to read --skip-aws-account-file {}", path.display())
@ -267,183 +412,214 @@ pub async fn run_async_scan(
}
}
crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
Ok(skip_aws_accounts)
}
let repo_roots = expand_repo_roots(&input_roots)?;
let git_repo_count =
repo_roots.iter().filter(|p| p.join(".git").is_dir()).count() + repo_urls.len();
let use_parallel_repo_scan = git_repo_count > 10;
let validation_rate_limiter =
ValidationRateLimiter::from_cli(args.validation_rps, &args.validation_rps_rule)?
.map(Arc::new);
let validation_deps = if !args.no_validate {
info!("Starting secret validation phase...");
Some(Arc::new((
register_all(liquid::ParserBuilder::with_stdlib()).build()?,
crate::validation::ValidationClients::new(global_args.tls_mode)?,
Arc::new(SkipMap::new()),
validation_rate_limiter.clone(),
)))
} else {
None
};
if !use_parallel_repo_scan {
let mut streamed_roots = Vec::new();
if !input_roots.is_empty() {
let _inputs = enumerate_filesystem_inputs(
args,
datastore.clone(),
&input_roots,
progress_enabled,
rules_db,
enable_profiling,
Arc::clone(&shared_profiler),
matcher_stats.as_ref(),
)?;
}
for repo_root in repo_rx.clone().iter() {
enumerate_filesystem_inputs(
args,
datastore.clone(),
&[repo_root.clone()],
progress_enabled,
rules_db,
enable_profiling,
Arc::clone(&shared_profiler),
matcher_stats.as_ref(),
)?;
streamed_roots.push(repo_root);
}
input_roots.extend(streamed_roots);
if let Some(handle) = repo_clone_handle {
let _ = handle.join();
}
if !args.no_dedup {
let reporter = crate::reporter::DetailsReporter {
datastore: Arc::clone(&datastore),
styles: Styles::new(global_args.use_color(std::io::stdout())),
only_valid: args.only_valid,
};
let all_matches = reporter.get_unfiltered_matches(Some(false))?;
let deduped_matches = reporter.deduplicate_matches(all_matches, args.no_dedup);
let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
.into_iter()
.map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
.collect();
let mut ds = datastore.lock().unwrap();
ds.replace_matches(deduped_arcs);
}
if args.baseline_file.is_some() || args.manage_baseline {
let mut ds = datastore.lock().unwrap();
crate::baseline::apply_baseline(
&mut ds,
baseline_path.as_ref(),
args.manage_baseline,
&input_roots,
)?;
}
if let Some(validation) = &validation_deps {
let (parser, clients, cache, rate_limiter) =
(&validation.0, &validation.1, &validation.2, &validation.3);
run_secret_validation(
Arc::clone(&datastore),
parser,
clients,
cache,
args.num_jobs,
None,
access_map_collector.clone(),
rate_limiter.clone(),
Duration::from_secs(args.validation_timeout),
args.validation_retries,
)
.await?;
}
if let Some(collector) = access_map_collector.take() {
finalize_access_map(&datastore, collector, args).await?;
}
crate::reporter::run(global_args, Arc::clone(&datastore), args)
.context("Failed to run report command")?;
print_scan_summary(
start_time,
scan_started_at,
&datastore,
global_args,
args,
rules_db,
matcher_stats.as_ref(),
if enable_profiling { Some(shared_profiler.as_ref()) } else { None },
update_status,
None,
None,
);
maybe_hint_access_map(&datastore, args);
/// Deduplicates matches in the datastore starting from `start_index`.
fn deduplicate_new_matches(
store: &Arc<Mutex<FindingsStore>>,
global_args: &global::GlobalArgs,
args: &scan::ScanArgs,
start_index: usize,
) -> Result<()> {
if args.no_dedup {
return Ok(());
}
let deduplicate_new_matches =
|store: &Arc<Mutex<FindingsStore>>, start_index: usize| -> Result<()> {
if args.no_dedup {
return Ok(());
}
let reporter = crate::reporter::DetailsReporter {
datastore: Arc::clone(store),
styles: Styles::new(global_args.use_color(std::io::stdout())),
only_valid: args.only_valid,
};
let reporter = crate::reporter::DetailsReporter {
datastore: Arc::clone(store),
styles: Styles::new(global_args.use_color(std::io::stdout())),
only_valid: args.only_valid,
};
let all_matches = reporter.get_unfiltered_matches(Some(false))?;
if start_index >= all_matches.len() {
return Ok(());
}
let all_matches = reporter.get_unfiltered_matches(Some(false))?;
if start_index >= all_matches.len() {
return Ok(());
}
let slice = if start_index == 0 { all_matches } else { all_matches[start_index..].to_vec() };
let deduped_matches = reporter.deduplicate_matches(slice, args.no_dedup);
let deduped_matches =
reporter.deduplicate_matches(all_matches[start_index..].to_vec(), args.no_dedup);
let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
.into_iter()
.map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
.collect();
let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
.into_iter()
.map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
.collect();
let mut ds = store.lock().unwrap();
let mut preserved = ds.get_matches()[..start_index].to_vec();
preserved.extend(deduped_arcs);
ds.replace_matches(preserved);
Ok(())
};
deduplicate_new_matches(&datastore, 0)?;
let mut ds = store.lock().unwrap();
if start_index == 0 {
ds.replace_matches(deduped_arcs);
} else {
let mut preserved = ds.get_matches()[..start_index].to_vec();
preserved.extend(deduped_arcs);
ds.replace_matches(preserved);
}
Ok(())
}
/// Applies baseline filtering if configured.
fn apply_baseline_if_configured(
args: &scan::ScanArgs,
datastore: &Arc<Mutex<FindingsStore>>,
baseline_path: &std::path::Path,
roots: &[PathBuf],
) -> Result<()> {
if args.baseline_file.is_some() || args.manage_baseline {
let mut ds = datastore.lock().unwrap();
crate::baseline::apply_baseline(
&mut ds,
baseline_path.as_ref(),
args.manage_baseline,
&repo_roots,
crate::baseline::apply_baseline(&mut ds, baseline_path, args.manage_baseline, roots)?;
}
Ok(())
}
/// Validates matches held in `datastore`, if validation is enabled.
///
/// Does nothing and returns `Ok(())` when `validation_deps` is `None`.
/// Otherwise forwards to `run_secret_validation` with the shared parser,
/// clients, cache and rate limiter, plus `args.num_jobs`, the given
/// `match_range` and `access_map_collector`, and the timeout and retry
/// settings taken from `args`.
///
/// # Errors
///
/// Propagates any error returned by `run_secret_validation`.
#[allow(clippy::too_many_arguments)]
async fn run_validation_phase(
    datastore: &Arc<Mutex<FindingsStore>>,
    validation_deps: &Option<ValidationDeps>,
    args: &scan::ScanArgs,
    match_range: Option<std::ops::Range<usize>>,
    access_map_collector: Option<AccessMapCollector>,
) -> Result<()> {
    let deps = match validation_deps {
        Some(deps) => deps,
        None => return Ok(()),
    };

    // Borrow the four shared components out of the Arc'd tuple.
    let (parser, clients, cache, rate_limiter) = &**deps;
    let timeout = Duration::from_secs(args.validation_timeout);

    run_secret_validation(
        Arc::clone(datastore),
        parser,
        clients,
        cache,
        args.num_jobs,
        match_range,
        access_map_collector,
        rate_limiter.clone(),
        timeout,
        args.validation_retries,
    )
    .await?;

    Ok(())
}
// =================================================================================================
// Sequential scan path
// =================================================================================================
#[allow(clippy::too_many_arguments)]
async fn run_sequential_scan(
args: &scan::ScanArgs,
global_args: &global::GlobalArgs,
datastore: &Arc<Mutex<FindingsStore>>,
rules_db: &RulesDatabase,
input_roots: &mut Vec<PathBuf>,
repo_rx: crossbeam_channel::Receiver<PathBuf>,
repo_clone_handle: Option<std::thread::JoinHandle<()>>,
shared_profiler: &Arc<ConcurrentRuleProfiler>,
enable_profiling: bool,
matcher_stats: &Arc<Mutex<MatcherStats>>,
baseline_path: &Arc<PathBuf>,
validation_deps: &Option<ValidationDeps>,
access_map_collector: &mut Option<AccessMapCollector>,
progress_enabled: bool,
start_time: Instant,
scan_started_at: chrono::DateTime<chrono::Local>,
update_status: &crate::update::UpdateStatus,
) -> Result<()> {
let mut streamed_roots = Vec::new();
if !input_roots.is_empty() {
let _inputs = enumerate_filesystem_inputs(
args,
datastore.clone(),
input_roots,
progress_enabled,
rules_db,
enable_profiling,
Arc::clone(shared_profiler),
matcher_stats.as_ref(),
)?;
}
if let Some(validation) = &validation_deps {
for repo_root in repo_rx.iter() {
enumerate_filesystem_inputs(
args,
datastore.clone(),
&[repo_root.clone()],
progress_enabled,
rules_db,
enable_profiling,
Arc::clone(shared_profiler),
matcher_stats.as_ref(),
)?;
streamed_roots.push(repo_root);
}
input_roots.extend(streamed_roots);
if let Some(handle) = repo_clone_handle {
let _ = handle.join();
}
deduplicate_new_matches(datastore, global_args, args, 0)?;
apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), input_roots)?;
run_validation_phase(datastore, validation_deps, args, None, access_map_collector.clone())
.await?;
if let Some(collector) = access_map_collector.take() {
finalize_access_map(datastore, collector, args).await?;
}
crate::reporter::run(global_args, Arc::clone(datastore), args)
.context("Failed to run report command")?;
print_scan_summary(
start_time,
scan_started_at,
datastore,
global_args,
args,
rules_db,
matcher_stats.as_ref(),
if enable_profiling { Some(shared_profiler.as_ref()) } else { None },
update_status,
None,
None,
);
maybe_hint_access_map(datastore, args);
Ok(())
}
// =================================================================================================
// Parallel scan path
// =================================================================================================
#[allow(clippy::too_many_arguments)]
async fn run_parallel_scan(
args: &scan::ScanArgs,
global_args: &global::GlobalArgs,
datastore: &Arc<Mutex<FindingsStore>>,
rules_db: &RulesDatabase,
repo_roots: &[PathBuf],
repo_rx: crossbeam_channel::Receiver<PathBuf>,
repo_clone_handle: Option<std::thread::JoinHandle<()>>,
shared_profiler: &Arc<ConcurrentRuleProfiler>,
enable_profiling: bool,
matcher_stats: &Arc<Mutex<MatcherStats>>,
baseline_path: &Arc<PathBuf>,
validation_deps: &Option<ValidationDeps>,
access_map_collector: &mut Option<AccessMapCollector>,
progress_enabled: bool,
start_time: Instant,
scan_started_at: chrono::DateTime<chrono::Local>,
update_status: &crate::update::UpdateStatus,
) -> Result<()> {
deduplicate_new_matches(datastore, global_args, args, 0)?;
apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), repo_roots)?;
// Validate initial (non-repo) matches
if let Some(validation) = validation_deps {
let (parser, clients, cache, rate_limiter) =
(&validation.0, &validation.1, &validation.2, &validation.3);
let initial_match_count = { datastore.lock().unwrap().get_matches().len() };
if initial_match_count > 0 {
run_secret_validation(
Arc::clone(&datastore),
Arc::clone(datastore),
parser,
clients,
cache,
@ -458,6 +634,7 @@ pub async fn run_async_scan(
}
}
// Parallel per-repo scanning
let repo_concurrency = std::cmp::max(1, args.num_jobs);
let rt_handle = Handle::current();
@ -476,16 +653,16 @@ pub async fn run_async_scan(
let spawn_repo_scan = |root: PathBuf| {
let repo_rules = repo_rules.clone();
let base_clone_root = base_clone_root.clone();
let baseline_path = Arc::clone(&baseline_path);
let shared_profiler = Arc::clone(&shared_profiler);
let baseline_path = Arc::clone(baseline_path);
let shared_profiler = Arc::clone(shared_profiler);
let args = args.clone();
let root = root.clone();
let validation_deps = validation_deps.clone();
let matcher_stats = Arc::clone(&matcher_stats);
let matcher_stats = Arc::clone(matcher_stats);
let rt_handle = rt_handle.clone();
let ran_repo_scan = Arc::clone(&ran_repo_scan);
let repo_errors = Arc::clone(&repo_errors);
let datastore = Arc::clone(&datastore);
let datastore = Arc::clone(datastore);
let access_map = access_map_collector.clone();
scope.spawn(move |_| {
@ -509,7 +686,9 @@ pub async fn run_async_scan(
Arc::clone(&shared_profiler),
&repo_matcher_stats,
)
.and_then(|_| deduplicate_new_matches(&repo_datastore, 0))?;
.and_then(|_| {
deduplicate_new_matches(&repo_datastore, global_args, &args, 0)
})?;
if args.baseline_file.is_some() || args.manage_baseline {
let mut ds = repo_datastore.lock().unwrap();
@ -568,11 +747,11 @@ pub async fn run_async_scan(
});
};
for root in repo_roots.clone() {
for root in repo_roots.iter().cloned() {
spawn_repo_scan(root);
}
for root in repo_rx.clone().iter() {
for root in repo_rx.iter() {
spawn_repo_scan(root);
}
});
@ -586,51 +765,27 @@ pub async fn run_async_scan(
}
if output_to_file && ran_repo_scan.load(Ordering::Relaxed) {
crate::reporter::run(global_args, Arc::clone(&datastore), args)
crate::reporter::run(global_args, Arc::clone(datastore), args)
.context("Failed to run report command")?;
}
if !ran_repo_scan.load(Ordering::Relaxed) {
deduplicate_new_matches(&datastore, 0)?;
deduplicate_new_matches(datastore, global_args, args, 0)?;
apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), repo_roots)?;
if args.baseline_file.is_some() || args.manage_baseline {
let mut ds = datastore.lock().unwrap();
crate::baseline::apply_baseline(
&mut ds,
baseline_path.as_ref(),
args.manage_baseline,
&repo_roots,
)?;
}
if let Some(validation) = &validation_deps {
let (parser, clients, cache, rate_limiter) =
(&validation.0, &validation.1, &validation.2, &validation.3);
run_secret_validation(
Arc::clone(&datastore),
parser,
clients,
cache,
args.num_jobs,
None,
access_map_collector.clone(),
rate_limiter.clone(),
Duration::from_secs(args.validation_timeout),
args.validation_retries,
)
run_validation_phase(datastore, validation_deps, args, None, access_map_collector.clone())
.await?;
}
if let Some(collector) = access_map_collector.take() {
finalize_access_map(&datastore, collector, args).await?;
finalize_access_map(datastore, collector, args).await?;
}
crate::reporter::run(global_args, Arc::clone(&datastore), args)
crate::reporter::run(global_args, Arc::clone(datastore), args)
.context("Failed to run report command")?;
}
let aggregate_summary = if ran_repo_scan.load(Ordering::Relaxed) {
let totals = compute_scan_totals(&datastore, args, matcher_stats.as_ref());
let totals = compute_scan_totals(datastore, args, matcher_stats.as_ref());
let mut sorted: Vec<_> = datastore.lock().unwrap().get_summary().into_iter().collect();
sorted.sort_by(|a, b| b.1.cmp(&a.1));
Some((totals, sorted))
@ -641,7 +796,7 @@ pub async fn run_async_scan(
print_scan_summary(
start_time,
scan_started_at,
&datastore,
datastore,
global_args,
args,
rules_db,
@ -652,14 +807,18 @@ pub async fn run_async_scan(
aggregate_summary,
);
if let Some(collector) = access_map_collector {
finalize_access_map(&datastore, collector, args).await?;
if let Some(collector) = access_map_collector.take() {
finalize_access_map(datastore, collector, args).await?;
} else {
maybe_hint_access_map(&datastore, args);
maybe_hint_access_map(datastore, args);
}
Ok(())
}
// =================================================================================================
// Existing helper functions (unchanged)
// =================================================================================================
async fn finalize_access_map(
datastore: &Arc<Mutex<FindingsStore>>,
collector: AccessMapCollector,
@ -780,8 +939,6 @@ pub fn create_datastore_channel(
) {
const BATCH_SIZE: usize = 1024;
let channel_size = std::cmp::max(num_jobs * BATCH_SIZE, 16 * BATCH_SIZE);
// const BATCH_SIZE: usize = 256;
// let channel_size = std::cmp::max(num_jobs * BATCH_SIZE, 4096);
crossbeam_channel::bounded(channel_size)
}
@ -864,7 +1021,6 @@ pub fn load_and_record_rules(
) -> Result<RulesDatabase> {
let init_progress =
if use_progress { ProgressBar::new_spinner() } else { ProgressBar::hidden() };
// init_progress.set_message("Compiling rules...");
let rules_db = {
let loaded = RuleLoader::from_rule_specifiers(&args.rules)
.load(args)
@ -876,7 +1032,6 @@ pub fn load_and_record_rules(
.cloned()
.map(|mut rule| {
if let Some(min_entropy) = args.min_entropy {
// rule.syntax.min_entropy = min_entropy;
let _ = rule.set_entropy(min_entropy);
}
rule