forked from mirrors/kingfisher
refactored code
This commit is contained in:
parent
f62bfe103b
commit
d3e659491d
23 changed files with 1868 additions and 1828 deletions
|
|
@ -44,5 +44,8 @@ gix = { version = "0.73", default-features = false, features = ["serde"] }
|
|||
# Console formatting
|
||||
console = "0.15"
|
||||
|
||||
# Language detection for content types
|
||||
tokei = "14.0.0"
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = "1.4"
|
||||
|
|
|
|||
232
crates/kingfisher-core/src/content_type.rs
Normal file
232
crates/kingfisher-core/src/content_type.rs
Normal file
|
|
@ -0,0 +1,232 @@
|
|||
use once_cell::sync::Lazy;
|
||||
use std::path::Path;
|
||||
use tokei::LanguageType;
|
||||
|
||||
// Precompute all (shebang_prefix_bytes, language) pairs once.
|
||||
// Sort longest-first so more specific shebangs win.
|
||||
static SHEBANG_PREFIXES: Lazy<Vec<(&'static [u8], LanguageType)>> = Lazy::new(|| {
|
||||
let mut v = Vec::new();
|
||||
for &(lang, shebangs) in LanguageType::list() {
|
||||
for &sb in shebangs {
|
||||
v.push((sb.as_bytes(), lang));
|
||||
}
|
||||
}
|
||||
// Longest prefix first to prefer e.g. "#!/usr/bin/env python3" over "#!/usr/bin/env python"
|
||||
v.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
|
||||
v
|
||||
});
|
||||
|
||||
/// Coarse classification produced by content inspection.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ContentType {
    /// Data dominated by unprintable or control bytes.
    BINARY,
    /// Data that is mostly printable text.
    TEXT,
}
|
||||
|
||||
/// Heuristic thresholds for text vs. binary detection.
#[derive(Debug, Clone, PartialEq)]
pub struct ContentInspector {
    /// Input containing more NUL bytes than this is classified as binary.
    max_null_bytes: usize,
    /// Largest tolerated fraction of control bytes (excluding `\n`, `\r`, `\t`);
    /// a higher fraction classifies the input as binary.
    max_control_ratio: f64,
}
|
||||
|
||||
impl Default for ContentInspector {
|
||||
fn default() -> Self {
|
||||
Self { max_null_bytes: 4, max_control_ratio: 0.3 }
|
||||
}
|
||||
}
|
||||
|
||||
impl ContentInspector {
|
||||
/// Create a new inspector with default thresholds.
|
||||
#[inline]
|
||||
pub fn new() -> Self {
|
||||
Default::default()
|
||||
}
|
||||
|
||||
/// Classify `bytes` as TEXT or BINARY:
|
||||
///
|
||||
/// 1. If null-byte count > `max_null_bytes` -- `BINARY`.
|
||||
/// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` →
|
||||
/// `BINARY`.
|
||||
/// 3. Otherwise, `TEXT`.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn inspect(&self, bytes: &[u8]) -> ContentType {
|
||||
let nulls = bytes.iter().filter(|&&b| b == 0).count();
|
||||
if nulls > self.max_null_bytes {
|
||||
return ContentType::BINARY;
|
||||
}
|
||||
let controls =
|
||||
bytes.iter().filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t')).count();
|
||||
let ratio = if bytes.is_empty() { 0.0 } else { controls as f64 / bytes.len() as f64 };
|
||||
if ratio > self.max_control_ratio {
|
||||
ContentType::BINARY
|
||||
} else {
|
||||
ContentType::TEXT
|
||||
}
|
||||
}
|
||||
|
||||
/// Guess MIME type from `path` extension.
|
||||
///
|
||||
/// Returns:
|
||||
/// - `Some(mime)` if the extension is one of the known text or image types.
|
||||
/// - `None` if there is no extension or it’s unrecognized.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn guess_mime_type(&self, path: &Path) -> Option<String> {
|
||||
let ext = path.extension()?.to_str()?.to_ascii_lowercase();
|
||||
let mime = match ext.as_str() {
|
||||
"txt" | "md" | "rst" => "text/plain",
|
||||
"html" | "htm" => "text/html",
|
||||
"css" => "text/css",
|
||||
"js" => "application/javascript",
|
||||
"json" => "application/json",
|
||||
"xml" => "application/xml",
|
||||
"pdf" => "application/pdf",
|
||||
"jpg" | "jpeg" => "image/jpeg",
|
||||
"png" => "image/png",
|
||||
"gif" => "image/gif",
|
||||
_ => return None,
|
||||
};
|
||||
Some(mime.to_string())
|
||||
}
|
||||
|
||||
/// Detect UTF-8 encoding by attempting a lossless conversion.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
|
||||
std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
|
||||
}
|
||||
/// Guess programming language with broad coverage using `tokei`.
|
||||
///
|
||||
/// Strategy (no disk I/O):
|
||||
/// 1) Try extension via `LanguageType::from_file_extension`.
|
||||
/// 2) Handle common extensionless filenames (e.g., Makefile, Dockerfile, CMakeLists.txt).
|
||||
/// 3) Parse an in-memory shebang (first line) against `LanguageType::shebangs`.
|
||||
/// 4) Minimal content markers as a last resort.
|
||||
///
|
||||
/// Returns the canonical `tokei` language name (e.g., `Rust`, `Bash`, `Python`).
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option<String> {
|
||||
// 1) Extension mapping (fast, no I/O).
|
||||
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
||||
if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) {
|
||||
return Some(lang.name().to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure).
|
||||
if let Some(file) = path.file_name().and_then(|f| f.to_str()) {
|
||||
match file {
|
||||
"Makefile" | "makefile" => {
|
||||
return Some(LanguageType::Makefile.name().to_string());
|
||||
}
|
||||
"Dockerfile" | "dockerfile" => {
|
||||
return Some(LanguageType::Dockerfile.name().to_string());
|
||||
}
|
||||
"CMakeLists.txt" => {
|
||||
return Some(LanguageType::CMake.name().to_string());
|
||||
}
|
||||
"Rakefile" | "rakefile" => {
|
||||
return Some(LanguageType::Rakefile.name().to_string());
|
||||
}
|
||||
// Common ecosystem files; map to their primary language where sensible.
|
||||
"Gemfile" | "gemfile" => {
|
||||
return Some(LanguageType::Ruby.name().to_string());
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// 3) Shebang detection (in-memory): match by longest prefix, byte-wise (no UTF-8 needed).
|
||||
if let Some(first_line) = content.split(|&b| b == b'\n').next() {
|
||||
if first_line.starts_with(b"#!") {
|
||||
for (prefix, lang) in SHEBANG_PREFIXES.iter() {
|
||||
if first_line.starts_with(prefix) {
|
||||
return Some(lang.name().to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 4) Lightweight content markers to catch a few ubiquitous cases without I/O.
|
||||
let s = String::from_utf8_lossy(content);
|
||||
if s.contains("<?php") {
|
||||
return Some(LanguageType::Php.name().to_string());
|
||||
}
|
||||
if s.contains("package main") {
|
||||
return Some(LanguageType::Go.name().to_string());
|
||||
}
|
||||
if s.contains("public class") {
|
||||
return Some(LanguageType::Java.name().to_string());
|
||||
}
|
||||
if s.contains("#!/usr/bin/env bash") || s.contains("#!/bin/bash") {
|
||||
return Some(LanguageType::Bash.name().to_string());
|
||||
}
|
||||
if s.contains("#!/usr/bin/env python") {
|
||||
return Some(LanguageType::Python.name().to_string());
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Shorthand: inspect with default thresholds.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn inspect(bytes: &[u8]) -> ContentType {
|
||||
ContentInspector::default().inspect(bytes)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;

    /// NUL-heavy input is binary; ordinary text with a newline is text.
    #[test]
    fn binary_vs_text() {
        let inspector = ContentInspector::default();

        let noisy: Vec<u8> = vec![0, 1, 2, 0, 0, 0, 5];
        assert_eq!(inspector.inspect(&noisy), ContentType::BINARY);

        let plain = b"Hello\nWorld";
        assert_eq!(inspector.inspect(plain), ContentType::TEXT);
    }

    /// Known extensions map to their MIME type; unknown ones yield `None`.
    #[test]
    fn mime_guess() {
        let inspector = ContentInspector::default();
        let mime_of = |name: &str| inspector.guess_mime_type(&PathBuf::from(name));

        assert_eq!(mime_of("a.md"), Some("text/plain".into()));
        assert_eq!(mime_of("img.png"), Some("image/png".into()));
        assert_eq!(mime_of("x.xyz"), None);
    }

    /// Valid UTF-8 is reported as such; invalid byte sequences are not.
    #[test]
    fn charset_guess() {
        let inspector = ContentInspector::default();

        assert_eq!(inspector.guess_charset("ok".as_bytes()), Some("UTF-8".into()));
        assert_eq!(inspector.guess_charset(&[0xFF, 0xFE, 0xFD]), None);
    }

    /// Language detection by extension, by content marker and by shebang.
    #[test]
    fn language_guess() {
        let inspector = ContentInspector::default();

        // Compare case-insensitively by lowercasing the detected name.
        let lowered = |name: &str, content: &[u8]| {
            inspector
                .guess_language(&PathBuf::from(name), content)
                .map(|lang| lang.to_ascii_lowercase())
        };

        assert_eq!(lowered("main.rs", b""), Some("rust".into()));
        assert_eq!(lowered("x", b"<?php echo; ?>"), Some("php".into()));
        assert_eq!(lowered("run", b"#!/bin/bash\necho hi"), Some("bash".into()));
    }
}
|
||||
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
pub mod blob;
|
||||
pub mod bstring_escape;
|
||||
pub mod content_type;
|
||||
pub mod entropy;
|
||||
pub mod error;
|
||||
pub mod git_commit_metadata;
|
||||
|
|
@ -20,6 +21,7 @@ pub use blob::{
|
|||
Blob, BlobAppearance, BlobAppearanceSet, BlobData, BlobId, BlobIdMap, BlobMetadata,
|
||||
};
|
||||
pub use bstring_escape::Escaped;
|
||||
pub use content_type::{ContentInspector, ContentType};
|
||||
pub use entropy::calculate_shannon_entropy;
|
||||
pub use error::{Error, Result};
|
||||
pub use git_commit_metadata::CommitMetadata;
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@
|
|||
//! - **validation-all**: Enable all validation features
|
||||
|
||||
mod finding;
|
||||
pub mod primitives;
|
||||
mod scanner;
|
||||
mod scanner_pool;
|
||||
|
||||
|
|
|
|||
187
crates/kingfisher-scanner/src/primitives.rs
Normal file
187
crates/kingfisher-scanner/src/primitives.rs
Normal file
|
|
@ -0,0 +1,187 @@
|
|||
//! Shared matching primitives for secret detection.
|
||||
//!
|
||||
//! These functions are used by both the high-level `Scanner` API and the
|
||||
//! binary crate's `Matcher`. Having a single canonical implementation
|
||||
//! eliminates duplicated logic across the codebase.
|
||||
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
use base64::{engine::general_purpose, Engine};
|
||||
use kingfisher_core::OffsetSpan;
|
||||
use rustc_hash::{FxHashMap, FxHasher};
|
||||
use xxhash_rust::xxh3::xxh3_64;
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Base64 detection
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
/// A Base64 run that decoded successfully, together with where it was found.
#[derive(Clone, Debug)]
pub struct DecodedData {
    /// The decoded bytes.
    pub decoded: Vec<u8>,
    /// Byte offset in the scanned input at which the encoded run starts.
    pub pos_start: usize,
    /// Byte offset one past the end of the encoded run (any consumed `=` padding included).
    pub pos_end: usize,
}
|
||||
|
||||
/// Returns `true` when `b` belongs to the standard (`+`, `/`) or URL-safe (`-`, `_`)
/// Base64 alphabet. The padding byte `=` is not accepted here.
#[inline]
pub fn is_base64_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'-' | b'_')
}
|
||||
|
||||
/// Finds standalone Base64-encoded strings in the input and returns decoded data
|
||||
/// with byte-offset positions.
|
||||
pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
|
||||
let mut results = Vec::new();
|
||||
let mut i = 0;
|
||||
while i < input.len() {
|
||||
while i < input.len() && !is_base64_byte(input[i]) {
|
||||
i += 1;
|
||||
}
|
||||
let start = i;
|
||||
while i < input.len() && is_base64_byte(input[i]) {
|
||||
i += 1;
|
||||
}
|
||||
|
||||
let mut eq_count = 0;
|
||||
while i < input.len() && input[i] == b'=' && eq_count < 2 {
|
||||
i += 1;
|
||||
eq_count += 1;
|
||||
}
|
||||
let end = i;
|
||||
|
||||
let len = end - start;
|
||||
if len >= 32 && len % 4 == 0 {
|
||||
let base64_slice = &input[start..end];
|
||||
|
||||
// Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD
|
||||
let decode_result = general_purpose::STANDARD
|
||||
.decode(base64_slice)
|
||||
.or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
|
||||
.or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));
|
||||
|
||||
if let Ok(decoded) = decode_result {
|
||||
if decoded.is_ascii() {
|
||||
results.push(DecodedData { decoded, pos_start: start, pos_end: end });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Match deduplication
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
/// Computes a deduplication key for a match based on content, rule ID, and span.
|
||||
#[inline]
|
||||
pub fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
|
||||
let mut hasher = FxHasher::default();
|
||||
// Hash each component directly without allocation
|
||||
content.hash(&mut hasher);
|
||||
rule_id.hash(&mut hasher);
|
||||
start.hash(&mut hasher);
|
||||
end.hash(&mut hasher);
|
||||
hasher.finish()
|
||||
}
|
||||
|
||||
/// Inserts a span into a sorted list of spans, handling containment.
|
||||
///
|
||||
/// Returns `false` if the span is already contained in an existing span
|
||||
/// (i.e., it's redundant and should be skipped).
|
||||
#[inline]
|
||||
pub fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
|
||||
let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
|
||||
if idx > 0 {
|
||||
if spans[idx - 1].fully_contains(&span) {
|
||||
return false;
|
||||
}
|
||||
if span.fully_contains(&spans[idx - 1]) {
|
||||
spans.remove(idx - 1);
|
||||
idx -= 1;
|
||||
}
|
||||
}
|
||||
if idx < spans.len() {
|
||||
if spans[idx].fully_contains(&span) {
|
||||
return false;
|
||||
}
|
||||
if span.fully_contains(&spans[idx]) {
|
||||
spans.remove(idx);
|
||||
}
|
||||
}
|
||||
spans.insert(idx, span);
|
||||
true
|
||||
}
|
||||
|
||||
/// Records a match span for a given rule, returning `false` if it's a duplicate.
|
||||
#[inline]
|
||||
pub fn record_match(
|
||||
map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
|
||||
rule_id: usize,
|
||||
span: OffsetSpan,
|
||||
) -> bool {
|
||||
insert_span(map.entry(rule_id).or_default(), span)
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Finding fingerprint
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
/// Computes a stable fingerprint for a finding based on its value, location, and origin.
|
||||
pub fn compute_finding_fingerprint(
|
||||
finding_value: &str,
|
||||
file_or_commit: &str,
|
||||
offset_start: u64,
|
||||
offset_end: u64,
|
||||
) -> u64 {
|
||||
// Combine all into a byte buffer and hash it directly:
|
||||
let mut buf = Vec::with_capacity(
|
||||
finding_value.len() + file_or_commit.len() + 2 * std::mem::size_of::<u64>(),
|
||||
);
|
||||
buf.extend_from_slice(finding_value.as_bytes());
|
||||
buf.extend_from_slice(file_or_commit.as_bytes());
|
||||
buf.extend_from_slice(&offset_start.to_le_bytes());
|
||||
buf.extend_from_slice(&offset_end.to_le_bytes());
|
||||
|
||||
xxh3_64(&buf)
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Secret capture selection
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
/// Selects the "secret" capture from the regex match using the priority:
|
||||
/// 1. Named capture called TOKEN (case-insensitive)
|
||||
/// 2. First matched named capture
|
||||
/// 3. First positional capture (group 1)
|
||||
/// 4. Full match (group 0)
|
||||
pub fn find_secret_capture<'a>(
|
||||
re: ®ex::bytes::Regex,
|
||||
captures: ®ex::bytes::Captures<'a>,
|
||||
) -> regex::bytes::Match<'a> {
|
||||
// 1. Prefer a named capture called TOKEN (case-insensitive).
|
||||
if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
|
||||
name_opt.filter(|name| name.eq_ignore_ascii_case("TOKEN")).and_then(|_| captures.get(i))
|
||||
}) {
|
||||
return token_cap;
|
||||
}
|
||||
|
||||
// 2. Otherwise, prefer the first *matched* named capture.
|
||||
if let Some(named_cap) = re
|
||||
.capture_names()
|
||||
.enumerate()
|
||||
.find_map(|(i, name_opt)| name_opt.and_then(|_| captures.get(i)))
|
||||
{
|
||||
return named_cap;
|
||||
}
|
||||
|
||||
// 3. Otherwise, fall back to the first positional capture (group 1).
|
||||
if let Some(pos_cap) = captures.get(1) {
|
||||
return pos_cap;
|
||||
}
|
||||
|
||||
// 4. Finally, fall back to the full match (group 0).
|
||||
captures.get(0).unwrap()
|
||||
}
|
||||
|
|
@ -5,15 +5,13 @@ use std::path::Path;
|
|||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Result;
|
||||
use base64::{engine::general_purpose, Engine};
|
||||
use kingfisher_core::{calculate_shannon_entropy, Blob, BlobIdMap, LocationMapping, OffsetSpan};
|
||||
use kingfisher_rules::RulesDatabase;
|
||||
use regex::bytes::Regex;
|
||||
use rustc_hash::{FxHashMap, FxHashSet};
|
||||
use tracing::debug;
|
||||
use xxhash_rust::xxh3::xxh3_64;
|
||||
|
||||
use crate::finding::{Finding, FindingLocation};
|
||||
use crate::primitives;
|
||||
use crate::scanner_pool::ScannerPool;
|
||||
|
||||
/// Configuration options for the scanner.
|
||||
|
|
@ -185,21 +183,15 @@ impl Scanner {
|
|||
let current_span = OffsetSpan::from_range(start..end);
|
||||
|
||||
// Check for overlapping spans
|
||||
if !self.record_span(&mut previous_spans, rule_id, current_span) {
|
||||
if !primitives::record_match(&mut previous_spans, rule_id, current_span) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let haystack = &bytes[start..end];
|
||||
|
||||
for captures in anchored_regex.captures_iter(haystack) {
|
||||
let full_capture = match captures.get(0) {
|
||||
Some(c) => c,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// Get the primary secret value
|
||||
let secret_capture =
|
||||
self.get_secret_capture(&anchored_regex, &captures, full_capture);
|
||||
let secret_capture = primitives::find_secret_capture(&anchored_regex, &captures);
|
||||
let secret_bytes = secret_capture.as_bytes();
|
||||
|
||||
// Check entropy
|
||||
|
|
@ -211,7 +203,7 @@ impl Scanner {
|
|||
}
|
||||
|
||||
// Compute match key for dedup
|
||||
let match_key = self.compute_match_key(
|
||||
let match_key = primitives::compute_match_key(
|
||||
secret_bytes,
|
||||
rule.id().as_bytes(),
|
||||
start + secret_capture.start(),
|
||||
|
|
@ -242,7 +234,7 @@ impl Scanner {
|
|||
}
|
||||
}
|
||||
|
||||
let fingerprint = self.compute_fingerprint(
|
||||
let fingerprint = primitives::compute_finding_fingerprint(
|
||||
&secret,
|
||||
&blob.id().to_string(),
|
||||
offset_span.start as u64,
|
||||
|
|
@ -295,92 +287,6 @@ impl Scanner {
|
|||
// In a real implementation, you'd want to add a clear method or use a different approach
|
||||
}
|
||||
|
||||
fn get_secret_capture<'a>(
|
||||
&self,
|
||||
regex: &Regex,
|
||||
captures: ®ex::bytes::Captures<'a>,
|
||||
full_capture: regex::bytes::Match<'a>,
|
||||
) -> regex::bytes::Match<'a> {
|
||||
// Prefer named capture called TOKEN
|
||||
for (i, name_opt) in regex.capture_names().enumerate() {
|
||||
if let Some(name) = name_opt {
|
||||
if name.eq_ignore_ascii_case("TOKEN") {
|
||||
if let Some(cap) = captures.get(i) {
|
||||
return cap;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise, first named capture
|
||||
for (i, name_opt) in regex.capture_names().enumerate() {
|
||||
if name_opt.is_some() {
|
||||
if let Some(cap) = captures.get(i) {
|
||||
return cap;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise, first positional capture (group 1)
|
||||
if let Some(cap) = captures.get(1) {
|
||||
return cap;
|
||||
}
|
||||
|
||||
// Fall back to full match
|
||||
full_capture
|
||||
}
|
||||
|
||||
fn record_span(
|
||||
&self,
|
||||
map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
|
||||
rule_id: usize,
|
||||
span: OffsetSpan,
|
||||
) -> bool {
|
||||
let spans = map.entry(rule_id).or_default();
|
||||
|
||||
// Binary search for insertion point
|
||||
let idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
|
||||
|
||||
// Check if new span is contained in an existing one
|
||||
if idx > 0 && spans[idx - 1].fully_contains(&span) {
|
||||
return false;
|
||||
}
|
||||
if idx < spans.len() && spans[idx].fully_contains(&span) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Remove spans that the new span contains
|
||||
let remove_idx = idx;
|
||||
while remove_idx < spans.len() && span.fully_contains(&spans[remove_idx]) {
|
||||
spans.remove(remove_idx);
|
||||
}
|
||||
if idx > 0 && span.fully_contains(&spans[idx - 1]) {
|
||||
spans.remove(idx - 1);
|
||||
}
|
||||
|
||||
spans.insert(idx.min(spans.len()), span);
|
||||
true
|
||||
}
|
||||
|
||||
fn compute_match_key(&self, content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
|
||||
use std::hash::{Hash, Hasher};
|
||||
let mut hasher = rustc_hash::FxHasher::default();
|
||||
content.hash(&mut hasher);
|
||||
rule_id.hash(&mut hasher);
|
||||
start.hash(&mut hasher);
|
||||
end.hash(&mut hasher);
|
||||
hasher.finish()
|
||||
}
|
||||
|
||||
fn compute_fingerprint(&self, value: &str, blob_id: &str, start: u64, end: u64) -> u64 {
|
||||
let mut buf = Vec::with_capacity(value.len() + blob_id.len() + 16);
|
||||
buf.extend_from_slice(value.as_bytes());
|
||||
buf.extend_from_slice(blob_id.as_bytes());
|
||||
buf.extend_from_slice(&start.to_le_bytes());
|
||||
buf.extend_from_slice(&end.to_le_bytes());
|
||||
xxh3_64(&buf)
|
||||
}
|
||||
|
||||
fn redact(&self, bytes: &[u8]) -> String {
|
||||
let s = String::from_utf8_lossy(bytes);
|
||||
if s.len() <= 8 {
|
||||
|
|
@ -400,7 +306,7 @@ impl Scanner {
|
|||
let bytes = blob.bytes();
|
||||
|
||||
// Find Base64-encoded strings
|
||||
let b64_items = self.find_base64_strings(bytes);
|
||||
let b64_items = primitives::get_base64_strings(bytes);
|
||||
|
||||
for item in b64_items {
|
||||
// Try to match decoded content against all rules
|
||||
|
|
@ -411,12 +317,7 @@ impl Scanner {
|
|||
};
|
||||
|
||||
for captures in regex.captures_iter(&item.decoded) {
|
||||
let full_capture = match captures.get(0) {
|
||||
Some(c) => c,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let secret_capture = self.get_secret_capture(®ex, &captures, full_capture);
|
||||
let secret_capture = primitives::find_secret_capture(®ex, &captures);
|
||||
let secret_bytes = secret_capture.as_bytes();
|
||||
|
||||
let min_entropy =
|
||||
|
|
@ -426,7 +327,7 @@ impl Scanner {
|
|||
continue;
|
||||
}
|
||||
|
||||
let match_key = self.compute_match_key(
|
||||
let match_key = primitives::compute_match_key(
|
||||
secret_bytes,
|
||||
rule.id().as_bytes(),
|
||||
item.pos_start,
|
||||
|
|
@ -455,7 +356,7 @@ impl Scanner {
|
|||
}
|
||||
}
|
||||
|
||||
let fingerprint = self.compute_fingerprint(
|
||||
let fingerprint = primitives::compute_finding_fingerprint(
|
||||
&secret,
|
||||
&blob.id().to_string(),
|
||||
offset_span.start as u64,
|
||||
|
|
@ -488,62 +389,6 @@ impl Scanner {
|
|||
|
||||
findings
|
||||
}
|
||||
|
||||
fn find_base64_strings(&self, input: &[u8]) -> Vec<DecodedData> {
|
||||
let mut results = Vec::new();
|
||||
let mut i = 0;
|
||||
|
||||
while i < input.len() {
|
||||
// Skip non-base64 characters
|
||||
while i < input.len() && !Self::is_base64_byte(input[i]) {
|
||||
i += 1;
|
||||
}
|
||||
let start = i;
|
||||
|
||||
// Collect base64 characters
|
||||
while i < input.len() && Self::is_base64_byte(input[i]) {
|
||||
i += 1;
|
||||
}
|
||||
|
||||
// Handle padding
|
||||
let mut eq_count = 0;
|
||||
while i < input.len() && input[i] == b'=' && eq_count < 2 {
|
||||
i += 1;
|
||||
eq_count += 1;
|
||||
}
|
||||
let end = i;
|
||||
|
||||
let len = end - start;
|
||||
if len >= 32 && len % 4 == 0 {
|
||||
let base64_slice = &input[start..end];
|
||||
|
||||
// Try decoding
|
||||
let decode_result = general_purpose::STANDARD
|
||||
.decode(base64_slice)
|
||||
.or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
|
||||
.or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));
|
||||
|
||||
if let Ok(decoded) = decode_result {
|
||||
if decoded.is_ascii() {
|
||||
results.push(DecodedData { decoded, pos_start: start, pos_end: end });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_base64_byte(b: u8) -> bool {
|
||||
matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_')
|
||||
}
|
||||
}
|
||||
|
||||
struct DecodedData {
|
||||
decoded: Vec<u8>,
|
||||
pos_start: usize,
|
||||
pos_end: usize,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
|
|||
|
|
@ -18,6 +18,22 @@ pub(crate) mod postgres;
|
|||
mod report;
|
||||
mod slack;
|
||||
|
||||
/// Trait for access map providers that map a single token to an access profile.
|
||||
///
|
||||
/// This covers the majority of providers (GitHub, GitLab, Slack, HuggingFace,
|
||||
/// Gitea, Bitbucket). Providers with more complex credentials (AWS, GCP, Azure,
|
||||
/// Postgres, MongoDB) use their own custom interfaces.
|
||||
pub trait TokenAccessMapper: Send + Sync {
|
||||
/// The cloud/platform name for results (e.g., `"github"`, `"slack"`).
|
||||
fn cloud_name(&self) -> &'static str;
|
||||
|
||||
/// Maps a single token to an access map result.
|
||||
fn map_access_from_token(
|
||||
&self,
|
||||
token: &str,
|
||||
) -> impl std::future::Future<Output = Result<AccessMapResult>> + Send;
|
||||
}
|
||||
|
||||
/// Run the identity mapping workflow for the selected cloud provider.
|
||||
pub async fn run(args: AccessMapArgs) -> Result<()> {
|
||||
let result = match args.provider {
|
||||
|
|
@ -244,24 +260,15 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
|
|||
.unwrap_or_else(|err| build_failed_result("azure_devops", "pat", err)),
|
||||
fingerprint,
|
||||
),
|
||||
AccessMapRequest::Github { token, fingerprint } => (
|
||||
github::map_access_from_token(&token)
|
||||
.await
|
||||
.unwrap_or_else(|err| build_failed_result("github", "token", err)),
|
||||
fingerprint,
|
||||
),
|
||||
AccessMapRequest::Gitlab { token, fingerprint } => (
|
||||
gitlab::map_access_from_token(&token)
|
||||
.await
|
||||
.unwrap_or_else(|err| build_failed_result("gitlab", "token", err)),
|
||||
fingerprint,
|
||||
),
|
||||
AccessMapRequest::Slack { token, fingerprint } => (
|
||||
slack::map_access_from_token(&token)
|
||||
.await
|
||||
.unwrap_or_else(|err| build_failed_result("slack", "token", err)),
|
||||
fingerprint,
|
||||
),
|
||||
AccessMapRequest::Github { token, fingerprint } => {
|
||||
(map_token(&GithubMapper, &token).await, fingerprint)
|
||||
}
|
||||
AccessMapRequest::Gitlab { token, fingerprint } => {
|
||||
(map_token(&GitlabMapper, &token).await, fingerprint)
|
||||
}
|
||||
AccessMapRequest::Slack { token, fingerprint } => {
|
||||
(map_token(&SlackMapper, &token).await, fingerprint)
|
||||
}
|
||||
AccessMapRequest::Postgres { uri, fingerprint } => (
|
||||
postgres::map_access_from_uri(&uri)
|
||||
.await
|
||||
|
|
@ -274,24 +281,15 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
|
|||
.unwrap_or_else(|err| build_failed_result("mongodb", "uri", err)),
|
||||
fingerprint,
|
||||
),
|
||||
AccessMapRequest::HuggingFace { token, fingerprint } => (
|
||||
huggingface::map_access_from_token(&token)
|
||||
.await
|
||||
.unwrap_or_else(|err| build_failed_result("huggingface", "token", err)),
|
||||
fingerprint,
|
||||
),
|
||||
AccessMapRequest::Gitea { token, fingerprint } => (
|
||||
gitea::map_access_from_token(&token)
|
||||
.await
|
||||
.unwrap_or_else(|err| build_failed_result("gitea", "token", err)),
|
||||
fingerprint,
|
||||
),
|
||||
AccessMapRequest::Bitbucket { token, fingerprint } => (
|
||||
bitbucket::map_access_from_token(&token)
|
||||
.await
|
||||
.unwrap_or_else(|err| build_failed_result("bitbucket", "token", err)),
|
||||
fingerprint,
|
||||
),
|
||||
AccessMapRequest::HuggingFace { token, fingerprint } => {
|
||||
(map_token(&HuggingFaceMapper, &token).await, fingerprint)
|
||||
}
|
||||
AccessMapRequest::Gitea { token, fingerprint } => {
|
||||
(map_token(&GiteaMapper, &token).await, fingerprint)
|
||||
}
|
||||
AccessMapRequest::Bitbucket { token, fingerprint } => {
|
||||
(map_token(&BitbucketMapper, &token).await, fingerprint)
|
||||
}
|
||||
};
|
||||
|
||||
mapped.fingerprint = Some(fp);
|
||||
|
|
@ -301,12 +299,106 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
|
|||
results
|
||||
}
|
||||
|
||||
/// Maps a token credential using a `TokenAccessMapper`, with fallback error handling.
|
||||
async fn map_token(mapper: &impl TokenAccessMapper, token: &str) -> AccessMapResult {
|
||||
mapper
|
||||
.map_access_from_token(token)
|
||||
.await
|
||||
.unwrap_or_else(|err| build_failed_result(mapper.cloud_name(), "token", err))
|
||||
}
|
||||
|
||||
/// Write HTML/JSON outputs for a collection of identity map results.
|
||||
pub fn write_reports(results: &[AccessMapResult], html_out: &std::path::Path) -> Result<()> {
|
||||
report::generate_html_report_multi(results, html_out)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// TokenAccessMapper implementations
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
/// GitHub access mapper.
|
||||
pub struct GithubMapper;
|
||||
|
||||
impl TokenAccessMapper for GithubMapper {
|
||||
fn cloud_name(&self) -> &'static str {
|
||||
"github"
|
||||
}
|
||||
|
||||
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
|
||||
github::map_access_from_token(token).await
|
||||
}
|
||||
}
|
||||
|
||||
/// GitLab access mapper.
|
||||
pub struct GitlabMapper;
|
||||
|
||||
impl TokenAccessMapper for GitlabMapper {
|
||||
fn cloud_name(&self) -> &'static str {
|
||||
"gitlab"
|
||||
}
|
||||
|
||||
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
|
||||
gitlab::map_access_from_token(token).await
|
||||
}
|
||||
}
|
||||
|
||||
/// Slack access mapper.
|
||||
pub struct SlackMapper;
|
||||
|
||||
impl TokenAccessMapper for SlackMapper {
|
||||
fn cloud_name(&self) -> &'static str {
|
||||
"slack"
|
||||
}
|
||||
|
||||
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
|
||||
slack::map_access_from_token(token).await
|
||||
}
|
||||
}
|
||||
|
||||
/// HuggingFace access mapper.
|
||||
pub struct HuggingFaceMapper;
|
||||
|
||||
impl TokenAccessMapper for HuggingFaceMapper {
|
||||
fn cloud_name(&self) -> &'static str {
|
||||
"huggingface"
|
||||
}
|
||||
|
||||
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
|
||||
huggingface::map_access_from_token(token).await
|
||||
}
|
||||
}
|
||||
|
||||
/// Gitea access mapper.
|
||||
pub struct GiteaMapper;
|
||||
|
||||
impl TokenAccessMapper for GiteaMapper {
|
||||
fn cloud_name(&self) -> &'static str {
|
||||
"gitea"
|
||||
}
|
||||
|
||||
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
|
||||
gitea::map_access_from_token(token).await
|
||||
}
|
||||
}
|
||||
|
||||
/// Bitbucket access mapper.
|
||||
pub struct BitbucketMapper;
|
||||
|
||||
impl TokenAccessMapper for BitbucketMapper {
|
||||
fn cloud_name(&self) -> &'static str {
|
||||
"bitbucket"
|
||||
}
|
||||
|
||||
async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
|
||||
bitbucket::map_access_from_token(token).await
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Helper functions
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
fn severity_to_str(severity: Severity) -> &'static str {
|
||||
match severity {
|
||||
Severity::Low => "low",
|
||||
|
|
|
|||
91
src/azure.rs
91
src/azure.rs
|
|
@ -13,13 +13,12 @@ use std::{
|
|||
// let us opt into newer API versions as Microsoft rolls them out.
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use globset::{Glob, GlobSet, GlobSetBuilder};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use serde::Deserialize;
|
||||
use tracing::warn;
|
||||
use url::{form_urlencoded, Url};
|
||||
|
||||
use crate::{findings_store, git_url::GitUrl};
|
||||
use crate::{findings_store, git_host, git_url::GitUrl};
|
||||
|
||||
const API_VERSION: &str = "7.1-preview.1";
|
||||
|
||||
|
|
@ -55,33 +54,6 @@ impl RepoSpecifiers {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ExcludeMatcher {
|
||||
exact: HashSet<String>,
|
||||
globs: Option<GlobSet>,
|
||||
}
|
||||
|
||||
impl ExcludeMatcher {
|
||||
fn matches(&self, name: &str) -> bool {
|
||||
let candidate = name.to_lowercase();
|
||||
if self.exact.contains(&candidate) {
|
||||
return true;
|
||||
}
|
||||
if let Some(globs) = &self.globs {
|
||||
return globs.is_match(&candidate);
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.exact.is_empty() && self.globs.is_none()
|
||||
}
|
||||
}
|
||||
|
||||
fn looks_like_glob(pattern: &str) -> bool {
|
||||
pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
|
||||
}
|
||||
|
||||
/// Encodes `segment` with `form_urlencoded::byte_serialize` and collects the pieces into an
/// owned `String`.
// NOTE(review): `application/x-www-form-urlencoded` serialization writes a space as `+`;
// confirm that is acceptable wherever the result ends up inside a URL.
fn encode_segment(segment: &str) -> String {
|
||||
form_urlencoded::byte_serialize(segment.as_bytes()).collect::<String>()
|
||||
}
|
||||
|
|
@ -129,7 +101,7 @@ fn parse_repo_identifier_from_path(path: &str) -> Option<String> {
|
|||
|
||||
let org = org.to_lowercase();
|
||||
let project_raw = project.to_string();
|
||||
if looks_like_glob(&project_raw) {
|
||||
if git_host::looks_like_glob(&project_raw) {
|
||||
let pattern = format!("{org}/{}/**", project_raw.to_lowercase());
|
||||
return Some(pattern);
|
||||
}
|
||||
|
|
@ -197,59 +169,16 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
|
|||
parse_repo_identifier_from_path(trimmed)
|
||||
}
|
||||
|
||||
fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
|
||||
let mut exact = HashSet::new();
|
||||
let mut glob_builder = GlobSetBuilder::new();
|
||||
let mut has_glob = false;
|
||||
|
||||
for raw in exclude_repos {
|
||||
match parse_excluded_repo(raw) {
|
||||
Some(name) => {
|
||||
let normalized = name.to_lowercase();
|
||||
if looks_like_glob(&normalized) {
|
||||
match Glob::new(&normalized) {
|
||||
Ok(glob) => {
|
||||
glob_builder.add(glob);
|
||||
has_glob = true;
|
||||
}
|
||||
Err(err) => {
|
||||
warn!("Ignoring invalid Azure exclusion pattern '{raw}': {err}");
|
||||
exact.insert(normalized);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
exact.insert(normalized);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
warn!("Ignoring invalid Azure exclusion '{raw}' (expected organization/project[/repository])");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let globs = if has_glob {
|
||||
match glob_builder.build() {
|
||||
Ok(set) => Some(set),
|
||||
Err(err) => {
|
||||
warn!("Failed to build Azure exclusion patterns: {err}");
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
ExcludeMatcher { exact, globs }
|
||||
/// Builds the Azure exclusion matcher by delegating to `git_host::build_exclude_matcher`.
///
/// Each raw entry is parsed with `parse_excluded_repo`, and the parsed name is lowercased
/// before it is handed to the shared builder. `"Azure"` is the platform name passed along.
fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
|
||||
git_host::build_exclude_matcher(
|
||||
exclude_repos,
|
||||
// Closure: parse the raw entry, then lowercase the resulting name.
|raw| parse_excluded_repo(raw).map(|name| name.to_lowercase()),
|
||||
"Azure",
|
||||
)
|
||||
}
|
||||
|
||||
fn should_exclude_repo(repo_url: &str, excludes: &ExcludeMatcher) -> bool {
|
||||
if excludes.is_empty() {
|
||||
return false;
|
||||
}
|
||||
if let Some(name) = parse_repo_identifier_from_url(repo_url) {
|
||||
return excludes.matches(&name);
|
||||
}
|
||||
false
|
||||
fn should_exclude_repo(repo_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
|
||||
git_host::should_exclude_repo(repo_url, excludes, parse_repo_identifier_from_url)
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Default)]
|
||||
|
|
|
|||
|
|
@ -6,8 +6,8 @@ use std::{
|
|||
time::Duration,
|
||||
};
|
||||
|
||||
use crate::git_host;
|
||||
use anyhow::{Context, Result};
|
||||
use globset::{Glob, GlobSet, GlobSetBuilder};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use reqwest::Url;
|
||||
use serde::Deserialize;
|
||||
|
|
@ -136,32 +136,6 @@ impl BitbucketKind {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ExcludeMatcher {
|
||||
exact: HashSet<String>,
|
||||
globs: Option<GlobSet>,
|
||||
}
|
||||
|
||||
impl ExcludeMatcher {
|
||||
fn matches(&self, name: &str) -> bool {
|
||||
if self.exact.contains(name) {
|
||||
return true;
|
||||
}
|
||||
if let Some(globs) = &self.globs {
|
||||
return globs.is_match(name);
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.exact.is_empty() && self.globs.is_none()
|
||||
}
|
||||
}
|
||||
|
||||
fn looks_like_glob(pattern: &str) -> bool {
|
||||
pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
|
||||
}
|
||||
|
||||
fn normalize_repo_identifier(owner: &str, repo: &str) -> Option<String> {
|
||||
let owner = owner.trim().trim_matches('/');
|
||||
let repo = repo.trim().trim_matches('/');
|
||||
|
|
@ -207,58 +181,12 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
|
|||
parse_repo_name_from_path(trimmed)
|
||||
}
|
||||
|
||||
fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
|
||||
let mut exact = HashSet::new();
|
||||
let mut glob_builder = GlobSetBuilder::new();
|
||||
let mut has_glob = false;
|
||||
|
||||
for raw in exclude_repos {
|
||||
match parse_excluded_repo(raw) {
|
||||
Some(name) => {
|
||||
if looks_like_glob(&name) {
|
||||
match Glob::new(&name) {
|
||||
Ok(glob) => {
|
||||
glob_builder.add(glob);
|
||||
has_glob = true;
|
||||
}
|
||||
Err(err) => {
|
||||
warn!("Ignoring invalid Bitbucket exclusion pattern '{raw}': {err}");
|
||||
exact.insert(name);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
exact.insert(name);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
warn!("Ignoring invalid Bitbucket exclusion '{raw}' (expected owner/repo)");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let globs = if has_glob {
|
||||
match glob_builder.build() {
|
||||
Ok(set) => Some(set),
|
||||
Err(err) => {
|
||||
warn!("Failed to build Bitbucket exclusion patterns: {err}");
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
ExcludeMatcher { exact, globs }
|
||||
/// Builds the Bitbucket exclusion matcher by delegating to `git_host::build_exclude_matcher`,
/// using `parse_excluded_repo` as the entry parser and `"Bitbucket"` as the platform name.
fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
|
||||
git_host::build_exclude_matcher(exclude_repos, parse_excluded_repo, "Bitbucket")
|
||||
}
|
||||
|
||||
fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
|
||||
if excludes.is_empty() {
|
||||
return false;
|
||||
}
|
||||
if let Some(name) = parse_repo_name_from_url(clone_url) {
|
||||
return excludes.matches(&name);
|
||||
}
|
||||
false
|
||||
/// Checks `clone_url` against `excludes` by delegating to `git_host::should_exclude_repo`,
/// with `parse_repo_name_from_url` extracting the repository name from the URL.
fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
|
||||
git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url)
|
||||
}
|
||||
|
||||
fn repo_clone_url_from_links(links: &[CloneLink]) -> Option<String> {
|
||||
|
|
@ -343,7 +271,7 @@ async fn fetch_cloud_repositories(
|
|||
owner: &str,
|
||||
auth: &AuthConfig,
|
||||
repo_filter: RepoType,
|
||||
excludes: &ExcludeMatcher,
|
||||
excludes: &git_host::ExcludeMatcher,
|
||||
results: &mut Vec<String>,
|
||||
) -> Result<()> {
|
||||
let mut next = base
|
||||
|
|
@ -387,7 +315,7 @@ async fn fetch_server_repositories(
|
|||
path: &str,
|
||||
auth: &AuthConfig,
|
||||
repo_filter: RepoType,
|
||||
excludes: &ExcludeMatcher,
|
||||
excludes: &git_host::ExcludeMatcher,
|
||||
results: &mut Vec<String>,
|
||||
) -> Result<()> {
|
||||
let mut start = 0u64;
|
||||
|
|
|
|||
|
|
@ -1,232 +1,2 @@
|
|||
use once_cell::sync::Lazy;
|
||||
use std::path::Path;
|
||||
use tokei::LanguageType;
|
||||
|
||||
// Precompute all (shebang_prefix_bytes, language) pairs once.
|
||||
// Sort longest-first so more specific shebangs win.
|
||||
static SHEBANG_PREFIXES: Lazy<Vec<(&'static [u8], LanguageType)>> = Lazy::new(|| {
|
||||
let mut v = Vec::new();
|
||||
for &(lang, shebangs) in LanguageType::list() {
|
||||
for &sb in shebangs {
|
||||
v.push((sb.as_bytes(), lang));
|
||||
}
|
||||
}
|
||||
// Longest prefix first to prefer e.g. "#!/usr/bin/env python3" over "#!/usr/bin/env python"
|
||||
v.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
|
||||
v
|
||||
});
|
||||
|
||||
/// The type of content detected.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum ContentType {
|
||||
/// Unprintable or control-heavy data.
|
||||
BINARY,
|
||||
/// Mostly printable text.
|
||||
TEXT,
|
||||
}
|
||||
|
||||
/// Heuristic thresholds for text vs. binary detection.
|
||||
pub struct ContentInspector {
|
||||
max_null_bytes: usize,
|
||||
max_control_ratio: f64,
|
||||
}
|
||||
|
||||
impl Default for ContentInspector {
|
||||
fn default() -> Self {
|
||||
Self { max_null_bytes: 4, max_control_ratio: 0.3 }
|
||||
}
|
||||
}
|
||||
|
||||
impl ContentInspector {
|
||||
/// Create a new inspector with default thresholds.
|
||||
#[inline]
|
||||
pub fn new() -> Self {
|
||||
Default::default()
|
||||
}
|
||||
|
||||
/// Classify `bytes` as TEXT or BINARY:
|
||||
///
|
||||
/// 1. If null-byte count > `max_null_bytes` -- `BINARY`.
|
||||
/// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` →
|
||||
/// `BINARY`.
|
||||
/// 3. Otherwise, `TEXT`.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn inspect(&self, bytes: &[u8]) -> ContentType {
|
||||
let nulls = bytes.iter().filter(|&&b| b == 0).count();
|
||||
if nulls > self.max_null_bytes {
|
||||
return ContentType::BINARY;
|
||||
}
|
||||
let controls =
|
||||
bytes.iter().filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t')).count();
|
||||
let ratio = if bytes.is_empty() { 0.0 } else { controls as f64 / bytes.len() as f64 };
|
||||
if ratio > self.max_control_ratio {
|
||||
ContentType::BINARY
|
||||
} else {
|
||||
ContentType::TEXT
|
||||
}
|
||||
}
|
||||
|
||||
/// Guess MIME type from `path` extension.
|
||||
///
|
||||
/// Returns:
|
||||
/// - `Some(mime)` if the extension is one of the known text or image types.
|
||||
/// - `None` if there is no extension or it’s unrecognized.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn guess_mime_type(&self, path: &Path) -> Option<String> {
|
||||
let ext = path.extension()?.to_str()?.to_ascii_lowercase();
|
||||
let mime = match ext.as_str() {
|
||||
"txt" | "md" | "rst" => "text/plain",
|
||||
"html" | "htm" => "text/html",
|
||||
"css" => "text/css",
|
||||
"js" => "application/javascript",
|
||||
"json" => "application/json",
|
||||
"xml" => "application/xml",
|
||||
"pdf" => "application/pdf",
|
||||
"jpg" | "jpeg" => "image/jpeg",
|
||||
"png" => "image/png",
|
||||
"gif" => "image/gif",
|
||||
_ => return None,
|
||||
};
|
||||
Some(mime.to_string())
|
||||
}
|
||||
|
||||
/// Detect UTF-8 encoding by attempting a lossless conversion.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
|
||||
std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
|
||||
}
|
||||
/// Guess programming language with broad coverage using `tokei`.
|
||||
///
|
||||
/// Strategy (no disk I/O):
|
||||
/// 1) Try extension via `LanguageType::from_file_extension`.
|
||||
/// 2) Handle common extensionless filenames (e.g., Makefile, Dockerfile, CMakeLists.txt).
|
||||
/// 3) Parse an in-memory shebang (first line) against `LanguageType::shebangs`.
|
||||
/// 4) Minimal content markers as a last resort.
|
||||
///
|
||||
/// Returns the canonical `tokei` language name (e.g., `Rust`, `Bash`, `Python`).
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option<String> {
|
||||
// 1) Extension mapping (fast, no I/O).
|
||||
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
||||
if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) {
|
||||
return Some(lang.name().to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure).
|
||||
if let Some(file) = path.file_name().and_then(|f| f.to_str()) {
|
||||
match file {
|
||||
"Makefile" | "makefile" => {
|
||||
return Some(LanguageType::Makefile.name().to_string());
|
||||
}
|
||||
"Dockerfile" | "dockerfile" => {
|
||||
return Some(LanguageType::Dockerfile.name().to_string());
|
||||
}
|
||||
"CMakeLists.txt" => {
|
||||
return Some(LanguageType::CMake.name().to_string());
|
||||
}
|
||||
"Rakefile" | "rakefile" => {
|
||||
return Some(LanguageType::Rakefile.name().to_string());
|
||||
}
|
||||
// Common ecosystem files; map to their primary language where sensible.
|
||||
"Gemfile" | "gemfile" => {
|
||||
return Some(LanguageType::Ruby.name().to_string());
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// 3) Shebang detection (in-memory): match by longest prefix, byte-wise (no UTF-8 needed).
|
||||
if let Some(first_line) = content.split(|&b| b == b'\n').next() {
|
||||
if first_line.starts_with(b"#!") {
|
||||
for (prefix, lang) in SHEBANG_PREFIXES.iter() {
|
||||
if first_line.starts_with(prefix) {
|
||||
return Some(lang.name().to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 4) Lightweight content markers to catch a few ubiquitous cases without I/O.
|
||||
let s = String::from_utf8_lossy(content);
|
||||
if s.contains("<?php") {
|
||||
return Some(LanguageType::Php.name().to_string());
|
||||
}
|
||||
if s.contains("package main") {
|
||||
return Some(LanguageType::Go.name().to_string());
|
||||
}
|
||||
if s.contains("public class") {
|
||||
return Some(LanguageType::Java.name().to_string());
|
||||
}
|
||||
if s.contains("#!/usr/bin/env bash") || s.contains("#!/bin/bash") {
|
||||
return Some(LanguageType::Bash.name().to_string());
|
||||
}
|
||||
if s.contains("#!/usr/bin/env python") {
|
||||
return Some(LanguageType::Python.name().to_string());
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Shorthand: inspect with default thresholds.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn inspect(bytes: &[u8]) -> ContentType {
|
||||
ContentInspector::default().inspect(bytes)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::path::PathBuf;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn binary_vs_text() {
|
||||
let ins = ContentInspector::default();
|
||||
let bin = vec![0, 1, 2, 0, 0, 0, 5];
|
||||
assert_eq!(ins.inspect(&bin), ContentType::BINARY);
|
||||
let txt = b"Hello\nWorld";
|
||||
assert_eq!(ins.inspect(txt), ContentType::TEXT);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn mime_guess() {
|
||||
let ins = ContentInspector::default();
|
||||
assert_eq!(ins.guess_mime_type(&PathBuf::from("a.md")), Some("text/plain".into()));
|
||||
assert_eq!(ins.guess_mime_type(&PathBuf::from("img.png")), Some("image/png".into()));
|
||||
assert_eq!(ins.guess_mime_type(&PathBuf::from("x.xyz")), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn charset_guess() {
|
||||
let ins = ContentInspector::default();
|
||||
assert_eq!(ins.guess_charset("ok".as_bytes()), Some("UTF-8".into()));
|
||||
assert_eq!(ins.guess_charset(&[0xFF, 0xFE, 0xFD]), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn language_guess() {
|
||||
let ins = ContentInspector::default();
|
||||
|
||||
// Compare case-insensitively by lowercasing both sides.
|
||||
let rust =
|
||||
ins.guess_language(&PathBuf::from("main.rs"), b"").map(|s| s.to_ascii_lowercase());
|
||||
assert_eq!(rust, Some("rust".into()));
|
||||
|
||||
let php = ins
|
||||
.guess_language(&PathBuf::from("x"), b"<?php echo; ?>")
|
||||
.map(|s| s.to_ascii_lowercase());
|
||||
assert_eq!(php, Some("php".into()));
|
||||
|
||||
let bash = ins
|
||||
.guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi")
|
||||
.map(|s| s.to_ascii_lowercase());
|
||||
assert_eq!(bash, Some("bash".into()));
|
||||
}
|
||||
}
|
||||
// Re-export from the canonical implementation in kingfisher-core.
|
||||
pub use kingfisher_core::content_type::*;
|
||||
|
|
|
|||
121
src/git_host.rs
Normal file
121
src/git_host.rs
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
//! Shared utilities for Git hosting platform integrations.
|
||||
//!
|
||||
//! This module provides the `ExcludeMatcher` struct and related helpers
|
||||
//! used across all platform integration modules (GitHub, GitLab, Azure,
|
||||
//! Bitbucket, Gitea).
|
||||
|
||||
use std::collections::HashSet;
|
||||
|
||||
use globset::{Glob, GlobSet, GlobSetBuilder};
|
||||
use tracing::warn;
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// ExcludeMatcher
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
/// Matches repository names against exact strings and glob patterns.
|
||||
///
|
||||
/// Used by all platform integrations to filter out excluded repositories.
|
||||
pub struct ExcludeMatcher {
|
||||
exact: HashSet<String>,
|
||||
globs: Option<GlobSet>,
|
||||
}
|
||||
|
||||
impl ExcludeMatcher {
|
||||
/// Returns `true` if no exclusion patterns are configured.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.exact.is_empty() && self.globs.is_none()
|
||||
}
|
||||
|
||||
/// Returns `true` if the given name matches any exclusion pattern.
|
||||
pub fn matches(&self, name: &str) -> bool {
|
||||
if self.exact.contains(name) {
|
||||
return true;
|
||||
}
|
||||
if let Some(globs) = &self.globs {
|
||||
return globs.is_match(name);
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` if the pattern contains glob metacharacters.
///
/// Recognized metacharacters are `*`, `?`, `[` (character classes) and `{`
/// (alternation such as `{a,b}`). Without the `{` check an alternation pattern
/// would be stored as an exact name and never be compiled as a glob.
pub fn looks_like_glob(pattern: &str) -> bool {
    pattern.chars().any(|c| matches!(c, '*' | '?' | '[' | '{'))
}
|
||||
|
||||
/// Builds an `ExcludeMatcher` from a list of exclusion patterns.
|
||||
///
|
||||
/// The `parse_fn` transforms each raw pattern string into a normalized
|
||||
/// repository name (e.g., `"owner/repo"` or `"group/project"`). It returns
|
||||
/// `None` if the pattern is invalid.
|
||||
///
|
||||
/// The `platform_name` is used in warning messages (e.g., `"GitHub"`,
|
||||
/// `"GitLab"`).
|
||||
pub fn build_exclude_matcher(
|
||||
exclude_repos: &[String],
|
||||
parse_fn: impl Fn(&str) -> Option<String>,
|
||||
platform_name: &str,
|
||||
) -> ExcludeMatcher {
|
||||
let mut exact = HashSet::new();
|
||||
let mut glob_builder = GlobSetBuilder::new();
|
||||
let mut has_glob = false;
|
||||
|
||||
for raw in exclude_repos {
|
||||
match parse_fn(raw) {
|
||||
Some(name) => {
|
||||
if looks_like_glob(&name) {
|
||||
match Glob::new(&name) {
|
||||
Ok(glob) => {
|
||||
glob_builder.add(glob);
|
||||
has_glob = true;
|
||||
}
|
||||
Err(err) => {
|
||||
warn!(
|
||||
"Ignoring invalid {platform_name} exclusion pattern '{raw}': {err}"
|
||||
);
|
||||
exact.insert(name);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
exact.insert(name);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
warn!("Ignoring invalid {platform_name} exclusion '{raw}'");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let globs = if has_glob {
|
||||
match glob_builder.build() {
|
||||
Ok(set) => Some(set),
|
||||
Err(err) => {
|
||||
warn!("Failed to build {platform_name} exclusion patterns: {err}");
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
ExcludeMatcher { exact, globs }
|
||||
}
|
||||
|
||||
/// Checks whether a repository URL should be excluded.
|
||||
///
|
||||
/// The `extract_name` function extracts a normalized repository identifier
|
||||
/// from the URL (e.g., `"owner/repo"` from `"https://github.com/owner/repo.git"`).
|
||||
pub fn should_exclude_repo(
|
||||
repo_url: &str,
|
||||
excludes: &ExcludeMatcher,
|
||||
extract_name: impl Fn(&str) -> Option<String>,
|
||||
) -> bool {
|
||||
if excludes.is_empty() {
|
||||
return false;
|
||||
}
|
||||
if let Some(name) = extract_name(repo_url) {
|
||||
return excludes.matches(&name);
|
||||
}
|
||||
false
|
||||
}
|
||||
80
src/gitea.rs
80
src/gitea.rs
|
|
@ -1,14 +1,13 @@
|
|||
use std::{collections::HashSet, env, str::FromStr, time::Duration};
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use globset::{Glob, GlobSet, GlobSetBuilder};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use reqwest::StatusCode;
|
||||
use serde::Deserialize;
|
||||
use tracing::warn;
|
||||
use url::Url;
|
||||
|
||||
use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT};
|
||||
use crate::{git_host, git_url::GitUrl, validation::GLOBAL_USER_AGENT};
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum RepoType {
|
||||
|
|
@ -55,31 +54,6 @@ struct GiteaOrganization {
|
|||
username: String,
|
||||
}
|
||||
|
||||
struct ExcludeMatcher {
|
||||
exact: HashSet<String>,
|
||||
globs: Option<GlobSet>,
|
||||
}
|
||||
|
||||
impl ExcludeMatcher {
|
||||
fn matches(&self, name: &str) -> bool {
|
||||
if self.exact.contains(name) {
|
||||
return true;
|
||||
}
|
||||
if let Some(globs) = &self.globs {
|
||||
return globs.is_match(name);
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.exact.is_empty() && self.globs.is_none()
|
||||
}
|
||||
}
|
||||
|
||||
fn looks_like_glob(pattern: &str) -> bool {
|
||||
pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
|
||||
}
|
||||
|
||||
fn normalize_repo_identifier(raw: &str) -> Option<String> {
|
||||
let trimmed = raw.trim().trim_matches('/');
|
||||
if trimmed.is_empty() {
|
||||
|
|
@ -113,51 +87,11 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
|
|||
normalize_repo_identifier(trimmed)
|
||||
}
|
||||
|
||||
fn build_exclude_matcher(excludes: &[String]) -> ExcludeMatcher {
|
||||
let mut exact = HashSet::new();
|
||||
let mut glob_builder = GlobSetBuilder::new();
|
||||
let mut has_glob = false;
|
||||
|
||||
for raw in excludes {
|
||||
match parse_excluded_repo(raw) {
|
||||
Some(name) => {
|
||||
if looks_like_glob(&name) {
|
||||
match Glob::new(&name) {
|
||||
Ok(glob) => {
|
||||
glob_builder.add(glob);
|
||||
has_glob = true;
|
||||
}
|
||||
Err(err) => {
|
||||
warn!("Ignoring invalid Gitea exclusion pattern '{raw}': {err}");
|
||||
exact.insert(name);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
exact.insert(name);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
warn!("Ignoring invalid Gitea exclusion '{raw}' (expected owner/repo)");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let globs = if has_glob {
|
||||
match glob_builder.build() {
|
||||
Ok(set) => Some(set),
|
||||
Err(err) => {
|
||||
warn!("Failed to build Gitea exclusion patterns: {err}");
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
ExcludeMatcher { exact, globs }
|
||||
fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
|
||||
git_host::build_exclude_matcher(exclude_repos, |raw| parse_excluded_repo(raw), "Gitea")
|
||||
}
|
||||
|
||||
fn should_exclude_repo(repo: &GiteaRepository, excludes: &ExcludeMatcher) -> bool {
|
||||
fn should_exclude_repo(repo: &GiteaRepository, excludes: &git_host::ExcludeMatcher) -> bool {
|
||||
if excludes.is_empty() {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -169,7 +103,7 @@ async fn fetch_paginated_repos(
|
|||
token: Option<&str>,
|
||||
mut url: Url,
|
||||
repo_filter: RepoType,
|
||||
excludes: &ExcludeMatcher,
|
||||
excludes: &git_host::ExcludeMatcher,
|
||||
progress: Option<&ProgressBar>,
|
||||
) -> Result<Vec<String>> {
|
||||
let mut page = 1u32;
|
||||
|
|
@ -221,7 +155,7 @@ async fn fetch_user_repos(
|
|||
api_url: &Url,
|
||||
username: &str,
|
||||
repo_filter: RepoType,
|
||||
excludes: &ExcludeMatcher,
|
||||
excludes: &git_host::ExcludeMatcher,
|
||||
progress: Option<&ProgressBar>,
|
||||
) -> Result<Vec<String>> {
|
||||
let endpoint = format!("users/{}/repos", username);
|
||||
|
|
@ -235,7 +169,7 @@ async fn fetch_org_repos(
|
|||
api_url: &Url,
|
||||
org: &str,
|
||||
repo_filter: RepoType,
|
||||
excludes: &ExcludeMatcher,
|
||||
excludes: &git_host::ExcludeMatcher,
|
||||
progress: Option<&ProgressBar>,
|
||||
) -> Result<Vec<String>> {
|
||||
let endpoint = format!("orgs/{}/repos", org);
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ use std::{
|
|||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use globset::{Glob, GlobSet, GlobSetBuilder};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use octorust::{
|
||||
auth::Credentials,
|
||||
|
|
@ -119,83 +118,14 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
|
|||
parse_repo_name_from_path(trimmed)
|
||||
}
|
||||
|
||||
struct ExcludeMatcher {
|
||||
exact: HashSet<String>,
|
||||
globs: Option<GlobSet>,
|
||||
use crate::git_host;
|
||||
|
||||
fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
|
||||
git_host::build_exclude_matcher(exclude_repos, |raw| parse_excluded_repo(raw), "GitHub")
|
||||
}
|
||||
|
||||
impl ExcludeMatcher {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.exact.is_empty() && self.globs.is_none()
|
||||
}
|
||||
|
||||
fn matches(&self, name: &str) -> bool {
|
||||
if self.exact.contains(name) {
|
||||
return true;
|
||||
}
|
||||
if let Some(globs) = &self.globs {
|
||||
return globs.is_match(name);
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn looks_like_glob(pattern: &str) -> bool {
|
||||
pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
|
||||
}
|
||||
|
||||
fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
|
||||
let mut exact = HashSet::new();
|
||||
let mut glob_builder = GlobSetBuilder::new();
|
||||
let mut has_glob = false;
|
||||
|
||||
for raw in exclude_repos {
|
||||
match parse_excluded_repo(raw) {
|
||||
Some(name) => {
|
||||
if looks_like_glob(&name) {
|
||||
match Glob::new(&name) {
|
||||
Ok(glob) => {
|
||||
glob_builder.add(glob);
|
||||
has_glob = true;
|
||||
}
|
||||
Err(err) => {
|
||||
warn!("Ignoring invalid GitHub exclusion pattern '{raw}': {err}");
|
||||
exact.insert(name);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
exact.insert(name);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
warn!("Ignoring invalid GitHub exclusion '{raw}' (expected owner/repo)");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let globs = if has_glob {
|
||||
match glob_builder.build() {
|
||||
Ok(set) => Some(set),
|
||||
Err(err) => {
|
||||
warn!("Failed to build GitHub exclusion patterns: {err}");
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
ExcludeMatcher { exact, globs }
|
||||
}
|
||||
|
||||
fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
|
||||
if excludes.is_empty() {
|
||||
return false;
|
||||
}
|
||||
if let Some(name) = parse_repo_name_from_url(clone_url) {
|
||||
return excludes.matches(&name);
|
||||
}
|
||||
false
|
||||
/// Checks `clone_url` against `excludes` by delegating to `git_host::should_exclude_repo`,
/// with `parse_repo_name_from_url` extracting the repository name from the URL.
fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
|
||||
git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url)
|
||||
}
|
||||
fn create_github_client(github_url: &url::Url, ignore_certs: bool) -> Result<Arc<Client>> {
|
||||
// Try personal access token
|
||||
|
|
|
|||
|
|
@ -16,7 +16,6 @@ use gitlab::{
|
|||
},
|
||||
Gitlab, GitlabBuilder,
|
||||
};
|
||||
use globset::{Glob, GlobSet, GlobSetBuilder};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use reqwest::StatusCode;
|
||||
use serde::Deserialize;
|
||||
|
|
@ -25,7 +24,7 @@ use tokio::task;
|
|||
use tracing::{info, warn};
|
||||
use url::{form_urlencoded, Url};
|
||||
|
||||
use crate::{findings_store, git_url::GitUrl};
|
||||
use crate::{findings_store, git_host, git_url::GitUrl};
|
||||
use std::str::FromStr;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
|
|
@ -128,83 +127,12 @@ fn parse_excluded_project(raw: &str) -> Option<String> {
|
|||
parse_project_path(trimmed)
|
||||
}
|
||||
|
||||
struct ExcludeMatcher {
|
||||
exact: HashSet<String>,
|
||||
globs: Option<GlobSet>,
|
||||
fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
|
||||
git_host::build_exclude_matcher(exclude_repos, |raw| parse_excluded_project(raw), "GitLab")
|
||||
}
|
||||
|
||||
impl ExcludeMatcher {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.exact.is_empty() && self.globs.is_none()
|
||||
}
|
||||
|
||||
fn matches(&self, name: &str) -> bool {
|
||||
if self.exact.contains(name) {
|
||||
return true;
|
||||
}
|
||||
if let Some(globs) = &self.globs {
|
||||
return globs.is_match(name);
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn looks_like_glob(pattern: &str) -> bool {
|
||||
pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
|
||||
}
|
||||
|
||||
fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
|
||||
let mut exact = HashSet::new();
|
||||
let mut glob_builder = GlobSetBuilder::new();
|
||||
let mut has_glob = false;
|
||||
|
||||
for raw in exclude_repos {
|
||||
match parse_excluded_project(raw) {
|
||||
Some(name) => {
|
||||
if looks_like_glob(&name) {
|
||||
match Glob::new(&name) {
|
||||
Ok(glob) => {
|
||||
glob_builder.add(glob);
|
||||
has_glob = true;
|
||||
}
|
||||
Err(err) => {
|
||||
warn!("Ignoring invalid GitLab exclusion pattern '{raw}': {err}");
|
||||
exact.insert(name);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
exact.insert(name);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
warn!("Ignoring invalid GitLab exclusion '{raw}' (expected group/project)");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let globs = if has_glob {
|
||||
match glob_builder.build() {
|
||||
Ok(set) => Some(set),
|
||||
Err(err) => {
|
||||
warn!("Failed to build GitLab exclusion patterns: {err}");
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
ExcludeMatcher { exact, globs }
|
||||
}
|
||||
|
||||
fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
|
||||
if excludes.is_empty() {
|
||||
return false;
|
||||
}
|
||||
if let Some(name) = parse_project_path_from_url(clone_url) {
|
||||
return excludes.matches(&name);
|
||||
}
|
||||
false
|
||||
/// Checks `clone_url` against `excludes` by delegating to `git_host::should_exclude_repo`,
/// with `parse_project_path_from_url` extracting the project path from the URL.
fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
|
||||
git_host::should_exclude_repo(clone_url, excludes, parse_project_path_from_url)
|
||||
}
|
||||
|
||||
fn create_gitlab_client(gitlab_url: &Url, ignore_certs: bool) -> Result<Gitlab> {
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ pub mod findings_store;
|
|||
pub mod gcs;
|
||||
pub mod git_binary;
|
||||
pub mod git_commit_metadata;
|
||||
pub mod git_host;
|
||||
pub mod git_metadata_graph;
|
||||
mod git_repo_enumerator;
|
||||
pub mod git_url;
|
||||
|
|
|
|||
2
src/matcher/base64_decode.rs
Normal file
2
src/matcher/base64_decode.rs
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
// Re-export from the canonical implementation in kingfisher-scanner.
|
||||
pub use kingfisher_scanner::primitives::{get_base64_strings, DecodedData};
|
||||
140
src/matcher/captures.rs
Normal file
140
src/matcher/captures.rs
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
use bstr::BString;
|
||||
use regex::bytes::Regex;
|
||||
use schemars::{
|
||||
gen::SchemaGenerator,
|
||||
schema::{ArrayValidation, InstanceType, Schema},
|
||||
JsonSchema,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use crate::{snippet::Base64BString, util::intern};
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Group
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
|
||||
pub struct Group(pub Base64BString);
|
||||
impl Group {
|
||||
pub fn new(m: regex::bytes::Match<'_>) -> Self {
|
||||
Self(Base64BString(BString::from(m.as_bytes())))
|
||||
}
|
||||
}
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Groups
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
pub struct Groups(pub SmallVec<[Group; 1]>);
|
||||
impl JsonSchema for Groups {
|
||||
fn schema_name() -> String {
|
||||
"Groups".to_string()
|
||||
}
|
||||
|
||||
fn json_schema(gen: &mut SchemaGenerator) -> Schema {
|
||||
let group_schema = gen.subschema_for::<Group>();
|
||||
Schema::Object(schemars::schema::SchemaObject {
|
||||
instance_type: Some(InstanceType::Array.into()),
|
||||
array: Some(Box::new(ArrayValidation {
|
||||
items: Some(group_schema.into()),
|
||||
..Default::default()
|
||||
})),
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, JsonSchema)]
|
||||
pub struct SerializableCapture {
|
||||
pub name: Option<&'static str>,
|
||||
pub match_number: i32,
|
||||
pub start: usize,
|
||||
pub end: usize,
|
||||
/// Interned original (unredacted) value.
|
||||
#[serde(skip_serializing, skip_deserializing)]
|
||||
pub value: &'static str,
|
||||
}
|
||||
|
||||
impl SerializableCapture {
|
||||
/// Returns the original captured value.
|
||||
pub fn raw_value(&self) -> &'static str {
|
||||
self.value
|
||||
}
|
||||
|
||||
/// Returns the value that should be shown in user-facing output.
|
||||
pub fn display_value(&self) -> std::borrow::Cow<'static, str> {
|
||||
crate::util::display_value(self.value)
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::Serialize for SerializableCapture {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeStruct;
|
||||
|
||||
let mut state = serializer.serialize_struct("SerializableCapture", 5)?;
|
||||
state.serialize_field("name", &self.name)?;
|
||||
state.serialize_field("match_number", &self.match_number)?;
|
||||
state.serialize_field("start", &self.start)?;
|
||||
state.serialize_field("end", &self.end)?;
|
||||
let value = self.display_value();
|
||||
state.serialize_field("value", &value)?;
|
||||
state.end()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, JsonSchema)]
|
||||
pub struct SerializableCaptures {
|
||||
#[schemars(with = "Vec<SerializableCapture>")]
|
||||
pub captures: SmallVec<[SerializableCapture; 2]>,
|
||||
}
|
||||
|
||||
impl SerializableCaptures {
|
||||
pub fn from_captures(captures: ®ex::bytes::Captures, _input: &[u8], re: &Regex) -> Self {
|
||||
let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
|
||||
|
||||
let capture_names: SmallVec<[Option<&'static str>; 4]> =
|
||||
re.capture_names().map(|name| name.map(intern)).collect();
|
||||
|
||||
// If there are explicit capture groups (e.g., group 1, 2, ...),
|
||||
// only serialize those.
|
||||
if captures.len() > 1 {
|
||||
for i in 1..captures.len() {
|
||||
// Start from 1
|
||||
if let Some(cap) = captures.get(i) {
|
||||
let raw_value = String::from_utf8_lossy(cap.as_bytes());
|
||||
let raw_interned = intern(raw_value.as_ref());
|
||||
let name = capture_names.get(i).and_then(|opt| *opt);
|
||||
|
||||
serialized_captures.push(SerializableCapture {
|
||||
name,
|
||||
match_number: i32::try_from(i).unwrap_or(0),
|
||||
start: cap.start(),
|
||||
end: cap.end(),
|
||||
value: raw_interned,
|
||||
});
|
||||
}
|
||||
}
|
||||
} else if captures.len() == 1 {
|
||||
// ELSE, if there is ONLY the full match (len == 1),
|
||||
// serialize just that full match (group 0) as the fallback.
|
||||
if let Some(cap) = captures.get(0) {
|
||||
let raw_value = String::from_utf8_lossy(cap.as_bytes());
|
||||
let raw_interned = intern(raw_value.as_ref());
|
||||
let name = capture_names.get(0).and_then(|opt| *opt);
|
||||
|
||||
serialized_captures.push(SerializableCapture {
|
||||
name,
|
||||
match_number: 0,
|
||||
start: cap.start(),
|
||||
end: cap.end(),
|
||||
value: raw_interned,
|
||||
});
|
||||
}
|
||||
}
|
||||
// If len == 0 (no match), loop is skipped, empty vec is returned.
|
||||
|
||||
SerializableCaptures { captures: serialized_captures }
|
||||
}
|
||||
}
|
||||
251
src/matcher/conversion.rs
Normal file
251
src/matcher/conversion.rs
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use http::StatusCode;
|
||||
use schemars::JsonSchema;
|
||||
use serde::Serialize;
|
||||
use xxhash_rust::xxh3::xxh3_64;
|
||||
|
||||
use crate::{
|
||||
blob::BlobId,
|
||||
location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
|
||||
rules::rule::Rule,
|
||||
validation_body::{self, ValidationResponseBody},
|
||||
};
|
||||
|
||||
use super::{captures::SerializableCaptures, BlobMatch};
|
||||
|
||||
use kingfisher_scanner::primitives::compute_finding_fingerprint;
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// OwnedBlobMatch
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct OwnedBlobMatch {
|
||||
pub rule: Arc<Rule>,
|
||||
pub blob_id: BlobId,
|
||||
/// The unique content-based identifier of this match
|
||||
pub finding_fingerprint: u64,
|
||||
pub matching_input_offset_span: OffsetSpan,
|
||||
pub captures: SerializableCaptures,
|
||||
pub validation_response_body: ValidationResponseBody,
|
||||
pub validation_response_status: StatusCode,
|
||||
pub validation_success: bool,
|
||||
pub calculated_entropy: f32,
|
||||
pub is_base64: bool,
|
||||
/// Variables captured from dependent rules (from depends_on_rule).
|
||||
/// Maps variable name (uppercase) to captured value.
|
||||
pub dependent_captures: std::collections::BTreeMap<String, String>,
|
||||
}
|
||||
|
||||
impl OwnedBlobMatch {
|
||||
pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
|
||||
OwnedBlobMatch {
|
||||
rule,
|
||||
blob_id: m.blob_id,
|
||||
finding_fingerprint: m.finding_fingerprint,
|
||||
// matching_input: m.snippet.matching.0.to_vec(),
|
||||
matching_input_offset_span: m.location.offset_span.clone(),
|
||||
captures: m.groups.clone(),
|
||||
validation_response_body: m.validation_response_body.clone(),
|
||||
validation_response_status: StatusCode::from_u16(m.validation_response_status)
|
||||
.unwrap_or(StatusCode::CONTINUE),
|
||||
validation_success: m.validation_success,
|
||||
calculated_entropy: m.calculated_entropy,
|
||||
is_base64: m.is_base64,
|
||||
dependent_captures: m.dependent_captures.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_blob_match(blob_match: BlobMatch) -> Self {
|
||||
// EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
|
||||
//
|
||||
// This indexing is intentionally different from the internal `validation_dedup_key()`
|
||||
// (which uses get(0)) to maintain stable external fingerprints. Changing this would break:
|
||||
// - Historical baselines that rely on fingerprint matching
|
||||
// - Dedup entries stored in external systems
|
||||
//
|
||||
// For rules with nested captures like (?<REGEX>...(ABC)...), this may pick up
|
||||
// the inner group, but that behavior is now established and must be preserved.
|
||||
let matching_finding = blob_match
|
||||
.captures
|
||||
.captures
|
||||
.get(1)
|
||||
.or_else(|| blob_match.captures.captures.get(0))
|
||||
.map(|capture| capture.raw_value().as_bytes().to_vec())
|
||||
.unwrap_or_else(Vec::new);
|
||||
|
||||
let mut owned_blob_match = OwnedBlobMatch {
|
||||
rule: blob_match.rule,
|
||||
blob_id: blob_match.blob_id.clone(),
|
||||
matching_input_offset_span: blob_match.matching_input_offset_span,
|
||||
captures: blob_match.captures.clone(),
|
||||
validation_response_body: blob_match.validation_response_body,
|
||||
validation_response_status: blob_match.validation_response_status,
|
||||
validation_success: blob_match.validation_success,
|
||||
calculated_entropy: blob_match.calculated_entropy,
|
||||
finding_fingerprint: 0, //default
|
||||
is_base64: blob_match.is_base64,
|
||||
dependent_captures: std::collections::BTreeMap::new(),
|
||||
};
|
||||
|
||||
// Convert matching_finding to a &str (using lossy conversion if needed)
|
||||
let finding_value = std::str::from_utf8(&matching_finding).unwrap_or("");
|
||||
// Use blob_id as the file/commit identifier
|
||||
let file_or_commit = &blob_match.blob_id.to_string();
|
||||
|
||||
let offset_start: u64 =
|
||||
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
|
||||
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
|
||||
|
||||
owned_blob_match.finding_fingerprint =
|
||||
compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end);
|
||||
|
||||
owned_blob_match
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Match
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
#[derive(Debug, Clone, Serialize, JsonSchema)]
|
||||
pub struct Match {
|
||||
/// The location of the entire matching content
|
||||
pub location: Location,
|
||||
|
||||
/// The capture groups
|
||||
pub groups: SerializableCaptures, // Store serialized captures
|
||||
|
||||
/// unique identifier of file / blob where this match was found
|
||||
pub blob_id: BlobId,
|
||||
|
||||
/// The unique content-based identifier of this match
|
||||
pub finding_fingerprint: u64,
|
||||
|
||||
/// The rule that produced this match
|
||||
#[serde(skip_serializing)]
|
||||
#[schemars(skip)]
|
||||
pub rule: Arc<Rule>,
|
||||
|
||||
/// Validation Body
|
||||
#[serde(
|
||||
default,
|
||||
serialize_with = "validation_body::serialize",
|
||||
deserialize_with = "validation_body::deserialize"
|
||||
)]
|
||||
#[schemars(schema_with = "validation_body::schema")]
|
||||
pub validation_response_body: ValidationResponseBody,
|
||||
|
||||
/// Validation Status Code
|
||||
pub validation_response_status: u16,
|
||||
|
||||
/// Validation Success
|
||||
pub validation_success: bool,
|
||||
|
||||
/// Validation Success
|
||||
pub calculated_entropy: f32,
|
||||
|
||||
pub visible: bool,
|
||||
#[serde(default)]
|
||||
pub is_base64: bool,
|
||||
|
||||
/// Variables captured from dependent rules (from depends_on_rule).
|
||||
/// Maps variable name (uppercase) to captured value.
|
||||
#[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
|
||||
pub dependent_captures: std::collections::BTreeMap<String, String>,
|
||||
}
|
||||
|
||||
impl Match {
|
||||
#[inline]
|
||||
pub fn convert_owned_blobmatch_to_match<'a>(
|
||||
loc_mapping: Option<&'a LocationMapping<'a>>,
|
||||
owned_blob_match: &'a OwnedBlobMatch,
|
||||
origin_type: &'a str,
|
||||
) -> Self {
|
||||
let offset_span = owned_blob_match.matching_input_offset_span;
|
||||
// EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
|
||||
// See comment in from_blob_match() for why this differs from validation_dedup_key().
|
||||
let matching_finding_bytes = owned_blob_match
|
||||
.captures
|
||||
.captures
|
||||
.get(1)
|
||||
.or_else(|| owned_blob_match.captures.captures.get(0))
|
||||
.map(|capture| capture.raw_value().as_bytes())
|
||||
.unwrap_or_default();
|
||||
|
||||
// The fingerprint will be based on the content of the secret.
|
||||
let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
|
||||
|
||||
let source_span =
|
||||
loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
|
||||
start: SourcePoint { line: 0, column: 0 },
|
||||
end: SourcePoint { line: 0, column: 0 },
|
||||
});
|
||||
let offset_start: u64 =
|
||||
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
|
||||
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
|
||||
|
||||
let finding_fingerprint = compute_finding_fingerprint(
|
||||
finding_value_for_fp,
|
||||
origin_type, // file_or_commit,
|
||||
offset_start,
|
||||
offset_end,
|
||||
);
|
||||
|
||||
// matching_snippet
|
||||
Match {
|
||||
rule: owned_blob_match.rule.clone(),
|
||||
visible: owned_blob_match.rule.visible().to_owned(),
|
||||
location: Location::with_source_span(offset_span, Some(source_span.clone())),
|
||||
groups: owned_blob_match.captures.clone(),
|
||||
blob_id: owned_blob_match.blob_id,
|
||||
finding_fingerprint,
|
||||
validation_response_body: owned_blob_match.validation_response_body.clone(),
|
||||
validation_response_status: owned_blob_match.validation_response_status.as_u16(),
|
||||
validation_success: owned_blob_match.validation_success,
|
||||
calculated_entropy: owned_blob_match.calculated_entropy,
|
||||
is_base64: owned_blob_match.is_base64,
|
||||
dependent_captures: owned_blob_match.dependent_captures.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the `blob_id` of the match.
|
||||
pub fn get_blob_id(&self) -> BlobId {
|
||||
self.blob_id.clone()
|
||||
}
|
||||
|
||||
pub fn finding_id(&self) -> String {
|
||||
let mut buffer = Vec::with_capacity(128);
|
||||
buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes());
|
||||
buffer.push(0);
|
||||
serde_json::to_writer(&mut buffer, &self.groups)
|
||||
.expect("should be able to serialize groups as JSON");
|
||||
let mut num = xxh3_64(&buffer);
|
||||
// Ensure the number is positive and within i64 range
|
||||
num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive
|
||||
// Convert to string
|
||||
num.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// MatcherStats
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
/// Running totals of blobs and bytes, split into "seen" and "scanned" counters.
#[derive(Debug, Default, Clone)]
pub struct MatcherStats {
    pub blobs_seen: u64,
    pub blobs_scanned: u64,
    pub bytes_seen: u64,
    pub bytes_scanned: u64,
}

impl MatcherStats {
    /// Adds every counter of `other` onto the matching counter of `self`.
    pub fn update(&mut self, other: &Self) {
        // Destructure so a newly added field cannot be silently left out of the merge.
        let Self { blobs_seen, blobs_scanned, bytes_seen, bytes_scanned } = other;

        self.blobs_seen += *blobs_seen;
        self.blobs_scanned += *blobs_scanned;
        self.bytes_seen += *bytes_seen;
        self.bytes_scanned += *bytes_scanned;
    }
}
|
||||
2
src/matcher/dedup.rs
Normal file
2
src/matcher/dedup.rs
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
// Re-export from the canonical implementation in kingfisher-scanner.
|
||||
pub(crate) use kingfisher_scanner::primitives::{compute_match_key, record_match};
|
||||
274
src/matcher/filter.rs
Normal file
274
src/matcher/filter.rs
Normal file
|
|
@ -0,0 +1,274 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use http::StatusCode;
|
||||
use regex::bytes::Regex;
|
||||
use rustc_hash::{FxHashMap, FxHashSet};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{
|
||||
blob::Blob,
|
||||
entropy::calculate_shannon_entropy,
|
||||
inline_ignore::InlineIgnoreConfig,
|
||||
location::OffsetSpan,
|
||||
origin::OriginSet,
|
||||
rule_profiling::{ConcurrentRuleProfiler, RuleTimer},
|
||||
rules::rule::{PatternRequirementContext, PatternValidationResult, Rule, Validation},
|
||||
safe_list::{is_safe_match, is_user_match},
|
||||
validation::{is_parseable_mongodb_uri, is_parseable_mysql_uri, is_parseable_postgres_uri},
|
||||
};
|
||||
|
||||
use super::{
|
||||
captures::SerializableCaptures,
|
||||
dedup::{compute_match_key, record_match},
|
||||
BlobMatch,
|
||||
};
|
||||
|
||||
// Re-use the canonical secret capture selection from kingfisher-scanner.
|
||||
use kingfisher_scanner::primitives::find_secret_capture;
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Entropy and safe-list check
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
/// Returns `Some(entropy)` if the match passes entropy and safe-list checks,
|
||||
/// `None` if it should be skipped.
|
||||
fn check_entropy_and_safelist(
|
||||
entropy_bytes: &[u8],
|
||||
full_bytes: &[u8],
|
||||
min_entropy: f32,
|
||||
) -> Option<f32> {
|
||||
let calculated_entropy = calculate_shannon_entropy(entropy_bytes);
|
||||
if calculated_entropy <= min_entropy
|
||||
|| is_safe_match(entropy_bytes)
|
||||
|| is_user_match(entropy_bytes, full_bytes)
|
||||
{
|
||||
debug!(
|
||||
"Skipping match with entropy {} <= {} or safe match",
|
||||
calculated_entropy, min_entropy
|
||||
);
|
||||
None
|
||||
} else {
|
||||
Some(calculated_entropy)
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Pattern requirements check
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
/// Returns `true` if the match passes pattern requirements, `false` if it should be skipped.
|
||||
fn check_pattern_requirements(
|
||||
rule: &Rule,
|
||||
re: &Regex,
|
||||
captures: ®ex::bytes::Captures,
|
||||
full_bytes: &[u8],
|
||||
entropy_bytes: &[u8],
|
||||
respect_ignore_if_contains: bool,
|
||||
) -> bool {
|
||||
let Some(char_reqs) = rule.pattern_requirements() else {
|
||||
return true;
|
||||
};
|
||||
|
||||
let context = PatternRequirementContext { regex: re, captures, full_match: full_bytes };
|
||||
|
||||
// Decide which bytes to validate:
|
||||
// - If there are multiple capture groups OR any named captures -> use full match
|
||||
// - Otherwise -> use entropy_bytes (the actual secret)
|
||||
let use_full_match = {
|
||||
let has_named_captures = re.capture_names().any(|n| n.is_some());
|
||||
let capture_count = captures.len(); // includes group 0
|
||||
has_named_captures || capture_count > 2
|
||||
};
|
||||
|
||||
let validation_bytes = if use_full_match { full_bytes } else { entropy_bytes };
|
||||
|
||||
match char_reqs.validate(validation_bytes, Some(context), respect_ignore_if_contains) {
|
||||
PatternValidationResult::Passed => true,
|
||||
PatternValidationResult::Failed => {
|
||||
debug!(
|
||||
"Skipping match that does not meet character requirements for rule {}",
|
||||
rule.id()
|
||||
);
|
||||
false
|
||||
}
|
||||
PatternValidationResult::FailedChecksum { actual_len, expected_len } => {
|
||||
debug!(
|
||||
"Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})",
|
||||
rule.id(),
|
||||
actual_len,
|
||||
expected_len
|
||||
);
|
||||
false
|
||||
}
|
||||
PatternValidationResult::IgnoredBySubstring { matched_term } => {
|
||||
debug!(
|
||||
"Skipping match for rule {} because it contains ignored term {matched_term}",
|
||||
rule.id()
|
||||
);
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// URI validation
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
/// Returns `true` if the match passes URI validation (for database rules), `false` if it should
|
||||
/// be skipped.
|
||||
fn check_uri_validation(rule: &Rule, matching_input_bytes: &[u8]) -> bool {
|
||||
let Some(validation) = rule.syntax.validation.as_ref() else {
|
||||
return true;
|
||||
};
|
||||
|
||||
match validation {
|
||||
Validation::MongoDB => {
|
||||
let Ok(uri) = std::str::from_utf8(matching_input_bytes) else {
|
||||
debug!("Skipping match for rule {} due to non-UTF8 MongoDB URI", rule.id());
|
||||
return false;
|
||||
};
|
||||
if !is_parseable_mongodb_uri(uri) {
|
||||
debug!("Skipping match for rule {} due to invalid MongoDB URI", rule.id());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
Validation::Postgres => {
|
||||
let Ok(uri) = std::str::from_utf8(matching_input_bytes) else {
|
||||
debug!("Skipping match for rule {} due to non-UTF8 Postgres URI", rule.id());
|
||||
return false;
|
||||
};
|
||||
if !is_parseable_postgres_uri(uri) {
|
||||
debug!("Skipping match for rule {} due to invalid Postgres URI", rule.id());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
Validation::MySQL => {
|
||||
let Ok(uri) = std::str::from_utf8(matching_input_bytes) else {
|
||||
debug!("Skipping match for rule {} due to non-UTF8 MySQL URI", rule.id());
|
||||
return false;
|
||||
};
|
||||
if !is_parseable_mysql_uri(uri) {
|
||||
debug!("Skipping match for rule {} due to invalid MySQL URI", rule.id());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// filter_match — main entry point
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn filter_match<'b>(
|
||||
blob: &'b Blob,
|
||||
rule: Arc<Rule>,
|
||||
re: &Regex,
|
||||
start: usize,
|
||||
end: usize,
|
||||
matches: &mut Vec<BlobMatch<'b>>,
|
||||
previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
|
||||
rule_id: usize,
|
||||
seen_matches: &mut FxHashSet<u64>,
|
||||
_origin: &OriginSet,
|
||||
ts_match: Option<&[u8]>,
|
||||
is_base64: bool,
|
||||
_redact: bool,
|
||||
filename: &str,
|
||||
profiler: Option<&Arc<ConcurrentRuleProfiler>>,
|
||||
respect_ignore_if_contains: bool,
|
||||
inline_ignore_config: &InlineIgnoreConfig,
|
||||
) {
|
||||
let mut timer =
|
||||
profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
|
||||
|
||||
let initial_len = matches.len();
|
||||
|
||||
let blob_bytes = blob.bytes();
|
||||
let default_slice = &blob_bytes[start..end];
|
||||
let haystack = ts_match.unwrap_or(default_slice);
|
||||
|
||||
for captures in re.captures_iter(haystack) {
|
||||
let full_capture = captures.get(0).unwrap();
|
||||
let matching_input_for_entropy = find_secret_capture(re, &captures);
|
||||
|
||||
let min_entropy = rule.min_entropy();
|
||||
let entropy_bytes = matching_input_for_entropy.as_bytes();
|
||||
let full_bytes = full_capture.as_bytes();
|
||||
|
||||
// Check entropy and safe-listing
|
||||
let calculated_entropy =
|
||||
match check_entropy_and_safelist(entropy_bytes, full_bytes, min_entropy) {
|
||||
Some(e) => e,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// Check pattern requirements
|
||||
if !check_pattern_requirements(
|
||||
&rule,
|
||||
re,
|
||||
&captures,
|
||||
full_bytes,
|
||||
entropy_bytes,
|
||||
respect_ignore_if_contains,
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Use the `matching_input_for_entropy` as the span/key for the finding.
|
||||
let matching_input = matching_input_for_entropy;
|
||||
|
||||
let matching_input_offset_span = OffsetSpan::from_range(
|
||||
(start + matching_input.start())..(start + matching_input.end()),
|
||||
);
|
||||
|
||||
// Check inline ignore directives
|
||||
if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
|
||||
debug!("Skipping match due to inline ignore directive");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check URI validation (MongoDB, Postgres, MySQL)
|
||||
if !check_uri_validation(&rule, matching_input.as_bytes()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Deduplication
|
||||
let match_key = compute_match_key(
|
||||
matching_input.as_bytes(),
|
||||
rule.id().as_bytes(),
|
||||
matching_input_offset_span.start,
|
||||
matching_input_offset_span.end,
|
||||
);
|
||||
if !seen_matches.insert(match_key) {
|
||||
continue;
|
||||
}
|
||||
if !record_match(previous_matches, rule_id, matching_input_offset_span) {
|
||||
continue;
|
||||
}
|
||||
let only_matching_input =
|
||||
&blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
|
||||
|
||||
// Pass the *full* capture object to from_captures
|
||||
let groups = SerializableCaptures::from_captures(&captures, haystack, re);
|
||||
|
||||
matches.push(BlobMatch {
|
||||
rule: Arc::clone(&rule),
|
||||
blob_id: blob.id_ref(),
|
||||
matching_input: only_matching_input,
|
||||
matching_input_offset_span,
|
||||
captures: groups,
|
||||
validation_response_body: None,
|
||||
validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
|
||||
validation_success: false,
|
||||
calculated_entropy,
|
||||
is_base64,
|
||||
});
|
||||
}
|
||||
if let Some(t) = timer.take() {
|
||||
let new_count = (matches.len() - initial_len) as u64;
|
||||
t.end(new_count > 0, new_count, 0);
|
||||
}
|
||||
}
|
||||
2
src/matcher/fingerprint.rs
Normal file
2
src/matcher/fingerprint.rs
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
// Re-export from the canonical implementation in kingfisher-scanner.
|
||||
pub use kingfisher_scanner::primitives::compute_finding_fingerprint;
|
||||
|
|
@ -1,42 +1,38 @@
|
|||
use std::{
|
||||
hash::{Hash, Hasher},
|
||||
str,
|
||||
sync::{Arc, Mutex},
|
||||
};
|
||||
mod base64_decode;
|
||||
mod captures;
|
||||
mod conversion;
|
||||
mod dedup;
|
||||
mod filter;
|
||||
mod fingerprint;
|
||||
|
||||
// Re-export public API
|
||||
pub use base64_decode::{get_base64_strings, DecodedData};
|
||||
pub use captures::{Group, Groups, SerializableCapture, SerializableCaptures};
|
||||
pub use conversion::{Match, MatcherStats, OwnedBlobMatch};
|
||||
pub use fingerprint::compute_finding_fingerprint;
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use anyhow::Result;
|
||||
use base64::{engine::general_purpose, Engine};
|
||||
use bstr::BString;
|
||||
use http::StatusCode;
|
||||
use regex::bytes::Regex;
|
||||
use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
|
||||
use schemars::{
|
||||
gen::SchemaGenerator,
|
||||
schema::{ArrayValidation, InstanceType, Schema},
|
||||
JsonSchema,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use smallvec::SmallVec;
|
||||
use tracing::debug;
|
||||
use xxhash_rust::xxh3::xxh3_64;
|
||||
use rustc_hash::{FxHashMap, FxHashSet};
|
||||
|
||||
use crate::{
|
||||
blob::{Blob, BlobId, BlobIdMap},
|
||||
entropy::calculate_shannon_entropy,
|
||||
inline_ignore::InlineIgnoreConfig,
|
||||
location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
|
||||
location::OffsetSpan,
|
||||
origin::OriginSet,
|
||||
parser,
|
||||
parser::{Checker, Language},
|
||||
rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
|
||||
rules::rule::{PatternRequirementContext, PatternValidationResult, Rule, Validation},
|
||||
rule_profiling::{ConcurrentRuleProfiler, RuleStats},
|
||||
rules::rule::Rule,
|
||||
rules_database::RulesDatabase,
|
||||
safe_list::{is_safe_match, is_user_match},
|
||||
scanner_pool::ScannerPool,
|
||||
snippet::Base64BString,
|
||||
util::intern,
|
||||
validation::{is_parseable_mongodb_uri, is_parseable_mysql_uri, is_parseable_postgres_uri},
|
||||
validation_body::{self, ValidationResponseBody},
|
||||
validation_body::ValidationResponseBody,
|
||||
};
|
||||
|
||||
use self::{
|
||||
base64_decode::get_base64_strings as get_b64_strings, dedup::record_match, filter::filter_match,
|
||||
};
|
||||
|
||||
const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
|
||||
|
|
@ -58,94 +54,7 @@ struct RawMatch {
|
|||
start_idx: u64,
|
||||
end_idx: u64,
|
||||
}
|
||||
#[derive(Clone)]
|
||||
pub struct OwnedBlobMatch {
|
||||
pub rule: Arc<Rule>,
|
||||
pub blob_id: BlobId,
|
||||
/// The unique content-based identifier of this match
|
||||
pub finding_fingerprint: u64,
|
||||
pub matching_input_offset_span: OffsetSpan,
|
||||
pub captures: SerializableCaptures,
|
||||
pub validation_response_body: ValidationResponseBody,
|
||||
pub validation_response_status: StatusCode,
|
||||
pub validation_success: bool,
|
||||
pub calculated_entropy: f32,
|
||||
pub is_base64: bool,
|
||||
/// Variables captured from dependent rules (from depends_on_rule).
|
||||
/// Maps variable name (uppercase) to captured value.
|
||||
pub dependent_captures: std::collections::BTreeMap<String, String>,
|
||||
}
|
||||
impl<'a> Matcher<'a> {
|
||||
pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
|
||||
self.profiler.as_ref().map(|p| p.generate_report())
|
||||
}
|
||||
}
|
||||
impl OwnedBlobMatch {
|
||||
pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
|
||||
OwnedBlobMatch {
|
||||
rule,
|
||||
blob_id: m.blob_id,
|
||||
finding_fingerprint: m.finding_fingerprint,
|
||||
// matching_input: m.snippet.matching.0.to_vec(),
|
||||
matching_input_offset_span: m.location.offset_span.clone(),
|
||||
captures: m.groups.clone(),
|
||||
validation_response_body: m.validation_response_body.clone(),
|
||||
validation_response_status: StatusCode::from_u16(m.validation_response_status)
|
||||
.unwrap_or(StatusCode::CONTINUE),
|
||||
validation_success: m.validation_success,
|
||||
calculated_entropy: m.calculated_entropy,
|
||||
is_base64: m.is_base64,
|
||||
dependent_captures: m.dependent_captures.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_blob_match(blob_match: BlobMatch) -> Self {
|
||||
// EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
|
||||
//
|
||||
// This indexing is intentionally different from the internal `validation_dedup_key()`
|
||||
// (which uses get(0)) to maintain stable external fingerprints. Changing this would break:
|
||||
// - Historical baselines that rely on fingerprint matching
|
||||
// - Dedup entries stored in external systems
|
||||
//
|
||||
// For rules with nested captures like (?<REGEX>...(ABC)...), this may pick up
|
||||
// the inner group, but that behavior is now established and must be preserved.
|
||||
let matching_finding = blob_match
|
||||
.captures
|
||||
.captures
|
||||
.get(1)
|
||||
.or_else(|| blob_match.captures.captures.get(0))
|
||||
.map(|capture| capture.raw_value().as_bytes().to_vec())
|
||||
.unwrap_or_else(Vec::new);
|
||||
|
||||
let mut owned_blob_match = OwnedBlobMatch {
|
||||
rule: blob_match.rule,
|
||||
blob_id: blob_match.blob_id.clone(),
|
||||
matching_input_offset_span: blob_match.matching_input_offset_span,
|
||||
captures: blob_match.captures.clone(),
|
||||
validation_response_body: blob_match.validation_response_body,
|
||||
validation_response_status: blob_match.validation_response_status,
|
||||
validation_success: blob_match.validation_success,
|
||||
calculated_entropy: blob_match.calculated_entropy,
|
||||
finding_fingerprint: 0, //default
|
||||
is_base64: blob_match.is_base64,
|
||||
dependent_captures: std::collections::BTreeMap::new(),
|
||||
};
|
||||
|
||||
// Convert matching_finding to a &str (using lossy conversion if needed)
|
||||
let finding_value = std::str::from_utf8(&matching_finding).unwrap_or("");
|
||||
// Use blob_id as the file/commit identifier
|
||||
let file_or_commit = &blob_match.blob_id.to_string();
|
||||
|
||||
let offset_start: u64 =
|
||||
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
|
||||
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
|
||||
|
||||
owned_blob_match.finding_fingerprint =
|
||||
compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end);
|
||||
|
||||
owned_blob_match
|
||||
}
|
||||
}
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// BlobMatch
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
|
@ -156,7 +65,7 @@ impl OwnedBlobMatch {
|
|||
/// `Match`.
|
||||
pub struct BlobMatch<'a> {
|
||||
/// The rule that was matched
|
||||
pub rule: Arc<Rule>, // Changed from `&'a Rule` to `Arc<Rule
|
||||
pub rule: Arc<Rule>,
|
||||
|
||||
/// The blob that was matched
|
||||
pub blob_id: &'a BlobId,
|
||||
|
|
@ -168,7 +77,7 @@ pub struct BlobMatch<'a> {
|
|||
pub matching_input_offset_span: OffsetSpan,
|
||||
|
||||
/// The capture groups from the match
|
||||
pub captures: SerializableCaptures, // regex::bytes::Captures<'a>,
|
||||
pub captures: SerializableCaptures,
|
||||
|
||||
pub validation_response_body: ValidationResponseBody,
|
||||
pub validation_response_status: StatusCode,
|
||||
|
|
@ -177,6 +86,7 @@ pub struct BlobMatch<'a> {
|
|||
pub calculated_entropy: f32,
|
||||
pub is_base64: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct UserData {
|
||||
/// A scratch vector for raw matches from Vectorscan, to minimize allocation
|
||||
|
|
@ -185,6 +95,7 @@ struct UserData {
|
|||
/// The length of the input being scanned
|
||||
input_len: u64,
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Matcher
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
|
|
@ -222,6 +133,7 @@ pub struct Matcher<'a> {
|
|||
/// Whether matches should honour `ignore_if_contains` requirements.
|
||||
respect_ignore_if_contains: bool,
|
||||
}
|
||||
|
||||
/// This `Drop` implementation updates the `global_stats` with the local stats
|
||||
impl<'a> Drop for Matcher<'a> {
|
||||
fn drop(&mut self) {
|
||||
|
|
@ -231,11 +143,19 @@ impl<'a> Drop for Matcher<'a> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum ScanResult<'a> {
|
||||
SeenWithMatches,
|
||||
SeenSansMatches,
|
||||
New(Vec<BlobMatch<'a>>),
|
||||
}
|
||||
|
||||
impl<'a> Matcher<'a> {
|
||||
pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
|
||||
self.profiler.as_ref().map(|p| p.generate_report())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Matcher<'a> {
|
||||
/// Create a new `Matcher` from the given `RulesDatabase`.
|
||||
///
|
||||
|
|
@ -255,8 +175,6 @@ impl<'a> Matcher<'a> {
|
|||
// Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec
|
||||
let raw_matches_scratch = Vec::new();
|
||||
let user_data = UserData { raw_matches_scratch, input_len: 0 };
|
||||
// let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?;
|
||||
// pool is created once per scan run (see Scanner section below)
|
||||
let profiler = shared_profiler.or_else(|| {
|
||||
if enable_profiling {
|
||||
Some(Arc::new(ConcurrentRuleProfiler::new()))
|
||||
|
|
@ -349,7 +267,7 @@ impl<'a> Matcher<'a> {
|
|||
let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
|
||||
Vec::new()
|
||||
} else {
|
||||
get_base64_strings(blob.bytes())
|
||||
get_b64_strings(blob.bytes())
|
||||
};
|
||||
|
||||
let lang_hint = lang.as_deref();
|
||||
|
|
@ -530,7 +448,7 @@ impl<'a> Matcher<'a> {
|
|||
);
|
||||
}
|
||||
if depth + 1 < MAX_B64_DEPTH {
|
||||
for nested in get_base64_strings(item.decoded.as_slice()) {
|
||||
for nested in get_b64_strings(item.decoded.as_slice()) {
|
||||
b64_stack.push((
|
||||
DecodedData {
|
||||
decoded: nested.decoded,
|
||||
|
|
@ -559,268 +477,12 @@ impl<'a> Matcher<'a> {
|
|||
if self.user_data.raw_matches_scratch.capacity()
|
||||
> self.user_data.raw_matches_scratch.len() * 4
|
||||
{
|
||||
// Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob
|
||||
// Vec::shrink_to_fit may re-allocate, but we're about to leave scan_blob
|
||||
// so the cost is hidden off the hot path.
|
||||
self.user_data.raw_matches_scratch.shrink_to_fit();
|
||||
}
|
||||
|
||||
Ok(ScanResult::New(matches))
|
||||
// Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
|
||||
let mut hasher = FxHasher::default();
|
||||
// Hash each component directly without allocation
|
||||
content.hash(&mut hasher);
|
||||
rule_id.hash(&mut hasher);
|
||||
start.hash(&mut hasher);
|
||||
end.hash(&mut hasher);
|
||||
hasher.finish()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
|
||||
let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
|
||||
if idx > 0 {
|
||||
if spans[idx - 1].fully_contains(&span) {
|
||||
return false;
|
||||
}
|
||||
if span.fully_contains(&spans[idx - 1]) {
|
||||
spans.remove(idx - 1);
|
||||
idx -= 1;
|
||||
}
|
||||
}
|
||||
if idx < spans.len() {
|
||||
if spans[idx].fully_contains(&span) {
|
||||
return false;
|
||||
}
|
||||
if span.fully_contains(&spans[idx]) {
|
||||
spans.remove(idx);
|
||||
}
|
||||
}
|
||||
spans.insert(idx, span);
|
||||
true
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_match(
|
||||
map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
|
||||
rule_id: usize,
|
||||
span: OffsetSpan,
|
||||
) -> bool {
|
||||
insert_span(map.entry(rule_id).or_default(), span)
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn filter_match<'b>(
|
||||
blob: &'b Blob,
|
||||
rule: Arc<Rule>,
|
||||
re: &Regex,
|
||||
start: usize,
|
||||
end: usize,
|
||||
matches: &mut Vec<BlobMatch<'b>>,
|
||||
previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
|
||||
rule_id: usize,
|
||||
seen_matches: &mut FxHashSet<u64>,
|
||||
_origin: &OriginSet,
|
||||
ts_match: Option<&[u8]>,
|
||||
is_base64: bool,
|
||||
_redact: bool,
|
||||
filename: &str,
|
||||
profiler: Option<&Arc<ConcurrentRuleProfiler>>,
|
||||
respect_ignore_if_contains: bool,
|
||||
inline_ignore_config: &InlineIgnoreConfig,
|
||||
) {
|
||||
let mut timer =
|
||||
profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
|
||||
|
||||
let initial_len = matches.len();
|
||||
|
||||
let blob_bytes = blob.bytes();
|
||||
let default_slice = &blob_bytes[start..end];
|
||||
let haystack = ts_match.unwrap_or(default_slice);
|
||||
|
||||
for captures in re.captures_iter(haystack) {
|
||||
let full_capture = captures.get(0).unwrap();
|
||||
|
||||
// --- LOGIC TO FIND THE "SECRET" FOR ENTROPY/SAFE-LISTING ---
|
||||
let matching_input_for_entropy = 'block: {
|
||||
// 1. Prefer a named capture called TOKEN (case-insensitive).
|
||||
if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
|
||||
name_opt
|
||||
.filter(|name| name.eq_ignore_ascii_case("TOKEN"))
|
||||
.and_then(|_| captures.get(i))
|
||||
}) {
|
||||
break 'block token_cap;
|
||||
}
|
||||
|
||||
// 2. Otherwise, prefer the first *matched* named capture.
|
||||
if let Some(named_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
|
||||
name_opt.and_then(|_| captures.get(i)) // find(i > 0 && name_opt.is_some())
|
||||
}) {
|
||||
break 'block named_cap;
|
||||
}
|
||||
|
||||
// 3. Otherwise, fall back to the first positional capture (group 1).
|
||||
if let Some(pos_cap) = captures.get(1) {
|
||||
break 'block pos_cap;
|
||||
}
|
||||
|
||||
// 4. Finally, fall back to the full match (group 0).
|
||||
break 'block full_capture;
|
||||
};
|
||||
// --- END LOGIC ---
|
||||
|
||||
let min_entropy = rule.min_entropy();
|
||||
let entropy_bytes = matching_input_for_entropy.as_bytes();
|
||||
let full_bytes = full_capture.as_bytes();
|
||||
let calculated_entropy = calculate_shannon_entropy(entropy_bytes);
|
||||
|
||||
// Check entropy and safe-listing against the *selected* secret bytes
|
||||
if calculated_entropy <= min_entropy
|
||||
|| is_safe_match(entropy_bytes)
|
||||
|| is_user_match(entropy_bytes, full_bytes)
|
||||
{
|
||||
debug!(
|
||||
"Skipping match with entropy {} <= {} or safe match",
|
||||
calculated_entropy, min_entropy
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check character requirements if specified
|
||||
if let Some(char_reqs) = rule.pattern_requirements() {
|
||||
let context = PatternRequirementContext {
|
||||
regex: re,
|
||||
captures: &captures,
|
||||
full_match: full_bytes,
|
||||
};
|
||||
|
||||
// Decide which bytes to validate:
|
||||
// - If there are multiple capture groups OR any named captures → use full match
|
||||
// - Otherwise → use entropy_bytes (the actual secret)
|
||||
let use_full_match = {
|
||||
let has_named_captures = re.capture_names().any(|n| n.is_some());
|
||||
let capture_count = captures.len(); // includes group 0
|
||||
has_named_captures || capture_count > 2
|
||||
};
|
||||
|
||||
let validation_bytes = if use_full_match { full_bytes } else { entropy_bytes };
|
||||
|
||||
match char_reqs.validate(validation_bytes, Some(context), respect_ignore_if_contains) {
|
||||
//
|
||||
// --- END FIX ---
|
||||
PatternValidationResult::Passed => {}
|
||||
PatternValidationResult::Failed => {
|
||||
debug!(
|
||||
"Skipping match that does not meet character requirements for rule {}",
|
||||
rule.id()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
PatternValidationResult::FailedChecksum { actual_len, expected_len } => {
|
||||
debug!(
|
||||
"Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})",
|
||||
rule.id(),
|
||||
actual_len,
|
||||
expected_len
|
||||
);
|
||||
continue;
|
||||
}
|
||||
PatternValidationResult::IgnoredBySubstring { matched_term } => {
|
||||
debug!(
|
||||
"Skipping match for rule {} because it contains ignored term {matched_term}",
|
||||
rule.id()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Use the `matching_input_for_entropy` as the span/key for the finding.
|
||||
let matching_input = matching_input_for_entropy;
|
||||
|
||||
let matching_input_offset_span = OffsetSpan::from_range(
|
||||
(start + matching_input.start())..(start + matching_input.end()),
|
||||
);
|
||||
if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
|
||||
debug!("Skipping match due to inline ignore directive");
|
||||
continue;
|
||||
}
|
||||
if let Some(validation) = rule.syntax.validation.as_ref() {
|
||||
match validation {
|
||||
Validation::MongoDB => {
|
||||
let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
|
||||
debug!("Skipping match for rule {} due to non-UTF8 MongoDB URI", rule.id());
|
||||
continue;
|
||||
};
|
||||
if !is_parseable_mongodb_uri(uri) {
|
||||
debug!("Skipping match for rule {} due to invalid MongoDB URI", rule.id());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
Validation::Postgres => {
|
||||
let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
|
||||
debug!(
|
||||
"Skipping match for rule {} due to non-UTF8 Postgres URI",
|
||||
rule.id()
|
||||
);
|
||||
continue;
|
||||
};
|
||||
if !is_parseable_postgres_uri(uri) {
|
||||
debug!("Skipping match for rule {} due to invalid Postgres URI", rule.id());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
Validation::MySQL => {
|
||||
let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
|
||||
debug!("Skipping match for rule {} due to non-UTF8 MySQL URI", rule.id());
|
||||
continue;
|
||||
};
|
||||
if !is_parseable_mysql_uri(uri) {
|
||||
debug!("Skipping match for rule {} due to invalid MySQL URI", rule.id());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let match_key = compute_match_key(
|
||||
matching_input.as_bytes(),
|
||||
rule.id().as_bytes(),
|
||||
matching_input_offset_span.start,
|
||||
matching_input_offset_span.end,
|
||||
);
|
||||
if !seen_matches.insert(match_key) {
|
||||
continue;
|
||||
}
|
||||
if !record_match(previous_matches, rule_id, matching_input_offset_span) {
|
||||
continue;
|
||||
}
|
||||
let only_matching_input =
|
||||
&blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
|
||||
|
||||
// Pass the *full* capture object to from_captures
|
||||
let groups = SerializableCaptures::from_captures(&captures, haystack, re);
|
||||
|
||||
matches.push(BlobMatch {
|
||||
rule: Arc::clone(&rule),
|
||||
blob_id: blob.id_ref(),
|
||||
matching_input: only_matching_input,
|
||||
matching_input_offset_span,
|
||||
captures: groups,
|
||||
validation_response_body: None,
|
||||
validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
|
||||
validation_success: false,
|
||||
calculated_entropy,
|
||||
is_base64,
|
||||
});
|
||||
}
|
||||
if let Some(t) = timer.take() {
|
||||
let new_count = (matches.len() - initial_len) as u64;
|
||||
t.end(new_count > 0, new_count, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -837,10 +499,6 @@ fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, S
|
|||
"javascript" | "js" => {
|
||||
Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries()))
|
||||
}
|
||||
// "kotlin" => Some((
|
||||
// Language::Kotlin,
|
||||
// parser::queries::kotlin::get_kotlin_queries(),
|
||||
// )),
|
||||
"php" => Some((Language::Php, parser::queries::php::get_php_queries())),
|
||||
"python" | "py" | "starlark" => {
|
||||
Some((Language::Python, parser::queries::python::get_python_queries()))
|
||||
|
|
@ -855,354 +513,6 @@ fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, S
|
|||
_ => None,
|
||||
}
|
||||
}
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// MatcherStats
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct MatcherStats {
|
||||
pub blobs_seen: u64,
|
||||
pub blobs_scanned: u64,
|
||||
pub bytes_seen: u64,
|
||||
pub bytes_scanned: u64,
|
||||
// #[cfg(feature = "rule_profiling")]
|
||||
// pub rule_stats: crate::rule_profiling::RuleProfile,
|
||||
}
|
||||
impl MatcherStats {
|
||||
pub fn update(&mut self, other: &Self) {
|
||||
self.blobs_seen += other.blobs_seen;
|
||||
self.blobs_scanned += other.blobs_scanned;
|
||||
self.bytes_seen += other.bytes_seen;
|
||||
self.bytes_scanned += other.bytes_scanned;
|
||||
|
||||
// #[cfg(feature = "rule_profiling")]
|
||||
// self.rule_stats.update(&other.rule_stats);
|
||||
}
|
||||
}
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Group
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
|
||||
pub struct Group(pub Base64BString);
|
||||
impl Group {
|
||||
pub fn new(m: regex::bytes::Match<'_>) -> Self {
|
||||
Self(Base64BString(BString::from(m.as_bytes())))
|
||||
}
|
||||
}
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Groups
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
pub struct Groups(pub SmallVec<[Group; 1]>);
|
||||
impl JsonSchema for Groups {
|
||||
fn schema_name() -> String {
|
||||
"Groups".to_string()
|
||||
}
|
||||
|
||||
fn json_schema(gen: &mut SchemaGenerator) -> Schema {
|
||||
let group_schema = gen.subschema_for::<Group>();
|
||||
Schema::Object(schemars::schema::SchemaObject {
|
||||
instance_type: Some(InstanceType::Array.into()),
|
||||
array: Some(Box::new(ArrayValidation {
|
||||
items: Some(group_schema.into()),
|
||||
..Default::default()
|
||||
})),
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
}
|
||||
// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
|
||||
// pub struct SerializableCapture {
|
||||
// pub name: Option<String>, // Named group (if available)
|
||||
// pub match_number: i32,
|
||||
// pub start: usize, // Start position of the match
|
||||
// pub end: usize, // End position of the match
|
||||
// pub value: String, // The actual captured value
|
||||
// }
|
||||
#[derive(Debug, Clone, JsonSchema)]
|
||||
pub struct SerializableCapture {
|
||||
pub name: Option<&'static str>,
|
||||
pub match_number: i32,
|
||||
pub start: usize,
|
||||
pub end: usize,
|
||||
/// Interned original (unredacted) value.
|
||||
#[serde(skip_serializing, skip_deserializing)]
|
||||
pub value: &'static str,
|
||||
}
|
||||
|
||||
impl SerializableCapture {
|
||||
/// Returns the original captured value.
|
||||
pub fn raw_value(&self) -> &'static str {
|
||||
self.value
|
||||
}
|
||||
|
||||
/// Returns the value that should be shown in user-facing output.
|
||||
pub fn display_value(&self) -> std::borrow::Cow<'static, str> {
|
||||
crate::util::display_value(self.value)
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::Serialize for SerializableCapture {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeStruct;
|
||||
|
||||
let mut state = serializer.serialize_struct("SerializableCapture", 5)?;
|
||||
state.serialize_field("name", &self.name)?;
|
||||
state.serialize_field("match_number", &self.match_number)?;
|
||||
state.serialize_field("start", &self.start)?;
|
||||
state.serialize_field("end", &self.end)?;
|
||||
let value = self.display_value();
|
||||
state.serialize_field("value", &value)?;
|
||||
state.end()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, JsonSchema)]
|
||||
pub struct SerializableCaptures {
|
||||
#[schemars(with = "Vec<SerializableCapture>")]
|
||||
pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
|
||||
}
|
||||
|
||||
impl SerializableCaptures {
|
||||
pub fn from_captures(captures: &regex::bytes::Captures, _input: &[u8], re: &Regex) -> Self {
|
||||
let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
|
||||
|
||||
let capture_names: SmallVec<[Option<&'static str>; 4]> =
|
||||
re.capture_names().map(|name| name.map(intern)).collect();
|
||||
|
||||
// If there are explicit capture groups (e.g., group 1, 2, ...),
|
||||
// only serialize those.
|
||||
if captures.len() > 1 {
|
||||
for i in 1..captures.len() {
|
||||
// Start from 1
|
||||
if let Some(cap) = captures.get(i) {
|
||||
let raw_value = String::from_utf8_lossy(cap.as_bytes());
|
||||
let raw_interned = intern(raw_value.as_ref());
|
||||
let name = capture_names.get(i).and_then(|opt| *opt);
|
||||
|
||||
serialized_captures.push(SerializableCapture {
|
||||
name,
|
||||
match_number: i32::try_from(i).unwrap_or(0),
|
||||
start: cap.start(),
|
||||
end: cap.end(),
|
||||
value: raw_interned,
|
||||
});
|
||||
}
|
||||
}
|
||||
} else if captures.len() == 1 {
|
||||
// ELSE, if there is ONLY the full match (len == 1),
|
||||
// serialize just that full match (group 0) as the fallback.
|
||||
if let Some(cap) = captures.get(0) {
|
||||
let raw_value = String::from_utf8_lossy(cap.as_bytes());
|
||||
let raw_interned = intern(raw_value.as_ref());
|
||||
let name = capture_names.get(0).and_then(|opt| *opt);
|
||||
|
||||
serialized_captures.push(SerializableCapture {
|
||||
name,
|
||||
match_number: 0,
|
||||
start: cap.start(),
|
||||
end: cap.end(),
|
||||
value: raw_interned,
|
||||
});
|
||||
}
|
||||
}
|
||||
// If len == 0 (no match), loop is skipped, empty vec is returned.
|
||||
|
||||
SerializableCaptures { captures: serialized_captures }
|
||||
}
|
||||
}
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// Match
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
#[derive(Debug, Clone, Serialize, JsonSchema)]
|
||||
pub struct Match {
|
||||
/// The location of the entire matching content
|
||||
pub location: Location,
|
||||
|
||||
/// The capture groups
|
||||
pub groups: SerializableCaptures, // Store serialized captures
|
||||
|
||||
/// unique identifier of file / blob where this match was found
|
||||
pub blob_id: BlobId,
|
||||
|
||||
/// The unique content-based identifier of this match
|
||||
pub finding_fingerprint: u64,
|
||||
|
||||
/// The rule that produced this match
|
||||
#[serde(skip_serializing)]
|
||||
#[schemars(skip)]
|
||||
pub rule: Arc<Rule>,
|
||||
|
||||
/// Validation Body
|
||||
#[serde(
|
||||
default,
|
||||
serialize_with = "validation_body::serialize",
|
||||
deserialize_with = "validation_body::deserialize"
|
||||
)]
|
||||
#[schemars(schema_with = "validation_body::schema")]
|
||||
pub validation_response_body: ValidationResponseBody,
|
||||
|
||||
/// Validation Status Code
|
||||
pub validation_response_status: u16,
|
||||
|
||||
/// Validation Success
|
||||
pub validation_success: bool,
|
||||
|
||||
/// Shannon entropy calculated for the matched secret bytes
|
||||
pub calculated_entropy: f32,
|
||||
|
||||
pub visible: bool,
|
||||
#[serde(default)]
|
||||
pub is_base64: bool,
|
||||
|
||||
/// Variables captured from dependent rules (from depends_on_rule).
|
||||
/// Maps variable name (uppercase) to captured value.
|
||||
#[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
|
||||
pub dependent_captures: std::collections::BTreeMap<String, String>,
|
||||
}
|
||||
impl Match {
|
||||
#[inline]
|
||||
pub fn convert_owned_blobmatch_to_match<'a>(
|
||||
loc_mapping: Option<&'a LocationMapping<'a>>,
|
||||
owned_blob_match: &'a OwnedBlobMatch,
|
||||
origin_type: &'a str,
|
||||
) -> Self {
|
||||
let offset_span = owned_blob_match.matching_input_offset_span;
|
||||
// EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
|
||||
// See comment in from_blob_match() for why this differs from validation_dedup_key().
|
||||
let matching_finding_bytes = owned_blob_match
|
||||
.captures
|
||||
.captures
|
||||
.get(1)
|
||||
.or_else(|| owned_blob_match.captures.captures.get(0))
|
||||
.map(|capture| capture.raw_value().as_bytes())
|
||||
.unwrap_or_default();
|
||||
|
||||
// The fingerprint will be based on the content of the secret.
|
||||
let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
|
||||
|
||||
let source_span =
|
||||
loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
|
||||
start: SourcePoint { line: 0, column: 0 },
|
||||
end: SourcePoint { line: 0, column: 0 },
|
||||
});
|
||||
let offset_start: u64 =
|
||||
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
|
||||
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
|
||||
|
||||
let finding_fingerprint = compute_finding_fingerprint(
|
||||
finding_value_for_fp,
|
||||
origin_type, // file_or_commit,
|
||||
offset_start,
|
||||
offset_end,
|
||||
);
|
||||
|
||||
// matching_snippet
|
||||
Match {
|
||||
rule: owned_blob_match.rule.clone(),
|
||||
visible: owned_blob_match.rule.visible().to_owned(),
|
||||
location: Location::with_source_span(offset_span, Some(source_span.clone())),
|
||||
groups: owned_blob_match.captures.clone(),
|
||||
blob_id: owned_blob_match.blob_id,
|
||||
finding_fingerprint,
|
||||
validation_response_body: owned_blob_match.validation_response_body.clone(),
|
||||
validation_response_status: owned_blob_match.validation_response_status.as_u16(),
|
||||
validation_success: owned_blob_match.validation_success,
|
||||
calculated_entropy: owned_blob_match.calculated_entropy,
|
||||
is_base64: owned_blob_match.is_base64,
|
||||
dependent_captures: owned_blob_match.dependent_captures.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the `blob_id` of the match.
|
||||
pub fn get_blob_id(&self) -> BlobId {
|
||||
self.blob_id.clone()
|
||||
}
|
||||
|
||||
pub fn finding_id(&self) -> String {
|
||||
let mut buffer = Vec::with_capacity(128);
|
||||
buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes());
|
||||
buffer.push(0);
|
||||
serde_json::to_writer(&mut buffer, &self.groups)
|
||||
.expect("should be able to serialize groups as JSON");
|
||||
let mut num = xxh3_64(&buffer);
|
||||
// Ensure the number is positive and within i64 range
|
||||
num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive
|
||||
// Convert to string
|
||||
num.to_string()
|
||||
}
|
||||
}
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DecodedData {
|
||||
pub decoded: Vec<u8>,
|
||||
pub pos_start: usize,
|
||||
pub pos_end: usize,
|
||||
}
|
||||
#[inline]
|
||||
fn is_base64_byte(b: u8) -> bool {
|
||||
// Accepts both standard base64 ('+', '/') and URL-safe base64 ('-', '_') characters.
|
||||
matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_')
|
||||
}
|
||||
|
||||
pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
|
||||
let mut results = Vec::new();
|
||||
let mut i = 0;
|
||||
while i < input.len() {
|
||||
while i < input.len() && !is_base64_byte(input[i]) {
|
||||
i += 1;
|
||||
}
|
||||
let start = i;
|
||||
while i < input.len() && is_base64_byte(input[i]) {
|
||||
i += 1;
|
||||
}
|
||||
|
||||
let mut eq_count = 0;
|
||||
while i < input.len() && input[i] == b'=' && eq_count < 2 {
|
||||
i += 1;
|
||||
eq_count += 1;
|
||||
}
|
||||
let end = i;
|
||||
|
||||
let len = end - start;
|
||||
if len >= 32 && len % 4 == 0 {
|
||||
let base64_slice = &input[start..end];
|
||||
|
||||
// Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD
|
||||
let decode_result = general_purpose::STANDARD
|
||||
.decode(base64_slice)
|
||||
.or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
|
||||
.or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));
|
||||
|
||||
if let Ok(decoded) = decode_result {
|
||||
if decoded.is_ascii() {
|
||||
results.push(DecodedData { decoded, pos_start: start, pos_end: end });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
pub fn compute_finding_fingerprint(
|
||||
finding_value: &str,
|
||||
file_or_commit: &str,
|
||||
offset_start: u64,
|
||||
offset_end: u64,
|
||||
) -> u64 {
|
||||
// Combine all into a byte buffer and hash it directly:
|
||||
let mut buf = Vec::with_capacity(
|
||||
finding_value.len() + file_or_commit.len() + 2 * std::mem::size_of::<u64>(),
|
||||
);
|
||||
buf.extend_from_slice(finding_value.as_bytes());
|
||||
buf.extend_from_slice(file_or_commit.as_bytes());
|
||||
buf.extend_from_slice(&offset_start.to_le_bytes());
|
||||
buf.extend_from_slice(&offset_end.to_le_bytes());
|
||||
|
||||
xxh3_64(&buf)
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// test
|
||||
|
|
@ -1220,6 +530,7 @@ mod test {
|
|||
use super::*;
|
||||
use crate::{
|
||||
blob::{Blob, BlobIdMap},
|
||||
entropy::calculate_shannon_entropy,
|
||||
origin::{Origin, OriginSet},
|
||||
rules::rule::{
|
||||
DependsOnRule, HttpRequest, HttpValidation, PatternRequirements, RuleSyntax, Validation,
|
||||
|
|
@ -1341,8 +652,6 @@ mod test {
|
|||
let input = "some test data for vectorscan";
|
||||
let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
|
||||
let enable_rule_profiling = true;
|
||||
// let mut matcher = Matcher::new(&rules_db, &seen_blobs, None,
|
||||
// enable_rule_profiling)?;
|
||||
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
|
||||
let mut matcher = Matcher::new(
|
||||
&rules_db,
|
||||
|
|
@ -1513,7 +822,7 @@ mod test {
|
|||
assert_eq!((item.pos_start, item.pos_end), (4, 4 + base64_payload.len()));
|
||||
}
|
||||
|
||||
/// `compute_finding_fingerprint` must be stable (same input ⇒ same output)
|
||||
/// `compute_finding_fingerprint` must be stable (same input => same output)
|
||||
/// and sensitive to any input component.
|
||||
#[test]
|
||||
fn test_finding_fingerprint_stability_and_uniqueness() {
|
||||
|
|
@ -1535,7 +844,7 @@ mod test {
|
|||
/// keys as soon as *anything* changes.
|
||||
#[test]
|
||||
fn test_compute_match_key_uniqueness() {
|
||||
use super::compute_match_key;
|
||||
use super::dedup::compute_match_key;
|
||||
|
||||
let k1 = compute_match_key(b"abc", b"rule-1", 0, 3);
|
||||
let k2 = compute_match_key(b"abc", b"rule-1", 0, 3);
|
||||
|
|
@ -1721,6 +1030,8 @@ line2
|
|||
|
||||
#[test]
|
||||
fn serializes_captures_in_numeric_order() {
|
||||
use regex::bytes::Regex;
|
||||
|
||||
let re =
|
||||
Regex::new(r"(?xi)\b(ghp_(?P<body>[A-Z0-9]{3})(?P<checksum>[A-Z0-9]{2}))").unwrap();
|
||||
let caps = re.captures(b"ghp_ABC12").expect("expected captures");
|
||||
|
|
@ -42,9 +42,18 @@ use crate::{
|
|||
AccessMapCollector,
|
||||
},
|
||||
util::set_redaction_enabled,
|
||||
validation::CachedResponse,
|
||||
validation_rate_limit::ValidationRateLimiter,
|
||||
};
|
||||
|
||||
/// Shared validation dependencies: (liquid parser, HTTP clients, validation cache, rate limiter).
|
||||
type ValidationDeps = Arc<(
|
||||
liquid::Parser,
|
||||
crate::validation::ValidationClients,
|
||||
Arc<SkipMap<String, CachedResponse>>,
|
||||
Option<Arc<ValidationRateLimiter>>,
|
||||
)>;
|
||||
|
||||
pub async fn run_scan(
|
||||
global_args: &global::GlobalArgs,
|
||||
scan_args: &scan::ScanArgs,
|
||||
|
|
@ -64,22 +73,9 @@ pub async fn run_async_scan(
|
|||
rules_db: &RulesDatabase,
|
||||
update_status: &crate::update::UpdateStatus,
|
||||
) -> Result<()> {
|
||||
// Ensure all provided paths exist before proceeding
|
||||
for path in &args.input_specifier_args.path_inputs {
|
||||
if !path.exists() {
|
||||
error!("Specified input path does not exist: {}", path.display());
|
||||
bail!("Invalid input: Path does not exist - {}", path.display());
|
||||
}
|
||||
}
|
||||
|
||||
// Register user-provided allow-list patterns
|
||||
for pattern in &args.skip_regex {
|
||||
safe_list::add_user_regex(pattern)
|
||||
.map_err(|e| anyhow::anyhow!("Invalid skip-regex '{pattern}': {e}"))?;
|
||||
}
|
||||
for word in &args.skip_word {
|
||||
safe_list::add_user_skipword(word);
|
||||
}
|
||||
// ── Phase 1: Input validation and environment setup ──────────────────
|
||||
validate_inputs(args)?;
|
||||
register_safe_list_patterns(args)?;
|
||||
|
||||
let start_time = Instant::now();
|
||||
let scan_started_at = chrono::Local::now();
|
||||
|
|
@ -90,123 +86,26 @@ pub async fn run_async_scan(
|
|||
|
||||
set_redaction_enabled(args.redact);
|
||||
|
||||
let mut repo_urls = enumerate_github_repos(args, global_args).await?;
|
||||
let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
|
||||
let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
|
||||
let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
|
||||
let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
|
||||
let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
|
||||
|
||||
// Combine repository URLs
|
||||
repo_urls.extend(gitlab_repo_urls);
|
||||
repo_urls.extend(gitea_repo_urls);
|
||||
repo_urls.extend(huggingface_repo_urls);
|
||||
repo_urls.extend(bitbucket_repo_urls);
|
||||
repo_urls.extend(azure_repo_urls);
|
||||
|
||||
// Add wiki repositories for each URL when requested
|
||||
if args.input_specifier_args.repo_artifacts {
|
||||
let mut wiki_urls = Vec::new();
|
||||
for url in &repo_urls {
|
||||
if let Some(w) = github::wiki_url(url) {
|
||||
wiki_urls.push(w);
|
||||
}
|
||||
if let Some(w) = gitlab::wiki_url(url) {
|
||||
wiki_urls.push(w);
|
||||
}
|
||||
if let Some(w) = gitea::wiki_url(url) {
|
||||
wiki_urls.push(w);
|
||||
}
|
||||
if let Some(w) = bitbucket::wiki_url(url) {
|
||||
wiki_urls.push(w);
|
||||
}
|
||||
if let Some(w) = azure::wiki_url(url) {
|
||||
wiki_urls.push(w);
|
||||
}
|
||||
}
|
||||
repo_urls.extend(wiki_urls);
|
||||
}
|
||||
|
||||
// just sort and dedup once
|
||||
repo_urls.sort();
|
||||
repo_urls.dedup();
|
||||
// ── Phase 2: Repository enumeration ─────────────────────────────────
|
||||
let repo_urls = enumerate_all_repos(args, global_args).await?;
|
||||
|
||||
let mut input_roots = args.input_specifier_args.path_inputs.clone();
|
||||
let (repo_tx, repo_rx) = crossbeam_channel::unbounded();
|
||||
let repo_clone_handle = if repo_urls.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let clone_args = args.clone();
|
||||
let clone_globals = global_args.clone();
|
||||
let clone_repo_urls = repo_urls.clone();
|
||||
let clone_datastore = Arc::clone(&datastore);
|
||||
let clone_repo_tx = repo_tx.clone();
|
||||
Some(std::thread::spawn(move || {
|
||||
if let Err(e) = clone_or_update_git_repos_streaming(
|
||||
&clone_args,
|
||||
&clone_globals,
|
||||
&clone_repo_urls,
|
||||
&clone_datastore,
|
||||
|path| {
|
||||
let _ = clone_repo_tx.send(path);
|
||||
},
|
||||
) {
|
||||
error!("Failed to fetch one or more Git repositories: {e}");
|
||||
}
|
||||
}))
|
||||
};
|
||||
drop(repo_tx);
|
||||
let repo_clone_handle =
|
||||
start_repo_cloning(&repo_urls, args, global_args, &datastore, repo_tx, progress_enabled);
|
||||
|
||||
// Fetch issues, gists, and wikis if enabled
|
||||
let bitbucket_auth = bitbucket::AuthConfig::from_env();
|
||||
let bitbucket_host =
|
||||
args.input_specifier_args.bitbucket_api_url.host_str().map(|s| s.to_string());
|
||||
|
||||
if args.input_specifier_args.repo_artifacts {
|
||||
let repo_artifact_dirs = fetch_git_host_artifacts(
|
||||
&repo_urls,
|
||||
&args.input_specifier_args.bitbucket_api_url,
|
||||
&bitbucket_auth,
|
||||
bitbucket_host.clone(),
|
||||
global_args,
|
||||
&datastore,
|
||||
)
|
||||
.await?;
|
||||
input_roots.extend(repo_artifact_dirs);
|
||||
}
|
||||
// Fetch Jira issues if requested
|
||||
let jira_dirs = fetch_jira_issues(args, global_args, &datastore).await?;
|
||||
input_roots.extend(jira_dirs);
|
||||
|
||||
// Fetch Confluence pages if requested
|
||||
let confluence_dirs = fetch_confluence_pages(args, global_args, &datastore).await?;
|
||||
input_roots.extend(confluence_dirs);
|
||||
|
||||
// Fetch Slack messages if requested
|
||||
let slack_dirs = fetch_slack_messages(args, global_args, &datastore).await?;
|
||||
input_roots.extend(slack_dirs);
|
||||
|
||||
// Save Docker images if specified
|
||||
if !args.input_specifier_args.docker_image.is_empty() {
|
||||
let clone_root = {
|
||||
let ds = datastore.lock().unwrap();
|
||||
ds.clone_root()
|
||||
};
|
||||
let docker_dirs = save_docker_images(
|
||||
&args.input_specifier_args.docker_image,
|
||||
&clone_root,
|
||||
progress_enabled,
|
||||
)
|
||||
.await?;
|
||||
for (dir, img) in docker_dirs {
|
||||
{
|
||||
let mut ds = datastore.lock().unwrap();
|
||||
ds.register_docker_image(dir.clone(), img);
|
||||
}
|
||||
input_roots.push(dir);
|
||||
}
|
||||
}
|
||||
// ── Phase 3: Artifact fetching ──────────────────────────────────────
|
||||
fetch_all_artifacts(
|
||||
args,
|
||||
global_args,
|
||||
&repo_urls,
|
||||
&datastore,
|
||||
&mut input_roots,
|
||||
progress_enabled,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// ── Phase 4: Scan configuration ─────────────────────────────────────
|
||||
let shared_profiler = Arc::new(ConcurrentRuleProfiler::new());
|
||||
let enable_profiling = args.rule_stats;
|
||||
let matcher_stats = Arc::new(Mutex::new(MatcherStats::default()));
|
||||
|
|
@ -246,11 +145,257 @@ pub async fn run_async_scan(
|
|||
.unwrap_or_else(|| std::path::PathBuf::from("baseline-file.yaml")),
|
||||
);
|
||||
|
||||
let mut skip_aws_accounts = args.skip_aws_account.clone();
|
||||
let skip_aws_accounts = load_skip_aws_accounts(args)?;
|
||||
crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
|
||||
|
||||
let mut access_map_collector =
|
||||
if args.access_map { Some(AccessMapCollector::default()) } else { None };
|
||||
|
||||
let repo_roots = expand_repo_roots(&input_roots)?;
|
||||
let git_repo_count =
|
||||
repo_roots.iter().filter(|p| p.join(".git").is_dir()).count() + repo_urls.len();
|
||||
let use_parallel_repo_scan = git_repo_count > 10;
|
||||
|
||||
let validation_rate_limiter =
|
||||
ValidationRateLimiter::from_cli(args.validation_rps, &args.validation_rps_rule)?
|
||||
.map(Arc::new);
|
||||
|
||||
let validation_deps: Option<ValidationDeps> = if !args.no_validate {
|
||||
info!("Starting secret validation phase...");
|
||||
Some(Arc::new((
|
||||
register_all(liquid::ParserBuilder::with_stdlib()).build()?,
|
||||
crate::validation::ValidationClients::new(global_args.tls_mode)?,
|
||||
Arc::new(SkipMap::new()),
|
||||
validation_rate_limiter.clone(),
|
||||
)))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// ── Phase 5: Scanning ───────────────────────────────────────────────
|
||||
if !use_parallel_repo_scan {
|
||||
run_sequential_scan(
|
||||
args,
|
||||
global_args,
|
||||
&datastore,
|
||||
rules_db,
|
||||
&mut input_roots,
|
||||
repo_rx,
|
||||
repo_clone_handle,
|
||||
&shared_profiler,
|
||||
enable_profiling,
|
||||
&matcher_stats,
|
||||
&baseline_path,
|
||||
&validation_deps,
|
||||
&mut access_map_collector,
|
||||
progress_enabled,
|
||||
start_time,
|
||||
scan_started_at,
|
||||
update_status,
|
||||
)
|
||||
.await?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
run_parallel_scan(
|
||||
args,
|
||||
global_args,
|
||||
&datastore,
|
||||
rules_db,
|
||||
&repo_roots,
|
||||
repo_rx,
|
||||
repo_clone_handle,
|
||||
&shared_profiler,
|
||||
enable_profiling,
|
||||
&matcher_stats,
|
||||
&baseline_path,
|
||||
&validation_deps,
|
||||
&mut access_map_collector,
|
||||
progress_enabled,
|
||||
start_time,
|
||||
scan_started_at,
|
||||
update_status,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
// Phase helpers
|
||||
// =================================================================================================
|
||||
|
||||
/// Validates that all provided input paths exist.
|
||||
fn validate_inputs(args: &scan::ScanArgs) -> Result<()> {
|
||||
for path in &args.input_specifier_args.path_inputs {
|
||||
if !path.exists() {
|
||||
error!("Specified input path does not exist: {}", path.display());
|
||||
bail!("Invalid input: Path does not exist - {}", path.display());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Registers user-provided allow-list patterns (skip-regex and skip-word).
|
||||
fn register_safe_list_patterns(args: &scan::ScanArgs) -> Result<()> {
|
||||
for pattern in &args.skip_regex {
|
||||
safe_list::add_user_regex(pattern)
|
||||
.map_err(|e| anyhow::anyhow!("Invalid skip-regex '{pattern}': {e}"))?;
|
||||
}
|
||||
for word in &args.skip_word {
|
||||
safe_list::add_user_skipword(word);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Enumerates repositories from all configured platforms, adds wiki URLs, and deduplicates.
|
||||
async fn enumerate_all_repos(
|
||||
args: &scan::ScanArgs,
|
||||
global_args: &global::GlobalArgs,
|
||||
) -> Result<Vec<crate::git_url::GitUrl>> {
|
||||
let mut repo_urls = enumerate_github_repos(args, global_args).await?;
|
||||
let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
|
||||
let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
|
||||
let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
|
||||
let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
|
||||
let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
|
||||
|
||||
repo_urls.extend(gitlab_repo_urls);
|
||||
repo_urls.extend(gitea_repo_urls);
|
||||
repo_urls.extend(huggingface_repo_urls);
|
||||
repo_urls.extend(bitbucket_repo_urls);
|
||||
repo_urls.extend(azure_repo_urls);
|
||||
|
||||
// Add wiki repositories for each URL when requested
|
||||
if args.input_specifier_args.repo_artifacts {
|
||||
let mut wiki_urls = Vec::new();
|
||||
for url in &repo_urls {
|
||||
if let Some(w) = github::wiki_url(url) {
|
||||
wiki_urls.push(w);
|
||||
}
|
||||
if let Some(w) = gitlab::wiki_url(url) {
|
||||
wiki_urls.push(w);
|
||||
}
|
||||
if let Some(w) = gitea::wiki_url(url) {
|
||||
wiki_urls.push(w);
|
||||
}
|
||||
if let Some(w) = bitbucket::wiki_url(url) {
|
||||
wiki_urls.push(w);
|
||||
}
|
||||
if let Some(w) = azure::wiki_url(url) {
|
||||
wiki_urls.push(w);
|
||||
}
|
||||
}
|
||||
repo_urls.extend(wiki_urls);
|
||||
}
|
||||
|
||||
repo_urls.sort();
|
||||
repo_urls.dedup();
|
||||
|
||||
Ok(repo_urls)
|
||||
}
|
||||
|
||||
/// Spawns a background thread to clone/update git repositories, streaming results via a channel.
|
||||
fn start_repo_cloning(
|
||||
repo_urls: &[crate::git_url::GitUrl],
|
||||
args: &scan::ScanArgs,
|
||||
global_args: &global::GlobalArgs,
|
||||
datastore: &Arc<Mutex<FindingsStore>>,
|
||||
repo_tx: crossbeam_channel::Sender<PathBuf>,
|
||||
_progress_enabled: bool,
|
||||
) -> Option<std::thread::JoinHandle<()>> {
|
||||
if repo_urls.is_empty() {
|
||||
drop(repo_tx);
|
||||
return None;
|
||||
}
|
||||
|
||||
let clone_args = args.clone();
|
||||
let clone_globals = global_args.clone();
|
||||
let clone_repo_urls = repo_urls.to_vec();
|
||||
let clone_datastore = Arc::clone(datastore);
|
||||
let clone_repo_tx = repo_tx.clone();
|
||||
|
||||
let handle = std::thread::spawn(move || {
|
||||
if let Err(e) = clone_or_update_git_repos_streaming(
|
||||
&clone_args,
|
||||
&clone_globals,
|
||||
&clone_repo_urls,
|
||||
&clone_datastore,
|
||||
|path| {
|
||||
let _ = clone_repo_tx.send(path);
|
||||
},
|
||||
) {
|
||||
error!("Failed to fetch one or more Git repositories: {e}");
|
||||
}
|
||||
});
|
||||
drop(repo_tx);
|
||||
Some(handle)
|
||||
}
|
||||
|
||||
/// Fetches artifacts from various platforms (issues, wikis, Jira, Confluence, Slack, Docker).
|
||||
async fn fetch_all_artifacts(
|
||||
args: &scan::ScanArgs,
|
||||
global_args: &global::GlobalArgs,
|
||||
repo_urls: &[crate::git_url::GitUrl],
|
||||
datastore: &Arc<Mutex<FindingsStore>>,
|
||||
input_roots: &mut Vec<PathBuf>,
|
||||
progress_enabled: bool,
|
||||
) -> Result<()> {
|
||||
let bitbucket_auth = bitbucket::AuthConfig::from_env();
|
||||
let bitbucket_host =
|
||||
args.input_specifier_args.bitbucket_api_url.host_str().map(|s| s.to_string());
|
||||
|
||||
if args.input_specifier_args.repo_artifacts {
|
||||
let repo_artifact_dirs = fetch_git_host_artifacts(
|
||||
repo_urls,
|
||||
&args.input_specifier_args.bitbucket_api_url,
|
||||
&bitbucket_auth,
|
||||
bitbucket_host.clone(),
|
||||
global_args,
|
||||
datastore,
|
||||
)
|
||||
.await?;
|
||||
input_roots.extend(repo_artifact_dirs);
|
||||
}
|
||||
|
||||
// Fetch Jira issues if requested
|
||||
let jira_dirs = fetch_jira_issues(args, global_args, datastore).await?;
|
||||
input_roots.extend(jira_dirs);
|
||||
|
||||
// Fetch Confluence pages if requested
|
||||
let confluence_dirs = fetch_confluence_pages(args, global_args, datastore).await?;
|
||||
input_roots.extend(confluence_dirs);
|
||||
|
||||
// Fetch Slack messages if requested
|
||||
let slack_dirs = fetch_slack_messages(args, global_args, datastore).await?;
|
||||
input_roots.extend(slack_dirs);
|
||||
|
||||
// Save Docker images if specified
|
||||
if !args.input_specifier_args.docker_image.is_empty() {
|
||||
let clone_root = {
|
||||
let ds = datastore.lock().unwrap();
|
||||
ds.clone_root()
|
||||
};
|
||||
let docker_dirs = save_docker_images(
|
||||
&args.input_specifier_args.docker_image,
|
||||
&clone_root,
|
||||
progress_enabled,
|
||||
)
|
||||
.await?;
|
||||
for (dir, img) in docker_dirs {
|
||||
{
|
||||
let mut ds = datastore.lock().unwrap();
|
||||
ds.register_docker_image(dir.clone(), img);
|
||||
}
|
||||
input_roots.push(dir);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Loads AWS account IDs to skip from CLI args and optional file.
|
||||
fn load_skip_aws_accounts(args: &scan::ScanArgs) -> Result<Vec<String>> {
|
||||
let mut skip_aws_accounts = args.skip_aws_account.clone();
|
||||
|
||||
if let Some(path) = args.skip_aws_account_file.as_ref() {
|
||||
let contents = fs::read_to_string(path).with_context(|| {
|
||||
format!("Failed to read --skip-aws-account-file {}", path.display())
|
||||
|
|
@ -267,183 +412,214 @@ pub async fn run_async_scan(
|
|||
}
|
||||
}
|
||||
|
||||
crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
|
||||
Ok(skip_aws_accounts)
|
||||
}
|
||||
|
||||
let repo_roots = expand_repo_roots(&input_roots)?;
|
||||
let git_repo_count =
|
||||
repo_roots.iter().filter(|p| p.join(".git").is_dir()).count() + repo_urls.len();
|
||||
let use_parallel_repo_scan = git_repo_count > 10;
|
||||
|
||||
let validation_rate_limiter =
|
||||
ValidationRateLimiter::from_cli(args.validation_rps, &args.validation_rps_rule)?
|
||||
.map(Arc::new);
|
||||
|
||||
let validation_deps = if !args.no_validate {
|
||||
info!("Starting secret validation phase...");
|
||||
Some(Arc::new((
|
||||
register_all(liquid::ParserBuilder::with_stdlib()).build()?,
|
||||
crate::validation::ValidationClients::new(global_args.tls_mode)?,
|
||||
Arc::new(SkipMap::new()),
|
||||
validation_rate_limiter.clone(),
|
||||
)))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
if !use_parallel_repo_scan {
|
||||
let mut streamed_roots = Vec::new();
|
||||
if !input_roots.is_empty() {
|
||||
let _inputs = enumerate_filesystem_inputs(
|
||||
args,
|
||||
datastore.clone(),
|
||||
&input_roots,
|
||||
progress_enabled,
|
||||
rules_db,
|
||||
enable_profiling,
|
||||
Arc::clone(&shared_profiler),
|
||||
matcher_stats.as_ref(),
|
||||
)?;
|
||||
}
|
||||
|
||||
for repo_root in repo_rx.clone().iter() {
|
||||
enumerate_filesystem_inputs(
|
||||
args,
|
||||
datastore.clone(),
|
||||
&[repo_root.clone()],
|
||||
progress_enabled,
|
||||
rules_db,
|
||||
enable_profiling,
|
||||
Arc::clone(&shared_profiler),
|
||||
matcher_stats.as_ref(),
|
||||
)?;
|
||||
streamed_roots.push(repo_root);
|
||||
}
|
||||
input_roots.extend(streamed_roots);
|
||||
|
||||
if let Some(handle) = repo_clone_handle {
|
||||
let _ = handle.join();
|
||||
}
|
||||
|
||||
if !args.no_dedup {
|
||||
let reporter = crate::reporter::DetailsReporter {
|
||||
datastore: Arc::clone(&datastore),
|
||||
styles: Styles::new(global_args.use_color(std::io::stdout())),
|
||||
only_valid: args.only_valid,
|
||||
};
|
||||
|
||||
let all_matches = reporter.get_unfiltered_matches(Some(false))?;
|
||||
let deduped_matches = reporter.deduplicate_matches(all_matches, args.no_dedup);
|
||||
|
||||
let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
|
||||
.into_iter()
|
||||
.map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
|
||||
.collect();
|
||||
let mut ds = datastore.lock().unwrap();
|
||||
ds.replace_matches(deduped_arcs);
|
||||
}
|
||||
|
||||
if args.baseline_file.is_some() || args.manage_baseline {
|
||||
let mut ds = datastore.lock().unwrap();
|
||||
crate::baseline::apply_baseline(
|
||||
&mut ds,
|
||||
baseline_path.as_ref(),
|
||||
args.manage_baseline,
|
||||
&input_roots,
|
||||
)?;
|
||||
}
|
||||
|
||||
if let Some(validation) = &validation_deps {
|
||||
let (parser, clients, cache, rate_limiter) =
|
||||
(&validation.0, &validation.1, &validation.2, &validation.3);
|
||||
run_secret_validation(
|
||||
Arc::clone(&datastore),
|
||||
parser,
|
||||
clients,
|
||||
cache,
|
||||
args.num_jobs,
|
||||
None,
|
||||
access_map_collector.clone(),
|
||||
rate_limiter.clone(),
|
||||
Duration::from_secs(args.validation_timeout),
|
||||
args.validation_retries,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
if let Some(collector) = access_map_collector.take() {
|
||||
finalize_access_map(&datastore, collector, args).await?;
|
||||
}
|
||||
|
||||
crate::reporter::run(global_args, Arc::clone(&datastore), args)
|
||||
.context("Failed to run report command")?;
|
||||
print_scan_summary(
|
||||
start_time,
|
||||
scan_started_at,
|
||||
&datastore,
|
||||
global_args,
|
||||
args,
|
||||
rules_db,
|
||||
matcher_stats.as_ref(),
|
||||
if enable_profiling { Some(shared_profiler.as_ref()) } else { None },
|
||||
update_status,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
maybe_hint_access_map(&datastore, args);
|
||||
/// Deduplicates matches in the datastore starting from `start_index`.
|
||||
fn deduplicate_new_matches(
|
||||
store: &Arc<Mutex<FindingsStore>>,
|
||||
global_args: &global::GlobalArgs,
|
||||
args: &scan::ScanArgs,
|
||||
start_index: usize,
|
||||
) -> Result<()> {
|
||||
if args.no_dedup {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let deduplicate_new_matches =
|
||||
|store: &Arc<Mutex<FindingsStore>>, start_index: usize| -> Result<()> {
|
||||
if args.no_dedup {
|
||||
return Ok(());
|
||||
}
|
||||
let reporter = crate::reporter::DetailsReporter {
|
||||
datastore: Arc::clone(store),
|
||||
styles: Styles::new(global_args.use_color(std::io::stdout())),
|
||||
only_valid: args.only_valid,
|
||||
};
|
||||
|
||||
let reporter = crate::reporter::DetailsReporter {
|
||||
datastore: Arc::clone(store),
|
||||
styles: Styles::new(global_args.use_color(std::io::stdout())),
|
||||
only_valid: args.only_valid,
|
||||
};
|
||||
let all_matches = reporter.get_unfiltered_matches(Some(false))?;
|
||||
if start_index >= all_matches.len() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let all_matches = reporter.get_unfiltered_matches(Some(false))?;
|
||||
if start_index >= all_matches.len() {
|
||||
return Ok(());
|
||||
}
|
||||
let slice = if start_index == 0 { all_matches } else { all_matches[start_index..].to_vec() };
|
||||
let deduped_matches = reporter.deduplicate_matches(slice, args.no_dedup);
|
||||
|
||||
let deduped_matches =
|
||||
reporter.deduplicate_matches(all_matches[start_index..].to_vec(), args.no_dedup);
|
||||
let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
|
||||
.into_iter()
|
||||
.map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
|
||||
.collect();
|
||||
|
||||
let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
|
||||
.into_iter()
|
||||
.map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
|
||||
.collect();
|
||||
|
||||
let mut ds = store.lock().unwrap();
|
||||
let mut preserved = ds.get_matches()[..start_index].to_vec();
|
||||
preserved.extend(deduped_arcs);
|
||||
ds.replace_matches(preserved);
|
||||
Ok(())
|
||||
};
|
||||
|
||||
deduplicate_new_matches(&datastore, 0)?;
|
||||
let mut ds = store.lock().unwrap();
|
||||
if start_index == 0 {
|
||||
ds.replace_matches(deduped_arcs);
|
||||
} else {
|
||||
let mut preserved = ds.get_matches()[..start_index].to_vec();
|
||||
preserved.extend(deduped_arcs);
|
||||
ds.replace_matches(preserved);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Applies baseline filtering if configured.
|
||||
fn apply_baseline_if_configured(
|
||||
args: &scan::ScanArgs,
|
||||
datastore: &Arc<Mutex<FindingsStore>>,
|
||||
baseline_path: &std::path::Path,
|
||||
roots: &[PathBuf],
|
||||
) -> Result<()> {
|
||||
if args.baseline_file.is_some() || args.manage_baseline {
|
||||
let mut ds = datastore.lock().unwrap();
|
||||
crate::baseline::apply_baseline(
|
||||
&mut ds,
|
||||
baseline_path.as_ref(),
|
||||
args.manage_baseline,
|
||||
&repo_roots,
|
||||
crate::baseline::apply_baseline(&mut ds, baseline_path, args.manage_baseline, roots)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Runs the validation phase on matches in the datastore.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn run_validation_phase(
|
||||
datastore: &Arc<Mutex<FindingsStore>>,
|
||||
validation_deps: &Option<ValidationDeps>,
|
||||
args: &scan::ScanArgs,
|
||||
match_range: Option<std::ops::Range<usize>>,
|
||||
access_map_collector: Option<AccessMapCollector>,
|
||||
) -> Result<()> {
|
||||
if let Some(validation) = validation_deps {
|
||||
let (parser, clients, cache, rate_limiter) =
|
||||
(&validation.0, &validation.1, &validation.2, &validation.3);
|
||||
run_secret_validation(
|
||||
Arc::clone(datastore),
|
||||
parser,
|
||||
clients,
|
||||
cache,
|
||||
args.num_jobs,
|
||||
match_range,
|
||||
access_map_collector,
|
||||
rate_limiter.clone(),
|
||||
Duration::from_secs(args.validation_timeout),
|
||||
args.validation_retries,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
// Sequential scan path
|
||||
// =================================================================================================
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn run_sequential_scan(
|
||||
args: &scan::ScanArgs,
|
||||
global_args: &global::GlobalArgs,
|
||||
datastore: &Arc<Mutex<FindingsStore>>,
|
||||
rules_db: &RulesDatabase,
|
||||
input_roots: &mut Vec<PathBuf>,
|
||||
repo_rx: crossbeam_channel::Receiver<PathBuf>,
|
||||
repo_clone_handle: Option<std::thread::JoinHandle<()>>,
|
||||
shared_profiler: &Arc<ConcurrentRuleProfiler>,
|
||||
enable_profiling: bool,
|
||||
matcher_stats: &Arc<Mutex<MatcherStats>>,
|
||||
baseline_path: &Arc<PathBuf>,
|
||||
validation_deps: &Option<ValidationDeps>,
|
||||
access_map_collector: &mut Option<AccessMapCollector>,
|
||||
progress_enabled: bool,
|
||||
start_time: Instant,
|
||||
scan_started_at: chrono::DateTime<chrono::Local>,
|
||||
update_status: &crate::update::UpdateStatus,
|
||||
) -> Result<()> {
|
||||
let mut streamed_roots = Vec::new();
|
||||
if !input_roots.is_empty() {
|
||||
let _inputs = enumerate_filesystem_inputs(
|
||||
args,
|
||||
datastore.clone(),
|
||||
input_roots,
|
||||
progress_enabled,
|
||||
rules_db,
|
||||
enable_profiling,
|
||||
Arc::clone(shared_profiler),
|
||||
matcher_stats.as_ref(),
|
||||
)?;
|
||||
}
|
||||
|
||||
if let Some(validation) = &validation_deps {
|
||||
for repo_root in repo_rx.iter() {
|
||||
enumerate_filesystem_inputs(
|
||||
args,
|
||||
datastore.clone(),
|
||||
&[repo_root.clone()],
|
||||
progress_enabled,
|
||||
rules_db,
|
||||
enable_profiling,
|
||||
Arc::clone(shared_profiler),
|
||||
matcher_stats.as_ref(),
|
||||
)?;
|
||||
streamed_roots.push(repo_root);
|
||||
}
|
||||
input_roots.extend(streamed_roots);
|
||||
|
||||
if let Some(handle) = repo_clone_handle {
|
||||
let _ = handle.join();
|
||||
}
|
||||
|
||||
deduplicate_new_matches(datastore, global_args, args, 0)?;
|
||||
apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), input_roots)?;
|
||||
|
||||
run_validation_phase(datastore, validation_deps, args, None, access_map_collector.clone())
|
||||
.await?;
|
||||
|
||||
if let Some(collector) = access_map_collector.take() {
|
||||
finalize_access_map(datastore, collector, args).await?;
|
||||
}
|
||||
|
||||
crate::reporter::run(global_args, Arc::clone(datastore), args)
|
||||
.context("Failed to run report command")?;
|
||||
print_scan_summary(
|
||||
start_time,
|
||||
scan_started_at,
|
||||
datastore,
|
||||
global_args,
|
||||
args,
|
||||
rules_db,
|
||||
matcher_stats.as_ref(),
|
||||
if enable_profiling { Some(shared_profiler.as_ref()) } else { None },
|
||||
update_status,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
maybe_hint_access_map(datastore, args);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
// Parallel scan path
|
||||
// =================================================================================================
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn run_parallel_scan(
|
||||
args: &scan::ScanArgs,
|
||||
global_args: &global::GlobalArgs,
|
||||
datastore: &Arc<Mutex<FindingsStore>>,
|
||||
rules_db: &RulesDatabase,
|
||||
repo_roots: &[PathBuf],
|
||||
repo_rx: crossbeam_channel::Receiver<PathBuf>,
|
||||
repo_clone_handle: Option<std::thread::JoinHandle<()>>,
|
||||
shared_profiler: &Arc<ConcurrentRuleProfiler>,
|
||||
enable_profiling: bool,
|
||||
matcher_stats: &Arc<Mutex<MatcherStats>>,
|
||||
baseline_path: &Arc<PathBuf>,
|
||||
validation_deps: &Option<ValidationDeps>,
|
||||
access_map_collector: &mut Option<AccessMapCollector>,
|
||||
progress_enabled: bool,
|
||||
start_time: Instant,
|
||||
scan_started_at: chrono::DateTime<chrono::Local>,
|
||||
update_status: &crate::update::UpdateStatus,
|
||||
) -> Result<()> {
|
||||
deduplicate_new_matches(datastore, global_args, args, 0)?;
|
||||
apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), repo_roots)?;
|
||||
|
||||
// Validate initial (non-repo) matches
|
||||
if let Some(validation) = validation_deps {
|
||||
let (parser, clients, cache, rate_limiter) =
|
||||
(&validation.0, &validation.1, &validation.2, &validation.3);
|
||||
let initial_match_count = { datastore.lock().unwrap().get_matches().len() };
|
||||
if initial_match_count > 0 {
|
||||
run_secret_validation(
|
||||
Arc::clone(&datastore),
|
||||
Arc::clone(datastore),
|
||||
parser,
|
||||
clients,
|
||||
cache,
|
||||
|
|
@ -458,6 +634,7 @@ pub async fn run_async_scan(
|
|||
}
|
||||
}
|
||||
|
||||
// Parallel per-repo scanning
|
||||
let repo_concurrency = std::cmp::max(1, args.num_jobs);
|
||||
let rt_handle = Handle::current();
|
||||
|
||||
|
|
@ -476,16 +653,16 @@ pub async fn run_async_scan(
|
|||
let spawn_repo_scan = |root: PathBuf| {
|
||||
let repo_rules = repo_rules.clone();
|
||||
let base_clone_root = base_clone_root.clone();
|
||||
let baseline_path = Arc::clone(&baseline_path);
|
||||
let shared_profiler = Arc::clone(&shared_profiler);
|
||||
let baseline_path = Arc::clone(baseline_path);
|
||||
let shared_profiler = Arc::clone(shared_profiler);
|
||||
let args = args.clone();
|
||||
let root = root.clone();
|
||||
let validation_deps = validation_deps.clone();
|
||||
let matcher_stats = Arc::clone(&matcher_stats);
|
||||
let matcher_stats = Arc::clone(matcher_stats);
|
||||
let rt_handle = rt_handle.clone();
|
||||
let ran_repo_scan = Arc::clone(&ran_repo_scan);
|
||||
let repo_errors = Arc::clone(&repo_errors);
|
||||
let datastore = Arc::clone(&datastore);
|
||||
let datastore = Arc::clone(datastore);
|
||||
let access_map = access_map_collector.clone();
|
||||
|
||||
scope.spawn(move |_| {
|
||||
|
|
@ -509,7 +686,9 @@ pub async fn run_async_scan(
|
|||
Arc::clone(&shared_profiler),
|
||||
&repo_matcher_stats,
|
||||
)
|
||||
.and_then(|_| deduplicate_new_matches(&repo_datastore, 0))?;
|
||||
.and_then(|_| {
|
||||
deduplicate_new_matches(&repo_datastore, global_args, &args, 0)
|
||||
})?;
|
||||
|
||||
if args.baseline_file.is_some() || args.manage_baseline {
|
||||
let mut ds = repo_datastore.lock().unwrap();
|
||||
|
|
@ -568,11 +747,11 @@ pub async fn run_async_scan(
|
|||
});
|
||||
};
|
||||
|
||||
for root in repo_roots.clone() {
|
||||
for root in repo_roots.iter().cloned() {
|
||||
spawn_repo_scan(root);
|
||||
}
|
||||
|
||||
for root in repo_rx.clone().iter() {
|
||||
for root in repo_rx.iter() {
|
||||
spawn_repo_scan(root);
|
||||
}
|
||||
});
|
||||
|
|
@ -586,51 +765,27 @@ pub async fn run_async_scan(
|
|||
}
|
||||
|
||||
if output_to_file && ran_repo_scan.load(Ordering::Relaxed) {
|
||||
crate::reporter::run(global_args, Arc::clone(&datastore), args)
|
||||
crate::reporter::run(global_args, Arc::clone(datastore), args)
|
||||
.context("Failed to run report command")?;
|
||||
}
|
||||
|
||||
if !ran_repo_scan.load(Ordering::Relaxed) {
|
||||
deduplicate_new_matches(&datastore, 0)?;
|
||||
deduplicate_new_matches(datastore, global_args, args, 0)?;
|
||||
apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), repo_roots)?;
|
||||
|
||||
if args.baseline_file.is_some() || args.manage_baseline {
|
||||
let mut ds = datastore.lock().unwrap();
|
||||
crate::baseline::apply_baseline(
|
||||
&mut ds,
|
||||
baseline_path.as_ref(),
|
||||
args.manage_baseline,
|
||||
&repo_roots,
|
||||
)?;
|
||||
}
|
||||
|
||||
if let Some(validation) = &validation_deps {
|
||||
let (parser, clients, cache, rate_limiter) =
|
||||
(&validation.0, &validation.1, &validation.2, &validation.3);
|
||||
run_secret_validation(
|
||||
Arc::clone(&datastore),
|
||||
parser,
|
||||
clients,
|
||||
cache,
|
||||
args.num_jobs,
|
||||
None,
|
||||
access_map_collector.clone(),
|
||||
rate_limiter.clone(),
|
||||
Duration::from_secs(args.validation_timeout),
|
||||
args.validation_retries,
|
||||
)
|
||||
run_validation_phase(datastore, validation_deps, args, None, access_map_collector.clone())
|
||||
.await?;
|
||||
}
|
||||
|
||||
if let Some(collector) = access_map_collector.take() {
|
||||
finalize_access_map(&datastore, collector, args).await?;
|
||||
finalize_access_map(datastore, collector, args).await?;
|
||||
}
|
||||
|
||||
crate::reporter::run(global_args, Arc::clone(&datastore), args)
|
||||
crate::reporter::run(global_args, Arc::clone(datastore), args)
|
||||
.context("Failed to run report command")?;
|
||||
}
|
||||
|
||||
let aggregate_summary = if ran_repo_scan.load(Ordering::Relaxed) {
|
||||
let totals = compute_scan_totals(&datastore, args, matcher_stats.as_ref());
|
||||
let totals = compute_scan_totals(datastore, args, matcher_stats.as_ref());
|
||||
let mut sorted: Vec<_> = datastore.lock().unwrap().get_summary().into_iter().collect();
|
||||
sorted.sort_by(|a, b| b.1.cmp(&a.1));
|
||||
Some((totals, sorted))
|
||||
|
|
@ -641,7 +796,7 @@ pub async fn run_async_scan(
|
|||
print_scan_summary(
|
||||
start_time,
|
||||
scan_started_at,
|
||||
&datastore,
|
||||
datastore,
|
||||
global_args,
|
||||
args,
|
||||
rules_db,
|
||||
|
|
@ -652,14 +807,18 @@ pub async fn run_async_scan(
|
|||
aggregate_summary,
|
||||
);
|
||||
|
||||
if let Some(collector) = access_map_collector {
|
||||
finalize_access_map(&datastore, collector, args).await?;
|
||||
if let Some(collector) = access_map_collector.take() {
|
||||
finalize_access_map(datastore, collector, args).await?;
|
||||
} else {
|
||||
maybe_hint_access_map(&datastore, args);
|
||||
maybe_hint_access_map(datastore, args);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
// Existing helper functions (unchanged)
|
||||
// =================================================================================================
|
||||
|
||||
async fn finalize_access_map(
|
||||
datastore: &Arc<Mutex<FindingsStore>>,
|
||||
collector: AccessMapCollector,
|
||||
|
|
@ -780,8 +939,6 @@ pub fn create_datastore_channel(
|
|||
) {
|
||||
const BATCH_SIZE: usize = 1024;
|
||||
let channel_size = std::cmp::max(num_jobs * BATCH_SIZE, 16 * BATCH_SIZE);
|
||||
// const BATCH_SIZE: usize = 256;
|
||||
// let channel_size = std::cmp::max(num_jobs * BATCH_SIZE, 4096);
|
||||
crossbeam_channel::bounded(channel_size)
|
||||
}
|
||||
|
||||
|
|
@ -864,7 +1021,6 @@ pub fn load_and_record_rules(
|
|||
) -> Result<RulesDatabase> {
|
||||
let init_progress =
|
||||
if use_progress { ProgressBar::new_spinner() } else { ProgressBar::hidden() };
|
||||
// init_progress.set_message("Compiling rules...");
|
||||
let rules_db = {
|
||||
let loaded = RuleLoader::from_rule_specifiers(&args.rules)
|
||||
.load(args)
|
||||
|
|
@ -876,7 +1032,6 @@ pub fn load_and_record_rules(
|
|||
.cloned()
|
||||
.map(|mut rule| {
|
||||
if let Some(min_entropy) = args.min_entropy {
|
||||
// rule.syntax.min_entropy = min_entropy;
|
||||
let _ = rule.set_entropy(min_entropy);
|
||||
}
|
||||
rule
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue