From d3e659491d3314349fc2e169350aab05759a1dc8 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 14 Feb 2026 13:12:26 -0800 Subject: [PATCH] refactored code --- crates/kingfisher-core/Cargo.toml | 3 + crates/kingfisher-core/src/content_type.rs | 232 ++++++ crates/kingfisher-core/src/lib.rs | 2 + crates/kingfisher-scanner/src/lib.rs | 1 + crates/kingfisher-scanner/src/primitives.rs | 187 +++++ crates/kingfisher-scanner/src/scanner.rs | 173 +---- src/access_map.rs | 164 +++- src/azure.rs | 91 +-- src/bitbucket.rs | 86 +- src/content_type.rs | 234 +----- src/git_host.rs | 121 +++ src/gitea.rs | 80 +- src/github.rs | 82 +- src/gitlab.rs | 82 +- src/lib.rs | 1 + src/matcher/base64_decode.rs | 2 + src/matcher/captures.rs | 140 ++++ src/matcher/conversion.rs | 251 ++++++ src/matcher/dedup.rs | 2 + src/matcher/filter.rs | 274 +++++++ src/matcher/fingerprint.rs | 2 + src/{matcher.rs => matcher/mod.rs} | 777 ++---------------- src/scanner/runner.rs | 821 ++++++++++++-------- 23 files changed, 1924 insertions(+), 1884 deletions(-) create mode 100644 crates/kingfisher-core/src/content_type.rs create mode 100644 crates/kingfisher-scanner/src/primitives.rs create mode 100644 src/git_host.rs create mode 100644 src/matcher/base64_decode.rs create mode 100644 src/matcher/captures.rs create mode 100644 src/matcher/conversion.rs create mode 100644 src/matcher/dedup.rs create mode 100644 src/matcher/filter.rs create mode 100644 src/matcher/fingerprint.rs rename src/{matcher.rs => matcher/mod.rs} (58%) diff --git a/crates/kingfisher-core/Cargo.toml b/crates/kingfisher-core/Cargo.toml index ef532f3..d8a0902 100644 --- a/crates/kingfisher-core/Cargo.toml +++ b/crates/kingfisher-core/Cargo.toml @@ -44,5 +44,8 @@ gix = { version = "0.73", default-features = false, features = ["serde"] } # Console formatting console = "0.15" +# Language detection for content types +tokei = "14.0.0" + [dev-dependencies] pretty_assertions = "1.4" diff --git a/crates/kingfisher-core/src/content_type.rs 
b/crates/kingfisher-core/src/content_type.rs new file mode 100644 index 0000000..611badb --- /dev/null +++ b/crates/kingfisher-core/src/content_type.rs @@ -0,0 +1,232 @@ +use once_cell::sync::Lazy; +use std::path::Path; +use tokei::LanguageType; + +// Precompute all (shebang_prefix_bytes, language) pairs once. +// Sort longest-first so more specific shebangs win. +static SHEBANG_PREFIXES: Lazy> = Lazy::new(|| { + let mut v = Vec::new(); + for &(lang, shebangs) in LanguageType::list() { + for &sb in shebangs { + v.push((sb.as_bytes(), lang)); + } + } + // Longest prefix first to prefer e.g. "#!/usr/bin/env python3" over "#!/usr/bin/env python" + v.sort_by(|a, b| b.0.len().cmp(&a.0.len())); + v +}); + +/// The type of content detected. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ContentType { + /// Unprintable or control-heavy data. + BINARY, + /// Mostly printable text. + TEXT, +} + +/// Heuristic thresholds for text vs. binary detection. +pub struct ContentInspector { + max_null_bytes: usize, + max_control_ratio: f64, +} + +impl Default for ContentInspector { + fn default() -> Self { + Self { max_null_bytes: 4, max_control_ratio: 0.3 } + } +} + +impl ContentInspector { + /// Create a new inspector with default thresholds. + #[inline] + pub fn new() -> Self { + Default::default() + } + + /// Classify `bytes` as TEXT or BINARY: + /// + /// 1. If null-byte count > `max_null_bytes` -- `BINARY`. + /// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` → + /// `BINARY`. + /// 3. Otherwise, `TEXT`. 
+ #[inline] + #[must_use] + pub fn inspect(&self, bytes: &[u8]) -> ContentType { + let nulls = bytes.iter().filter(|&&b| b == 0).count(); + if nulls > self.max_null_bytes { + return ContentType::BINARY; + } + let controls = + bytes.iter().filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t')).count(); + let ratio = if bytes.is_empty() { 0.0 } else { controls as f64 / bytes.len() as f64 }; + if ratio > self.max_control_ratio { + ContentType::BINARY + } else { + ContentType::TEXT + } + } + + /// Guess MIME type from `path` extension. + /// + /// Returns: + /// - `Some(mime)` if the extension is one of the known text or image types. + /// - `None` if there is no extension or it’s unrecognized. + #[inline] + #[must_use] + pub fn guess_mime_type(&self, path: &Path) -> Option { + let ext = path.extension()?.to_str()?.to_ascii_lowercase(); + let mime = match ext.as_str() { + "txt" | "md" | "rst" => "text/plain", + "html" | "htm" => "text/html", + "css" => "text/css", + "js" => "application/javascript", + "json" => "application/json", + "xml" => "application/xml", + "pdf" => "application/pdf", + "jpg" | "jpeg" => "image/jpeg", + "png" => "image/png", + "gif" => "image/gif", + _ => return None, + }; + Some(mime.to_string()) + } + + /// Detect UTF-8 encoding by attempting a lossless conversion. + #[inline] + #[must_use] + pub fn guess_charset(&self, bytes: &[u8]) -> Option { + std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string()) + } + /// Guess programming language with broad coverage using `tokei`. + /// + /// Strategy (no disk I/O): + /// 1) Try extension via `LanguageType::from_file_extension`. + /// 2) Handle common extensionless filenames (e.g., Makefile, Dockerfile, CMakeLists.txt). + /// 3) Parse an in-memory shebang (first line) against `LanguageType::shebangs`. + /// 4) Minimal content markers as a last resort. + /// + /// Returns the canonical `tokei` language name (e.g., `Rust`, `Bash`, `Python`). 
+ #[inline] + #[must_use] + pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option { + // 1) Extension mapping (fast, no I/O). + if let Some(ext) = path.extension().and_then(|e| e.to_str()) { + if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) { + return Some(lang.name().to_string()); + } + } + + // 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure). + if let Some(file) = path.file_name().and_then(|f| f.to_str()) { + match file { + "Makefile" | "makefile" => { + return Some(LanguageType::Makefile.name().to_string()); + } + "Dockerfile" | "dockerfile" => { + return Some(LanguageType::Dockerfile.name().to_string()); + } + "CMakeLists.txt" => { + return Some(LanguageType::CMake.name().to_string()); + } + "Rakefile" | "rakefile" => { + return Some(LanguageType::Rakefile.name().to_string()); + } + // Common ecosystem files; map to their primary language where sensible. + "Gemfile" | "gemfile" => { + return Some(LanguageType::Ruby.name().to_string()); + } + _ => {} + } + } + + // 3) Shebang detection (in-memory): match by longest prefix, byte-wise (no UTF-8 needed). + if let Some(first_line) = content.split(|&b| b == b'\n').next() { + if first_line.starts_with(b"#!") { + for (prefix, lang) in SHEBANG_PREFIXES.iter() { + if first_line.starts_with(prefix) { + return Some(lang.name().to_string()); + } + } + } + } + + // 4) Lightweight content markers to catch a few ubiquitous cases without I/O. 
+ let s = String::from_utf8_lossy(content); + if s.contains(" ContentType { + ContentInspector::default().inspect(bytes) +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use super::*; + + #[test] + fn binary_vs_text() { + let ins = ContentInspector::default(); + let bin = vec![0, 1, 2, 0, 0, 0, 5]; + assert_eq!(ins.inspect(&bin), ContentType::BINARY); + let txt = b"Hello\nWorld"; + assert_eq!(ins.inspect(txt), ContentType::TEXT); + } + + #[test] + fn mime_guess() { + let ins = ContentInspector::default(); + assert_eq!(ins.guess_mime_type(&PathBuf::from("a.md")), Some("text/plain".into())); + assert_eq!(ins.guess_mime_type(&PathBuf::from("img.png")), Some("image/png".into())); + assert_eq!(ins.guess_mime_type(&PathBuf::from("x.xyz")), None); + } + + #[test] + fn charset_guess() { + let ins = ContentInspector::default(); + assert_eq!(ins.guess_charset("ok".as_bytes()), Some("UTF-8".into())); + assert_eq!(ins.guess_charset(&[0xFF, 0xFE, 0xFD]), None); + } + + #[test] + fn language_guess() { + let ins = ContentInspector::default(); + + // Compare case-insensitively by lowercasing both sides. 
+ let rust = + ins.guess_language(&PathBuf::from("main.rs"), b"").map(|s| s.to_ascii_lowercase()); + assert_eq!(rust, Some("rust".into())); + + let php = ins + .guess_language(&PathBuf::from("x"), b"") + .map(|s| s.to_ascii_lowercase()); + assert_eq!(php, Some("php".into())); + + let bash = ins + .guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi") + .map(|s| s.to_ascii_lowercase()); + assert_eq!(bash, Some("bash".into())); + } +} diff --git a/crates/kingfisher-core/src/lib.rs b/crates/kingfisher-core/src/lib.rs index dc964f3..f656ea2 100644 --- a/crates/kingfisher-core/src/lib.rs +++ b/crates/kingfisher-core/src/lib.rs @@ -9,6 +9,7 @@ pub mod blob; pub mod bstring_escape; +pub mod content_type; pub mod entropy; pub mod error; pub mod git_commit_metadata; @@ -20,6 +21,7 @@ pub use blob::{ Blob, BlobAppearance, BlobAppearanceSet, BlobData, BlobId, BlobIdMap, BlobMetadata, }; pub use bstring_escape::Escaped; +pub use content_type::{ContentInspector, ContentType}; pub use entropy::calculate_shannon_entropy; pub use error::{Error, Result}; pub use git_commit_metadata::CommitMetadata; diff --git a/crates/kingfisher-scanner/src/lib.rs b/crates/kingfisher-scanner/src/lib.rs index 620cb61..f6976ea 100644 --- a/crates/kingfisher-scanner/src/lib.rs +++ b/crates/kingfisher-scanner/src/lib.rs @@ -52,6 +52,7 @@ //! - **validation-all**: Enable all validation features mod finding; +pub mod primitives; mod scanner; mod scanner_pool; diff --git a/crates/kingfisher-scanner/src/primitives.rs b/crates/kingfisher-scanner/src/primitives.rs new file mode 100644 index 0000000..a9e11fe --- /dev/null +++ b/crates/kingfisher-scanner/src/primitives.rs @@ -0,0 +1,187 @@ +//! Shared matching primitives for secret detection. +//! +//! These functions are used by both the high-level `Scanner` API and the +//! binary crate's `Matcher`. Having a single canonical implementation +//! eliminates duplicated logic across the codebase. 
+ +use std::hash::{Hash, Hasher}; + +use base64::{engine::general_purpose, Engine}; +use kingfisher_core::OffsetSpan; +use rustc_hash::{FxHashMap, FxHasher}; +use xxhash_rust::xxh3::xxh3_64; + +// ------------------------------------------------------------------------------------------------- +// Base64 detection +// ------------------------------------------------------------------------------------------------- + +/// Decoded Base64 data with position information. +#[derive(Debug, Clone)] +pub struct DecodedData { + pub decoded: Vec, + pub pos_start: usize, + pub pos_end: usize, +} + +#[inline] +pub fn is_base64_byte(b: u8) -> bool { + // Accepts both standard base64 ('+', '/') and URL-safe base64 ('-', '_') characters. + matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_') +} + +/// Finds standalone Base64-encoded strings in the input and returns decoded data +/// with byte-offset positions. +pub fn get_base64_strings(input: &[u8]) -> Vec { + let mut results = Vec::new(); + let mut i = 0; + while i < input.len() { + while i < input.len() && !is_base64_byte(input[i]) { + i += 1; + } + let start = i; + while i < input.len() && is_base64_byte(input[i]) { + i += 1; + } + + let mut eq_count = 0; + while i < input.len() && input[i] == b'=' && eq_count < 2 { + i += 1; + eq_count += 1; + } + let end = i; + + let len = end - start; + if len >= 32 && len % 4 == 0 { + let base64_slice = &input[start..end]; + + // Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD + let decode_result = general_purpose::STANDARD + .decode(base64_slice) + .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice)) + .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice)); + + if let Ok(decoded) = decode_result { + if decoded.is_ascii() { + results.push(DecodedData { decoded, pos_start: start, pos_end: end }); + } + } + } + } + + results +} + +// 
------------------------------------------------------------------------------------------------- +// Match deduplication +// ------------------------------------------------------------------------------------------------- + +/// Computes a deduplication key for a match based on content, rule ID, and span. +#[inline] +pub fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 { + let mut hasher = FxHasher::default(); + // Hash each component directly without allocation + content.hash(&mut hasher); + rule_id.hash(&mut hasher); + start.hash(&mut hasher); + end.hash(&mut hasher); + hasher.finish() +} + +/// Inserts a span into a sorted list of spans, handling containment. +/// +/// Returns `false` if the span is already contained in an existing span +/// (i.e., it's redundant and should be skipped). +#[inline] +pub fn insert_span(spans: &mut Vec, span: OffsetSpan) -> bool { + let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i); + if idx > 0 { + if spans[idx - 1].fully_contains(&span) { + return false; + } + if span.fully_contains(&spans[idx - 1]) { + spans.remove(idx - 1); + idx -= 1; + } + } + if idx < spans.len() { + if spans[idx].fully_contains(&span) { + return false; + } + if span.fully_contains(&spans[idx]) { + spans.remove(idx); + } + } + spans.insert(idx, span); + true +} + +/// Records a match span for a given rule, returning `false` if it's a duplicate. +#[inline] +pub fn record_match( + map: &mut FxHashMap>, + rule_id: usize, + span: OffsetSpan, +) -> bool { + insert_span(map.entry(rule_id).or_default(), span) +} + +// ------------------------------------------------------------------------------------------------- +// Finding fingerprint +// ------------------------------------------------------------------------------------------------- + +/// Computes a stable fingerprint for a finding based on its value, location, and origin. 
+pub fn compute_finding_fingerprint( + finding_value: &str, + file_or_commit: &str, + offset_start: u64, + offset_end: u64, +) -> u64 { + // Combine all into a byte buffer and hash it directly: + let mut buf = Vec::with_capacity( + finding_value.len() + file_or_commit.len() + 2 * std::mem::size_of::(), + ); + buf.extend_from_slice(finding_value.as_bytes()); + buf.extend_from_slice(file_or_commit.as_bytes()); + buf.extend_from_slice(&offset_start.to_le_bytes()); + buf.extend_from_slice(&offset_end.to_le_bytes()); + + xxh3_64(&buf) +} + +// ------------------------------------------------------------------------------------------------- +// Secret capture selection +// ------------------------------------------------------------------------------------------------- + +/// Selects the "secret" capture from the regex match using the priority: +/// 1. Named capture called TOKEN (case-insensitive) +/// 2. First matched named capture +/// 3. First positional capture (group 1) +/// 4. Full match (group 0) +pub fn find_secret_capture<'a>( + re: ®ex::bytes::Regex, + captures: ®ex::bytes::Captures<'a>, +) -> regex::bytes::Match<'a> { + // 1. Prefer a named capture called TOKEN (case-insensitive). + if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| { + name_opt.filter(|name| name.eq_ignore_ascii_case("TOKEN")).and_then(|_| captures.get(i)) + }) { + return token_cap; + } + + // 2. Otherwise, prefer the first *matched* named capture. + if let Some(named_cap) = re + .capture_names() + .enumerate() + .find_map(|(i, name_opt)| name_opt.and_then(|_| captures.get(i))) + { + return named_cap; + } + + // 3. Otherwise, fall back to the first positional capture (group 1). + if let Some(pos_cap) = captures.get(1) { + return pos_cap; + } + + // 4. Finally, fall back to the full match (group 0). 
+ captures.get(0).unwrap() +} diff --git a/crates/kingfisher-scanner/src/scanner.rs b/crates/kingfisher-scanner/src/scanner.rs index 9917f26..d3d83d6 100644 --- a/crates/kingfisher-scanner/src/scanner.rs +++ b/crates/kingfisher-scanner/src/scanner.rs @@ -5,15 +5,13 @@ use std::path::Path; use std::sync::Arc; use anyhow::Result; -use base64::{engine::general_purpose, Engine}; use kingfisher_core::{calculate_shannon_entropy, Blob, BlobIdMap, LocationMapping, OffsetSpan}; use kingfisher_rules::RulesDatabase; -use regex::bytes::Regex; use rustc_hash::{FxHashMap, FxHashSet}; use tracing::debug; -use xxhash_rust::xxh3::xxh3_64; use crate::finding::{Finding, FindingLocation}; +use crate::primitives; use crate::scanner_pool::ScannerPool; /// Configuration options for the scanner. @@ -185,21 +183,15 @@ impl Scanner { let current_span = OffsetSpan::from_range(start..end); // Check for overlapping spans - if !self.record_span(&mut previous_spans, rule_id, current_span) { + if !primitives::record_match(&mut previous_spans, rule_id, current_span) { continue; } let haystack = &bytes[start..end]; for captures in anchored_regex.captures_iter(haystack) { - let full_capture = match captures.get(0) { - Some(c) => c, - None => continue, - }; - // Get the primary secret value - let secret_capture = - self.get_secret_capture(&anchored_regex, &captures, full_capture); + let secret_capture = primitives::find_secret_capture(&anchored_regex, &captures); let secret_bytes = secret_capture.as_bytes(); // Check entropy @@ -211,7 +203,7 @@ impl Scanner { } // Compute match key for dedup - let match_key = self.compute_match_key( + let match_key = primitives::compute_match_key( secret_bytes, rule.id().as_bytes(), start + secret_capture.start(), @@ -242,7 +234,7 @@ impl Scanner { } } - let fingerprint = self.compute_fingerprint( + let fingerprint = primitives::compute_finding_fingerprint( &secret, &blob.id().to_string(), offset_span.start as u64, @@ -295,92 +287,6 @@ impl Scanner { // In a real 
implementation, you'd want to add a clear method or use a different approach } - fn get_secret_capture<'a>( - &self, - regex: &Regex, - captures: ®ex::bytes::Captures<'a>, - full_capture: regex::bytes::Match<'a>, - ) -> regex::bytes::Match<'a> { - // Prefer named capture called TOKEN - for (i, name_opt) in regex.capture_names().enumerate() { - if let Some(name) = name_opt { - if name.eq_ignore_ascii_case("TOKEN") { - if let Some(cap) = captures.get(i) { - return cap; - } - } - } - } - - // Otherwise, first named capture - for (i, name_opt) in regex.capture_names().enumerate() { - if name_opt.is_some() { - if let Some(cap) = captures.get(i) { - return cap; - } - } - } - - // Otherwise, first positional capture (group 1) - if let Some(cap) = captures.get(1) { - return cap; - } - - // Fall back to full match - full_capture - } - - fn record_span( - &self, - map: &mut FxHashMap>, - rule_id: usize, - span: OffsetSpan, - ) -> bool { - let spans = map.entry(rule_id).or_default(); - - // Binary search for insertion point - let idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i); - - // Check if new span is contained in an existing one - if idx > 0 && spans[idx - 1].fully_contains(&span) { - return false; - } - if idx < spans.len() && spans[idx].fully_contains(&span) { - return false; - } - - // Remove spans that the new span contains - let remove_idx = idx; - while remove_idx < spans.len() && span.fully_contains(&spans[remove_idx]) { - spans.remove(remove_idx); - } - if idx > 0 && span.fully_contains(&spans[idx - 1]) { - spans.remove(idx - 1); - } - - spans.insert(idx.min(spans.len()), span); - true - } - - fn compute_match_key(&self, content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 { - use std::hash::{Hash, Hasher}; - let mut hasher = rustc_hash::FxHasher::default(); - content.hash(&mut hasher); - rule_id.hash(&mut hasher); - start.hash(&mut hasher); - end.hash(&mut hasher); - hasher.finish() - } - - fn 
compute_fingerprint(&self, value: &str, blob_id: &str, start: u64, end: u64) -> u64 { - let mut buf = Vec::with_capacity(value.len() + blob_id.len() + 16); - buf.extend_from_slice(value.as_bytes()); - buf.extend_from_slice(blob_id.as_bytes()); - buf.extend_from_slice(&start.to_le_bytes()); - buf.extend_from_slice(&end.to_le_bytes()); - xxh3_64(&buf) - } - fn redact(&self, bytes: &[u8]) -> String { let s = String::from_utf8_lossy(bytes); if s.len() <= 8 { @@ -400,7 +306,7 @@ impl Scanner { let bytes = blob.bytes(); // Find Base64-encoded strings - let b64_items = self.find_base64_strings(bytes); + let b64_items = primitives::get_base64_strings(bytes); for item in b64_items { // Try to match decoded content against all rules @@ -411,12 +317,7 @@ impl Scanner { }; for captures in regex.captures_iter(&item.decoded) { - let full_capture = match captures.get(0) { - Some(c) => c, - None => continue, - }; - - let secret_capture = self.get_secret_capture(®ex, &captures, full_capture); + let secret_capture = primitives::find_secret_capture(®ex, &captures); let secret_bytes = secret_capture.as_bytes(); let min_entropy = @@ -426,7 +327,7 @@ impl Scanner { continue; } - let match_key = self.compute_match_key( + let match_key = primitives::compute_match_key( secret_bytes, rule.id().as_bytes(), item.pos_start, @@ -455,7 +356,7 @@ impl Scanner { } } - let fingerprint = self.compute_fingerprint( + let fingerprint = primitives::compute_finding_fingerprint( &secret, &blob.id().to_string(), offset_span.start as u64, @@ -488,62 +389,6 @@ impl Scanner { findings } - - fn find_base64_strings(&self, input: &[u8]) -> Vec { - let mut results = Vec::new(); - let mut i = 0; - - while i < input.len() { - // Skip non-base64 characters - while i < input.len() && !Self::is_base64_byte(input[i]) { - i += 1; - } - let start = i; - - // Collect base64 characters - while i < input.len() && Self::is_base64_byte(input[i]) { - i += 1; - } - - // Handle padding - let mut eq_count = 0; - while i < 
input.len() && input[i] == b'=' && eq_count < 2 { - i += 1; - eq_count += 1; - } - let end = i; - - let len = end - start; - if len >= 32 && len % 4 == 0 { - let base64_slice = &input[start..end]; - - // Try decoding - let decode_result = general_purpose::STANDARD - .decode(base64_slice) - .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice)) - .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice)); - - if let Ok(decoded) = decode_result { - if decoded.is_ascii() { - results.push(DecodedData { decoded, pos_start: start, pos_end: end }); - } - } - } - } - - results - } - - #[inline] - fn is_base64_byte(b: u8) -> bool { - matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_') - } -} - -struct DecodedData { - decoded: Vec, - pos_start: usize, - pos_end: usize, } #[cfg(test)] diff --git a/src/access_map.rs b/src/access_map.rs index dc26d9f..eb07647 100644 --- a/src/access_map.rs +++ b/src/access_map.rs @@ -18,6 +18,22 @@ pub(crate) mod postgres; mod report; mod slack; +/// Trait for access map providers that map a single token to an access profile. +/// +/// This covers the majority of providers (GitHub, GitLab, Slack, HuggingFace, +/// Gitea, Bitbucket). Providers with more complex credentials (AWS, GCP, Azure, +/// Postgres, MongoDB) use their own custom interfaces. +pub trait TokenAccessMapper: Send + Sync { + /// The cloud/platform name for results (e.g., `"github"`, `"slack"`). + fn cloud_name(&self) -> &'static str; + + /// Maps a single token to an access map result. + fn map_access_from_token( + &self, + token: &str, + ) -> impl std::future::Future> + Send; +} + /// Run the identity mapping workflow for the selected cloud provider. 
pub async fn run(args: AccessMapArgs) -> Result<()> { let result = match args.provider { @@ -244,24 +260,15 @@ pub async fn map_requests(requests: Vec) -> Vec ( - github::map_access_from_token(&token) - .await - .unwrap_or_else(|err| build_failed_result("github", "token", err)), - fingerprint, - ), - AccessMapRequest::Gitlab { token, fingerprint } => ( - gitlab::map_access_from_token(&token) - .await - .unwrap_or_else(|err| build_failed_result("gitlab", "token", err)), - fingerprint, - ), - AccessMapRequest::Slack { token, fingerprint } => ( - slack::map_access_from_token(&token) - .await - .unwrap_or_else(|err| build_failed_result("slack", "token", err)), - fingerprint, - ), + AccessMapRequest::Github { token, fingerprint } => { + (map_token(&GithubMapper, &token).await, fingerprint) + } + AccessMapRequest::Gitlab { token, fingerprint } => { + (map_token(&GitlabMapper, &token).await, fingerprint) + } + AccessMapRequest::Slack { token, fingerprint } => { + (map_token(&SlackMapper, &token).await, fingerprint) + } AccessMapRequest::Postgres { uri, fingerprint } => ( postgres::map_access_from_uri(&uri) .await @@ -274,24 +281,15 @@ pub async fn map_requests(requests: Vec) -> Vec ( - huggingface::map_access_from_token(&token) - .await - .unwrap_or_else(|err| build_failed_result("huggingface", "token", err)), - fingerprint, - ), - AccessMapRequest::Gitea { token, fingerprint } => ( - gitea::map_access_from_token(&token) - .await - .unwrap_or_else(|err| build_failed_result("gitea", "token", err)), - fingerprint, - ), - AccessMapRequest::Bitbucket { token, fingerprint } => ( - bitbucket::map_access_from_token(&token) - .await - .unwrap_or_else(|err| build_failed_result("bitbucket", "token", err)), - fingerprint, - ), + AccessMapRequest::HuggingFace { token, fingerprint } => { + (map_token(&HuggingFaceMapper, &token).await, fingerprint) + } + AccessMapRequest::Gitea { token, fingerprint } => { + (map_token(&GiteaMapper, &token).await, fingerprint) + } + 
AccessMapRequest::Bitbucket { token, fingerprint } => { + (map_token(&BitbucketMapper, &token).await, fingerprint) + } }; mapped.fingerprint = Some(fp); @@ -301,12 +299,106 @@ pub async fn map_requests(requests: Vec) -> Vec AccessMapResult { + mapper + .map_access_from_token(token) + .await + .unwrap_or_else(|err| build_failed_result(mapper.cloud_name(), "token", err)) +} + /// Write HTML/JSON outputs for a collection of identity map results. pub fn write_reports(results: &[AccessMapResult], html_out: &std::path::Path) -> Result<()> { report::generate_html_report_multi(results, html_out)?; Ok(()) } +// ------------------------------------------------------------------------------------------------- +// TokenAccessMapper implementations +// ------------------------------------------------------------------------------------------------- + +/// GitHub access mapper. +pub struct GithubMapper; + +impl TokenAccessMapper for GithubMapper { + fn cloud_name(&self) -> &'static str { + "github" + } + + async fn map_access_from_token(&self, token: &str) -> Result { + github::map_access_from_token(token).await + } +} + +/// GitLab access mapper. +pub struct GitlabMapper; + +impl TokenAccessMapper for GitlabMapper { + fn cloud_name(&self) -> &'static str { + "gitlab" + } + + async fn map_access_from_token(&self, token: &str) -> Result { + gitlab::map_access_from_token(token).await + } +} + +/// Slack access mapper. +pub struct SlackMapper; + +impl TokenAccessMapper for SlackMapper { + fn cloud_name(&self) -> &'static str { + "slack" + } + + async fn map_access_from_token(&self, token: &str) -> Result { + slack::map_access_from_token(token).await + } +} + +/// HuggingFace access mapper. +pub struct HuggingFaceMapper; + +impl TokenAccessMapper for HuggingFaceMapper { + fn cloud_name(&self) -> &'static str { + "huggingface" + } + + async fn map_access_from_token(&self, token: &str) -> Result { + huggingface::map_access_from_token(token).await + } +} + +/// Gitea access mapper. 
+pub struct GiteaMapper; + +impl TokenAccessMapper for GiteaMapper { + fn cloud_name(&self) -> &'static str { + "gitea" + } + + async fn map_access_from_token(&self, token: &str) -> Result { + gitea::map_access_from_token(token).await + } +} + +/// Bitbucket access mapper. +pub struct BitbucketMapper; + +impl TokenAccessMapper for BitbucketMapper { + fn cloud_name(&self) -> &'static str { + "bitbucket" + } + + async fn map_access_from_token(&self, token: &str) -> Result { + bitbucket::map_access_from_token(token).await + } +} + +// ------------------------------------------------------------------------------------------------- +// Helper functions +// ------------------------------------------------------------------------------------------------- + fn severity_to_str(severity: Severity) -> &'static str { match severity { Severity::Low => "low", diff --git a/src/azure.rs b/src/azure.rs index 9a3b6d5..9f2a752 100644 --- a/src/azure.rs +++ b/src/azure.rs @@ -13,13 +13,12 @@ use std::{ // let us opt into newer API versions as Microsoft rolls them out. 
use anyhow::{anyhow, Context, Result}; -use globset::{Glob, GlobSet, GlobSetBuilder}; use indicatif::{ProgressBar, ProgressStyle}; use serde::Deserialize; use tracing::warn; use url::{form_urlencoded, Url}; -use crate::{findings_store, git_url::GitUrl}; +use crate::{findings_store, git_host, git_url::GitUrl}; const API_VERSION: &str = "7.1-preview.1"; @@ -55,33 +54,6 @@ impl RepoSpecifiers { } } -#[derive(Debug)] -struct ExcludeMatcher { - exact: HashSet, - globs: Option, -} - -impl ExcludeMatcher { - fn matches(&self, name: &str) -> bool { - let candidate = name.to_lowercase(); - if self.exact.contains(&candidate) { - return true; - } - if let Some(globs) = &self.globs { - return globs.is_match(&candidate); - } - false - } - - fn is_empty(&self) -> bool { - self.exact.is_empty() && self.globs.is_none() - } -} - -fn looks_like_glob(pattern: &str) -> bool { - pattern.contains('*') || pattern.contains('?') || pattern.contains('[') -} - fn encode_segment(segment: &str) -> String { form_urlencoded::byte_serialize(segment.as_bytes()).collect::() } @@ -129,7 +101,7 @@ fn parse_repo_identifier_from_path(path: &str) -> Option { let org = org.to_lowercase(); let project_raw = project.to_string(); - if looks_like_glob(&project_raw) { + if git_host::looks_like_glob(&project_raw) { let pattern = format!("{org}/{}/**", project_raw.to_lowercase()); return Some(pattern); } @@ -197,59 +169,16 @@ fn parse_excluded_repo(raw: &str) -> Option { parse_repo_identifier_from_path(trimmed) } -fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher { - let mut exact = HashSet::new(); - let mut glob_builder = GlobSetBuilder::new(); - let mut has_glob = false; - - for raw in exclude_repos { - match parse_excluded_repo(raw) { - Some(name) => { - let normalized = name.to_lowercase(); - if looks_like_glob(&normalized) { - match Glob::new(&normalized) { - Ok(glob) => { - glob_builder.add(glob); - has_glob = true; - } - Err(err) => { - warn!("Ignoring invalid Azure exclusion pattern 
'{raw}': {err}"); - exact.insert(normalized); - } - } - } else { - exact.insert(normalized); - } - } - None => { - warn!("Ignoring invalid Azure exclusion '{raw}' (expected organization/project[/repository])"); - } - } - } - - let globs = if has_glob { - match glob_builder.build() { - Ok(set) => Some(set), - Err(err) => { - warn!("Failed to build Azure exclusion patterns: {err}"); - None - } - } - } else { - None - }; - - ExcludeMatcher { exact, globs } +fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher { + git_host::build_exclude_matcher( + exclude_repos, + |raw| parse_excluded_repo(raw).map(|name| name.to_lowercase()), + "Azure", + ) } -fn should_exclude_repo(repo_url: &str, excludes: &ExcludeMatcher) -> bool { - if excludes.is_empty() { - return false; - } - if let Some(name) = parse_repo_identifier_from_url(repo_url) { - return excludes.matches(&name); - } - false +fn should_exclude_repo(repo_url: &str, excludes: &git_host::ExcludeMatcher) -> bool { + git_host::should_exclude_repo(repo_url, excludes, parse_repo_identifier_from_url) } #[derive(Debug, Deserialize, Default)] diff --git a/src/bitbucket.rs b/src/bitbucket.rs index 12edf6c..948cf90 100644 --- a/src/bitbucket.rs +++ b/src/bitbucket.rs @@ -6,8 +6,8 @@ use std::{ time::Duration, }; +use crate::git_host; use anyhow::{Context, Result}; -use globset::{Glob, GlobSet, GlobSetBuilder}; use indicatif::{ProgressBar, ProgressStyle}; use reqwest::Url; use serde::Deserialize; @@ -136,32 +136,6 @@ impl BitbucketKind { } } -#[derive(Debug)] -struct ExcludeMatcher { - exact: HashSet, - globs: Option, -} - -impl ExcludeMatcher { - fn matches(&self, name: &str) -> bool { - if self.exact.contains(name) { - return true; - } - if let Some(globs) = &self.globs { - return globs.is_match(name); - } - false - } - - fn is_empty(&self) -> bool { - self.exact.is_empty() && self.globs.is_none() - } -} - -fn looks_like_glob(pattern: &str) -> bool { - pattern.contains('*') || pattern.contains('?') || 
pattern.contains('[') -} - fn normalize_repo_identifier(owner: &str, repo: &str) -> Option { let owner = owner.trim().trim_matches('/'); let repo = repo.trim().trim_matches('/'); @@ -207,58 +181,12 @@ fn parse_excluded_repo(raw: &str) -> Option { parse_repo_name_from_path(trimmed) } -fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher { - let mut exact = HashSet::new(); - let mut glob_builder = GlobSetBuilder::new(); - let mut has_glob = false; - - for raw in exclude_repos { - match parse_excluded_repo(raw) { - Some(name) => { - if looks_like_glob(&name) { - match Glob::new(&name) { - Ok(glob) => { - glob_builder.add(glob); - has_glob = true; - } - Err(err) => { - warn!("Ignoring invalid Bitbucket exclusion pattern '{raw}': {err}"); - exact.insert(name); - } - } - } else { - exact.insert(name); - } - } - None => { - warn!("Ignoring invalid Bitbucket exclusion '{raw}' (expected owner/repo)"); - } - } - } - - let globs = if has_glob { - match glob_builder.build() { - Ok(set) => Some(set), - Err(err) => { - warn!("Failed to build Bitbucket exclusion patterns: {err}"); - None - } - } - } else { - None - }; - - ExcludeMatcher { exact, globs } +fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher { + git_host::build_exclude_matcher(exclude_repos, parse_excluded_repo, "Bitbucket") } -fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool { - if excludes.is_empty() { - return false; - } - if let Some(name) = parse_repo_name_from_url(clone_url) { - return excludes.matches(&name); - } - false +fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool { + git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url) } fn repo_clone_url_from_links(links: &[CloneLink]) -> Option { @@ -343,7 +271,7 @@ async fn fetch_cloud_repositories( owner: &str, auth: &AuthConfig, repo_filter: RepoType, - excludes: &ExcludeMatcher, + excludes: &git_host::ExcludeMatcher, results: &mut Vec, 
) -> Result<()> { let mut next = base @@ -387,7 +315,7 @@ async fn fetch_server_repositories( path: &str, auth: &AuthConfig, repo_filter: RepoType, - excludes: &ExcludeMatcher, + excludes: &git_host::ExcludeMatcher, results: &mut Vec, ) -> Result<()> { let mut start = 0u64; diff --git a/src/content_type.rs b/src/content_type.rs index 611badb..f2ffbc6 100644 --- a/src/content_type.rs +++ b/src/content_type.rs @@ -1,232 +1,2 @@ -use once_cell::sync::Lazy; -use std::path::Path; -use tokei::LanguageType; - -// Precompute all (shebang_prefix_bytes, language) pairs once. -// Sort longest-first so more specific shebangs win. -static SHEBANG_PREFIXES: Lazy> = Lazy::new(|| { - let mut v = Vec::new(); - for &(lang, shebangs) in LanguageType::list() { - for &sb in shebangs { - v.push((sb.as_bytes(), lang)); - } - } - // Longest prefix first to prefer e.g. "#!/usr/bin/env python3" over "#!/usr/bin/env python" - v.sort_by(|a, b| b.0.len().cmp(&a.0.len())); - v -}); - -/// The type of content detected. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum ContentType { - /// Unprintable or control-heavy data. - BINARY, - /// Mostly printable text. - TEXT, -} - -/// Heuristic thresholds for text vs. binary detection. -pub struct ContentInspector { - max_null_bytes: usize, - max_control_ratio: f64, -} - -impl Default for ContentInspector { - fn default() -> Self { - Self { max_null_bytes: 4, max_control_ratio: 0.3 } - } -} - -impl ContentInspector { - /// Create a new inspector with default thresholds. - #[inline] - pub fn new() -> Self { - Default::default() - } - - /// Classify `bytes` as TEXT or BINARY: - /// - /// 1. If null-byte count > `max_null_bytes` -- `BINARY`. - /// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` → - /// `BINARY`. - /// 3. Otherwise, `TEXT`. 
- #[inline] - #[must_use] - pub fn inspect(&self, bytes: &[u8]) -> ContentType { - let nulls = bytes.iter().filter(|&&b| b == 0).count(); - if nulls > self.max_null_bytes { - return ContentType::BINARY; - } - let controls = - bytes.iter().filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t')).count(); - let ratio = if bytes.is_empty() { 0.0 } else { controls as f64 / bytes.len() as f64 }; - if ratio > self.max_control_ratio { - ContentType::BINARY - } else { - ContentType::TEXT - } - } - - /// Guess MIME type from `path` extension. - /// - /// Returns: - /// - `Some(mime)` if the extension is one of the known text or image types. - /// - `None` if there is no extension or it’s unrecognized. - #[inline] - #[must_use] - pub fn guess_mime_type(&self, path: &Path) -> Option { - let ext = path.extension()?.to_str()?.to_ascii_lowercase(); - let mime = match ext.as_str() { - "txt" | "md" | "rst" => "text/plain", - "html" | "htm" => "text/html", - "css" => "text/css", - "js" => "application/javascript", - "json" => "application/json", - "xml" => "application/xml", - "pdf" => "application/pdf", - "jpg" | "jpeg" => "image/jpeg", - "png" => "image/png", - "gif" => "image/gif", - _ => return None, - }; - Some(mime.to_string()) - } - - /// Detect UTF-8 encoding by attempting a lossless conversion. - #[inline] - #[must_use] - pub fn guess_charset(&self, bytes: &[u8]) -> Option { - std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string()) - } - /// Guess programming language with broad coverage using `tokei`. - /// - /// Strategy (no disk I/O): - /// 1) Try extension via `LanguageType::from_file_extension`. - /// 2) Handle common extensionless filenames (e.g., Makefile, Dockerfile, CMakeLists.txt). - /// 3) Parse an in-memory shebang (first line) against `LanguageType::shebangs`. - /// 4) Minimal content markers as a last resort. - /// - /// Returns the canonical `tokei` language name (e.g., `Rust`, `Bash`, `Python`). 
- #[inline] - #[must_use] - pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option { - // 1) Extension mapping (fast, no I/O). - if let Some(ext) = path.extension().and_then(|e| e.to_str()) { - if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) { - return Some(lang.name().to_string()); - } - } - - // 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure). - if let Some(file) = path.file_name().and_then(|f| f.to_str()) { - match file { - "Makefile" | "makefile" => { - return Some(LanguageType::Makefile.name().to_string()); - } - "Dockerfile" | "dockerfile" => { - return Some(LanguageType::Dockerfile.name().to_string()); - } - "CMakeLists.txt" => { - return Some(LanguageType::CMake.name().to_string()); - } - "Rakefile" | "rakefile" => { - return Some(LanguageType::Rakefile.name().to_string()); - } - // Common ecosystem files; map to their primary language where sensible. - "Gemfile" | "gemfile" => { - return Some(LanguageType::Ruby.name().to_string()); - } - _ => {} - } - } - - // 3) Shebang detection (in-memory): match by longest prefix, byte-wise (no UTF-8 needed). - if let Some(first_line) = content.split(|&b| b == b'\n').next() { - if first_line.starts_with(b"#!") { - for (prefix, lang) in SHEBANG_PREFIXES.iter() { - if first_line.starts_with(prefix) { - return Some(lang.name().to_string()); - } - } - } - } - - // 4) Lightweight content markers to catch a few ubiquitous cases without I/O. 
- let s = String::from_utf8_lossy(content); - if s.contains(" ContentType { - ContentInspector::default().inspect(bytes) -} - -#[cfg(test)] -mod tests { - use std::path::PathBuf; - - use super::*; - - #[test] - fn binary_vs_text() { - let ins = ContentInspector::default(); - let bin = vec![0, 1, 2, 0, 0, 0, 5]; - assert_eq!(ins.inspect(&bin), ContentType::BINARY); - let txt = b"Hello\nWorld"; - assert_eq!(ins.inspect(txt), ContentType::TEXT); - } - - #[test] - fn mime_guess() { - let ins = ContentInspector::default(); - assert_eq!(ins.guess_mime_type(&PathBuf::from("a.md")), Some("text/plain".into())); - assert_eq!(ins.guess_mime_type(&PathBuf::from("img.png")), Some("image/png".into())); - assert_eq!(ins.guess_mime_type(&PathBuf::from("x.xyz")), None); - } - - #[test] - fn charset_guess() { - let ins = ContentInspector::default(); - assert_eq!(ins.guess_charset("ok".as_bytes()), Some("UTF-8".into())); - assert_eq!(ins.guess_charset(&[0xFF, 0xFE, 0xFD]), None); - } - - #[test] - fn language_guess() { - let ins = ContentInspector::default(); - - // Compare case-insensitively by lowercasing both sides. - let rust = - ins.guess_language(&PathBuf::from("main.rs"), b"").map(|s| s.to_ascii_lowercase()); - assert_eq!(rust, Some("rust".into())); - - let php = ins - .guess_language(&PathBuf::from("x"), b"") - .map(|s| s.to_ascii_lowercase()); - assert_eq!(php, Some("php".into())); - - let bash = ins - .guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi") - .map(|s| s.to_ascii_lowercase()); - assert_eq!(bash, Some("bash".into())); - } -} +// Re-export from the canonical implementation in kingfisher-core. +pub use kingfisher_core::content_type::*; diff --git a/src/git_host.rs b/src/git_host.rs new file mode 100644 index 0000000..5f432fa --- /dev/null +++ b/src/git_host.rs @@ -0,0 +1,121 @@ +//! Shared utilities for Git hosting platform integrations. +//! +//! This module provides the `ExcludeMatcher` struct and related helpers +//! 
used across all platform integration modules (GitHub, GitLab, Azure, +//! Bitbucket, Gitea). + +use std::collections::HashSet; + +use globset::{Glob, GlobSet, GlobSetBuilder}; +use tracing::warn; + +// ------------------------------------------------------------------------------------------------- +// ExcludeMatcher +// ------------------------------------------------------------------------------------------------- + +/// Matches repository names against exact strings and glob patterns. +/// +/// Used by all platform integrations to filter out excluded repositories. +pub struct ExcludeMatcher { + exact: HashSet, + globs: Option, +} + +impl ExcludeMatcher { + /// Returns `true` if no exclusion patterns are configured. + pub fn is_empty(&self) -> bool { + self.exact.is_empty() && self.globs.is_none() + } + + /// Returns `true` if the given name matches any exclusion pattern. + pub fn matches(&self, name: &str) -> bool { + if self.exact.contains(name) { + return true; + } + if let Some(globs) = &self.globs { + return globs.is_match(name); + } + false + } +} + +/// Returns `true` if the pattern contains glob metacharacters. +pub fn looks_like_glob(pattern: &str) -> bool { + pattern.contains('*') || pattern.contains('?') || pattern.contains('[') +} + +/// Builds an `ExcludeMatcher` from a list of exclusion patterns. +/// +/// The `parse_fn` transforms each raw pattern string into a normalized +/// repository name (e.g., `"owner/repo"` or `"group/project"`). It returns +/// `None` if the pattern is invalid. +/// +/// The `platform_name` is used in warning messages (e.g., `"GitHub"`, +/// `"GitLab"`). 
+pub fn build_exclude_matcher( + exclude_repos: &[String], + parse_fn: impl Fn(&str) -> Option, + platform_name: &str, +) -> ExcludeMatcher { + let mut exact = HashSet::new(); + let mut glob_builder = GlobSetBuilder::new(); + let mut has_glob = false; + + for raw in exclude_repos { + match parse_fn(raw) { + Some(name) => { + if looks_like_glob(&name) { + match Glob::new(&name) { + Ok(glob) => { + glob_builder.add(glob); + has_glob = true; + } + Err(err) => { + warn!( + "Ignoring invalid {platform_name} exclusion pattern '{raw}': {err}" + ); + exact.insert(name); + } + } + } else { + exact.insert(name); + } + } + None => { + warn!("Ignoring invalid {platform_name} exclusion '{raw}'"); + } + } + } + + let globs = if has_glob { + match glob_builder.build() { + Ok(set) => Some(set), + Err(err) => { + warn!("Failed to build {platform_name} exclusion patterns: {err}"); + None + } + } + } else { + None + }; + + ExcludeMatcher { exact, globs } +} + +/// Checks whether a repository URL should be excluded. +/// +/// The `extract_name` function extracts a normalized repository identifier +/// from the URL (e.g., `"owner/repo"` from `"https://github.com/owner/repo.git"`). 
+pub fn should_exclude_repo( + repo_url: &str, + excludes: &ExcludeMatcher, + extract_name: impl Fn(&str) -> Option, +) -> bool { + if excludes.is_empty() { + return false; + } + if let Some(name) = extract_name(repo_url) { + return excludes.matches(&name); + } + false +} diff --git a/src/gitea.rs b/src/gitea.rs index 90a00ff..d748685 100644 --- a/src/gitea.rs +++ b/src/gitea.rs @@ -1,14 +1,13 @@ use std::{collections::HashSet, env, str::FromStr, time::Duration}; use anyhow::{anyhow, Result}; -use globset::{Glob, GlobSet, GlobSetBuilder}; use indicatif::{ProgressBar, ProgressStyle}; use reqwest::StatusCode; use serde::Deserialize; use tracing::warn; use url::Url; -use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT}; +use crate::{git_host, git_url::GitUrl, validation::GLOBAL_USER_AGENT}; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum RepoType { @@ -55,31 +54,6 @@ struct GiteaOrganization { username: String, } -struct ExcludeMatcher { - exact: HashSet, - globs: Option, -} - -impl ExcludeMatcher { - fn matches(&self, name: &str) -> bool { - if self.exact.contains(name) { - return true; - } - if let Some(globs) = &self.globs { - return globs.is_match(name); - } - false - } - - fn is_empty(&self) -> bool { - self.exact.is_empty() && self.globs.is_none() - } -} - -fn looks_like_glob(pattern: &str) -> bool { - pattern.contains('*') || pattern.contains('?') || pattern.contains('[') -} - fn normalize_repo_identifier(raw: &str) -> Option { let trimmed = raw.trim().trim_matches('/'); if trimmed.is_empty() { @@ -113,51 +87,11 @@ fn parse_excluded_repo(raw: &str) -> Option { normalize_repo_identifier(trimmed) } -fn build_exclude_matcher(excludes: &[String]) -> ExcludeMatcher { - let mut exact = HashSet::new(); - let mut glob_builder = GlobSetBuilder::new(); - let mut has_glob = false; - - for raw in excludes { - match parse_excluded_repo(raw) { - Some(name) => { - if looks_like_glob(&name) { - match Glob::new(&name) { - Ok(glob) => { - glob_builder.add(glob); - 
has_glob = true; - } - Err(err) => { - warn!("Ignoring invalid Gitea exclusion pattern '{raw}': {err}"); - exact.insert(name); - } - } - } else { - exact.insert(name); - } - } - None => { - warn!("Ignoring invalid Gitea exclusion '{raw}' (expected owner/repo)"); - } - } - } - - let globs = if has_glob { - match glob_builder.build() { - Ok(set) => Some(set), - Err(err) => { - warn!("Failed to build Gitea exclusion patterns: {err}"); - None - } - } - } else { - None - }; - - ExcludeMatcher { exact, globs } +fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher { + git_host::build_exclude_matcher(exclude_repos, |raw| parse_excluded_repo(raw), "Gitea") } -fn should_exclude_repo(repo: &GiteaRepository, excludes: &ExcludeMatcher) -> bool { +fn should_exclude_repo(repo: &GiteaRepository, excludes: &git_host::ExcludeMatcher) -> bool { if excludes.is_empty() { return false; } @@ -169,7 +103,7 @@ async fn fetch_paginated_repos( token: Option<&str>, mut url: Url, repo_filter: RepoType, - excludes: &ExcludeMatcher, + excludes: &git_host::ExcludeMatcher, progress: Option<&ProgressBar>, ) -> Result> { let mut page = 1u32; @@ -221,7 +155,7 @@ async fn fetch_user_repos( api_url: &Url, username: &str, repo_filter: RepoType, - excludes: &ExcludeMatcher, + excludes: &git_host::ExcludeMatcher, progress: Option<&ProgressBar>, ) -> Result> { let endpoint = format!("users/{}/repos", username); @@ -235,7 +169,7 @@ async fn fetch_org_repos( api_url: &Url, org: &str, repo_filter: RepoType, - excludes: &ExcludeMatcher, + excludes: &git_host::ExcludeMatcher, progress: Option<&ProgressBar>, ) -> Result> { let endpoint = format!("orgs/{}/repos", org); diff --git a/src/github.rs b/src/github.rs index 092095c..ca2847d 100644 --- a/src/github.rs +++ b/src/github.rs @@ -7,7 +7,6 @@ use std::{ }; use anyhow::{Context, Result}; -use globset::{Glob, GlobSet, GlobSetBuilder}; use indicatif::{ProgressBar, ProgressStyle}; use octorust::{ auth::Credentials, @@ -119,83 +118,14 @@ 
fn parse_excluded_repo(raw: &str) -> Option { parse_repo_name_from_path(trimmed) } -struct ExcludeMatcher { - exact: HashSet, - globs: Option, +use crate::git_host; + +fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher { + git_host::build_exclude_matcher(exclude_repos, |raw| parse_excluded_repo(raw), "GitHub") } -impl ExcludeMatcher { - fn is_empty(&self) -> bool { - self.exact.is_empty() && self.globs.is_none() - } - - fn matches(&self, name: &str) -> bool { - if self.exact.contains(name) { - return true; - } - if let Some(globs) = &self.globs { - return globs.is_match(name); - } - false - } -} - -fn looks_like_glob(pattern: &str) -> bool { - pattern.contains('*') || pattern.contains('?') || pattern.contains('[') -} - -fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher { - let mut exact = HashSet::new(); - let mut glob_builder = GlobSetBuilder::new(); - let mut has_glob = false; - - for raw in exclude_repos { - match parse_excluded_repo(raw) { - Some(name) => { - if looks_like_glob(&name) { - match Glob::new(&name) { - Ok(glob) => { - glob_builder.add(glob); - has_glob = true; - } - Err(err) => { - warn!("Ignoring invalid GitHub exclusion pattern '{raw}': {err}"); - exact.insert(name); - } - } - } else { - exact.insert(name); - } - } - None => { - warn!("Ignoring invalid GitHub exclusion '{raw}' (expected owner/repo)"); - } - } - } - - let globs = if has_glob { - match glob_builder.build() { - Ok(set) => Some(set), - Err(err) => { - warn!("Failed to build GitHub exclusion patterns: {err}"); - None - } - } - } else { - None - }; - - ExcludeMatcher { exact, globs } -} - -fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool { - if excludes.is_empty() { - return false; - } - if let Some(name) = parse_repo_name_from_url(clone_url) { - return excludes.matches(&name); - } - false +fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool { + 
git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url) } fn create_github_client(github_url: &url::Url, ignore_certs: bool) -> Result> { // Try personal access token diff --git a/src/gitlab.rs b/src/gitlab.rs index df394fa..4bfdefe 100644 --- a/src/gitlab.rs +++ b/src/gitlab.rs @@ -16,7 +16,6 @@ use gitlab::{ }, Gitlab, GitlabBuilder, }; -use globset::{Glob, GlobSet, GlobSetBuilder}; use indicatif::{ProgressBar, ProgressStyle}; use reqwest::StatusCode; use serde::Deserialize; @@ -25,7 +24,7 @@ use tokio::task; use tracing::{info, warn}; use url::{form_urlencoded, Url}; -use crate::{findings_store, git_url::GitUrl}; +use crate::{findings_store, git_host, git_url::GitUrl}; use std::str::FromStr; #[derive(Deserialize)] @@ -128,83 +127,12 @@ fn parse_excluded_project(raw: &str) -> Option { parse_project_path(trimmed) } -struct ExcludeMatcher { - exact: HashSet, - globs: Option, +fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher { + git_host::build_exclude_matcher(exclude_repos, |raw| parse_excluded_project(raw), "GitLab") } -impl ExcludeMatcher { - fn is_empty(&self) -> bool { - self.exact.is_empty() && self.globs.is_none() - } - - fn matches(&self, name: &str) -> bool { - if self.exact.contains(name) { - return true; - } - if let Some(globs) = &self.globs { - return globs.is_match(name); - } - false - } -} - -fn looks_like_glob(pattern: &str) -> bool { - pattern.contains('*') || pattern.contains('?') || pattern.contains('[') -} - -fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher { - let mut exact = HashSet::new(); - let mut glob_builder = GlobSetBuilder::new(); - let mut has_glob = false; - - for raw in exclude_repos { - match parse_excluded_project(raw) { - Some(name) => { - if looks_like_glob(&name) { - match Glob::new(&name) { - Ok(glob) => { - glob_builder.add(glob); - has_glob = true; - } - Err(err) => { - warn!("Ignoring invalid GitLab exclusion pattern '{raw}': {err}"); - exact.insert(name); 
- } - } - } else { - exact.insert(name); - } - } - None => { - warn!("Ignoring invalid GitLab exclusion '{raw}' (expected group/project)"); - } - } - } - - let globs = if has_glob { - match glob_builder.build() { - Ok(set) => Some(set), - Err(err) => { - warn!("Failed to build GitLab exclusion patterns: {err}"); - None - } - } - } else { - None - }; - - ExcludeMatcher { exact, globs } -} - -fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool { - if excludes.is_empty() { - return false; - } - if let Some(name) = parse_project_path_from_url(clone_url) { - return excludes.matches(&name); - } - false +fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool { + git_host::should_exclude_repo(clone_url, excludes, parse_project_path_from_url) } fn create_gitlab_client(gitlab_url: &Url, ignore_certs: bool) -> Result { diff --git a/src/lib.rs b/src/lib.rs index f1ca2a5..781d920 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,7 @@ pub mod findings_store; pub mod gcs; pub mod git_binary; pub mod git_commit_metadata; +pub mod git_host; pub mod git_metadata_graph; mod git_repo_enumerator; pub mod git_url; diff --git a/src/matcher/base64_decode.rs b/src/matcher/base64_decode.rs new file mode 100644 index 0000000..0ee1f26 --- /dev/null +++ b/src/matcher/base64_decode.rs @@ -0,0 +1,2 @@ +// Re-export from the canonical implementation in kingfisher-scanner. 
+pub use kingfisher_scanner::primitives::{get_base64_strings, DecodedData}; diff --git a/src/matcher/captures.rs b/src/matcher/captures.rs new file mode 100644 index 0000000..075f66f --- /dev/null +++ b/src/matcher/captures.rs @@ -0,0 +1,140 @@ +use bstr::BString; +use regex::bytes::Regex; +use schemars::{ + gen::SchemaGenerator, + schema::{ArrayValidation, InstanceType, Schema}, + JsonSchema, +}; +use serde::{Deserialize, Serialize}; +use smallvec::SmallVec; + +use crate::{snippet::Base64BString, util::intern}; + +// ------------------------------------------------------------------------------------------------- +// Group +// ------------------------------------------------------------------------------------------------- +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)] +pub struct Group(pub Base64BString); +impl Group { + pub fn new(m: regex::bytes::Match<'_>) -> Self { + Self(Base64BString(BString::from(m.as_bytes()))) + } +} +// ------------------------------------------------------------------------------------------------- +// Groups +// ------------------------------------------------------------------------------------------------- +#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +pub struct Groups(pub SmallVec<[Group; 1]>); +impl JsonSchema for Groups { + fn schema_name() -> String { + "Groups".to_string() + } + + fn json_schema(gen: &mut SchemaGenerator) -> Schema { + let group_schema = gen.subschema_for::(); + Schema::Object(schemars::schema::SchemaObject { + instance_type: Some(InstanceType::Array.into()), + array: Some(Box::new(ArrayValidation { + items: Some(group_schema.into()), + ..Default::default() + })), + ..Default::default() + }) + } +} + +#[derive(Debug, Clone, JsonSchema)] +pub struct SerializableCapture { + pub name: Option<&'static str>, + pub match_number: i32, + pub start: usize, + pub end: usize, + /// Interned original (unredacted) value. 
+ #[serde(skip_serializing, skip_deserializing)] + pub value: &'static str, +} + +impl SerializableCapture { + /// Returns the original captured value. + pub fn raw_value(&self) -> &'static str { + self.value + } + + /// Returns the value that should be shown in user-facing output. + pub fn display_value(&self) -> std::borrow::Cow<'static, str> { + crate::util::display_value(self.value) + } +} + +impl serde::Serialize for SerializableCapture { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + + let mut state = serializer.serialize_struct("SerializableCapture", 5)?; + state.serialize_field("name", &self.name)?; + state.serialize_field("match_number", &self.match_number)?; + state.serialize_field("start", &self.start)?; + state.serialize_field("end", &self.end)?; + let value = self.display_value(); + state.serialize_field("value", &value)?; + state.end() + } +} + +#[derive(Debug, Clone, Serialize, JsonSchema)] +pub struct SerializableCaptures { + #[schemars(with = "Vec")] + pub captures: SmallVec<[SerializableCapture; 2]>, +} + +impl SerializableCaptures { + pub fn from_captures(captures: ®ex::bytes::Captures, _input: &[u8], re: &Regex) -> Self { + let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new(); + + let capture_names: SmallVec<[Option<&'static str>; 4]> = + re.capture_names().map(|name| name.map(intern)).collect(); + + // If there are explicit capture groups (e.g., group 1, 2, ...), + // only serialize those. 
+ if captures.len() > 1 { + for i in 1..captures.len() { + // Start from 1 + if let Some(cap) = captures.get(i) { + let raw_value = String::from_utf8_lossy(cap.as_bytes()); + let raw_interned = intern(raw_value.as_ref()); + let name = capture_names.get(i).and_then(|opt| *opt); + + serialized_captures.push(SerializableCapture { + name, + match_number: i32::try_from(i).unwrap_or(0), + start: cap.start(), + end: cap.end(), + value: raw_interned, + }); + } + } + } else if captures.len() == 1 { + // ELSE, if there is ONLY the full match (len == 1), + // serialize just that full match (group 0) as the fallback. + if let Some(cap) = captures.get(0) { + let raw_value = String::from_utf8_lossy(cap.as_bytes()); + let raw_interned = intern(raw_value.as_ref()); + let name = capture_names.get(0).and_then(|opt| *opt); + + serialized_captures.push(SerializableCapture { + name, + match_number: 0, + start: cap.start(), + end: cap.end(), + value: raw_interned, + }); + } + } + // If len == 0 (no match), loop is skipped, empty vec is returned. 
+ + SerializableCaptures { captures: serialized_captures } + } +} diff --git a/src/matcher/conversion.rs b/src/matcher/conversion.rs new file mode 100644 index 0000000..551119c --- /dev/null +++ b/src/matcher/conversion.rs @@ -0,0 +1,251 @@ +use std::sync::Arc; + +use http::StatusCode; +use schemars::JsonSchema; +use serde::Serialize; +use xxhash_rust::xxh3::xxh3_64; + +use crate::{ + blob::BlobId, + location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, + rules::rule::Rule, + validation_body::{self, ValidationResponseBody}, +}; + +use super::{captures::SerializableCaptures, BlobMatch}; + +use kingfisher_scanner::primitives::compute_finding_fingerprint; + +// ------------------------------------------------------------------------------------------------- +// OwnedBlobMatch +// ------------------------------------------------------------------------------------------------- + +#[derive(Clone)] +pub struct OwnedBlobMatch { + pub rule: Arc, + pub blob_id: BlobId, + /// The unique content-based identifier of this match + pub finding_fingerprint: u64, + pub matching_input_offset_span: OffsetSpan, + pub captures: SerializableCaptures, + pub validation_response_body: ValidationResponseBody, + pub validation_response_status: StatusCode, + pub validation_success: bool, + pub calculated_entropy: f32, + pub is_base64: bool, + /// Variables captured from dependent rules (from depends_on_rule). + /// Maps variable name (uppercase) to captured value. 
+ pub dependent_captures: std::collections::BTreeMap, +} + +impl OwnedBlobMatch { + pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc) -> OwnedBlobMatch { + OwnedBlobMatch { + rule, + blob_id: m.blob_id, + finding_fingerprint: m.finding_fingerprint, + // matching_input: m.snippet.matching.0.to_vec(), + matching_input_offset_span: m.location.offset_span.clone(), + captures: m.groups.clone(), + validation_response_body: m.validation_response_body.clone(), + validation_response_status: StatusCode::from_u16(m.validation_response_status) + .unwrap_or(StatusCode::CONTINUE), + validation_success: m.validation_success, + calculated_entropy: m.calculated_entropy, + is_base64: m.is_base64, + dependent_captures: m.dependent_captures.clone(), + } + } + + pub fn from_blob_match(blob_match: BlobMatch) -> Self { + // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility. + // + // This indexing is intentionally different from the internal `validation_dedup_key()` + // (which uses get(0)) to maintain stable external fingerprints. Changing this would break: + // - Historical baselines that rely on fingerprint matching + // - Dedup entries stored in external systems + // + // For rules with nested captures like (?...(ABC)...), this may pick up + // the inner group, but that behavior is now established and must be preserved. 
+ let matching_finding = blob_match + .captures + .captures + .get(1) + .or_else(|| blob_match.captures.captures.get(0)) + .map(|capture| capture.raw_value().as_bytes().to_vec()) + .unwrap_or_else(Vec::new); + + let mut owned_blob_match = OwnedBlobMatch { + rule: blob_match.rule, + blob_id: blob_match.blob_id.clone(), + matching_input_offset_span: blob_match.matching_input_offset_span, + captures: blob_match.captures.clone(), + validation_response_body: blob_match.validation_response_body, + validation_response_status: blob_match.validation_response_status, + validation_success: blob_match.validation_success, + calculated_entropy: blob_match.calculated_entropy, + finding_fingerprint: 0, // placeholder; the real fingerprint is computed below + is_base64: blob_match.is_base64, + dependent_captures: std::collections::BTreeMap::new(), + }; + + // Interpret matching_finding as UTF-8; invalid UTF-8 falls back to an empty string + let finding_value = std::str::from_utf8(&matching_finding).unwrap_or(""); + // Use blob_id as the file/commit identifier + let file_or_commit = &blob_match.blob_id.to_string(); + + let offset_start: u64 = + owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); + let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); + + owned_blob_match.finding_fingerprint = + compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end); + + owned_blob_match + } +} + +// ------------------------------------------------------------------------------------------------- +// Match +// ------------------------------------------------------------------------------------------------- + +#[derive(Debug, Clone, Serialize, JsonSchema)] +pub struct Match { + /// The location of the entire matching content + pub location: Location, + + /// The capture groups + pub groups: SerializableCaptures, // Store serialized captures + + /// Unique identifier of the file / blob where this match was found + pub blob_id: BlobId, + + /// The unique content-based 
identifier of this match + pub finding_fingerprint: u64, + + /// The rule that produced this match + #[serde(skip_serializing)] + #[schemars(skip)] + pub rule: Arc, + + /// Body of the validation response + #[serde( + default, + serialize_with = "validation_body::serialize", + deserialize_with = "validation_body::deserialize" + )] + #[schemars(schema_with = "validation_body::schema")] + pub validation_response_body: ValidationResponseBody, + + /// HTTP status code of the validation response + pub validation_response_status: u16, + + /// Whether validation succeeded + pub validation_success: bool, + + /// Shannon entropy calculated for the matched secret + pub calculated_entropy: f32, + + pub visible: bool, + #[serde(default)] + pub is_base64: bool, + + /// Variables captured from dependent rules (from depends_on_rule). + /// Maps variable name (uppercase) to captured value. + #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")] + pub dependent_captures: std::collections::BTreeMap, +} + +impl Match { + #[inline] + pub fn convert_owned_blobmatch_to_match<'a>( + loc_mapping: Option<&'a LocationMapping<'a>>, + owned_blob_match: &'a OwnedBlobMatch, + origin_type: &'a str, + ) -> Self { + let offset_span = owned_blob_match.matching_input_offset_span; + // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility. + // See comment in from_blob_match() for why this differs from validation_dedup_key(). + let matching_finding_bytes = owned_blob_match + .captures + .captures + .get(1) + .or_else(|| owned_blob_match.captures.captures.get(0)) + .map(|capture| capture.raw_value().as_bytes()) + .unwrap_or_default(); + + // The fingerprint will be based on the content of the secret. 
+ let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or(""); + + let source_span = + loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan { + start: SourcePoint { line: 0, column: 0 }, + end: SourcePoint { line: 0, column: 0 }, + }); + let offset_start: u64 = + owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); + let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); + + let finding_fingerprint = compute_finding_fingerprint( + finding_value_for_fp, + origin_type, // NOTE: from_blob_match() passes the blob id string here instead + offset_start, + offset_end, + ); + + // Build the serializable Match from the owned blob match + Match { + rule: owned_blob_match.rule.clone(), + visible: owned_blob_match.rule.visible().to_owned(), + location: Location::with_source_span(offset_span, Some(source_span.clone())), + groups: owned_blob_match.captures.clone(), + blob_id: owned_blob_match.blob_id, + finding_fingerprint, + validation_response_body: owned_blob_match.validation_response_body.clone(), + validation_response_status: owned_blob_match.validation_response_status.as_u16(), + validation_success: owned_blob_match.validation_success, + calculated_entropy: owned_blob_match.calculated_entropy, + is_base64: owned_blob_match.is_base64, + dependent_captures: owned_blob_match.dependent_captures.clone(), + } + } + + /// Returns the `blob_id` of the match. 
+ pub fn get_blob_id(&self) -> BlobId { + self.blob_id.clone() + } + + pub fn finding_id(&self) -> String { + let mut buffer = Vec::with_capacity(128); + buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes()); + buffer.push(0); + serde_json::to_writer(&mut buffer, &self.groups) + .expect("should be able to serialize groups as JSON"); + let mut num = xxh3_64(&buffer); + // Ensure the number is positive and within i64 range + num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive + // Convert to string + num.to_string() + } +} + +// ------------------------------------------------------------------------------------------------- +// MatcherStats +// ------------------------------------------------------------------------------------------------- + +#[derive(Debug, Default, Clone)] +pub struct MatcherStats { + pub blobs_seen: u64, + pub blobs_scanned: u64, + pub bytes_seen: u64, + pub bytes_scanned: u64, +} + +impl MatcherStats { + pub fn update(&mut self, other: &Self) { + self.blobs_seen += other.blobs_seen; + self.blobs_scanned += other.blobs_scanned; + self.bytes_seen += other.bytes_seen; + self.bytes_scanned += other.bytes_scanned; + } +} diff --git a/src/matcher/dedup.rs b/src/matcher/dedup.rs new file mode 100644 index 0000000..5755a45 --- /dev/null +++ b/src/matcher/dedup.rs @@ -0,0 +1,2 @@ +// Re-export from the canonical implementation in kingfisher-scanner. 
+pub(crate) use kingfisher_scanner::primitives::{compute_match_key, record_match}; diff --git a/src/matcher/filter.rs b/src/matcher/filter.rs new file mode 100644 index 0000000..96ec74b --- /dev/null +++ b/src/matcher/filter.rs @@ -0,0 +1,274 @@ +use std::sync::Arc; + +use http::StatusCode; +use regex::bytes::Regex; +use rustc_hash::{FxHashMap, FxHashSet}; +use tracing::debug; + +use crate::{ + blob::Blob, + entropy::calculate_shannon_entropy, + inline_ignore::InlineIgnoreConfig, + location::OffsetSpan, + origin::OriginSet, + rule_profiling::{ConcurrentRuleProfiler, RuleTimer}, + rules::rule::{PatternRequirementContext, PatternValidationResult, Rule, Validation}, + safe_list::{is_safe_match, is_user_match}, + validation::{is_parseable_mongodb_uri, is_parseable_mysql_uri, is_parseable_postgres_uri}, +}; + +use super::{ + captures::SerializableCaptures, + dedup::{compute_match_key, record_match}, + BlobMatch, +}; + +// Re-use the canonical secret capture selection from kingfisher-scanner. +use kingfisher_scanner::primitives::find_secret_capture; + +// ------------------------------------------------------------------------------------------------- +// Entropy and safe-list check +// ------------------------------------------------------------------------------------------------- + +/// Returns `Some(entropy)` if the match passes entropy and safe-list checks, +/// `None` if it should be skipped. 
+fn check_entropy_and_safelist( + entropy_bytes: &[u8], + full_bytes: &[u8], + min_entropy: f32, +) -> Option<f32> { + let calculated_entropy = calculate_shannon_entropy(entropy_bytes); + if calculated_entropy <= min_entropy + || is_safe_match(entropy_bytes) + || is_user_match(entropy_bytes, full_bytes) + { + debug!( + "Skipping match with entropy {} <= {} or safe match", + calculated_entropy, min_entropy + ); + None + } else { + Some(calculated_entropy) + } +} + +// ------------------------------------------------------------------------------------------------- +// Pattern requirements check +// ------------------------------------------------------------------------------------------------- + +/// Returns `true` if the match passes pattern requirements, `false` if it should be skipped. +fn check_pattern_requirements( + rule: &Rule, + re: &Regex, + captures: &regex::bytes::Captures, + full_bytes: &[u8], + entropy_bytes: &[u8], + respect_ignore_if_contains: bool, +) -> bool { + let Some(char_reqs) = rule.pattern_requirements() else { + return true; + }; + + let context = PatternRequirementContext { regex: re, captures, full_match: full_bytes }; + + // Decide which bytes to validate: + // - If there are multiple capture groups OR any named captures -> use full match + // - Otherwise -> use entropy_bytes (the actual secret) + let use_full_match = { + let has_named_captures = re.capture_names().any(|n| n.is_some()); + let capture_count = captures.len(); // includes group 0 + has_named_captures || capture_count > 2 + }; + + let validation_bytes = if use_full_match { full_bytes } else { entropy_bytes }; + + match char_reqs.validate(validation_bytes, Some(context), respect_ignore_if_contains) { + PatternValidationResult::Passed => true, + PatternValidationResult::Failed => { + debug!( + "Skipping match that does not meet character requirements for rule {}", + rule.id() + ); + false + } + PatternValidationResult::FailedChecksum { actual_len, expected_len } => { + debug!( + 
"Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})", + rule.id(), + actual_len, + expected_len + ); + false + } + PatternValidationResult::IgnoredBySubstring { matched_term } => { + debug!( + "Skipping match for rule {} because it contains ignored term {matched_term}", + rule.id() + ); + false + } + } +} + +// ------------------------------------------------------------------------------------------------- +// URI validation +// ------------------------------------------------------------------------------------------------- + +/// Returns `true` if the match passes URI validation (for database rules), `false` if it should +/// be skipped. +fn check_uri_validation(rule: &Rule, matching_input_bytes: &[u8]) -> bool { + let Some(validation) = rule.syntax.validation.as_ref() else { + return true; + }; + + match validation { + Validation::MongoDB => { + let Ok(uri) = std::str::from_utf8(matching_input_bytes) else { + debug!("Skipping match for rule {} due to non-UTF8 MongoDB URI", rule.id()); + return false; + }; + if !is_parseable_mongodb_uri(uri) { + debug!("Skipping match for rule {} due to invalid MongoDB URI", rule.id()); + return false; + } + } + Validation::Postgres => { + let Ok(uri) = std::str::from_utf8(matching_input_bytes) else { + debug!("Skipping match for rule {} due to non-UTF8 Postgres URI", rule.id()); + return false; + }; + if !is_parseable_postgres_uri(uri) { + debug!("Skipping match for rule {} due to invalid Postgres URI", rule.id()); + return false; + } + } + Validation::MySQL => { + let Ok(uri) = std::str::from_utf8(matching_input_bytes) else { + debug!("Skipping match for rule {} due to non-UTF8 MySQL URI", rule.id()); + return false; + }; + if !is_parseable_mysql_uri(uri) { + debug!("Skipping match for rule {} due to invalid MySQL URI", rule.id()); + return false; + } + } + _ => {} + } + true +} + +// ------------------------------------------------------------------------------------------------- +// 
filter_match — main entry point +// ------------------------------------------------------------------------------------------------- + +#[allow(clippy::too_many_arguments)] +pub(crate) fn filter_match<'b>( + blob: &'b Blob, + rule: Arc, + re: &Regex, + start: usize, + end: usize, + matches: &mut Vec>, + previous_matches: &mut FxHashMap>, + rule_id: usize, + seen_matches: &mut FxHashSet, + _origin: &OriginSet, + ts_match: Option<&[u8]>, + is_base64: bool, + _redact: bool, + filename: &str, + profiler: Option<&Arc>, + respect_ignore_if_contains: bool, + inline_ignore_config: &InlineIgnoreConfig, +) { + let mut timer = + profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename)); + + let initial_len = matches.len(); + + let blob_bytes = blob.bytes(); + let default_slice = &blob_bytes[start..end]; + let haystack = ts_match.unwrap_or(default_slice); + + for captures in re.captures_iter(haystack) { + let full_capture = captures.get(0).unwrap(); + let matching_input_for_entropy = find_secret_capture(re, &captures); + + let min_entropy = rule.min_entropy(); + let entropy_bytes = matching_input_for_entropy.as_bytes(); + let full_bytes = full_capture.as_bytes(); + + // Check entropy and safe-listing + let calculated_entropy = + match check_entropy_and_safelist(entropy_bytes, full_bytes, min_entropy) { + Some(e) => e, + None => continue, + }; + + // Check pattern requirements + if !check_pattern_requirements( + &rule, + re, + &captures, + full_bytes, + entropy_bytes, + respect_ignore_if_contains, + ) { + continue; + } + + // Use the `matching_input_for_entropy` as the span/key for the finding. 
+ let matching_input = matching_input_for_entropy; + + let matching_input_offset_span = OffsetSpan::from_range( + (start + matching_input.start())..(start + matching_input.end()), + ); + + // Check inline ignore directives + if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) { + debug!("Skipping match due to inline ignore directive"); + continue; + } + + // Check URI validation (MongoDB, Postgres, MySQL) + if !check_uri_validation(&rule, matching_input.as_bytes()) { + continue; + } + + // Deduplication + let match_key = compute_match_key( + matching_input.as_bytes(), + rule.id().as_bytes(), + matching_input_offset_span.start, + matching_input_offset_span.end, + ); + if !seen_matches.insert(match_key) { + continue; + } + if !record_match(previous_matches, rule_id, matching_input_offset_span) { + continue; + } + let only_matching_input = + &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end]; + + // Pass the *full* capture object to from_captures + let groups = SerializableCaptures::from_captures(&captures, haystack, re); + + matches.push(BlobMatch { + rule: Arc::clone(&rule), + blob_id: blob.id_ref(), + matching_input: only_matching_input, + matching_input_offset_span, + captures: groups, + validation_response_body: None, + validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE), + validation_success: false, + calculated_entropy, + is_base64, + }); + } + if let Some(t) = timer.take() { + let new_count = (matches.len() - initial_len) as u64; + t.end(new_count > 0, new_count, 0); + } +} diff --git a/src/matcher/fingerprint.rs b/src/matcher/fingerprint.rs new file mode 100644 index 0000000..0a764da --- /dev/null +++ b/src/matcher/fingerprint.rs @@ -0,0 +1,2 @@ +// Re-export from the canonical implementation in kingfisher-scanner. 
+pub use kingfisher_scanner::primitives::compute_finding_fingerprint; diff --git a/src/matcher.rs b/src/matcher/mod.rs similarity index 58% rename from src/matcher.rs rename to src/matcher/mod.rs index ac89bae..6324ba7 100644 --- a/src/matcher.rs +++ b/src/matcher/mod.rs @@ -1,42 +1,38 @@ -use std::{ - hash::{Hash, Hasher}, - str, - sync::{Arc, Mutex}, -}; +mod base64_decode; +mod captures; +mod conversion; +mod dedup; +mod filter; +mod fingerprint; + +// Re-export public API +pub use base64_decode::{get_base64_strings, DecodedData}; +pub use captures::{Group, Groups, SerializableCapture, SerializableCaptures}; +pub use conversion::{Match, MatcherStats, OwnedBlobMatch}; +pub use fingerprint::compute_finding_fingerprint; + +use std::sync::{Arc, Mutex}; use anyhow::Result; -use base64::{engine::general_purpose, Engine}; -use bstr::BString; use http::StatusCode; -use regex::bytes::Regex; -use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; -use schemars::{ - gen::SchemaGenerator, - schema::{ArrayValidation, InstanceType, Schema}, - JsonSchema, -}; -use serde::{Deserialize, Serialize}; -use smallvec::SmallVec; -use tracing::debug; -use xxhash_rust::xxh3::xxh3_64; +use rustc_hash::{FxHashMap, FxHashSet}; use crate::{ blob::{Blob, BlobId, BlobIdMap}, - entropy::calculate_shannon_entropy, inline_ignore::InlineIgnoreConfig, - location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, + location::OffsetSpan, origin::OriginSet, parser, parser::{Checker, Language}, - rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, - rules::rule::{PatternRequirementContext, PatternValidationResult, Rule, Validation}, + rule_profiling::{ConcurrentRuleProfiler, RuleStats}, + rules::rule::Rule, rules_database::RulesDatabase, - safe_list::{is_safe_match, is_user_match}, scanner_pool::ScannerPool, - snippet::Base64BString, - util::intern, - validation::{is_parseable_mongodb_uri, is_parseable_mysql_uri, is_parseable_postgres_uri}, - validation_body::{self, 
ValidationResponseBody}, + validation_body::ValidationResponseBody, +}; + +use self::{ + base64_decode::get_base64_strings as get_b64_strings, dedup::record_match, filter::filter_match, }; const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment @@ -58,94 +54,7 @@ struct RawMatch { start_idx: u64, end_idx: u64, } -#[derive(Clone)] -pub struct OwnedBlobMatch { - pub rule: Arc, - pub blob_id: BlobId, - /// The unique content-based identifier of this match - pub finding_fingerprint: u64, - pub matching_input_offset_span: OffsetSpan, - pub captures: SerializableCaptures, - pub validation_response_body: ValidationResponseBody, - pub validation_response_status: StatusCode, - pub validation_success: bool, - pub calculated_entropy: f32, - pub is_base64: bool, - /// Variables captured from dependent rules (from depends_on_rule). - /// Maps variable name (uppercase) to captured value. - pub dependent_captures: std::collections::BTreeMap, -} -impl<'a> Matcher<'a> { - pub fn get_profiling_report(&self) -> Option> { - self.profiler.as_ref().map(|p| p.generate_report()) - } -} -impl OwnedBlobMatch { - pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc) -> OwnedBlobMatch { - OwnedBlobMatch { - rule, - blob_id: m.blob_id, - finding_fingerprint: m.finding_fingerprint, - // matching_input: m.snippet.matching.0.to_vec(), - matching_input_offset_span: m.location.offset_span.clone(), - captures: m.groups.clone(), - validation_response_body: m.validation_response_body.clone(), - validation_response_status: StatusCode::from_u16(m.validation_response_status) - .unwrap_or(StatusCode::CONTINUE), - validation_success: m.validation_success, - calculated_entropy: m.calculated_entropy, - is_base64: m.is_base64, - dependent_captures: m.dependent_captures.clone(), - } - } - pub fn from_blob_match(blob_match: BlobMatch) -> Self { - // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility. 
- // - // This indexing is intentionally different from the internal `validation_dedup_key()` - // (which uses get(0)) to maintain stable external fingerprints. Changing this would break: - // - Historical baselines that rely on fingerprint matching - // - Dedup entries stored in external systems - // - // For rules with nested captures like (?...(ABC)...), this may pick up - // the inner group, but that behavior is now established and must be preserved. - let matching_finding = blob_match - .captures - .captures - .get(1) - .or_else(|| blob_match.captures.captures.get(0)) - .map(|capture| capture.raw_value().as_bytes().to_vec()) - .unwrap_or_else(Vec::new); - - let mut owned_blob_match = OwnedBlobMatch { - rule: blob_match.rule, - blob_id: blob_match.blob_id.clone(), - matching_input_offset_span: blob_match.matching_input_offset_span, - captures: blob_match.captures.clone(), - validation_response_body: blob_match.validation_response_body, - validation_response_status: blob_match.validation_response_status, - validation_success: blob_match.validation_success, - calculated_entropy: blob_match.calculated_entropy, - finding_fingerprint: 0, //default - is_base64: blob_match.is_base64, - dependent_captures: std::collections::BTreeMap::new(), - }; - - // Convert matching_finding to a &str (using lossy conversion if needed) - let finding_value = std::str::from_utf8(&matching_finding).unwrap_or(""); - // Use blob_id as the file/commit identifier - let file_or_commit = &blob_match.blob_id.to_string(); - - let offset_start: u64 = - owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); - let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); - - owned_blob_match.finding_fingerprint = - compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end); - - owned_blob_match - } -} // ------------------------------------------------------------------------------------------------- // BlobMatch // 
------------------------------------------------------------------------------------------------- @@ -156,7 +65,7 @@ impl OwnedBlobMatch { /// `Match`. pub struct BlobMatch<'a> { /// The rule that was matched - pub rule: Arc, // Changed from `&'a Rule` to `Arc, /// The blob that was matched pub blob_id: &'a BlobId, @@ -168,7 +77,7 @@ pub struct BlobMatch<'a> { pub matching_input_offset_span: OffsetSpan, /// The capture groups from the match - pub captures: SerializableCaptures, // regex::bytes::Captures<'a>, + pub captures: SerializableCaptures, pub validation_response_body: ValidationResponseBody, pub validation_response_status: StatusCode, @@ -177,6 +86,7 @@ pub struct BlobMatch<'a> { pub calculated_entropy: f32, pub is_base64: bool, } + #[derive(Clone)] struct UserData { /// A scratch vector for raw matches from Vectorscan, to minimize allocation @@ -185,6 +95,7 @@ struct UserData { /// The length of the input being scanned input_len: u64, } + // ------------------------------------------------------------------------------------------------- // Matcher // ------------------------------------------------------------------------------------------------- @@ -222,6 +133,7 @@ pub struct Matcher<'a> { /// Whether matches should honour `ignore_if_contains` requirements. respect_ignore_if_contains: bool, } + /// This `Drop` implementation updates the `global_stats` with the local stats impl<'a> Drop for Matcher<'a> { fn drop(&mut self) { @@ -231,11 +143,19 @@ impl<'a> Drop for Matcher<'a> { } } } + pub enum ScanResult<'a> { SeenWithMatches, SeenSansMatches, New(Vec>), } + +impl<'a> Matcher<'a> { + pub fn get_profiling_report(&self) -> Option> { + self.profiler.as_ref().map(|p| p.generate_report()) + } +} + impl<'a> Matcher<'a> { /// Create a new `Matcher` from the given `RulesDatabase`. 
/// @@ -255,8 +175,6 @@ impl<'a> Matcher<'a> { // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec let raw_matches_scratch = Vec::new(); let user_data = UserData { raw_matches_scratch, input_len: 0 }; - // let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?; - // pool is created once per scan run (see Scanner section below) let profiler = shared_profiler.or_else(|| { if enable_profiling { Some(Arc::new(ConcurrentRuleProfiler::new())) @@ -349,7 +267,7 @@ impl<'a> Matcher<'a> { let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT { Vec::new() } else { - get_base64_strings(blob.bytes()) + get_b64_strings(blob.bytes()) }; let lang_hint = lang.as_deref(); @@ -530,7 +448,7 @@ impl<'a> Matcher<'a> { ); } if depth + 1 < MAX_B64_DEPTH { - for nested in get_base64_strings(item.decoded.as_slice()) { + for nested in get_b64_strings(item.decoded.as_slice()) { b64_stack.push(( DecodedData { decoded: nested.decoded, @@ -559,268 +477,12 @@ impl<'a> Matcher<'a> { if self.user_data.raw_matches_scratch.capacity() > self.user_data.raw_matches_scratch.len() * 4 { - // Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob + // Vec::shrink_to_fit may re-allocate, but we're about to leave scan_blob // so the cost is hidden off the hot path. 
self.user_data.raw_matches_scratch.shrink_to_fit(); } Ok(ScanResult::New(matches)) - // Ok(result) - } -} - -#[inline] -fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 { - let mut hasher = FxHasher::default(); - // Hash each component directly without allocation - content.hash(&mut hasher); - rule_id.hash(&mut hasher); - start.hash(&mut hasher); - end.hash(&mut hasher); - hasher.finish() -} - -#[inline] -fn insert_span(spans: &mut Vec, span: OffsetSpan) -> bool { - let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i); - if idx > 0 { - if spans[idx - 1].fully_contains(&span) { - return false; - } - if span.fully_contains(&spans[idx - 1]) { - spans.remove(idx - 1); - idx -= 1; - } - } - if idx < spans.len() { - if spans[idx].fully_contains(&span) { - return false; - } - if span.fully_contains(&spans[idx]) { - spans.remove(idx); - } - } - spans.insert(idx, span); - true -} - -#[inline] -fn record_match( - map: &mut FxHashMap>, - rule_id: usize, - span: OffsetSpan, -) -> bool { - insert_span(map.entry(rule_id).or_default(), span) -} - -#[allow(clippy::too_many_arguments)] -fn filter_match<'b>( - blob: &'b Blob, - rule: Arc, - re: &Regex, - start: usize, - end: usize, - matches: &mut Vec>, - previous_matches: &mut FxHashMap>, - rule_id: usize, - seen_matches: &mut FxHashSet, - _origin: &OriginSet, - ts_match: Option<&[u8]>, - is_base64: bool, - _redact: bool, - filename: &str, - profiler: Option<&Arc>, - respect_ignore_if_contains: bool, - inline_ignore_config: &InlineIgnoreConfig, -) { - let mut timer = - profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename)); - - let initial_len = matches.len(); - - let blob_bytes = blob.bytes(); - let default_slice = &blob_bytes[start..end]; - let haystack = ts_match.unwrap_or(default_slice); - - for captures in re.captures_iter(haystack) { - let full_capture = captures.get(0).unwrap(); - - // --- LOGIC TO FIND THE "SECRET" 
FOR ENTROPY/SAFE-LISTING --- - let matching_input_for_entropy = 'block: { - // 1. Prefer a named capture called TOKEN (case-insensitive). - if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| { - name_opt - .filter(|name| name.eq_ignore_ascii_case("TOKEN")) - .and_then(|_| captures.get(i)) - }) { - break 'block token_cap; - } - - // 2. Otherwise, prefer the first *matched* named capture. - if let Some(named_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| { - name_opt.and_then(|_| captures.get(i)) // find(i > 0 && name_opt.is_some()) - }) { - break 'block named_cap; - } - - // 3. Otherwise, fall back to the first positional capture (group 1). - if let Some(pos_cap) = captures.get(1) { - break 'block pos_cap; - } - - // 4. Finally, fall back to the full match (group 0). - break 'block full_capture; - }; - // --- END LOGIC --- - - let min_entropy = rule.min_entropy(); - let entropy_bytes = matching_input_for_entropy.as_bytes(); - let full_bytes = full_capture.as_bytes(); - let calculated_entropy = calculate_shannon_entropy(entropy_bytes); - - // Check entropy and safe-listing against the *selected* secret bytes - if calculated_entropy <= min_entropy - || is_safe_match(entropy_bytes) - || is_user_match(entropy_bytes, full_bytes) - { - debug!( - "Skipping match with entropy {} <= {} or safe match", - calculated_entropy, min_entropy - ); - continue; - } - - // Check character requirements if specified - if let Some(char_reqs) = rule.pattern_requirements() { - let context = PatternRequirementContext { - regex: re, - captures: &captures, - full_match: full_bytes, - }; - - // Decide which bytes to validate: - // - If there are multiple capture groups OR any named captures → use full match - // - Otherwise → use entropy_bytes (the actual secret) - let use_full_match = { - let has_named_captures = re.capture_names().any(|n| n.is_some()); - let capture_count = captures.len(); // includes group 0 - has_named_captures || capture_count 
> 2 - }; - - let validation_bytes = if use_full_match { full_bytes } else { entropy_bytes }; - - match char_reqs.validate(validation_bytes, Some(context), respect_ignore_if_contains) { - // - // --- END FIX --- - PatternValidationResult::Passed => {} - PatternValidationResult::Failed => { - debug!( - "Skipping match that does not meet character requirements for rule {}", - rule.id() - ); - continue; - } - PatternValidationResult::FailedChecksum { actual_len, expected_len } => { - debug!( - "Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})", - rule.id(), - actual_len, - expected_len - ); - continue; - } - PatternValidationResult::IgnoredBySubstring { matched_term } => { - debug!( - "Skipping match for rule {} because it contains ignored term {matched_term}", - rule.id() - ); - continue; - } - } - } - - // Use the `matching_input_for_entropy` as the span/key for the finding. - let matching_input = matching_input_for_entropy; - - let matching_input_offset_span = OffsetSpan::from_range( - (start + matching_input.start())..(start + matching_input.end()), - ); - if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) { - debug!("Skipping match due to inline ignore directive"); - continue; - } - if let Some(validation) = rule.syntax.validation.as_ref() { - match validation { - Validation::MongoDB => { - let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else { - debug!("Skipping match for rule {} due to non-UTF8 MongoDB URI", rule.id()); - continue; - }; - if !is_parseable_mongodb_uri(uri) { - debug!("Skipping match for rule {} due to invalid MongoDB URI", rule.id()); - continue; - } - } - Validation::Postgres => { - let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else { - debug!( - "Skipping match for rule {} due to non-UTF8 Postgres URI", - rule.id() - ); - continue; - }; - if !is_parseable_postgres_uri(uri) { - debug!("Skipping match for rule {} due to invalid Postgres URI", rule.id()); - 
continue; - } - } - Validation::MySQL => { - let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else { - debug!("Skipping match for rule {} due to non-UTF8 MySQL URI", rule.id()); - continue; - }; - if !is_parseable_mysql_uri(uri) { - debug!("Skipping match for rule {} due to invalid MySQL URI", rule.id()); - continue; - } - } - _ => {} - } - } - let match_key = compute_match_key( - matching_input.as_bytes(), - rule.id().as_bytes(), - matching_input_offset_span.start, - matching_input_offset_span.end, - ); - if !seen_matches.insert(match_key) { - continue; - } - if !record_match(previous_matches, rule_id, matching_input_offset_span) { - continue; - } - let only_matching_input = - &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end]; - - // Pass the *full* capture object to from_captures - let groups = SerializableCaptures::from_captures(&captures, haystack, re); - - matches.push(BlobMatch { - rule: Arc::clone(&rule), - blob_id: blob.id_ref(), - matching_input: only_matching_input, - matching_input_offset_span, - captures: groups, - validation_response_body: None, - validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE), - validation_success: false, - calculated_entropy, - is_base64, - }); - } - if let Some(t) = timer.take() { - let new_count = (matches.len() - initial_len) as u64; - t.end(new_count > 0, new_count, 0); } } @@ -837,10 +499,6 @@ fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap { Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries())) } - // "kotlin" => Some(( - // Language::Kotlin, - // parser::queries::kotlin::get_kotlin_queries(), - // )), "php" => Some((Language::Php, parser::queries::php::get_php_queries())), "python" | "py" | "starlark" => { Some((Language::Python, parser::queries::python::get_python_queries())) @@ -855,354 +513,6 @@ fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap None, } } -// 
------------------------------------------------------------------------------------------------- -// MatchStats -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Default, Clone)] -pub struct MatcherStats { - pub blobs_seen: u64, - pub blobs_scanned: u64, - pub bytes_seen: u64, - pub bytes_scanned: u64, - // #[cfg(feature = "rule_profiling")] - // pub rule_stats: crate::rule_profiling::RuleProfile, -} -impl MatcherStats { - pub fn update(&mut self, other: &Self) { - self.blobs_seen += other.blobs_seen; - self.blobs_scanned += other.blobs_scanned; - self.bytes_seen += other.bytes_seen; - self.bytes_scanned += other.bytes_scanned; - - // #[cfg(feature = "rule_profiling")] - // self.rule_stats.update(&other.rule_stats); - } -} -// ------------------------------------------------------------------------------------------------- -// Group -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)] -pub struct Group(pub Base64BString); -impl Group { - pub fn new(m: regex::bytes::Match<'_>) -> Self { - Self(Base64BString(BString::from(m.as_bytes()))) - } -} -// ------------------------------------------------------------------------------------------------- -// Groups -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub struct Groups(pub SmallVec<[Group; 1]>); -impl JsonSchema for Groups { - fn schema_name() -> String { - "Groups".to_string() - } - - fn json_schema(gen: &mut SchemaGenerator) -> Schema { - let group_schema = gen.subschema_for::(); - Schema::Object(schemars::schema::SchemaObject { - instance_type: Some(InstanceType::Array.into()), - array: Some(Box::new(ArrayValidation { - items: Some(group_schema.into()), - ..Default::default() - })), - 
..Default::default() - }) - } -} -// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] -// pub struct SerializableCapture { -// pub name: Option, // Named group (if available) -// pub match_number: i32, -// pub start: usize, // Start position of the match -// pub end: usize, // End position of the match -// pub value: String, // The actual captured value -// } -#[derive(Debug, Clone, JsonSchema)] -pub struct SerializableCapture { - pub name: Option<&'static str>, - pub match_number: i32, - pub start: usize, - pub end: usize, - /// Interned original (unredacted) value. - #[serde(skip_serializing, skip_deserializing)] - pub value: &'static str, -} - -impl SerializableCapture { - /// Returns the original captured value. - pub fn raw_value(&self) -> &'static str { - self.value - } - - /// Returns the value that should be shown in user-facing output. - pub fn display_value(&self) -> std::borrow::Cow<'static, str> { - crate::util::display_value(self.value) - } -} - -impl serde::Serialize for SerializableCapture { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - use serde::ser::SerializeStruct; - - let mut state = serializer.serialize_struct("SerializableCapture", 5)?; - state.serialize_field("name", &self.name)?; - state.serialize_field("match_number", &self.match_number)?; - state.serialize_field("start", &self.start)?; - state.serialize_field("end", &self.end)?; - let value = self.display_value(); - state.serialize_field("value", &value)?; - state.end() - } -} - -#[derive(Debug, Clone, Serialize, JsonSchema)] -pub struct SerializableCaptures { - #[schemars(with = "Vec")] - pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed) -} - -impl SerializableCaptures { - pub fn from_captures(captures: ®ex::bytes::Captures, _input: &[u8], re: &Regex) -> Self { - let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new(); - - let capture_names: SmallVec<[Option<&'static str>; 4]> = 
- re.capture_names().map(|name| name.map(intern)).collect(); - - // If there are explicit capture groups (e.g., group 1, 2, ...), - // only serialize those. - if captures.len() > 1 { - for i in 1..captures.len() { - // Start from 1 - if let Some(cap) = captures.get(i) { - let raw_value = String::from_utf8_lossy(cap.as_bytes()); - let raw_interned = intern(raw_value.as_ref()); - let name = capture_names.get(i).and_then(|opt| *opt); - - serialized_captures.push(SerializableCapture { - name, - match_number: i32::try_from(i).unwrap_or(0), - start: cap.start(), - end: cap.end(), - value: raw_interned, - }); - } - } - } else if captures.len() == 1 { - // ELSE, if there is ONLY the full match (len == 1), - // serialize just that full match (group 0) as the fallback. - if let Some(cap) = captures.get(0) { - let raw_value = String::from_utf8_lossy(cap.as_bytes()); - let raw_interned = intern(raw_value.as_ref()); - let name = capture_names.get(0).and_then(|opt| *opt); - - serialized_captures.push(SerializableCapture { - name, - match_number: 0, - start: cap.start(), - end: cap.end(), - value: raw_interned, - }); - } - } - // If len == 0 (no match), loop is skipped, empty vec is returned. 
- - SerializableCaptures { captures: serialized_captures } - } -} -// ------------------------------------------------------------------------------------------------- -// Match -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Clone, Serialize, JsonSchema)] -pub struct Match { - /// The location of the entire matching content - pub location: Location, - - /// The capture groups - pub groups: SerializableCaptures, // Store serialized captures - - /// unique identifier of file / blob where this match was found - pub blob_id: BlobId, - - /// The unique content-based identifier of this match - pub finding_fingerprint: u64, - - /// The rule that produced this match - #[serde(skip_serializing)] - #[schemars(skip)] - pub rule: Arc, - - /// Validation Body - #[serde( - default, - serialize_with = "validation_body::serialize", - deserialize_with = "validation_body::deserialize" - )] - #[schemars(schema_with = "validation_body::schema")] - pub validation_response_body: ValidationResponseBody, - - /// Validation Status Code - pub validation_response_status: u16, - - /// Validation Success - pub validation_success: bool, - - /// Validation Success - pub calculated_entropy: f32, - - pub visible: bool, - #[serde(default)] - pub is_base64: bool, - - /// Variables captured from dependent rules (from depends_on_rule). - /// Maps variable name (uppercase) to captured value. - #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")] - pub dependent_captures: std::collections::BTreeMap, -} -impl Match { - #[inline] - pub fn convert_owned_blobmatch_to_match<'a>( - loc_mapping: Option<&'a LocationMapping<'a>>, - owned_blob_match: &'a OwnedBlobMatch, - origin_type: &'a str, - ) -> Self { - let offset_span = owned_blob_match.matching_input_offset_span; - // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility. 
- // See comment in from_blob_match() for why this differs from validation_dedup_key(). - let matching_finding_bytes = owned_blob_match - .captures - .captures - .get(1) - .or_else(|| owned_blob_match.captures.captures.get(0)) - .map(|capture| capture.raw_value().as_bytes()) - .unwrap_or_default(); - - // The fingerprint will be based on the content of the secret. - let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or(""); - - let source_span = - loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan { - start: SourcePoint { line: 0, column: 0 }, - end: SourcePoint { line: 0, column: 0 }, - }); - let offset_start: u64 = - owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); - let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); - - let finding_fingerprint = compute_finding_fingerprint( - finding_value_for_fp, - origin_type, // file_or_commit, - offset_start, - offset_end, - ); - - // matching_snippet - Match { - rule: owned_blob_match.rule.clone(), - visible: owned_blob_match.rule.visible().to_owned(), - location: Location::with_source_span(offset_span, Some(source_span.clone())), - groups: owned_blob_match.captures.clone(), - blob_id: owned_blob_match.blob_id, - finding_fingerprint, - validation_response_body: owned_blob_match.validation_response_body.clone(), - validation_response_status: owned_blob_match.validation_response_status.as_u16(), - validation_success: owned_blob_match.validation_success, - calculated_entropy: owned_blob_match.calculated_entropy, - is_base64: owned_blob_match.is_base64, - dependent_captures: owned_blob_match.dependent_captures.clone(), - } - } - - /// Returns the `blob_id` of the match. 
- pub fn get_blob_id(&self) -> BlobId { - self.blob_id.clone() - } - - pub fn finding_id(&self) -> String { - let mut buffer = Vec::with_capacity(128); - buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes()); - buffer.push(0); - serde_json::to_writer(&mut buffer, &self.groups) - .expect("should be able to serialize groups as JSON"); - let mut num = xxh3_64(&buffer); - // Ensure the number is positive and within i64 range - num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive - // Convert to string - num.to_string() - } -} -#[derive(Debug, Clone)] -pub struct DecodedData { - pub decoded: Vec, - pub pos_start: usize, - pub pos_end: usize, -} -#[inline] -fn is_base64_byte(b: u8) -> bool { - // Accepts both standard base64 ('+', '/') and URL-safe base64 ('-', '_') characters. - matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_') -} - -pub fn get_base64_strings(input: &[u8]) -> Vec { - let mut results = Vec::new(); - let mut i = 0; - while i < input.len() { - while i < input.len() && !is_base64_byte(input[i]) { - i += 1; - } - let start = i; - while i < input.len() && is_base64_byte(input[i]) { - i += 1; - } - - let mut eq_count = 0; - while i < input.len() && input[i] == b'=' && eq_count < 2 { - i += 1; - eq_count += 1; - } - let end = i; - - let len = end - start; - if len >= 32 && len % 4 == 0 { - let base64_slice = &input[start..end]; - - // Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD - let decode_result = general_purpose::STANDARD - .decode(base64_slice) - .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice)) - .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice)); - - if let Ok(decoded) = decode_result { - if decoded.is_ascii() { - results.push(DecodedData { decoded, pos_start: start, pos_end: end }); - } - } - } - } - - results -} - -pub fn compute_finding_fingerprint( - finding_value: &str, - file_or_commit: &str, - offset_start: u64, - offset_end: u64, 
-) -> u64 { - // Combine all into a byte buffer and hash it directly: - let mut buf = Vec::with_capacity( - finding_value.len() + file_or_commit.len() + 2 * std::mem::size_of::(), - ); - buf.extend_from_slice(finding_value.as_bytes()); - buf.extend_from_slice(file_or_commit.as_bytes()); - buf.extend_from_slice(&offset_start.to_le_bytes()); - buf.extend_from_slice(&offset_end.to_le_bytes()); - - xxh3_64(&buf) -} // ------------------------------------------------------------------------------------------------- // test @@ -1220,6 +530,7 @@ mod test { use super::*; use crate::{ blob::{Blob, BlobIdMap}, + entropy::calculate_shannon_entropy, origin::{Origin, OriginSet}, rules::rule::{ DependsOnRule, HttpRequest, HttpValidation, PatternRequirements, RuleSyntax, Validation, @@ -1341,8 +652,6 @@ mod test { let input = "some test data for vectorscan"; let seen_blobs: BlobIdMap = BlobIdMap::new(); let enable_rule_profiling = true; - // let mut matcher = Matcher::new(&rules_db, &seen_blobs, None, - // enable_rule_profiling)?; let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone()))); let mut matcher = Matcher::new( &rules_db, @@ -1513,7 +822,7 @@ mod test { assert_eq!((item.pos_start, item.pos_end), (4, 4 + base64_payload.len())); } - /// `compute_finding_fingerprint` must be stable (same input ⇒ same output) + /// `compute_finding_fingerprint` must be stable (same input => same output) /// and sensitive to any input component. #[test] fn test_finding_fingerprint_stability_and_uniqueness() { @@ -1535,7 +844,7 @@ mod test { /// keys as soon as *anything* changes. 
#[test] fn test_compute_match_key_uniqueness() { - use super::compute_match_key; + use super::dedup::compute_match_key; let k1 = compute_match_key(b"abc", b"rule-1", 0, 3); let k2 = compute_match_key(b"abc", b"rule-1", 0, 3); @@ -1721,6 +1030,8 @@ line2 #[test] fn serializes_captures_in_numeric_order() { + use regex::bytes::Regex; + let re = Regex::new(r"(?xi)\b(ghp_(?P[A-Z0-9]{3})(?P[A-Z0-9]{2}))").unwrap(); let caps = re.captures(b"ghp_ABC12").expect("expected captures"); diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs index a7ffc87..e07cec7 100644 --- a/src/scanner/runner.rs +++ b/src/scanner/runner.rs @@ -42,9 +42,18 @@ use crate::{ AccessMapCollector, }, util::set_redaction_enabled, + validation::CachedResponse, validation_rate_limit::ValidationRateLimiter, }; +/// Shared validation dependencies: (liquid parser, HTTP clients, validation cache, rate limiter). +type ValidationDeps = Arc<( + liquid::Parser, + crate::validation::ValidationClients, + Arc>, + Option>, +)>; + pub async fn run_scan( global_args: &global::GlobalArgs, scan_args: &scan::ScanArgs, @@ -64,22 +73,9 @@ pub async fn run_async_scan( rules_db: &RulesDatabase, update_status: &crate::update::UpdateStatus, ) -> Result<()> { - // Ensure all provided paths exist before proceeding - for path in &args.input_specifier_args.path_inputs { - if !path.exists() { - error!("Specified input path does not exist: {}", path.display()); - bail!("Invalid input: Path does not exist - {}", path.display()); - } - } - - // Register user-provided allow-list patterns - for pattern in &args.skip_regex { - safe_list::add_user_regex(pattern) - .map_err(|e| anyhow::anyhow!("Invalid skip-regex '{pattern}': {e}"))?; - } - for word in &args.skip_word { - safe_list::add_user_skipword(word); - } + // ── Phase 1: Input validation and environment setup ────────────────── + validate_inputs(args)?; + register_safe_list_patterns(args)?; let start_time = Instant::now(); let scan_started_at = chrono::Local::now(); @@ -90,123 
+86,26 @@ pub async fn run_async_scan( set_redaction_enabled(args.redact); - let mut repo_urls = enumerate_github_repos(args, global_args).await?; - let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?; - let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?; - let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?; - let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?; - let azure_repo_urls = enumerate_azure_repos(args, global_args).await?; - - // Combine repository URLs - repo_urls.extend(gitlab_repo_urls); - repo_urls.extend(gitea_repo_urls); - repo_urls.extend(huggingface_repo_urls); - repo_urls.extend(bitbucket_repo_urls); - repo_urls.extend(azure_repo_urls); - - // Add wiki repositories for each URL when requested - if args.input_specifier_args.repo_artifacts { - let mut wiki_urls = Vec::new(); - for url in &repo_urls { - if let Some(w) = github::wiki_url(url) { - wiki_urls.push(w); - } - if let Some(w) = gitlab::wiki_url(url) { - wiki_urls.push(w); - } - if let Some(w) = gitea::wiki_url(url) { - wiki_urls.push(w); - } - if let Some(w) = bitbucket::wiki_url(url) { - wiki_urls.push(w); - } - if let Some(w) = azure::wiki_url(url) { - wiki_urls.push(w); - } - } - repo_urls.extend(wiki_urls); - } - - // just sort and dedup once - repo_urls.sort(); - repo_urls.dedup(); + // ── Phase 2: Repository enumeration ───────────────────────────────── + let repo_urls = enumerate_all_repos(args, global_args).await?; let mut input_roots = args.input_specifier_args.path_inputs.clone(); let (repo_tx, repo_rx) = crossbeam_channel::unbounded(); - let repo_clone_handle = if repo_urls.is_empty() { - None - } else { - let clone_args = args.clone(); - let clone_globals = global_args.clone(); - let clone_repo_urls = repo_urls.clone(); - let clone_datastore = Arc::clone(&datastore); - let clone_repo_tx = repo_tx.clone(); - Some(std::thread::spawn(move || { - if let Err(e) = 
clone_or_update_git_repos_streaming( - &clone_args, - &clone_globals, - &clone_repo_urls, - &clone_datastore, - |path| { - let _ = clone_repo_tx.send(path); - }, - ) { - error!("Failed to fetch one or more Git repositories: {e}"); - } - })) - }; - drop(repo_tx); + let repo_clone_handle = + start_repo_cloning(&repo_urls, args, global_args, &datastore, repo_tx, progress_enabled); - // Fetch issues, gists, and wikis if enabled - let bitbucket_auth = bitbucket::AuthConfig::from_env(); - let bitbucket_host = - args.input_specifier_args.bitbucket_api_url.host_str().map(|s| s.to_string()); - - if args.input_specifier_args.repo_artifacts { - let repo_artifact_dirs = fetch_git_host_artifacts( - &repo_urls, - &args.input_specifier_args.bitbucket_api_url, - &bitbucket_auth, - bitbucket_host.clone(), - global_args, - &datastore, - ) - .await?; - input_roots.extend(repo_artifact_dirs); - } - // Fetch Jira issues if requested - let jira_dirs = fetch_jira_issues(args, global_args, &datastore).await?; - input_roots.extend(jira_dirs); - - // Fetch Confluence pages if requested - let confluence_dirs = fetch_confluence_pages(args, global_args, &datastore).await?; - input_roots.extend(confluence_dirs); - - // Fetch Slack messages if requested - let slack_dirs = fetch_slack_messages(args, global_args, &datastore).await?; - input_roots.extend(slack_dirs); - - // Save Docker images if specified - if !args.input_specifier_args.docker_image.is_empty() { - let clone_root = { - let ds = datastore.lock().unwrap(); - ds.clone_root() - }; - let docker_dirs = save_docker_images( - &args.input_specifier_args.docker_image, - &clone_root, - progress_enabled, - ) - .await?; - for (dir, img) in docker_dirs { - { - let mut ds = datastore.lock().unwrap(); - ds.register_docker_image(dir.clone(), img); - } - input_roots.push(dir); - } - } + // ── Phase 3: Artifact fetching ────────────────────────────────────── + fetch_all_artifacts( + args, + global_args, + &repo_urls, + &datastore, + &mut input_roots, 
+ progress_enabled, + ) + .await?; + // ── Phase 4: Scan configuration ───────────────────────────────────── let shared_profiler = Arc::new(ConcurrentRuleProfiler::new()); let enable_profiling = args.rule_stats; let matcher_stats = Arc::new(Mutex::new(MatcherStats::default())); @@ -246,11 +145,257 @@ pub async fn run_async_scan( .unwrap_or_else(|| std::path::PathBuf::from("baseline-file.yaml")), ); - let mut skip_aws_accounts = args.skip_aws_account.clone(); + let skip_aws_accounts = load_skip_aws_accounts(args)?; + crate::validation::set_skip_aws_account_ids(skip_aws_accounts); let mut access_map_collector = if args.access_map { Some(AccessMapCollector::default()) } else { None }; + let repo_roots = expand_repo_roots(&input_roots)?; + let git_repo_count = + repo_roots.iter().filter(|p| p.join(".git").is_dir()).count() + repo_urls.len(); + let use_parallel_repo_scan = git_repo_count > 10; + + let validation_rate_limiter = + ValidationRateLimiter::from_cli(args.validation_rps, &args.validation_rps_rule)? 
+ .map(Arc::new); + + let validation_deps: Option = if !args.no_validate { + info!("Starting secret validation phase..."); + Some(Arc::new(( + register_all(liquid::ParserBuilder::with_stdlib()).build()?, + crate::validation::ValidationClients::new(global_args.tls_mode)?, + Arc::new(SkipMap::new()), + validation_rate_limiter.clone(), + ))) + } else { + None + }; + + // ── Phase 5: Scanning ─────────────────────────────────────────────── + if !use_parallel_repo_scan { + run_sequential_scan( + args, + global_args, + &datastore, + rules_db, + &mut input_roots, + repo_rx, + repo_clone_handle, + &shared_profiler, + enable_profiling, + &matcher_stats, + &baseline_path, + &validation_deps, + &mut access_map_collector, + progress_enabled, + start_time, + scan_started_at, + update_status, + ) + .await?; + return Ok(()); + } + + run_parallel_scan( + args, + global_args, + &datastore, + rules_db, + &repo_roots, + repo_rx, + repo_clone_handle, + &shared_profiler, + enable_profiling, + &matcher_stats, + &baseline_path, + &validation_deps, + &mut access_map_collector, + progress_enabled, + start_time, + scan_started_at, + update_status, + ) + .await +} + +// ================================================================================================= +// Phase helpers +// ================================================================================================= + +/// Validates that all provided input paths exist. +fn validate_inputs(args: &scan::ScanArgs) -> Result<()> { + for path in &args.input_specifier_args.path_inputs { + if !path.exists() { + error!("Specified input path does not exist: {}", path.display()); + bail!("Invalid input: Path does not exist - {}", path.display()); + } + } + Ok(()) +} + +/// Registers user-provided allow-list patterns (skip-regex and skip-word). 
+fn register_safe_list_patterns(args: &scan::ScanArgs) -> Result<()> { + for pattern in &args.skip_regex { + safe_list::add_user_regex(pattern) + .map_err(|e| anyhow::anyhow!("Invalid skip-regex '{pattern}': {e}"))?; + } + for word in &args.skip_word { + safe_list::add_user_skipword(word); + } + Ok(()) +} + +/// Enumerates repositories from all configured platforms, adds wiki URLs, and deduplicates. +async fn enumerate_all_repos( + args: &scan::ScanArgs, + global_args: &global::GlobalArgs, +) -> Result> { + let mut repo_urls = enumerate_github_repos(args, global_args).await?; + let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?; + let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?; + let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?; + let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?; + let azure_repo_urls = enumerate_azure_repos(args, global_args).await?; + + repo_urls.extend(gitlab_repo_urls); + repo_urls.extend(gitea_repo_urls); + repo_urls.extend(huggingface_repo_urls); + repo_urls.extend(bitbucket_repo_urls); + repo_urls.extend(azure_repo_urls); + + // Add wiki repositories for each URL when requested + if args.input_specifier_args.repo_artifacts { + let mut wiki_urls = Vec::new(); + for url in &repo_urls { + if let Some(w) = github::wiki_url(url) { + wiki_urls.push(w); + } + if let Some(w) = gitlab::wiki_url(url) { + wiki_urls.push(w); + } + if let Some(w) = gitea::wiki_url(url) { + wiki_urls.push(w); + } + if let Some(w) = bitbucket::wiki_url(url) { + wiki_urls.push(w); + } + if let Some(w) = azure::wiki_url(url) { + wiki_urls.push(w); + } + } + repo_urls.extend(wiki_urls); + } + + repo_urls.sort(); + repo_urls.dedup(); + + Ok(repo_urls) +} + +/// Spawns a background thread to clone/update git repositories, streaming results via a channel. 
+fn start_repo_cloning( + repo_urls: &[crate::git_url::GitUrl], + args: &scan::ScanArgs, + global_args: &global::GlobalArgs, + datastore: &Arc>, + repo_tx: crossbeam_channel::Sender, + _progress_enabled: bool, +) -> Option> { + if repo_urls.is_empty() { + drop(repo_tx); + return None; + } + + let clone_args = args.clone(); + let clone_globals = global_args.clone(); + let clone_repo_urls = repo_urls.to_vec(); + let clone_datastore = Arc::clone(datastore); + let clone_repo_tx = repo_tx.clone(); + + let handle = std::thread::spawn(move || { + if let Err(e) = clone_or_update_git_repos_streaming( + &clone_args, + &clone_globals, + &clone_repo_urls, + &clone_datastore, + |path| { + let _ = clone_repo_tx.send(path); + }, + ) { + error!("Failed to fetch one or more Git repositories: {e}"); + } + }); + drop(repo_tx); + Some(handle) +} + +/// Fetches artifacts from various platforms (issues, wikis, Jira, Confluence, Slack, Docker). +async fn fetch_all_artifacts( + args: &scan::ScanArgs, + global_args: &global::GlobalArgs, + repo_urls: &[crate::git_url::GitUrl], + datastore: &Arc>, + input_roots: &mut Vec, + progress_enabled: bool, +) -> Result<()> { + let bitbucket_auth = bitbucket::AuthConfig::from_env(); + let bitbucket_host = + args.input_specifier_args.bitbucket_api_url.host_str().map(|s| s.to_string()); + + if args.input_specifier_args.repo_artifacts { + let repo_artifact_dirs = fetch_git_host_artifacts( + repo_urls, + &args.input_specifier_args.bitbucket_api_url, + &bitbucket_auth, + bitbucket_host.clone(), + global_args, + datastore, + ) + .await?; + input_roots.extend(repo_artifact_dirs); + } + + // Fetch Jira issues if requested + let jira_dirs = fetch_jira_issues(args, global_args, datastore).await?; + input_roots.extend(jira_dirs); + + // Fetch Confluence pages if requested + let confluence_dirs = fetch_confluence_pages(args, global_args, datastore).await?; + input_roots.extend(confluence_dirs); + + // Fetch Slack messages if requested + let slack_dirs = 
fetch_slack_messages(args, global_args, datastore).await?; + input_roots.extend(slack_dirs); + + // Save Docker images if specified + if !args.input_specifier_args.docker_image.is_empty() { + let clone_root = { + let ds = datastore.lock().unwrap(); + ds.clone_root() + }; + let docker_dirs = save_docker_images( + &args.input_specifier_args.docker_image, + &clone_root, + progress_enabled, + ) + .await?; + for (dir, img) in docker_dirs { + { + let mut ds = datastore.lock().unwrap(); + ds.register_docker_image(dir.clone(), img); + } + input_roots.push(dir); + } + } + + Ok(()) +} + +/// Loads AWS account IDs to skip from CLI args and optional file. +fn load_skip_aws_accounts(args: &scan::ScanArgs) -> Result> { + let mut skip_aws_accounts = args.skip_aws_account.clone(); + if let Some(path) = args.skip_aws_account_file.as_ref() { let contents = fs::read_to_string(path).with_context(|| { format!("Failed to read --skip-aws-account-file {}", path.display()) @@ -267,183 +412,214 @@ pub async fn run_async_scan( } } - crate::validation::set_skip_aws_account_ids(skip_aws_accounts); + Ok(skip_aws_accounts) +} - let repo_roots = expand_repo_roots(&input_roots)?; - let git_repo_count = - repo_roots.iter().filter(|p| p.join(".git").is_dir()).count() + repo_urls.len(); - let use_parallel_repo_scan = git_repo_count > 10; - - let validation_rate_limiter = - ValidationRateLimiter::from_cli(args.validation_rps, &args.validation_rps_rule)? 
- .map(Arc::new); - - let validation_deps = if !args.no_validate { - info!("Starting secret validation phase..."); - Some(Arc::new(( - register_all(liquid::ParserBuilder::with_stdlib()).build()?, - crate::validation::ValidationClients::new(global_args.tls_mode)?, - Arc::new(SkipMap::new()), - validation_rate_limiter.clone(), - ))) - } else { - None - }; - - if !use_parallel_repo_scan { - let mut streamed_roots = Vec::new(); - if !input_roots.is_empty() { - let _inputs = enumerate_filesystem_inputs( - args, - datastore.clone(), - &input_roots, - progress_enabled, - rules_db, - enable_profiling, - Arc::clone(&shared_profiler), - matcher_stats.as_ref(), - )?; - } - - for repo_root in repo_rx.clone().iter() { - enumerate_filesystem_inputs( - args, - datastore.clone(), - &[repo_root.clone()], - progress_enabled, - rules_db, - enable_profiling, - Arc::clone(&shared_profiler), - matcher_stats.as_ref(), - )?; - streamed_roots.push(repo_root); - } - input_roots.extend(streamed_roots); - - if let Some(handle) = repo_clone_handle { - let _ = handle.join(); - } - - if !args.no_dedup { - let reporter = crate::reporter::DetailsReporter { - datastore: Arc::clone(&datastore), - styles: Styles::new(global_args.use_color(std::io::stdout())), - only_valid: args.only_valid, - }; - - let all_matches = reporter.get_unfiltered_matches(Some(false))?; - let deduped_matches = reporter.deduplicate_matches(all_matches, args.no_dedup); - - let deduped_arcs: Vec> = deduped_matches - .into_iter() - .map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m))) - .collect(); - let mut ds = datastore.lock().unwrap(); - ds.replace_matches(deduped_arcs); - } - - if args.baseline_file.is_some() || args.manage_baseline { - let mut ds = datastore.lock().unwrap(); - crate::baseline::apply_baseline( - &mut ds, - baseline_path.as_ref(), - args.manage_baseline, - &input_roots, - )?; - } - - if let Some(validation) = &validation_deps { - let (parser, clients, cache, rate_limiter) = - 
(&validation.0, &validation.1, &validation.2, &validation.3); - run_secret_validation( - Arc::clone(&datastore), - parser, - clients, - cache, - args.num_jobs, - None, - access_map_collector.clone(), - rate_limiter.clone(), - Duration::from_secs(args.validation_timeout), - args.validation_retries, - ) - .await?; - } - - if let Some(collector) = access_map_collector.take() { - finalize_access_map(&datastore, collector, args).await?; - } - - crate::reporter::run(global_args, Arc::clone(&datastore), args) - .context("Failed to run report command")?; - print_scan_summary( - start_time, - scan_started_at, - &datastore, - global_args, - args, - rules_db, - matcher_stats.as_ref(), - if enable_profiling { Some(shared_profiler.as_ref()) } else { None }, - update_status, - None, - None, - ); - maybe_hint_access_map(&datastore, args); +/// Deduplicates matches in the datastore starting from `start_index`. +fn deduplicate_new_matches( + store: &Arc>, + global_args: &global::GlobalArgs, + args: &scan::ScanArgs, + start_index: usize, +) -> Result<()> { + if args.no_dedup { return Ok(()); } - let deduplicate_new_matches = - |store: &Arc>, start_index: usize| -> Result<()> { - if args.no_dedup { - return Ok(()); - } + let reporter = crate::reporter::DetailsReporter { + datastore: Arc::clone(store), + styles: Styles::new(global_args.use_color(std::io::stdout())), + only_valid: args.only_valid, + }; - let reporter = crate::reporter::DetailsReporter { - datastore: Arc::clone(store), - styles: Styles::new(global_args.use_color(std::io::stdout())), - only_valid: args.only_valid, - }; + let all_matches = reporter.get_unfiltered_matches(Some(false))?; + if start_index >= all_matches.len() { + return Ok(()); + } - let all_matches = reporter.get_unfiltered_matches(Some(false))?; - if start_index >= all_matches.len() { - return Ok(()); - } + let slice = if start_index == 0 { all_matches } else { all_matches[start_index..].to_vec() }; + let deduped_matches = 
reporter.deduplicate_matches(slice, args.no_dedup); - let deduped_matches = - reporter.deduplicate_matches(all_matches[start_index..].to_vec(), args.no_dedup); + let deduped_arcs: Vec> = deduped_matches + .into_iter() + .map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m))) + .collect(); - let deduped_arcs: Vec> = deduped_matches - .into_iter() - .map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m))) - .collect(); - - let mut ds = store.lock().unwrap(); - let mut preserved = ds.get_matches()[..start_index].to_vec(); - preserved.extend(deduped_arcs); - ds.replace_matches(preserved); - Ok(()) - }; - - deduplicate_new_matches(&datastore, 0)?; + let mut ds = store.lock().unwrap(); + if start_index == 0 { + ds.replace_matches(deduped_arcs); + } else { + let mut preserved = ds.get_matches()[..start_index].to_vec(); + preserved.extend(deduped_arcs); + ds.replace_matches(preserved); + } + Ok(()) +} +/// Applies baseline filtering if configured. +fn apply_baseline_if_configured( + args: &scan::ScanArgs, + datastore: &Arc>, + baseline_path: &std::path::Path, + roots: &[PathBuf], +) -> Result<()> { if args.baseline_file.is_some() || args.manage_baseline { let mut ds = datastore.lock().unwrap(); - crate::baseline::apply_baseline( - &mut ds, - baseline_path.as_ref(), - args.manage_baseline, - &repo_roots, + crate::baseline::apply_baseline(&mut ds, baseline_path, args.manage_baseline, roots)?; + } + Ok(()) +} + +/// Runs the validation phase on matches in the datastore. 
+#[allow(clippy::too_many_arguments)] +async fn run_validation_phase( + datastore: &Arc>, + validation_deps: &Option, + args: &scan::ScanArgs, + match_range: Option>, + access_map_collector: Option, +) -> Result<()> { + if let Some(validation) = validation_deps { + let (parser, clients, cache, rate_limiter) = + (&validation.0, &validation.1, &validation.2, &validation.3); + run_secret_validation( + Arc::clone(datastore), + parser, + clients, + cache, + args.num_jobs, + match_range, + access_map_collector, + rate_limiter.clone(), + Duration::from_secs(args.validation_timeout), + args.validation_retries, + ) + .await?; + } + Ok(()) +} + +// ================================================================================================= +// Sequential scan path +// ================================================================================================= + +#[allow(clippy::too_many_arguments)] +async fn run_sequential_scan( + args: &scan::ScanArgs, + global_args: &global::GlobalArgs, + datastore: &Arc>, + rules_db: &RulesDatabase, + input_roots: &mut Vec, + repo_rx: crossbeam_channel::Receiver, + repo_clone_handle: Option>, + shared_profiler: &Arc, + enable_profiling: bool, + matcher_stats: &Arc>, + baseline_path: &Arc, + validation_deps: &Option, + access_map_collector: &mut Option, + progress_enabled: bool, + start_time: Instant, + scan_started_at: chrono::DateTime, + update_status: &crate::update::UpdateStatus, +) -> Result<()> { + let mut streamed_roots = Vec::new(); + if !input_roots.is_empty() { + let _inputs = enumerate_filesystem_inputs( + args, + datastore.clone(), + input_roots, + progress_enabled, + rules_db, + enable_profiling, + Arc::clone(shared_profiler), + matcher_stats.as_ref(), )?; } - if let Some(validation) = &validation_deps { + for repo_root in repo_rx.iter() { + enumerate_filesystem_inputs( + args, + datastore.clone(), + &[repo_root.clone()], + progress_enabled, + rules_db, + enable_profiling, + Arc::clone(shared_profiler), + 
matcher_stats.as_ref(), + )?; + streamed_roots.push(repo_root); + } + input_roots.extend(streamed_roots); + + if let Some(handle) = repo_clone_handle { + let _ = handle.join(); + } + + deduplicate_new_matches(datastore, global_args, args, 0)?; + apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), input_roots)?; + + run_validation_phase(datastore, validation_deps, args, None, access_map_collector.clone()) + .await?; + + if let Some(collector) = access_map_collector.take() { + finalize_access_map(datastore, collector, args).await?; + } + + crate::reporter::run(global_args, Arc::clone(datastore), args) + .context("Failed to run report command")?; + print_scan_summary( + start_time, + scan_started_at, + datastore, + global_args, + args, + rules_db, + matcher_stats.as_ref(), + if enable_profiling { Some(shared_profiler.as_ref()) } else { None }, + update_status, + None, + None, + ); + maybe_hint_access_map(datastore, args); + Ok(()) +} + +// ================================================================================================= +// Parallel scan path +// ================================================================================================= + +#[allow(clippy::too_many_arguments)] +async fn run_parallel_scan( + args: &scan::ScanArgs, + global_args: &global::GlobalArgs, + datastore: &Arc>, + rules_db: &RulesDatabase, + repo_roots: &[PathBuf], + repo_rx: crossbeam_channel::Receiver, + repo_clone_handle: Option>, + shared_profiler: &Arc, + enable_profiling: bool, + matcher_stats: &Arc>, + baseline_path: &Arc, + validation_deps: &Option, + access_map_collector: &mut Option, + progress_enabled: bool, + start_time: Instant, + scan_started_at: chrono::DateTime, + update_status: &crate::update::UpdateStatus, +) -> Result<()> { + deduplicate_new_matches(datastore, global_args, args, 0)?; + apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), repo_roots)?; + + // Validate initial (non-repo) matches + if let Some(validation) = 
validation_deps { let (parser, clients, cache, rate_limiter) = (&validation.0, &validation.1, &validation.2, &validation.3); let initial_match_count = { datastore.lock().unwrap().get_matches().len() }; if initial_match_count > 0 { run_secret_validation( - Arc::clone(&datastore), + Arc::clone(datastore), parser, clients, cache, @@ -458,6 +634,7 @@ pub async fn run_async_scan( } } + // Parallel per-repo scanning let repo_concurrency = std::cmp::max(1, args.num_jobs); let rt_handle = Handle::current(); @@ -476,16 +653,16 @@ pub async fn run_async_scan( let spawn_repo_scan = |root: PathBuf| { let repo_rules = repo_rules.clone(); let base_clone_root = base_clone_root.clone(); - let baseline_path = Arc::clone(&baseline_path); - let shared_profiler = Arc::clone(&shared_profiler); + let baseline_path = Arc::clone(baseline_path); + let shared_profiler = Arc::clone(shared_profiler); let args = args.clone(); let root = root.clone(); let validation_deps = validation_deps.clone(); - let matcher_stats = Arc::clone(&matcher_stats); + let matcher_stats = Arc::clone(matcher_stats); let rt_handle = rt_handle.clone(); let ran_repo_scan = Arc::clone(&ran_repo_scan); let repo_errors = Arc::clone(&repo_errors); - let datastore = Arc::clone(&datastore); + let datastore = Arc::clone(datastore); let access_map = access_map_collector.clone(); scope.spawn(move |_| { @@ -509,7 +686,9 @@ pub async fn run_async_scan( Arc::clone(&shared_profiler), &repo_matcher_stats, ) - .and_then(|_| deduplicate_new_matches(&repo_datastore, 0))?; + .and_then(|_| { + deduplicate_new_matches(&repo_datastore, global_args, &args, 0) + })?; if args.baseline_file.is_some() || args.manage_baseline { let mut ds = repo_datastore.lock().unwrap(); @@ -568,11 +747,11 @@ pub async fn run_async_scan( }); }; - for root in repo_roots.clone() { + for root in repo_roots.iter().cloned() { spawn_repo_scan(root); } - for root in repo_rx.clone().iter() { + for root in repo_rx.iter() { spawn_repo_scan(root); } }); @@ -586,51 +765,27 
@@ pub async fn run_async_scan( } if output_to_file && ran_repo_scan.load(Ordering::Relaxed) { - crate::reporter::run(global_args, Arc::clone(&datastore), args) + crate::reporter::run(global_args, Arc::clone(datastore), args) .context("Failed to run report command")?; } if !ran_repo_scan.load(Ordering::Relaxed) { - deduplicate_new_matches(&datastore, 0)?; + deduplicate_new_matches(datastore, global_args, args, 0)?; + apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), repo_roots)?; - if args.baseline_file.is_some() || args.manage_baseline { - let mut ds = datastore.lock().unwrap(); - crate::baseline::apply_baseline( - &mut ds, - baseline_path.as_ref(), - args.manage_baseline, - &repo_roots, - )?; - } - - if let Some(validation) = &validation_deps { - let (parser, clients, cache, rate_limiter) = - (&validation.0, &validation.1, &validation.2, &validation.3); - run_secret_validation( - Arc::clone(&datastore), - parser, - clients, - cache, - args.num_jobs, - None, - access_map_collector.clone(), - rate_limiter.clone(), - Duration::from_secs(args.validation_timeout), - args.validation_retries, - ) + run_validation_phase(datastore, validation_deps, args, None, access_map_collector.clone()) .await?; - } if let Some(collector) = access_map_collector.take() { - finalize_access_map(&datastore, collector, args).await?; + finalize_access_map(datastore, collector, args).await?; } - crate::reporter::run(global_args, Arc::clone(&datastore), args) + crate::reporter::run(global_args, Arc::clone(datastore), args) .context("Failed to run report command")?; } let aggregate_summary = if ran_repo_scan.load(Ordering::Relaxed) { - let totals = compute_scan_totals(&datastore, args, matcher_stats.as_ref()); + let totals = compute_scan_totals(datastore, args, matcher_stats.as_ref()); let mut sorted: Vec<_> = datastore.lock().unwrap().get_summary().into_iter().collect(); sorted.sort_by(|a, b| b.1.cmp(&a.1)); Some((totals, sorted)) @@ -641,7 +796,7 @@ pub async fn 
run_async_scan( print_scan_summary( start_time, scan_started_at, - &datastore, + datastore, global_args, args, rules_db, @@ -652,14 +807,18 @@ pub async fn run_async_scan( aggregate_summary, ); - if let Some(collector) = access_map_collector { - finalize_access_map(&datastore, collector, args).await?; + if let Some(collector) = access_map_collector.take() { + finalize_access_map(datastore, collector, args).await?; } else { - maybe_hint_access_map(&datastore, args); + maybe_hint_access_map(datastore, args); } Ok(()) } +// ================================================================================================= +// Existing helper functions (unchanged) +// ================================================================================================= + async fn finalize_access_map( datastore: &Arc>, collector: AccessMapCollector, @@ -780,8 +939,6 @@ pub fn create_datastore_channel( ) { const BATCH_SIZE: usize = 1024; let channel_size = std::cmp::max(num_jobs * BATCH_SIZE, 16 * BATCH_SIZE); - // const BATCH_SIZE: usize = 256; - // let channel_size = std::cmp::max(num_jobs * BATCH_SIZE, 4096); crossbeam_channel::bounded(channel_size) } @@ -864,7 +1021,6 @@ pub fn load_and_record_rules( ) -> Result { let init_progress = if use_progress { ProgressBar::new_spinner() } else { ProgressBar::hidden() }; - // init_progress.set_message("Compiling rules..."); let rules_db = { let loaded = RuleLoader::from_rule_specifiers(&args.rules) .load(args) @@ -876,7 +1032,6 @@ pub fn load_and_record_rules( .cloned() .map(|mut rule| { if let Some(min_entropy) = args.min_entropy { - // rule.syntax.min_entropy = min_entropy; let _ = rule.set_entropy(min_entropy); } rule