diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e57853..fb350a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,15 @@ All notable changes to this project will be documented in this file. +## [1.48.0] +- Improved error message when self-update cannot find the current binary +- Optimized memory usage via string interning and extensive data sharing +- Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance +- Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets +- Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB +- Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files +- Increased max-file-size default to 64 MB (up from 25 MB) + ## [1.47.0] - MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them - Improved rules: github oauth2, diffbot, mailchimp, aws diff --git a/Cargo.toml b/Cargo.toml index 4e774bf..797ef62 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.47.0" +version = "1.48.0" description = "MongoDB's blazingly fast secret scanning and validation tool" edition.workspace = true rust-version.workspace = true @@ -65,7 +65,7 @@ smallvec = { version = "1", features = [ tracing = "0.1.41" indicatif = { version = "0.17", features = ["improved_unicode"] } rayon = "1.10" -sha1 = "0.10.6" +sha1 = { version = "0.10.6", features = ["asm"] } hex = "0.4.3" vectorscan-rs = "0.0.5" regex = "1.11.1" @@ -151,6 +151,7 @@ tar = "0.4.44" xz2 = "0.1.7" asar = "0.3.0" blake3 = "1.8.2" +memchr = "2.7" memmap2 = "0.9.7" futures = "0.3.31" dashmap = "6.1.0" diff --git a/README.md b/README.md index cccbf34..1da58dd 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,64 @@ See 
([docs/COMPARISON.md](docs/COMPARISON.md)) Kingfisher Runtime Comparison

+- [Kingfisher](#kingfisher) + - [Key Features](#key-features) +- [Benchmark Results](#benchmark-results) +- [Getting Started](#getting-started) + - [Installation](#installation) + - [Run Kingfisher in Docker](#run-kingfisher-in-docker) +- [🔍 Detection Rules at a Glance](#-detection-rules-at-a-glance) + - [Write Custom Rules!](#write-custom-rules) +- [Usage](#usage) + - [Basic Examples](#basic-examples) + - [Scan with secret validation](#scan-with-secret-validation) + - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories) + - [Scan a Git repository without validation](#scan-a-git-repository-without-validation) + - [Display only secrets confirmed active by third‑party APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis) + - [Output JSON and capture to a file](#output-json-and-capture-to-a-file) + - [Output SARIF directly to disk](#output-sarif-directly-to-disk) + - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--) + - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size) + - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag) + - [Display rule performance statistics](#display-rule-performance-statistics) + - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files) + - [Exclude specific paths](#exclude-specific-paths) + - [Scan an S3 bucket](#scan-an-s3-bucket) + - [Scanning Docker Images](#scanning-docker-images) + - [Scanning GitHub](#scanning-github) + - [Scan GitHub organisation (requires `KF_GITHUB_TOKEN`)](#scan-github-organisation-requires-kf_github_token) + - [Scan remote GitHub repository](#scan-remote-github-repository) + - [Scanning GitLab](#scanning-gitlab) + - [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token) + - [Scan GitLab user](#scan-gitlab-user) + - [Scan remote GitLab 
repository by URL](#scan-remote-gitlab-repository-by-url) + - [List GitLab repositories](#list-gitlab-repositories) + - [Scanning Jira](#scanning-jira) + - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query) + - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues) + - [Scanning Confluence](#scanning-confluence) + - [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query) + - [Scanning Slack](#scanning-slack) + - [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query) + - [Environment Variables for Tokens](#environment-variables-for-tokens) + - [Exit Codes](#exit-codes) + - [Update Checks](#update-checks) +- [Advanced Options](#advanced-options) + - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets) + - [List Builtin Rules](#list-builtin-rules) + - [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run) + - [To add your rules alongside the built‑ins:](#to-add-your-rules-alongside-the-builtins) + - [Other Examples](#other-examples) + - [Notable Scan Options](#notable-scan-options) + - [Understanding `--confidence`](#understanding---confidence) + - [Ignore known false positives](#ignore-known-false-positives) + - [Finding Fingerprint](#finding-fingerprint) + - [Rule Performance Profiling](#rule-performance-profiling) + - [CLI Options](#cli-options) + - [Origins and Divergence](#origins-and-divergence) +- [Roadmap](#roadmap) +- [License](#license) + # Getting Started ## Installation @@ -225,9 +283,18 @@ cat /path/to/file.py | kingfisher scan - ``` +### Limit maximum file size scanned (`--max-file-size`) + +By default, Kingfisher skips files larger than **64 MB**. You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**. 
+ +```bash +# Scan files up to 250 MB in size +kingfisher scan /some/file --max-file-size 250 +``` + ### Scan using a rule _family_ with one flag -_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws._`)\* +_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_ ```bash # Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2) @@ -617,6 +684,14 @@ kingfisher github repos list --organization my-org - `--skip-regex `: Ignore findings whose text matches this regex (repeatable) - `--skip-word `: Ignore findings containing this case-insensitive word (repeatable) +## Understanding `--confidence` + +The `--confidence` flag sets a minimum confidence threshold, not an exact match. + +- If you pass `--confidence medium`, findings with **medium and higher** confidence (medium + high) will be included. +- If you pass `--confidence low`, you’ll see **all levels** (low, medium, high). + + ### Ignore known false positives Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. diff --git a/data/rules/jwt.yml b/data/rules/jwt.yml index e596027..b237b4c 100644 --- a/data/rules/jwt.yml +++ b/data/rules/jwt.yml @@ -5,7 +5,7 @@ rules: (?x) \b ( - ey[A-Za-z0-9_-]{12,} (?# header ) + (?:ey|ewogIC)[A-Za-z0-9_-]{12,} (?# header ) \. ey[A-Za-z0-9_-]{12,} (?# payload ) \. 
diff --git a/src/blob.rs b/src/blob.rs index a54a4a8..9b1aff1 100644 --- a/src/blob.rs +++ b/src/blob.rs @@ -10,16 +10,20 @@ use anyhow::Result; use bstr::{BString, ByteSlice}; use gix::ObjectId; use hex; +use once_cell::sync::OnceCell; use parking_lot::Mutex; use rustc_hash::FxHashMap; use serde::{Deserialize, Serialize}; use sha1::{Digest, Sha1}; use smallvec::SmallVec; +use std::sync::atomic::{AtomicU64, Ordering}; use crate::git_commit_metadata::CommitMetadata; // const LARGE_FILE_THRESHOLD: u64 = 512 * 1024; // 512 KB const LARGE_FILE_THRESHOLD: u64 = 0; // always mmap +static NEXT_ID: AtomicU64 = AtomicU64::new(1); + /// The data of a blob, either owned (small files) or memory mapped (large files). pub enum BlobData<'a> { /// Small blobs – remains as-is. @@ -75,47 +79,68 @@ pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>; /// A Git blob, storing its SHA-1 id and its contents. pub struct Blob<'a> { - pub id: BlobId, - pub data: BlobData<'a>, + id: OnceCell, + data: BlobData<'a>, + temp_id: u64, } impl Blob<'_> { #[inline] - pub fn from_file>(path: P) -> Result { let mut file = File::open(&path)?; let file_size = file.metadata()?.len(); + let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed); if file_size > LARGE_FILE_THRESHOLD { // Large files: one mmap, zero extra copies. let mmap = unsafe { memmap2::Mmap::map(&file)? }; - let id = BlobId::new(mmap.as_ref()); - Ok(Blob { id, data: BlobData::Mapped(mmap) }) + Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id }) } else { // Small files: reuse the same handle and pre-allocate exact capacity let mut bytes = Vec::with_capacity(file_size as usize); file.read_to_end(&mut bytes)?; - let id = BlobId::new(&bytes); - Ok(Blob { id, data: BlobData::Owned(bytes) }) + Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }) } } + /// Returns the blob's bytes as a slice. 
#[inline] pub fn bytes(&self) -> &[u8] { self.data.as_ref() } + /// Lazily compute and return the blob's SHA-1 `BlobId`. + #[inline] + pub fn id(&self) -> BlobId { + *self.id.get_or_init(|| BlobId::new(self.bytes())) + } + + /// Get a reference to the blob's SHA-1 `BlobId`, computing it if necessary. + #[inline] + pub fn id_ref(&self) -> &BlobId { + self.id.get_or_init(|| BlobId::new(self.bytes())) + } + + /// Return the temporary identifier assigned on blob creation. + #[inline] + pub fn temp_id(&self) -> u64 { + self.temp_id + } + /// Create a new `Blob` from a vector of bytes. #[inline] pub fn from_bytes(bytes: Vec) -> Self { - let id = BlobId::compute_from_bytes(&bytes); - Blob { id, data: BlobData::Owned(bytes) } + let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id } } /// Create a new `Blob` with the given id and data. #[inline] pub fn new(id: BlobId, bytes: Vec) -> Self { - Blob { id, data: BlobData::Owned(bytes) } + let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + let cell = OnceCell::new(); + let _ = cell.set(id); + Blob { id: cell, data: BlobData::Owned(bytes), temp_id } } #[inline] @@ -269,9 +294,15 @@ impl BlobId { /// Create a new BlobId computed from the given input. 
#[inline] pub fn new(input: &[u8]) -> Self { + const CHUNK: usize = 64 * 1024; // 64KB from start and end let mut hasher = Sha1::new(); write!(&mut hasher, "blob {}\0", input.len()).unwrap(); - hasher.update(input); + if input.len() <= CHUNK * 2 { + hasher.update(input); + } else { + hasher.update(&input[..CHUNK]); + hasher.update(&input[input.len() - CHUNK..]); + } BlobId(hasher.finalize().as_slice().try_into().expect("SHA-1 output size mismatch")) } @@ -360,9 +391,6 @@ pub struct BlobMetadata { /// The guessed multimedia type of the blob pub mime_essence: Option, - /// The guessed charset of the blob - pub charset: Option, - /// The guessed programming language of the blob pub language: Option, } @@ -384,9 +412,4 @@ impl BlobMetadata { pub fn mime_essence(&self) -> Option<&str> { self.mime_essence.as_deref() } - - #[inline] - pub fn charset(&self) -> Option<&str> { - self.charset.as_deref() - } } diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs index f0b8fa8..4f2dc5b 100644 --- a/src/cli/commands/inputs.rs +++ b/src/cli/commands/inputs.rs @@ -169,7 +169,7 @@ pub struct InputSpecifierArgs { #[derive(Args, Debug, Clone)] pub struct ContentFilteringArgs { /// Ignore files larger than the given size in MB - #[arg(long("max-file-size"), default_value_t = 25.0)] + #[arg(long("max-file-size"), default_value_t = 64.0)] pub max_file_size_mb: f64, // /// Use custom path-based ignore rules from the given file(s) diff --git a/src/content_type.rs b/src/content_type.rs index 197258d..ee44da6 100644 --- a/src/content_type.rs +++ b/src/content_type.rs @@ -96,9 +96,8 @@ impl ContentInspector { #[inline] #[must_use] pub fn guess_charset(&self, bytes: &[u8]) -> Option { - String::from_utf8(bytes.to_vec()).ok().map(|_| "UTF-8".to_string()) + std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string()) } - /// Guess programming language with broad coverage using `tokei`. 
/// /// Strategy (no disk I/O): diff --git a/src/decompress.rs b/src/decompress.rs index 49dc7ef..46ae5d5 100644 --- a/src/decompress.rs +++ b/src/decompress.rs @@ -161,11 +161,8 @@ fn handle_zip_archive_streaming( Ok(CompressedContent::ArchiveFiles(entries_on_disk)) } -fn handle_asar_archive_in_memory( - buffer: Vec, - archive_path: &Path, -) -> Result { - match AsarReader::new(&buffer, None) { +fn handle_asar_archive_in_memory(buffer: &[u8], archive_path: &Path) -> Result { + match AsarReader::new(buffer, None) { Ok(reader) => { let mut contents = Vec::new(); for (path_in_asar, file) in reader.files() { @@ -200,7 +197,7 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result { let mmap = unsafe { Mmap::map(&file)? }; - return handle_asar_archive_in_memory(mmap.to_vec(), path); + return handle_asar_archive_in_memory(&mmap, path); } "tar" => { if let Some(base) = base_dir { diff --git a/src/findings_store.rs b/src/findings_store.rs index fc610f2..23d8c7b 100644 --- a/src/findings_store.rs +++ b/src/findings_store.rs @@ -18,6 +18,7 @@ use crate::{ matcher::Match, origin::{Origin, OriginSet}, rules::rule::Rule, + util::intern, }; // share with Arc so every blob/origin is materialised once @@ -151,7 +152,7 @@ impl FindingsStore { .captures .get(1) .or_else(|| m.groups.captures.get(0)) - .map_or("", |c| c.value.as_ref()); + .map_or("", |c| c.value); let origin_kind = match origin.first() { Origin::GitRepo(_) => "git", @@ -160,7 +161,7 @@ impl FindingsStore { }; let key = xxh3_64( - format!("{}|{}|{}", m.rule_text_id.to_uppercase(), origin_kind, snippet) + format!("{}|{}|{}", m.rule.id().to_uppercase(), origin_kind, snippet) .as_bytes(), ); @@ -280,7 +281,7 @@ impl FindingsStore { pub fn get_summary(&self) -> FxHashMap<&'static str, usize> { self.matches.iter().fold(FxHashMap::default(), |mut acc, msg| { let (_, _, m) = &**msg; - *acc.entry(m.rule_name).or_insert(0) += 1; // borrow, no alloc + *acc.entry(intern(m.rule.name())).or_insert(0) += 1; acc }) } 
@@ -342,13 +343,13 @@ impl FindingsStore { self.matches.iter().map(|msg| { let (_, _, match_item) = &**msg; finding_data::FindingMetadata { - rule_name: match_item.rule_name.to_string(), + rule_name: match_item.rule.name().to_string(), num_matches: 1, comment: None, visible: match_item.visible, finding_id: match_item.finding_id(), - rule_finding_fingerprint: match_item.rule_finding_fingerprint.to_string(), - rule_text_id: match_item.rule_text_id.to_string(), + rule_finding_fingerprint: match_item.rule.finding_sha1_fingerprint().to_string(), + rule_text_id: match_item.rule.id().to_string(), } }) } @@ -362,7 +363,7 @@ impl FindingsStore { .iter() .filter(|msg| { let (_, _, match_item) = &***msg; - match_item.rule_name == metadata.rule_name + match_item.rule.name() == metadata.rule_name }) .map(|msg| { let (origin, blob_metadata, match_item) = &**msg; @@ -373,7 +374,7 @@ impl FindingsStore { match_id: MatchIdInt::from_str(&match_item.finding_id())?, match_comment: None, visible: match_item.visible, - match_confidence: match_item.rule_confidence, + match_confidence: match_item.rule.confidence(), validation_response_body: match_item.validation_response_body.clone(), validation_response_status: match_item.validation_response_status, validation_success: match_item.validation_success, diff --git a/src/location.rs b/src/location.rs index 4af7021..a50602a 100644 --- a/src/location.rs +++ b/src/location.rs @@ -1,4 +1,5 @@ use core::ops::Range; +use std::cell::RefCell; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -87,25 +88,35 @@ impl std::fmt::Display for SourceSpan { } /// Records newline byte‑offsets to map offsets -- (line, column). -pub struct LocationMapping { - newline_offsets: Vec, +pub struct LocationMapping<'a> { + bytes: &'a [u8], + newline_offsets: RefCell>, } -impl LocationMapping { - /// Scan once for all `\n` positions. 
- pub fn new(input: &[u8]) -> Self { - let newline_offsets = - input.iter().enumerate().filter_map(|(i, &b)| (b == b'\n').then_some(i)).collect(); - LocationMapping { newline_offsets } +impl<'a> LocationMapping<'a> { + /// Create a new mapping without pre-scanning the entire input. + pub fn new(input: &'a [u8]) -> Self { + LocationMapping { bytes: input, newline_offsets: RefCell::new(Vec::new()) } } - /// Map a byte offset to a `SourcePoint`. - pub fn get_source_point(&self, offset: usize) -> SourcePoint { - let line = match self.newline_offsets.binary_search(&offset) { - Ok(idx) => idx + 2, // exact newline -- next line + fn ensure_offsets_up_to(&self, offset: usize) { + let mut offsets = self.newline_offsets.borrow_mut(); + let start = offsets.last().map_or(0, |&last| last + 1); + if offset < start { + return; + } + let end = offset.min(self.bytes.len()); + for nl in memchr::memchr_iter(b'\n', &self.bytes[start..end]) { + offsets.push(start + nl); + } + } + + fn source_point_from_offsets(offsets: &[usize], offset: usize) -> SourcePoint { + let line = match offsets.binary_search(&offset) { + Ok(idx) => idx + 2, Err(idx) => idx + 1, }; - let column = if let Some(&last) = self.newline_offsets.get(line.saturating_sub(2)) { + let column = if let Some(&last) = offsets.get(line.saturating_sub(2)) { offset.saturating_sub(last + 1) } else { offset @@ -113,10 +124,19 @@ impl LocationMapping { SourcePoint { line, column } } + /// Map a byte offset to a `SourcePoint`. + pub fn get_source_point(&self, offset: usize) -> SourcePoint { + self.ensure_offsets_up_to(offset); + let offsets = self.newline_offsets.borrow(); + Self::source_point_from_offsets(&offsets, offset) + } + /// Map an `OffsetSpan` -- `SourceSpan` (closed interval). 
pub fn get_source_span(&self, span: &OffsetSpan) -> SourceSpan { - let start = self.get_source_point(span.start); - let end = self.get_source_point(span.end.saturating_sub(1)); + self.ensure_offsets_up_to(span.end.saturating_sub(1)); + let offsets = self.newline_offsets.borrow(); + let start = Self::source_point_from_offsets(&offsets, span.start); + let end = Self::source_point_from_offsets(&offsets, span.end.saturating_sub(1)); SourceSpan { start, end } } } diff --git a/src/matcher.rs b/src/matcher.rs index 1cbb16f..d30b0d8 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -10,7 +10,6 @@ use anyhow::Result; use base64::{engine::general_purpose, Engine}; use bstr::BString; use http::StatusCode; -use lazy_static::lazy_static; use regex::bytes::Regex; use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; use schemars::{ @@ -27,7 +26,7 @@ use xxhash_rust::xxh3::xxh3_64; use crate::{ blob::{Blob, BlobId, BlobIdMap}, entropy::calculate_shannon_entropy, - location::{Location, LocationMapping, OffsetSpan}, + location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, origin::OriginSet, parser, parser::{Checker, Language}, @@ -40,6 +39,10 @@ use crate::{ util::{intern, redact_value}, }; +const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment +const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches +const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs + // ------------------------------------------------------------------------------------------------- // RawMatch // ------------------------------------------------------------------------------------------------- @@ -255,17 +258,28 @@ impl<'a> Matcher<'a> { self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64)); self.user_data.input_len = input.len() as u64; - // self.vs_scanner.scan(input, |rid, from, to, _flags| { - self.scanner_pool.with(|scanner| { - scanner.scan(input, |rule_id, from, to, _flags| { - 
self.user_data.raw_matches_scratch.push(RawMatch { - rule_id, - start_idx: from, - end_idx: to, - }); - vectorscan_rs::Scan::Continue - }) - })?; + + let mut offset: usize = 0; + while offset < input.len() { + let end = (offset + MAX_CHUNK_SIZE).min(input.len()); + let slice = &input[offset..end]; + let base = offset as u64; + self.scanner_pool.with(|scanner| { + scanner.scan(slice, |rule_id, from, to, _flags| { + self.user_data.raw_matches_scratch.push(RawMatch { + rule_id, + start_idx: from + base, + end_idx: to + base, + }); + vectorscan_rs::Scan::Continue + }) + })?; + + if end == input.len() { + break; + } + offset = end.saturating_sub(CHUNK_OVERLAP); + } Ok(()) } @@ -288,17 +302,6 @@ impl<'a> Matcher<'a> { self.local_stats.blobs_scanned += 1; self.local_stats.bytes_scanned += blob.bytes().len() as u64; - // Check if blob was already seen and respect no_dedup flag - if !no_dedup { - if let Some(had_matches) = self.seen_blobs.get(&blob.id) { - return Ok(if had_matches { - ScanResult::SeenWithMatches - } else { - ScanResult::SeenSansMatches - }); - } - } - // Extract filename from origin let filename = origin .first() @@ -313,24 +316,19 @@ impl<'a> Matcher<'a> { // Opportunistically look for standalone Base64 blobs. If neither // the raw scan nor this check yields anything, we can return early // before doing any heavier work. 
- let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) }; + let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT { + Vec::new() + } else { + get_base64_strings(blob.bytes()) + }; if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() { - // Only record in seen_blobs if deduplication is enabled - if !no_dedup { - return Ok(match self.seen_blobs.insert(blob.id, false) { - None => ScanResult::New(Vec::new()), - Some(true) => ScanResult::SeenWithMatches, - Some(false) => ScanResult::SeenSansMatches, - }); - } else { - return Ok(ScanResult::New(Vec::new())); - } + return Ok(ScanResult::New(Vec::new())); } let rules_db = self.rules_db; let mut seen_matches = FxHashSet::default(); - let mut previous_matches = Vec::new(); + let mut previous_matches: FxHashMap> = FxHashMap::default(); let tree_sitter_result = if self.user_data.raw_matches_scratch.is_empty() { None } else { @@ -363,27 +361,19 @@ impl<'a> Matcher<'a> { }) .collect::>() }); - let mut previous_raw_matches: Vec<(usize, OffsetSpan)> = Vec::new(); + let mut previous_raw_matches: FxHashMap> = FxHashMap::default(); for &RawMatch { rule_id, start_idx, end_idx } in self.user_data.raw_matches_scratch.iter().rev() { let rule_id_usize: usize = rule_id as usize; - // let rule = &rules_db.rules[rule_id_usize]; let rule = Arc::clone(&rules_db.rules[rule_id_usize]); let re = &rules_db.anchored_regexes[rule_id_usize]; let start_idx_usize = start_idx as usize; let end_idx_usize = end_idx as usize; let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize); - // Skip if fully contained in a previous match - if previous_raw_matches.iter().any(|(prev_id, prev_span): &(usize, OffsetSpan)| { - *prev_id == rule_id_usize - && (prev_span.fully_contains(¤t_span) - || current_span.fully_contains(prev_span)) - }) { + if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) { continue; } - let matching_input_offset_span = 
OffsetSpan::from_range(start_idx_usize..end_idx_usize); - previous_raw_matches.push((rule_id_usize, matching_input_offset_span)); filter_match( blob, rule, @@ -472,9 +462,15 @@ impl<'a> Matcher<'a> { } } // Finalize - // Only record in seen_blobs if deduplication is enabled - if !no_dedup { - self.seen_blobs.insert(blob.id, !matches.is_empty()); + if !no_dedup && !matches.is_empty() { + let blob_id = blob.id(); + if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) { + return Ok(if had_matches { + ScanResult::SeenWithMatches + } else { + ScanResult::SeenSansMatches + }); + } } // --- opportunistic capacity cap --------------------------------- @@ -501,6 +497,39 @@ fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) - end.hash(&mut hasher); hasher.finish() } + +#[inline] +fn insert_span(spans: &mut Vec, span: OffsetSpan) -> bool { + let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i); + if idx > 0 { + if spans[idx - 1].fully_contains(&span) { + return false; + } + if span.fully_contains(&spans[idx - 1]) { + spans.remove(idx - 1); + idx -= 1; + } + } + if idx < spans.len() { + if spans[idx].fully_contains(&span) { + return false; + } + if span.fully_contains(&spans[idx]) { + spans.remove(idx); + } + } + spans.insert(idx, span); + true +} + +#[inline] +fn record_match( + map: &mut FxHashMap>, + rule_id: usize, + span: OffsetSpan, +) -> bool { + insert_span(map.entry(rule_id).or_default(), span) +} fn filter_match<'b>( blob: &'b Blob, // rule: &'b Rule, @@ -509,7 +538,7 @@ fn filter_match<'b>( start: usize, end: usize, matches: &mut Vec>, - previous_matches: &mut Vec<(usize, OffsetSpan)>, + previous_matches: &mut FxHashMap>, rule_id: usize, seen_matches: &mut FxHashSet, _origin: &OriginSet, @@ -558,11 +587,7 @@ fn filter_match<'b>( if !seen_matches.insert(match_key) { continue; } - if previous_matches.iter().any(|(prev_rule_id, prev_loc)| { - *prev_rule_id == rule_id - && 
(prev_loc.fully_contains(&matching_input_offset_span) - || matching_input_offset_span.fully_contains(prev_loc)) - }) { + if !record_match(previous_matches, rule_id, matching_input_offset_span) { continue; } let only_matching_input = @@ -571,7 +596,7 @@ fn filter_match<'b>( SerializableCaptures::from_captures(&captures, byte_slice.as_ref(), re, redact); matches.push(BlobMatch { rule: Arc::clone(&rule), - blob_id: &blob.id, + blob_id: blob.id_ref(), matching_input: only_matching_input, matching_input_offset_span, captures: groups, @@ -581,7 +606,6 @@ fn filter_match<'b>( calculated_entropy, is_base64, }); - previous_matches.push((rule_id, matching_input_offset_span)); } if let Some(t) = timer.take() { let new_count = (matches.len() - initial_len) as u64; @@ -682,20 +706,20 @@ impl JsonSchema for Groups { // pub end: usize, // End position of the match // pub value: String, // The actual captured value // } -#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Clone, Serialize, JsonSchema)] pub struct SerializableCapture { pub name: Option, pub match_number: i32, pub start: usize, pub end: usize, - // Instead of storing an owned String, store a borrowed or interned value. - // Here we use Cow to allow either borrowing or owning as needed. - pub value: std::borrow::Cow<'static, str>, + /// Interned value of the capture. 
+ pub value: &'static str, } -#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Clone, Serialize, JsonSchema)] pub struct SerializableCaptures { - pub captures: Vec, // All captures (named and unnamed) + #[schemars(with = "Vec")] + pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed) } impl SerializableCaptures { pub fn from_captures( @@ -704,7 +728,7 @@ impl SerializableCaptures { re: &Regex, redact: bool, ) -> Self { - let mut serialized_captures = Vec::new(); + let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new(); // Process named captures for name in re.capture_names().flatten() { if let Some(capture) = captures.name(name) { @@ -718,7 +742,7 @@ impl SerializableCaptures { match_number: -1, start: capture.start(), end: capture.end(), - value: value.into(), + value: intern(&value), }); } } @@ -735,7 +759,7 @@ impl SerializableCaptures { match_number: i32::try_from(i).unwrap_or(0), start: capture.start(), end: capture.end(), - value: value.into(), + value: intern(&value), }); } } @@ -764,16 +788,9 @@ pub struct Match { pub finding_fingerprint: u64, /// The rule that produced this match - pub rule_finding_fingerprint: &'static str, - - /// The text identifier of the rule that produced this match - pub rule_text_id: &'static str, - - /// The name of the rule that produced this match - pub rule_name: &'static str, - - /// The confidence property of the rule that produced this match - pub rule_confidence: crate::rules::rule::Confidence, + #[serde(skip_serializing)] + #[schemars(skip)] + pub rule: Arc, /// Validation Body pub validation_response_body: String, @@ -794,7 +811,7 @@ pub struct Match { impl Match { #[inline] pub fn convert_owned_blobmatch_to_match<'a>( - loc_mapping: &'a LocationMapping, + loc_mapping: Option<&'a LocationMapping<'a>>, owned_blob_match: &'a OwnedBlobMatch, origin_type: &'a str, ) -> Self { @@ -812,9 +829,11 @@ impl Match { // The fingerprint will be 
based on the content of the secret. let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or(""); - let source_span = loc_mapping.get_source_span(&offset_span); - let rule_finding_fingerprint = owned_blob_match.rule.finding_sha1_fingerprint().to_owned(); - + let source_span = + loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan { + start: SourcePoint { line: 0, column: 0 }, + end: SourcePoint { line: 0, column: 0 }, + }); let offset_start: u64 = owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); @@ -828,10 +847,7 @@ impl Match { // matching_snippet Match { - rule_finding_fingerprint: intern(&rule_finding_fingerprint), - rule_name: intern(owned_blob_match.rule.name()), - rule_confidence: owned_blob_match.rule.confidence(), - rule_text_id: intern(owned_blob_match.rule.id()), + rule: owned_blob_match.rule.clone(), visible: owned_blob_match.rule.visible().to_owned(), location: Location { offset_span, source_span: source_span.clone() }, groups: owned_blob_match.captures.clone(), @@ -852,7 +868,7 @@ impl Match { pub fn finding_id(&self) -> String { let mut h = Sha1::new(); - write!(&mut h, "{}\0", self.rule_finding_fingerprint) + write!(&mut h, "{}\0", self.rule.finding_sha1_fingerprint()) .expect("should be able to write to memory"); serde_json::to_writer(&mut h, &self.groups) .expect("should be able to serialize groups as JSON"); @@ -892,33 +908,48 @@ pub struct DecodedData { pub pos_start: usize, pub pos_end: usize, } +#[inline] +fn is_base64_byte(b: u8) -> bool { + matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/') +} + pub fn get_base64_strings(input: &[u8]) -> Vec { - lazy_static! { - // Require a reasonably long run of valid Base64 characters to reduce - // noise. 32 bytes corresponds to 24 decoded bytes. 
- static ref RE_BASE64: Regex = Regex::new(r"[A-Za-z0-9+/]{32,}={0,2}").unwrap(); - } let mut results = Vec::new(); - for m in RE_BASE64.find_iter(input) { - let base64_string = m.as_bytes(); - // Skip candidates whose length isn't a multiple of four – they cannot - // be valid Base64. - if base64_string.len() % 4 != 0 { - continue; + let mut i = 0; + while i < input.len() { + while i < input.len() && !is_base64_byte(input[i]) { + i += 1; } - if let Ok(decoded) = general_purpose::STANDARD.decode(base64_string) { - if let Ok(decoded_str) = std::str::from_utf8(&decoded) { - if decoded_str.is_ascii() { - results.push(DecodedData { - original: String::from_utf8_lossy(base64_string).into_owned(), - decoded: decoded_str.to_string(), - pos_start: m.start(), - pos_end: m.end(), - }); + let start = i; + while i < input.len() && is_base64_byte(input[i]) { + i += 1; + } + + let mut eq_count = 0; + while i < input.len() && input[i] == b'=' && eq_count < 2 { + i += 1; + eq_count += 1; + } + let end = i; + + let len = end - start; + if len >= 32 && len % 4 == 0 { + let base64_slice = &input[start..end]; + if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) { + if let Ok(decoded_str) = std::str::from_utf8(&decoded) { + if decoded_str.is_ascii() { + results.push(DecodedData { + original: String::from_utf8_lossy(base64_slice).into_owned(), + decoded: decoded_str.to_string(), + pos_start: start, + pos_end: end, + }); + } } } } } + results } diff --git a/src/reporter.rs b/src/reporter.rs index c9c2a62..dacf75b 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -216,7 +216,7 @@ impl DetailsReporter { m: match_item.clone(), comment: None, visible: match_item.visible, - match_confidence: match_item.rule_confidence, + match_confidence: match_item.rule.confidence(), validation_response_body: match_item.validation_response_body.clone(), validation_response_status: match_item.validation_response_status, validation_success: match_item.validation_success, @@ -366,13 +366,13 
@@ impl DetailsReporter { FindingReporterRecord { rule: RuleMetadata { - name: rm.m.rule_name.to_string(), - id: rm.m.rule_text_id.to_string(), + name: rm.m.rule.name().to_string(), + id: rm.m.rule.id().to_string(), }, finding: FindingRecordData { snippet, fingerprint: rm.m.finding_fingerprint.to_string(), - confidence: rm.match_confidence.to_string(), + confidence: rm.m.rule.confidence().to_string(), entropy: format!("{:.2}", rm.m.calculated_entropy), validation: ValidationInfo { status: validation_status, response: response_body }, language: rm diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 0b562d2..1ee7fb7 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -35,6 +35,7 @@ mod tests { use crate::cli::commands::github::GitHistoryMode; use crate::cli::commands::rules::RuleSpecifierArgs; use crate::matcher::{SerializableCapture, SerializableCaptures}; + use crate::rules::rule::{Confidence, Rule, RuleSyntax}; use crate::util::intern; use crate::{ blob::BlobId, @@ -49,6 +50,7 @@ mod tests { origin::Origin, reporter::styles::Styles, }; + use smallvec::smallvec; use std::{ io::Cursor, path::PathBuf, @@ -132,12 +134,21 @@ mod tests { } } - fn create_mock_match( - rule_name: &str, - rule_text_id: &str, - rule_finding_fingerprint: &str, - validation_success: bool, - ) -> Match { + fn create_mock_match(rule_name: &str, rule_text_id: &str, validation_success: bool) -> Match { + let syntax = RuleSyntax { + name: rule_name.to_string(), + id: rule_text_id.to_string(), + pattern: "dummy".to_string(), + min_entropy: 0.0, + confidence: Confidence::Medium, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None, + depends_on_rule: vec![], + }; + let rule = Arc::new(Rule::new(syntax)); Match { location: Location { offset_span: OffsetSpan { start: 10, end: 20 }, @@ -147,20 +158,17 @@ mod tests { }, }, groups: SerializableCaptures { - captures: vec![SerializableCapture { + 
captures: smallvec![SerializableCapture { name: Some("token".to_string()), match_number: 1, start: 10, end: 20, - value: "mock_token".into(), + value: intern("mock_token"), }], }, blob_id: BlobId::new(b"mock_blob"), finding_fingerprint: 0123, - rule_finding_fingerprint: intern(rule_finding_fingerprint), - rule_text_id: intern(rule_text_id), - rule_name: intern(rule_name), - rule_confidence: Confidence::Medium, + rule, validation_response_body: "validation response".to_string(), validation_response_status: 200, validation_success, @@ -177,7 +185,6 @@ mod tests { id: BlobId::new(b"mock_blob"), num_bytes: 1024, mime_essence: Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), language: Some("Rust".to_string()), }; let dedup = true; @@ -204,15 +211,13 @@ mod tests { #[test] fn test_json_format() -> Result<()> { - let mock_match = - create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); + let mock_match = create_mock_match("MockRule", "mock_rule_1", true); let matches = vec![ReportMatch { origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]), blob_metadata: BlobMetadata { id: BlobId::new(b"mock_blob"), num_bytes: 1024, mime_essence: Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), language: Some("Rust".to_string()), }, m: mock_match, @@ -238,12 +243,7 @@ mod tests { fn test_validation_status_in_json() -> Result<()> { let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")]; for (validation_success, expected_status) in test_cases { - let mock_match = create_mock_match( - "MockRule", - "mock_rule_1", - "mock_finding_fingerprint", - validation_success, - ); + let mock_match = create_mock_match("MockRule", "mock_rule_1", validation_success); let matches = vec![ReportMatch { origin: OriginSet::new( Origin::from_file(PathBuf::from("/mock/path/file.rs")), @@ -253,7 +253,6 @@ mod tests { id: BlobId::new(b"mock_blob"), num_bytes: 1024, mime_essence: 
Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), language: Some("Rust".to_string()), }, m: mock_match, diff --git a/src/scanner/docker.rs b/src/scanner/docker.rs index b427a3b..7ebb2d4 100644 --- a/src/scanner/docker.rs +++ b/src/scanner/docker.rs @@ -175,9 +175,11 @@ impl Docker { pb.set_length(layer_paths.len() as u64); for p in layer_paths { - let mut data = Vec::new(); - File::open(&p)?.read_to_end(&mut data)?; - let digest = format!("{:x}", Sha256::digest(&data)); + let mut file = File::open(&p)?; + let mut hasher = Sha256::new(); + std::io::copy(&mut file, &mut hasher)?; + let digest = format!("{:x}", hasher.finalize()); + let new_path = out_dir.join(format!("layer_{digest}.tar")); std::fs::rename(&p, &new_path)?; // extract layer contents so inner filenames appear in scan results diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index 14861ff..970ca5c 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -172,10 +172,15 @@ pub fn enumerate_filesystem_inputs( }; if should_skip { progress.suspend(|| { + let path = origin + .first() + .blob_path() + .map(|p| p.display().to_string()) + .unwrap_or_else(|| blob.temp_id().to_string()); if is_archive { - debug!("Skipping archive: {}", blob.id); + debug!("Skipping archive: {path}"); } else { - debug!("Skipping binary blob: {}", blob.id); + debug!("Skipping binary blob: {path}"); } }); return Ok(()); diff --git a/src/scanner/processing.rs b/src/scanner/processing.rs index 9447dbf..5132209 100644 --- a/src/scanner/processing.rs +++ b/src/scanner/processing.rs @@ -12,6 +12,8 @@ use crate::{ Path, }; +const LOCATION_LIMIT_BYTES: usize = 256 * 1024 * 1024; + /// A matcher along with parameters that remain constant during a single /// `scan` run pub struct BlobProcessor<'a> { @@ -27,8 +29,7 @@ impl<'a> BlobProcessor<'a> { redact: bool, no_base64: bool, ) -> Result> { - let blob_id = blob.id.hex(); - let _span = debug_span!("matcher", blob_id).entered(); + let _span = 
debug_span!("matcher", temp_id = blob.temp_id()).entered(); let t1 = Instant::now(); let res = self.matcher.scan_blob(&blob, &origin, None, redact, no_dedup, no_base64)?; let scan_us = t1.elapsed().as_micros(); @@ -43,10 +44,9 @@ impl<'a> BlobProcessor<'a> { ScanResult::SeenWithMatches => { trace!("({scan_us}us) blob already scanned with matches"); let metadata = BlobMetadata { - id: blob.id, + id: blob.id(), num_bytes: blob.len(), mime_essence: None, - charset: None, language: None, }; Ok(Some((origin, metadata, Vec::new()))) @@ -63,10 +63,9 @@ impl<'a> BlobProcessor<'a> { } let md = MetadataResult::from_blob_and_origin(&blob, &origin); let metadata = BlobMetadata { - id: blob.id, + id: blob.id(), num_bytes: blob.len(), mime_essence: md.mime_essence, - charset: md.charset, language: md.language, }; @@ -84,12 +83,17 @@ impl<'a> BlobProcessor<'a> { } } - let loc_mapping = LocationMapping::new(&blob.bytes()); + let bytes = blob.bytes(); + let loc_mapping = if bytes.len() <= LOCATION_LIMIT_BYTES { + Some(LocationMapping::new(bytes)) + } else { + None + }; let converted_matches: Vec<(Option, Match)> = matches .into_iter() .map(|m| { let converted_match = Match::convert_owned_blobmatch_to_match( - &loc_mapping, + loc_mapping.as_ref(), &OwnedBlobMatch::from_blob_match(m), origin_type, ); @@ -106,7 +110,6 @@ impl<'a> BlobProcessor<'a> { struct MetadataResult { mime_essence: Option, language: Option, - charset: Option, } impl MetadataResult { fn from_blob_and_origin(blob: &Blob, origin: &OriginSet) -> MetadataResult { @@ -115,7 +118,6 @@ impl MetadataResult { let mime_essence = Some(tree_magic_mini::from_u8(bytes).to_string()); let inspector = ContentInspector::default(); let language = blob_path.and_then(|p| inspector.guess_language(p, bytes)); - let charset = inspector.guess_charset(bytes); - MetadataResult { mime_essence, language, charset } + MetadataResult { mime_essence, language } } } diff --git a/src/scanner/validation.rs b/src/scanner/validation.rs index 
05865b2..d69dbeb 100644 --- a/src/scanner/validation.rs +++ b/src/scanner/validation.rs @@ -13,7 +13,7 @@ use futures::{stream, StreamExt}; use indicatif::{ProgressBar, ProgressStyle}; use liquid::Parser; use reqwest::{Client, StatusCode}; -use rustc_hash::{FxHashMap, FxHashSet}; +use rustc_hash::FxHashMap; use tokio::{sync::Notify, time::timeout}; use crate::{ @@ -21,7 +21,6 @@ use crate::{ findings_store::{FindingsStore, FindingsStoreMessage}, location::OffsetSpan, matcher::{Match, OwnedBlobMatch}, - rules::rule, validation::{collect_variables_and_dependencies, validate_single_match, CachedResponse}, }; @@ -40,7 +39,7 @@ pub async fn run_secret_validation( let fail_count = Arc::new(AtomicUsize::new(0)); // ── 2. Fetch rules + matches ──────────────────────────────────────────── - let (all_rules, all_matches_by_blob) = { + let (_all_rules, all_matches_by_blob) = { let ds = datastore.lock().unwrap(); let rules = ds.get_rules()?; let mut map: FxHashMap>> = FxHashMap::default(); @@ -51,16 +50,10 @@ pub async fn run_secret_validation( }; // ── 3. 
Partition blobs ────────────────────────────────────────────────── - let rules_with_deps: FxHashSet<&str> = all_rules - .iter() - .filter(|r| !r.syntax().depends_on_rule.is_empty()) - .map(|r| r.id()) - .collect(); - let mut simple_matches = Vec::new(); let mut dependent_blobs = FxHashMap::default(); // blob_id -- Vec> for (blob_id, matches) in all_matches_by_blob { - if matches.iter().any(|m| rules_with_deps.contains(m.2.rule_text_id)) { + if matches.iter().any(|m| !m.2.rule.syntax().depends_on_rule.is_empty()) { dependent_blobs.insert(blob_id, matches); } else { simple_matches.extend(matches); @@ -80,11 +73,8 @@ pub async fn run_secret_validation( .captures .get(1) .or_else(|| arc_msg.2.groups.captures.get(0)) - .map_or("", |c| c.value.as_ref()); - groups - .entry(format!("{}|{}", arc_msg.2.rule_text_id, secret)) - .or_default() - .push(arc_msg); + .map_or("", |c| c.value); + groups.entry(format!("{}|{}", arc_msg.2.rule.id(), secret)).or_default().push(arc_msg); } let validation_results = DashMap::::new(); @@ -109,7 +99,6 @@ pub async fn run_secret_validation( let client = client.clone(); let cache_glob = cache.clone(); let val_res = &validation_results; - let rules = &all_rules; let success = success_count.clone(); let fail = fail_count.clone(); // *** FIX: Clone the progress bar for each concurrent task *** @@ -122,8 +111,8 @@ pub async fn run_secret_validation( .captures .get(1) .or_else(|| rep_arc.2.groups.captures.get(0)) - .map_or("", |c| c.value.as_ref()); - let key = format!("{}|{}", rep_arc.2.rule_text_id, secret); + .map_or("", |c| c.value); + let key = format!("{}|{}", rep_arc.2.rule.id(), secret); match val_res.entry(key.clone()) { dashmap::mapref::entry::Entry::Occupied(_) => return, @@ -138,8 +127,10 @@ pub async fn run_secret_validation( } } - let rule = find_rule_for_match(rules, rep_arc.2.rule_text_id).unwrap(); - let mut om = OwnedBlobMatch::convert_match_to_owned_blobmatch(&rep_arc.2, rule); + let mut om = 
OwnedBlobMatch::convert_match_to_owned_blobmatch( + &rep_arc.2, + rep_arc.2.rule.clone(), + ); validate_single( &mut om, @@ -211,7 +202,6 @@ pub async fn run_secret_validation( let val_cache = Arc::new(DashMap::::new()); let in_flight = Arc::new(DashMap::::new()); - let rules_ref = Arc::new(all_rules.clone()); for chunk in blob_ids.chunks(chunk_size) { let tasks: Vec<_> = chunk @@ -225,15 +215,15 @@ pub async fn run_secret_validation( let success = success_count.clone(); let fail = fail_count.clone(); let cache_glob = cache.clone(); - let rules = rules_ref.clone(); async move { let owned = matches_for_blob .iter() .map(|arc_msg| { - let rule = find_rule_for_match(&rules, arc_msg.2.rule_text_id) - .expect("rule"); - OwnedBlobMatch::convert_match_to_owned_blobmatch(&arc_msg.2, rule) + OwnedBlobMatch::convert_match_to_owned_blobmatch( + &arc_msg.2, + arc_msg.2.rule.clone(), + ) }) .collect::>(); @@ -338,21 +328,6 @@ pub async fn run_secret_validation( Ok(()) } -/// Returns `Some(Arc)` if a matching rule is found; otherwise returns `None`. -/// Callers can decide how to handle the `None` case (e.g., skip processing). -fn find_rule_for_match( - all_rules: &[Arc], - rule_text_id: &str, -) -> Option> { - match all_rules.iter().find(|r| r.syntax().id == rule_text_id).cloned() { - Some(rule) => Some(rule), - None => { - eprintln!("Warning: no rule found with id '{}'. 
Skipping.", rule_text_id); - None - } - } -} - // --------------------------------------------------- // The core validation logic, used in an async pipeline // --------------------------------------------------- diff --git a/src/update.rs b/src/update.rs index 76629be..3ab5208 100644 --- a/src/update.rs +++ b/src/update.rs @@ -129,16 +129,29 @@ pub fn check_for_update(global_args: &GlobalArgs, base_url: Option<&str>) -> Opt .apply_to(&format!("Updated to version {}", status.version())) ), Err(e) => match e { - UpdError::Io(ref io_err) if io_err.kind() == ErrorKind::PermissionDenied => { - warn!( - "{}", - styles.style_finding_active_heading.apply_to( - "Cannot replace the current binary - permission denied.\n\ - If you installed via a package manager, run its upgrade command.\n\ - Otherwise reinstall to a user-writable directory or re-run with sudo." - ) - ); - } + UpdError::Io(ref io_err) => match io_err.kind() { + ErrorKind::PermissionDenied => { + warn!( + "{}", + styles.style_finding_active_heading.apply_to( + "Cannot replace the current binary - permission denied.\n\ + If you installed via a package manager, run its upgrade command.\n\ + Otherwise reinstall to a user-writable directory or re-run with sudo." + ) + ); + } + ErrorKind::NotFound => { + warn!( + "{}", + styles.style_finding_active_heading.apply_to( + "Cannot replace the current binary - file not found.\n\ + If you installed via a package manager, run its upgrade command.\n\ + Otherwise reinstall to a user-writable directory." 
+ ) + ); + } + _ => error!("Failed to update: {e}"), + }, _ => error!("Failed to update: {e}"), }, } diff --git a/src/validation.rs b/src/validation.rs index 3548d33..a645ce7 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -910,6 +910,7 @@ mod tests { use crossbeam_skiplist::SkipMap; use http::StatusCode; use rustc_hash::FxHashMap; + use smallvec::smallvec; use crate::{ blob::BlobId, @@ -920,6 +921,7 @@ mod tests { rule::{Confidence, Rule}, Rules, }, + util::intern, validation::{validate_single_match, Cache}, }; #[tokio::test] @@ -1016,12 +1018,12 @@ rules: // matching_input: token.as_bytes().to_vec(), matching_input_offset_span: OffsetSpan { start: 0, end: token.len() }, captures: SerializableCaptures { - captures: vec![SerializableCapture { + captures: smallvec![SerializableCapture { name: Some("TOKEN".to_string()), match_number: -1, start: 0, end: token.len(), - value: token.into(), + value: intern(token), }], }, validation_response_body: String::new(), diff --git a/src/validation.rs.orig b/src/validation.rs.orig deleted file mode 100644 index ca76943..0000000 --- a/src/validation.rs.orig +++ /dev/null @@ -1,1052 +0,0 @@ -use std::{ - collections::BTreeMap, - fs, - hash::{Hash, Hasher}, - sync::Arc, - time::{Duration, Instant}, -}; - -use anyhow::Result; -use crossbeam_skiplist::SkipMap; -use dashmap::DashMap; -use http::StatusCode; -use liquid::Object; -use liquid_core::{Value, ValueView}; -use once_cell::sync::OnceCell; -use reqwest::{header, header::HeaderValue, multipart, Client, Url}; -use rustc_hash::FxHashMap; -use tokio::{sync::Notify, time}; -use tracing::debug; - -use crate::{ - location::OffsetSpan, - matcher::{OwnedBlobMatch, SerializableCaptures}, - rules::rule::Validation, -}; - -mod aws; -mod azure; -mod coinbase; -mod gcp; -mod httpvalidation; -mod jwt; -mod mongodb; -mod postgres; -mod utils; - -const VALIDATION_CACHE_SECONDS: u64 = 1200; // 20 minutes -const MAX_VALIDATION_BODY_LEN: usize = 2048; - -// Use SkipMap-based cache instead 
of a mutex-wrapped FxHashMap. -type Cache = Arc>; - -/// Returns an opaque 64-bit fingerprint for β€œsame secret under the same rule”. -fn secret_fingerprint(m: &OwnedBlobMatch) -> u64 { - let mut hasher = xxhash_rust::xxh3::Xxh3::new(); - m.rule.syntax().id.hash(&mut hasher); - - // first capture = the secret text itself - if let Some(c0) = m.captures.captures.get(0) { - c0.value.hash(&mut hasher); - } - hasher.finish() -} - -static VALIDATION_CACHE: OnceCell> = OnceCell::new(); -static IN_FLIGHT: OnceCell>> = OnceCell::new(); - -/// Call this once near program start (e.g. in `main()`) -pub fn init_validation_caches() { - VALIDATION_CACHE.set(DashMap::new()).ok(); - IN_FLIGHT.set(DashMap::new()).ok(); -} - -#[derive(Clone)] -pub struct CachedResponse { - pub body: String, - pub status: StatusCode, - pub is_valid: bool, - pub timestamp: Instant, -} - -impl CachedResponse { - pub fn new(body: String, status: StatusCode, is_valid: bool) -> Self { - Self { body, status, is_valid, timestamp: Instant::now() } - } - - pub fn is_still_valid(&self, cache_duration: Duration) -> bool { - self.timestamp.elapsed() < cache_duration - } -} - -/// Collect dependent variables and missing dependencies from the provided matches. -pub fn collect_variables_and_dependencies( - matches: &[OwnedBlobMatch], -) -> (FxHashMap>, FxHashMap>) { - let mut variable_map: FxHashMap> = FxHashMap::default(); - let mut missing_deps: FxHashMap> = FxHashMap::default(); - - for m in matches { - let rule_id = m.rule.syntax().id.clone(); - for dependency in m.rule.syntax().depends_on_rule.iter().flatten() { - let dependency_rule_id = &dependency.rule_id; - // Use iterator adapter to get all matching dependencies. 
- let matching_dependencies: Vec<_> = - matches.iter().filter(|x| x.rule.syntax().id == *dependency_rule_id).collect(); - - if !matching_dependencies.is_empty() { - for other_match in matching_dependencies { - let matching_input = other_match - .captures - .captures - .get(1) - .or_else(|| other_match.captures.captures.get(0)) - .expect("Expected at least one capture"); - variable_map - .entry(dependency.variable.to_uppercase()) - .or_insert_with(Vec::new) - .push(( - matching_input.value.to_string(), - other_match.matching_input_offset_span, - )); - } - } else { - missing_deps.entry(rule_id.clone()).or_default().push(dependency.rule_id.clone()); - } - } - } - (variable_map, missing_deps) -} - -/// Render a template and parse the resulting string as a URL. -async fn render_and_parse_url( - parser: &liquid::Parser, - globals: &liquid::Object, - rule_name: &str, - template_url: &str, -) -> Result { - let rendered_url_str = - render_template(parser, globals, rule_name, template_url).await.map_err(|e| { - let error_msg = format!("Error rendering URL template: <{}> {}", rule_name, e); - debug!("{}", error_msg); - error_msg - })?; - - let url = Url::parse(&rendered_url_str).map_err(|e| { - let error_msg = format!("Error parsing rendered URL: {}", e); - debug!("{}", error_msg); - error_msg - })?; - - // Check if the URL is resolvable. - utils::check_url_resolvable(&url).await.map_err(|e| { - let error_msg = format!("URL resolution failed: {}", e); - error_msg - })?; - - Ok(url) -} - -/// Render a template string using Liquid. 
-async fn render_template( - parser: &liquid::Parser, - globals: &liquid::Object, - rule_name: &str, - template_str: &str, -) -> Result { - parser - .parse(template_str) - .map_err(|e| { - let msg = format!("Error parsing template for rule <{}>: {}", rule_name, e); - debug!("{}", msg); - msg - }) - .and_then(|template| { - template.render(globals).map_err(|e| { - let msg = format!("Error rendering template for rule <{}>: {}", rule_name, e); - debug!("{}", msg); - msg - }) - }) -} - -/// Validate a single match with a timeout of 60 seconds. -pub async fn validate_single_match( - m: &mut OwnedBlobMatch, - parser: &liquid::Parser, - client: &Client, - dependent_variables: &FxHashMap>, - missing_dependencies: &FxHashMap>, - cache: &Cache, -) { - let timeout_result = time::timeout(Duration::from_secs(60), async { - timed_validate_single_match( - m, - parser, - client, - dependent_variables, - missing_dependencies, - cache, - ) - .await - }) - .await; - - if timeout_result.is_err() { - m.validation_success = false; - m.validation_response_body = "Validation timed out after 60 seconds".to_string(); - m.validation_response_status = StatusCode::REQUEST_TIMEOUT; - } -} - -/// Perform the actual validation of a match. -/// Guarantees that each | is validated only once per process, -/// even when `--no-dedup` is used. -async fn timed_validate_single_match<'a>( - m: &mut OwnedBlobMatch, - parser: &liquid::Parser, - client: &Client, - dependent_variables: &FxHashMap>, - missing_dependencies: &FxHashMap>, - cache: &Cache, -) { - // ────────────────────────────────────────────────────────── - // 1. 
process-wide fingerprint de-dup - // ────────────────────────────────────────────────────────── - let fp = secret_fingerprint(m); - - if let Some(entry) = VALIDATION_CACHE.get_or_init(DashMap::new).get(&fp) { - if entry.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) { - m.validation_success = entry.is_valid; - m.validation_response_body = entry.body.clone(); - m.validation_response_status = entry.status; - return; - } - } - if let Some(wait) = IN_FLIGHT.get_or_init(DashMap::new).get(&fp) { - wait.notified().await; - if let Some(entry) = VALIDATION_CACHE.get().unwrap().get(&fp) { - m.validation_success = entry.is_valid; - m.validation_response_body = entry.body.clone(); - m.validation_response_status = entry.status; - } - return; - } - let notify = Arc::new(Notify::new()); - IN_FLIGHT.get().unwrap().insert(fp, notify.clone()); - - // helper to persist result + notify waiters - let commit_and_return = |m: &OwnedBlobMatch| { - VALIDATION_CACHE.get().unwrap().insert( - fp, - CachedResponse { - body: m.validation_response_body.clone(), - status: m.validation_response_status, - is_valid: m.validation_success, - timestamp: Instant::now(), - }, - ); - IN_FLIGHT.get().unwrap().remove(&fp); - notify.notify_waiters(); - }; - // ────────────────────────────────────────────────────────── - - // 2. dependency check - if let Some(missing) = missing_dependencies.get(&m.rule.syntax().id) { - if !missing.is_empty() { - m.validation_success = false; - m.validation_response_body = - format!("Validation skipped - missing dependent rules: {}", missing.join(", ")); - m.validation_response_status = StatusCode::PRECONDITION_REQUIRED; - commit_and_return(m); - return; - } - } - - // 3. 
capture processing - let match_re_result = m.rule.syntax().as_anchored_regex(); - let mut captured_values: Vec<(String, String, usize, usize)> = match match_re_result { - Ok(_) => utils::process_captures(&m.captures), - Err(e) => { - m.validation_success = false; - m.validation_response_body = format!("Regex error: {}", e); - m.validation_response_status = StatusCode::INTERNAL_SERVER_ERROR; - commit_and_return(m); - return; - } - }; - - for dep in m.rule.syntax().depends_on_rule.iter().flatten() { - if let Some(vals) = dependent_variables.get(&dep.variable.to_uppercase()) { - for (val, span) in vals { - captured_values.push(( - dep.variable.to_uppercase(), - val.clone(), - span.start, - span.end, - )); - } - } - } - - let mut globals = Object::new(); - for (k, v, ..) in &captured_values { - globals.insert(k.to_uppercase().into(), Value::scalar(v.clone())); - } - - let rule_syntax = m.rule.syntax(); - - // ────────────────────────────────────────────────────────── - // 4. validator switch - // ────────────────────────────────────────────────────────── - match &rule_syntax.validation { - // ---------------------------------------------------- HTTP validator - Some(Validation::Http(http_validation)) => { - // render URL - let url = match render_and_parse_url( - parser, - &globals, - &rule_syntax.name, - &http_validation.request.url, - ) - .await - { - Ok(u) => u, - Err(e) => { - m.validation_success = false; - m.validation_response_body = e; - m.validation_response_status = StatusCode::BAD_REQUEST; - commit_and_return(m); - return; - } - }; - - // build request builder - let request_builder = match httpvalidation::build_request_builder( - client, - &http_validation.request.method, - &url, - &http_validation.request.headers, - &http_validation.request.body, - parser, - &globals, - ) { - Ok(rb) => rb, - Err(e) => { - m.validation_success = false; - m.validation_response_body = e; - m.validation_response_status = StatusCode::BAD_REQUEST; - commit_and_return(m); - return; 
- } - }; - - let is_multipart = http_validation.request.multipart.is_some(); - let mut cache_key = String::new(); - - // old per-request cache (optional) - if !is_multipart { - let rendered_headers = httpvalidation::process_headers( - &http_validation.request.headers, - parser, - &globals, - &url, - ) - .unwrap_or_default(); - - let mut header_map = BTreeMap::new(); - for (name, value) in rendered_headers.iter() { - if let Ok(v) = value.to_str() { - header_map.insert(name.as_str().to_string(), v.to_string()); - } - } - cache_key = httpvalidation::generate_http_cache_key_parts( - http_validation.request.method.as_str(), - &url, - &header_map, - ); - if let Some(cached) = cache.get(&cache_key) { - let c = cached.value(); - if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) { - m.validation_success = c.is_valid; - m.validation_response_body = c.body.clone(); - m.validation_response_status = c.status; - commit_and_return(m); - return; - } - } - } - - // helper to execute single non-multipart request with retry - let exec_single = |builder: reqwest::RequestBuilder| async { - httpvalidation::retry_request( - builder, - 1, - Duration::from_millis(500), - Duration::from_secs(2), - ) - .await - }; - - // run request (multipart vs non-multipart) - let resp_res = if is_multipart { - // build multipart request each retry - let build_request = || async { - let method = httpvalidation::parse_http_method(&http_validation.request.method) - .unwrap_or(reqwest::Method::GET); - - let mut fresh_builder = - client.request(method, url.clone()).timeout(Duration::from_secs(5)); - - if let Ok(mut headers) = httpvalidation::process_headers( - &http_validation.request.headers, - parser, - &globals, - &url, - ) { - // add realistic UA & accept headers - let ua = format!( - "{} {}/{}", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) \ - AppleWebKit/537.36 (KHTML, like Gecko) \ - Chrome/132.0.0.0 Safari/537.36", - env!("CARGO_PKG_NAME"), - env!("CARGO_PKG_VERSION") - ); - let 
std_headers = [ - (header::USER_AGENT, ua.as_str()), - (header::ACCEPT , "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"), - (header::ACCEPT_LANGUAGE, "en-US,en;q=0.5"), - (header::ACCEPT_ENCODING, "gzip, deflate, br"), - (header::CONNECTION, "keep-alive"), - ]; - for (hn, hv) in &std_headers { - if let Ok(v) = HeaderValue::from_str(hv) { - headers.insert(hn.clone(), v); - } - } - fresh_builder = fresh_builder.headers(headers); - } - - // build multipart form - let mut form = multipart::Form::new(); - for part in http_validation.request.multipart.as_ref().unwrap().parts.iter() { - match part.part_type.as_str() { - "file" => { - let path = render_template( - parser, - &globals, - &rule_syntax.name, - &part.content, - ) - .await - .unwrap_or_default(); - let bytes = fs::read(path).unwrap_or_default(); - let p = multipart::Part::bytes(bytes) - .mime_str( - part.content_type - .as_deref() - .unwrap_or("application/octet-stream"), - ) - .unwrap_or_else(|_| multipart::Part::text("invalid")); - form = form.part(part.name.clone(), p); - } - "text" => { - let txt = render_template( - parser, - &globals, - &rule_syntax.name, - &part.content, - ) - .await - .unwrap_or_default(); - let p = multipart::Part::text(txt) - .mime_str(part.content_type.as_deref().unwrap_or("text/plain")) - .unwrap_or_else(|_| multipart::Part::text("invalid")); - form = form.part(part.name.clone(), p); - } - _ => { /* ignore */ } - } - } - fresh_builder.multipart(form) - }; - - httpvalidation::retry_multipart_request( - build_request, - 1, - Duration::from_millis(500), - Duration::from_secs(2), - ) - .await - } else { - exec_single(request_builder).await - }; - - // handle result - match resp_res { - Ok(resp) => { - let status = resp.status(); - let headers = resp.headers().clone(); - let mut body = match resp.text().await { - Ok(b) => b, - Err(e) => { - m.validation_success = false; - m.validation_response_body = format!("Error reading response: {}", e); - 
m.validation_response_status = StatusCode::BAD_GATEWAY; - commit_and_return(m); - return; - } - }; - if body.len() > MAX_VALIDATION_BODY_LEN { - body.truncate(MAX_VALIDATION_BODY_LEN); - } - - m.validation_response_status = status; - m.validation_response_body = body.clone(); - let matchers = http_validation - .request - .response_matcher - .as_ref() - .expect("missing response_matcher"); - - m.validation_success = httpvalidation::validate_response( - matchers, - &body, - &status, - &headers, - http_validation.request.response_is_html, - ); - - if !is_multipart && !cache_key.is_empty() { - cache.insert( - cache_key, - CachedResponse { - body, - status, - is_valid: m.validation_success, - timestamp: Instant::now(), - }, - ); - } - } - Err(e) => { - m.validation_success = false; - m.validation_response_body = format!("HTTP error: {:?}", e); - m.validation_response_status = StatusCode::BAD_GATEWAY; - } - } - } - - // ---------------------------------------------------- MongoDB validator - Some(Validation::MongoDB) => { - let uri = globals - .get("TOKEN") - .and_then(|v| v.as_scalar()) - .map(|s| s.into_owned().to_kstr().to_string()) - .unwrap_or_default(); - - if uri.is_empty() { - m.validation_success = false; - m.validation_response_body = "MongoDB URI not found.".to_string(); - m.validation_response_status = StatusCode::BAD_REQUEST; - commit_and_return(m); - return; - } - - let cache_key = mongodb::generate_mongodb_cache_key(&uri); - if let Some(cached) = cache.get(&cache_key) { - let c = cached.value(); - if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) { - m.validation_success = c.is_valid; - m.validation_response_body = c.body.clone(); - m.validation_response_status = c.status; - commit_and_return(m); - return; - } - } - - match mongodb::validate_mongodb(&uri).await { - Ok((ok, msg)) => { - m.validation_success = ok; - m.validation_response_body = msg; - m.validation_response_status = - if ok { StatusCode::OK } else { 
StatusCode::UNAUTHORIZED }; - } - Err(e) => { - m.validation_success = false; - m.validation_response_body = format!("MongoDB validation error: {}", e); - m.validation_response_status = StatusCode::BAD_GATEWAY; - } - } - } - - // ------------------------------------------------ Azure Storage validator - Some(Validation::AzureStorage) => { - let storage_key = captured_values - .iter() - .find(|(n, ..)| n == "TOKEN") - .map(|(_, v, ..)| v.clone()) - .unwrap_or_default(); - let storage_account = - utils::find_closest_variable(&captured_values, &storage_key, "TOKEN", "AZURENAME") - .unwrap_or_default(); - - if storage_account.is_empty() || storage_key.is_empty() { - m.validation_success = false; - m.validation_response_body = "Missing Azure Storage account or key.".to_string(); - m.validation_response_status = StatusCode::BAD_REQUEST; - commit_and_return(m); - return; - } - - let creds_json = format!( - r#"{{"storage_account":"{}","storage_key":"{}"}}"#, - storage_account, storage_key - ); - let cache_key = azure::generate_azure_cache_key(&creds_json); - - if let Some(cached) = cache.get(&cache_key) { - let c = cached.value(); - if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) { - m.validation_success = c.is_valid; - m.validation_response_body = c.body.clone(); - m.validation_response_status = c.status; - commit_and_return(m); - return; - } - } - - match azure::validate_azure_storage_credentials(&creds_json, cache).await { - Ok((ok, msg)) => { - m.validation_success = ok; - m.validation_response_body = msg; - m.validation_response_status = - if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED }; - } - Err(e) => { - m.validation_success = false; - m.validation_response_body = format!("Azure Storage error: {}", e); - m.validation_response_status = StatusCode::BAD_GATEWAY; - } - } - cache.insert( - cache_key, - CachedResponse { - body: m.validation_response_body.clone(), - status: m.validation_response_status, - is_valid: m.validation_success, 
- timestamp: Instant::now(), - }, - ); - } - - // ------------------------------------------------ Postgres validator - Some(Validation::Postgres) => { - let pg_url = globals - .get("TOKEN") - .and_then(|v| v.as_scalar()) - .map(|s| s.into_owned().to_kstr().to_string()) - .unwrap_or_default(); - - if pg_url.is_empty() { - m.validation_success = false; - m.validation_response_body = "Postgres URL not found.".to_string(); - m.validation_response_status = StatusCode::BAD_REQUEST; - commit_and_return(m); - return; - } - - let cache_key = postgres::generate_postgres_cache_key(&pg_url); - if let Some(cached) = cache.get(&cache_key) { - let c = cached.value(); - if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) { - m.validation_success = c.is_valid; - m.validation_response_body = c.body.clone(); - m.validation_response_status = c.status; - commit_and_return(m); - return; - } - } - - match postgres::validate_postgres(&pg_url).await { - Ok((ok, meta)) => { - m.validation_success = ok; - m.validation_response_body = if ok { - format!("Postgres connection is valid. 
Metadata: {:?}", meta) - } else { - "Postgres connection failed.".to_string() - }; - m.validation_response_status = - if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED }; - } - Err(e) => { - m.validation_success = false; - m.validation_response_body = format!("Postgres error: {}", e); - m.validation_response_status = StatusCode::BAD_GATEWAY; - } - } - cache.insert( - cache_key, - CachedResponse { - body: m.validation_response_body.clone(), - status: m.validation_response_status, - is_valid: m.validation_success, - timestamp: Instant::now(), - }, - ); - } - // ---------------------------------------------------- JWT validator - Some(Validation::JWT) => { - let token = captured_values - .iter() - .find(|(n, ..)| n == "TOKEN") - .map(|(_, v, ..)| v.clone()) - .unwrap_or_default(); - - if token.is_empty() { - m.validation_success = false; - m.validation_response_body = "JWT token not found.".to_string(); - m.validation_response_status = StatusCode::BAD_REQUEST; - commit_and_return(m); - return; - } - - match jwt::validate_jwt(&token).await { - Ok((ok, msg)) => { - m.validation_success = ok; - m.validation_response_body = msg; - m.validation_response_status = - if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED }; - } - Err(e) => { - m.validation_success = false; - m.validation_response_body = format!("JWT validation error: {}", e); - m.validation_response_status = StatusCode::BAD_REQUEST; - } - } - } - // ---------------------------------------------------- AWS validator - Some(Validation::AWS) => { - let secret = captured_values - .iter() - .find(|(n, ..)| n == "TOKEN") - .map(|(_, v, ..)| v.clone()) - .unwrap_or_default(); - let akid = utils::find_closest_variable(&captured_values, &secret, "TOKEN", "AKID") - .unwrap_or_default(); - - if akid.is_empty() || secret.is_empty() { - m.validation_success = false; - m.validation_response_body = "Missing AWS access-key ID or secret.".to_string(); - m.validation_response_status = StatusCode::BAD_REQUEST; - 
commit_and_return(m); - return; - } - - let cache_key = aws::generate_aws_cache_key(&akid, &secret); - if let Some(cached) = cache.get(&cache_key) { - let c = cached.value(); - if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) { - m.validation_success = c.is_valid; - m.validation_response_body = c.body.clone(); - m.validation_response_status = c.status; - commit_and_return(m); - return; - } - } - - if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) { - m.validation_success = false; - m.validation_response_body = format!("Invalid AWS credentials ({}): {}", akid, e); - m.validation_response_status = StatusCode::BAD_REQUEST; - commit_and_return(m); - return; - } - - match aws::validate_aws_credentials(&akid, &secret, cache).await { - Ok((ok, arn)) => { - m.validation_success = ok; - m.validation_response_body = format!("{} --- ARN: {}", akid, arn); - m.validation_response_status = - if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED }; - if let Ok(acct) = aws::aws_key_to_account_number(&akid) { - m.validation_response_body - .push_str(&format!(" --- AWS Account Number: {:012}", acct)); - } - } - Err(e) => { - m.validation_success = false; - m.validation_response_body = format!("AWS validation error ({}): {}", akid, e); - m.validation_response_status = StatusCode::BAD_GATEWAY; - } - } - cache.insert( - cache_key, - CachedResponse { - body: m.validation_response_body.clone(), - status: m.validation_response_status, - is_valid: m.validation_success, - timestamp: Instant::now(), - }, - ); - } - - // ----------------------------------------------------- GCP validator - Some(Validation::GCP) => { - let gcp_json = globals - .get("TOKEN") - .and_then(|v| v.as_scalar()) - .map(|s| s.into_owned().to_kstr().to_string()) - .unwrap_or_default(); - - if gcp_json.is_empty() { - m.validation_success = false; - m.validation_response_body = "GCP JSON not found.".to_string(); - m.validation_response_status = StatusCode::BAD_REQUEST; - 
commit_and_return(m); - return; - } - - let cache_key = gcp::generate_gcp_cache_key(&gcp_json); - if let Some(cached) = cache.get(&cache_key) { - let c = cached.value(); - if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) { - m.validation_success = c.is_valid; - m.validation_response_body = c.body.clone(); - m.validation_response_status = c.status; - commit_and_return(m); - return; - } - } - - match gcp::GcpValidator::global() { - Ok(validator) => { - match validator.validate_gcp_credentials(&gcp_json.as_bytes()).await { - Ok((ok, meta)) => { - m.validation_success = ok; - m.validation_response_body = meta.join("\n"); - m.validation_response_status = - if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED }; - } - Err(e) => { - m.validation_success = false; - m.validation_response_body = format!("GCP validation error: {}", e); - m.validation_response_status = StatusCode::BAD_GATEWAY; - } - } - } - Err(e) => { - m.validation_success = false; - m.validation_response_body = format!("Failed to create GCP validator: {}", e); - m.validation_response_status = StatusCode::INTERNAL_SERVER_ERROR; - } - } - cache.insert( - cache_key, - CachedResponse { - body: m.validation_response_body.clone(), - status: m.validation_response_status, - is_valid: m.validation_success, - timestamp: Instant::now(), - }, - ); - } - // ----------------------------------------------------- Coinbase validator - Some(Validation::Coinbase) => { - let cred_name = globals - .get("CRED_NAME") - .and_then(|v| v.as_scalar()) - .map(|s| s.into_owned().to_kstr().to_string()) - .unwrap_or_default(); - let private_key = globals - .get("PRIVATE_KEY") - .and_then(|v| v.as_scalar()) - .map(|s| s.into_owned().to_kstr().to_string()) - .unwrap_or_default(); - - if cred_name.is_empty() || private_key.is_empty() { - m.validation_success = false; - m.validation_response_body = "Missing key name or private key.".to_string(); - m.validation_response_status = StatusCode::BAD_REQUEST; - 
commit_and_return(m); - return; - } - - match coinbase::validate_cdp_api_key(&cred_name, &private_key, client, parser, cache) - .await - { - Ok((ok, msg)) => { - m.validation_success = ok; - m.validation_response_body = msg; - m.validation_response_status = - if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED }; - } - Err(e) => { - m.validation_success = false; - m.validation_response_body = format!("Coinbase validation error: {}", e); - m.validation_response_status = StatusCode::BAD_GATEWAY; - } - } - } - // --------------------------------------------------------- Raw / none - Some(Validation::Raw(raw)) => { - debug!("Raw validation not implemented: {}", raw); - m.validation_success = false; - m.validation_response_body = "Validator not implemented".to_string(); - m.validation_response_status = StatusCode::NOT_IMPLEMENTED; - } - None => { /* no validation specified */ } - } - - // 5. persist result for success path - commit_and_return(m); -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use anyhow::Result; - use crossbeam_skiplist::SkipMap; - use http::StatusCode; - use rustc_hash::FxHashMap; - - use crate::{ - blob::BlobId, - liquid_filters::register_all, - location::OffsetSpan, - matcher::{OwnedBlobMatch, SerializableCapture, SerializableCaptures}, - rules::{ - rule::{Confidence, Rule}, - Rules, - }, - validation::{validate_single_match, Cache}, - }; - #[tokio::test] - async fn test_actual_pypi_token_validation() -> Result<()> { - // Minimal PyPI YAML snippet for testing - let pypi_yaml = r#" -rules: - - name: PyPI Upload Token - id: kingfisher.pypi.1 - pattern: | - (?x) - \b - ( - pypi-AgEIcHlwaS5vcmc[a-zA-Z0-9_-]{50,} - ) - (?:[^a-zA-Z0-9_-]|$) - min_entropy: 4.0 - confidence: medium - examples: - - '# password = pypi-AgEIcHlwaS5vcmcCJDkwNzYwNzU1LWMwOTUtNGNkOC1iYjQzLTU3OWNhZjI1NDQ1MwACJXsicGVybWCf99lvbnMiOiAidXNlciIsICJ2ZXJzaW9uIjogMX0AAAYgSpW5PAywXvchMUQnkF5H6-SolJysfUvIWopMsxE4hCM' - - 'password: 
pypi-AgEIcHlwaS5vcmcCJGExMDIxZjRhLTFhZDMtNDc4YS1iOWNmLWQwCf99OTIwZjFjNwACSHsicGVybWlzc2lvbnMiOiB7InByb2plY3RzIjogWyJkamFuZ28tY2hhbm5lbHMtanNvbnJwYyJdfSwgInZlcnNpb24iOiAxfQAABiBZg48cIBQt7HckwM4G3q-462xphsLbm7IZvjqMS4jvQw' - validation: - type: Http - content: - request: - method: POST - url: https://upload.pypi.org/legacy/ - response_is_html: true - response_matcher: - - report_response: true - - type: WordMatch - words: - - "isn't allowed to upload to project" - headers: - Authorization: 'Basic {{ "__token__:" | append: TOKEN | b64enc }}' - multipart: - parts: - - name: name - type: text - content: "my-package" - - name: version - type: text - content: "0.0.1" - - name: filetype - type: text - content: "sdist" - - name: metadata_version - type: text - content: "2.1" - - name: summary - type: text - content: "A simple example package" - - name: home_page - type: text - content: "https://github.com/yourusername/my_package" - - name: sha256_digest - type: text - content: "0447379dd46c4ca8b8992bda56d07b358d015efb9300e6e16f224f4536e71d64" - - name: md5_digest - type: text - content: "9b4036ab91a71124ab9f1d32a518e2bb" - - name: :action - type: text - content: "file_upload" - - name: protocol_version - type: text - content: "1" - - name: content - type: file - content: "path/to/my_package-0.0.1.tar.gz" - content_type: "application/octet-stream" - "#; - // Use from_paths_and_contents to parse the YAML snippet into a Rules object - let data = vec![(std::path::Path::new("pypi_test.yaml"), pypi_yaml.as_bytes())]; - let rules = Rules::from_paths_and_contents(data, Confidence::Low)?; - // Find the PyPI rule we just loaded - let pypi_rule_syntax = rules - .iter_rules() - .find(|r| r.id == "kingfisher.pypi.1") - .expect("Failed to find PyPI rule in test YAML") - .clone(); // Clone so we can create a `Rule` from it - // Wrap that into a `Rule` object - let pypi_rule = Rule::new(pypi_rule_syntax); - ////////////////////////////////////////// - // - // Your actual PyPI token to test 
- let token = ""; - let id = BlobId::new(&pypi_yaml.as_bytes()); - // Construct an `OwnedBlobMatch` (all fields needed): - let mut owned_blob_match = OwnedBlobMatch { - rule: pypi_rule.into(), - blob_id: id, - finding_fingerprint: 0, // dummy value - // matching_input: token.as_bytes().to_vec(), - matching_input_offset_span: OffsetSpan { start: 0, end: token.len() }, - captures: SerializableCaptures { - captures: vec![SerializableCapture { - name: Some("TOKEN".to_string()), - match_number: -1, - start: 0, - end: token.len(), - value: token.into(), - }], - }, - validation_response_body: String::new(), - validation_response_status: StatusCode::OK, - validation_success: false, - calculated_entropy: 0.0, // or compute your own - }; - let parser = register_all(liquid::ParserBuilder::with_stdlib()).build()?; - let client = reqwest::Client::new(); - let cache: Cache = Arc::new(SkipMap::new()); - let dependent_vars = FxHashMap::default(); - let missing_deps = FxHashMap::default(); - // Run the validation - validate_single_match( - &mut owned_blob_match, - &parser, - &client, - &dependent_vars, - &missing_deps, - &cache, - ) - .await; - println!("Success? 
{:?}", owned_blob_match.validation_success); - println!("Status: {:?}", owned_blob_match.validation_response_status); - println!("Body: {:?}", owned_blob_match.validation_response_body); - Ok(()) - } -} diff --git a/src/validation/utils.rs b/src/validation/utils.rs index ee118c8..ea0b020 100644 --- a/src/validation/utils.rs +++ b/src/validation/utils.rs @@ -22,7 +22,7 @@ pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String, .map(|cap| { let name = cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string()); - (name, cap.value.clone().into_owned(), cap.start, cap.end) + (name, cap.value.to_string(), cap.start, cap.end) }) .collect() } @@ -76,16 +76,17 @@ mod tests { use super::*; use crate::matcher::{SerializableCapture, SerializableCaptures}; use pretty_assertions::assert_eq; + use smallvec::smallvec; #[test] fn single_unnamed_capture_is_returned() { let captures = SerializableCaptures { - captures: vec![SerializableCapture { + captures: smallvec![SerializableCapture { name: None, match_number: 0, start: 1, end: 4, - value: "abc".into(), + value: "abc", }], }; let result = process_captures(&captures); @@ -95,20 +96,20 @@ mod tests { #[test] fn skips_whole_match_when_multiple() { let captures = SerializableCaptures { - captures: vec![ + captures: smallvec![ SerializableCapture { name: None, match_number: 0, start: 0, end: 5, - value: "abcde".into(), + value: "abcde", }, SerializableCapture { name: Some("foo".to_string()), match_number: -1, start: 1, end: 4, - value: "bcd".into(), + value: "bcd", }, ], }; @@ -119,28 +120,22 @@ mod tests { #[test] fn includes_unnamed_groups_but_skips_whole_match() { let captures = SerializableCaptures { - captures: vec![ + captures: smallvec![ SerializableCapture { name: None, match_number: 0, start: 0, end: 6, - value: "aabbcc".into(), + value: "aabbcc", }, SerializableCapture { name: Some("foo".to_string()), match_number: -1, start: 0, end: 2, - value: "aa".into(), - }, - 
SerializableCapture { - name: None, - match_number: 1, - start: 4, - end: 6, - value: "cc".into(), + value: "aa", }, + SerializableCapture { name: None, match_number: 1, start: 4, end: 6, value: "cc" }, ], }; let result = process_captures(&captures); diff --git a/tests/fingerprint_dedup.rs b/tests/fingerprint_dedup.rs index e4380ac..f458e7d 100644 --- a/tests/fingerprint_dedup.rs +++ b/tests/fingerprint_dedup.rs @@ -14,11 +14,27 @@ use kingfisher::{ matcher::{Match, SerializableCapture, SerializableCaptures}, origin::{Origin, OriginSet}, reporter::{styles::Styles, DetailsReporter, ReportMatch}, - rules::rule::Confidence, + rules::rule::{Confidence, Rule, RuleSyntax}, + util::intern, }; +use smallvec::smallvec; // ---- helpers ------------------------------------------------------------------------------- fn make_match(fp: u64) -> Match { + let syntax = RuleSyntax { + name: "Example Rule".to_string(), + id: "RULE.1".to_string(), + pattern: "dummy".to_string(), + min_entropy: 0.0, + confidence: Confidence::Medium, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None, + depends_on_rule: vec![], + }; + let rule = Arc::new(Rule::new(syntax)); Match { location: Location { offset_span: OffsetSpan { start: 0, end: 10 }, @@ -28,20 +44,17 @@ fn make_match(fp: u64) -> Match { }, }, groups: SerializableCaptures { - captures: vec![SerializableCapture { + captures: smallvec![SerializableCapture { name: None, match_number: 0, start: 0, end: 10, - value: "dummy".into(), + value: intern("dummy"), }], }, blob_id: BlobId::new(b"dummy"), finding_fingerprint: fp, - rule_finding_fingerprint: "structural.1".into(), - rule_text_id: "RULE.1".into(), - rule_name: "Example Rule".into(), - rule_confidence: Confidence::Medium, + rule, validation_response_body: String::new(), validation_response_status: 0, validation_success: false, @@ -106,7 +119,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> { id: BlobId::new(b"dummy"), 
num_bytes: 10, mime_essence: None, - charset: None, language: None, }, m: m1, @@ -123,7 +135,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> { id: BlobId::new(b"dummy"), num_bytes: 10, mime_essence: None, - charset: None, language: None, }, m: m2,