diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6e57853..fb350a4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,15 @@
All notable changes to this project will be documented in this file.
+## [1.48.0]
+- Improved error message when self-update cannot find the current binary
+- Optimized memory usage via string interning and extensive data sharing
+- Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance
+- Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets
+- Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB
+- Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files
+- Increased max-file-size default to 64 MB (up from 25 MB)
+
## [1.47.0]
- MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them
- Improved rules: github oauth2, diffbot, mailchimp, aws
diff --git a/Cargo.toml b/Cargo.toml
index 4e774bf..797ef62 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,7 +10,7 @@ publish = false
[package]
name = "kingfisher"
-version = "1.47.0"
+version = "1.48.0"
description = "MongoDB's blazingly fast secret scanning and validation tool"
edition.workspace = true
rust-version.workspace = true
@@ -65,7 +65,7 @@ smallvec = { version = "1", features = [
tracing = "0.1.41"
indicatif = { version = "0.17", features = ["improved_unicode"] }
rayon = "1.10"
-sha1 = "0.10.6"
+sha1 = { version = "0.10.6", features = ["asm"] }
hex = "0.4.3"
vectorscan-rs = "0.0.5"
regex = "1.11.1"
@@ -151,6 +151,7 @@ tar = "0.4.44"
xz2 = "0.1.7"
asar = "0.3.0"
blake3 = "1.8.2"
+memchr = "2.7"
memmap2 = "0.9.7"
futures = "0.3.31"
dashmap = "6.1.0"
diff --git a/README.md b/README.md
index cccbf34..1da58dd 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,64 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
+- [Kingfisher](#kingfisher)
+ - [Key Features](#key-features)
+- [Benchmark Results](#benchmark-results)
+- [Getting Started](#getting-started)
+ - [Installation](#installation)
+ - [Run Kingfisher in Docker](#run-kingfisher-in-docker)
+- [Detection Rules at a Glance](#-detection-rules-at-a-glance)
+ - [Write Custom Rules!](#write-custom-rules)
+- [Usage](#usage)
+ - [Basic Examples](#basic-examples)
+ - [Scan with secret validation](#scan-with-secret-validation)
+ - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories)
+ - [Scan a Git repository without validation](#scan-a-git-repository-without-validation)
+ - [Display only secrets confirmed active by third-party APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis)
+ - [Output JSON and capture to a file](#output-json-and-capture-to-a-file)
+ - [Output SARIF directly to disk](#output-sarif-directly-to-disk)
+ - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--)
+ - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size)
+ - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag)
+ - [Display rule performance statistics](#display-rule-performance-statistics)
+ - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files)
+ - [Exclude specific paths](#exclude-specific-paths)
+ - [Scan an S3 bucket](#scan-an-s3-bucket)
+ - [Scanning Docker Images](#scanning-docker-images)
+ - [Scanning GitHub](#scanning-github)
+ - [Scan GitHub organisation (requires `KF_GITHUB_TOKEN`)](#scan-github-organisation-requires-kf_github_token)
+ - [Scan remote GitHub repository](#scan-remote-github-repository)
+ - [Scanning GitLab](#scanning-gitlab)
+ - [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token)
+ - [Scan GitLab user](#scan-gitlab-user)
+ - [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url)
+ - [List GitLab repositories](#list-gitlab-repositories)
+ - [Scanning Jira](#scanning-jira)
+ - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query)
+ - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues)
+ - [Scanning Confluence](#scanning-confluence)
+ - [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query)
+ - [Scanning Slack](#scanning-slack)
+ - [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query)
+ - [Environment Variables for Tokens](#environment-variables-for-tokens)
+ - [Exit Codes](#exit-codes)
+ - [Update Checks](#update-checks)
+- [Advanced Options](#advanced-options)
+ - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets)
+ - [List Builtin Rules](#list-builtin-rules)
+ - [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run)
+ - [To add your rules alongside the built-ins:](#to-add-your-rules-alongside-the-builtins)
+ - [Other Examples](#other-examples)
+ - [Notable Scan Options](#notable-scan-options)
+ - [Understanding `--confidence`](#understanding---confidence)
+ - [Ignore known false positives](#ignore-known-false-positives)
+ - [Finding Fingerprint](#finding-fingerprint)
+ - [Rule Performance Profiling](#rule-performance-profiling)
+ - [CLI Options](#cli-options)
+ - [Origins and Divergence](#origins-and-divergence)
+- [Roadmap](#roadmap)
+- [License](#license)
+
# Getting Started
## Installation
@@ -225,9 +283,18 @@ cat /path/to/file.py | kingfisher scan -
```
+### Limit maximum file size scanned (`--max-file-size`)
+
+By default, Kingfisher skips files larger than **64 MB**. You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**.
+
+```bash
+# Scan files up to 250 MB in size
+kingfisher scan /some/file --max-file-size 250
+```
+
### Scan using a rule _family_ with one flag
-_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws._`)\*
+_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_
```bash
# Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2)
@@ -617,6 +684,14 @@ kingfisher github repos list --organization my-org
- `--skip-regex `: Ignore findings whose text matches this regex (repeatable)
- `--skip-word `: Ignore findings containing this case-insensitive word (repeatable)
+## Understanding `--confidence`
+
+The `--confidence` flag sets a minimum confidence threshold, not an exact match.
+
+- If you pass `--confidence medium`, findings with **medium and higher** confidence (medium + high) will be included.
+- If you pass `--confidence low`, you'll see **all levels** (low, medium, high).
+
+
### Ignore known false positives
Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context.
diff --git a/data/rules/jwt.yml b/data/rules/jwt.yml
index e596027..b237b4c 100644
--- a/data/rules/jwt.yml
+++ b/data/rules/jwt.yml
@@ -5,7 +5,7 @@ rules:
(?x)
\b
(
- ey[A-Za-z0-9_-]{12,} (?# header )
+ (?:ey|ewogIC)[A-Za-z0-9_-]{12,} (?# header )
\.
ey[A-Za-z0-9_-]{12,} (?# payload )
\.
diff --git a/src/blob.rs b/src/blob.rs
index a54a4a8..9b1aff1 100644
--- a/src/blob.rs
+++ b/src/blob.rs
@@ -10,16 +10,20 @@ use anyhow::Result;
use bstr::{BString, ByteSlice};
use gix::ObjectId;
use hex;
+use once_cell::sync::OnceCell;
use parking_lot::Mutex;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use sha1::{Digest, Sha1};
use smallvec::SmallVec;
+use std::sync::atomic::{AtomicU64, Ordering};
use crate::git_commit_metadata::CommitMetadata;
// const LARGE_FILE_THRESHOLD: u64 = 512 * 1024; // 512 KB
const LARGE_FILE_THRESHOLD: u64 = 0; // always mmap
+static NEXT_ID: AtomicU64 = AtomicU64::new(1);
+
/// The data of a blob, either owned (small files) or memory mapped (large files).
pub enum BlobData<'a> {
/// Small blobs β remains as-is.
@@ -75,47 +79,68 @@ pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
/// A Git blob, storing its SHA-1 id and its contents.
pub struct Blob<'a> {
- pub id: BlobId,
- pub data: BlobData<'a>,
+ id: OnceCell,
+ data: BlobData<'a>,
+ temp_id: u64,
}
impl Blob<'_> {
#[inline]
-
pub fn from_file>(path: P) -> Result {
let mut file = File::open(&path)?;
let file_size = file.metadata()?.len();
+ let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
if file_size > LARGE_FILE_THRESHOLD {
// Large files: one mmap, zero extra copies.
let mmap = unsafe { memmap2::Mmap::map(&file)? };
- let id = BlobId::new(mmap.as_ref());
- Ok(Blob { id, data: BlobData::Mapped(mmap) })
+ Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id })
} else {
// Small files: reuse the same handle and pre-allocate exact capacity
let mut bytes = Vec::with_capacity(file_size as usize);
file.read_to_end(&mut bytes)?;
- let id = BlobId::new(&bytes);
- Ok(Blob { id, data: BlobData::Owned(bytes) })
+ Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id })
}
}
+
/// Returns the blob's bytes as a slice.
#[inline]
pub fn bytes(&self) -> &[u8] {
self.data.as_ref()
}
+ /// Lazily compute and return the blob's SHA-1 `BlobId`.
+ #[inline]
+ pub fn id(&self) -> BlobId {
+ *self.id.get_or_init(|| BlobId::new(self.bytes()))
+ }
+
+ /// Get a reference to the blob's SHA-1 `BlobId`, computing it if necessary.
+ #[inline]
+ pub fn id_ref(&self) -> &BlobId {
+ self.id.get_or_init(|| BlobId::new(self.bytes()))
+ }
+
+ /// Return the temporary identifier assigned on blob creation.
+ #[inline]
+ pub fn temp_id(&self) -> u64 {
+ self.temp_id
+ }
+
/// Create a new `Blob` from a vector of bytes.
#[inline]
pub fn from_bytes(bytes: Vec) -> Self {
- let id = BlobId::compute_from_bytes(&bytes);
- Blob { id, data: BlobData::Owned(bytes) }
+ let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+ Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }
}
/// Create a new `Blob` with the given id and data.
#[inline]
pub fn new(id: BlobId, bytes: Vec) -> Self {
- Blob { id, data: BlobData::Owned(bytes) }
+ let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+ let cell = OnceCell::new();
+ let _ = cell.set(id);
+ Blob { id: cell, data: BlobData::Owned(bytes), temp_id }
}
#[inline]
@@ -269,9 +294,15 @@ impl BlobId {
/// Create a new BlobId computed from the given input.
#[inline]
pub fn new(input: &[u8]) -> Self {
+ const CHUNK: usize = 64 * 1024; // 64KB from start and end
let mut hasher = Sha1::new();
write!(&mut hasher, "blob {}\0", input.len()).unwrap();
- hasher.update(input);
+ if input.len() <= CHUNK * 2 {
+ hasher.update(input);
+ } else {
+ hasher.update(&input[..CHUNK]);
+ hasher.update(&input[input.len() - CHUNK..]);
+ }
BlobId(hasher.finalize().as_slice().try_into().expect("SHA-1 output size mismatch"))
}
@@ -360,9 +391,6 @@ pub struct BlobMetadata {
/// The guessed multimedia type of the blob
pub mime_essence: Option,
- /// The guessed charset of the blob
- pub charset: Option,
-
/// The guessed programming language of the blob
pub language: Option,
}
@@ -384,9 +412,4 @@ impl BlobMetadata {
pub fn mime_essence(&self) -> Option<&str> {
self.mime_essence.as_deref()
}
-
- #[inline]
- pub fn charset(&self) -> Option<&str> {
- self.charset.as_deref()
- }
}
diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs
index f0b8fa8..4f2dc5b 100644
--- a/src/cli/commands/inputs.rs
+++ b/src/cli/commands/inputs.rs
@@ -169,7 +169,7 @@ pub struct InputSpecifierArgs {
#[derive(Args, Debug, Clone)]
pub struct ContentFilteringArgs {
/// Ignore files larger than the given size in MB
- #[arg(long("max-file-size"), default_value_t = 25.0)]
+ #[arg(long("max-file-size"), default_value_t = 64.0)]
pub max_file_size_mb: f64,
// /// Use custom path-based ignore rules from the given file(s)
diff --git a/src/content_type.rs b/src/content_type.rs
index 197258d..ee44da6 100644
--- a/src/content_type.rs
+++ b/src/content_type.rs
@@ -96,9 +96,8 @@ impl ContentInspector {
#[inline]
#[must_use]
pub fn guess_charset(&self, bytes: &[u8]) -> Option {
- String::from_utf8(bytes.to_vec()).ok().map(|_| "UTF-8".to_string())
+ std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
}
-
/// Guess programming language with broad coverage using `tokei`.
///
/// Strategy (no disk I/O):
diff --git a/src/decompress.rs b/src/decompress.rs
index 49dc7ef..46ae5d5 100644
--- a/src/decompress.rs
+++ b/src/decompress.rs
@@ -161,11 +161,8 @@ fn handle_zip_archive_streaming(
Ok(CompressedContent::ArchiveFiles(entries_on_disk))
}
-fn handle_asar_archive_in_memory(
- buffer: Vec,
- archive_path: &Path,
-) -> Result {
- match AsarReader::new(&buffer, None) {
+fn handle_asar_archive_in_memory(buffer: &[u8], archive_path: &Path) -> Result {
+ match AsarReader::new(buffer, None) {
Ok(reader) => {
let mut contents = Vec::new();
for (path_in_asar, file) in reader.files() {
@@ -200,7 +197,7 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result {
let mmap = unsafe { Mmap::map(&file)? };
- return handle_asar_archive_in_memory(mmap.to_vec(), path);
+ return handle_asar_archive_in_memory(&mmap, path);
}
"tar" => {
if let Some(base) = base_dir {
diff --git a/src/findings_store.rs b/src/findings_store.rs
index fc610f2..23d8c7b 100644
--- a/src/findings_store.rs
+++ b/src/findings_store.rs
@@ -18,6 +18,7 @@ use crate::{
matcher::Match,
origin::{Origin, OriginSet},
rules::rule::Rule,
+ util::intern,
};
// share with Arc so every blob/origin is materialised once
@@ -151,7 +152,7 @@ impl FindingsStore {
.captures
.get(1)
.or_else(|| m.groups.captures.get(0))
- .map_or("", |c| c.value.as_ref());
+ .map_or("", |c| c.value);
let origin_kind = match origin.first() {
Origin::GitRepo(_) => "git",
@@ -160,7 +161,7 @@ impl FindingsStore {
};
let key = xxh3_64(
- format!("{}|{}|{}", m.rule_text_id.to_uppercase(), origin_kind, snippet)
+ format!("{}|{}|{}", m.rule.id().to_uppercase(), origin_kind, snippet)
.as_bytes(),
);
@@ -280,7 +281,7 @@ impl FindingsStore {
pub fn get_summary(&self) -> FxHashMap<&'static str, usize> {
self.matches.iter().fold(FxHashMap::default(), |mut acc, msg| {
let (_, _, m) = &**msg;
- *acc.entry(m.rule_name).or_insert(0) += 1; // borrow, no alloc
+ *acc.entry(intern(m.rule.name())).or_insert(0) += 1;
acc
})
}
@@ -342,13 +343,13 @@ impl FindingsStore {
self.matches.iter().map(|msg| {
let (_, _, match_item) = &**msg;
finding_data::FindingMetadata {
- rule_name: match_item.rule_name.to_string(),
+ rule_name: match_item.rule.name().to_string(),
num_matches: 1,
comment: None,
visible: match_item.visible,
finding_id: match_item.finding_id(),
- rule_finding_fingerprint: match_item.rule_finding_fingerprint.to_string(),
- rule_text_id: match_item.rule_text_id.to_string(),
+ rule_finding_fingerprint: match_item.rule.finding_sha1_fingerprint().to_string(),
+ rule_text_id: match_item.rule.id().to_string(),
}
})
}
@@ -362,7 +363,7 @@ impl FindingsStore {
.iter()
.filter(|msg| {
let (_, _, match_item) = &***msg;
- match_item.rule_name == metadata.rule_name
+ match_item.rule.name() == metadata.rule_name
})
.map(|msg| {
let (origin, blob_metadata, match_item) = &**msg;
@@ -373,7 +374,7 @@ impl FindingsStore {
match_id: MatchIdInt::from_str(&match_item.finding_id())?,
match_comment: None,
visible: match_item.visible,
- match_confidence: match_item.rule_confidence,
+ match_confidence: match_item.rule.confidence(),
validation_response_body: match_item.validation_response_body.clone(),
validation_response_status: match_item.validation_response_status,
validation_success: match_item.validation_success,
diff --git a/src/location.rs b/src/location.rs
index 4af7021..a50602a 100644
--- a/src/location.rs
+++ b/src/location.rs
@@ -1,4 +1,5 @@
use core::ops::Range;
+use std::cell::RefCell;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
@@ -87,25 +88,35 @@ impl std::fmt::Display for SourceSpan {
}
/// Records newline byteβoffsets to map offsets -- (line, column).
-pub struct LocationMapping {
- newline_offsets: Vec,
+pub struct LocationMapping<'a> {
+ bytes: &'a [u8],
+ newline_offsets: RefCell>,
}
-impl LocationMapping {
- /// Scan once for all `\n` positions.
- pub fn new(input: &[u8]) -> Self {
- let newline_offsets =
- input.iter().enumerate().filter_map(|(i, &b)| (b == b'\n').then_some(i)).collect();
- LocationMapping { newline_offsets }
+impl<'a> LocationMapping<'a> {
+ /// Create a new mapping without pre-scanning the entire input.
+ pub fn new(input: &'a [u8]) -> Self {
+ LocationMapping { bytes: input, newline_offsets: RefCell::new(Vec::new()) }
}
- /// Map a byte offset to a `SourcePoint`.
- pub fn get_source_point(&self, offset: usize) -> SourcePoint {
- let line = match self.newline_offsets.binary_search(&offset) {
- Ok(idx) => idx + 2, // exact newline -- next line
+    /// Extends the lazily built newline index to cover every byte up to and including `offset`.
+    fn ensure_offsets_up_to(&self, offset: usize) {
+        let mut offsets = self.newline_offsets.borrow_mut();
+        let start = offsets.last().map_or(0, |&last| last + 1);
+        // Inclusive of `offset`: a newline exactly at `offset` must be indexed now, otherwise the
+        // same offset maps to a different line once a later call has scanned past it.
+        let end = offset.saturating_add(1).min(self.bytes.len());
+        if start < end {
+            offsets.extend(memchr::memchr_iter(b'\n', &self.bytes[start..end]).map(|nl| start + nl));
+        }
+    }
+
+ fn source_point_from_offsets(offsets: &[usize], offset: usize) -> SourcePoint {
+ let line = match offsets.binary_search(&offset) {
+ Ok(idx) => idx + 2,
Err(idx) => idx + 1,
};
- let column = if let Some(&last) = self.newline_offsets.get(line.saturating_sub(2)) {
+ let column = if let Some(&last) = offsets.get(line.saturating_sub(2)) {
offset.saturating_sub(last + 1)
} else {
offset
@@ -113,10 +124,19 @@ impl LocationMapping {
SourcePoint { line, column }
}
+ /// Map a byte offset to a `SourcePoint`.
+ pub fn get_source_point(&self, offset: usize) -> SourcePoint {
+ self.ensure_offsets_up_to(offset);
+ let offsets = self.newline_offsets.borrow();
+ Self::source_point_from_offsets(&offsets, offset)
+ }
+
/// Map an `OffsetSpan` -- `SourceSpan` (closed interval).
pub fn get_source_span(&self, span: &OffsetSpan) -> SourceSpan {
- let start = self.get_source_point(span.start);
- let end = self.get_source_point(span.end.saturating_sub(1));
+ self.ensure_offsets_up_to(span.end.saturating_sub(1));
+ let offsets = self.newline_offsets.borrow();
+ let start = Self::source_point_from_offsets(&offsets, span.start);
+ let end = Self::source_point_from_offsets(&offsets, span.end.saturating_sub(1));
SourceSpan { start, end }
}
}
diff --git a/src/matcher.rs b/src/matcher.rs
index 1cbb16f..d30b0d8 100644
--- a/src/matcher.rs
+++ b/src/matcher.rs
@@ -10,7 +10,6 @@ use anyhow::Result;
use base64::{engine::general_purpose, Engine};
use bstr::BString;
use http::StatusCode;
-use lazy_static::lazy_static;
use regex::bytes::Regex;
use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
use schemars::{
@@ -27,7 +26,7 @@ use xxhash_rust::xxh3::xxh3_64;
use crate::{
blob::{Blob, BlobId, BlobIdMap},
entropy::calculate_shannon_entropy,
- location::{Location, LocationMapping, OffsetSpan},
+ location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
origin::OriginSet,
parser,
parser::{Checker, Language},
@@ -40,6 +39,10 @@ use crate::{
util::{intern, redact_value},
};
+const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
+const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
+const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
+
// -------------------------------------------------------------------------------------------------
// RawMatch
// -------------------------------------------------------------------------------------------------
@@ -255,17 +258,28 @@ impl<'a> Matcher<'a> {
self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64));
self.user_data.input_len = input.len() as u64;
- // self.vs_scanner.scan(input, |rid, from, to, _flags| {
- self.scanner_pool.with(|scanner| {
- scanner.scan(input, |rule_id, from, to, _flags| {
- self.user_data.raw_matches_scratch.push(RawMatch {
- rule_id,
- start_idx: from,
- end_idx: to,
- });
- vectorscan_rs::Scan::Continue
- })
- })?;
+
+ let mut offset: usize = 0;
+ while offset < input.len() {
+ let end = (offset + MAX_CHUNK_SIZE).min(input.len());
+ let slice = &input[offset..end];
+ let base = offset as u64;
+ self.scanner_pool.with(|scanner| {
+ scanner.scan(slice, |rule_id, from, to, _flags| {
+ self.user_data.raw_matches_scratch.push(RawMatch {
+ rule_id,
+ start_idx: from + base,
+ end_idx: to + base,
+ });
+ vectorscan_rs::Scan::Continue
+ })
+ })?;
+
+ if end == input.len() {
+ break;
+ }
+ offset = end.saturating_sub(CHUNK_OVERLAP);
+ }
Ok(())
}
@@ -288,17 +302,6 @@ impl<'a> Matcher<'a> {
self.local_stats.blobs_scanned += 1;
self.local_stats.bytes_scanned += blob.bytes().len() as u64;
- // Check if blob was already seen and respect no_dedup flag
- if !no_dedup {
- if let Some(had_matches) = self.seen_blobs.get(&blob.id) {
- return Ok(if had_matches {
- ScanResult::SeenWithMatches
- } else {
- ScanResult::SeenSansMatches
- });
- }
- }
-
// Extract filename from origin
let filename = origin
.first()
@@ -313,24 +316,19 @@ impl<'a> Matcher<'a> {
// Opportunistically look for standalone Base64 blobs. If neither
// the raw scan nor this check yields anything, we can return early
// before doing any heavier work.
- let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) };
+ let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
+ Vec::new()
+ } else {
+ get_base64_strings(blob.bytes())
+ };
if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() {
- // Only record in seen_blobs if deduplication is enabled
- if !no_dedup {
- return Ok(match self.seen_blobs.insert(blob.id, false) {
- None => ScanResult::New(Vec::new()),
- Some(true) => ScanResult::SeenWithMatches,
- Some(false) => ScanResult::SeenSansMatches,
- });
- } else {
- return Ok(ScanResult::New(Vec::new()));
- }
+ return Ok(ScanResult::New(Vec::new()));
}
let rules_db = self.rules_db;
let mut seen_matches = FxHashSet::default();
- let mut previous_matches = Vec::new();
+ let mut previous_matches: FxHashMap> = FxHashMap::default();
let tree_sitter_result = if self.user_data.raw_matches_scratch.is_empty() {
None
} else {
@@ -363,27 +361,19 @@ impl<'a> Matcher<'a> {
})
.collect::>()
});
- let mut previous_raw_matches: Vec<(usize, OffsetSpan)> = Vec::new();
+ let mut previous_raw_matches: FxHashMap> = FxHashMap::default();
for &RawMatch { rule_id, start_idx, end_idx } in
self.user_data.raw_matches_scratch.iter().rev()
{
let rule_id_usize: usize = rule_id as usize;
- // let rule = &rules_db.rules[rule_id_usize];
let rule = Arc::clone(&rules_db.rules[rule_id_usize]);
let re = &rules_db.anchored_regexes[rule_id_usize];
let start_idx_usize = start_idx as usize;
let end_idx_usize = end_idx as usize;
let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
- // Skip if fully contained in a previous match
- if previous_raw_matches.iter().any(|(prev_id, prev_span): &(usize, OffsetSpan)| {
- *prev_id == rule_id_usize
- && (prev_span.fully_contains(¤t_span)
- || current_span.fully_contains(prev_span))
- }) {
+ if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
continue;
}
- let matching_input_offset_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
- previous_raw_matches.push((rule_id_usize, matching_input_offset_span));
filter_match(
blob,
rule,
@@ -472,9 +462,15 @@ impl<'a> Matcher<'a> {
}
}
// Finalize
- // Only record in seen_blobs if deduplication is enabled
- if !no_dedup {
- self.seen_blobs.insert(blob.id, !matches.is_empty());
+ if !no_dedup && !matches.is_empty() {
+ let blob_id = blob.id();
+ if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
+ return Ok(if had_matches {
+ ScanResult::SeenWithMatches
+ } else {
+ ScanResult::SeenSansMatches
+ });
+ }
}
// --- opportunistic capacity cap ---------------------------------
@@ -501,6 +497,39 @@ fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -
end.hash(&mut hasher);
hasher.finish()
}
+
+/// Inserts `span` into the start-sorted, nesting-free `spans`; returns `false` if it is already covered.
+#[inline]
+fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
+    let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
+    if idx > 0 {
+        if spans[idx - 1].fully_contains(&span) {
+            return false;
+        }
+        if span.fully_contains(&spans[idx - 1]) {
+            spans.remove(idx - 1);
+            idx -= 1;
+        }
+    }
+    if idx < spans.len() && spans[idx].fully_contains(&span) {
+        return false;
+    }
+    // Evict every span nested in the new one, not just the first: a leftover nested span breaks the
+    // invariant the neighbour checks rely on (assumes `fully_contains` is inclusive containment).
+    let nested = spans[idx..].iter().take_while(|s| span.fully_contains(s)).count();
+    spans.drain(idx..idx + nested);
+    spans.insert(idx, span);
+    true
+}
+
+#[inline]
+fn record_match(
+ map: &mut FxHashMap>,
+ rule_id: usize,
+ span: OffsetSpan,
+) -> bool {
+ insert_span(map.entry(rule_id).or_default(), span)
+}
fn filter_match<'b>(
blob: &'b Blob,
// rule: &'b Rule,
@@ -509,7 +538,7 @@ fn filter_match<'b>(
start: usize,
end: usize,
matches: &mut Vec>,
- previous_matches: &mut Vec<(usize, OffsetSpan)>,
+ previous_matches: &mut FxHashMap>,
rule_id: usize,
seen_matches: &mut FxHashSet,
_origin: &OriginSet,
@@ -558,11 +587,7 @@ fn filter_match<'b>(
if !seen_matches.insert(match_key) {
continue;
}
- if previous_matches.iter().any(|(prev_rule_id, prev_loc)| {
- *prev_rule_id == rule_id
- && (prev_loc.fully_contains(&matching_input_offset_span)
- || matching_input_offset_span.fully_contains(prev_loc))
- }) {
+ if !record_match(previous_matches, rule_id, matching_input_offset_span) {
continue;
}
let only_matching_input =
@@ -571,7 +596,7 @@ fn filter_match<'b>(
SerializableCaptures::from_captures(&captures, byte_slice.as_ref(), re, redact);
matches.push(BlobMatch {
rule: Arc::clone(&rule),
- blob_id: &blob.id,
+ blob_id: blob.id_ref(),
matching_input: only_matching_input,
matching_input_offset_span,
captures: groups,
@@ -581,7 +606,6 @@ fn filter_match<'b>(
calculated_entropy,
is_base64,
});
- previous_matches.push((rule_id, matching_input_offset_span));
}
if let Some(t) = timer.take() {
let new_count = (matches.len() - initial_len) as u64;
@@ -682,20 +706,20 @@ impl JsonSchema for Groups {
// pub end: usize, // End position of the match
// pub value: String, // The actual captured value
// }
-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCapture {
pub name: Option,
pub match_number: i32,
pub start: usize,
pub end: usize,
- // Instead of storing an owned String, store a borrowed or interned value.
- // Here we use Cow to allow either borrowing or owning as needed.
- pub value: std::borrow::Cow<'static, str>,
+ /// Interned value of the capture.
+ pub value: &'static str,
}
-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCaptures {
- pub captures: Vec, // All captures (named and unnamed)
+ #[schemars(with = "Vec")]
+ pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
}
impl SerializableCaptures {
pub fn from_captures(
@@ -704,7 +728,7 @@ impl SerializableCaptures {
re: &Regex,
redact: bool,
) -> Self {
- let mut serialized_captures = Vec::new();
+ let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
// Process named captures
for name in re.capture_names().flatten() {
if let Some(capture) = captures.name(name) {
@@ -718,7 +742,7 @@ impl SerializableCaptures {
match_number: -1,
start: capture.start(),
end: capture.end(),
- value: value.into(),
+ value: intern(&value),
});
}
}
@@ -735,7 +759,7 @@ impl SerializableCaptures {
match_number: i32::try_from(i).unwrap_or(0),
start: capture.start(),
end: capture.end(),
- value: value.into(),
+ value: intern(&value),
});
}
}
@@ -764,16 +788,9 @@ pub struct Match {
pub finding_fingerprint: u64,
/// The rule that produced this match
- pub rule_finding_fingerprint: &'static str,
-
- /// The text identifier of the rule that produced this match
- pub rule_text_id: &'static str,
-
- /// The name of the rule that produced this match
- pub rule_name: &'static str,
-
- /// The confidence property of the rule that produced this match
- pub rule_confidence: crate::rules::rule::Confidence,
+ #[serde(skip_serializing)]
+ #[schemars(skip)]
+ pub rule: Arc,
/// Validation Body
pub validation_response_body: String,
@@ -794,7 +811,7 @@ pub struct Match {
impl Match {
#[inline]
pub fn convert_owned_blobmatch_to_match<'a>(
- loc_mapping: &'a LocationMapping,
+ loc_mapping: Option<&'a LocationMapping<'a>>,
owned_blob_match: &'a OwnedBlobMatch,
origin_type: &'a str,
) -> Self {
@@ -812,9 +829,11 @@ impl Match {
// The fingerprint will be based on the content of the secret.
let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
- let source_span = loc_mapping.get_source_span(&offset_span);
- let rule_finding_fingerprint = owned_blob_match.rule.finding_sha1_fingerprint().to_owned();
-
+ let source_span =
+ loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
+ start: SourcePoint { line: 0, column: 0 },
+ end: SourcePoint { line: 0, column: 0 },
+ });
let offset_start: u64 =
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
@@ -828,10 +847,7 @@ impl Match {
// matching_snippet
Match {
- rule_finding_fingerprint: intern(&rule_finding_fingerprint),
- rule_name: intern(owned_blob_match.rule.name()),
- rule_confidence: owned_blob_match.rule.confidence(),
- rule_text_id: intern(owned_blob_match.rule.id()),
+ rule: owned_blob_match.rule.clone(),
visible: owned_blob_match.rule.visible().to_owned(),
location: Location { offset_span, source_span: source_span.clone() },
groups: owned_blob_match.captures.clone(),
@@ -852,7 +868,7 @@ impl Match {
pub fn finding_id(&self) -> String {
let mut h = Sha1::new();
- write!(&mut h, "{}\0", self.rule_finding_fingerprint)
+ write!(&mut h, "{}\0", self.rule.finding_sha1_fingerprint())
.expect("should be able to write to memory");
serde_json::to_writer(&mut h, &self.groups)
.expect("should be able to serialize groups as JSON");
@@ -892,33 +908,48 @@ pub struct DecodedData {
pub pos_start: usize,
pub pos_end: usize,
}
+#[inline]
+fn is_base64_byte(b: u8) -> bool {
+ matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/')
+}
+
pub fn get_base64_strings(input: &[u8]) -> Vec {
- lazy_static! {
- // Require a reasonably long run of valid Base64 characters to reduce
- // noise. 32 bytes corresponds to 24 decoded bytes.
- static ref RE_BASE64: Regex = Regex::new(r"[A-Za-z0-9+/]{32,}={0,2}").unwrap();
- }
let mut results = Vec::new();
- for m in RE_BASE64.find_iter(input) {
- let base64_string = m.as_bytes();
- // Skip candidates whose length isn't a multiple of four β they cannot
- // be valid Base64.
- if base64_string.len() % 4 != 0 {
- continue;
+ let mut i = 0;
+ while i < input.len() {
+ while i < input.len() && !is_base64_byte(input[i]) {
+ i += 1;
}
- if let Ok(decoded) = general_purpose::STANDARD.decode(base64_string) {
- if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
- if decoded_str.is_ascii() {
- results.push(DecodedData {
- original: String::from_utf8_lossy(base64_string).into_owned(),
- decoded: decoded_str.to_string(),
- pos_start: m.start(),
- pos_end: m.end(),
- });
+ let start = i;
+ while i < input.len() && is_base64_byte(input[i]) {
+ i += 1;
+ }
+
+ let mut eq_count = 0;
+ while i < input.len() && input[i] == b'=' && eq_count < 2 {
+ i += 1;
+ eq_count += 1;
+ }
+ let end = i;
+
+ let len = end - start;
+ if len >= 32 && len % 4 == 0 {
+ let base64_slice = &input[start..end];
+ if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) {
+ if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
+ if decoded_str.is_ascii() {
+ results.push(DecodedData {
+ original: String::from_utf8_lossy(base64_slice).into_owned(),
+ decoded: decoded_str.to_string(),
+ pos_start: start,
+ pos_end: end,
+ });
+ }
}
}
}
}
+
results
}
diff --git a/src/reporter.rs b/src/reporter.rs
index c9c2a62..dacf75b 100644
--- a/src/reporter.rs
+++ b/src/reporter.rs
@@ -216,7 +216,7 @@ impl DetailsReporter {
m: match_item.clone(),
comment: None,
visible: match_item.visible,
- match_confidence: match_item.rule_confidence,
+ match_confidence: match_item.rule.confidence(),
validation_response_body: match_item.validation_response_body.clone(),
validation_response_status: match_item.validation_response_status,
validation_success: match_item.validation_success,
@@ -366,13 +366,13 @@ impl DetailsReporter {
FindingReporterRecord {
rule: RuleMetadata {
- name: rm.m.rule_name.to_string(),
- id: rm.m.rule_text_id.to_string(),
+ name: rm.m.rule.name().to_string(),
+ id: rm.m.rule.id().to_string(),
},
finding: FindingRecordData {
snippet,
fingerprint: rm.m.finding_fingerprint.to_string(),
- confidence: rm.match_confidence.to_string(),
+ confidence: rm.m.rule.confidence().to_string(),
entropy: format!("{:.2}", rm.m.calculated_entropy),
validation: ValidationInfo { status: validation_status, response: response_body },
language: rm
diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs
index 0b562d2..1ee7fb7 100644
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@@ -35,6 +35,7 @@ mod tests {
use crate::cli::commands::github::GitHistoryMode;
use crate::cli::commands::rules::RuleSpecifierArgs;
use crate::matcher::{SerializableCapture, SerializableCaptures};
+ use crate::rules::rule::{Confidence, Rule, RuleSyntax};
use crate::util::intern;
use crate::{
blob::BlobId,
@@ -49,6 +50,7 @@ mod tests {
origin::Origin,
reporter::styles::Styles,
};
+ use smallvec::smallvec;
use std::{
io::Cursor,
path::PathBuf,
@@ -132,12 +134,21 @@ mod tests {
}
}
- fn create_mock_match(
- rule_name: &str,
- rule_text_id: &str,
- rule_finding_fingerprint: &str,
- validation_success: bool,
- ) -> Match {
+ fn create_mock_match(rule_name: &str, rule_text_id: &str, validation_success: bool) -> Match {
+ let syntax = RuleSyntax {
+ name: rule_name.to_string(),
+ id: rule_text_id.to_string(),
+ pattern: "dummy".to_string(),
+ min_entropy: 0.0,
+ confidence: Confidence::Medium,
+ visible: true,
+ examples: vec![],
+ negative_examples: vec![],
+ references: vec![],
+ validation: None,
+ depends_on_rule: vec![],
+ };
+ let rule = Arc::new(Rule::new(syntax));
Match {
location: Location {
offset_span: OffsetSpan { start: 10, end: 20 },
@@ -147,20 +158,17 @@ mod tests {
},
},
groups: SerializableCaptures {
- captures: vec![SerializableCapture {
+ captures: smallvec![SerializableCapture {
name: Some("token".to_string()),
match_number: 1,
start: 10,
end: 20,
- value: "mock_token".into(),
+ value: intern("mock_token"),
}],
},
blob_id: BlobId::new(b"mock_blob"),
finding_fingerprint: 0123,
- rule_finding_fingerprint: intern(rule_finding_fingerprint),
- rule_text_id: intern(rule_text_id),
- rule_name: intern(rule_name),
- rule_confidence: Confidence::Medium,
+ rule,
validation_response_body: "validation response".to_string(),
validation_response_status: 200,
validation_success,
@@ -177,7 +185,6 @@ mod tests {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
- charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
};
let dedup = true;
@@ -204,15 +211,13 @@ mod tests {
#[test]
fn test_json_format() -> Result<()> {
- let mock_match =
- create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
+ let mock_match = create_mock_match("MockRule", "mock_rule_1", true);
let matches = vec![ReportMatch {
origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]),
blob_metadata: BlobMetadata {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
- charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
},
m: mock_match,
@@ -238,12 +243,7 @@ mod tests {
fn test_validation_status_in_json() -> Result<()> {
let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")];
for (validation_success, expected_status) in test_cases {
- let mock_match = create_mock_match(
- "MockRule",
- "mock_rule_1",
- "mock_finding_fingerprint",
- validation_success,
- );
+ let mock_match = create_mock_match("MockRule", "mock_rule_1", validation_success);
let matches = vec![ReportMatch {
origin: OriginSet::new(
Origin::from_file(PathBuf::from("/mock/path/file.rs")),
@@ -253,7 +253,6 @@ mod tests {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
- charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
},
m: mock_match,
diff --git a/src/scanner/docker.rs b/src/scanner/docker.rs
index b427a3b..7ebb2d4 100644
--- a/src/scanner/docker.rs
+++ b/src/scanner/docker.rs
@@ -175,9 +175,11 @@ impl Docker {
pb.set_length(layer_paths.len() as u64);
for p in layer_paths {
- let mut data = Vec::new();
- File::open(&p)?.read_to_end(&mut data)?;
- let digest = format!("{:x}", Sha256::digest(&data));
+ let mut file = File::open(&p)?;
+ let mut hasher = Sha256::new();
+ std::io::copy(&mut file, &mut hasher)?;
+ let digest = format!("{:x}", hasher.finalize());
+
let new_path = out_dir.join(format!("layer_{digest}.tar"));
std::fs::rename(&p, &new_path)?;
// extract layer contents so inner filenames appear in scan results
diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs
index 14861ff..970ca5c 100644
--- a/src/scanner/enumerate.rs
+++ b/src/scanner/enumerate.rs
@@ -172,10 +172,15 @@ pub fn enumerate_filesystem_inputs(
};
if should_skip {
progress.suspend(|| {
+ let path = origin
+ .first()
+ .blob_path()
+ .map(|p| p.display().to_string())
+ .unwrap_or_else(|| blob.temp_id().to_string());
if is_archive {
- debug!("Skipping archive: {}", blob.id);
+ debug!("Skipping archive: {path}");
} else {
- debug!("Skipping binary blob: {}", blob.id);
+ debug!("Skipping binary blob: {path}");
}
});
return Ok(());
diff --git a/src/scanner/processing.rs b/src/scanner/processing.rs
index 9447dbf..5132209 100644
--- a/src/scanner/processing.rs
+++ b/src/scanner/processing.rs
@@ -12,6 +12,8 @@ use crate::{
Path,
};
+const LOCATION_LIMIT_BYTES: usize = 256 * 1024 * 1024;
+
/// A matcher along with parameters that remain constant during a single
/// `scan` run
pub struct BlobProcessor<'a> {
@@ -27,8 +29,7 @@ impl<'a> BlobProcessor<'a> {
redact: bool,
no_base64: bool,
) -> Result