Merge pull request #105 from mongodb/development

v1.48.0
This commit is contained in:
Mick Grove 2025-09-05 11:30:10 -07:00 committed by GitHub
commit f43a26030d
22 changed files with 440 additions and 1332 deletions

View file

@ -2,6 +2,15 @@
All notable changes to this project will be documented in this file.
## [1.48.0]
- Improved error message when self-update cannot find the current binary
- Optimized memory usage via string interning and extensive data sharing
- Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance
- Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets
- Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB
- Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files
- Increased max-file-size default to 64 MB (up from 25 MB)
## [1.47.0]
- MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them
- Improved rules: github oauth2, diffbot, mailchimp, aws

View file

@ -10,7 +10,7 @@ publish = false
[package]
name = "kingfisher"
version = "1.47.0"
version = "1.48.0"
description = "MongoDB's blazingly fast secret scanning and validation tool"
edition.workspace = true
rust-version.workspace = true
@ -65,7 +65,7 @@ smallvec = { version = "1", features = [
tracing = "0.1.41"
indicatif = { version = "0.17", features = ["improved_unicode"] }
rayon = "1.10"
sha1 = "0.10.6"
sha1 = { version = "0.10.6", features = ["asm"] }
hex = "0.4.3"
vectorscan-rs = "0.0.5"
regex = "1.11.1"
@ -151,6 +151,7 @@ tar = "0.4.44"
xz2 = "0.1.7"
asar = "0.3.0"
blake3 = "1.8.2"
memchr = "2.7"
memmap2 = "0.9.7"
futures = "0.3.31"
dashmap = "6.1.0"

View file

@ -36,6 +36,64 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
<img src="docs/runtime-comparison.png" alt="Kingfisher Runtime Comparison" style="vertical-align: center;" />
</p>
- [Kingfisher](#kingfisher)
- [Key Features](#key-features)
- [Benchmark Results](#benchmark-results)
- [Getting Started](#getting-started)
- [Installation](#installation)
- [Run Kingfisher in Docker](#run-kingfisher-in-docker)
- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance)
- [Write Custom Rules!](#write-custom-rules)
- [Usage](#usage)
- [Basic Examples](#basic-examples)
- [Scan with secret validation](#scan-with-secret-validation)
- [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories)
- [Scan a Git repository without validation](#scan-a-git-repository-without-validation)
- [Display only secrets confirmed active by thirdparty APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis)
- [Output JSON and capture to a file](#output-json-and-capture-to-a-file)
- [Output SARIF directly to disk](#output-sarif-directly-to-disk)
- [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--)
- [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size)
- [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag)
- [Display rule performance statistics](#display-rule-performance-statistics)
- [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files)
- [Exclude specific paths](#exclude-specific-paths)
- [Scan an S3 bucket](#scan-an-s3-bucket)
- [Scanning Docker Images](#scanning-docker-images)
- [Scanning GitHub](#scanning-github)
- [Scan GitHub organisation (requires `KF_GITHUB_TOKEN`)](#scan-github-organisation-requires-kf_github_token)
- [Scan remote GitHub repository](#scan-remote-github-repository)
- [Scanning GitLab](#scanning-gitlab)
- [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token)
- [Scan GitLab user](#scan-gitlab-user)
- [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url)
- [List GitLab repositories](#list-gitlab-repositories)
- [Scanning Jira](#scanning-jira)
- [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query)
- [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues)
- [Scanning Confluence](#scanning-confluence)
- [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query)
- [Scanning Slack](#scanning-slack)
- [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query)
- [Environment Variables for Tokens](#environment-variables-for-tokens)
- [Exit Codes](#exit-codes)
- [Update Checks](#update-checks)
- [Advanced Options](#advanced-options)
- [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets)
- [List Builtin Rules](#list-builtin-rules)
- [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run)
- [To add your rules alongside the builtins:](#to-add-your-rules-alongside-the-builtins)
- [Other Examples](#other-examples)
- [Notable Scan Options](#notable-scan-options)
- [Understanding `--confidence`](#understanding---confidence)
- [Ignore known false positives](#ignore-known-false-positives)
- [Finding Fingerprint](#finding-fingerprint)
- [Rule Performance Profiling](#rule-performance-profiling)
- [CLI Options](#cli-options)
- [Origins and Divergence](#origins-and-divergence)
- [Roadmap](#roadmap)
- [License](#license)
# Getting Started
## Installation
@ -225,9 +283,18 @@ cat /path/to/file.py | kingfisher scan -
```
### Limit maximum file size scanned (`--max-file-size`)
By default, Kingfisher skips files larger than **64 MB**. You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**.
```bash
# Scan files up to 250 MB in size
kingfisher scan /some/file --max-file-size 250
```
### Scan using a rule _family_ with one flag
_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws._`)\*
_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_
```bash
# Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2)
@ -617,6 +684,14 @@ kingfisher github repos list --organization my-org
- `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable)
- `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable)
## Understanding `--confidence`
The `--confidence` flag sets a minimum confidence threshold, not an exact match.
- If you pass `--confidence medium`, findings with **medium and higher** confidence (medium + high) will be included.
- If you pass `--confidence low`, you'll see **all levels** (low, medium, high).
### Ignore known false positives
Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context.

View file

@ -5,7 +5,7 @@ rules:
(?x)
\b
(
ey[A-Za-z0-9_-]{12,} (?# header )
(?:ey|ewogIC)[A-Za-z0-9_-]{12,} (?# header )
\.
ey[A-Za-z0-9_-]{12,} (?# payload )
\.

View file

@ -10,16 +10,20 @@ use anyhow::Result;
use bstr::{BString, ByteSlice};
use gix::ObjectId;
use hex;
use once_cell::sync::OnceCell;
use parking_lot::Mutex;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use sha1::{Digest, Sha1};
use smallvec::SmallVec;
use std::sync::atomic::{AtomicU64, Ordering};
use crate::git_commit_metadata::CommitMetadata;
// const LARGE_FILE_THRESHOLD: u64 = 512 * 1024; // 512 KB
const LARGE_FILE_THRESHOLD: u64 = 0; // always mmap
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
/// The data of a blob, either owned (small files) or memory mapped (large files).
pub enum BlobData<'a> {
/// Small blobs remain as-is.
@ -75,47 +79,68 @@ pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
/// A Git blob, storing its SHA-1 id and its contents.
pub struct Blob<'a> {
pub id: BlobId,
pub data: BlobData<'a>,
id: OnceCell<BlobId>,
data: BlobData<'a>,
temp_id: u64,
}
impl Blob<'_> {
#[inline]
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
let mut file = File::open(&path)?;
let file_size = file.metadata()?.len();
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
if file_size > LARGE_FILE_THRESHOLD {
// Large files: one mmap, zero extra copies.
let mmap = unsafe { memmap2::Mmap::map(&file)? };
let id = BlobId::new(mmap.as_ref());
Ok(Blob { id, data: BlobData::Mapped(mmap) })
Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id })
} else {
// Small files: reuse the same handle and pre-allocate exact capacity
let mut bytes = Vec::with_capacity(file_size as usize);
file.read_to_end(&mut bytes)?;
let id = BlobId::new(&bytes);
Ok(Blob { id, data: BlobData::Owned(bytes) })
Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id })
}
}
/// Returns the blob's bytes as a slice.
///
/// The slice is borrowed from the blob's `BlobData` (owned or memory mapped),
/// so no bytes are copied.
#[inline]
pub fn bytes(&self) -> &[u8] {
self.data.as_ref()
}
/// Returns the blob's `BlobId` by value.
///
/// The id is computed from the blob's bytes the first time it is requested
/// and cached in the blob; subsequent calls return the cached value.
#[inline]
pub fn id(&self) -> BlobId {
    let cached = self.id.get_or_init(|| BlobId::new(self.bytes()));
    *cached
}
/// Borrows the blob's `BlobId`, computing and caching it on first use.
///
/// The returned reference points into the blob itself, so it lives as long
/// as the borrow of `self`.
#[inline]
pub fn id_ref(&self) -> &BlobId {
    let compute_id = || BlobId::new(self.bytes());
    self.id.get_or_init(compute_id)
}
/// Return the temporary identifier assigned on blob creation.
///
/// The value is drawn from the `NEXT_ID` atomic counter when the blob is
/// constructed; reading it never computes the SHA-1 `BlobId`.
#[inline]
pub fn temp_id(&self) -> u64 {
self.temp_id
}
/// Create a new `Blob` from a vector of bytes.
#[inline]
pub fn from_bytes(bytes: Vec<u8>) -> Self {
let id = BlobId::compute_from_bytes(&bytes);
Blob { id, data: BlobData::Owned(bytes) }
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }
}
/// Create a new `Blob` with the given id and data.
#[inline]
pub fn new(id: BlobId, bytes: Vec<u8>) -> Self {
Blob { id, data: BlobData::Owned(bytes) }
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
let cell = OnceCell::new();
let _ = cell.set(id);
Blob { id: cell, data: BlobData::Owned(bytes), temp_id }
}
#[inline]
@ -269,9 +294,15 @@ impl BlobId {
/// Create a new BlobId computed from the given input.
#[inline]
pub fn new(input: &[u8]) -> Self {
const CHUNK: usize = 64 * 1024; // 64KB from start and end
let mut hasher = Sha1::new();
write!(&mut hasher, "blob {}\0", input.len()).unwrap();
hasher.update(input);
if input.len() <= CHUNK * 2 {
hasher.update(input);
} else {
hasher.update(&input[..CHUNK]);
hasher.update(&input[input.len() - CHUNK..]);
}
BlobId(hasher.finalize().as_slice().try_into().expect("SHA-1 output size mismatch"))
}
@ -360,9 +391,6 @@ pub struct BlobMetadata {
/// The guessed multimedia type of the blob
pub mime_essence: Option<String>,
/// The guessed charset of the blob
pub charset: Option<String>,
/// The guessed programming language of the blob
pub language: Option<String>,
}
@ -384,9 +412,4 @@ impl BlobMetadata {
pub fn mime_essence(&self) -> Option<&str> {
self.mime_essence.as_deref()
}
#[inline]
pub fn charset(&self) -> Option<&str> {
self.charset.as_deref()
}
}

View file

@ -169,7 +169,7 @@ pub struct InputSpecifierArgs {
#[derive(Args, Debug, Clone)]
pub struct ContentFilteringArgs {
/// Ignore files larger than the given size in MB
#[arg(long("max-file-size"), default_value_t = 25.0)]
#[arg(long("max-file-size"), default_value_t = 64.0)]
pub max_file_size_mb: f64,
// /// Use custom path-based ignore rules from the given file(s)

View file

@ -96,9 +96,8 @@ impl ContentInspector {
#[inline]
#[must_use]
pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
String::from_utf8(bytes.to_vec()).ok().map(|_| "UTF-8".to_string())
std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
}
/// Guess programming language with broad coverage using `tokei`.
///
/// Strategy (no disk I/O):

View file

@ -161,11 +161,8 @@ fn handle_zip_archive_streaming(
Ok(CompressedContent::ArchiveFiles(entries_on_disk))
}
fn handle_asar_archive_in_memory(
buffer: Vec<u8>,
archive_path: &Path,
) -> Result<CompressedContent> {
match AsarReader::new(&buffer, None) {
fn handle_asar_archive_in_memory(buffer: &[u8], archive_path: &Path) -> Result<CompressedContent> {
match AsarReader::new(buffer, None) {
Ok(reader) => {
let mut contents = Vec::new();
for (path_in_asar, file) in reader.files() {
@ -200,7 +197,7 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result<CompressedCon
match ext {
"asar" => {
let mmap = unsafe { Mmap::map(&file)? };
return handle_asar_archive_in_memory(mmap.to_vec(), path);
return handle_asar_archive_in_memory(&mmap, path);
}
"tar" => {
if let Some(base) = base_dir {

View file

@ -18,6 +18,7 @@ use crate::{
matcher::Match,
origin::{Origin, OriginSet},
rules::rule::Rule,
util::intern,
};
// share with Arc so every blob/origin is materialised once
@ -151,7 +152,7 @@ impl FindingsStore {
.captures
.get(1)
.or_else(|| m.groups.captures.get(0))
.map_or("", |c| c.value.as_ref());
.map_or("", |c| c.value);
let origin_kind = match origin.first() {
Origin::GitRepo(_) => "git",
@ -160,7 +161,7 @@ impl FindingsStore {
};
let key = xxh3_64(
format!("{}|{}|{}", m.rule_text_id.to_uppercase(), origin_kind, snippet)
format!("{}|{}|{}", m.rule.id().to_uppercase(), origin_kind, snippet)
.as_bytes(),
);
@ -280,7 +281,7 @@ impl FindingsStore {
pub fn get_summary(&self) -> FxHashMap<&'static str, usize> {
self.matches.iter().fold(FxHashMap::default(), |mut acc, msg| {
let (_, _, m) = &**msg;
*acc.entry(m.rule_name).or_insert(0) += 1; // borrow, no alloc
*acc.entry(intern(m.rule.name())).or_insert(0) += 1;
acc
})
}
@ -342,13 +343,13 @@ impl FindingsStore {
self.matches.iter().map(|msg| {
let (_, _, match_item) = &**msg;
finding_data::FindingMetadata {
rule_name: match_item.rule_name.to_string(),
rule_name: match_item.rule.name().to_string(),
num_matches: 1,
comment: None,
visible: match_item.visible,
finding_id: match_item.finding_id(),
rule_finding_fingerprint: match_item.rule_finding_fingerprint.to_string(),
rule_text_id: match_item.rule_text_id.to_string(),
rule_finding_fingerprint: match_item.rule.finding_sha1_fingerprint().to_string(),
rule_text_id: match_item.rule.id().to_string(),
}
})
}
@ -362,7 +363,7 @@ impl FindingsStore {
.iter()
.filter(|msg| {
let (_, _, match_item) = &***msg;
match_item.rule_name == metadata.rule_name
match_item.rule.name() == metadata.rule_name
})
.map(|msg| {
let (origin, blob_metadata, match_item) = &**msg;
@ -373,7 +374,7 @@ impl FindingsStore {
match_id: MatchIdInt::from_str(&match_item.finding_id())?,
match_comment: None,
visible: match_item.visible,
match_confidence: match_item.rule_confidence,
match_confidence: match_item.rule.confidence(),
validation_response_body: match_item.validation_response_body.clone(),
validation_response_status: match_item.validation_response_status,
validation_success: match_item.validation_success,

View file

@ -1,4 +1,5 @@
use core::ops::Range;
use std::cell::RefCell;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
@ -87,25 +88,35 @@ impl std::fmt::Display for SourceSpan {
}
/// Records newline byte offsets to map offsets to (line, column).
pub struct LocationMapping {
newline_offsets: Vec<usize>,
pub struct LocationMapping<'a> {
bytes: &'a [u8],
newline_offsets: RefCell<Vec<usize>>,
}
impl LocationMapping {
/// Scan once for all `\n` positions.
pub fn new(input: &[u8]) -> Self {
let newline_offsets =
input.iter().enumerate().filter_map(|(i, &b)| (b == b'\n').then_some(i)).collect();
LocationMapping { newline_offsets }
impl<'a> LocationMapping<'a> {
/// Create a new mapping without pre-scanning the entire input.
pub fn new(input: &'a [u8]) -> Self {
LocationMapping { bytes: input, newline_offsets: RefCell::new(Vec::new()) }
}
/// Map a byte offset to a `SourcePoint`.
pub fn get_source_point(&self, offset: usize) -> SourcePoint {
let line = match self.newline_offsets.binary_search(&offset) {
Ok(idx) => idx + 2, // exact newline -- next line
/// Extends the cached newline offsets so that every `\n` located at or
/// before `offset` has been recorded.
///
/// Scanning resumes just past the last newline already cached, so bytes
/// before that point are never revisited. Offsets beyond the end of the
/// input are clamped to the input length.
fn ensure_offsets_up_to(&self, offset: usize) {
    let mut offsets = self.newline_offsets.borrow_mut();
    let start = offsets.last().map_or(0, |&last| last + 1);
    if offset < start {
        // Everything up to `offset` is already covered by the cache.
        return;
    }
    // The range must include `offset` itself: the lookup treats "offset is
    // exactly a newline" as its own case, and leaving that newline out of
    // the cache would make the answer depend on which offsets were queried
    // earlier.
    let end = offset.saturating_add(1).min(self.bytes.len());
    offsets.extend(memchr::memchr_iter(b'\n', &self.bytes[start..end]).map(|nl| start + nl));
}
fn source_point_from_offsets(offsets: &[usize], offset: usize) -> SourcePoint {
let line = match offsets.binary_search(&offset) {
Ok(idx) => idx + 2,
Err(idx) => idx + 1,
};
let column = if let Some(&last) = self.newline_offsets.get(line.saturating_sub(2)) {
let column = if let Some(&last) = offsets.get(line.saturating_sub(2)) {
offset.saturating_sub(last + 1)
} else {
offset
@ -113,10 +124,19 @@ impl LocationMapping {
SourcePoint { line, column }
}
/// Translates a byte offset into its `SourcePoint` (line and column).
///
/// The newline cache is first extended as far as `offset` requires, then the
/// point is resolved against the cached offsets.
pub fn get_source_point(&self, offset: usize) -> SourcePoint {
    self.ensure_offsets_up_to(offset);
    Self::source_point_from_offsets(&self.newline_offsets.borrow(), offset)
}
/// Map an `OffsetSpan` to a `SourceSpan` (closed interval).
pub fn get_source_span(&self, span: &OffsetSpan) -> SourceSpan {
let start = self.get_source_point(span.start);
let end = self.get_source_point(span.end.saturating_sub(1));
self.ensure_offsets_up_to(span.end.saturating_sub(1));
let offsets = self.newline_offsets.borrow();
let start = Self::source_point_from_offsets(&offsets, span.start);
let end = Self::source_point_from_offsets(&offsets, span.end.saturating_sub(1));
SourceSpan { start, end }
}
}

View file

@ -10,7 +10,6 @@ use anyhow::Result;
use base64::{engine::general_purpose, Engine};
use bstr::BString;
use http::StatusCode;
use lazy_static::lazy_static;
use regex::bytes::Regex;
use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
use schemars::{
@ -27,7 +26,7 @@ use xxhash_rust::xxh3::xxh3_64;
use crate::{
blob::{Blob, BlobId, BlobIdMap},
entropy::calculate_shannon_entropy,
location::{Location, LocationMapping, OffsetSpan},
location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
origin::OriginSet,
parser,
parser::{Checker, Language},
@ -40,6 +39,10 @@ use crate::{
util::{intern, redact_value},
};
const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
// -------------------------------------------------------------------------------------------------
// RawMatch
// -------------------------------------------------------------------------------------------------
@ -255,17 +258,28 @@ impl<'a> Matcher<'a> {
self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64));
self.user_data.input_len = input.len() as u64;
// self.vs_scanner.scan(input, |rid, from, to, _flags| {
self.scanner_pool.with(|scanner| {
scanner.scan(input, |rule_id, from, to, _flags| {
self.user_data.raw_matches_scratch.push(RawMatch {
rule_id,
start_idx: from,
end_idx: to,
});
vectorscan_rs::Scan::Continue
})
})?;
let mut offset: usize = 0;
while offset < input.len() {
let end = (offset + MAX_CHUNK_SIZE).min(input.len());
let slice = &input[offset..end];
let base = offset as u64;
self.scanner_pool.with(|scanner| {
scanner.scan(slice, |rule_id, from, to, _flags| {
self.user_data.raw_matches_scratch.push(RawMatch {
rule_id,
start_idx: from + base,
end_idx: to + base,
});
vectorscan_rs::Scan::Continue
})
})?;
if end == input.len() {
break;
}
offset = end.saturating_sub(CHUNK_OVERLAP);
}
Ok(())
}
@ -288,17 +302,6 @@ impl<'a> Matcher<'a> {
self.local_stats.blobs_scanned += 1;
self.local_stats.bytes_scanned += blob.bytes().len() as u64;
// Check if blob was already seen and respect no_dedup flag
if !no_dedup {
if let Some(had_matches) = self.seen_blobs.get(&blob.id) {
return Ok(if had_matches {
ScanResult::SeenWithMatches
} else {
ScanResult::SeenSansMatches
});
}
}
// Extract filename from origin
let filename = origin
.first()
@ -313,24 +316,19 @@ impl<'a> Matcher<'a> {
// Opportunistically look for standalone Base64 blobs. If neither
// the raw scan nor this check yields anything, we can return early
// before doing any heavier work.
let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) };
let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
Vec::new()
} else {
get_base64_strings(blob.bytes())
};
if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() {
// Only record in seen_blobs if deduplication is enabled
if !no_dedup {
return Ok(match self.seen_blobs.insert(blob.id, false) {
None => ScanResult::New(Vec::new()),
Some(true) => ScanResult::SeenWithMatches,
Some(false) => ScanResult::SeenSansMatches,
});
} else {
return Ok(ScanResult::New(Vec::new()));
}
return Ok(ScanResult::New(Vec::new()));
}
let rules_db = self.rules_db;
let mut seen_matches = FxHashSet::default();
let mut previous_matches = Vec::new();
let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
let tree_sitter_result = if self.user_data.raw_matches_scratch.is_empty() {
None
} else {
@ -363,27 +361,19 @@ impl<'a> Matcher<'a> {
})
.collect::<Vec<_>>()
});
let mut previous_raw_matches: Vec<(usize, OffsetSpan)> = Vec::new();
let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
for &RawMatch { rule_id, start_idx, end_idx } in
self.user_data.raw_matches_scratch.iter().rev()
{
let rule_id_usize: usize = rule_id as usize;
// let rule = &rules_db.rules[rule_id_usize];
let rule = Arc::clone(&rules_db.rules[rule_id_usize]);
let re = &rules_db.anchored_regexes[rule_id_usize];
let start_idx_usize = start_idx as usize;
let end_idx_usize = end_idx as usize;
let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
// Skip if fully contained in a previous match
if previous_raw_matches.iter().any(|(prev_id, prev_span): &(usize, OffsetSpan)| {
*prev_id == rule_id_usize
&& (prev_span.fully_contains(&current_span)
|| current_span.fully_contains(prev_span))
}) {
if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
continue;
}
let matching_input_offset_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
previous_raw_matches.push((rule_id_usize, matching_input_offset_span));
filter_match(
blob,
rule,
@ -472,9 +462,15 @@ impl<'a> Matcher<'a> {
}
}
// Finalize
// Only record in seen_blobs if deduplication is enabled
if !no_dedup {
self.seen_blobs.insert(blob.id, !matches.is_empty());
if !no_dedup && !matches.is_empty() {
let blob_id = blob.id();
if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
return Ok(if had_matches {
ScanResult::SeenWithMatches
} else {
ScanResult::SeenSansMatches
});
}
}
// --- opportunistic capacity cap ---------------------------------
@ -501,6 +497,39 @@ fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -
end.hash(&mut hasher);
hasher.finish()
}
/// Inserts `span` into `spans`, a list kept ordered by `start` in which no
/// span fully contains another.
///
/// Returns `false`, leaving `spans` untouched, when an existing span already
/// fully contains `span` (an identical span counts as containing it).
/// Otherwise every existing span that `span` fully contains is removed,
/// `span` is inserted at its ordered position, and `true` is returned.
///
/// NOTE(review): assumes `OffsetSpan::fully_contains` is plain interval
/// containment on `start`/`end`; it is defined outside this block.
#[inline]
fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
    // Index of the first stored span whose start is not before `span.start`.
    let lo = spans.partition_point(|s| s.start < span.start);

    // Because stored spans never nest, their ends grow along with their
    // starts. Only the nearest span starting earlier, or a span sharing the
    // same start, can therefore contain `span`.
    if lo > 0 && spans[lo - 1].fully_contains(&span) {
        return false;
    }
    if lo < spans.len() && spans[lo].fully_contains(&span) {
        return false;
    }

    // Spans swallowed by `span` form one contiguous run beginning at `lo`.
    // The whole run has to go: leaving any of it behind would break the
    // ordering the neighbour checks above depend on, letting later spans
    // that `span` covers slip through.
    let swallowed = spans[lo..].iter().take_while(|s| span.fully_contains(*s)).count();
    spans.drain(lo..lo + swallowed);
    spans.insert(lo, span);
    true
}
/// Records `span` under `rule_id`, creating that rule's span list on first
/// use, and returns whatever `insert_span` reports for the insertion.
#[inline]
fn record_match(
    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
    rule_id: usize,
    span: OffsetSpan,
) -> bool {
    let spans_for_rule = map.entry(rule_id).or_default();
    insert_span(spans_for_rule, span)
}
fn filter_match<'b>(
blob: &'b Blob,
// rule: &'b Rule,
@ -509,7 +538,7 @@ fn filter_match<'b>(
start: usize,
end: usize,
matches: &mut Vec<BlobMatch<'b>>,
previous_matches: &mut Vec<(usize, OffsetSpan)>,
previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
rule_id: usize,
seen_matches: &mut FxHashSet<u64>,
_origin: &OriginSet,
@ -558,11 +587,7 @@ fn filter_match<'b>(
if !seen_matches.insert(match_key) {
continue;
}
if previous_matches.iter().any(|(prev_rule_id, prev_loc)| {
*prev_rule_id == rule_id
&& (prev_loc.fully_contains(&matching_input_offset_span)
|| matching_input_offset_span.fully_contains(prev_loc))
}) {
if !record_match(previous_matches, rule_id, matching_input_offset_span) {
continue;
}
let only_matching_input =
@ -571,7 +596,7 @@ fn filter_match<'b>(
SerializableCaptures::from_captures(&captures, byte_slice.as_ref(), re, redact);
matches.push(BlobMatch {
rule: Arc::clone(&rule),
blob_id: &blob.id,
blob_id: blob.id_ref(),
matching_input: only_matching_input,
matching_input_offset_span,
captures: groups,
@ -581,7 +606,6 @@ fn filter_match<'b>(
calculated_entropy,
is_base64,
});
previous_matches.push((rule_id, matching_input_offset_span));
}
if let Some(t) = timer.take() {
let new_count = (matches.len() - initial_len) as u64;
@ -682,20 +706,20 @@ impl JsonSchema for Groups {
// pub end: usize, // End position of the match
// pub value: String, // The actual captured value
// }
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCapture {
pub name: Option<String>,
pub match_number: i32,
pub start: usize,
pub end: usize,
// Instead of storing an owned String, store a borrowed or interned value.
// Here we use Cow to allow either borrowing or owning as needed.
pub value: std::borrow::Cow<'static, str>,
/// Interned value of the capture.
pub value: &'static str,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCaptures {
pub captures: Vec<SerializableCapture>, // All captures (named and unnamed)
#[schemars(with = "Vec<SerializableCapture>")]
pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
}
impl SerializableCaptures {
pub fn from_captures(
@ -704,7 +728,7 @@ impl SerializableCaptures {
re: &Regex,
redact: bool,
) -> Self {
let mut serialized_captures = Vec::new();
let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
// Process named captures
for name in re.capture_names().flatten() {
if let Some(capture) = captures.name(name) {
@ -718,7 +742,7 @@ impl SerializableCaptures {
match_number: -1,
start: capture.start(),
end: capture.end(),
value: value.into(),
value: intern(&value),
});
}
}
@ -735,7 +759,7 @@ impl SerializableCaptures {
match_number: i32::try_from(i).unwrap_or(0),
start: capture.start(),
end: capture.end(),
value: value.into(),
value: intern(&value),
});
}
}
@ -764,16 +788,9 @@ pub struct Match {
pub finding_fingerprint: u64,
/// The rule that produced this match
pub rule_finding_fingerprint: &'static str,
/// The text identifier of the rule that produced this match
pub rule_text_id: &'static str,
/// The name of the rule that produced this match
pub rule_name: &'static str,
/// The confidence property of the rule that produced this match
pub rule_confidence: crate::rules::rule::Confidence,
#[serde(skip_serializing)]
#[schemars(skip)]
pub rule: Arc<Rule>,
/// Validation Body
pub validation_response_body: String,
@ -794,7 +811,7 @@ pub struct Match {
impl Match {
#[inline]
pub fn convert_owned_blobmatch_to_match<'a>(
loc_mapping: &'a LocationMapping,
loc_mapping: Option<&'a LocationMapping<'a>>,
owned_blob_match: &'a OwnedBlobMatch,
origin_type: &'a str,
) -> Self {
@ -812,9 +829,11 @@ impl Match {
// The fingerprint will be based on the content of the secret.
let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
let source_span = loc_mapping.get_source_span(&offset_span);
let rule_finding_fingerprint = owned_blob_match.rule.finding_sha1_fingerprint().to_owned();
let source_span =
loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
start: SourcePoint { line: 0, column: 0 },
end: SourcePoint { line: 0, column: 0 },
});
let offset_start: u64 =
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
@ -828,10 +847,7 @@ impl Match {
// matching_snippet
Match {
rule_finding_fingerprint: intern(&rule_finding_fingerprint),
rule_name: intern(owned_blob_match.rule.name()),
rule_confidence: owned_blob_match.rule.confidence(),
rule_text_id: intern(owned_blob_match.rule.id()),
rule: owned_blob_match.rule.clone(),
visible: owned_blob_match.rule.visible().to_owned(),
location: Location { offset_span, source_span: source_span.clone() },
groups: owned_blob_match.captures.clone(),
@ -852,7 +868,7 @@ impl Match {
pub fn finding_id(&self) -> String {
let mut h = Sha1::new();
write!(&mut h, "{}\0", self.rule_finding_fingerprint)
write!(&mut h, "{}\0", self.rule.finding_sha1_fingerprint())
.expect("should be able to write to memory");
serde_json::to_writer(&mut h, &self.groups)
.expect("should be able to serialize groups as JSON");
@ -892,33 +908,48 @@ pub struct DecodedData {
pub pos_start: usize,
pub pos_end: usize,
}
/// Reports whether `b` is an ASCII letter, an ASCII digit, `+` or `/`.
///
/// The padding character `=` is deliberately not accepted here.
#[inline]
fn is_base64_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'+' || b == b'/'
}
pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
lazy_static! {
// Require a reasonably long run of valid Base64 characters to reduce
// noise. 32 bytes corresponds to 24 decoded bytes.
static ref RE_BASE64: Regex = Regex::new(r"[A-Za-z0-9+/]{32,}={0,2}").unwrap();
}
let mut results = Vec::new();
for m in RE_BASE64.find_iter(input) {
let base64_string = m.as_bytes();
// Skip candidates whose length isn't a multiple of four they cannot
// be valid Base64.
if base64_string.len() % 4 != 0 {
continue;
let mut i = 0;
while i < input.len() {
while i < input.len() && !is_base64_byte(input[i]) {
i += 1;
}
if let Ok(decoded) = general_purpose::STANDARD.decode(base64_string) {
if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
if decoded_str.is_ascii() {
results.push(DecodedData {
original: String::from_utf8_lossy(base64_string).into_owned(),
decoded: decoded_str.to_string(),
pos_start: m.start(),
pos_end: m.end(),
});
let start = i;
while i < input.len() && is_base64_byte(input[i]) {
i += 1;
}
let mut eq_count = 0;
while i < input.len() && input[i] == b'=' && eq_count < 2 {
i += 1;
eq_count += 1;
}
let end = i;
let len = end - start;
if len >= 32 && len % 4 == 0 {
let base64_slice = &input[start..end];
if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) {
if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
if decoded_str.is_ascii() {
results.push(DecodedData {
original: String::from_utf8_lossy(base64_slice).into_owned(),
decoded: decoded_str.to_string(),
pos_start: start,
pos_end: end,
});
}
}
}
}
}
results
}

View file

@ -216,7 +216,7 @@ impl DetailsReporter {
m: match_item.clone(),
comment: None,
visible: match_item.visible,
match_confidence: match_item.rule_confidence,
match_confidence: match_item.rule.confidence(),
validation_response_body: match_item.validation_response_body.clone(),
validation_response_status: match_item.validation_response_status,
validation_success: match_item.validation_success,
@ -366,13 +366,13 @@ impl DetailsReporter {
FindingReporterRecord {
rule: RuleMetadata {
name: rm.m.rule_name.to_string(),
id: rm.m.rule_text_id.to_string(),
name: rm.m.rule.name().to_string(),
id: rm.m.rule.id().to_string(),
},
finding: FindingRecordData {
snippet,
fingerprint: rm.m.finding_fingerprint.to_string(),
confidence: rm.match_confidence.to_string(),
confidence: rm.m.rule.confidence().to_string(),
entropy: format!("{:.2}", rm.m.calculated_entropy),
validation: ValidationInfo { status: validation_status, response: response_body },
language: rm

View file

@ -35,6 +35,7 @@ mod tests {
use crate::cli::commands::github::GitHistoryMode;
use crate::cli::commands::rules::RuleSpecifierArgs;
use crate::matcher::{SerializableCapture, SerializableCaptures};
use crate::rules::rule::{Confidence, Rule, RuleSyntax};
use crate::util::intern;
use crate::{
blob::BlobId,
@ -49,6 +50,7 @@ mod tests {
origin::Origin,
reporter::styles::Styles,
};
use smallvec::smallvec;
use std::{
io::Cursor,
path::PathBuf,
@ -132,12 +134,21 @@ mod tests {
}
}
fn create_mock_match(
rule_name: &str,
rule_text_id: &str,
rule_finding_fingerprint: &str,
validation_success: bool,
) -> Match {
fn create_mock_match(rule_name: &str, rule_text_id: &str, validation_success: bool) -> Match {
let syntax = RuleSyntax {
name: rule_name.to_string(),
id: rule_text_id.to_string(),
pattern: "dummy".to_string(),
min_entropy: 0.0,
confidence: Confidence::Medium,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
depends_on_rule: vec![],
};
let rule = Arc::new(Rule::new(syntax));
Match {
location: Location {
offset_span: OffsetSpan { start: 10, end: 20 },
@ -147,20 +158,17 @@ mod tests {
},
},
groups: SerializableCaptures {
captures: vec![SerializableCapture {
captures: smallvec![SerializableCapture {
name: Some("token".to_string()),
match_number: 1,
start: 10,
end: 20,
value: "mock_token".into(),
value: intern("mock_token"),
}],
},
blob_id: BlobId::new(b"mock_blob"),
finding_fingerprint: 0123,
rule_finding_fingerprint: intern(rule_finding_fingerprint),
rule_text_id: intern(rule_text_id),
rule_name: intern(rule_name),
rule_confidence: Confidence::Medium,
rule,
validation_response_body: "validation response".to_string(),
validation_response_status: 200,
validation_success,
@ -177,7 +185,6 @@ mod tests {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
};
let dedup = true;
@ -204,15 +211,13 @@ mod tests {
#[test]
fn test_json_format() -> Result<()> {
let mock_match =
create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
let mock_match = create_mock_match("MockRule", "mock_rule_1", true);
let matches = vec![ReportMatch {
origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]),
blob_metadata: BlobMetadata {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
},
m: mock_match,
@ -238,12 +243,7 @@ mod tests {
fn test_validation_status_in_json() -> Result<()> {
let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")];
for (validation_success, expected_status) in test_cases {
let mock_match = create_mock_match(
"MockRule",
"mock_rule_1",
"mock_finding_fingerprint",
validation_success,
);
let mock_match = create_mock_match("MockRule", "mock_rule_1", validation_success);
let matches = vec![ReportMatch {
origin: OriginSet::new(
Origin::from_file(PathBuf::from("/mock/path/file.rs")),
@ -253,7 +253,6 @@ mod tests {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
},
m: mock_match,

View file

@ -175,9 +175,11 @@ impl Docker {
pb.set_length(layer_paths.len() as u64);
for p in layer_paths {
let mut data = Vec::new();
File::open(&p)?.read_to_end(&mut data)?;
let digest = format!("{:x}", Sha256::digest(&data));
let mut file = File::open(&p)?;
let mut hasher = Sha256::new();
std::io::copy(&mut file, &mut hasher)?;
let digest = format!("{:x}", hasher.finalize());
let new_path = out_dir.join(format!("layer_{digest}.tar"));
std::fs::rename(&p, &new_path)?;
// extract layer contents so inner filenames appear in scan results

View file

@ -172,10 +172,15 @@ pub fn enumerate_filesystem_inputs(
};
if should_skip {
progress.suspend(|| {
let path = origin
.first()
.blob_path()
.map(|p| p.display().to_string())
.unwrap_or_else(|| blob.temp_id().to_string());
if is_archive {
debug!("Skipping archive: {}", blob.id);
debug!("Skipping archive: {path}");
} else {
debug!("Skipping binary blob: {}", blob.id);
debug!("Skipping binary blob: {path}");
}
});
return Ok(());

View file

@ -12,6 +12,8 @@ use crate::{
Path,
};
const LOCATION_LIMIT_BYTES: usize = 256 * 1024 * 1024;
/// A matcher along with parameters that remain constant during a single
/// `scan` run
pub struct BlobProcessor<'a> {
@ -27,8 +29,7 @@ impl<'a> BlobProcessor<'a> {
redact: bool,
no_base64: bool,
) -> Result<Option<DatastoreMessage>> {
let blob_id = blob.id.hex();
let _span = debug_span!("matcher", blob_id).entered();
let _span = debug_span!("matcher", temp_id = blob.temp_id()).entered();
let t1 = Instant::now();
let res = self.matcher.scan_blob(&blob, &origin, None, redact, no_dedup, no_base64)?;
let scan_us = t1.elapsed().as_micros();
@ -43,10 +44,9 @@ impl<'a> BlobProcessor<'a> {
ScanResult::SeenWithMatches => {
trace!("({scan_us}us) blob already scanned with matches");
let metadata = BlobMetadata {
id: blob.id,
id: blob.id(),
num_bytes: blob.len(),
mime_essence: None,
charset: None,
language: None,
};
Ok(Some((origin, metadata, Vec::new())))
@ -63,10 +63,9 @@ impl<'a> BlobProcessor<'a> {
}
let md = MetadataResult::from_blob_and_origin(&blob, &origin);
let metadata = BlobMetadata {
id: blob.id,
id: blob.id(),
num_bytes: blob.len(),
mime_essence: md.mime_essence,
charset: md.charset,
language: md.language,
};
@ -84,12 +83,17 @@ impl<'a> BlobProcessor<'a> {
}
}
let loc_mapping = LocationMapping::new(&blob.bytes());
let bytes = blob.bytes();
let loc_mapping = if bytes.len() <= LOCATION_LIMIT_BYTES {
Some(LocationMapping::new(bytes))
} else {
None
};
let converted_matches: Vec<(Option<f64>, Match)> = matches
.into_iter()
.map(|m| {
let converted_match = Match::convert_owned_blobmatch_to_match(
&loc_mapping,
loc_mapping.as_ref(),
&OwnedBlobMatch::from_blob_match(m),
origin_type,
);
@ -106,7 +110,6 @@ impl<'a> BlobProcessor<'a> {
struct MetadataResult {
mime_essence: Option<String>,
language: Option<String>,
charset: Option<String>,
}
impl MetadataResult {
fn from_blob_and_origin(blob: &Blob, origin: &OriginSet) -> MetadataResult {
@ -115,7 +118,6 @@ impl MetadataResult {
let mime_essence = Some(tree_magic_mini::from_u8(bytes).to_string());
let inspector = ContentInspector::default();
let language = blob_path.and_then(|p| inspector.guess_language(p, bytes));
let charset = inspector.guess_charset(bytes);
MetadataResult { mime_essence, language, charset }
MetadataResult { mime_essence, language }
}
}

View file

@ -13,7 +13,7 @@ use futures::{stream, StreamExt};
use indicatif::{ProgressBar, ProgressStyle};
use liquid::Parser;
use reqwest::{Client, StatusCode};
use rustc_hash::{FxHashMap, FxHashSet};
use rustc_hash::FxHashMap;
use tokio::{sync::Notify, time::timeout};
use crate::{
@ -21,7 +21,6 @@ use crate::{
findings_store::{FindingsStore, FindingsStoreMessage},
location::OffsetSpan,
matcher::{Match, OwnedBlobMatch},
rules::rule,
validation::{collect_variables_and_dependencies, validate_single_match, CachedResponse},
};
@ -40,7 +39,7 @@ pub async fn run_secret_validation(
let fail_count = Arc::new(AtomicUsize::new(0));
// ── 2. Fetch rules + matches ────────────────────────────────────────────
let (all_rules, all_matches_by_blob) = {
let (_all_rules, all_matches_by_blob) = {
let ds = datastore.lock().unwrap();
let rules = ds.get_rules()?;
let mut map: FxHashMap<BlobId, Vec<Arc<FindingsStoreMessage>>> = FxHashMap::default();
@ -51,16 +50,10 @@ pub async fn run_secret_validation(
};
// ── 3. Partition blobs ──────────────────────────────────────────────────
let rules_with_deps: FxHashSet<&str> = all_rules
.iter()
.filter(|r| !r.syntax().depends_on_rule.is_empty())
.map(|r| r.id())
.collect();
let mut simple_matches = Vec::new();
let mut dependent_blobs = FxHashMap::default(); // blob_id -- Vec<Arc<…>>
for (blob_id, matches) in all_matches_by_blob {
if matches.iter().any(|m| rules_with_deps.contains(m.2.rule_text_id)) {
if matches.iter().any(|m| !m.2.rule.syntax().depends_on_rule.is_empty()) {
dependent_blobs.insert(blob_id, matches);
} else {
simple_matches.extend(matches);
@ -80,11 +73,8 @@ pub async fn run_secret_validation(
.captures
.get(1)
.or_else(|| arc_msg.2.groups.captures.get(0))
.map_or("", |c| c.value.as_ref());
groups
.entry(format!("{}|{}", arc_msg.2.rule_text_id, secret))
.or_default()
.push(arc_msg);
.map_or("", |c| c.value);
groups.entry(format!("{}|{}", arc_msg.2.rule.id(), secret)).or_default().push(arc_msg);
}
let validation_results = DashMap::<String, CachedResponse>::new();
@ -109,7 +99,6 @@ pub async fn run_secret_validation(
let client = client.clone();
let cache_glob = cache.clone();
let val_res = &validation_results;
let rules = &all_rules;
let success = success_count.clone();
let fail = fail_count.clone();
// *** FIX: Clone the progress bar for each concurrent task ***
@ -122,8 +111,8 @@ pub async fn run_secret_validation(
.captures
.get(1)
.or_else(|| rep_arc.2.groups.captures.get(0))
.map_or("", |c| c.value.as_ref());
let key = format!("{}|{}", rep_arc.2.rule_text_id, secret);
.map_or("", |c| c.value);
let key = format!("{}|{}", rep_arc.2.rule.id(), secret);
match val_res.entry(key.clone()) {
dashmap::mapref::entry::Entry::Occupied(_) => return,
@ -138,8 +127,10 @@ pub async fn run_secret_validation(
}
}
let rule = find_rule_for_match(rules, rep_arc.2.rule_text_id).unwrap();
let mut om = OwnedBlobMatch::convert_match_to_owned_blobmatch(&rep_arc.2, rule);
let mut om = OwnedBlobMatch::convert_match_to_owned_blobmatch(
&rep_arc.2,
rep_arc.2.rule.clone(),
);
validate_single(
&mut om,
@ -211,7 +202,6 @@ pub async fn run_secret_validation(
let val_cache = Arc::new(DashMap::<String, CachedResponse>::new());
let in_flight = Arc::new(DashMap::<String, ()>::new());
let rules_ref = Arc::new(all_rules.clone());
for chunk in blob_ids.chunks(chunk_size) {
let tasks: Vec<_> = chunk
@ -225,15 +215,15 @@ pub async fn run_secret_validation(
let success = success_count.clone();
let fail = fail_count.clone();
let cache_glob = cache.clone();
let rules = rules_ref.clone();
async move {
let owned = matches_for_blob
.iter()
.map(|arc_msg| {
let rule = find_rule_for_match(&rules, arc_msg.2.rule_text_id)
.expect("rule");
OwnedBlobMatch::convert_match_to_owned_blobmatch(&arc_msg.2, rule)
OwnedBlobMatch::convert_match_to_owned_blobmatch(
&arc_msg.2,
arc_msg.2.rule.clone(),
)
})
.collect::<Vec<_>>();
@ -338,21 +328,6 @@ pub async fn run_secret_validation(
Ok(())
}
/// Looks up the rule whose syntax id equals `rule_text_id`.
///
/// Yields a cloned `Arc` handle to the first rule in `all_rules` with a matching id.
/// When nothing matches, a warning is printed to stderr and `None` is returned, leaving
/// it to the caller to decide how to proceed (e.g., skip processing).
fn find_rule_for_match(
    all_rules: &[Arc<rule::Rule>],
    rule_text_id: &str,
) -> Option<Arc<rule::Rule>> {
    let found = all_rules.iter().find(|candidate| candidate.syntax().id == rule_text_id);
    if found.is_none() {
        eprintln!("Warning: no rule found with id '{}'. Skipping.", rule_text_id);
    }
    found.cloned()
}
// ---------------------------------------------------
// The core validation logic, used in an async pipeline
// ---------------------------------------------------

View file

@ -129,16 +129,29 @@ pub fn check_for_update(global_args: &GlobalArgs, base_url: Option<&str>) -> Opt
.apply_to(&format!("Updated to version {}", status.version()))
),
Err(e) => match e {
UpdError::Io(ref io_err) if io_err.kind() == ErrorKind::PermissionDenied => {
warn!(
"{}",
styles.style_finding_active_heading.apply_to(
"Cannot replace the current binary - permission denied.\n\
If you installed via a package manager, run its upgrade command.\n\
Otherwise reinstall to a user-writable directory or re-run with sudo."
)
);
}
UpdError::Io(ref io_err) => match io_err.kind() {
ErrorKind::PermissionDenied => {
warn!(
"{}",
styles.style_finding_active_heading.apply_to(
"Cannot replace the current binary - permission denied.\n\
If you installed via a package manager, run its upgrade command.\n\
Otherwise reinstall to a user-writable directory or re-run with sudo."
)
);
}
ErrorKind::NotFound => {
warn!(
"{}",
styles.style_finding_active_heading.apply_to(
"Cannot replace the current binary - file not found.\n\
If you installed via a package manager, run its upgrade command.\n\
Otherwise reinstall to a user-writable directory."
)
);
}
_ => error!("Failed to update: {e}"),
},
_ => error!("Failed to update: {e}"),
},
}

View file

@ -910,6 +910,7 @@ mod tests {
use crossbeam_skiplist::SkipMap;
use http::StatusCode;
use rustc_hash::FxHashMap;
use smallvec::smallvec;
use crate::{
blob::BlobId,
@ -920,6 +921,7 @@ mod tests {
rule::{Confidence, Rule},
Rules,
},
util::intern,
validation::{validate_single_match, Cache},
};
#[tokio::test]
@ -1016,12 +1018,12 @@ rules:
// matching_input: token.as_bytes().to_vec(),
matching_input_offset_span: OffsetSpan { start: 0, end: token.len() },
captures: SerializableCaptures {
captures: vec![SerializableCapture {
captures: smallvec![SerializableCapture {
name: Some("TOKEN".to_string()),
match_number: -1,
start: 0,
end: token.len(),
value: token.into(),
value: intern(token),
}],
},
validation_response_body: String::new(),

File diff suppressed because it is too large Load diff

View file

@ -22,7 +22,7 @@ pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String,
.map(|cap| {
let name =
cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string());
(name, cap.value.clone().into_owned(), cap.start, cap.end)
(name, cap.value.to_string(), cap.start, cap.end)
})
.collect()
}
@ -76,16 +76,17 @@ mod tests {
use super::*;
use crate::matcher::{SerializableCapture, SerializableCaptures};
use pretty_assertions::assert_eq;
use smallvec::smallvec;
#[test]
fn single_unnamed_capture_is_returned() {
let captures = SerializableCaptures {
captures: vec![SerializableCapture {
captures: smallvec![SerializableCapture {
name: None,
match_number: 0,
start: 1,
end: 4,
value: "abc".into(),
value: "abc",
}],
};
let result = process_captures(&captures);
@ -95,20 +96,20 @@ mod tests {
#[test]
fn skips_whole_match_when_multiple() {
let captures = SerializableCaptures {
captures: vec![
captures: smallvec![
SerializableCapture {
name: None,
match_number: 0,
start: 0,
end: 5,
value: "abcde".into(),
value: "abcde",
},
SerializableCapture {
name: Some("foo".to_string()),
match_number: -1,
start: 1,
end: 4,
value: "bcd".into(),
value: "bcd",
},
],
};
@ -119,28 +120,22 @@ mod tests {
#[test]
fn includes_unnamed_groups_but_skips_whole_match() {
let captures = SerializableCaptures {
captures: vec![
captures: smallvec![
SerializableCapture {
name: None,
match_number: 0,
start: 0,
end: 6,
value: "aabbcc".into(),
value: "aabbcc",
},
SerializableCapture {
name: Some("foo".to_string()),
match_number: -1,
start: 0,
end: 2,
value: "aa".into(),
},
SerializableCapture {
name: None,
match_number: 1,
start: 4,
end: 6,
value: "cc".into(),
value: "aa",
},
SerializableCapture { name: None, match_number: 1, start: 4, end: 6, value: "cc" },
],
};
let result = process_captures(&captures);

View file

@ -14,11 +14,27 @@ use kingfisher::{
matcher::{Match, SerializableCapture, SerializableCaptures},
origin::{Origin, OriginSet},
reporter::{styles::Styles, DetailsReporter, ReportMatch},
rules::rule::Confidence,
rules::rule::{Confidence, Rule, RuleSyntax},
util::intern,
};
use smallvec::smallvec;
// ---- helpers -------------------------------------------------------------------------------
fn make_match(fp: u64) -> Match {
let syntax = RuleSyntax {
name: "Example Rule".to_string(),
id: "RULE.1".to_string(),
pattern: "dummy".to_string(),
min_entropy: 0.0,
confidence: Confidence::Medium,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
depends_on_rule: vec![],
};
let rule = Arc::new(Rule::new(syntax));
Match {
location: Location {
offset_span: OffsetSpan { start: 0, end: 10 },
@ -28,20 +44,17 @@ fn make_match(fp: u64) -> Match {
},
},
groups: SerializableCaptures {
captures: vec![SerializableCapture {
captures: smallvec![SerializableCapture {
name: None,
match_number: 0,
start: 0,
end: 10,
value: "dummy".into(),
value: intern("dummy"),
}],
},
blob_id: BlobId::new(b"dummy"),
finding_fingerprint: fp,
rule_finding_fingerprint: "structural.1".into(),
rule_text_id: "RULE.1".into(),
rule_name: "Example Rule".into(),
rule_confidence: Confidence::Medium,
rule,
validation_response_body: String::new(),
validation_response_status: 0,
validation_success: false,
@ -106,7 +119,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
id: BlobId::new(b"dummy"),
num_bytes: 10,
mime_essence: None,
charset: None,
language: None,
},
m: m1,
@ -123,7 +135,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
id: BlobId::new(b"dummy"),
num_bytes: 10,
mime_essence: None,
charset: None,
language: None,
},
m: m2,