Merge pull request #213 from mongodb/development

v1.77.0
This commit is contained in:
Mick Grove 2026-02-02 10:39:34 -08:00 committed by GitHub
commit 2f41d159e2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
360 changed files with 11759 additions and 4351 deletions

View file

@ -18,11 +18,25 @@ jobs:
runs-on: ubuntu-24.04-arm
steps:
- uses: actions/checkout@v4
# Free up disk space on Ubuntu runners
- name: Free Disk Space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo docker image prune --all --force
df -h
- uses: actions-rs/toolchain@v1
with:
toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
- uses: swatinem/rust-cache@v2
- name: Build (Makefile linux-arm64)
run: make ubuntu-arm64
- name: Run tests

View file

@ -27,6 +27,16 @@ jobs:
steps:
- uses: actions/checkout@v4
# Free up disk space on Ubuntu runners
- name: Free Disk Space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo docker image prune --all --force
df -h
- uses: actions-rs/toolchain@v1
with:
toolchain: ${{ env.RUST_TOOLCHAIN }}
@ -81,12 +91,24 @@ jobs:
steps:
- uses: actions/checkout@v4
# Free up disk space on Ubuntu runners
- name: Free Disk Space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo docker image prune --all --force
df -h
- uses: actions-rs/toolchain@v1
with:
toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
- uses: swatinem/rust-cache@v2
- name: Install packaging tools
run: cargo install cargo-deb cargo-generate-rpm

View file

@ -1,6 +1,14 @@
- id: kingfisher-auto
name: kingfisher (auto-install)
description: Automatically downloads and caches the Kingfisher binary, then scans staged changes. No manual installation required.
entry: scripts/kingfisher-pre-commit-auto.sh
language: script
pass_filenames: false
stages: [commit]
- id: kingfisher-docker
name: kingfisher (docker)
description: Run Kingfisher in Docker against staged changes at the repository root. No local install required.
description: Run Kingfisher in Docker against staged changes at the repository root. Requires Docker but no local install.
entry: ghcr.io/mongodb/kingfisher:latest
language: docker
args: ["scan", ".", "--staged", "--quiet", "--no-update-check"]

View file

@ -2,6 +2,18 @@
All notable changes to this project will be documented in this file.
## [v1.77.0]
- Added `kingfisher revoke` subcommand for revoking leaked credentials directly with the provider.
- Added optional `revocation` section to rules to support credential revocation (currently supporting AWS, GCP, GitHub, GitLab, Slack, and Buildkite).
- Added `kingfisher validate` subcommand to validate credentials without running a full scan.
- Added `validate_command` and `revoke_command` fields to scan output (pretty, JSON, JSONL, BSON, SARIF formats) showing the exact `kingfisher validate` or `kingfisher revoke` command to run for each finding. The `validate_command` is included for all findings with validation support; `revoke_command` is included only for active credentials with revocation support. These fields are omitted when `--redact` is used since they contain the secret value.
- Updated the HTML report viewer to display validate and revoke commands in the Finding Details panel with copy-to-clipboard functionality.
- Refactored project into multiple crates for better modularity and maintainability.
- Ensured more CLI arguments are global and available across all subcommands.
- Added `kingfisher-auto` pre-commit hook that automatically downloads and caches the appropriate binary for your platform (no Docker or manual installation required).
- Added Husky integration support with `install-husky.sh` helper script and documentation for Node.js projects.
- Added `kingfisher-pre-commit-auto.sh` and `kingfisher-pre-commit-auto.ps1` scripts for automatic binary download in Git hooks (Linux, macOS, Windows support).
## [v1.76.0]
- Fixed validation deduplication for rules with nested unnamed captures (e.g. `(?<REGEX>...(ABC|DEF)...)`) to use the primary capture for grouping, ensuring each unique match triggers a separate validation request.
- Added trace-level (`-vv`) logging for internal validation dedup keys and grouping to aid debugging.

View file

@ -1,3 +1,12 @@
[workspace]
members = [
".",
"crates/kingfisher-core",
"crates/kingfisher-rules",
"crates/kingfisher-scanner",
]
resolver = "2"
[workspace.package]
edition = "2021"
rust-version = "1.90"
@ -10,7 +19,7 @@ publish = false
[package]
name = "kingfisher"
version = "1.76.0"
version = "1.77.0"
description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
edition.workspace = true
rust-version.workspace = true
@ -40,6 +49,11 @@ assets = [
]
[dependencies]
# Library crates
kingfisher-core = { path = "crates/kingfisher-core" }
kingfisher-rules = { path = "crates/kingfisher-rules" }
kingfisher-scanner = { path = "crates/kingfisher-scanner" }
clap = { version = "4.5", features = [
"cargo",
"derive",

1570
README.md

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,48 @@
[package]
name = "kingfisher-core"
version = "0.1.0"
description = "Core types and traits for Kingfisher secret scanner"
edition.workspace = true
rust-version.workspace = true
license.workspace = true
authors.workspace = true
homepage.workspace = true
repository.workspace = true
publish.workspace = true
[dependencies]
# Serialization
serde = { version = "1.0", features = ["derive", "rc"] }
serde_json = "1.0"
schemars = "0.8"
# Error handling
anyhow = "1.0"
thiserror = "1.0"
# Hashing and crypto
sha1 = "0.10"
hex = "0.4"
# Memory management
memmap2 = "0.9"
once_cell = "1.21"
parking_lot = "0.12"
# Collections
smallvec = { version = "1", features = ["const_generics", "const_new", "union"] }
rustc-hash = "2.1"
dashmap = "6.1"
# Byte string handling
bstr = { version = "1.12", features = ["serde"] }
memchr = "2.7"
# Git types (minimal, for ObjectId and Time)
gix = { version = "0.73", default-features = false, features = ["serde"] }
# Console formatting
console = "0.15"
[dev-dependencies]
pretty_assertions = "1.4"

View file

@ -0,0 +1,495 @@
//! Blob representation for scannable content.
//!
//! A [`Blob`] represents content that can be scanned for secrets. It can be
//! created from:
//! - In-memory bytes ([`Blob::from_bytes`])
//! - A file path ([`Blob::from_file`])
//! - Borrowed data ([`Blob::from_borrowed`])
//!
//! Non-empty files are currently always memory-mapped (the threshold,
//! `LARGE_FILE_THRESHOLD`, is 0); only empty files are read into memory.
use std::{
convert::TryInto,
fs::File,
io::{Read, Write},
path::Path,
sync::{
atomic::{AtomicU64, Ordering},
Arc,
},
};
use bstr::{BString, ByteSlice};
use gix::ObjectId;
use once_cell::sync::OnceCell;
use parking_lot::Mutex;
use rustc_hash::FxHashMap;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use sha1::{Digest, Sha1};
use smallvec::SmallVec;
use crate::error::Result;
use crate::git_commit_metadata::CommitMetadata;
/// Threshold above which files are memory-mapped instead of read into memory.
const LARGE_FILE_THRESHOLD: u64 = 0; // Currently: always mmap
/// Global counter for temporary blob IDs.
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
/// Tracks where a blob was seen in git history.
///
/// Each value pairs the commit the blob appeared in with the path it had in
/// that commit. It serializes via `serde::Serialize`.
#[derive(Clone, Debug, serde::Serialize)]
pub struct BlobAppearance {
/// Metadata about the commit where this blob appeared (held behind an `Arc`).
pub commit_metadata: Arc<CommitMetadata>,
/// The path of the blob within the repository, stored as a byte string.
pub path: BString,
}
impl BlobAppearance {
/// Returns the path as a `&Path`, if it's valid UTF-8.
///
/// This simply delegates to `to_path` on the stored byte string and hands
/// back whatever result that conversion produces.
#[inline]
pub fn path(&self) -> std::result::Result<&Path, bstr::Utf8Error> {
self.path.to_path()
}
}
/// A set of [`BlobAppearance`] entries, optimized for the common case of a single appearance.
///
/// The `SmallVec` has an inline capacity of one element.
pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
/// Backing storage for the bytes of a [`Blob`].
pub enum BlobData<'a> {
    /// Bytes held in a buffer owned by the blob.
    Owned(Vec<u8>),
    /// Bytes exposed through a memory-mapped file.
    Mapped(memmap2::Mmap),
    /// Bytes borrowed from elsewhere (e.g., from a git pack file).
    Borrowed(&'a [u8]),
}

impl<'a> AsRef<[u8]> for BlobData<'a> {
    /// Views the stored bytes as a plain slice, whichever variant holds them.
    fn as_ref(&self) -> &[u8] {
        match self {
            Self::Owned(owned) => owned.as_slice(),
            Self::Mapped(mapped) => &mapped[..],
            Self::Borrowed(borrowed) => borrowed,
        }
    }
}

impl<'a> BlobData<'a> {
    /// Number of bytes stored.
    #[inline]
    pub fn len(&self) -> usize {
        let bytes: &[u8] = self.as_ref();
        bytes.len()
    }

    /// Whether there are zero bytes stored.
    #[inline]
    pub fn is_empty(&self) -> bool {
        let bytes: &[u8] = self.as_ref();
        bytes.is_empty()
    }
}
/// A scannable blob of content.
///
/// `Blob` is the primary type for representing content to be scanned. It lazily
/// computes a content-based ID (SHA-1) and supports multiple backing storage types.
///
/// The ID is cached in a `OnceCell`, so it is computed at most once, on the
/// first call to `id()` / `id_ref()`, unless it was supplied up front via
/// `Blob::new`.
///
/// # Examples
///
/// ```
/// use kingfisher_core::Blob;
///
/// // Create from bytes
/// let blob = Blob::from_bytes(b"my secret content".to_vec());
/// assert_eq!(blob.len(), 17);
///
/// // Create from file
/// // let blob = Blob::from_file("path/to/file.txt")?;
/// ```
pub struct Blob<'a> {
/// Lazily computed content-based ID.
id: OnceCell<BlobId>,
/// The underlying data (owned, memory-mapped, or borrowed).
data: BlobData<'a>,
/// Temporary ID assigned at creation (for debugging/tracking); taken from
/// the global `NEXT_ID` counter.
temp_id: u64,
}
impl Blob<'_> {
    /// Opens `path` and wraps its contents in a `Blob`.
    ///
    /// Files whose size exceeds `LARGE_FILE_THRESHOLD` bytes are memory-mapped;
    /// anything else is read into an owned buffer.
    ///
    /// # Errors
    ///
    /// Propagates any I/O error raised while opening, inspecting, mapping or
    /// reading the file.
    #[inline]
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let mut file = File::open(&path)?;
        let file_size = file.metadata()?.len();
        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);

        let data = if file_size > LARGE_FILE_THRESHOLD {
            // Map the file rather than copying its contents.
            BlobData::Mapped(unsafe { memmap2::Mmap::map(&file)? })
        } else {
            // Read the whole file into a buffer sized from its metadata.
            let mut bytes = Vec::with_capacity(file_size as usize);
            file.read_to_end(&mut bytes)?;
            BlobData::Owned(bytes)
        };

        Ok(Blob { id: OnceCell::new(), data, temp_id })
    }

    /// Wraps an owned byte vector; the ID is computed later, on demand.
    #[inline]
    pub fn from_bytes(bytes: Vec<u8>) -> Self {
        Blob {
            id: OnceCell::new(),
            data: BlobData::Owned(bytes),
            temp_id: NEXT_ID.fetch_add(1, Ordering::Relaxed),
        }
    }

    /// Wraps an owned byte vector together with an already-known ID.
    #[inline]
    pub fn new(id: BlobId, bytes: Vec<u8>) -> Self {
        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
        let id_cell = OnceCell::new();
        // The cell was just created, so storing the ID cannot be rejected.
        let _ = id_cell.set(id);
        Blob { id: id_cell, data: BlobData::Owned(bytes), temp_id }
    }

    /// Borrows the blob's content.
    #[inline]
    pub fn bytes(&self) -> &[u8] {
        self.data.as_ref()
    }

    /// Returns the content-based [`BlobId`] by value, computing it on first use.
    #[inline]
    pub fn id(&self) -> BlobId {
        *self.id_ref()
    }

    /// Returns the content-based [`BlobId`] by reference, computing it on first use.
    #[inline]
    pub fn id_ref(&self) -> &BlobId {
        self.id.get_or_init(|| BlobId::new(self.bytes()))
    }

    /// Returns the temporary ID handed out when this blob was created.
    #[inline]
    pub fn temp_id(&self) -> u64 {
        self.temp_id
    }

    /// Number of bytes in the blob.
    #[inline]
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Whether the blob holds zero bytes.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }
}
impl<'a> Blob<'a> {
    /// Wraps borrowed bytes without copying them.
    ///
    /// Intended for data that already lives in memory (e.g., from a git pack
    /// file); the resulting blob cannot outlive `bytes`.
    #[inline]
    pub fn from_borrowed(bytes: &'a [u8]) -> Self {
        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
        let data = BlobData::Borrowed(bytes);
        Blob { id: OnceCell::new(), data, temp_id }
    }
}
impl Drop for Blob<'_> {
    /// Empties and shrinks an owned buffer when the blob goes away; mapped and
    /// borrowed data need no extra work here.
    fn drop(&mut self) {
        match &mut self.data {
            BlobData::Owned(buffer) => {
                buffer.clear();
                buffer.shrink_to_fit();
            }
            BlobData::Mapped(_) | BlobData::Borrowed(_) => {}
        }
    }
}
/// A content-based identifier for a blob, computed as a Git-compatible SHA-1 hash.
///
/// Note: `BlobId::new` hashes only the first and last 64 KiB of inputs larger
/// than 128 KiB, so for such inputs the value differs from the object ID Git
/// would compute. `BlobId::compute_from_bytes` always hashes the full content.
///
/// Serializes as its 40-character hex string (via `Into<String>`).
#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone, Serialize)]
#[serde(into = "String")]
pub struct BlobId([u8; 20]);
impl BlobId {
/// Creates a zero-filled (default) `BlobId`.
pub fn default() -> Self {
BlobId([0; 20])
}
/// Computes a `BlobId` from raw bytes.
///
/// For large inputs, only the first and last 64KB are hashed for performance.
#[inline]
pub fn new(input: &[u8]) -> Self {
const CHUNK: usize = 64 * 1024; // 64KB from start and end
let mut hasher = Sha1::new();
write!(&mut hasher, "blob {}\0", input.len()).unwrap();
if input.len() <= CHUNK * 2 {
hasher.update(input);
} else {
hasher.update(&input[..CHUNK]);
hasher.update(&input[input.len() - CHUNK..]);
}
let digest: [u8; 20] = hasher.finalize().into();
BlobId(digest)
}
/// Computes a `BlobId` from the complete bytes (no truncation).
pub fn compute_from_bytes(bytes: &[u8]) -> Self {
let mut hasher = Sha1::new();
write!(&mut hasher, "blob {}\0", bytes.len()).unwrap();
hasher.update(bytes);
let digest: [u8; 20] = hasher.finalize().into();
BlobId(digest)
}
/// Parses a `BlobId` from a hex string.
#[inline]
pub fn from_hex(v: &str) -> crate::Result<Self> {
let bytes = hex::decode(v)?;
let arr: [u8; 20] =
bytes.as_slice().try_into().map_err(|_| crate::Error::InvalidBlobId(v.to_string()))?;
Ok(BlobId(arr))
}
/// Returns the blob ID as a hex string.
#[inline]
pub fn hex(&self) -> String {
hex::encode(self.0)
}
/// Returns the raw bytes of the blob ID.
#[inline]
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
}
impl<'de> Deserialize<'de> for BlobId {
    /// Reads a `BlobId` from its hex-string representation.
    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
        /// Visitor that accepts a string and parses it with `BlobId::from_hex`.
        struct HexVisitor;

        impl serde::de::Visitor<'_> for HexVisitor {
            type Value = BlobId;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("a 40-character hex string")
            }

            fn visit_str<E: serde::de::Error>(
                self,
                v: &str,
            ) -> std::result::Result<Self::Value, E> {
                match BlobId::from_hex(v) {
                    Ok(id) => Ok(id),
                    Err(err) => Err(E::custom(err)),
                }
            }
        }

        d.deserialize_str(HexVisitor)
    }
}
impl std::fmt::Debug for BlobId {
    /// Renders as `BlobId(<hex>)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let hex = self.hex();
        write!(f, "BlobId({hex})")
    }
}

impl std::fmt::Display for BlobId {
    /// Renders as the bare hex string.
    #[inline]
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let hex = self.hex();
        write!(f, "{hex}")
    }
}
impl JsonSchema for BlobId {
    fn schema_name() -> String {
        String::from("BlobId")
    }

    /// Describes a `BlobId` as a string constrained to 40 lowercase hex digits.
    fn json_schema(generator: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
        // Start from the plain string schema and refine it.
        let mut object = String::json_schema(generator).into_object();
        object.string().pattern = Some(String::from("[0-9a-f]{40}"));
        object.metadata().description =
            Some(String::from("A hex-encoded blob ID as computed by Git"));
        schemars::schema::Schema::Object(object)
    }
}
impl From<BlobId> for String {
#[inline]
fn from(blob_id: BlobId) -> String {
blob_id.hex()
}
}
impl TryFrom<&str> for BlobId {
type Error = crate::Error;
#[inline]
fn try_from(s: &str) -> std::result::Result<Self, Self::Error> {
BlobId::from_hex(s)
}
}
impl<'a> From<&'a gix::ObjectId> for BlobId {
#[inline]
fn from(id: &'a gix::ObjectId) -> Self {
BlobId(id.as_bytes().try_into().expect("oid should be a 20-byte value"))
}
}
impl From<gix::ObjectId> for BlobId {
#[inline]
fn from(id: gix::ObjectId) -> Self {
BlobId(id.as_bytes().try_into().expect("oid should be a 20-byte value"))
}
}
impl<'a> From<&'a BlobId> for gix::ObjectId {
    /// Converts a `BlobId` into a `gix::ObjectId` holding the same bytes.
    ///
    /// # Panics
    ///
    /// Panics if `gix` rejects the bytes. The message now mirrors the one
    /// used by the `ObjectId` -> `BlobId` conversions instead of a bare
    /// `unwrap()`.
    #[inline]
    fn from(blob_id: &'a BlobId) -> Self {
        gix::hash::ObjectId::try_from(blob_id.as_bytes())
            .expect("blob id should be a 20-byte value")
    }
}

impl From<BlobId> for gix::ObjectId {
    /// By-value counterpart of the by-reference conversion.
    ///
    /// # Panics
    ///
    /// Panics if `gix` rejects the bytes.
    #[inline]
    fn from(blob_id: BlobId) -> Self {
        gix::hash::ObjectId::try_from(blob_id.as_bytes())
            .expect("blob id should be a 20-byte value")
    }
}
/// A map keyed by [`BlobId`] that can be shared between threads.
///
/// Entries are spread over 256 independently locked shards, selected by the
/// first byte of the blob ID, to keep lock contention low during parallel
/// scanning.
pub struct BlobIdMap<V> {
    maps: [Mutex<FxHashMap<ObjectId, V>>; 256],
}

impl<V> BlobIdMap<V> {
    /// Builds a map whose shards are all empty.
    pub fn new() -> Self {
        let maps = std::array::from_fn(|_| Mutex::new(FxHashMap::default()));
        Self { maps }
    }

    /// Picks the shard responsible for `blob_id` (indexed by its first byte).
    #[inline]
    fn shard(&self, blob_id: &BlobId) -> &Mutex<FxHashMap<ObjectId, V>> {
        let index = blob_id.as_bytes()[0] as usize;
        &self.maps[index]
    }

    /// Stores `v` under `blob_id` and hands back the value it replaced, if any.
    #[inline]
    pub fn insert(&self, blob_id: BlobId, v: V) -> Option<V> {
        let key: ObjectId = blob_id.into();
        self.shard(&blob_id).lock().insert(key, v)
    }

    /// Reports whether an entry exists for `blob_id`.
    #[inline]
    pub fn contains_key(&self, blob_id: &BlobId) -> bool {
        let key = ObjectId::from(blob_id);
        self.shard(blob_id).lock().contains_key(&key)
    }

    /// Counts the entries across all shards.
    ///
    /// Note: This is not a cheap operation as it must lock all shards.
    pub fn len(&self) -> usize {
        let mut total = 0;
        for shard in &self.maps {
            total += shard.lock().len();
        }
        total
    }

    /// Reports whether every shard is empty.
    pub fn is_empty(&self) -> bool {
        !self.maps.iter().any(|shard| !shard.lock().is_empty())
    }
}

impl<V: Copy> BlobIdMap<V> {
    /// Looks up `blob_id` and returns a copy of its value, if present.
    #[inline]
    pub fn get(&self, blob_id: &BlobId) -> Option<V> {
        let key = ObjectId::from(blob_id);
        let guard = self.shard(blob_id).lock();
        guard.get(&key).copied()
    }
}

impl<V> Default for BlobIdMap<V> {
    /// Equivalent to [`BlobIdMap::new`].
    fn default() -> Self {
        Self::new()
    }
}
// NOTE(review): this type derives `JsonSchema`; schemars copies `///` doc
// comments into the generated schema's `description` fields, so rewording the
// doc comments below presumably changes emitted schema output — confirm before
// editing them.
/// Metadata about a blob.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, JsonSchema)]
pub struct BlobMetadata {
/// The blob's content-based ID.
pub id: BlobId,
/// The length of the blob in bytes.
pub num_bytes: usize,
/// The guessed MIME type of the blob (e.g., "text/plain").
pub mime_essence: Option<String>,
/// The guessed programming language of the blob (e.g., "Python").
pub language: Option<String>,
}
impl BlobMetadata {
    /// Size of the blob in bytes.
    #[inline]
    pub fn num_bytes(&self) -> usize {
        self.num_bytes
    }

    /// Size of the blob in megabytes (1 MB = 1,048,576 bytes), rounded to
    /// three decimal places by formatting and re-parsing the value.
    #[inline]
    pub fn num_megabytes(&self) -> f64 {
        let exact = self.num_bytes as f64 / 1_048_576.0;
        let rounded_text = format!("{:.3}", exact);
        match rounded_text.parse::<f64>() {
            Ok(rounded) => rounded,
            // Fall back to the unrounded value if the text cannot be parsed.
            Err(_) => exact,
        }
    }

    /// The MIME essence, when one was guessed.
    #[inline]
    pub fn mime_essence(&self) -> Option<&str> {
        self.mime_essence.as_deref()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Hashing no bytes yields the expected fixed ID.
    #[test]
    fn test_blob_id_empty() {
        let id = BlobId::new(b"");
        assert_eq!(id.hex(), "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391");
    }

    /// A small, fully hashed input (1 KiB of zeros) yields the expected ID.
    #[test]
    fn test_blob_id_small() {
        let zeros = [0u8; 1024];
        let id = BlobId::new(&zeros);
        assert_eq!(id.hex(), "06d7405020018ddf3cacee90fd4af10487da3d20");
    }

    /// A blob built from bytes reports those bytes and their length.
    #[test]
    fn test_blob_from_bytes() {
        let content = b"hello world";
        let blob = Blob::from_bytes(content.to_vec());
        assert_eq!(blob.len(), 11);
        assert_eq!(blob.bytes(), content);
    }

    /// Hex-encoding an ID and parsing it back is lossless.
    #[test]
    fn test_blob_id_roundtrip() {
        let original = BlobId::new(b"test data");
        let parsed = BlobId::from_hex(&original.hex()).unwrap();
        assert_eq!(original, parsed);
    }
}

View file

@ -0,0 +1,128 @@
//! Safe string escaping utilities.
//!
//! This module provides utilities for safely displaying byte strings that may
//! contain non-UTF8 data, ANSI escape codes, or control characters.
use std::{
borrow::Cow,
fmt::{Display, Formatter},
};
use console::strip_ansi_codes;
/// Rewrites control characters in `s` so the result is safe to display.
///
/// Whitespace (including newlines and tabs) is kept as-is, ESC (`\x1B`)
/// characters are dropped, and every other control character is replaced by
/// its `\u{..}` escape. When nothing needs rewriting the input is returned
/// borrowed, so no allocation takes place.
fn escape_nonprinting(s: &str) -> Cow<'_, str> {
    let needs_escape = |ch: char| ch.is_control() && !ch.is_whitespace();

    // Fast path: nothing to rewrite.
    if !s.chars().any(needs_escape) {
        return Cow::Borrowed(s);
    }

    // Reserve extra room because escapes are longer than the characters they replace.
    let mut out = String::with_capacity(s.len() * 2);
    for ch in s.chars() {
        if ch == '\x1B' {
            // Drop the ESC character itself.
            continue;
        }
        if needs_escape(ch) {
            out.extend(ch.escape_unicode());
        } else {
            out.push(ch);
        }
    }
    Cow::Owned(out)
}
/// Borrowed bytes that format themselves safely for display.
///
/// Formatting an `Escaped` value:
/// 1. decodes the bytes as UTF-8, substituting the replacement character for
///    invalid sequences,
/// 2. strips ANSI control sequences,
/// 3. escapes the remaining control characters (whitespace is left alone).
///
/// # Examples
///
/// ```
/// use kingfisher_core::Escaped;
///
/// let bytes = b"Hello\x00World";
/// let escaped = Escaped(bytes);
/// assert_eq!(escaped.to_string(), "Hello\\u{0}World");
/// ```
#[derive(Debug, Clone, Copy)]
pub struct Escaped<'a>(pub &'a [u8]);

impl Display for Escaped<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let Escaped(raw) = *self;
        // Lossy decode first, so the later steps operate on valid text.
        let text = String::from_utf8_lossy(raw);
        let without_ansi = strip_ansi_codes(&text);
        let printable = escape_nonprinting(&without_ansi);
        f.write_str(printable.as_ref())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain text is returned borrowed and unchanged.
    #[test]
    fn test_escape_normal_text() {
        let escaped = escape_nonprinting("Hello, World!");
        assert!(matches!(escaped, Cow::Borrowed(_)));
        assert_eq!(escaped, "Hello, World!");
    }

    /// Whitespace control characters do not trigger escaping.
    #[test]
    fn test_escape_with_whitespace() {
        let escaped = escape_nonprinting("Hello\n\t World!");
        assert!(matches!(escaped, Cow::Borrowed(_)));
        assert_eq!(escaped, "Hello\n\t World!");
    }

    /// Non-whitespace control characters become `\u{..}` escapes.
    #[test]
    fn test_escape_control_chars() {
        let escaped = escape_nonprinting("Hello\x00World\x01");
        assert!(matches!(escaped, Cow::Owned(_)));
        assert_eq!(escaped, "Hello\\u{0}World\\u{1}");
    }

    /// Ordinary bytes display as-is.
    #[test]
    fn test_escaped_struct_simple() {
        assert_eq!(Escaped(b"Hello World").to_string(), "Hello World");
    }

    /// ANSI colour sequences are removed from the output.
    #[test]
    fn test_escaped_struct_ansi_codes() {
        let coloured = Escaped(b"\x1b[31mRed\x1b[0m \x1b[32mGreen\x1b[0m");
        assert_eq!(coloured.to_string(), "Red Green");
    }

    /// Invalid UTF-8 bytes are shown as replacement characters.
    #[test]
    fn test_escaped_struct_invalid_utf8() {
        let invalid = Escaped(b"Hello\xFF\xFEWorld");
        assert_eq!(invalid.to_string(), "Hello\u{FFFD}\u{FFFD}World");
    }

    /// Empty input yields an empty string.
    #[test]
    fn test_escaped_struct_empty() {
        assert_eq!(Escaped(b"").to_string(), "");
    }
}

View file

@ -0,0 +1,89 @@
//! Shannon entropy calculation.
//!
//! Entropy is used to filter out low-entropy strings that are unlikely
//! to be real secrets.
/// Calculates the Shannon entropy of a byte slice, in bits per byte.
///
/// Returns a value between 0.0 (every byte is identical, or the input is
/// empty) and 8.0 (all 256 byte values occur equally often). Typical
/// thresholds for secret detection are around 3.5-4.5.
///
/// # Examples
///
/// ```
/// use kingfisher_core::calculate_shannon_entropy;
///
/// // Low entropy (repeated character)
/// let entropy = calculate_shannon_entropy(b"aaaaaaaaaa");
/// assert!(entropy < 0.1);
///
/// // High entropy (random-looking)
/// let entropy = calculate_shannon_entropy(b"j2k#9K$mL*p&vN3");
/// assert!(entropy > 3.5);
/// ```
pub fn calculate_shannon_entropy(bytes: &[u8]) -> f32 {
    if bytes.is_empty() {
        return 0.0;
    }

    // Count occurrences of each byte value (0-255). The counters are `usize`
    // so they can hold any count a slice can produce; the previous `u32`
    // counters could overflow on inputs larger than 4 GiB.
    let mut counts = [0usize; 256];
    for &byte in bytes {
        counts[usize::from(byte)] += 1;
    }

    let total_bytes = bytes.len() as f32;

    // Sum entropy contribution for each byte that appears at least once
    // (absent values are skipped because log2(0) is undefined).
    counts.iter().filter(|&&count| count > 0).fold(0.0, |entropy, &count| {
        let probability = count as f32 / total_bytes;
        entropy - probability * probability.log2()
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input is defined to have zero entropy.
    #[test]
    fn test_entropy_empty() {
        let value = calculate_shannon_entropy(b"");
        assert_eq!(value, 0.0);
        assert!(value.is_finite());
    }

    /// A single repeated byte carries no information.
    #[test]
    fn test_entropy_uniform() {
        let value = calculate_shannon_entropy(&[65u8; 4]);
        assert_eq!(value, 0.0);
        assert!(value.is_finite());
    }

    /// Two byte values in equal proportion give exactly one bit.
    #[test]
    fn test_entropy_two_values() {
        let value = calculate_shannon_entropy(&[1, 2, 1, 2]);
        assert!((value - 1.0).abs() < 0.0001);
        assert!(value.is_finite());
    }

    /// A typical password lands in the mid range.
    #[test]
    fn test_entropy_password() {
        let value = calculate_shannon_entropy(b"Password123!");
        assert!(value > 2.5);
        assert!(value.is_finite());
    }

    /// A random-looking string scores high.
    #[test]
    fn test_entropy_random() {
        let value = calculate_shannon_entropy(b"j2k#9K$mL*p&vN3");
        assert!(value > 3.5);
        assert!(value.is_finite());
    }
}

View file

@ -0,0 +1,36 @@
//! Error types for kingfisher-core.
use thiserror::Error;
/// The primary error type for kingfisher-core operations.
///
/// The `#[error("...")]` attributes define each variant's display message;
/// variants marked `#[from]` can be produced with `?` from the wrapped error.
#[derive(Error, Debug)]
pub enum Error {
/// An I/O error occurred.
#[error("I/O error: {0}")]
Io(#[from] std::io::Error),
/// Failed to parse a blob ID from hex. Carries the offending input string.
#[error("Invalid blob ID: {0}")]
InvalidBlobId(String),
/// A hex decoding error occurred.
#[error("Hex decode error: {0}")]
HexDecode(#[from] hex::FromHexError),
/// Failed to open or read a Git repository. Carries the error rendered as text.
#[error("Git error: {0}")]
Git(String),
/// A generic error with a message.
#[error("{0}")]
Other(String),
}
impl From<gix::open::Error> for Error {
    /// Wraps a repository-open failure as `Error::Git`, keeping only its text.
    fn from(e: gix::open::Error) -> Self {
        let message = e.to_string();
        Self::Git(message)
    }
}

/// Shorthand for results whose error type is this crate's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;

View file

@ -0,0 +1,159 @@
//! Git commit metadata types.
//!
//! This module provides types for tracking commit information associated
//! with blobs found in git history.
use gix::{date::Time, ObjectId};
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
// Wrapper for serializing gix::date::Time as text
//
// This is a serde "remote" definition for `Time`: the single field is read
// through `text_time::getter` and (de)serialized with the functions in the
// `text_time` module. It is referenced from `#[serde(with = "TextTime")]`.
#[repr(transparent)]
#[derive(Serialize, Deserialize, Copy, Clone)]
#[serde(remote = "Time")]
struct TextTime(
#[serde(
getter = "text_time::getter",
serialize_with = "text_time::serialize",
deserialize_with = "text_time::deserialize"
)]
Time,
);
// Unwraps the helper back into the real `Time`.
impl From<TextTime> for Time {
fn from(v: TextTime) -> Self {
v.0
}
}
// Wraps a `Time` in the helper.
impl From<Time> for TextTime {
fn from(v: Time) -> Self {
Self(v)
}
}
/// Serde helper functions used by the `TextTime` remote definition.
mod text_time {
    use super::*;

    /// Field accessor required by serde's remote derive; returns the value itself.
    #[inline]
    pub fn getter(v: &Time) -> &Time {
        v
    }

    /// Writes the timestamp using its `Display` output.
    #[inline]
    pub fn serialize<S: serde::Serializer>(v: &Time, serializer: S) -> Result<S::Ok, S::Error> {
        serializer.collect_str(v)
    }

    /// Reads a timestamp from a string by handing it to `gix::date::parse`.
    pub fn deserialize<'de, D: serde::Deserializer<'de>>(d: D) -> Result<Time, D::Error> {
        struct TimeVisitor;

        impl<'a> serde::de::Visitor<'a> for TimeVisitor {
            type Value = Time;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("a string representing a Git timestamp")
            }

            fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<Self::Value, E> {
                match gix::date::parse(v, None) {
                    Ok(time) => Ok(time),
                    Err(err) => Err(E::custom(err)),
                }
            }
        }

        d.deserialize_str(TimeVisitor)
    }
}
impl JsonSchema for TextTime {
    fn schema_name() -> String {
        String::from("Time")
    }

    /// A timestamp is represented in the schema as a plain string.
    fn json_schema(generator: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
        <String as JsonSchema>::json_schema(generator)
    }
}
// Wrapper for serializing gix::ObjectId as hex
//
// This is a serde "remote" definition for `ObjectId`: the single field is
// read through `hex_object_id::getter` and (de)serialized with the functions
// in the `hex_object_id` module. It is referenced from
// `#[serde(with = "HexObjectId")]`.
#[repr(transparent)]
#[derive(Serialize, Deserialize, Copy, Clone)]
#[serde(remote = "ObjectId")]
struct HexObjectId(
#[serde(
getter = "hex_object_id::getter",
serialize_with = "hex_object_id::serialize",
deserialize_with = "hex_object_id::deserialize"
)]
ObjectId,
);
// Wraps an `ObjectId` in the helper.
impl From<ObjectId> for HexObjectId {
fn from(v: ObjectId) -> Self {
HexObjectId(v)
}
}
// Unwraps the helper back into the real `ObjectId`.
impl From<HexObjectId> for ObjectId {
fn from(v: HexObjectId) -> Self {
v.0
}
}
/// Serde helper functions used by the `HexObjectId` remote definition.
mod hex_object_id {
    use super::*;

    /// Field accessor required by serde's remote derive; returns the value itself.
    #[inline]
    pub fn getter(v: &ObjectId) -> &ObjectId {
        v
    }

    /// Writes the object ID as its hex representation.
    #[inline]
    pub fn serialize<S: serde::Serializer>(v: &ObjectId, serializer: S) -> Result<S::Ok, S::Error> {
        let hex = v.to_hex();
        serializer.collect_str(&hex)
    }

    /// Reads an object ID from a hex string.
    pub fn deserialize<'de, D: serde::Deserializer<'de>>(d: D) -> Result<ObjectId, D::Error> {
        struct ObjectIdVisitor;

        impl<'a> serde::de::Visitor<'a> for ObjectIdVisitor {
            type Value = ObjectId;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("a 40-character hex string representing a Git object ID")
            }

            fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<Self::Value, E> {
                match ObjectId::from_hex(v.as_bytes()) {
                    Ok(id) => Ok(id),
                    Err(err) => Err(E::custom(err)),
                }
            }
        }

        d.deserialize_str(ObjectIdVisitor)
    }
}
impl JsonSchema for HexObjectId {
    fn schema_name() -> String {
        String::from("ObjectId")
    }

    /// Describes an object ID as a string constrained to 40 lowercase hex digits.
    fn json_schema(generator: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
        // Start from the plain string schema and refine it.
        let mut object = String::json_schema(generator).into_object();
        object.string().pattern = Some(String::from("[0-9a-f]{40}"));
        object.metadata().description =
            Some(String::from("A hex-encoded object ID as computed by Git"));
        schemars::schema::Schema::Object(object)
    }
}
// NOTE(review): this type derives `JsonSchema`; schemars copies `///` doc
// comments into the generated schema's `description` fields, so rewording the
// doc comments below presumably changes emitted schema output — confirm before
// editing them.
/// Metadata about a Git commit.
///
/// This is used to track the provenance of blobs found in git history.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
pub struct CommitMetadata {
/// The SHA-1 commit ID.
// (De)serialized as a hex string through the `HexObjectId` helper.
#[serde(with = "HexObjectId")]
pub commit_id: ObjectId,
/// The committer's name.
pub committer_name: String,
/// The committer's email address.
pub committer_email: String,
/// The commit timestamp.
// (De)serialized as text through the `TextTime` helper.
#[serde(with = "TextTime")]
pub committer_timestamp: Time,
}

View file

@ -0,0 +1,27 @@
//! `kingfisher-core` provides the foundational types and traits shared across
//! the Kingfisher secret scanning library.
//!
//! This crate contains:
//! - [`Blob`] - Representation of scannable content (files, buffers, git objects)
//! - [`Location`] - Source location tracking (byte offsets and line/column)
//! - [`Origin`] - Provenance tracking (where content came from)
//! - Utility functions for entropy calculation, string escaping, etc.
pub mod blob;
pub mod bstring_escape;
pub mod entropy;
pub mod error;
pub mod git_commit_metadata;
pub mod location;
pub mod origin;
// Re-export commonly used types at the crate root
pub use blob::{
Blob, BlobAppearance, BlobAppearanceSet, BlobData, BlobId, BlobIdMap, BlobMetadata,
};
pub use bstring_escape::Escaped;
pub use entropy::calculate_shannon_entropy;
pub use error::{Error, Result};
pub use git_commit_metadata::CommitMetadata;
pub use location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan};
pub use origin::{CommitOrigin, ExtendedOrigin, FileOrigin, GitRepoOrigin, Origin, OriginSet};

View file

@ -0,0 +1,296 @@
//! Source location tracking.
//!
//! This module provides types for tracking locations within source content:
//! - [`OffsetSpan`] - Byte offset ranges
//! - [`SourceSpan`] - Line/column ranges
//! - [`Location`] - Combined byte and source location
//! - [`LocationMapping`] - Efficient offset-to-line/column conversion
use core::ops::Range;
use std::cell::RefCell;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
/// A single position in the input, expressed as a byte offset.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, Copy, Clone)]
pub struct OffsetPoint(pub usize);

impl OffsetPoint {
    /// Wraps the byte offset `idx`.
    #[inline]
    pub fn new(idx: usize) -> Self {
        Self(idx)
    }
}
// NOTE(review): the doc comment calls the span "non-empty", but nothing in
// this type enforces it: the fields are public and `is_empty` explicitly
// handles empty or inverted spans. The `///` text is left untouched because
// this type derives `JsonSchema`, and schemars uses doc comments as schema
// descriptions — confirm before rewording.
/// A non-empty span defined by two byte offsets (half-open interval `[start, end)`).
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
pub struct OffsetSpan {
pub start: usize,
pub end: usize,
}
impl std::fmt::Display for OffsetSpan {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}-{}", self.start, self.end)
}
}
impl OffsetSpan {
/// Creates an `OffsetSpan` from two `OffsetPoint`s.
#[inline]
pub fn from_offsets(start: OffsetPoint, end: OffsetPoint) -> Self {
OffsetSpan { start: start.0, end: end.0 }
}
/// Creates an `OffsetSpan` from a `Range<usize>`.
#[inline]
pub fn from_range(range: Range<usize>) -> Self {
OffsetSpan { start: range.start, end: range.end }
}
/// Returns the length in bytes.
#[inline]
#[must_use]
pub fn len(&self) -> usize {
self.end.saturating_sub(self.start)
}
/// Returns true if empty or inverted.
#[inline]
#[must_use]
pub fn is_empty(&self) -> bool {
self.start >= self.end
}
/// Returns true if `other` lies entirely within `self`.
#[inline]
#[must_use]
pub fn fully_contains(&self, other: &Self) -> bool {
self.start <= other.start && other.end <= self.end
}
}
/// A position in source text: a 1-indexed line and a 0-indexed column.
#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SourcePoint {
    /// Line number (1-indexed).
    pub line: usize,
    /// Column within the line (0-indexed).
    pub column: usize,
}

impl std::fmt::Display for SourcePoint {
    /// Formats the point as `line:column`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { line, column } = self;
        write!(f, "{}:{}", line, column)
    }
}
/// A range of source text bounded by two points, both inclusive.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SourceSpan {
    /// First point covered by the span.
    pub start: SourcePoint,
    /// Last point covered by the span.
    pub end: SourcePoint,
}

impl std::fmt::Display for SourceSpan {
    /// Formats the span as `start-end`, each endpoint using its own `Display`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { start, end } = self;
        write!(f, "{}-{}", start, end)
    }
}
/// Lazily maps byte offsets in a buffer to line/column positions.
///
/// Newline offsets are discovered on demand and cached, so only the prefix of
/// the input that has actually been queried is ever scanned.
pub struct LocationMapping<'a> {
    /// The input being indexed.
    bytes: &'a [u8],
    /// Offsets of the `\n` bytes found so far, in ascending order.
    newline_offsets: RefCell<Vec<usize>>,
}

impl<'a> LocationMapping<'a> {
    /// Creates a new mapping for the given input bytes. No scanning happens
    /// until a location is requested.
    pub fn new(input: &'a [u8]) -> Self {
        LocationMapping { bytes: input, newline_offsets: RefCell::new(Vec::new()) }
    }

    /// Ensures every newline at or before `offset` is present in the cache.
    ///
    /// The scan is inclusive of `offset`: `source_point_from_offsets` treats
    /// an exact hit on a newline specially, so that newline must be recorded
    /// now. Otherwise the answer for `offset` would depend on whether a later
    /// query happened to scan past it.
    fn ensure_offsets_up_to(&self, offset: usize) {
        let mut offsets = self.newline_offsets.borrow_mut();
        // Resume just past the last newline already recorded.
        let start = offsets.last().map_or(0, |&last| last + 1);
        if offset < start {
            return;
        }
        let end = offset.saturating_add(1).min(self.bytes.len());
        let found = memchr::memchr_iter(b'\n', &self.bytes[start..end]).map(|nl| start + nl);
        offsets.extend(found);
    }

    /// Converts `offset` to a line/column pair using the recorded newlines.
    ///
    /// Lines are 1-indexed and columns 0-indexed. An offset that is itself a
    /// recorded newline is reported as column 0 of the following line.
    fn source_point_from_offsets(offsets: &[usize], offset: usize) -> SourcePoint {
        let line = match offsets.binary_search(&offset) {
            Ok(idx) => idx + 2,
            Err(idx) => idx + 1,
        };
        // Only lines >= 2 have a preceding newline to measure from. Using
        // `checked_sub` keeps line 1 from being measured against the first
        // recorded newline, which would collapse its column to 0.
        let column = match line.checked_sub(2).and_then(|idx| offsets.get(idx)) {
            Some(&prev_newline) => offset.saturating_sub(prev_newline + 1),
            None => offset,
        };
        SourcePoint { line, column }
    }

    /// Maps a byte offset to a `SourcePoint`.
    pub fn get_source_point(&self, offset: usize) -> SourcePoint {
        self.ensure_offsets_up_to(offset);
        let offsets = self.newline_offsets.borrow();
        Self::source_point_from_offsets(&offsets, offset)
    }

    /// Maps an `OffsetSpan` to a `SourceSpan`.
    ///
    /// The end point is the position of the last byte in the span
    /// (`span.end - 1`), matching `SourceSpan`'s closed-interval convention.
    pub fn get_source_span(&self, span: &OffsetSpan) -> SourceSpan {
        let last = span.end.saturating_sub(1);
        // Cover `span.start` as well so empty or inverted spans are resolved
        // against a fully scanned prefix.
        self.ensure_offsets_up_to(last.max(span.start));
        let offsets = self.newline_offsets.borrow();
        let start = Self::source_point_from_offsets(&offsets, span.start);
        let end = Self::source_point_from_offsets(&offsets, last);
        SourceSpan { start, end }
    }
}
/// A `SourceSpan` stored as four `u32` values to shrink the per-match footprint.
#[derive(Debug, Clone, Copy, Deserialize, Serialize, JsonSchema)]
pub struct CompactSourceSpan {
    pub start_line: u32,
    pub start_column: u32,
    pub end_line: u32,
    pub end_column: u32,
}

impl CompactSourceSpan {
    /// Returns a span whose four components are all zero.
    #[inline]
    pub fn zero() -> Self {
        let none = 0u32;
        Self { start_line: none, start_column: none, end_line: none, end_column: none }
    }

    /// Narrows a `SourceSpan` to the compact form.
    ///
    /// Any component that does not fit in a `u32` is stored as `0`.
    #[inline]
    pub fn from_source_span(span: &SourceSpan) -> Self {
        let narrow = |value: usize| u32::try_from(value).unwrap_or(0);
        let SourceSpan { start, end } = span;
        Self {
            start_line: narrow(start.line),
            start_column: narrow(start.column),
            end_line: narrow(end.line),
            end_column: narrow(end.column),
        }
    }

    /// Widens the compact form back to a `SourceSpan`.
    ///
    /// Any component that does not fit in a `usize` becomes `0`.
    #[inline]
    pub fn to_source_span(self) -> SourceSpan {
        let widen = |value: u32| usize::try_from(value).unwrap_or(0);
        let start = SourcePoint { line: widen(self.start_line), column: widen(self.start_column) };
        let end = SourcePoint { line: widen(self.end_line), column: widen(self.end_column) };
        SourceSpan { start, end }
    }
}
/// Combined byte offset and source location information.
///
/// `Deserialize` and `JsonSchema` are derived; `Serialize` is implemented by
/// hand further down in this file.
#[derive(Debug, Clone, Deserialize, JsonSchema)]
pub struct Location {
    /// The byte offset span.
    pub offset_span: OffsetSpan,
    /// The optional source (line/column) span, stored in compact form.
    ///
    /// It is read via `deserialize_compact_source_span` and described in the
    /// schema as a `SourceSpan`.
    // NOTE(review): `Serialize` is not derived for this struct, so the
    // `serialize_with` entry below does not appear to be consulted - confirm
    // before relying on it.
    #[serde(
        default,
        serialize_with = "serialize_compact_source_span",
        deserialize_with = "deserialize_compact_source_span"
    )]
    #[schemars(with = "SourceSpan")]
    pub source_span: Option<CompactSourceSpan>,
}
impl serde::Serialize for Location {
    /// Serializes as a two-field struct: `offset_span` and `source_span`.
    ///
    /// `source_span` is always written in expanded `SourceSpan` form; a
    /// missing span is written as all zeros.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeStruct;

        let expanded = match self.source_span() {
            Some(compact) => compact.to_source_span(),
            None => CompactSourceSpan::zero().to_source_span(),
        };

        let mut out = serializer.serialize_struct("Location", 2)?;
        out.serialize_field("offset_span", &self.offset_span)?;
        out.serialize_field("source_span", &expanded)?;
        out.end()
    }
}
impl Location {
    /// Builds a `Location` from a byte span and an optional line/column span.
    ///
    /// The line/column span, when present, is stored in compact form.
    #[inline]
    pub fn with_source_span(offset_span: OffsetSpan, source_span: Option<SourceSpan>) -> Self {
        let compact = match &source_span {
            Some(span) => Some(CompactSourceSpan::from_source_span(span)),
            None => None,
        };
        Self { offset_span, source_span: compact }
    }

    /// Returns the stored compact source span, if there is one.
    #[inline]
    pub fn source_span(&self) -> Option<CompactSourceSpan> {
        self.source_span
    }

    /// Returns the expanded source span, or an all-zero span when none is stored.
    #[inline]
    pub fn resolved_source_span(&self) -> SourceSpan {
        let compact = match self.source_span {
            Some(span) => span,
            None => CompactSourceSpan::zero(),
        };
        compact.to_source_span()
    }
}
/// Serializes an optional compact span in expanded `SourceSpan` form.
///
/// `None` is written as an all-zero span.
fn serialize_compact_source_span<S>(
    span: &Option<CompactSourceSpan>,
    serializer: S,
) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    let compact = match span {
        Some(present) => *present,
        None => CompactSourceSpan::zero(),
    };
    compact.to_source_span().serialize(serializer)
}
/// Reads an expanded `SourceSpan` and returns it in compact form.
///
/// A successfully parsed span is always returned as `Some`.
fn deserialize_compact_source_span<'de, D>(
    deserializer: D,
) -> Result<Option<CompactSourceSpan>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    SourceSpan::deserialize(deserializer)
        .map(|expanded| Some(CompactSourceSpan::from_source_span(&expanded)))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A span built from `10..20` covers ten bytes and is not empty.
    #[test]
    fn test_offset_span() {
        let span = OffsetSpan::from_range(10..20);
        assert_eq!(span.len(), 10);
        assert!(!span.is_empty());
    }

    /// Offsets are queried in ascending order against a three-line input.
    #[test]
    fn test_location_mapping() {
        let input = b"line1\nline2\nline3";
        let mapping = LocationMapping::new(input);

        // (byte offset, expected line, expected column)
        let cases = [
            (0usize, 1usize, 0usize), // first line, first character
            (4, 1, 4),                // first line, last character
            (6, 2, 0),                // second line, first character
        ];
        for &(offset, line, column) in cases.iter() {
            assert_eq!(mapping.get_source_point(offset), SourcePoint { line, column });
        }
    }
}

View file

@ -0,0 +1,310 @@
//! Provenance tracking for scanned content.
//!
//! This module provides types for tracking where content came from:
//! - [`FileOrigin`] - Content from a file path
//! - [`GitRepoOrigin`] - Content from a git repository
//! - [`ExtendedOrigin`] - Content from other sources (Jira, Confluence, etc.)
//! - [`OriginSet`] - A non-empty collection of origins
use std::{
path::{Path, PathBuf},
sync::Arc,
};
use dashmap::DashMap;
use once_cell::sync::Lazy;
use rustc_hash::FxHashSet;
use schemars::JsonSchema;
use serde::{ser::SerializeSeq, Deserialize, Serialize};
use smallvec::SmallVec;
use crate::git_commit_metadata::CommitMetadata;
// Process-wide cache of git remote URLs, keyed by repository path, so that
// repeated lookups for the same repository are served from memory.
static URL_CACHE: Lazy<DashMap<PathBuf, Arc<str>>> = Lazy::new(DashMap::default);
/// Reads `remote.origin.url` from the repository at `repo_path` and returns it
/// as a web URL.
///
/// - `http://` and `https://` URLs are returned unchanged.
/// - `git@<domain>:<path>` URLs are rewritten to `https://<domain>/<path>`.
///
/// Errors if the repository cannot be opened, no origin URL is configured, the
/// `git@` form has no `:` separator, or the URL uses any other scheme.
fn compute_url(repo_path: &Path) -> anyhow::Result<String> {
    use bstr::ByteSlice;

    let repo = gix::open(repo_path)?;
    let config = repo.config_snapshot();
    let raw =
        config.string("remote.origin.url").ok_or_else(|| anyhow::anyhow!("No remote URL found"))?;

    let bytes = raw.as_bytes();
    let text = String::from_utf8_lossy(bytes);

    if bytes.starts_with(b"http://") || bytes.starts_with(b"https://") {
        return Ok(text.into_owned());
    }

    if !bytes.starts_with(b"git@") {
        return Err(anyhow::anyhow!("Unsupported remote URL format: {}", text));
    }

    // scp-style SSH remote: git@<domain>:<path>
    text.strip_prefix("git@")
        .and_then(|rest| rest.split_once(':'))
        .map(|(domain, path)| format!("https://{}/{}", domain, path))
        .ok_or_else(|| anyhow::anyhow!("Invalid SSH URL format"))
}
/// Returns the remote URL for the git repository at `repo_path`.
///
/// The result is served from `URL_CACHE` when present; otherwise it is
/// computed, stored in the cache, and returned. Failed lookups are not cached.
pub fn get_repo_url(repo_path: &Path) -> anyhow::Result<Arc<str>> {
    // Cache hit: hand back another handle to the shared string.
    if let Some(cached) = URL_CACHE.get(repo_path) {
        let hit: &Arc<str> = &cached;
        return Ok(Arc::clone(hit));
    }

    // Cache miss: compute once, then share the same allocation with the cache.
    let computed = compute_url(repo_path)?;
    let interned: Arc<str> = Arc::from(computed);
    URL_CACHE.insert(repo_path.to_path_buf(), Arc::clone(&interned));
    Ok(interned)
}
/// The provenance of a scanned blob.
///
/// Serialized with an internal `kind` tag whose value is the snake_case
/// variant name.
// NOTE(review): with an internal tag, serde can only merge the tag into a
// payload that serializes as a map/struct. An `Extended` payload that is not a
// JSON object would presumably fail to serialize - confirm against callers.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case", tag = "kind")]
#[allow(clippy::large_enum_variant)]
pub enum Origin {
    /// Content from a file on disk.
    File(FileOrigin),
    /// Content from a git repository.
    GitRepo(GitRepoOrigin),
    /// Content from an extended source (arbitrary JSON metadata).
    Extended(ExtendedOrigin),
}
impl Origin {
    /// Builds an origin for a plain file at `path`.
    pub fn from_file(path: PathBuf) -> Self {
        Self::File(FileOrigin::new(path))
    }

    /// Builds an origin for a blob in a git repository, with no commit details.
    pub fn from_git_repo(repo_path: Arc<PathBuf>) -> Self {
        let first_commit = None;
        Self::GitRepo(GitRepoOrigin { repo_path, first_commit })
    }

    /// Builds an origin for a blob in a git repository, recording the commit
    /// it was first seen in and its path within that commit.
    pub fn from_git_repo_with_first_commit(
        repo_path: Arc<PathBuf>,
        commit_metadata: Arc<CommitMetadata>,
        blob_path: String,
    ) -> Self {
        Self::GitRepo(GitRepoOrigin {
            repo_path,
            first_commit: Some(CommitOrigin { commit_metadata, blob_path }),
        })
    }

    /// Builds an origin from arbitrary JSON metadata.
    pub fn from_extended(value: serde_json::Value) -> Self {
        Self::Extended(ExtendedOrigin(value))
    }

    /// Returns the blob's path, if one is known.
    ///
    /// For git origins this is the path within the commit and is only
    /// available when commit details were recorded.
    pub fn blob_path(&self) -> Option<&Path> {
        match self {
            Self::File(file) => Some(file.path.as_path()),
            Self::GitRepo(repo) => {
                let commit = repo.first_commit.as_ref()?;
                Some(Path::new(&commit.blob_path))
            }
            Self::Extended(extended) => extended.path(),
        }
    }

    /// Returns the full path to the content, if one is known.
    ///
    /// For git origins this is the repository path joined with the blob path
    /// and is only available when commit details were recorded.
    pub fn full_path(&self) -> Option<PathBuf> {
        match self {
            Self::File(file) => Some(file.path.to_path_buf()),
            Self::GitRepo(repo) => {
                let commit = repo.first_commit.as_ref()?;
                Some(repo.repo_path.join(&commit.blob_path))
            }
            Self::Extended(extended) => extended.path().map(Path::to_path_buf),
        }
    }
}
impl std::fmt::Display for Origin {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Origin::File(e) => write!(f, "file {}", e.path.display()),
Origin::GitRepo(e) => match &e.first_commit {
Some(md) => write!(
f,
"git repo {}: first seen in commit {} as {}",
e.repo_path.display(),
md.commit_metadata.commit_id,
&md.blob_path,
),
None => write!(f, "git repo {}", e.repo_path.display()),
},
Origin::Extended(e) => write!(f, "extended {}", e),
}
}
}
/// Provenance of content read from a file on disk.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
pub struct FileOrigin {
    /// Path of the file, shared behind an `Arc`.
    pub path: Arc<PathBuf>,
}

impl FileOrigin {
    /// Builds a `FileOrigin` from anything convertible into a `PathBuf`.
    pub fn new<P: Into<PathBuf>>(p: P) -> Self {
        let owned: PathBuf = p.into();
        Self { path: Arc::new(owned) }
    }
}
/// Origin information for a blob in a git repository.
///
/// `first_commit` is `None` when the blob is only known to belong to the
/// repository, with no commit details recorded.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
pub struct GitRepoOrigin {
    /// Path to the repository on disk.
    pub repo_path: Arc<PathBuf>,
    /// Information about the first commit where this blob was seen.
    pub first_commit: Option<CommitOrigin>,
}
/// Information about where a blob was first seen in git history.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
pub struct CommitOrigin {
    /// Metadata about the commit (shared behind an `Arc`).
    pub commit_metadata: Arc<CommitMetadata>,
    /// The path of the blob within the commit.
    pub blob_path: String,
}
/// Provenance described by arbitrary JSON metadata.
///
/// This is used for sources like Jira, Confluence, Slack, etc.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
pub struct ExtendedOrigin(pub serde_json::Value);

impl std::fmt::Display for ExtendedOrigin {
    /// Delegates to the wrapped JSON value's `Display`, passing the
    /// formatter through unchanged.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let ExtendedOrigin(value) = self;
        std::fmt::Display::fmt(value, f)
    }
}

impl ExtendedOrigin {
    /// Returns the `"path"` entry of the metadata as a path.
    ///
    /// Yields `None` when there is no `"path"` entry or it is not a string.
    pub fn path(&self) -> Option<&Path> {
        self.0.get("path").and_then(serde_json::Value::as_str).map(Path::new)
    }
}
/// A non-empty set of [`Origin`] entries.
///
/// This is used when a blob has been seen in multiple locations
/// (e.g., the same content in multiple files or commits).
///
/// Non-emptiness is structural: one origin is always stored in `origin`, and
/// any further ones go in `more_provenance`.
#[derive(Debug, Clone)]
pub struct OriginSet {
    /// The first origin; always present.
    origin: Origin,
    /// Any additional origins, kept inline while there is at most one.
    more_provenance: SmallVec<[Origin; 1]>,
}
impl serde::Serialize for OriginSet {
    /// Serializes the set as a flat sequence of origins, first origin first.
    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
        let mut seq = s.serialize_seq(Some(self.len()))?;
        self.iter().try_for_each(|entry| seq.serialize_element(entry))?;
        seq.end()
    }
}
impl JsonSchema for OriginSet {
    fn schema_name() -> String {
        String::from("OriginSet")
    }

    /// Describes the set as an array of `Origin` with at least one item.
    fn json_schema(gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
        // Start from the schema of a plain list of origins, then tighten it.
        let mut object = <Vec<Origin>>::json_schema(gen).into_object();
        object.array().min_items = Some(1);
        object.metadata().description = Some("A non-empty set of `Origin` entries".into());
        schemars::schema::Schema::Object(object)
    }
}
impl OriginSet {
    /// Builds a set holding exactly one origin.
    #[inline]
    pub fn single(origin: Origin) -> Self {
        let more_provenance = SmallVec::new();
        Self { origin, more_provenance }
    }

    /// Builds a set from one origin plus any number of further origins.
    ///
    /// A git origin without commit details is dropped when another entry for
    /// the same repository does carry commit details. File and extended
    /// origins are always kept.
    pub fn new(origin: Origin, more_origin: Vec<Origin>) -> Self {
        // Repositories for which at least one entry has commit details.
        let detailed_repos: FxHashSet<Arc<PathBuf>> = std::iter::once(&origin)
            .chain(&more_origin)
            .filter_map(|candidate| match candidate {
                Origin::GitRepo(repo) if repo.first_commit.is_some() => {
                    Some(Arc::clone(&repo.repo_path))
                }
                _ => None,
            })
            .collect();

        let is_redundant = |candidate: &Origin| match candidate {
            Origin::GitRepo(repo) => {
                repo.first_commit.is_none() && detailed_repos.contains(&repo.repo_path)
            }
            Origin::File(_) | Origin::Extended(_) => false,
        };

        let mut kept =
            std::iter::once(origin).chain(more_origin).filter(|candidate| !is_redundant(candidate));

        // A bare git entry is only dropped when a detailed entry for the same
        // repository exists, and that entry is kept, so `kept` is never empty.
        let origin = kept.next().unwrap();
        let more_provenance = kept.collect();
        Self { origin, more_provenance }
    }

    /// Builds a set from an iterator of origins, applying the same filtering
    /// as [`OriginSet::new`].
    ///
    /// Returns `None` if the iterator is empty.
    #[inline]
    pub fn try_from_iter<I>(it: I) -> Option<Self>
    where
        I: IntoIterator<Item = Origin>,
    {
        let mut remaining = it.into_iter();
        match remaining.next() {
            Some(head) => Some(Self::new(head, remaining.collect())),
            None => None,
        }
    }

    /// Returns the first origin in the set.
    #[inline]
    pub fn first(&self) -> &Origin {
        &self.origin
    }

    /// Returns how many origins the set holds (always at least one).
    #[allow(clippy::len_without_is_empty)]
    #[inline]
    pub fn len(&self) -> usize {
        self.more_provenance.len() + 1
    }

    /// Iterates over every origin, starting with the first.
    #[inline]
    pub fn iter(&self) -> impl Iterator<Item = &Origin> {
        std::iter::once(&self.origin).chain(self.more_provenance.iter())
    }
}
impl IntoIterator for OriginSet {
    type IntoIter =
        std::iter::Chain<std::iter::Once<Origin>, <Vec<Origin> as IntoIterator>::IntoIter>;
    type Item = Origin;

    /// Consumes the set, yielding the first origin followed by the rest.
    #[inline]
    fn into_iter(self) -> Self::IntoIter {
        let Self { origin, more_provenance } = self;
        let rest: Vec<Origin> = more_provenance.into_vec();
        std::iter::once(origin).chain(rest)
    }
}
impl From<Origin> for OriginSet {
fn from(p: Origin) -> Self {
Self::single(p)
}
}

View file

@ -0,0 +1,64 @@
[package]
name = "kingfisher-rules"
version = "0.1.0"
description = "Rule definitions and database for Kingfisher secret scanner"
edition.workspace = true
rust-version.workspace = true
license.workspace = true
authors.workspace = true
homepage.workspace = true
repository.workspace = true
publish.workspace = true
[dependencies]
# Internal dependencies
kingfisher-core = { path = "../kingfisher-core" }
# Serialization
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_yaml = "0.9"
schemars = "0.8"
# Error handling
anyhow = "1.0"
thiserror = "1.0"
# Regex
regex = "1.12"
lazy_static = "1.5"
# Hashing
xxhash-rust = { version = "0.8", features = ["xxh3"] }
# Liquid templating (for checksum validation)
liquid = "0.26"
liquid-core = "0.26"
# Crypto for liquid filters
base64 = "0.22"
crc32fast = "1.5"
hmac = "0.12"
sha1 = "0.10"
sha2 = "0.10"
percent-encoding = "2.3"
time = "0.3"
uuid = { version = "1.19", features = ["v4"] }
rand = "0.9"
# Vectorscan for pattern matching
vectorscan-rs = "0.0.5"
# Logging
tracing = "0.1"
# File walking
walkdir = "2.5"
ignore = "0.4"
# Embedded rule files
include_dir = "0.7"
[dev-dependencies]
pretty_assertions = "1.4"
proptest = "1.6"

View file

@ -80,4 +80,4 @@ rules:
- https://htmlpreview.github.io/?https://github.com/FiloSottile/age/blob/main/doc/age.1.html
- https://github.com/C2SP/C2SP/blob/037e546d164a89fd7577df2c18df80bb54bd246e/age.md#the-mlkem768-x25519-ie-x-wing-hybrid-post-quantum-recipient-type
categories:
- secret
- secret

View file

@ -2,9 +2,9 @@ rules:
- name: Alibaba Access Key ID
id: kingfisher.alibabacloud.1
pattern: |
(?xi)
(?x)
(
LTAI[a-z0-9]{17,21}
LTAI([a-zA-Z0-9]{12,20})
)
\b
pattern_requirements:

View file

@ -63,6 +63,8 @@ rules:
},
validation:
type: AWS
revocation:
type: AWS
depends_on_rule:
- rule_id: kingfisher.aws.1
variable: AKID
@ -107,7 +109,7 @@ rules:
- name: AWS Bedrock API Key (Short-lived)
id: kingfisher.aws.6
pattern: |
pattern: |
(?x)
(
bedrock-api-key-YmVkcm9jay5hbWF6b25hd3MuY29t[A-Za-z0-9+/]+={0,2}

View file

@ -30,4 +30,16 @@ rules:
status: [200]
- type: WordMatch
words: ['"uuid"', '"user"']
revocation:
type: Http
content:
request:
method: DELETE
url: https://api.buildkite.com/v2/access-token
headers:
Authorization: "Bearer {{ TOKEN }}"
response_matcher:
- report_response: true
- type: StatusMatch
status: [204]

View file

@ -2,12 +2,12 @@ rules:
- name: Docker Hub Personal Access Token
id: kingfisher.dockerhub.1
pattern: |
(?x)
(?xi)
\b
(
dckr_pat_[A-Za-z0-9_-]{27}
dckr_pat_[A-Z0-9_-]{27}
)
(?: $ | [^A-Za-z0-9_-] )
(?: $ | [^A-Z0-9_-] )
pattern_requirements:
min_digits: 2
min_entropy: 3.3
@ -17,30 +17,62 @@ rules:
- docker login -u gemesa -p dckr_pat_tkzBYxjNNC3R_Yg6jd_O-G8FbrJ
- docker login -u gemesa -p dckr_pat_1q8yKET1VDJTpfCwseUDzT8vFh-
references:
- https://docs.docker.com/reference/api/hub/latest/#tag/access-tokens/paths/~1v2~1access-tokens~1%7Buuid%7D/get
- https://docs.docker.com/reference/api/hub/latest/#tag/authentication-api/operation/AuthCreateAccessToken
depends_on_rule:
- rule_id: kingfisher.dockerhub.2
variable: DOCKER_USERNAME
validation:
type: Http
content:
request:
method: POST
url: https://hub.docker.com/v2/auth/token
headers:
Authorization: Bearer {{ TOKEN }}
Content-Type: application/json
Accept: application/json
method: GET
body: '{"identifier":"{{ DOCKER_USERNAME | json_escape }}","secret":"{{ TOKEN | json_escape }}"}'
response_matcher:
- report_response: true
- status:
- 200
type: StatusMatch
url: https://hub.docker.com/v2/access-tokens?page_size=1
- name: Docker Hub Organization Access Token
- type: StatusMatch
status: [200]
- type: WordMatch
words:
- '"access_token"'
- name: Docker Hub Username
id: kingfisher.dockerhub.2
pattern: |
(?x)
(?xi)
\b
docker
(?:.|[\n\r]){0,32}?
(?:
-u\s+
| --username[=\s]+
| user(?:name)?[=:\s]+["']?
)
(
[a-z0-9][a-z0-9_-]{2,29}
)
\b
min_entropy: 1.0
confidence: medium
visible: false
examples:
- docker login -u gemesa -p dckr_pat_xxx
- docker login --username=myuser
- DOCKER_USERNAME=myuser
- 'docker_user: "myuser"'
- name: Docker Hub Organization Access Token
id: kingfisher.dockerhub.3
pattern: |
(?xi)
\b
(
dckr_oat_[A-Za-z0-9_-]{32}
dckr_oat_[A-Z0-9_-]{32}
)
(?: $ | [^A-Za-z0-9_-] )
(?: $ | [^A-Z0-9_-] )
pattern_requirements:
min_digits: 2
min_entropy: 3.3

Some files were not shown because too many files have changed in this diff Show more