forked from mirrors/kingfisher
commit
2f41d159e2
360 changed files with 11759 additions and 4351 deletions
14
.github/workflows/ci.yml
vendored
14
.github/workflows/ci.yml
vendored
|
|
@ -18,11 +18,25 @@ jobs:
|
|||
runs-on: ubuntu-24.04-arm
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
# Free up disk space on Ubuntu runners
|
||||
- name: Free Disk Space
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /opt/ghc
|
||||
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
||||
sudo docker image prune --all --force
|
||||
df -h
|
||||
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: ${{ env.RUST_TOOLCHAIN }}
|
||||
profile: minimal
|
||||
override: true
|
||||
|
||||
- uses: swatinem/rust-cache@v2
|
||||
|
||||
- name: Build (Makefile linux-arm64)
|
||||
run: make ubuntu-arm64
|
||||
- name: Run tests
|
||||
|
|
|
|||
22
.github/workflows/release.yml
vendored
22
.github/workflows/release.yml
vendored
|
|
@ -27,6 +27,16 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
# Free up disk space on Ubuntu runners
|
||||
- name: Free Disk Space
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /opt/ghc
|
||||
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
||||
sudo docker image prune --all --force
|
||||
df -h
|
||||
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: ${{ env.RUST_TOOLCHAIN }}
|
||||
|
|
@ -81,12 +91,24 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
# Free up disk space on Ubuntu runners
|
||||
- name: Free Disk Space
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /opt/ghc
|
||||
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
||||
sudo docker image prune --all --force
|
||||
df -h
|
||||
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: ${{ env.RUST_TOOLCHAIN }}
|
||||
profile: minimal
|
||||
override: true
|
||||
|
||||
- uses: swatinem/rust-cache@v2
|
||||
|
||||
- name: Install packaging tools
|
||||
run: cargo install cargo-deb cargo-generate-rpm
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,14 @@
|
|||
- id: kingfisher-auto
|
||||
name: kingfisher (auto-install)
|
||||
description: Automatically downloads and caches the Kingfisher binary, then scans staged changes. No manual installation required.
|
||||
entry: scripts/kingfisher-pre-commit-auto.sh
|
||||
language: script
|
||||
pass_filenames: false
|
||||
stages: [commit]
|
||||
|
||||
- id: kingfisher-docker
|
||||
name: kingfisher (docker)
|
||||
description: Run Kingfisher in Docker against staged changes at the repository root. No local install required.
|
||||
description: Run Kingfisher in Docker against staged changes at the repository root. Requires Docker but no local install.
|
||||
entry: ghcr.io/mongodb/kingfisher:latest
|
||||
language: docker
|
||||
args: ["scan", ".", "--staged", "--quiet", "--no-update-check"]
|
||||
|
|
|
|||
12
CHANGELOG.md
12
CHANGELOG.md
|
|
@ -2,6 +2,18 @@
|
|||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
## [v1.77.0]
|
||||
- Added `kingfisher revoke` subcommand for revoking leaked credentials directly with the provider.
|
||||
- Added optional `revocation` section to rules to support credential revocation (currently supporting AWS, GCP, GitHub, GitLab, Slack, and Buildkite).
|
||||
- Added `kingfisher validate` subcommand to validate credentials without running a full scan.
|
||||
- Added `validate_command` and `revoke_command` fields to scan output (pretty, JSON, JSONL, BSON, SARIF formats) showing the exact `kingfisher validate` or `kingfisher revoke` command to run for each finding. The `validate_command` is included for all findings with validation support; `revoke_command` is included only for active credentials with revocation support. These fields are omitted when `--redact` is used since they contain the secret value.
|
||||
- Updated the HTML report viewer to display validate and revoke commands in the Finding Details panel with copy-to-clipboard functionality.
|
||||
- Refactored project into multiple crates for better modularity and maintainability.
|
||||
- Ensured more CLI arguments are global and available across all subcommands.
|
||||
- Added `kingfisher-auto` pre-commit hook that automatically downloads and caches the appropriate binary for your platform (no Docker or manual installation required).
|
||||
- Added Husky integration support with `install-husky.sh` helper script and documentation for Node.js projects.
|
||||
- Added `kingfisher-pre-commit-auto.sh` and `kingfisher-pre-commit-auto.ps1` scripts for automatic binary download in Git hooks (Linux, macOS, Windows support).
|
||||
|
||||
## [v1.76.0]
|
||||
- Fixed validation deduplication for rules with nested unnamed captures (e.g. `(?<REGEX>...(ABC|DEF)...)`) to use the primary capture for grouping, ensuring each unique match triggers a separate validation request.
|
||||
- Added trace-level (`-vv`) logging for internal validation dedup keys and grouping to aid debugging.
|
||||
|
|
|
|||
16
Cargo.toml
16
Cargo.toml
|
|
@ -1,3 +1,12 @@
|
|||
[workspace]
|
||||
members = [
|
||||
".",
|
||||
"crates/kingfisher-core",
|
||||
"crates/kingfisher-rules",
|
||||
"crates/kingfisher-scanner",
|
||||
]
|
||||
resolver = "2"
|
||||
|
||||
[workspace.package]
|
||||
edition = "2021"
|
||||
rust-version = "1.90"
|
||||
|
|
@ -10,7 +19,7 @@ publish = false
|
|||
|
||||
[package]
|
||||
name = "kingfisher"
|
||||
version = "1.76.0"
|
||||
version = "1.77.0"
|
||||
description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
|
|
@ -40,6 +49,11 @@ assets = [
|
|||
]
|
||||
|
||||
[dependencies]
|
||||
# Library crates
|
||||
kingfisher-core = { path = "crates/kingfisher-core" }
|
||||
kingfisher-rules = { path = "crates/kingfisher-rules" }
|
||||
kingfisher-scanner = { path = "crates/kingfisher-scanner" }
|
||||
|
||||
clap = { version = "4.5", features = [
|
||||
"cargo",
|
||||
"derive",
|
||||
|
|
|
|||
48
crates/kingfisher-core/Cargo.toml
Normal file
48
crates/kingfisher-core/Cargo.toml
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
[package]
|
||||
name = "kingfisher-core"
|
||||
version = "0.1.0"
|
||||
description = "Core types and traits for Kingfisher secret scanner"
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
license.workspace = true
|
||||
authors.workspace = true
|
||||
homepage.workspace = true
|
||||
repository.workspace = true
|
||||
publish.workspace = true
|
||||
|
||||
[dependencies]
|
||||
# Serialization
|
||||
serde = { version = "1.0", features = ["derive", "rc"] }
|
||||
serde_json = "1.0"
|
||||
schemars = "0.8"
|
||||
|
||||
# Error handling
|
||||
anyhow = "1.0"
|
||||
thiserror = "1.0"
|
||||
|
||||
# Hashing and crypto
|
||||
sha1 = "0.10"
|
||||
hex = "0.4"
|
||||
|
||||
# Memory management
|
||||
memmap2 = "0.9"
|
||||
once_cell = "1.21"
|
||||
parking_lot = "0.12"
|
||||
|
||||
# Collections
|
||||
smallvec = { version = "1", features = ["const_generics", "const_new", "union"] }
|
||||
rustc-hash = "2.1"
|
||||
dashmap = "6.1"
|
||||
|
||||
# Byte string handling
|
||||
bstr = { version = "1.12", features = ["serde"] }
|
||||
memchr = "2.7"
|
||||
|
||||
# Git types (minimal, for ObjectId and Time)
|
||||
gix = { version = "0.73", default-features = false, features = ["serde"] }
|
||||
|
||||
# Console formatting
|
||||
console = "0.15"
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = "1.4"
|
||||
495
crates/kingfisher-core/src/blob.rs
Normal file
495
crates/kingfisher-core/src/blob.rs
Normal file
|
|
@ -0,0 +1,495 @@
|
|||
//! Blob representation for scannable content.
|
||||
//!
|
||||
//! A [`Blob`] represents content that can be scanned for secrets. It can be
|
||||
//! created from:
|
||||
//! - In-memory bytes ([`Blob::from_bytes`])
|
||||
//! - A file path ([`Blob::from_file`])
|
||||
//! - Borrowed data ([`Blob::from_borrowed`])
|
||||
//!
|
||||
//! Every non-empty file is currently memory-mapped, because the size threshold for mapping is zero.
|
||||
|
||||
use std::{
|
||||
convert::TryInto,
|
||||
fs::File,
|
||||
io::{Read, Write},
|
||||
path::Path,
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
};
|
||||
|
||||
use bstr::{BString, ByteSlice};
|
||||
use gix::ObjectId;
|
||||
use once_cell::sync::OnceCell;
|
||||
use parking_lot::Mutex;
|
||||
use rustc_hash::FxHashMap;
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sha1::{Digest, Sha1};
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::git_commit_metadata::CommitMetadata;
|
||||
|
||||
/// Threshold above which files are memory-mapped instead of read into memory.
|
||||
const LARGE_FILE_THRESHOLD: u64 = 0; // Currently: always mmap
|
||||
|
||||
/// Global counter for temporary blob IDs.
|
||||
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
|
||||
|
||||
/// Tracks where a blob was seen in git history.
|
||||
#[derive(Clone, Debug, serde::Serialize)]
|
||||
pub struct BlobAppearance {
|
||||
/// Metadata about the commit where this blob appeared.
|
||||
pub commit_metadata: Arc<CommitMetadata>,
|
||||
|
||||
/// The path of the blob within the repository.
|
||||
pub path: BString,
|
||||
}
|
||||
|
||||
impl BlobAppearance {
|
||||
/// Returns the path as a `&Path`, if it's valid UTF-8.
|
||||
#[inline]
|
||||
pub fn path(&self) -> std::result::Result<&Path, bstr::Utf8Error> {
|
||||
self.path.to_path()
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of [`BlobAppearance`] entries, optimized for the common case of a single appearance.
|
||||
pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
|
||||
|
||||
/// The underlying data storage for a [`Blob`].
|
||||
pub enum BlobData<'a> {
|
||||
/// Small blobs stored as owned bytes.
|
||||
Owned(Vec<u8>),
|
||||
|
||||
/// Large blobs that are memory-mapped from disk.
|
||||
Mapped(memmap2::Mmap),
|
||||
|
||||
/// Borrowed bytes (e.g., from a git pack file).
|
||||
Borrowed(&'a [u8]),
|
||||
}
|
||||
|
||||
impl<'a> AsRef<[u8]> for BlobData<'a> {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
match self {
|
||||
BlobData::Owned(v) => v,
|
||||
BlobData::Mapped(m) => m,
|
||||
BlobData::Borrowed(slice) => slice,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> BlobData<'a> {
|
||||
/// Returns the length of the blob data in bytes.
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
self.as_ref().len()
|
||||
}
|
||||
|
||||
/// Returns true if the blob data is empty.
|
||||
#[inline]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.as_ref().is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
/// A scannable blob of content.
|
||||
///
|
||||
/// `Blob` is the primary type for representing content to be scanned. It lazily
|
||||
/// computes a content-based ID (SHA-1) and supports multiple backing storage types.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use kingfisher_core::Blob;
|
||||
///
|
||||
/// // Create from bytes
|
||||
/// let blob = Blob::from_bytes(b"my secret content".to_vec());
|
||||
/// assert_eq!(blob.len(), 17);
|
||||
///
|
||||
/// // Create from file
|
||||
/// // let blob = Blob::from_file("path/to/file.txt")?;
|
||||
/// ```
|
||||
pub struct Blob<'a> {
|
||||
/// Lazily computed content-based ID.
|
||||
id: OnceCell<BlobId>,
|
||||
/// The underlying data.
|
||||
data: BlobData<'a>,
|
||||
/// Temporary ID assigned at creation (for debugging/tracking).
|
||||
temp_id: u64,
|
||||
}
|
||||
|
||||
impl Blob<'_> {
|
||||
/// Create a new `Blob` by reading from a file.
|
||||
///
|
||||
/// Large files are automatically memory-mapped for efficiency.
|
||||
#[inline]
|
||||
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
let mut file = File::open(&path)?;
|
||||
let file_size = file.metadata()?.len();
|
||||
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
if file_size > LARGE_FILE_THRESHOLD {
|
||||
// Large files: one mmap, zero extra copies.
|
||||
let mmap = unsafe { memmap2::Mmap::map(&file)? };
|
||||
Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id })
|
||||
} else {
|
||||
// Small files: read into memory.
|
||||
let mut bytes = Vec::with_capacity(file_size as usize);
|
||||
file.read_to_end(&mut bytes)?;
|
||||
Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id })
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `Blob` from a vector of bytes.
|
||||
#[inline]
|
||||
pub fn from_bytes(bytes: Vec<u8>) -> Self {
|
||||
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
|
||||
Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }
|
||||
}
|
||||
|
||||
/// Create a new `Blob` with a pre-computed ID and owned data.
|
||||
#[inline]
|
||||
pub fn new(id: BlobId, bytes: Vec<u8>) -> Self {
|
||||
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
|
||||
let cell = OnceCell::new();
|
||||
let _ = cell.set(id);
|
||||
Blob { id: cell, data: BlobData::Owned(bytes), temp_id }
|
||||
}
|
||||
|
||||
/// Returns the blob's content as a byte slice.
|
||||
#[inline]
|
||||
pub fn bytes(&self) -> &[u8] {
|
||||
self.data.as_ref()
|
||||
}
|
||||
|
||||
/// Lazily computes and returns the blob's content-based [`BlobId`].
|
||||
#[inline]
|
||||
pub fn id(&self) -> BlobId {
|
||||
*self.id.get_or_init(|| BlobId::new(self.bytes()))
|
||||
}
|
||||
|
||||
/// Returns a reference to the blob's [`BlobId`], computing it if necessary.
|
||||
#[inline]
|
||||
pub fn id_ref(&self) -> &BlobId {
|
||||
self.id.get_or_init(|| BlobId::new(self.bytes()))
|
||||
}
|
||||
|
||||
/// Returns the temporary ID assigned when this blob was created.
|
||||
#[inline]
|
||||
pub fn temp_id(&self) -> u64 {
|
||||
self.temp_id
|
||||
}
|
||||
|
||||
/// Returns the length of the blob in bytes.
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
self.bytes().len()
|
||||
}
|
||||
|
||||
/// Returns true if the blob is empty.
|
||||
#[inline]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.bytes().is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Blob<'a> {
|
||||
/// Create a new `Blob` from borrowed bytes.
|
||||
///
|
||||
/// This is useful for zero-copy scanning of data that already exists
|
||||
/// in memory (e.g., from a git pack file).
|
||||
#[inline]
|
||||
pub fn from_borrowed(bytes: &'a [u8]) -> Self {
|
||||
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
|
||||
Blob { id: OnceCell::new(), data: BlobData::Borrowed(bytes), temp_id }
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Blob<'_> {
|
||||
fn drop(&mut self) {
|
||||
// For owned data, clear and shrink to free memory promptly.
|
||||
if let BlobData::Owned(ref mut v) = self.data {
|
||||
v.clear();
|
||||
v.shrink_to_fit();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A content-based identifier for a blob, computed as a Git-compatible SHA-1 hash.
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone, Serialize)]
|
||||
#[serde(into = "String")]
|
||||
pub struct BlobId([u8; 20]);
|
||||
|
||||
impl BlobId {
|
||||
/// Creates a zero-filled (default) `BlobId`.
|
||||
pub fn default() -> Self {
|
||||
BlobId([0; 20])
|
||||
}
|
||||
|
||||
/// Computes a `BlobId` from raw bytes.
|
||||
///
|
||||
/// For large inputs, only the first and last 64KB are hashed for performance.
|
||||
#[inline]
|
||||
pub fn new(input: &[u8]) -> Self {
|
||||
const CHUNK: usize = 64 * 1024; // 64KB from start and end
|
||||
let mut hasher = Sha1::new();
|
||||
write!(&mut hasher, "blob {}\0", input.len()).unwrap();
|
||||
if input.len() <= CHUNK * 2 {
|
||||
hasher.update(input);
|
||||
} else {
|
||||
hasher.update(&input[..CHUNK]);
|
||||
hasher.update(&input[input.len() - CHUNK..]);
|
||||
}
|
||||
let digest: [u8; 20] = hasher.finalize().into();
|
||||
BlobId(digest)
|
||||
}
|
||||
|
||||
/// Computes a `BlobId` from the complete bytes (no truncation).
|
||||
pub fn compute_from_bytes(bytes: &[u8]) -> Self {
|
||||
let mut hasher = Sha1::new();
|
||||
write!(&mut hasher, "blob {}\0", bytes.len()).unwrap();
|
||||
hasher.update(bytes);
|
||||
let digest: [u8; 20] = hasher.finalize().into();
|
||||
BlobId(digest)
|
||||
}
|
||||
|
||||
/// Parses a `BlobId` from a hex string.
|
||||
#[inline]
|
||||
pub fn from_hex(v: &str) -> crate::Result<Self> {
|
||||
let bytes = hex::decode(v)?;
|
||||
let arr: [u8; 20] =
|
||||
bytes.as_slice().try_into().map_err(|_| crate::Error::InvalidBlobId(v.to_string()))?;
|
||||
Ok(BlobId(arr))
|
||||
}
|
||||
|
||||
/// Returns the blob ID as a hex string.
|
||||
#[inline]
|
||||
pub fn hex(&self) -> String {
|
||||
hex::encode(self.0)
|
||||
}
|
||||
|
||||
/// Returns the raw bytes of the blob ID.
|
||||
#[inline]
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for BlobId {
|
||||
fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
|
||||
struct Vis;
|
||||
impl serde::de::Visitor<'_> for Vis {
|
||||
type Value = BlobId;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
formatter.write_str("a 40-character hex string")
|
||||
}
|
||||
|
||||
fn visit_str<E: serde::de::Error>(
|
||||
self,
|
||||
v: &str,
|
||||
) -> std::result::Result<Self::Value, E> {
|
||||
BlobId::from_hex(v).map_err(|e| serde::de::Error::custom(e))
|
||||
}
|
||||
}
|
||||
d.deserialize_str(Vis)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for BlobId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "BlobId({})", self.hex())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BlobId {
|
||||
#[inline]
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.hex())
|
||||
}
|
||||
}
|
||||
|
||||
impl JsonSchema for BlobId {
|
||||
fn schema_name() -> String {
|
||||
"BlobId".into()
|
||||
}
|
||||
|
||||
fn json_schema(gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
|
||||
let s = String::json_schema(gen);
|
||||
let mut o = s.into_object();
|
||||
o.string().pattern = Some("[0-9a-f]{40}".into());
|
||||
let md = o.metadata();
|
||||
md.description = Some("A hex-encoded blob ID as computed by Git".into());
|
||||
schemars::schema::Schema::Object(o)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<BlobId> for String {
|
||||
#[inline]
|
||||
fn from(blob_id: BlobId) -> String {
|
||||
blob_id.hex()
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&str> for BlobId {
|
||||
type Error = crate::Error;
|
||||
|
||||
#[inline]
|
||||
fn try_from(s: &str) -> std::result::Result<Self, Self::Error> {
|
||||
BlobId::from_hex(s)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a gix::ObjectId> for BlobId {
|
||||
#[inline]
|
||||
fn from(id: &'a gix::ObjectId) -> Self {
|
||||
BlobId(id.as_bytes().try_into().expect("oid should be a 20-byte value"))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<gix::ObjectId> for BlobId {
|
||||
#[inline]
|
||||
fn from(id: gix::ObjectId) -> Self {
|
||||
BlobId(id.as_bytes().try_into().expect("oid should be a 20-byte value"))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a BlobId> for gix::ObjectId {
    /// Builds a git object ID from the 20 raw bytes of a `BlobId`.
    #[inline]
    fn from(blob_id: &'a BlobId) -> Self {
        let raw: &[u8] = &blob_id.0;
        // A `BlobId` always holds exactly 20 bytes, so this unwrap is not
        // expected to fire.
        gix::hash::ObjectId::try_from(raw).unwrap()
    }
}
|
||||
|
||||
impl From<BlobId> for gix::ObjectId {
    /// Builds a git object ID from the 20 raw bytes of an owned `BlobId`.
    #[inline]
    fn from(blob_id: BlobId) -> Self {
        let raw: &[u8] = &blob_id.0;
        // A `BlobId` always holds exactly 20 bytes, so this unwrap is not
        // expected to fire.
        gix::hash::ObjectId::try_from(raw).unwrap()
    }
}
|
||||
|
||||
/// A concurrent map with [`BlobId`] keys, optimized for low contention.
|
||||
///
|
||||
/// This implementation uses 256 shards (based on the first byte of the blob ID)
|
||||
/// to minimize lock contention during parallel scanning.
|
||||
pub struct BlobIdMap<V> {
|
||||
maps: [Mutex<FxHashMap<ObjectId, V>>; 256],
|
||||
}
|
||||
|
||||
impl<V> BlobIdMap<V> {
|
||||
/// Creates a new empty `BlobIdMap`.
|
||||
pub fn new() -> Self {
|
||||
BlobIdMap { maps: std::array::from_fn(|_| Mutex::new(FxHashMap::default())) }
|
||||
}
|
||||
|
||||
/// Inserts a value, returning the previous value if one existed.
|
||||
#[inline]
|
||||
pub fn insert(&self, blob_id: BlobId, v: V) -> Option<V> {
|
||||
let idx = blob_id.as_bytes()[0] as usize;
|
||||
self.maps[idx].lock().insert(blob_id.into(), v)
|
||||
}
|
||||
|
||||
/// Returns true if the map contains the given key.
|
||||
#[inline]
|
||||
pub fn contains_key(&self, blob_id: &BlobId) -> bool {
|
||||
let idx = blob_id.as_bytes()[0] as usize;
|
||||
self.maps[idx].lock().contains_key(&ObjectId::from(blob_id))
|
||||
}
|
||||
|
||||
/// Returns the total number of entries in the map.
|
||||
///
|
||||
/// Note: This is not a cheap operation as it must lock all shards.
|
||||
pub fn len(&self) -> usize {
|
||||
self.maps.iter().map(|m| m.lock().len()).sum()
|
||||
}
|
||||
|
||||
/// Returns true if the map is empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.maps.iter().all(|m| m.lock().is_empty())
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Copy> BlobIdMap<V> {
|
||||
/// Gets a copy of the value for the given key.
|
||||
#[inline]
|
||||
pub fn get(&self, blob_id: &BlobId) -> Option<V> {
|
||||
let idx = blob_id.as_bytes()[0] as usize;
|
||||
self.maps[idx].lock().get(&ObjectId::from(blob_id)).copied()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V> Default for BlobIdMap<V> {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata about a blob.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, JsonSchema)]
|
||||
pub struct BlobMetadata {
|
||||
/// The blob's content-based ID.
|
||||
pub id: BlobId,
|
||||
|
||||
/// The length of the blob in bytes.
|
||||
pub num_bytes: usize,
|
||||
|
||||
/// The guessed MIME type of the blob (e.g., "text/plain").
|
||||
pub mime_essence: Option<String>,
|
||||
|
||||
/// The guessed programming language of the blob (e.g., "Python").
|
||||
pub language: Option<String>,
|
||||
}
|
||||
|
||||
impl BlobMetadata {
|
||||
/// Returns the size in bytes.
|
||||
#[inline]
|
||||
pub fn num_bytes(&self) -> usize {
|
||||
self.num_bytes
|
||||
}
|
||||
|
||||
/// Returns the size in megabytes, rounded to 3 decimal places.
|
||||
#[inline]
|
||||
pub fn num_megabytes(&self) -> f64 {
|
||||
let mb = self.num_bytes as f64 / 1_048_576.0;
|
||||
format!("{:.3}", mb).parse::<f64>().unwrap_or(mb)
|
||||
}
|
||||
|
||||
/// Returns the MIME essence if known.
|
||||
#[inline]
|
||||
pub fn mime_essence(&self) -> Option<&str> {
|
||||
self.mime_essence.as_deref()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty input hashes to the expected fixed ID.
    #[test]
    fn test_blob_id_empty() {
        let id = BlobId::new(b"");
        assert_eq!(id.hex(), "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391");
    }

    /// 1 KiB of zero bytes hashes to the expected fixed ID.
    #[test]
    fn test_blob_id_small() {
        let zeros = [0u8; 1024];
        let id = BlobId::new(&zeros);
        assert_eq!(id.hex(), "06d7405020018ddf3cacee90fd4af10487da3d20");
    }

    /// A blob built from bytes reports those bytes and their length.
    #[test]
    fn test_blob_from_bytes() {
        let content = b"hello world";
        let blob = Blob::from_bytes(content.to_vec());
        assert_eq!(blob.len(), 11);
        assert_eq!(blob.bytes(), content);
    }

    /// Hex encoding followed by parsing yields the original ID.
    #[test]
    fn test_blob_id_roundtrip() {
        let original = BlobId::new(b"test data");
        let parsed = BlobId::from_hex(&original.hex()).unwrap();
        assert_eq!(original, parsed);
    }
}
|
||||
128
crates/kingfisher-core/src/bstring_escape.rs
Normal file
128
crates/kingfisher-core/src/bstring_escape.rs
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
//! Safe string escaping utilities.
|
||||
//!
|
||||
//! This module provides utilities for safely displaying byte strings that may
|
||||
//! contain non-UTF8 data, ANSI escape codes, or control characters.
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
fmt::{Display, Formatter},
|
||||
};
|
||||
|
||||
use console::strip_ansi_codes;
|
||||
|
||||
/// Escapes non-printing characters in a string while preserving whitespace.
///
/// Whitespace characters (including `\n` and `\t`) and all non-control
/// characters pass through unchanged. A bare ESC (`\x1B`) is dropped; only the
/// ESC character itself is removed here, not any bytes that follow it. Every
/// other control character is replaced by its `\u{..}` escape.
///
/// Returns borrowed data when nothing needs changing, avoiding an allocation.
fn escape_nonprinting(s: &str) -> Cow<'_, str> {
    // Fast path: nothing to rewrite if there are no control characters other
    // than whitespace.
    if s.chars().all(|ch| !ch.is_control() || ch.is_whitespace()) {
        return Cow::Borrowed(s);
    }

    // Reserve extra room because each escape is longer than the character it replaces.
    let mut escaped = String::with_capacity(s.len() * 2);

    for ch in s.chars() {
        match ch {
            // Drop stray ESC characters entirely.
            '\x1B' => {}
            // Replace remaining non-whitespace control characters with `\u{..}`.
            c if c.is_control() && !c.is_whitespace() => escaped.extend(c.escape_unicode()),
            // Everything else is copied as-is.
            c => escaped.push(c),
        }
    }

    Cow::Owned(escaped)
}
|
||||
|
||||
/// Borrowed bytes that format safely for display.
///
/// Formatting an `Escaped` value with `Display`:
/// 1. decodes the bytes as UTF-8, substituting the replacement character for
///    invalid sequences,
/// 2. strips ANSI escape codes,
/// 3. escapes the control characters that remain, leaving whitespace intact.
///
/// # Examples
///
/// ```
/// use kingfisher_core::Escaped;
///
/// let bytes = b"Hello\x00World";
/// let escaped = Escaped(bytes);
/// assert_eq!(escaped.to_string(), "Hello\\u{0}World");
/// ```
#[derive(Clone, Copy, Debug)]
pub struct Escaped<'a>(pub &'a [u8]);
|
||||
|
||||
impl Display for Escaped<'_> {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
// First handle UTF-8 decoding with replacement characters
|
||||
let decoded = String::from_utf8_lossy(self.0);
|
||||
// Then strip ANSI sequences and escape control chars
|
||||
let stripped = strip_ansi_codes(&decoded);
|
||||
let escaped = escape_nonprinting(&stripped);
|
||||
f.write_str(&escaped)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain text is returned borrowed and unchanged.
    #[test]
    fn test_escape_normal_text() {
        let result = escape_nonprinting("Hello, World!");
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result, "Hello, World!");
    }

    /// Newlines, tabs and spaces do not trigger escaping.
    #[test]
    fn test_escape_with_whitespace() {
        let result = escape_nonprinting("Hello\n\t World!");
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result, "Hello\n\t World!");
    }

    /// Non-whitespace control characters become `\u{..}` escapes.
    #[test]
    fn test_escape_control_chars() {
        let result = escape_nonprinting("Hello\x00World\x01");
        assert!(matches!(result, Cow::Owned(_)));
        assert_eq!(result, "Hello\\u{0}World\\u{1}");
    }

    /// Ordinary ASCII bytes display unchanged.
    #[test]
    fn test_escaped_struct_simple() {
        let shown = Escaped(b"Hello World").to_string();
        assert_eq!(shown, "Hello World");
    }

    /// ANSI color sequences are removed from the output.
    #[test]
    fn test_escaped_struct_ansi_codes() {
        let shown = Escaped(b"\x1b[31mRed\x1b[0m \x1b[32mGreen\x1b[0m").to_string();
        assert_eq!(shown, "Red Green");
    }

    /// Invalid UTF-8 bytes are shown as replacement characters.
    #[test]
    fn test_escaped_struct_invalid_utf8() {
        let shown = Escaped(b"Hello\xFF\xFEWorld").to_string();
        assert_eq!(shown, "Hello\u{FFFD}\u{FFFD}World");
    }

    /// Empty input displays as the empty string.
    #[test]
    fn test_escaped_struct_empty() {
        let shown = Escaped(b"").to_string();
        assert_eq!(shown, "");
    }
}
|
||||
89
crates/kingfisher-core/src/entropy.rs
Normal file
89
crates/kingfisher-core/src/entropy.rs
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
//! Shannon entropy calculation.
|
||||
//!
|
||||
//! Entropy is used to filter out low-entropy strings that are unlikely
|
||||
//! to be real secrets.
|
||||
|
||||
/// Calculates the Shannon entropy of a byte slice, in bits per byte.
///
/// Returns a value between 0.0 (empty input, or every byte identical) and 8.0
/// (all 256 byte values equally frequent). Typical thresholds for secret
/// detection are around 3.5-4.5.
///
/// # Examples
///
/// ```
/// use kingfisher_core::calculate_shannon_entropy;
///
/// // Low entropy (repeated character)
/// let entropy = calculate_shannon_entropy(b"aaaaaaaaaa");
/// assert!(entropy < 0.1);
///
/// // High entropy (random-looking)
/// let entropy = calculate_shannon_entropy(b"j2k#9K$mL*p&vN3");
/// assert!(entropy > 3.5);
/// ```
pub fn calculate_shannon_entropy(bytes: &[u8]) -> f32 {
    if bytes.is_empty() {
        return 0.0;
    }

    // Count occurrences of each byte value (0-255). 64-bit counters cannot
    // overflow for any slice that fits in memory, unlike 32-bit ones, which
    // would wrap (or panic in debug builds) once one byte value occurs 2^32 times.
    let mut counts = [0u64; 256];
    for &byte in bytes {
        counts[usize::from(byte)] += 1;
    }

    let total_bytes = bytes.len() as f32;

    // Sum -p * log2(p) over the byte values that actually occur.
    counts.iter().filter(|&&count| count > 0).fold(0.0, |entropy, &count| {
        let probability = count as f32 / total_bytes;
        entropy - probability * probability.log2()
    })
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input yields exactly zero.
    #[test]
    fn test_entropy_empty() {
        let entropy = calculate_shannon_entropy(b"");
        assert!(entropy.is_finite());
        assert_eq!(entropy, 0.0);
    }

    /// A single repeated byte value yields exactly zero.
    #[test]
    fn test_entropy_uniform() {
        let entropy = calculate_shannon_entropy(&[65u8; 4]);
        assert!(entropy.is_finite());
        assert_eq!(entropy, 0.0);
    }

    /// Two byte values in equal proportion yield one bit.
    #[test]
    fn test_entropy_two_values() {
        let entropy = calculate_shannon_entropy(&[1, 2, 1, 2]);
        assert!(entropy.is_finite());
        assert!((entropy - 1.0).abs() < 0.0001);
    }

    /// A typical password lands above 2.5 bits.
    #[test]
    fn test_entropy_password() {
        let entropy = calculate_shannon_entropy(b"Password123!");
        assert!(entropy.is_finite());
        assert!(entropy > 2.5);
    }

    /// A random-looking string lands above 3.5 bits.
    #[test]
    fn test_entropy_random() {
        let entropy = calculate_shannon_entropy(b"j2k#9K$mL*p&vN3");
        assert!(entropy.is_finite());
        assert!(entropy > 3.5);
    }
}
|
||||
36
crates/kingfisher-core/src/error.rs
Normal file
36
crates/kingfisher-core/src/error.rs
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
//! Error types for kingfisher-core.
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
/// The primary error type for kingfisher-core operations.
///
/// The `Display` text of every variant is taken from its `#[error(...)]`
/// attribute; variants marked `#[from]` can be produced from the wrapped
/// error type via `From` (and therefore via `?`).
#[derive(Error, Debug)]
pub enum Error {
    /// An I/O error occurred.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Failed to parse a blob ID from hex.
    ///
    /// Carries a free-form message describing the problem.
    #[error("Invalid blob ID: {0}")]
    InvalidBlobId(String),

    /// A hex decoding error occurred.
    #[error("Hex decode error: {0}")]
    HexDecode(#[from] hex::FromHexError),

    /// Failed to open or read a Git repository.
    ///
    /// The underlying error is stored as its rendered message only.
    #[error("Git error: {0}")]
    Git(String),

    /// A generic error with a message.
    #[error("{0}")]
    Other(String),
}
|
||||
|
||||
impl From<gix::open::Error> for Error {
|
||||
fn from(e: gix::open::Error) -> Self {
|
||||
Error::Git(e.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// A specialized Result type for kingfisher-core operations.
///
/// Shorthand for `std::result::Result<T, Error>` with this crate's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
|
||||
159
crates/kingfisher-core/src/git_commit_metadata.rs
Normal file
159
crates/kingfisher-core/src/git_commit_metadata.rs
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
//! Git commit metadata types.
|
||||
//!
|
||||
//! This module provides types for tracking commit information associated
|
||||
//! with blobs found in git history.
|
||||
|
||||
use gix::{date::Time, ObjectId};
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
// Wrapper for serializing gix::date::Time as text.
//
// `#[serde(remote = "Time")]` makes the derives target the foreign `Time`
// type, so a field can opt in with `#[serde(with = "TextTime")]`. The actual
// conversion is delegated to the functions in the `text_time` module.
#[repr(transparent)]
#[derive(Serialize, Deserialize, Copy, Clone)]
#[serde(remote = "Time")]
struct TextTime(
    #[serde(
        getter = "text_time::getter",
        serialize_with = "text_time::serialize",
        deserialize_with = "text_time::deserialize"
    )]
    Time,
);
|
||||
|
||||
impl From<TextTime> for Time {
|
||||
fn from(v: TextTime) -> Self {
|
||||
v.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Time> for TextTime {
|
||||
fn from(v: Time) -> Self {
|
||||
Self(v)
|
||||
}
|
||||
}
|
||||
|
||||
mod text_time {
|
||||
use super::*;
|
||||
|
||||
#[inline]
|
||||
pub fn getter(v: &Time) -> &Time {
|
||||
v
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn serialize<S: serde::Serializer>(v: &Time, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
serializer.collect_str(v)
|
||||
}
|
||||
|
||||
pub fn deserialize<'de, D: serde::Deserializer<'de>>(d: D) -> Result<Time, D::Error> {
|
||||
struct Vis;
|
||||
impl<'a> serde::de::Visitor<'a> for Vis {
|
||||
type Value = Time;
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
formatter.write_str("a string representing a Git timestamp")
|
||||
}
|
||||
fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<Self::Value, E> {
|
||||
gix::date::parse(v, None).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
d.deserialize_str(Vis)
|
||||
}
|
||||
}
|
||||
|
||||
impl JsonSchema for TextTime {
|
||||
fn schema_name() -> String {
|
||||
"Time".into()
|
||||
}
|
||||
|
||||
fn json_schema(gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
|
||||
String::json_schema(gen)
|
||||
}
|
||||
}
|
||||
|
||||
// Wrapper for serializing gix::ObjectId as hex.
//
// `#[serde(remote = "ObjectId")]` makes the derives target the foreign
// `ObjectId` type, so a field can opt in with `#[serde(with = "HexObjectId")]`.
// The actual conversion is delegated to the functions in `hex_object_id`.
#[repr(transparent)]
#[derive(Serialize, Deserialize, Copy, Clone)]
#[serde(remote = "ObjectId")]
struct HexObjectId(
    #[serde(
        getter = "hex_object_id::getter",
        serialize_with = "hex_object_id::serialize",
        deserialize_with = "hex_object_id::deserialize"
    )]
    ObjectId,
);
|
||||
|
||||
impl From<ObjectId> for HexObjectId {
|
||||
fn from(v: ObjectId) -> Self {
|
||||
HexObjectId(v)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<HexObjectId> for ObjectId {
|
||||
fn from(v: HexObjectId) -> Self {
|
||||
v.0
|
||||
}
|
||||
}
|
||||
|
||||
mod hex_object_id {
|
||||
use super::*;
|
||||
|
||||
#[inline]
|
||||
pub fn getter(v: &ObjectId) -> &ObjectId {
|
||||
v
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn serialize<S: serde::Serializer>(v: &ObjectId, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
serializer.collect_str(&v.to_hex())
|
||||
}
|
||||
|
||||
pub fn deserialize<'de, D: serde::Deserializer<'de>>(d: D) -> Result<ObjectId, D::Error> {
|
||||
struct Vis;
|
||||
impl<'a> serde::de::Visitor<'a> for Vis {
|
||||
type Value = ObjectId;
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
formatter.write_str("a 40-character hex string representing a Git object ID")
|
||||
}
|
||||
fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<Self::Value, E> {
|
||||
ObjectId::from_hex(v.as_bytes()).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
d.deserialize_str(Vis)
|
||||
}
|
||||
}
|
||||
|
||||
impl JsonSchema for HexObjectId {
|
||||
fn schema_name() -> String {
|
||||
"ObjectId".into()
|
||||
}
|
||||
|
||||
fn json_schema(gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
|
||||
let s = String::json_schema(gen);
|
||||
let mut o = s.into_object();
|
||||
o.string().pattern = Some("[0-9a-f]{40}".into());
|
||||
let md = o.metadata();
|
||||
md.description = Some("A hex-encoded object ID as computed by Git".into());
|
||||
schemars::schema::Schema::Object(o)
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata about a Git commit.
///
/// This is used to track the provenance of blobs found in git history.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
pub struct CommitMetadata {
    /// The SHA-1 commit ID.
    ///
    /// (De)serialized as a hex string through the `HexObjectId` wrapper.
    #[serde(with = "HexObjectId")]
    pub commit_id: ObjectId,

    /// The committer's name.
    pub committer_name: String,

    /// The committer's email address.
    pub committer_email: String,

    /// The commit timestamp.
    ///
    /// (De)serialized as text through the `TextTime` wrapper.
    #[serde(with = "TextTime")]
    pub committer_timestamp: Time,
}
|
||||
27
crates/kingfisher-core/src/lib.rs
Normal file
27
crates/kingfisher-core/src/lib.rs
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
//! `kingfisher-core` provides the foundational types and traits shared across
|
||||
//! the Kingfisher secret scanning library.
|
||||
//!
|
||||
//! This crate contains:
|
||||
//! - [`Blob`] - Representation of scannable content (files, buffers, git objects)
|
||||
//! - [`Location`] - Source location tracking (byte offsets and line/column)
|
||||
//! - [`Origin`] - Provenance tracking (where content came from)
|
||||
//! - Utility functions for entropy calculation, string escaping, etc.
|
||||
|
||||
pub mod blob;
|
||||
pub mod bstring_escape;
|
||||
pub mod entropy;
|
||||
pub mod error;
|
||||
pub mod git_commit_metadata;
|
||||
pub mod location;
|
||||
pub mod origin;
|
||||
|
||||
// Re-export commonly used types at the crate root
|
||||
pub use blob::{
|
||||
Blob, BlobAppearance, BlobAppearanceSet, BlobData, BlobId, BlobIdMap, BlobMetadata,
|
||||
};
|
||||
pub use bstring_escape::Escaped;
|
||||
pub use entropy::calculate_shannon_entropy;
|
||||
pub use error::{Error, Result};
|
||||
pub use git_commit_metadata::CommitMetadata;
|
||||
pub use location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan};
|
||||
pub use origin::{CommitOrigin, ExtendedOrigin, FileOrigin, GitRepoOrigin, Origin, OriginSet};
|
||||
296
crates/kingfisher-core/src/location.rs
Normal file
296
crates/kingfisher-core/src/location.rs
Normal file
|
|
@ -0,0 +1,296 @@
|
|||
//! Source location tracking.
|
||||
//!
|
||||
//! This module provides types for tracking locations within source content:
|
||||
//! - [`OffsetSpan`] - Byte offset ranges
|
||||
//! - [`SourceSpan`] - Line/column ranges
|
||||
//! - [`Location`] - Combined byte and source location
|
||||
//! - [`LocationMapping`] - Efficient offset-to-line/column conversion
|
||||
|
||||
use core::ops::Range;
|
||||
use std::cell::RefCell;
|
||||
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// A point defined by a byte offset.
///
/// Thin newtype around `usize`; the wrapped offset is public.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, Copy, Clone)]
pub struct OffsetPoint(pub usize);
|
||||
|
||||
impl OffsetPoint {
|
||||
#[inline]
|
||||
pub fn new(idx: usize) -> Self {
|
||||
OffsetPoint(idx)
|
||||
}
|
||||
}
|
||||
|
||||
/// A span defined by two byte offsets (half-open interval `[start, end)`).
///
/// Spans are intended to be non-empty, but nothing in the type enforces it:
/// `is_empty` reports spans whose `start` is not below `end`.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
pub struct OffsetSpan {
    /// Inclusive start offset, in bytes.
    pub start: usize,
    /// Exclusive end offset, in bytes.
    pub end: usize,
}
|
||||
|
||||
impl std::fmt::Display for OffsetSpan {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}-{}", self.start, self.end)
|
||||
}
|
||||
}
|
||||
|
||||
impl OffsetSpan {
|
||||
/// Creates an `OffsetSpan` from two `OffsetPoint`s.
|
||||
#[inline]
|
||||
pub fn from_offsets(start: OffsetPoint, end: OffsetPoint) -> Self {
|
||||
OffsetSpan { start: start.0, end: end.0 }
|
||||
}
|
||||
|
||||
/// Creates an `OffsetSpan` from a `Range<usize>`.
|
||||
#[inline]
|
||||
pub fn from_range(range: Range<usize>) -> Self {
|
||||
OffsetSpan { start: range.start, end: range.end }
|
||||
}
|
||||
|
||||
/// Returns the length in bytes.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn len(&self) -> usize {
|
||||
self.end.saturating_sub(self.start)
|
||||
}
|
||||
|
||||
/// Returns true if empty or inverted.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.start >= self.end
|
||||
}
|
||||
|
||||
/// Returns true if `other` lies entirely within `self`.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn fully_contains(&self, other: &Self) -> bool {
|
||||
self.start <= other.start && other.end <= self.end
|
||||
}
|
||||
}
|
||||
|
||||
/// A point in source text (1-indexed line, 0-indexed column).
#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SourcePoint {
    /// Line number, starting at 1.
    pub line: usize,
    /// Column within the line, starting at 0.
    pub column: usize,
}
|
||||
|
||||
impl std::fmt::Display for SourcePoint {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}:{}", self.line, self.column)
|
||||
}
|
||||
}
|
||||
|
||||
/// A span between two source points (closed interval).
#[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SourceSpan {
    /// First point of the span.
    pub start: SourcePoint,
    /// Last point of the span (inclusive).
    pub end: SourcePoint,
}
|
||||
|
||||
impl std::fmt::Display for SourceSpan {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}-{}", self.start, self.end)
|
||||
}
|
||||
}
|
||||
|
||||
/// Efficiently maps byte offsets to line/column positions.
///
/// This lazily scans for newlines as needed, avoiding upfront cost for
/// files where only a few locations are needed.
pub struct LocationMapping<'a> {
    /// The input whose offsets are being mapped.
    bytes: &'a [u8],
    /// Offsets of the `\n` bytes discovered so far, in ascending order.
    /// Kept in a `RefCell` so lookups taking `&self` can extend it.
    newline_offsets: RefCell<Vec<usize>>,
}
|
||||
|
||||
impl<'a> LocationMapping<'a> {
|
||||
/// Creates a new mapping for the given input bytes.
|
||||
pub fn new(input: &'a [u8]) -> Self {
|
||||
LocationMapping { bytes: input, newline_offsets: RefCell::new(Vec::new()) }
|
||||
}
|
||||
|
||||
fn ensure_offsets_up_to(&self, offset: usize) {
|
||||
let mut offsets = self.newline_offsets.borrow_mut();
|
||||
let start = offsets.last().map_or(0, |&last| last + 1);
|
||||
if offset < start {
|
||||
return;
|
||||
}
|
||||
let end = offset.min(self.bytes.len());
|
||||
for nl in memchr::memchr_iter(b'\n', &self.bytes[start..end]) {
|
||||
offsets.push(start + nl);
|
||||
}
|
||||
}
|
||||
|
||||
fn source_point_from_offsets(offsets: &[usize], offset: usize) -> SourcePoint {
|
||||
let line = match offsets.binary_search(&offset) {
|
||||
Ok(idx) => idx + 2,
|
||||
Err(idx) => idx + 1,
|
||||
};
|
||||
let column = if let Some(&last) = offsets.get(line.saturating_sub(2)) {
|
||||
offset.saturating_sub(last + 1)
|
||||
} else {
|
||||
offset
|
||||
};
|
||||
SourcePoint { line, column }
|
||||
}
|
||||
|
||||
/// Maps a byte offset to a `SourcePoint`.
|
||||
pub fn get_source_point(&self, offset: usize) -> SourcePoint {
|
||||
self.ensure_offsets_up_to(offset);
|
||||
let offsets = self.newline_offsets.borrow();
|
||||
Self::source_point_from_offsets(&offsets, offset)
|
||||
}
|
||||
|
||||
/// Maps an `OffsetSpan` to a `SourceSpan`.
|
||||
pub fn get_source_span(&self, span: &OffsetSpan) -> SourceSpan {
|
||||
self.ensure_offsets_up_to(span.end.saturating_sub(1));
|
||||
let offsets = self.newline_offsets.borrow();
|
||||
let start = Self::source_point_from_offsets(&offsets, span.start);
|
||||
let end = Self::source_point_from_offsets(&offsets, span.end.saturating_sub(1));
|
||||
SourceSpan { start, end }
|
||||
}
|
||||
}
|
||||
|
||||
/// Compact representation of a source span to reduce per-match footprint.
///
/// Stores the four coordinates of a `SourceSpan` as `u32` values.
#[derive(Debug, Clone, Copy, Deserialize, Serialize, JsonSchema)]
pub struct CompactSourceSpan {
    /// Line of the span's start point.
    pub start_line: u32,
    /// Column of the span's start point.
    pub start_column: u32,
    /// Line of the span's end point.
    pub end_line: u32,
    /// Column of the span's end point.
    pub end_column: u32,
}
|
||||
|
||||
impl CompactSourceSpan {
|
||||
#[inline]
|
||||
pub fn zero() -> Self {
|
||||
Self { start_line: 0, start_column: 0, end_line: 0, end_column: 0 }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn from_source_span(span: &SourceSpan) -> Self {
|
||||
Self {
|
||||
start_line: span.start.line.try_into().unwrap_or(0),
|
||||
start_column: span.start.column.try_into().unwrap_or(0),
|
||||
end_line: span.end.line.try_into().unwrap_or(0),
|
||||
end_column: span.end.column.try_into().unwrap_or(0),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn to_source_span(self) -> SourceSpan {
|
||||
SourceSpan {
|
||||
start: SourcePoint {
|
||||
line: usize::try_from(self.start_line).unwrap_or(0),
|
||||
column: usize::try_from(self.start_column).unwrap_or(0),
|
||||
},
|
||||
end: SourcePoint {
|
||||
line: usize::try_from(self.end_line).unwrap_or(0),
|
||||
column: usize::try_from(self.end_column).unwrap_or(0),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Combined byte offset and source location information.
///
/// `Deserialize` is derived; `Serialize` is implemented by hand further down
/// in this file.
#[derive(Debug, Clone, Deserialize, JsonSchema)]
pub struct Location {
    /// The byte offset span.
    pub offset_span: OffsetSpan,
    /// The optional source (line/column) span.
    ///
    /// Stored compactly but read from, and described in the schema as, a
    /// full `SourceSpan`. A missing field deserializes to `None`.
    // NOTE(review): since `Serialize` is not derived for this struct, the
    // `serialize_with` entry below looks unused — confirm before relying on it.
    #[serde(
        default,
        serialize_with = "serialize_compact_source_span",
        deserialize_with = "deserialize_compact_source_span"
    )]
    #[schemars(with = "SourceSpan")]
    pub source_span: Option<CompactSourceSpan>,
}
|
||||
|
||||
impl serde::Serialize for Location {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeStruct;
|
||||
|
||||
let mut state = serializer.serialize_struct("Location", 2)?;
|
||||
state.serialize_field("offset_span", &self.offset_span)?;
|
||||
let source_span = self.source_span().unwrap_or_else(CompactSourceSpan::zero);
|
||||
state.serialize_field("source_span", &source_span.to_source_span())?;
|
||||
state.end()
|
||||
}
|
||||
}
|
||||
|
||||
impl Location {
|
||||
/// Creates a new `Location` with both offset and source spans.
|
||||
#[inline]
|
||||
pub fn with_source_span(offset_span: OffsetSpan, source_span: Option<SourceSpan>) -> Self {
|
||||
Self {
|
||||
offset_span,
|
||||
source_span: source_span.as_ref().map(CompactSourceSpan::from_source_span),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the compact source span if available.
|
||||
#[inline]
|
||||
pub fn source_span(&self) -> Option<CompactSourceSpan> {
|
||||
self.source_span
|
||||
}
|
||||
|
||||
/// Returns the source span, defaulting to zeros if not available.
|
||||
#[inline]
|
||||
pub fn resolved_source_span(&self) -> SourceSpan {
|
||||
self.source_span.unwrap_or_else(CompactSourceSpan::zero).to_source_span()
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_compact_source_span<S>(
|
||||
span: &Option<CompactSourceSpan>,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
let source_span = span.unwrap_or_else(CompactSourceSpan::zero).to_source_span();
|
||||
source_span.serialize(serializer)
|
||||
}
|
||||
|
||||
fn deserialize_compact_source_span<'de, D>(
|
||||
deserializer: D,
|
||||
) -> Result<Option<CompactSourceSpan>, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let span = SourceSpan::deserialize(deserializer)?;
|
||||
Ok(Some(CompactSourceSpan::from_source_span(&span)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_offset_span() {
        let ten_bytes = OffsetSpan::from_range(10..20);
        assert_eq!(ten_bytes.len(), 10);
        assert!(!ten_bytes.is_empty());
    }

    #[test]
    fn test_location_mapping() {
        let mapping = LocationMapping::new(b"line1\nline2\nline3");

        // (offset, expected line, expected column), queried in this order.
        let cases = [
            (0, 1, 0), // first line, first character
            (4, 1, 4), // first line, last character
            (6, 2, 0), // second line, first character
        ];
        for (offset, line, column) in cases {
            assert_eq!(mapping.get_source_point(offset), SourcePoint { line, column });
        }
    }
}
|
||||
310
crates/kingfisher-core/src/origin.rs
Normal file
310
crates/kingfisher-core/src/origin.rs
Normal file
|
|
@ -0,0 +1,310 @@
|
|||
//! Provenance tracking for scanned content.
|
||||
//!
|
||||
//! This module provides types for tracking where content came from:
|
||||
//! - [`FileOrigin`] - Content from a file path
|
||||
//! - [`GitRepoOrigin`] - Content from a git repository
|
||||
//! - [`ExtendedOrigin`] - Content from other sources (Jira, Confluence, etc.)
|
||||
//! - [`OriginSet`] - A non-empty collection of origins
|
||||
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use dashmap::DashMap;
|
||||
use once_cell::sync::Lazy;
|
||||
use rustc_hash::FxHashSet;
|
||||
use schemars::JsonSchema;
|
||||
use serde::{ser::SerializeSeq, Deserialize, Serialize};
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use crate::git_commit_metadata::CommitMetadata;
|
||||
|
||||
// Cache for git remote URLs to avoid repeated lookups.
// Keyed by repository path; created lazily on first access.
static URL_CACHE: Lazy<DashMap<PathBuf, Arc<str>>> = Lazy::new(DashMap::default);
|
||||
|
||||
fn compute_url(repo_path: &Path) -> anyhow::Result<String> {
|
||||
let repo = gix::open(repo_path)?;
|
||||
let config = repo.config_snapshot();
|
||||
|
||||
let url_bytes =
|
||||
config.string("remote.origin.url").ok_or_else(|| anyhow::anyhow!("No remote URL found"))?;
|
||||
|
||||
use bstr::ByteSlice;
|
||||
if url_bytes.starts_with(b"http://") || url_bytes.starts_with(b"https://") {
|
||||
Ok(String::from_utf8_lossy(url_bytes.as_bytes()).into_owned())
|
||||
} else if url_bytes.starts_with(b"git@") {
|
||||
let url_str = String::from_utf8_lossy(url_bytes.as_bytes());
|
||||
if let Some(stripped) = url_str.strip_prefix("git@") {
|
||||
if let Some((domain, path)) = stripped.split_once(':') {
|
||||
Ok(format!("https://{}/{}", domain, path))
|
||||
} else {
|
||||
Err(anyhow::anyhow!("Invalid SSH URL format"))
|
||||
}
|
||||
} else {
|
||||
Err(anyhow::anyhow!("Invalid SSH URL format"))
|
||||
}
|
||||
} else {
|
||||
Err(anyhow::anyhow!(
|
||||
"Unsupported remote URL format: {}",
|
||||
String::from_utf8_lossy(url_bytes.as_bytes())
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the remote URL for a git repository, with caching.
|
||||
pub fn get_repo_url(repo_path: &Path) -> anyhow::Result<Arc<str>> {
|
||||
// Fast path: cache hit
|
||||
if let Some(u) = URL_CACHE.get(repo_path) {
|
||||
return Ok(u.clone());
|
||||
}
|
||||
|
||||
// Slow path: compute, intern, cache
|
||||
let url_arc: Arc<str> = compute_url(repo_path)?.into();
|
||||
URL_CACHE.insert(repo_path.to_path_buf(), url_arc.clone());
|
||||
Ok(url_arc)
|
||||
}
|
||||
|
||||
/// The provenance of a scanned blob.
///
/// Serialized as an internally tagged value: the `kind` field holds the
/// variant name in snake_case.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case", tag = "kind")]
#[allow(clippy::large_enum_variant)]
pub enum Origin {
    /// Content from a file on disk.
    File(FileOrigin),
    /// Content from a git repository.
    GitRepo(GitRepoOrigin),
    /// Content from an extended source (arbitrary JSON metadata).
    Extended(ExtendedOrigin),
}
|
||||
|
||||
impl Origin {
|
||||
/// Creates an `Origin` for a plain file.
|
||||
pub fn from_file(path: PathBuf) -> Self {
|
||||
Origin::File(FileOrigin::new(path))
|
||||
}
|
||||
|
||||
/// Creates an `Origin` for a blob in a git repository without commit info.
|
||||
pub fn from_git_repo(repo_path: Arc<PathBuf>) -> Self {
|
||||
Origin::GitRepo(GitRepoOrigin { repo_path, first_commit: None })
|
||||
}
|
||||
|
||||
/// Creates an `Origin` for a blob in a git repository with commit info.
|
||||
pub fn from_git_repo_with_first_commit(
|
||||
repo_path: Arc<PathBuf>,
|
||||
commit_metadata: Arc<CommitMetadata>,
|
||||
blob_path: String,
|
||||
) -> Self {
|
||||
let first_commit = Some(CommitOrigin { commit_metadata, blob_path });
|
||||
Origin::GitRepo(GitRepoOrigin { repo_path, first_commit })
|
||||
}
|
||||
|
||||
/// Creates an `Origin` from arbitrary JSON metadata.
|
||||
pub fn from_extended(value: serde_json::Value) -> Self {
|
||||
Origin::Extended(ExtendedOrigin(value))
|
||||
}
|
||||
|
||||
/// Returns the path of the blob, if available.
|
||||
pub fn blob_path(&self) -> Option<&Path> {
|
||||
match self {
|
||||
Self::File(e) => Some(&e.path),
|
||||
Self::GitRepo(e) => e.first_commit.as_ref().map(|c| Path::new(&c.blob_path)),
|
||||
Self::Extended(e) => e.path(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the full filesystem path to the content, if available.
|
||||
pub fn full_path(&self) -> Option<PathBuf> {
|
||||
match self {
|
||||
Self::File(e) => Some((*e.path).clone()),
|
||||
Self::GitRepo(e) => e.first_commit.as_ref().map(|c| e.repo_path.join(&c.blob_path)),
|
||||
Self::Extended(e) => e.path().map(PathBuf::from),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Origin {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Origin::File(e) => write!(f, "file {}", e.path.display()),
|
||||
Origin::GitRepo(e) => match &e.first_commit {
|
||||
Some(md) => write!(
|
||||
f,
|
||||
"git repo {}: first seen in commit {} as {}",
|
||||
e.repo_path.display(),
|
||||
md.commit_metadata.commit_id,
|
||||
&md.blob_path,
|
||||
),
|
||||
None => write!(f, "git repo {}", e.repo_path.display()),
|
||||
},
|
||||
Origin::Extended(e) => write!(f, "extended {}", e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Origin information for a file on disk.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
pub struct FileOrigin {
    /// The file path (reference-counted, so clones share the allocation).
    pub path: Arc<PathBuf>,
}
|
||||
|
||||
impl FileOrigin {
|
||||
/// Creates a new `FileOrigin` from a path.
|
||||
pub fn new<P: Into<PathBuf>>(p: P) -> Self {
|
||||
Self { path: Arc::new(p.into()) }
|
||||
}
|
||||
}
|
||||
|
||||
/// Origin information for a blob in a git repository.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
pub struct GitRepoOrigin {
    /// Path to the repository on disk.
    pub repo_path: Arc<PathBuf>,
    /// Information about the first commit where this blob was seen.
    /// `None` when no commit information is attached.
    pub first_commit: Option<CommitOrigin>,
}
|
||||
|
||||
/// Information about where a blob was first seen in git history.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
pub struct CommitOrigin {
    /// Metadata about the commit (reference-counted).
    pub commit_metadata: Arc<CommitMetadata>,
    /// The path of the blob within the commit.
    pub blob_path: String,
}
|
||||
|
||||
/// An extended origin with arbitrary JSON metadata.
///
/// This is used for sources like Jira, Confluence, Slack, etc.
/// The wrapped JSON value is public; a string `path` entry, when present,
/// is what `ExtendedOrigin::path` returns.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
pub struct ExtendedOrigin(pub serde_json::Value);
|
||||
|
||||
impl std::fmt::Display for ExtendedOrigin {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Display::fmt(&self.0, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl ExtendedOrigin {
|
||||
/// Returns the path from the extended origin, if available.
|
||||
pub fn path(&self) -> Option<&Path> {
|
||||
let p = self.0.get("path")?.as_str()?;
|
||||
Some(Path::new(p))
|
||||
}
|
||||
}
|
||||
|
||||
/// A non-empty set of [`Origin`] entries.
///
/// This is used when a blob has been seen in multiple locations
/// (e.g., the same content in multiple files or commits).
#[derive(Debug, Clone)]
pub struct OriginSet {
    /// The first origin; its presence is what makes the set non-empty.
    origin: Origin,
    /// Any further origins, stored inline while there is at most one.
    more_provenance: SmallVec<[Origin; 1]>,
}
|
||||
|
||||
impl serde::Serialize for OriginSet {
|
||||
fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
|
||||
let mut seq = s.serialize_seq(Some(self.len()))?;
|
||||
for p in self.iter() {
|
||||
seq.serialize_element(p)?;
|
||||
}
|
||||
seq.end()
|
||||
}
|
||||
}
|
||||
|
||||
impl JsonSchema for OriginSet {
|
||||
fn schema_name() -> String {
|
||||
"OriginSet".into()
|
||||
}
|
||||
|
||||
fn json_schema(gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
|
||||
let s = <Vec<Origin>>::json_schema(gen);
|
||||
let mut o = s.into_object();
|
||||
o.array().min_items = Some(1);
|
||||
let md = o.metadata();
|
||||
md.description = Some("A non-empty set of `Origin` entries".into());
|
||||
schemars::schema::Schema::Object(o)
|
||||
}
|
||||
}
|
||||
|
||||
impl OriginSet {
|
||||
/// Creates a new `OriginSet` with a single origin.
|
||||
#[inline]
|
||||
pub fn single(origin: Origin) -> Self {
|
||||
Self { origin, more_provenance: SmallVec::new() }
|
||||
}
|
||||
|
||||
/// Creates a new `OriginSet` from multiple origins.
|
||||
///
|
||||
/// Filters out redundant less-specific origins.
|
||||
pub fn new(origin: Origin, more_origin: Vec<Origin>) -> Self {
|
||||
let mut git_repos_with_detailed: FxHashSet<Arc<PathBuf>> = FxHashSet::default();
|
||||
for p in std::iter::once(&origin).chain(&more_origin) {
|
||||
if let Origin::GitRepo(e) = p {
|
||||
if e.first_commit.is_some() {
|
||||
git_repos_with_detailed.insert(e.repo_path.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut filtered = std::iter::once(origin).chain(more_origin).filter(|p| match p {
|
||||
Origin::GitRepo(e) => {
|
||||
e.first_commit.is_some() || !git_repos_with_detailed.contains(&e.repo_path)
|
||||
}
|
||||
Origin::File(_) => true,
|
||||
Origin::Extended(_) => true,
|
||||
});
|
||||
Self { origin: filtered.next().unwrap(), more_provenance: filtered.collect() }
|
||||
}
|
||||
|
||||
/// Attempts to create an `OriginSet` from an iterator.
|
||||
///
|
||||
/// Returns `None` if the iterator is empty.
|
||||
#[inline]
|
||||
pub fn try_from_iter<I>(it: I) -> Option<Self>
|
||||
where
|
||||
I: IntoIterator<Item = Origin>,
|
||||
{
|
||||
let mut it = it.into_iter();
|
||||
let provenance = it.next()?;
|
||||
let more_provenance = it.collect();
|
||||
Some(Self::new(provenance, more_provenance))
|
||||
}
|
||||
|
||||
/// Returns the first origin in the set.
|
||||
#[inline]
|
||||
pub fn first(&self) -> &Origin {
|
||||
&self.origin
|
||||
}
|
||||
|
||||
/// Returns the number of origins in the set.
|
||||
#[allow(clippy::len_without_is_empty)]
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
1 + self.more_provenance.len()
|
||||
}
|
||||
|
||||
/// Returns an iterator over all origins in the set.
|
||||
#[inline]
|
||||
pub fn iter(&self) -> impl Iterator<Item = &Origin> {
|
||||
std::iter::once(&self.origin).chain(&self.more_provenance)
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoIterator for OriginSet {
|
||||
type IntoIter =
|
||||
std::iter::Chain<std::iter::Once<Origin>, <Vec<Origin> as IntoIterator>::IntoIter>;
|
||||
type Item = Origin;
|
||||
|
||||
#[inline]
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
std::iter::once(self.origin).chain(self.more_provenance.into_vec().into_iter())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Origin> for OriginSet {
|
||||
fn from(p: Origin) -> Self {
|
||||
Self::single(p)
|
||||
}
|
||||
}
|
||||
64
crates/kingfisher-rules/Cargo.toml
Normal file
64
crates/kingfisher-rules/Cargo.toml
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
[package]
|
||||
name = "kingfisher-rules"
|
||||
version = "0.1.0"
|
||||
description = "Rule definitions and database for Kingfisher secret scanner"
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
license.workspace = true
|
||||
authors.workspace = true
|
||||
homepage.workspace = true
|
||||
repository.workspace = true
|
||||
publish.workspace = true
|
||||
|
||||
[dependencies]
|
||||
# Internal dependencies
|
||||
kingfisher-core = { path = "../kingfisher-core" }
|
||||
|
||||
# Serialization
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
serde_yaml = "0.9"
|
||||
schemars = "0.8"
|
||||
|
||||
# Error handling
|
||||
anyhow = "1.0"
|
||||
thiserror = "1.0"
|
||||
|
||||
# Regex
|
||||
regex = "1.12"
|
||||
lazy_static = "1.5"
|
||||
|
||||
# Hashing
|
||||
xxhash-rust = { version = "0.8", features = ["xxh3"] }
|
||||
|
||||
# Liquid templating (for checksum validation)
|
||||
liquid = "0.26"
|
||||
liquid-core = "0.26"
|
||||
|
||||
# Crypto for liquid filters
|
||||
base64 = "0.22"
|
||||
crc32fast = "1.5"
|
||||
hmac = "0.12"
|
||||
sha1 = "0.10"
|
||||
sha2 = "0.10"
|
||||
percent-encoding = "2.3"
|
||||
time = "0.3"
|
||||
uuid = { version = "1.19", features = ["v4"] }
|
||||
rand = "0.9"
|
||||
|
||||
# Vectorscan for pattern matching
|
||||
vectorscan-rs = "0.0.5"
|
||||
|
||||
# Logging
|
||||
tracing = "0.1"
|
||||
|
||||
# File walking
|
||||
walkdir = "2.5"
|
||||
ignore = "0.4"
|
||||
|
||||
# Embedded rule files
|
||||
include_dir = "0.7"
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = "1.4"
|
||||
proptest = "1.6"
|
||||
|
|
@ -80,4 +80,4 @@ rules:
|
|||
- https://htmlpreview.github.io/?https://github.com/FiloSottile/age/blob/main/doc/age.1.html
|
||||
- https://github.com/C2SP/C2SP/blob/037e546d164a89fd7577df2c18df80bb54bd246e/age.md#the-mlkem768-x25519-ie-x-wing-hybrid-post-quantum-recipient-type
|
||||
categories:
|
||||
- secret
|
||||
- secret
|
||||
|
|
@ -2,9 +2,9 @@ rules:
|
|||
- name: Alibaba Access Key ID
|
||||
id: kingfisher.alibabacloud.1
|
||||
pattern: |
|
||||
(?xi)
|
||||
(?x)
|
||||
(
|
||||
LTAI[a-z0-9]{17,21}
|
||||
LTAI([a-zA-Z0-9]{12,20})
|
||||
)
|
||||
\b
|
||||
pattern_requirements:
|
||||
|
|
@ -63,6 +63,8 @@ rules:
|
|||
},
|
||||
validation:
|
||||
type: AWS
|
||||
revocation:
|
||||
type: AWS
|
||||
depends_on_rule:
|
||||
- rule_id: kingfisher.aws.1
|
||||
variable: AKID
|
||||
|
|
@ -107,7 +109,7 @@ rules:
|
|||
|
||||
- name: AWS Bedrock API Key (Short-lived)
|
||||
id: kingfisher.aws.6
|
||||
pattern: |
|
||||
pattern: |
|
||||
(?x)
|
||||
(
|
||||
bedrock-api-key-YmVkcm9jay5hbWF6b25hd3MuY29t[A-Za-z0-9+/]+={0,2}
|
||||
|
|
@ -30,4 +30,16 @@ rules:
|
|||
status: [200]
|
||||
- type: WordMatch
|
||||
words: ['"uuid"', '"user"']
|
||||
revocation:
|
||||
type: Http
|
||||
content:
|
||||
request:
|
||||
method: DELETE
|
||||
url: https://api.buildkite.com/v2/access-token
|
||||
headers:
|
||||
Authorization: "Bearer {{ TOKEN }}"
|
||||
response_matcher:
|
||||
- report_response: true
|
||||
- type: StatusMatch
|
||||
status: [204]
|
||||
|
||||
|
|
@ -2,12 +2,12 @@ rules:
|
|||
- name: Docker Hub Personal Access Token
|
||||
id: kingfisher.dockerhub.1
|
||||
pattern: |
|
||||
(?x)
|
||||
(?xi)
|
||||
\b
|
||||
(
|
||||
dckr_pat_[A-Za-z0-9_-]{27}
|
||||
dckr_pat_[A-Z0-9_-]{27}
|
||||
)
|
||||
(?: $ | [^A-Za-z0-9_-] )
|
||||
(?: $ | [^A-Z0-9_-] )
|
||||
pattern_requirements:
|
||||
min_digits: 2
|
||||
min_entropy: 3.3
|
||||
|
|
@ -17,30 +17,62 @@ rules:
|
|||
- docker login -u gemesa -p dckr_pat_tkzBYxjNNC3R_Yg6jd_O-G8FbrJ
|
||||
- docker login -u gemesa -p dckr_pat_1q8yKET1VDJTpfCwseUDzT8vFh-
|
||||
references:
|
||||
- https://docs.docker.com/reference/api/hub/latest/#tag/access-tokens/paths/~1v2~1access-tokens~1%7Buuid%7D/get
|
||||
- https://docs.docker.com/reference/api/hub/latest/#tag/authentication-api/operation/AuthCreateAccessToken
|
||||
depends_on_rule:
|
||||
- rule_id: kingfisher.dockerhub.2
|
||||
variable: DOCKER_USERNAME
|
||||
validation:
|
||||
type: Http
|
||||
content:
|
||||
request:
|
||||
method: POST
|
||||
url: https://hub.docker.com/v2/auth/token
|
||||
headers:
|
||||
Authorization: Bearer {{ TOKEN }}
|
||||
Content-Type: application/json
|
||||
Accept: application/json
|
||||
method: GET
|
||||
body: '{"identifier":"{{ DOCKER_USERNAME | json_escape }}","secret":"{{ TOKEN | json_escape }}"}'
|
||||
response_matcher:
|
||||
- report_response: true
|
||||
- status:
|
||||
- 200
|
||||
type: StatusMatch
|
||||
url: https://hub.docker.com/v2/access-tokens?page_size=1
|
||||
- name: Docker Hub Organization Access Token
|
||||
- type: StatusMatch
|
||||
status: [200]
|
||||
- type: WordMatch
|
||||
words:
|
||||
- '"access_token"'
|
||||
|
||||
- name: Docker Hub Username
|
||||
id: kingfisher.dockerhub.2
|
||||
pattern: |
|
||||
(?x)
|
||||
(?xi)
|
||||
\b
|
||||
docker
|
||||
(?:.|[\n\r]){0,32}?
|
||||
(?:
|
||||
-u\s+
|
||||
| --username[=\s]+
|
||||
| user(?:name)?[=:\s]+["']?
|
||||
)
|
||||
(
|
||||
[a-z0-9][a-z0-9_-]{2,29}
|
||||
)
|
||||
\b
|
||||
min_entropy: 1.0
|
||||
confidence: medium
|
||||
visible: false
|
||||
examples:
|
||||
- docker login -u gemesa -p dckr_pat_xxx
|
||||
- docker login --username=myuser
|
||||
- DOCKER_USERNAME=myuser
|
||||
- 'docker_user: "myuser"'
|
||||
|
||||
- name: Docker Hub Organization Access Token
|
||||
id: kingfisher.dockerhub.3
|
||||
pattern: |
|
||||
(?xi)
|
||||
\b
|
||||
(
|
||||
dckr_oat_[A-Za-z0-9_-]{32}
|
||||
dckr_oat_[A-Z0-9_-]{32}
|
||||
)
|
||||
(?: $ | [^A-Za-z0-9_-] )
|
||||
(?: $ | [^A-Z0-9_-] )
|
||||
pattern_requirements:
|
||||
min_digits: 2
|
||||
min_entropy: 3.3
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue