diff --git a/Cargo.toml b/Cargo.toml index 896bfe5..f80052d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,12 @@ +[workspace] +members = [ + ".", + "crates/kingfisher-core", + "crates/kingfisher-rules", + "crates/kingfisher-scanner", +] +resolver = "2" + [workspace.package] edition = "2021" rust-version = "1.90" @@ -40,6 +49,11 @@ assets = [ ] [dependencies] +# Library crates +kingfisher-core = { path = "crates/kingfisher-core" } +kingfisher-rules = { path = "crates/kingfisher-rules" } +kingfisher-scanner = { path = "crates/kingfisher-scanner" } + clap = { version = "4.5", features = [ "cargo", "derive", diff --git a/README.md b/README.md index f92e38a..157c525 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ Designed for offensive security engineers and blue-teamers alike, Kingfisher hel - **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) - **Checksum-aware detection**: verifies tokens with built-in checksums (e.g., GitHub, Confluent, Zuplo) — no API calls required - **Built-in Report Viewer**: Visualize and triage findings locally with `kingisher view ./report-file.json` +- **Library crates**: Embed Kingfisher's scanning engine in your own Rust applications ([docs/LIBRARY.md](docs/LIBRARY.md)) **Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) @@ -58,7 +59,7 @@ NOTE: Replay has been slowed down for demo ![alt text](docs/kingfisher-usage-01.gif) ## Report Viewer Demo -Explore Kingfisher’s built-in report viewer and its `--access-map`, which can show what the token (AWS, GCP, Azure, GitHub, and GitLab...more coming) can actually access : [Access map outputs and viewer](#access-map-outputs-and-viewer) +Explore Kingfisher’s built-in report viewer and its `--access-map`, which can show what the token (AWS, GCP, Azure, GitHub, 
GitLab, and Slack...more coming) can actually access : [Access map outputs and viewer](#access-map-outputs-and-viewer) Note: when you pass `--view-report`, Kingfisher starts a **localhost-only** web server on port `7890` and opens it in your default browser. You’ll see this near the end of the scan output, and **Kingfisher will keep running** until you stop it. @@ -117,6 +118,7 @@ kingfisher scan /path/to/scan --access-map --view-report - [Access map outputs and viewer](#access-map-outputs-and-viewer) - [View access-map reports locally](#view-access-map-reports-locally) - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--) + - [Direct secret validation with `kingfisher validate`](#direct-secret-validation-with-kingfisher-validate) - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size) - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag) - [Display rule performance statistics](#display-rule-performance-statistics) @@ -189,6 +191,7 @@ kingfisher scan /path/to/scan --access-map --view-report - [Rule Performance Profiling](#rule-performance-profiling) - [CLI Options](#cli-options) - [Lineage and Evolution](#lineage-and-evolution) +- [Library Usage](#library-usage) - [Roadmap](#roadmap) - [License](#license) @@ -593,7 +596,7 @@ kingfisher scan /path/to/repo --format sarif --output findings.sarif Finding a leaked credential is only the first step. The critical question isn’t just “Is this a secret?”—it’s “What can an attacker do with it?” -Kingfisher's `--access-map` feature transforms secret detection from a simple alert into a comprehensive threat assessment. Instead of leaving you with a cryptic API key, Kingfisher actively authenticates against your cloud provider (AWS, GCP, Azure Storage, Azure DevOps, GitHub, or GitLab) to map the full extent of the credential's power. 
+Kingfisher's `--access-map` feature transforms secret detection from a simple alert into a comprehensive threat assessment. Instead of leaving you with a cryptic API key, Kingfisher actively authenticates against your cloud provider (AWS, GCP, Azure Storage, Azure DevOps, GitHub, GitLab, or Slack) to map the full extent of the credential's power. * Instant Identity Resolution: Immediately identify who the key belongs to—whether it's a specific IAM user, an assumed role, or a service account. * Visualize the Blast Radius: See exactly which resources (S3 buckets, EC2 instances, projects, storage containers) are exposed and at risk. @@ -624,6 +627,77 @@ cat /path/to/file.py | kingfisher scan - ``` +### Direct secret validation with `kingfisher validate` + +When you already know a secret's type and have the raw value, use `kingfisher validate` to check if it's still active—without needing the surrounding context that detection rules require. + +This is useful for: +- Re-validating a known secret from a previous scan +- Checking if a credential is still active before rotation +- Validating secrets from external sources (password managers, ticketing systems, etc.) + +```bash +# Validate an OpsGenie API key (using rule prefix matching) +kingfisher validate --rule kingfisher.opsgenie "12345678-9abc-def0-1234-56789abcdef0" + +# Validate from stdin +echo "ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" | kingfisher validate --rule kingfisher.github - + +# JSON output for scripting +kingfisher validate --rule kingfisher.slack "xoxb-..." 
--format json + +# AWS credentials - use --arg to auto-assign additional values +kingfisher validate --rule kingfisher.aws --arg AKIAIOSFODNN7EXAMPLE \ + "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + +# Or use --var if you know the variable name +kingfisher validate --rule kingfisher.aws.2 --var AKID=AKIAIOSFODNN7EXAMPLE \ + "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + +# GCP service account (pass JSON as secret) +kingfisher validate --rule kingfisher.gcp "$(cat service-account.json)" + +# MongoDB connection string +kingfisher validate --rule kingfisher.mongodb.3 \ + "mongodb+srv://user:password@cluster.mongodb.net/db" + +# PostgreSQL connection +kingfisher validate --rule kingfisher.postgres \ + "postgres://admin:password@db.example.com:5432/mydb" + +# JWT token +kingfisher validate --rule kingfisher.jwt \ + "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9..." +``` + +**Supported validators:** HTTP, AWS, GCP, MongoDB, MySQL, Postgres, JDBC, JWT, Azure Storage, and Coinbase. + +**Exit codes:** Returns `0` if any matching rule validates the secret as valid, `1` if all are invalid or an error occurred. + +**Passing additional values (`--arg` and `--var`):** + +Some validators need more than just the secret. For example, AWS needs both an access key ID and the secret key (see the rule for `dependent_rule` section): + +- `--arg VALUE` — Auto-assigns values to template variables (in alphabetical order). Use when you don't know the exact variable name. +- `--var NAME=VALUE` — Explicitly sets a variable. Use when you know the exact name, or to override `--arg`. + +```bash +# --arg auto-assigns to AKID (the only non-TOKEN variable for AWS) +kingfisher validate --rule kingfisher.aws --arg AKIAEXAMPLE "secret_key" + +# --var for explicit assignment +kingfisher validate --rule kingfisher.aws --var AKID=AKIAEXAMPLE "secret_key" +``` + +**Rule prefix matching:** Use partial rule IDs like `kingfisher.opsgenie` instead of the full `kingfisher.opsgenie.1`. 
If the prefix matches multiple rules, **all matching rules with compatible variables are tried**: + +```bash +$ kingfisher validate --rule kingfisher.aws --arg AKIAEXAMPLE "secret_key" +Rule: AWS Secret Access Key (kingfisher.aws.2) +Result: ✓ VALID +Response: arn:aws:iam::123456789012:user/example +``` + ### Limit maximum file size scanned (`--max-file-size`) By default, Kingfisher skips files larger than **256 MB**. You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**. @@ -1613,6 +1687,41 @@ Since then it has evolved far beyond that starting point, introducing live valid - **Unified workflow** with JSON/BSON/SARIF outputs - **Cross-platform builds** for Linux, macOS, and Windows +# Library Usage + +Kingfisher's scanning engine is available as a set of Rust library crates that can be embedded into other applications: + +| Crate | Description | +|-------|-------------| +| `kingfisher-core` | Core types: `Blob`, `BlobId`, `Location`, `Origin`, entropy calculation | +| `kingfisher-rules` | Rule definitions, YAML parsing, compiled rule database, 200+ builtin rules | +| `kingfisher-scanner` | High-level scanning API with `Scanner` and `Finding` types | + +**Quick example:** + +```rust +use std::sync::Arc; +use kingfisher_rules::{get_builtin_rules, RulesDatabase, Rule}; +use kingfisher_scanner::Scanner; + +// Load builtin rules and compile +let rules = get_builtin_rules(None)?; +let rule_vec: Vec<Rule> = rules.iter_rules() + .map(|syntax| Rule::new(syntax.clone())) + .collect(); +let rules_db = Arc::new(RulesDatabase::from_rules(rule_vec)?); + +// Create scanner and scan +let scanner = Scanner::new(rules_db); +let findings = scanner.scan_file("config.yml")?; + +for finding in findings { + println!("{}: {}", finding.rule_name, finding.secret); +} +``` + +For complete documentation, see **[docs/LIBRARY.md](docs/LIBRARY.md)**. 
+ # Roadmap - More rules diff --git a/crates/kingfisher-core/Cargo.toml b/crates/kingfisher-core/Cargo.toml new file mode 100644 index 0000000..dfd9513 --- /dev/null +++ b/crates/kingfisher-core/Cargo.toml @@ -0,0 +1,48 @@ +[package] +name = "kingfisher-core" +version = "0.1.0" +description = "Core types and traits for Kingfisher secret scanner" +edition.workspace = true +rust-version.workspace = true +license.workspace = true +authors.workspace = true +homepage.workspace = true +repository.workspace = true +publish.workspace = true + +[dependencies] +# Serialization +serde = { version = "1.0", features = ["derive", "rc"] } +serde_json = "1.0" +schemars = "0.8" + +# Error handling +anyhow = "1.0" +thiserror = "1.0" + +# Hashing and crypto +sha1 = "0.10" +hex = "0.4" + +# Memory management +memmap2 = "0.9" +once_cell = "1.21" +parking_lot = "0.12" + +# Collections +smallvec = { version = "1", features = ["const_generics", "const_new", "union"] } +rustc-hash = "2.1" +dashmap = "6.1" + +# Byte string handling +bstr = { version = "1.12", features = ["serde"] } +memchr = "2.7" + +# Git types (minimal, for ObjectId and Time) +gix = { version = "0.73", default-features = false, features = ["serde"] } + +# Console formatting +console = "0.15" + +[dev-dependencies] +pretty_assertions = "1.4" diff --git a/crates/kingfisher-core/src/blob.rs b/crates/kingfisher-core/src/blob.rs new file mode 100644 index 0000000..a25e60d --- /dev/null +++ b/crates/kingfisher-core/src/blob.rs @@ -0,0 +1,495 @@ +//! Blob representation for scannable content. +//! +//! A [`Blob`] represents content that can be scanned for secrets. It can be +//! created from: +//! - In-memory bytes ([`Blob::from_bytes`]) +//! - A file path ([`Blob::from_file`]) +//! - Borrowed data ([`Blob::from_borrowed`]) +//! +//! Large files are automatically memory-mapped for efficiency. 
+ +use std::{ + convert::TryInto, + fs::File, + io::{Read, Write}, + path::Path, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, +}; + +use bstr::{BString, ByteSlice}; +use gix::ObjectId; +use once_cell::sync::OnceCell; +use parking_lot::Mutex; +use rustc_hash::FxHashMap; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use sha1::{Digest, Sha1}; +use smallvec::SmallVec; + +use crate::error::Result; +use crate::git_commit_metadata::CommitMetadata; + +/// Threshold above which files are memory-mapped instead of read into memory. +const LARGE_FILE_THRESHOLD: u64 = 0; // Currently: always mmap + +/// Global counter for temporary blob IDs. +static NEXT_ID: AtomicU64 = AtomicU64::new(1); + +/// Tracks where a blob was seen in git history. +#[derive(Clone, Debug, serde::Serialize)] +pub struct BlobAppearance { + /// Metadata about the commit where this blob appeared. + pub commit_metadata: Arc, + + /// The path of the blob within the repository. + pub path: BString, +} + +impl BlobAppearance { + /// Returns the path as a `&Path`, if it's valid UTF-8. + #[inline] + pub fn path(&self) -> std::result::Result<&Path, bstr::Utf8Error> { + self.path.to_path() + } +} + +/// A set of [`BlobAppearance`] entries, optimized for the common case of a single appearance. +pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>; + +/// The underlying data storage for a [`Blob`]. +pub enum BlobData<'a> { + /// Small blobs stored as owned bytes. + Owned(Vec), + + /// Large blobs that are memory-mapped from disk. + Mapped(memmap2::Mmap), + + /// Borrowed bytes (e.g., from a git pack file). + Borrowed(&'a [u8]), +} + +impl<'a> AsRef<[u8]> for BlobData<'a> { + fn as_ref(&self) -> &[u8] { + match self { + BlobData::Owned(v) => v, + BlobData::Mapped(m) => m, + BlobData::Borrowed(slice) => slice, + } + } +} + +impl<'a> BlobData<'a> { + /// Returns the length of the blob data in bytes. 
+ #[inline] + pub fn len(&self) -> usize { + self.as_ref().len() + } + + /// Returns true if the blob data is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.as_ref().is_empty() + } +} + +/// A scannable blob of content. +/// +/// `Blob` is the primary type for representing content to be scanned. It lazily +/// computes a content-based ID (SHA-1) and supports multiple backing storage types. +/// +/// # Examples +/// +/// ``` +/// use kingfisher_core::Blob; +/// +/// // Create from bytes +/// let blob = Blob::from_bytes(b"my secret content".to_vec()); +/// assert_eq!(blob.len(), 17); +/// +/// // Create from file +/// // let blob = Blob::from_file("path/to/file.txt")?; +/// ``` +pub struct Blob<'a> { + /// Lazily computed content-based ID. + id: OnceCell, + /// The underlying data. + data: BlobData<'a>, + /// Temporary ID assigned at creation (for debugging/tracking). + temp_id: u64, +} + +impl Blob<'_> { + /// Create a new `Blob` by reading from a file. + /// + /// Large files are automatically memory-mapped for efficiency. + #[inline] + pub fn from_file>(path: P) -> Result { + let mut file = File::open(&path)?; + let file_size = file.metadata()?.len(); + let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + + if file_size > LARGE_FILE_THRESHOLD { + // Large files: one mmap, zero extra copies. + let mmap = unsafe { memmap2::Mmap::map(&file)? }; + Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id }) + } else { + // Small files: read into memory. + let mut bytes = Vec::with_capacity(file_size as usize); + file.read_to_end(&mut bytes)?; + Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }) + } + } + + /// Create a new `Blob` from a vector of bytes. + #[inline] + pub fn from_bytes(bytes: Vec) -> Self { + let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id } + } + + /// Create a new `Blob` with a pre-computed ID and owned data. 
+ #[inline] + pub fn new(id: BlobId, bytes: Vec) -> Self { + let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + let cell = OnceCell::new(); + let _ = cell.set(id); + Blob { id: cell, data: BlobData::Owned(bytes), temp_id } + } + + /// Returns the blob's content as a byte slice. + #[inline] + pub fn bytes(&self) -> &[u8] { + self.data.as_ref() + } + + /// Lazily computes and returns the blob's content-based [`BlobId`]. + #[inline] + pub fn id(&self) -> BlobId { + *self.id.get_or_init(|| BlobId::new(self.bytes())) + } + + /// Returns a reference to the blob's [`BlobId`], computing it if necessary. + #[inline] + pub fn id_ref(&self) -> &BlobId { + self.id.get_or_init(|| BlobId::new(self.bytes())) + } + + /// Returns the temporary ID assigned when this blob was created. + #[inline] + pub fn temp_id(&self) -> u64 { + self.temp_id + } + + /// Returns the length of the blob in bytes. + #[inline] + pub fn len(&self) -> usize { + self.bytes().len() + } + + /// Returns true if the blob is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.bytes().is_empty() + } +} + +impl<'a> Blob<'a> { + /// Create a new `Blob` from borrowed bytes. + /// + /// This is useful for zero-copy scanning of data that already exists + /// in memory (e.g., from a git pack file). + #[inline] + pub fn from_borrowed(bytes: &'a [u8]) -> Self { + let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + Blob { id: OnceCell::new(), data: BlobData::Borrowed(bytes), temp_id } + } +} + +impl Drop for Blob<'_> { + fn drop(&mut self) { + // For owned data, clear and shrink to free memory promptly. + if let BlobData::Owned(ref mut v) = self.data { + v.clear(); + v.shrink_to_fit(); + } + } +} + +/// A content-based identifier for a blob, computed as a Git-compatible SHA-1 hash. +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone, Serialize)] +#[serde(into = "String")] +pub struct BlobId([u8; 20]); + +impl BlobId { + /// Creates a zero-filled (default) `BlobId`. 
+ pub fn default() -> Self { + BlobId([0; 20]) + } + + /// Computes a `BlobId` from raw bytes. + /// + /// For large inputs, only the first and last 64KB are hashed for performance. + #[inline] + pub fn new(input: &[u8]) -> Self { + const CHUNK: usize = 64 * 1024; // 64KB from start and end + let mut hasher = Sha1::new(); + write!(&mut hasher, "blob {}\0", input.len()).unwrap(); + if input.len() <= CHUNK * 2 { + hasher.update(input); + } else { + hasher.update(&input[..CHUNK]); + hasher.update(&input[input.len() - CHUNK..]); + } + let digest: [u8; 20] = hasher.finalize().into(); + BlobId(digest) + } + + /// Computes a `BlobId` from the complete bytes (no truncation). + pub fn compute_from_bytes(bytes: &[u8]) -> Self { + let mut hasher = Sha1::new(); + write!(&mut hasher, "blob {}\0", bytes.len()).unwrap(); + hasher.update(bytes); + let digest: [u8; 20] = hasher.finalize().into(); + BlobId(digest) + } + + /// Parses a `BlobId` from a hex string. + #[inline] + pub fn from_hex(v: &str) -> crate::Result { + let bytes = hex::decode(v)?; + let arr: [u8; 20] = + bytes.as_slice().try_into().map_err(|_| crate::Error::InvalidBlobId(v.to_string()))?; + Ok(BlobId(arr)) + } + + /// Returns the blob ID as a hex string. + #[inline] + pub fn hex(&self) -> String { + hex::encode(self.0) + } + + /// Returns the raw bytes of the blob ID. 
+ #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.0 + } +} + +impl<'de> Deserialize<'de> for BlobId { + fn deserialize>(d: D) -> std::result::Result { + struct Vis; + impl serde::de::Visitor<'_> for Vis { + type Value = BlobId; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a 40-character hex string") + } + + fn visit_str( + self, + v: &str, + ) -> std::result::Result { + BlobId::from_hex(v).map_err(|e| serde::de::Error::custom(e)) + } + } + d.deserialize_str(Vis) + } +} + +impl std::fmt::Debug for BlobId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "BlobId({})", self.hex()) + } +} + +impl std::fmt::Display for BlobId { + #[inline] + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.hex()) + } +} + +impl JsonSchema for BlobId { + fn schema_name() -> String { + "BlobId".into() + } + + fn json_schema(gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema { + let s = String::json_schema(gen); + let mut o = s.into_object(); + o.string().pattern = Some("[0-9a-f]{40}".into()); + let md = o.metadata(); + md.description = Some("A hex-encoded blob ID as computed by Git".into()); + schemars::schema::Schema::Object(o) + } +} + +impl From for String { + #[inline] + fn from(blob_id: BlobId) -> String { + blob_id.hex() + } +} + +impl TryFrom<&str> for BlobId { + type Error = crate::Error; + + #[inline] + fn try_from(s: &str) -> std::result::Result { + BlobId::from_hex(s) + } +} + +impl<'a> From<&'a gix::ObjectId> for BlobId { + #[inline] + fn from(id: &'a gix::ObjectId) -> Self { + BlobId(id.as_bytes().try_into().expect("oid should be a 20-byte value")) + } +} + +impl From for BlobId { + #[inline] + fn from(id: gix::ObjectId) -> Self { + BlobId(id.as_bytes().try_into().expect("oid should be a 20-byte value")) + } +} + +impl<'a> From<&'a BlobId> for gix::ObjectId { + #[inline] + fn from(blob_id: &'a BlobId) -> Self { + 
gix::hash::ObjectId::try_from(blob_id.as_bytes()).unwrap() + } +} + +impl From for gix::ObjectId { + #[inline] + fn from(blob_id: BlobId) -> Self { + gix::hash::ObjectId::try_from(blob_id.as_bytes()).unwrap() + } +} + +/// A concurrent map with [`BlobId`] keys, optimized for low contention. +/// +/// This implementation uses 256 shards (based on the first byte of the blob ID) +/// to minimize lock contention during parallel scanning. +pub struct BlobIdMap { + maps: [Mutex>; 256], +} + +impl BlobIdMap { + /// Creates a new empty `BlobIdMap`. + pub fn new() -> Self { + BlobIdMap { maps: std::array::from_fn(|_| Mutex::new(FxHashMap::default())) } + } + + /// Inserts a value, returning the previous value if one existed. + #[inline] + pub fn insert(&self, blob_id: BlobId, v: V) -> Option { + let idx = blob_id.as_bytes()[0] as usize; + self.maps[idx].lock().insert(blob_id.into(), v) + } + + /// Returns true if the map contains the given key. + #[inline] + pub fn contains_key(&self, blob_id: &BlobId) -> bool { + let idx = blob_id.as_bytes()[0] as usize; + self.maps[idx].lock().contains_key(&ObjectId::from(blob_id)) + } + + /// Returns the total number of entries in the map. + /// + /// Note: This is not a cheap operation as it must lock all shards. + pub fn len(&self) -> usize { + self.maps.iter().map(|m| m.lock().len()).sum() + } + + /// Returns true if the map is empty. + pub fn is_empty(&self) -> bool { + self.maps.iter().all(|m| m.lock().is_empty()) + } +} + +impl BlobIdMap { + /// Gets a copy of the value for the given key. + #[inline] + pub fn get(&self, blob_id: &BlobId) -> Option { + let idx = blob_id.as_bytes()[0] as usize; + self.maps[idx].lock().get(&ObjectId::from(blob_id)).copied() + } +} + +impl Default for BlobIdMap { + fn default() -> Self { + Self::new() + } +} + +/// Metadata about a blob. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, JsonSchema)] +pub struct BlobMetadata { + /// The blob's content-based ID. 
+ pub id: BlobId, + + /// The length of the blob in bytes. + pub num_bytes: usize, + + /// The guessed MIME type of the blob (e.g., "text/plain"). + pub mime_essence: Option, + + /// The guessed programming language of the blob (e.g., "Python"). + pub language: Option, +} + +impl BlobMetadata { + /// Returns the size in bytes. + #[inline] + pub fn num_bytes(&self) -> usize { + self.num_bytes + } + + /// Returns the size in megabytes, rounded to 3 decimal places. + #[inline] + pub fn num_megabytes(&self) -> f64 { + let mb = self.num_bytes as f64 / 1_048_576.0; + format!("{:.3}", mb).parse::().unwrap_or(mb) + } + + /// Returns the MIME essence if known. + #[inline] + pub fn mime_essence(&self) -> Option<&str> { + self.mime_essence.as_deref() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_blob_id_empty() { + assert_eq!(BlobId::new(&[]).hex(), "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"); + } + + #[test] + fn test_blob_id_small() { + assert_eq!(BlobId::new(&vec![0; 1024]).hex(), "06d7405020018ddf3cacee90fd4af10487da3d20"); + } + + #[test] + fn test_blob_from_bytes() { + let blob = Blob::from_bytes(b"hello world".to_vec()); + assert_eq!(blob.len(), 11); + assert_eq!(blob.bytes(), b"hello world"); + } + + #[test] + fn test_blob_id_roundtrip() { + let original = BlobId::new(b"test data"); + let hex = original.hex(); + let parsed = BlobId::from_hex(&hex).unwrap(); + assert_eq!(original, parsed); + } +} diff --git a/crates/kingfisher-core/src/bstring_escape.rs b/crates/kingfisher-core/src/bstring_escape.rs new file mode 100644 index 0000000..26e4c82 --- /dev/null +++ b/crates/kingfisher-core/src/bstring_escape.rs @@ -0,0 +1,128 @@ +//! Safe string escaping utilities. +//! +//! This module provides utilities for safely displaying byte strings that may +//! contain non-UTF8 data, ANSI escape codes, or control characters. 
/// Escapes non-printing characters in a string while preserving whitespace.
///
/// Every control character that is not whitespace is rewritten as a Rust-style
/// unicode escape (for example NUL becomes `\u{0}`). The one exception is ESC
/// (`0x1B`), which is dropped outright rather than escaped. Whitespace,
/// including tabs and newlines, passes through untouched.
///
/// Returns borrowed data if no escaping was needed, avoiding allocations.
fn escape_nonprinting(s: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    // A character needs rewriting when it is a control character other than whitespace.
    let needs_rewrite = |ch: char| ch.is_control() && !ch.is_whitespace();

    // Fast path: nothing to rewrite, so hand the input back without allocating.
    if !s.chars().any(needs_rewrite) {
        return Cow::Borrowed(s);
    }

    // The input length is only a starting hint; escapes grow the output and
    // dropped ESC characters shrink it.
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            // Drop bare ESC characters; the rest of any escape sequence is kept as text.
            '\x1B' => {}
            // Spell out the remaining non-whitespace control characters.
            ch if needs_rewrite(ch) => escaped.extend(ch.escape_unicode()),
            // Everything else is copied unchanged.
            ch => escaped.push(ch),
        }
    }

    Cow::Owned(escaped)
}
Escape remaining control characters (except whitespace) +/// +/// # Examples +/// +/// ``` +/// use kingfisher_core::Escaped; +/// +/// let bytes = b"Hello\x00World"; +/// let escaped = Escaped(bytes); +/// assert_eq!(escaped.to_string(), "Hello\\u{0}World"); +/// ``` +#[derive(Debug, Clone, Copy)] +pub struct Escaped<'a>(pub &'a [u8]); + +impl Display for Escaped<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + // First handle UTF-8 decoding with replacement characters + let decoded = String::from_utf8_lossy(self.0); + // Then strip ANSI sequences and escape control chars + let stripped = strip_ansi_codes(&decoded); + let escaped = escape_nonprinting(&stripped); + f.write_str(&escaped) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_escape_normal_text() { + let input = "Hello, World!"; + let result = escape_nonprinting(input); + assert!(matches!(result, Cow::Borrowed(_))); + assert_eq!(result, "Hello, World!"); + } + + #[test] + fn test_escape_with_whitespace() { + let input = "Hello\n\t World!"; + let result = escape_nonprinting(input); + assert!(matches!(result, Cow::Borrowed(_))); + assert_eq!(result, "Hello\n\t World!"); + } + + #[test] + fn test_escape_control_chars() { + let input = "Hello\x00World\x01"; + let result = escape_nonprinting(input); + assert!(matches!(result, Cow::Owned(_))); + assert_eq!(result, "Hello\\u{0}World\\u{1}"); + } + + #[test] + fn test_escaped_struct_simple() { + let bytes = b"Hello World"; + let escaped = Escaped(bytes); + assert_eq!(escaped.to_string(), "Hello World"); + } + + #[test] + fn test_escaped_struct_ansi_codes() { + let bytes = b"\x1b[31mRed\x1b[0m \x1b[32mGreen\x1b[0m"; + let escaped = Escaped(bytes); + assert_eq!(escaped.to_string(), "Red Green"); + } + + #[test] + fn test_escaped_struct_invalid_utf8() { + let bytes = b"Hello\xFF\xFEWorld"; + let escaped = Escaped(bytes); + assert_eq!(escaped.to_string(), "Hello\u{FFFD}\u{FFFD}World"); + } + + #[test] + fn 
// File in the patch: crates/kingfisher-core/src/entropy.rs
//
// Shannon entropy calculation.
//
// Entropy is used to filter out low-entropy strings that are unlikely
// to be real secrets.

/// Calculates the Shannon entropy of a byte slice, in bits per byte.
///
/// Returns a value between 0.0 (empty input, or every byte identical) and 8.0
/// (all 256 byte values equally frequent). Typical thresholds for secret
/// detection are around 3.5-4.5.
///
/// # Examples
///
/// ```
/// use kingfisher_core::calculate_shannon_entropy;
///
/// // Low entropy (repeated character)
/// let entropy = calculate_shannon_entropy(b"aaaaaaaaaa");
/// assert!(entropy < 0.1);
///
/// // High entropy (random-looking)
/// let entropy = calculate_shannon_entropy(b"j2k#9K$mL*p&vN3");
/// assert!(entropy > 3.5);
/// ```
pub fn calculate_shannon_entropy(bytes: &[u8]) -> f32 {
    if bytes.is_empty() {
        return 0.0;
    }

    // Count occurrences of each byte value (0-255). The counters are 64-bit so
    // they cannot overflow, even for inputs of 4 GiB or more.
    let mut counts = [0u64; 256];
    for &byte in bytes {
        counts[usize::from(byte)] += 1;
    }

    // Accumulate in f64: an f32 cannot represent lengths above 2^24 exactly,
    // which would skew the probabilities for large inputs.
    let total_bytes = bytes.len() as f64;

    // Sum the entropy contribution of each byte value that appears at least once.
    let entropy = counts.iter().filter(|&&count| count > 0).fold(0.0_f64, |entropy, &count| {
        let probability = count as f64 / total_bytes;
        entropy - probability * probability.log2()
    });

    entropy as f32
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_entropy_empty() {
        let entropy = calculate_shannon_entropy(&[]);
        assert_eq!(entropy, 0.0);
        assert!(entropy.is_finite());
    }

    #[test]
    fn test_entropy_uniform() {
        // Single repeated byte should return 0.0
        let entropy = calculate_shannon_entropy(&[65, 65, 65, 65]);
        assert_eq!(entropy, 0.0);
        assert!(entropy.is_finite());
    }

    #[test]
    fn test_entropy_two_values() {
        // Even distribution of two bytes should be exactly 1.0
        let input = &[1, 2, 1, 2];
        let entropy = calculate_shannon_entropy(input);
        assert!((entropy - 1.0).abs() < 0.0001);
        assert!(entropy.is_finite());
    }

    #[test]
    fn test_entropy_password() {
        // Real password example should have mid-range entropy
        let password = "Password123!".as_bytes();
        let entropy = calculate_shannon_entropy(password);
        assert!(entropy > 2.5);
        assert!(entropy.is_finite());
    }

    #[test]
    fn test_entropy_random() {
        // Random-looking string should have high entropy
        let random = "j2k#9K$mL*p&vN3".as_bytes();
        let entropy = calculate_shannon_entropy(random);
        assert!(entropy > 3.5);
        assert!(entropy.is_finite());
    }
}
assert!(entropy.is_finite()); + } + + #[test] + fn test_entropy_two_values() { + // Even distribution of two bytes should be exactly 1.0 + let input = &[1, 2, 1, 2]; + let entropy = calculate_shannon_entropy(input); + assert!((entropy - 1.0).abs() < 0.0001); + assert!(entropy.is_finite()); + } + + #[test] + fn test_entropy_password() { + // Real password example should have mid-range entropy + let password = "Password123!".as_bytes(); + let entropy = calculate_shannon_entropy(password); + assert!(entropy > 2.5); + assert!(entropy.is_finite()); + } + + #[test] + fn test_entropy_random() { + // Random-looking string should have high entropy + let random = "j2k#9K$mL*p&vN3".as_bytes(); + let entropy = calculate_shannon_entropy(random); + assert!(entropy > 3.5); + assert!(entropy.is_finite()); + } +} diff --git a/crates/kingfisher-core/src/error.rs b/crates/kingfisher-core/src/error.rs new file mode 100644 index 0000000..b288d3c --- /dev/null +++ b/crates/kingfisher-core/src/error.rs @@ -0,0 +1,36 @@ +//! Error types for kingfisher-core. + +use thiserror::Error; + +/// The primary error type for kingfisher-core operations. +#[derive(Error, Debug)] +pub enum Error { + /// An I/O error occurred. + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + + /// Failed to parse a blob ID from hex. + #[error("Invalid blob ID: {0}")] + InvalidBlobId(String), + + /// A hex decoding error occurred. + #[error("Hex decode error: {0}")] + HexDecode(#[from] hex::FromHexError), + + /// Failed to open or read a Git repository. + #[error("Git error: {0}")] + Git(String), + + /// A generic error with a message. + #[error("{0}")] + Other(String), +} + +impl From for Error { + fn from(e: gix::open::Error) -> Self { + Error::Git(e.to_string()) + } +} + +/// A specialized Result type for kingfisher-core operations. 
+pub type Result = std::result::Result; diff --git a/crates/kingfisher-core/src/git_commit_metadata.rs b/crates/kingfisher-core/src/git_commit_metadata.rs new file mode 100644 index 0000000..4268d5d --- /dev/null +++ b/crates/kingfisher-core/src/git_commit_metadata.rs @@ -0,0 +1,159 @@ +//! Git commit metadata types. +//! +//! This module provides types for tracking commit information associated +//! with blobs found in git history. + +use gix::{date::Time, ObjectId}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +// Wrapper for serializing gix::date::Time as text +#[repr(transparent)] +#[derive(Serialize, Deserialize, Copy, Clone)] +#[serde(remote = "Time")] +struct TextTime( + #[serde( + getter = "text_time::getter", + serialize_with = "text_time::serialize", + deserialize_with = "text_time::deserialize" + )] + Time, +); + +impl From for Time { + fn from(v: TextTime) -> Self { + v.0 + } +} + +impl From