Merge pull request #213 from mongodb/development

v1.77.0
2026-02-02 10:39:34 -08:00 · 2026-02-02 10:39:34 -08:00 · 2f41d159e2
commit 2f41d159e2
parent c8b770be1d 301c656f38
360 changed files with 11759 additions and 4351 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -18,11 +18,25 @@ jobs:
    runs-on: ubuntu-24.04-arm
    steps:
      - uses: actions/checkout@v4
+      
+      # Free up disk space on Ubuntu runners
+      - name: Free Disk Space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo docker image prune --all --force
+          df -h
+      
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: ${{ env.RUST_TOOLCHAIN }}
          profile: minimal
          override: true
+      
+      - uses: swatinem/rust-cache@v2
+      
      - name: Build (Makefile linux-arm64)
        run: make ubuntu-arm64
      - name: Run tests
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -27,6 +27,16 @@ jobs:
    steps:
      - uses: actions/checkout@v4

+      # Free up disk space on Ubuntu runners
+      - name: Free Disk Space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo docker image prune --all --force
+          df -h
+
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: ${{ env.RUST_TOOLCHAIN }}
@ -81,12 +91,24 @@ jobs:
    steps:
      - uses: actions/checkout@v4

+      # Free up disk space on Ubuntu runners
+      - name: Free Disk Space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo docker image prune --all --force
+          df -h
+
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: ${{ env.RUST_TOOLCHAIN }}
          profile: minimal
          override: true

+      - uses: swatinem/rust-cache@v2
+
      - name: Install packaging tools
        run: cargo install cargo-deb cargo-generate-rpm

--- a/.pre-commit-hooks.yaml
+++ b/.pre-commit-hooks.yaml
@ -1,6 +1,14 @@
+- id: kingfisher-auto
+  name: kingfisher (auto-install)
+  description: Automatically downloads and caches the Kingfisher binary, then scans staged changes. No manual installation required.
+  entry: scripts/kingfisher-pre-commit-auto.sh
+  language: script
+  pass_filenames: false
+  stages: [commit]
+
 - id: kingfisher-docker
  name: kingfisher (docker)
-  description: Run Kingfisher in Docker against staged changes at the repository root. No local install required.
+  description: Run Kingfisher in Docker against staged changes at the repository root. Requires Docker but no local install.
  entry: ghcr.io/mongodb/kingfisher:latest
  language: docker
  args: ["scan", ".", "--staged", "--quiet", "--no-update-check"]
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,18 @@

 All notable changes to this project will be documented in this file.

+## [v1.77.0]
+- Added `kingfisher revoke` subcommand for revoking leaked credentials directly with the provider.
+- Added optional `revocation` section to rules to support credential revocation (currently supporting AWS, GCP, GitHub, GitLab, Slack, and Buildkite).
+- Added `kingfisher validate` subcommand to validate credentials without running a full scan.
+- Added `validate_command` and `revoke_command` fields to scan output (pretty, JSON, JSONL, BSON, SARIF formats) showing the exact `kingfisher validate` or `kingfisher revoke` command to run for each finding. The `validate_command` is included for all findings with validation support; `revoke_command` is included only for active credentials with revocation support. These fields are omitted when `--redact` is used since they contain the secret value.
+- Updated the HTML report viewer to display validate and revoke commands in the Finding Details panel with copy-to-clipboard functionality.
+- Refactored project into multiple crates for better modularity and maintainability.
+- Ensured more CLI arguments are global and available across all subcommands.
+- Added `kingfisher-auto` pre-commit hook that automatically downloads and caches the appropriate binary for your platform (no Docker or manual installation required).
+- Added Husky integration support with `install-husky.sh` helper script and documentation for Node.js projects.
+- Added `kingfisher-pre-commit-auto.sh` and `kingfisher-pre-commit-auto.ps1` scripts for automatic binary download in Git hooks (Linux, macOS, Windows support).
+
 ## [v1.76.0]
 - Fixed validation deduplication for rules with nested unnamed captures (e.g. `(?<REGEX>...(ABC|DEF)...)`) to use the primary capture for grouping, ensuring each unique match triggers a separate validation request.
 - Added trace-level (`-vv`) logging for internal validation dedup keys and grouping to aid debugging.
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,3 +1,12 @@
+[workspace]
+members = [
+    ".",
+    "crates/kingfisher-core",
+    "crates/kingfisher-rules",
+    "crates/kingfisher-scanner",
+]
+resolver = "2"
+
 [workspace.package]
 edition = "2021"
 rust-version = "1.90"
@ -10,7 +19,7 @@ publish = false

 [package]
 name = "kingfisher"
-version = "1.76.0"
+version = "1.77.0"
 description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
 edition.workspace = true
 rust-version.workspace = true
@ -40,6 +49,11 @@ assets = [
 ]

 [dependencies]
+# Library crates
+kingfisher-core = { path = "crates/kingfisher-core" }
+kingfisher-rules = { path = "crates/kingfisher-rules" }
+kingfisher-scanner = { path = "crates/kingfisher-scanner" }
+
 clap = { version = "4.5", features = [
    "cargo",
    "derive",
--- a/README.md
+++ b/README.md
--- a/crates/kingfisher-core/Cargo.toml
+++ b/crates/kingfisher-core/Cargo.toml
@ -0,0 +1,48 @@
+[package]
+name = "kingfisher-core"
+version = "0.1.0"
+description = "Core types and traits for Kingfisher secret scanner"
+edition.workspace = true
+rust-version.workspace = true
+license.workspace = true
+authors.workspace = true
+homepage.workspace = true
+repository.workspace = true
+publish.workspace = true
+
+[dependencies]
+# Serialization
+serde = { version = "1.0", features = ["derive", "rc"] }
+serde_json = "1.0"
+schemars = "0.8"
+
+# Error handling
+anyhow = "1.0"
+thiserror = "1.0"
+
+# Hashing and hex encoding
+sha1 = "0.10"
+hex = "0.4"
+
+# Memory mapping, lazy initialization, and locking
+memmap2 = "0.9"
+once_cell = "1.21"
+parking_lot = "0.12"
+
+# Collections
+smallvec = { version = "1", features = ["const_generics", "const_new", "union"] }
+rustc-hash = "2.1"
+dashmap = "6.1"
+
+# Byte string handling
+bstr = { version = "1.12", features = ["serde"] }
+memchr = "2.7"
+
+# Git types (minimal: ObjectId, Time, and the repository-open error)
+gix = { version = "0.73", default-features = false, features = ["serde"] }
+
+# Console formatting
+console = "0.15"
+
+[dev-dependencies]
+pretty_assertions = "1.4"
--- a/crates/kingfisher-core/src/blob.rs
+++ b/crates/kingfisher-core/src/blob.rs
@ -0,0 +1,495 @@
+//! Blob representation for scannable content.
+//!
+//! A [`Blob`] represents content that can be scanned for secrets. It can be
+//! created from:
+//! - In-memory bytes ([`Blob::from_bytes`])
+//! - A file path ([`Blob::from_file`])
+//! - Borrowed data ([`Blob::from_borrowed`])
+//!
+//! Large files are automatically memory-mapped for efficiency.
+
+use std::{
+    convert::TryInto,
+    fs::File,
+    io::{Read, Write},
+    path::Path,
+    sync::{
+        atomic::{AtomicU64, Ordering},
+        Arc,
+    },
+};
+
+use bstr::{BString, ByteSlice};
+use gix::ObjectId;
+use once_cell::sync::OnceCell;
+use parking_lot::Mutex;
+use rustc_hash::FxHashMap;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+use sha1::{Digest, Sha1};
+use smallvec::SmallVec;
+
+use crate::error::Result;
+use crate::git_commit_metadata::CommitMetadata;
+
+/// Size threshold, in bytes, above which [`Blob::from_file`] memory-maps a file
+/// instead of reading it into memory.
+///
+/// The comparison is strict (`file_size > LARGE_FILE_THRESHOLD`), so with the
+/// current value of `0` every non-empty file is memory-mapped and only
+/// zero-length files take the read-into-memory path.
+const LARGE_FILE_THRESHOLD: u64 = 0; // Currently: mmap every non-empty file
+
+/// Global counter handing out temporary blob IDs.
+///
+/// Starts at 1; each `Blob` constructor takes the next value with a relaxed
+/// `fetch_add`.
+static NEXT_ID: AtomicU64 = AtomicU64::new(1);
+
+/// Records one place a blob was seen in git history: the commit and the path
+/// within the repository.
+#[derive(Clone, Debug, serde::Serialize)]
+pub struct BlobAppearance {
+    /// Metadata about the commit where this blob appeared (shared via `Arc`).
+    pub commit_metadata: Arc<CommitMetadata>,
+
+    /// The path of the blob within the repository, kept as a byte string;
+    /// use [`BlobAppearance::path`] to view it as a `Path`.
+    pub path: BString,
+}
+
+impl BlobAppearance {
+    /// Views the stored byte-string path as a [`Path`].
+    ///
+    /// The conversion is delegated to `bstr`'s `ByteSlice::to_path`; a
+    /// `bstr::Utf8Error` is returned when that conversion rejects the bytes.
+    #[inline]
+    pub fn path(&self) -> std::result::Result<&Path, bstr::Utf8Error> {
+        let raw: &[u8] = self.path.as_slice();
+        raw.to_path()
+    }
+}
+
+/// A set of [`BlobAppearance`] entries, optimized for the common case of a single appearance:
+/// the backing `SmallVec` has inline capacity for exactly one element.
+pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
+
+/// The underlying data storage for a [`Blob`].
+///
+/// Every variant exposes its contents as `&[u8]` through `AsRef<[u8]>`.
+pub enum BlobData<'a> {
+    /// Bytes owned by the blob (e.g. from [`Blob::from_bytes`], [`Blob::new`],
+    /// or the read-into-memory path of [`Blob::from_file`]).
+    Owned(Vec<u8>),
+
+    /// File contents memory-mapped from disk by [`Blob::from_file`].
+    Mapped(memmap2::Mmap),
+
+    /// Bytes borrowed for `'a` (e.g., from a git pack file).
+    Borrowed(&'a [u8]),
+}
+
+impl AsRef<[u8]> for BlobData<'_> {
+    /// Returns the stored bytes, whichever variant holds them.
+    fn as_ref(&self) -> &[u8] {
+        match self {
+            Self::Owned(bytes) => bytes.as_slice(),
+            Self::Mapped(mapping) => &mapping[..],
+            Self::Borrowed(bytes) => *bytes,
+        }
+    }
+}
+
+impl<'a> BlobData<'a> {
+    /// Number of bytes held, regardless of the backing variant.
+    #[inline]
+    pub fn len(&self) -> usize {
+        let bytes: &[u8] = self.as_ref();
+        bytes.len()
+    }
+
+    /// Whether the data holds zero bytes.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        let bytes: &[u8] = self.as_ref();
+        bytes.is_empty()
+    }
+}
+
+/// A scannable blob of content.
+///
+/// `Blob` is the primary type for representing content to be scanned. It lazily
+/// computes a content-based ID (see [`BlobId::new`]) and supports multiple
+/// backing storage types (see [`BlobData`]).
+///
+/// # Examples
+///
+/// ```
+/// use kingfisher_core::Blob;
+///
+/// // Create from bytes
+/// let blob = Blob::from_bytes(b"my secret content".to_vec());
+/// assert_eq!(blob.len(), 17);
+///
+/// // Create from file
+/// // let blob = Blob::from_file("path/to/file.txt")?;
+/// ```
+pub struct Blob<'a> {
+    /// Content-based ID: computed on first access, or supplied up front by [`Blob::new`].
+    id: OnceCell<BlobId>,
+    /// The underlying data.
+    data: BlobData<'a>,
+    /// Temporary ID taken from the global `NEXT_ID` counter at creation (for debugging/tracking).
+    temp_id: u64,
+}
+
+impl Blob<'_> {
+    /// Opens `path` and wraps its contents in a `Blob`.
+    ///
+    /// Files larger than `LARGE_FILE_THRESHOLD` bytes are memory-mapped; anything
+    /// else is read fully into an owned buffer. The content ID is not computed here.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any I/O error from opening, inspecting, mapping, or reading the file.
+    #[inline]
+    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
+        let mut handle = File::open(&path)?;
+        let size = handle.metadata()?.len();
+        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+
+        let data = if size > LARGE_FILE_THRESHOLD {
+            // Map the file once; no intermediate copy is made.
+            // NOTE(review): presumably callers accept that the file must not be
+            // truncated or rewritten while the mapping is alive -- confirm.
+            BlobData::Mapped(unsafe { memmap2::Mmap::map(&handle)? })
+        } else {
+            // Read the whole file into a buffer pre-sized from its metadata.
+            let mut buf = Vec::with_capacity(size as usize);
+            handle.read_to_end(&mut buf)?;
+            BlobData::Owned(buf)
+        };
+
+        Ok(Blob { id: OnceCell::new(), data, temp_id })
+    }
+
+    /// Wraps an owned byte vector; the content ID is computed lazily.
+    #[inline]
+    pub fn from_bytes(bytes: Vec<u8>) -> Self {
+        Blob {
+            id: OnceCell::new(),
+            data: BlobData::Owned(bytes),
+            temp_id: NEXT_ID.fetch_add(1, Ordering::Relaxed),
+        }
+    }
+
+    /// Wraps an owned byte vector together with an already-known ID, so the
+    /// ID is never recomputed from the contents.
+    #[inline]
+    pub fn new(id: BlobId, bytes: Vec<u8>) -> Self {
+        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+        Blob { id: OnceCell::with_value(id), data: BlobData::Owned(bytes), temp_id }
+    }
+
+    /// Borrows the blob's content as a byte slice.
+    #[inline]
+    pub fn bytes(&self) -> &[u8] {
+        let content: &[u8] = self.data.as_ref();
+        content
+    }
+
+    /// Returns the blob's [`BlobId`] by value, computing it on first use.
+    #[inline]
+    pub fn id(&self) -> BlobId {
+        *self.id_ref()
+    }
+
+    /// Returns a reference to the blob's [`BlobId`], computing it on first use.
+    #[inline]
+    pub fn id_ref(&self) -> &BlobId {
+        self.id.get_or_init(|| BlobId::new(self.data.as_ref()))
+    }
+
+    /// Returns the temporary ID assigned when this blob was created.
+    #[inline]
+    pub fn temp_id(&self) -> u64 {
+        self.temp_id
+    }
+
+    /// Returns the size of the blob's content in bytes.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.data.len()
+    }
+
+    /// Returns true if the blob has no content.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.data.is_empty()
+    }
+}
+
+impl<'a> Blob<'a> {
+    /// Wraps a borrowed byte slice without copying it.
+    ///
+    /// Intended for zero-copy scanning of data that is already in memory
+    /// (e.g., from a git pack file). The content ID is computed lazily.
+    #[inline]
+    pub fn from_borrowed(bytes: &'a [u8]) -> Self {
+        Blob {
+            id: OnceCell::new(),
+            data: BlobData::Borrowed(bytes),
+            temp_id: NEXT_ID.fetch_add(1, Ordering::Relaxed),
+        }
+    }
+}
+
+impl Drop for Blob<'_> {
+    fn drop(&mut self) {
+        // Release an owned buffer's heap allocation right away; the other
+        // variants get no special handling here.
+        if let BlobData::Owned(buf) = &mut self.data {
+            drop(std::mem::take(buf));
+        }
+    }
+}
+
+/// A content-based identifier for a blob: a 20-byte SHA-1 digest over a
+/// Git-style `blob <len>\0` header followed by content bytes.
+///
+/// Note that [`BlobId::new`] hashes only the first and last 64 KiB of inputs
+/// larger than 128 KiB, so for such inputs the ID is not a hash of the full
+/// content; [`BlobId::compute_from_bytes`] always hashes everything.
+///
+/// Serializes as a hex string (via the `Into<String>` conversion).
+#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone, Serialize)]
+#[serde(into = "String")]
+pub struct BlobId([u8; 20]);
+
+impl BlobId {
+    /// Returns the all-zero `BlobId`.
+    pub fn default() -> Self {
+        Self([0u8; 20])
+    }
+
+    /// Hashes `input` into a `BlobId`.
+    ///
+    /// The digest covers a `blob <len>\0` header plus the content. Inputs of up
+    /// to 128 KiB are hashed in full; for larger inputs only the first and last
+    /// 64 KiB are fed to the hasher (the header still carries the full length).
+    #[inline]
+    pub fn new(input: &[u8]) -> Self {
+        // Number of bytes sampled from each end of a large input.
+        const CHUNK: usize = 64 * 1024;
+
+        let total = input.len();
+        let mut sha = Sha1::new();
+        write!(&mut sha, "blob {}\0", total).unwrap();
+        if total > CHUNK * 2 {
+            let (head, tail) = (&input[..CHUNK], &input[total - CHUNK..]);
+            sha.update(head);
+            sha.update(tail);
+        } else {
+            sha.update(input);
+        }
+        Self(sha.finalize().into())
+    }
+
+    /// Hashes the header and every byte of `bytes`, with no sampling.
+    pub fn compute_from_bytes(bytes: &[u8]) -> Self {
+        let mut sha = Sha1::new();
+        write!(&mut sha, "blob {}\0", bytes.len()).unwrap();
+        sha.update(bytes);
+        Self(sha.finalize().into())
+    }
+
+    /// Parses a `BlobId` from a hex string.
+    ///
+    /// # Errors
+    ///
+    /// Returns a hex-decoding error for malformed hex, or
+    /// `Error::InvalidBlobId` when the decoded value is not exactly 20 bytes.
+    #[inline]
+    pub fn from_hex(v: &str) -> crate::Result<Self> {
+        let decoded = hex::decode(v)?;
+        match <[u8; 20]>::try_from(decoded.as_slice()) {
+            Ok(raw) => Ok(Self(raw)),
+            Err(_) => Err(crate::Error::InvalidBlobId(v.to_string())),
+        }
+    }
+
+    /// Hex-encodes the 20 raw bytes.
+    #[inline]
+    pub fn hex(&self) -> String {
+        let Self(raw) = self;
+        hex::encode(raw)
+    }
+
+    /// Borrows the 20 raw bytes of the ID.
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.0[..]
+    }
+}
+
+impl<'de> Deserialize<'de> for BlobId {
+    /// Deserializes a `BlobId` from a string, parsing it with [`BlobId::from_hex`].
+    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
+        /// Visitor that accepts a hex string and turns it into a `BlobId`.
+        struct HexVisitor;
+
+        impl serde::de::Visitor<'_> for HexVisitor {
+            type Value = BlobId;
+
+            fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+                f.write_str("a 40-character hex string")
+            }
+
+            fn visit_str<E: serde::de::Error>(
+                self,
+                text: &str,
+            ) -> std::result::Result<Self::Value, E> {
+                // Surface parse failures as serde "custom" errors.
+                BlobId::from_hex(text).map_err(E::custom)
+            }
+        }
+
+        d.deserialize_str(HexVisitor)
+    }
+}
+
+impl std::fmt::Debug for BlobId {
+    /// Formats as `BlobId(<hex>)`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let hex = self.hex();
+        write!(f, "BlobId({hex})")
+    }
+}
+
+impl std::fmt::Display for BlobId {
+    /// Formats as the bare hex string.
+    #[inline]
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(&self.hex())
+    }
+}
+
+impl JsonSchema for BlobId {
+    /// Name under which this type appears in generated schemas.
+    fn schema_name() -> String {
+        "BlobId".into()
+    }
+
+    /// Describes a `BlobId` as a string of exactly 40 lowercase hex digits,
+    /// matching what [`BlobId::hex`] emits.
+    fn json_schema(gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
+        // Start from the plain string schema and tighten it.
+        let s = String::json_schema(gen);
+        let mut o = s.into_object();
+        // JSON Schema patterns are not implicitly anchored, so anchor both ends;
+        // otherwise any string merely *containing* 40 hex digits would validate.
+        o.string().pattern = Some("^[0-9a-f]{40}$".into());
+        let md = o.metadata();
+        md.description = Some("A hex-encoded blob ID as computed by Git".into());
+        schemars::schema::Schema::Object(o)
+    }
+}
+
+impl From<BlobId> for String {
+    /// Converts the ID into its hex-string form.
+    #[inline]
+    fn from(blob_id: BlobId) -> String {
+        hex::encode(blob_id.0)
+    }
+}
+
+impl TryFrom<&str> for BlobId {
+    type Error = crate::Error;
+
+    /// Parses a hex string; equivalent to [`BlobId::from_hex`].
+    #[inline]
+    fn try_from(s: &str) -> std::result::Result<Self, Self::Error> {
+        Self::from_hex(s)
+    }
+}
+
+impl<'a> From<&'a gix::ObjectId> for BlobId {
+    /// Copies the raw bytes of a Git object ID into a `BlobId`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the object ID is not exactly 20 bytes long.
+    #[inline]
+    fn from(id: &'a gix::ObjectId) -> Self {
+        let raw: [u8; 20] = id.as_bytes().try_into().expect("oid should be a 20-byte value");
+        BlobId(raw)
+    }
+}
+
+impl From<gix::ObjectId> for BlobId {
+    /// By-value counterpart of the `&gix::ObjectId` conversion; panics under
+    /// the same condition (an object ID that is not 20 bytes long).
+    #[inline]
+    fn from(id: gix::ObjectId) -> Self {
+        BlobId::from(&id)
+    }
+}
+
+impl<'a> From<&'a BlobId> for gix::ObjectId {
+    /// Builds a Git object ID from the blob ID's raw bytes.
+    ///
+    /// Panics if `gix` rejects the byte slice.
+    #[inline]
+    fn from(blob_id: &'a BlobId) -> Self {
+        let raw: &[u8] = blob_id.as_bytes();
+        gix::hash::ObjectId::try_from(raw).unwrap()
+    }
+}
+
+impl From<BlobId> for gix::ObjectId {
+    /// By-value counterpart of the `&BlobId` conversion.
+    #[inline]
+    fn from(blob_id: BlobId) -> Self {
+        gix::ObjectId::from(&blob_id)
+    }
+}
+
+/// A concurrent map with [`BlobId`] keys, optimized for low contention.
+///
+/// This implementation uses 256 shards (based on the first byte of the blob ID)
+/// to minimize lock contention during parallel scanning.
+pub struct BlobIdMap<V> {
+    /// One mutex-protected hash map per possible first byte of a blob ID.
+    /// Keys are stored as `ObjectId` values converted from the `BlobId`.
+    maps: [Mutex<FxHashMap<ObjectId, V>>; 256],
+}
+
+impl<V> BlobIdMap<V> {
+    /// Builds a map whose 256 shards are all empty.
+    pub fn new() -> Self {
+        let maps = std::array::from_fn(|_| Mutex::new(FxHashMap::default()));
+        BlobIdMap { maps }
+    }
+
+    /// Stores `v` under `blob_id`, handing back the value it replaced, if any.
+    #[inline]
+    pub fn insert(&self, blob_id: BlobId, v: V) -> Option<V> {
+        let key: ObjectId = blob_id.into();
+        // The first byte of the ID selects the shard.
+        let mut shard = self.maps[usize::from(blob_id.as_bytes()[0])].lock();
+        shard.insert(key, v)
+    }
+
+    /// Reports whether an entry exists for `blob_id`.
+    #[inline]
+    pub fn contains_key(&self, blob_id: &BlobId) -> bool {
+        let key = ObjectId::from(blob_id);
+        let shard = self.maps[usize::from(blob_id.as_bytes()[0])].lock();
+        shard.contains_key(&key)
+    }
+
+    /// Counts the entries across all shards.
+    ///
+    /// Note: this locks every shard in turn, so it is not a cheap operation.
+    pub fn len(&self) -> usize {
+        let mut total = 0;
+        for shard in &self.maps {
+            total += shard.lock().len();
+        }
+        total
+    }
+
+    /// Reports whether every shard is empty, stopping at the first non-empty one.
+    pub fn is_empty(&self) -> bool {
+        !self.maps.iter().any(|shard| !shard.lock().is_empty())
+    }
+}
+
+impl<V: Copy> BlobIdMap<V> {
+    /// Looks up `blob_id` and returns a copy of its value, if present.
+    #[inline]
+    pub fn get(&self, blob_id: &BlobId) -> Option<V> {
+        let key = ObjectId::from(blob_id);
+        // The first byte of the ID selects the shard.
+        let shard = self.maps[usize::from(blob_id.as_bytes()[0])].lock();
+        shard.get(&key).copied()
+    }
+}
+
+impl<V> Default for BlobIdMap<V> {
+    /// Same as [`BlobIdMap::new`]: an empty map.
+    fn default() -> Self {
+        BlobIdMap::new()
+    }
+}
+
+/// Metadata about a blob: its ID, size, and optional content-type guesses.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, JsonSchema)]
+pub struct BlobMetadata {
+    /// The blob's content-based ID.
+    pub id: BlobId,
+
+    /// The length of the blob in bytes.
+    pub num_bytes: usize,
+
+    /// The guessed MIME type of the blob (e.g., "text/plain"), if any.
+    pub mime_essence: Option<String>,
+
+    /// The guessed programming language of the blob (e.g., "Python"), if any.
+    pub language: Option<String>,
+}
+
+impl BlobMetadata {
+    /// Size of the blob in bytes.
+    #[inline]
+    pub fn num_bytes(&self) -> usize {
+        self.num_bytes
+    }
+
+    /// Size of the blob in megabytes (1 MB = 1,048,576 bytes), rounded to
+    /// three decimal places.
+    #[inline]
+    pub fn num_megabytes(&self) -> f64 {
+        let megabytes = self.num_bytes as f64 / 1_048_576.0;
+        // Round by formatting to three decimals and parsing the text back;
+        // fall back to the unrounded value if parsing fails.
+        match format!("{:.3}", megabytes).parse::<f64>() {
+            Ok(rounded) => rounded,
+            Err(_) => megabytes,
+        }
+    }
+
+    /// The guessed MIME essence as a string slice, if one is recorded.
+    #[inline]
+    pub fn mime_essence(&self) -> Option<&str> {
+        self.mime_essence.as_ref().map(String::as_str)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Empty input produces the expected fixed ID.
+    #[test]
+    fn test_blob_id_empty() {
+        let id = BlobId::new(b"");
+        assert_eq!(id.hex(), "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391");
+    }
+
+    /// 1 KiB of zero bytes (well below the sampling cutoff) is hashed in full.
+    #[test]
+    fn test_blob_id_small() {
+        let zeros = [0u8; 1024];
+        assert_eq!(BlobId::new(&zeros).hex(), "06d7405020018ddf3cacee90fd4af10487da3d20");
+    }
+
+    /// A blob built from bytes reports those bytes and their length.
+    #[test]
+    fn test_blob_from_bytes() {
+        let content = b"hello world";
+        let blob = Blob::from_bytes(content.to_vec());
+        assert_eq!(blob.len(), 11);
+        assert_eq!(blob.bytes(), content);
+    }
+
+    /// Hex-encoding an ID and parsing it back yields the same ID.
+    #[test]
+    fn test_blob_id_roundtrip() {
+        let original = BlobId::new(b"test data");
+        let parsed = BlobId::from_hex(&original.hex()).unwrap();
+        assert_eq!(original, parsed);
+    }
+}
--- a/crates/kingfisher-core/src/bstring_escape.rs
+++ b/crates/kingfisher-core/src/bstring_escape.rs
@ -0,0 +1,128 @@
+//! Safe string escaping utilities.
+//!
+//! This module provides utilities for safely displaying byte strings that may
+//! contain non-UTF8 data, ANSI escape codes, or control characters.
+
+use std::{
+    borrow::Cow,
+    fmt::{Display, Formatter},
+};
+
+use console::strip_ansi_codes;
+
+/// Escapes non-printing characters in a string while preserving whitespace.
+///
+/// Non-whitespace control characters are rendered as `\u{..}` escapes, except
+/// for ESC (`0x1B`), which is dropped outright so that no escape-sequence
+/// introducer survives in the output. All other characters pass through unchanged.
+///
+/// Returns borrowed data if no escaping was needed, avoiding allocations.
+fn escape_nonprinting(s: &str) -> Cow<'_, str> {
+    // A character needs handling when it is a control character that is not whitespace.
+    let needs_escape = |ch: char| ch.is_control() && !ch.is_whitespace();
+
+    // Fast path - return original if nothing needs escaping
+    if !s.chars().any(needs_escape) {
+        return Cow::Borrowed(s);
+    }
+
+    // Allocate with extra capacity for possible escape sequences
+    let mut escaped = String::with_capacity(s.len() * 2);
+
+    for ch in s.chars() {
+        match ch {
+            // Drop bare ESC characters (the introducer of ANSI escape sequences)
+            '\x1B' => {}
+            // `escape_unicode` yields the characters of the `\u{..}` form
+            ch if needs_escape(ch) => escaped.extend(ch.escape_unicode()),
+            // Pass through all other characters unchanged
+            ch => escaped.push(ch),
+        }
+    }
+
+    Cow::Owned(escaped)
+}
+
+/// A wrapper around `&[u8]` that provides safe string formatting.
+///
+/// When displayed, `Escaped` will:
+/// 1. Convert from UTF-8 with replacement of invalid sequences
+/// 2. Remove ANSI control sequences (via `console::strip_ansi_codes`)
+/// 3. Escape remaining control characters (except whitespace); any ESC
+///    character still present at that point is dropped rather than escaped
+///
+/// # Examples
+///
+/// ```
+/// use kingfisher_core::Escaped;
+///
+/// let bytes = b"Hello\x00World";
+/// let escaped = Escaped(bytes);
+/// assert_eq!(escaped.to_string(), "Hello\\u{0}World");
+/// ```
+#[derive(Debug, Clone, Copy)]
+pub struct Escaped<'a>(pub &'a [u8]);
+
+impl Display for Escaped<'_> {
+    /// Writes the wrapped bytes as sanitized text: lossy UTF-8 decoding, then
+    /// ANSI-sequence stripping, then control-character escaping.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let Escaped(raw) = *self;
+        // Invalid UTF-8 becomes replacement characters.
+        let lossy = String::from_utf8_lossy(raw);
+        let without_ansi = strip_ansi_codes(&lossy);
+        f.write_str(&escape_nonprinting(&without_ansi))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Plain text is returned borrowed and unchanged.
+    #[test]
+    fn test_escape_normal_text() {
+        let result = escape_nonprinting("Hello, World!");
+        assert!(matches!(result, Cow::Borrowed(_)));
+        assert_eq!(result, "Hello, World!");
+    }
+
+    /// Whitespace control characters do not trigger escaping.
+    #[test]
+    fn test_escape_with_whitespace() {
+        let result = escape_nonprinting("Hello\n\t World!");
+        assert!(matches!(result, Cow::Borrowed(_)));
+        assert_eq!(result, "Hello\n\t World!");
+    }
+
+    /// Non-whitespace control characters become `\u{..}` escapes in an owned string.
+    #[test]
+    fn test_escape_control_chars() {
+        let result = escape_nonprinting("Hello\x00World\x01");
+        assert!(matches!(result, Cow::Owned(_)));
+        assert_eq!(result, "Hello\\u{0}World\\u{1}");
+    }
+
+    /// Plain bytes display unchanged.
+    #[test]
+    fn test_escaped_struct_simple() {
+        assert_eq!(Escaped(b"Hello World").to_string(), "Hello World");
+    }
+
+    /// ANSI color sequences are removed from the output.
+    #[test]
+    fn test_escaped_struct_ansi_codes() {
+        let colored = Escaped(b"\x1b[31mRed\x1b[0m \x1b[32mGreen\x1b[0m");
+        assert_eq!(colored.to_string(), "Red Green");
+    }
+
+    /// Invalid UTF-8 bytes are shown as replacement characters.
+    #[test]
+    fn test_escaped_struct_invalid_utf8() {
+        let invalid = Escaped(b"Hello\xFF\xFEWorld");
+        assert_eq!(invalid.to_string(), "Hello\u{FFFD}\u{FFFD}World");
+    }
+
+    /// Empty input displays as the empty string.
+    #[test]
+    fn test_escaped_struct_empty() {
+        assert_eq!(Escaped(b"").to_string(), "");
+    }
+}
--- a/crates/kingfisher-core/src/entropy.rs
+++ b/crates/kingfisher-core/src/entropy.rs
@ -0,0 +1,89 @@
+//! Shannon entropy calculation.
+//!
+//! Entropy is used to filter out low-entropy strings that are unlikely
+//! to be real secrets.
+
+/// Calculates the Shannon entropy of a byte slice, in bits per byte.
+///
+/// Returns a value between 0.0 (every byte is identical, or the input is
+/// empty) and 8.0 (all 256 byte values equally frequent). Typical thresholds
+/// for secret detection are around 3.5-4.5.
+///
+/// # Examples
+///
+/// ```
+/// use kingfisher_core::calculate_shannon_entropy;
+///
+/// // Low entropy (repeated character)
+/// let entropy = calculate_shannon_entropy(b"aaaaaaaaaa");
+/// assert!(entropy < 0.1);
+///
+/// // High entropy (random-looking)
+/// let entropy = calculate_shannon_entropy(b"j2k#9K$mL*p&vN3");
+/// assert!(entropy > 3.5);
+/// ```
+pub fn calculate_shannon_entropy(bytes: &[u8]) -> f32 {
+    if bytes.is_empty() {
+        return 0.0;
+    }
+
+    // Count occurrences of each byte value (0-255). `usize` counters cannot
+    // overflow: no single count can exceed the slice length.
+    let mut counts = [0usize; 256];
+    for &byte in bytes {
+        counts[usize::from(byte)] += 1;
+    }
+
+    let total_bytes = bytes.len() as f32;
+
+    // Sum entropy contribution for each byte that appears at least once
+    counts.iter().filter(|&&count| count > 0).fold(0.0, |entropy, &count| {
+        let probability = count as f32 / total_bytes;
+        entropy - probability * probability.log2()
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Empty input yields exactly zero.
+    #[test]
+    fn test_entropy_empty() {
+        let value = calculate_shannon_entropy(&[]);
+        assert_eq!(value, 0.0);
+        assert!(value.is_finite());
+    }
+
+    /// A single repeated byte value carries no information.
+    #[test]
+    fn test_entropy_uniform() {
+        let value = calculate_shannon_entropy(&[65, 65, 65, 65]);
+        assert_eq!(value, 0.0);
+        assert!(value.is_finite());
+    }
+
+    /// Two byte values in equal proportion give one bit per byte.
+    #[test]
+    fn test_entropy_two_values() {
+        let value = calculate_shannon_entropy(&[1, 2, 1, 2]);
+        assert!((value - 1.0).abs() < 0.0001);
+        assert!(value.is_finite());
+    }
+
+    /// A typical password lands in the mid range.
+    #[test]
+    fn test_entropy_password() {
+        let value = calculate_shannon_entropy(b"Password123!");
+        assert!(value > 2.5);
+        assert!(value.is_finite());
+    }
+
+    /// A random-looking string scores high.
+    #[test]
+    fn test_entropy_random() {
+        let value = calculate_shannon_entropy(b"j2k#9K$mL*p&vN3");
+        assert!(value > 3.5);
+        assert!(value.is_finite());
+    }
+}
--- a/crates/kingfisher-core/src/error.rs
+++ b/crates/kingfisher-core/src/error.rs
@ -0,0 +1,36 @@
+//! Error types for kingfisher-core.
+
+use thiserror::Error;
+
+/// The primary error type for kingfisher-core operations.
+///
+/// `std::io::Error` and `hex::FromHexError` convert into this type
+/// automatically (via `#[from]`), so `?` can be used on them directly.
+#[derive(Error, Debug)]
+pub enum Error {
+    /// An I/O error occurred.
+    #[error("I/O error: {0}")]
+    Io(#[from] std::io::Error),
+
+    /// Failed to parse a blob ID from hex; carries the offending input.
+    #[error("Invalid blob ID: {0}")]
+    InvalidBlobId(String),
+
+    /// A hex decoding error occurred.
+    #[error("Hex decode error: {0}")]
+    HexDecode(#[from] hex::FromHexError),
+
+    /// Failed to open or read a Git repository; carries the error message as text.
+    #[error("Git error: {0}")]
+    Git(String),
+
+    /// A generic error with a message.
+    #[error("{0}")]
+    Other(String),
+}
+
+impl From<gix::open::Error> for Error {
+    /// Wraps a repository-open failure as [`Error::Git`], keeping only its message text.
+    fn from(e: gix::open::Error) -> Self {
+        let message = e.to_string();
+        Self::Git(message)
+    }
+}
+
+/// A specialized Result type for kingfisher-core operations, with the error
+/// type fixed to [`Error`].
+pub type Result<T> = std::result::Result<T, Error>;
--- a/crates/kingfisher-core/src/git_commit_metadata.rs
+++ b/crates/kingfisher-core/src/git_commit_metadata.rs
@ -0,0 +1,159 @@
+//! Git commit metadata types.
+//!
+//! This module provides types for tracking commit information associated
+//! with blobs found in git history.
+
+use gix::{date::Time, ObjectId};
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+// Wrapper for serializing gix::date::Time as text.
+//
+// `#[serde(remote = "Time")]` makes the derive emit (de)serialization code for the foreign
+// `Time` type, so fields elsewhere can opt in with `#[serde(with = "TextTime")]`. The actual
+// conversion is delegated to the functions in the `text_time` module named below.
+#[repr(transparent)]
+#[derive(Serialize, Deserialize, Copy, Clone)]
+#[serde(remote = "Time")]
+struct TextTime(
+    #[serde(
+        getter = "text_time::getter",
+        serialize_with = "text_time::serialize",
+        deserialize_with = "text_time::deserialize"
+    )]
+    Time,
+);
+
+impl From<TextTime> for Time {
+    /// Unwraps the serialization shim, yielding the timestamp it holds.
+    fn from(wrapper: TextTime) -> Self {
+        let TextTime(inner) = wrapper;
+        inner
+    }
+}
+
+impl From<Time> for TextTime {
+    /// Wraps a timestamp in the serialization shim.
+    fn from(time: Time) -> Self {
+        TextTime(time)
+    }
+}
+
+mod text_time {
+    //! Helper functions referenced by the serde attributes on `TextTime`.
+
+    use super::*;
+
+    /// Field accessor required by the remote derive; the value is the `Time` itself.
+    #[inline]
+    pub fn getter(time: &Time) -> &Time {
+        time
+    }
+
+    /// Serializes the timestamp as a string built from its `Display` output.
+    #[inline]
+    pub fn serialize<S: serde::Serializer>(time: &Time, ser: S) -> Result<S::Ok, S::Error> {
+        ser.collect_str(time)
+    }
+
+    /// Deserializes a timestamp from a string by handing it to `gix::date::parse`.
+    pub fn deserialize<'de, D: serde::Deserializer<'de>>(de: D) -> Result<Time, D::Error> {
+        struct TimeVisitor;
+
+        impl<'v> serde::de::Visitor<'v> for TimeVisitor {
+            type Value = Time;
+
+            fn expecting(&self, out: &mut std::fmt::Formatter) -> std::fmt::Result {
+                out.write_str("a string representing a Git timestamp")
+            }
+
+            fn visit_str<E: serde::de::Error>(self, text: &str) -> Result<Self::Value, E> {
+                match gix::date::parse(text, None) {
+                    Ok(time) => Ok(time),
+                    Err(err) => Err(E::custom(err)),
+                }
+            }
+        }
+
+        de.deserialize_str(TimeVisitor)
+    }
+}
+
+impl JsonSchema for TextTime {
+    /// Name under which this type appears in generated schemas.
+    fn schema_name() -> String {
+        String::from("Time")
+    }
+
+    /// Timestamps are serialized as text, so the plain string schema is reused.
+    fn json_schema(generator: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
+        <String as JsonSchema>::json_schema(generator)
+    }
+}
+
+// Wrapper for serializing gix::ObjectId as hex.
+//
+// `#[serde(remote = "ObjectId")]` makes the derive emit (de)serialization code for the foreign
+// `ObjectId` type, so fields elsewhere can opt in with `#[serde(with = "HexObjectId")]`. The
+// actual conversion is delegated to the functions in the `hex_object_id` module named below.
+#[repr(transparent)]
+#[derive(Serialize, Deserialize, Copy, Clone)]
+#[serde(remote = "ObjectId")]
+struct HexObjectId(
+    #[serde(
+        getter = "hex_object_id::getter",
+        serialize_with = "hex_object_id::serialize",
+        deserialize_with = "hex_object_id::deserialize"
+    )]
+    ObjectId,
+);
+
+impl From<ObjectId> for HexObjectId {
+    /// Wraps an object ID in the hex serialization shim.
+    fn from(id: ObjectId) -> Self {
+        Self(id)
+    }
+}
+
+impl From<HexObjectId> for ObjectId {
+    /// Unwraps the hex serialization shim, yielding the object ID it holds.
+    fn from(wrapper: HexObjectId) -> Self {
+        let HexObjectId(inner) = wrapper;
+        inner
+    }
+}
+
+mod hex_object_id {
+    //! Helper functions referenced by the serde attributes on `HexObjectId`.
+
+    use super::*;
+
+    /// Field accessor required by the remote derive; the value is the `ObjectId` itself.
+    #[inline]
+    pub fn getter(id: &ObjectId) -> &ObjectId {
+        id
+    }
+
+    /// Serializes the object ID as the string produced by `to_hex()`.
+    #[inline]
+    pub fn serialize<S: serde::Serializer>(id: &ObjectId, ser: S) -> Result<S::Ok, S::Error> {
+        let hex = id.to_hex();
+        ser.collect_str(&hex)
+    }
+
+    /// Deserializes an object ID from a string by handing it to `ObjectId::from_hex`.
+    pub fn deserialize<'de, D: serde::Deserializer<'de>>(de: D) -> Result<ObjectId, D::Error> {
+        struct IdVisitor;
+
+        impl<'v> serde::de::Visitor<'v> for IdVisitor {
+            type Value = ObjectId;
+
+            fn expecting(&self, out: &mut std::fmt::Formatter) -> std::fmt::Result {
+                out.write_str("a 40-character hex string representing a Git object ID")
+            }
+
+            fn visit_str<E: serde::de::Error>(self, text: &str) -> Result<Self::Value, E> {
+                match ObjectId::from_hex(text.as_bytes()) {
+                    Ok(id) => Ok(id),
+                    Err(err) => Err(E::custom(err)),
+                }
+            }
+        }
+
+        de.deserialize_str(IdVisitor)
+    }
+}
+
+impl JsonSchema for HexObjectId {
+    /// Name under which this type appears in generated schemas.
+    fn schema_name() -> String {
+        "ObjectId".into()
+    }
+
+    /// Describes an object ID as a string of exactly 40 lowercase hex digits.
+    fn json_schema(gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
+        let s = String::json_schema(gen);
+        let mut o = s.into_object();
+        // JSON Schema patterns are not implicitly anchored. Without `^...$` any string that
+        // merely contains 40 hex digits would validate, so both ends are pinned here.
+        o.string().pattern = Some("^[0-9a-f]{40}$".into());
+        let md = o.metadata();
+        md.description = Some("A hex-encoded object ID as computed by Git".into());
+        schemars::schema::Schema::Object(o)
+    }
+}
+
+/// Metadata about a Git commit.
+///
+/// This is used to track the provenance of blobs found in git history.
+#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
+pub struct CommitMetadata {
+    /// The SHA-1 commit ID.
+    ///
+    /// (De)serialized as a hex string through the `HexObjectId` shim.
+    #[serde(with = "HexObjectId")]
+    pub commit_id: ObjectId,
+
+    /// The committer's name.
+    pub committer_name: String,
+
+    /// The committer's email address.
+    pub committer_email: String,
+
+    /// The commit timestamp.
+    ///
+    /// (De)serialized as text through the `TextTime` shim.
+    #[serde(with = "TextTime")]
+    pub committer_timestamp: Time,
+}
--- a/crates/kingfisher-core/src/lib.rs
+++ b/crates/kingfisher-core/src/lib.rs
@ -0,0 +1,27 @@
+//! `kingfisher-core` provides the foundational types and traits shared across
+//! the Kingfisher secret scanning library.
+//!
+//! This crate contains:
+//! - [`Blob`] - Representation of scannable content (files, buffers, git objects)
+//! - [`Location`] - Source location tracking (byte offsets and line/column)
+//! - [`Origin`] - Provenance tracking (where content came from)
+//! - Utility functions for entropy calculation, string escaping, etc.
+
+pub mod blob;
+pub mod bstring_escape;
+pub mod entropy;
+pub mod error;
+pub mod git_commit_metadata;
+pub mod location;
+pub mod origin;
+
+// Re-export commonly used types at the crate root
+pub use blob::{
+    Blob, BlobAppearance, BlobAppearanceSet, BlobData, BlobId, BlobIdMap, BlobMetadata,
+};
+pub use bstring_escape::Escaped;
+pub use entropy::calculate_shannon_entropy;
+pub use error::{Error, Result};
+pub use git_commit_metadata::CommitMetadata;
+pub use location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan};
+pub use origin::{CommitOrigin, ExtendedOrigin, FileOrigin, GitRepoOrigin, Origin, OriginSet};
--- a/crates/kingfisher-core/src/location.rs
+++ b/crates/kingfisher-core/src/location.rs
@ -0,0 +1,296 @@
+//! Source location tracking.
+//!
+//! This module provides types for tracking locations within source content:
+//! - [`OffsetSpan`] - Byte offset ranges
+//! - [`SourceSpan`] - Line/column ranges
+//! - [`Location`] - Combined byte and source location
+//! - [`LocationMapping`] - Efficient offset-to-line/column conversion
+
+use core::ops::Range;
+use std::cell::RefCell;
+
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+/// A point defined by a byte offset.
+///
+/// A newtype around `usize`; equality and ordering are those of the wrapped offset.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, Copy, Clone)]
+pub struct OffsetPoint(pub usize);
+
+impl OffsetPoint {
+    /// Wraps a raw byte index.
+    #[inline]
+    pub fn new(idx: usize) -> Self {
+        Self(idx)
+    }
+}
+
+/// A span defined by two byte offsets (half-open interval `[start, end)`).
+///
+/// Spans are normally non-empty, but nothing enforces `start < end`: the fields are public
+/// and the constructors do not validate. Use `is_empty` to detect empty or inverted spans.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
+pub struct OffsetSpan {
+    /// Offset of the first byte in the span (inclusive).
+    pub start: usize,
+    /// Offset one past the last byte in the span (exclusive).
+    pub end: usize,
+}
+
+impl std::fmt::Display for OffsetSpan {
+    /// Formats the span as `start-end`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self { start, end } = *self;
+        write!(f, "{}-{}", start, end)
+    }
+}
+
+impl OffsetSpan {
+    /// Builds a span covering `[start, end)` from two offset points.
+    #[inline]
+    pub fn from_offsets(start: OffsetPoint, end: OffsetPoint) -> Self {
+        let (OffsetPoint(start), OffsetPoint(end)) = (start, end);
+        Self { start, end }
+    }
+
+    /// Builds a span from a standard half-open `Range<usize>`.
+    #[inline]
+    pub fn from_range(range: Range<usize>) -> Self {
+        let Range { start, end } = range;
+        Self { start, end }
+    }
+
+    /// Number of bytes covered; an inverted span (`end < start`) reports zero.
+    #[inline]
+    #[must_use]
+    pub fn len(&self) -> usize {
+        if self.end > self.start {
+            self.end - self.start
+        } else {
+            0
+        }
+    }
+
+    /// Whether the span covers no bytes, i.e. it is empty or inverted.
+    #[inline]
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.end <= self.start
+    }
+
+    /// Whether both ends of `other` fall within the bounds of `self`.
+    #[inline]
+    #[must_use]
+    pub fn fully_contains(&self, other: &Self) -> bool {
+        let starts_inside = other.start >= self.start;
+        let ends_inside = other.end <= self.end;
+        starts_inside && ends_inside
+    }
+}
+
+/// A point in source text (1-indexed line, 0-indexed column).
+#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone, Serialize, Deserialize, JsonSchema)]
+pub struct SourcePoint {
+    /// Line number; the first line is 1.
+    pub line: usize,
+    /// Column within the line; the first column is 0.
+    pub column: usize,
+}
+
+impl std::fmt::Display for SourcePoint {
+    /// Formats the point as `line:column`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self { line, column } = *self;
+        write!(f, "{}:{}", line, column)
+    }
+}
+
+/// A span between two source points (closed interval).
+///
+/// Unlike `OffsetSpan`, `end` is inclusive: it is the position of the last element covered.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize, JsonSchema)]
+pub struct SourceSpan {
+    /// Position of the first element in the span.
+    pub start: SourcePoint,
+    /// Position of the last element in the span (inclusive).
+    pub end: SourcePoint,
+}
+
+impl std::fmt::Display for SourceSpan {
+    /// Formats the span as `start-end`, each end rendered via `SourcePoint`'s `Display`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self { start, end } = self;
+        write!(f, "{}-{}", start, end)
+    }
+}
+
+/// Efficiently maps byte offsets to line/column positions.
+///
+/// This lazily scans for newlines as needed, avoiding upfront cost for
+/// files where only a few locations are needed.
+pub struct LocationMapping<'a> {
+    /// The input being mapped; borrowed, never copied.
+    bytes: &'a [u8],
+    /// Offsets of the `\n` bytes found so far, in ascending order. Held in a `RefCell` so the
+    /// lookup methods can extend it while taking `&self`.
+    newline_offsets: RefCell<Vec<usize>>,
+}
+
+impl<'a> LocationMapping<'a> {
+    /// Creates a new mapping for the given input bytes.
+    ///
+    /// No scanning happens here; newlines are located on demand by the lookup methods.
+    pub fn new(input: &'a [u8]) -> Self {
+        LocationMapping { bytes: input, newline_offsets: RefCell::new(Vec::new()) }
+    }
+
+    /// Extends the newline cache so it holds every newline located strictly before `offset`
+    /// (clamped to the input length).
+    ///
+    /// Scanning resumes just past the last newline already recorded, so no offset is ever
+    /// pushed twice and the cache stays sorted.
+    fn ensure_offsets_up_to(&self, offset: usize) {
+        let mut offsets = self.newline_offsets.borrow_mut();
+        let start = offsets.last().map_or(0, |&last| last + 1);
+        if offset < start {
+            // Everything before `offset` has already been scanned.
+            return;
+        }
+        let end = offset.min(self.bytes.len());
+        for nl in memchr::memchr_iter(b'\n', &self.bytes[start..end]) {
+            offsets.push(start + nl);
+        }
+    }
+
+    /// Computes the source point for `offset` from the sorted newline offsets.
+    ///
+    /// `offsets` must contain every newline located strictly before `offset`; it may also
+    /// contain later ones. The answer depends only on the newlines before `offset`, so it is
+    /// the same no matter how far the cache has already been filled. A newline byte itself is
+    /// attributed to the line it terminates.
+    fn source_point_from_offsets(offsets: &[usize], offset: usize) -> SourcePoint {
+        // `offsets` is strictly increasing, so both outcomes of the search give the number of
+        // newlines located strictly before `offset`.
+        let preceding = match offsets.binary_search(&offset) {
+            Ok(idx) | Err(idx) => idx,
+        };
+        let column = match preceding.checked_sub(1) {
+            // The line starts right after the previous newline, which lies before `offset`.
+            Some(prev) => offset - (offsets[prev] + 1),
+            // First line: the column is the offset itself, even when later newlines are cached.
+            None => offset,
+        };
+        SourcePoint { line: preceding + 1, column }
+    }
+
+    /// Maps a byte offset to a `SourcePoint`.
+    pub fn get_source_point(&self, offset: usize) -> SourcePoint {
+        self.ensure_offsets_up_to(offset);
+        let offsets = self.newline_offsets.borrow();
+        Self::source_point_from_offsets(&offsets, offset)
+    }
+
+    /// Maps an `OffsetSpan` to a `SourceSpan`.
+    ///
+    /// The resulting span is closed: its end is the position of the last byte of `span`
+    /// (`span.end - 1`, saturating at zero).
+    pub fn get_source_span(&self, span: &OffsetSpan) -> SourceSpan {
+        let last = span.end.saturating_sub(1);
+        // For an empty or inverted span `start` lies beyond `last`, so scan far enough for both.
+        self.ensure_offsets_up_to(span.start.max(last));
+        let offsets = self.newline_offsets.borrow();
+        let start = Self::source_point_from_offsets(&offsets, span.start);
+        let end = Self::source_point_from_offsets(&offsets, last);
+        SourceSpan { start, end }
+    }
+}
+
+/// Compact representation of a source span to reduce per-match footprint.
+///
+/// Holds the same four coordinates as a `SourceSpan`, each narrowed to `u32`.
+#[derive(Debug, Clone, Copy, Deserialize, Serialize, JsonSchema)]
+pub struct CompactSourceSpan {
+    /// Line of the span's start point.
+    pub start_line: u32,
+    /// Column of the span's start point.
+    pub start_column: u32,
+    /// Line of the span's end point.
+    pub end_line: u32,
+    /// Column of the span's end point.
+    pub end_column: u32,
+}
+
+impl CompactSourceSpan {
+    /// Returns a span whose four coordinates are all zero.
+    #[inline]
+    pub fn zero() -> Self {
+        let origin = 0;
+        Self { start_line: origin, start_column: origin, end_line: origin, end_column: origin }
+    }
+
+    /// Narrows a [`SourceSpan`] to 32-bit coordinates.
+    ///
+    /// Any coordinate too large for `u32` is stored as `0`.
+    #[inline]
+    pub fn from_source_span(span: &SourceSpan) -> Self {
+        let narrow = |value: usize| u32::try_from(value).unwrap_or(0);
+        let SourceSpan { start, end } = span;
+        Self {
+            start_line: narrow(start.line),
+            start_column: narrow(start.column),
+            end_line: narrow(end.line),
+            end_column: narrow(end.column),
+        }
+    }
+
+    /// Widens the coordinates back into a [`SourceSpan`].
+    ///
+    /// Any coordinate that cannot be represented as `usize` becomes `0`.
+    #[inline]
+    pub fn to_source_span(self) -> SourceSpan {
+        let widen = |value: u32| usize::try_from(value).unwrap_or(0);
+        let start = SourcePoint { line: widen(self.start_line), column: widen(self.start_column) };
+        let end = SourcePoint { line: widen(self.end_line), column: widen(self.end_column) };
+        SourceSpan { start, end }
+    }
+}
+
+/// Combined byte offset and source location information.
+///
+/// Only `Deserialize` is derived; `Serialize` is implemented by hand for this type.
+#[derive(Debug, Clone, Deserialize, JsonSchema)]
+pub struct Location {
+    /// The byte offset span.
+    pub offset_span: OffsetSpan,
+    /// The optional source (line/column) span.
+    ///
+    /// Stored compactly but read from, and described in the schema as, a full `SourceSpan`.
+    /// `default` makes a missing field deserialize to `None`.
+    // NOTE(review): `serialize_with` appears to have no effect here because `Serialize` is
+    // not derived for this struct; confirm before relying on it.
+    #[serde(
+        default,
+        serialize_with = "serialize_compact_source_span",
+        deserialize_with = "deserialize_compact_source_span"
+    )]
+    #[schemars(with = "SourceSpan")]
+    pub source_span: Option<CompactSourceSpan>,
+}
+
+impl serde::Serialize for Location {
+    /// Writes a two-field struct (`offset_span`, `source_span`); a missing source span is
+    /// emitted as an all-zero `SourceSpan`.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+
+        let compact = self.source_span().unwrap_or_else(CompactSourceSpan::zero);
+        let expanded = compact.to_source_span();
+
+        let mut out = serializer.serialize_struct("Location", 2)?;
+        out.serialize_field("offset_span", &self.offset_span)?;
+        out.serialize_field("source_span", &expanded)?;
+        out.end()
+    }
+}
+
+impl Location {
+    /// Builds a `Location` from an offset span and an optional line/column span, storing the
+    /// latter in compact form.
+    #[inline]
+    pub fn with_source_span(offset_span: OffsetSpan, source_span: Option<SourceSpan>) -> Self {
+        let compact = source_span.map(|full| CompactSourceSpan::from_source_span(&full));
+        Self { offset_span, source_span: compact }
+    }
+
+    /// Returns a copy of the stored compact source span, if there is one.
+    #[inline]
+    pub fn source_span(&self) -> Option<CompactSourceSpan> {
+        self.source_span
+    }
+
+    /// Returns the source span in full form, or an all-zero span when none is stored.
+    #[inline]
+    pub fn resolved_source_span(&self) -> SourceSpan {
+        let compact = self.source_span.unwrap_or_else(CompactSourceSpan::zero);
+        compact.to_source_span()
+    }
+}
+
+/// Serializes an optional compact span as a full `SourceSpan`, substituting the all-zero
+/// span when the value is `None`.
+fn serialize_compact_source_span<S>(
+    span: &Option<CompactSourceSpan>,
+    serializer: S,
+) -> Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+{
+    let compact = (*span).unwrap_or_else(CompactSourceSpan::zero);
+    let expanded = compact.to_source_span();
+    SourceSpan::serialize(&expanded, serializer)
+}
+
+/// Deserializes a full `SourceSpan` and stores it in compact form; the result is always
+/// `Some` when deserialization succeeds.
+fn deserialize_compact_source_span<'de, D>(
+    deserializer: D,
+) -> Result<Option<CompactSourceSpan>, D::Error>
+where
+    D: serde::Deserializer<'de>,
+{
+    SourceSpan::deserialize(deserializer)
+        .map(|full| Some(CompactSourceSpan::from_source_span(&full)))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_offset_span() {
+        let span = OffsetSpan::from_range(10..20);
+        assert!(!span.is_empty());
+        assert_eq!(span.len(), 10);
+    }
+
+    #[test]
+    fn test_location_mapping() {
+        let mapping = LocationMapping::new(b"line1\nline2\nline3");
+
+        // (byte offset, expected line, expected column), queried in ascending offset order:
+        // first line/first character, first line/last character, second line/first character.
+        let cases = [(0, 1, 0), (4, 1, 4), (6, 2, 0)];
+        for &(offset, line, column) in &cases {
+            assert_eq!(mapping.get_source_point(offset), SourcePoint { line, column });
+        }
+    }
+}
--- a/crates/kingfisher-core/src/origin.rs
+++ b/crates/kingfisher-core/src/origin.rs
@ -0,0 +1,310 @@
+//! Provenance tracking for scanned content.
+//!
+//! This module provides types for tracking where content came from:
+//! - [`FileOrigin`] - Content from a file path
+//! - [`GitRepoOrigin`] - Content from a git repository
+//! - [`ExtendedOrigin`] - Content from other sources (Jira, Confluence, etc.)
+//! - [`OriginSet`] - A non-empty collection of origins
+
+use std::{
+    path::{Path, PathBuf},
+    sync::Arc,
+};
+
+use dashmap::DashMap;
+use once_cell::sync::Lazy;
+use rustc_hash::FxHashSet;
+use schemars::JsonSchema;
+use serde::{ser::SerializeSeq, Deserialize, Serialize};
+use smallvec::SmallVec;
+
+use crate::git_commit_metadata::CommitMetadata;
+
+// Cache for git remote URLs to avoid repeated lookups.
+//
+// Keyed by repository path and created lazily on first use. Values are `Arc<str>` so a cache
+// hit hands out a cheap clone. Nothing in this module removes entries once inserted.
+static URL_CACHE: Lazy<DashMap<PathBuf, Arc<str>>> = Lazy::new(DashMap::default);
+
+/// Reads `remote.origin.url` from the repository at `repo_path` and returns it as an
+/// HTTP(S) URL.
+///
+/// `http://` and `https://` URLs are returned as-is (lossily decoded as UTF-8). A
+/// `git@host:path` URL is rewritten to `https://host/path`. Anything else is an error, as is
+/// a repository that cannot be opened or has no such config entry.
+fn compute_url(repo_path: &Path) -> anyhow::Result<String> {
+    use bstr::ByteSlice;
+
+    let repo = gix::open(repo_path)?;
+    let snapshot = repo.config_snapshot();
+    let raw = match snapshot.string("remote.origin.url") {
+        Some(value) => value,
+        None => return Err(anyhow::anyhow!("No remote URL found")),
+    };
+
+    // Already an HTTP(S) URL: nothing to rewrite.
+    if raw.starts_with(b"http://") || raw.starts_with(b"https://") {
+        return Ok(String::from_utf8_lossy(raw.as_bytes()).into_owned());
+    }
+
+    if !raw.starts_with(b"git@") {
+        return Err(anyhow::anyhow!(
+            "Unsupported remote URL format: {}",
+            String::from_utf8_lossy(raw.as_bytes())
+        ));
+    }
+
+    // scp-style SSH URL: `git@<domain>:<path>` becomes `https://<domain>/<path>`.
+    let text = String::from_utf8_lossy(raw.as_bytes());
+    text.strip_prefix("git@")
+        .and_then(|rest| rest.split_once(':'))
+        .map(|(domain, path)| format!("https://{}/{}", domain, path))
+        .ok_or_else(|| anyhow::anyhow!("Invalid SSH URL format"))
+}
+
+/// Returns the remote URL of the git repository at `repo_path`, consulting the cache first.
+///
+/// On a miss the URL is computed, stored in the cache and returned; failures are returned
+/// without being cached.
+pub fn get_repo_url(repo_path: &Path) -> anyhow::Result<Arc<str>> {
+    // Copy the cached value out so the map guard is released before anything else runs.
+    let cached = URL_CACHE.get(repo_path).map(|entry| Arc::clone(entry.value()));
+    if let Some(url) = cached {
+        return Ok(url);
+    }
+
+    let url: Arc<str> = Arc::from(compute_url(repo_path)?);
+    URL_CACHE.insert(repo_path.to_path_buf(), Arc::clone(&url));
+    Ok(url)
+}
+
+/// The provenance of a scanned blob.
+///
+/// Serialized as an internally tagged enum: a `kind` field holds the snake_case variant
+/// name alongside the variant's own fields.
+// NOTE(review): internally tagged newtype variants need map-like content, so an `Extended`
+// value that is not a JSON object presumably fails to serialize; confirm.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
+#[serde(rename_all = "snake_case", tag = "kind")]
+#[allow(clippy::large_enum_variant)]
+pub enum Origin {
+    /// Content from a file on disk.
+    File(FileOrigin),
+    /// Content from a git repository.
+    GitRepo(GitRepoOrigin),
+    /// Content from an extended source (arbitrary JSON metadata).
+    Extended(ExtendedOrigin),
+}
+
+impl Origin {
+    /// Builds the origin of a plain file at `path`.
+    pub fn from_file(path: PathBuf) -> Self {
+        Self::File(FileOrigin::new(path))
+    }
+
+    /// Builds the origin of a blob in a git repository when no commit information is known.
+    pub fn from_git_repo(repo_path: Arc<PathBuf>) -> Self {
+        let repo = GitRepoOrigin { repo_path, first_commit: None };
+        Self::GitRepo(repo)
+    }
+
+    /// Builds the origin of a blob in a git repository, recording the commit and the path
+    /// under which it was first seen.
+    pub fn from_git_repo_with_first_commit(
+        repo_path: Arc<PathBuf>,
+        commit_metadata: Arc<CommitMetadata>,
+        blob_path: String,
+    ) -> Self {
+        let repo = GitRepoOrigin {
+            repo_path,
+            first_commit: Some(CommitOrigin { commit_metadata, blob_path }),
+        };
+        Self::GitRepo(repo)
+    }
+
+    /// Builds an origin that wraps arbitrary JSON metadata.
+    pub fn from_extended(value: serde_json::Value) -> Self {
+        Self::Extended(ExtendedOrigin(value))
+    }
+
+    /// Returns the blob's path, when one is known.
+    ///
+    /// For git origins this is the path inside the commit and is only available when commit
+    /// information was recorded.
+    pub fn blob_path(&self) -> Option<&Path> {
+        match self {
+            Self::File(file) => Some(file.path.as_path()),
+            Self::GitRepo(git) => {
+                let commit = git.first_commit.as_ref()?;
+                Some(Path::new(&commit.blob_path))
+            }
+            Self::Extended(ext) => ext.path(),
+        }
+    }
+
+    /// Returns an owned path to the content, when one is known.
+    ///
+    /// For git origins the blob path is joined onto the repository path.
+    pub fn full_path(&self) -> Option<PathBuf> {
+        match self {
+            Self::File(file) => Some(file.path.to_path_buf()),
+            Self::GitRepo(git) => {
+                let commit = git.first_commit.as_ref()?;
+                Some(git.repo_path.join(&commit.blob_path))
+            }
+            Self::Extended(ext) => ext.path().map(Path::to_path_buf),
+        }
+    }
+}
+
+impl std::fmt::Display for Origin {
+    /// Writes a short human-readable description of where the content came from.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Origin::File(file) => write!(f, "file {}", file.path.display()),
+            Origin::GitRepo(GitRepoOrigin { repo_path, first_commit: None }) => {
+                write!(f, "git repo {}", repo_path.display())
+            }
+            Origin::GitRepo(GitRepoOrigin { repo_path, first_commit: Some(commit) }) => write!(
+                f,
+                "git repo {}: first seen in commit {} as {}",
+                repo_path.display(),
+                commit.commit_metadata.commit_id,
+                commit.blob_path,
+            ),
+            Origin::Extended(ext) => write!(f, "extended {}", ext),
+        }
+    }
+}
+
+/// Origin information for a file on disk.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
+pub struct FileOrigin {
+    /// The file path.
+    ///
+    /// Held in an `Arc`, so cloning a `FileOrigin` does not copy the path.
+    pub path: Arc<PathBuf>,
+}
+
+impl FileOrigin {
+    /// Builds a `FileOrigin` from anything convertible into a `PathBuf`.
+    pub fn new<P: Into<PathBuf>>(p: P) -> Self {
+        let owned: PathBuf = p.into();
+        Self { path: Arc::new(owned) }
+    }
+}
+
+/// Origin information for a blob in a git repository.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
+pub struct GitRepoOrigin {
+    /// Path to the repository on disk.
+    pub repo_path: Arc<PathBuf>,
+    /// Information about the first commit where this blob was seen.
+    ///
+    /// `None` when the origin was recorded without commit details.
+    pub first_commit: Option<CommitOrigin>,
+}
+
+/// Information about where a blob was first seen in git history.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
+pub struct CommitOrigin {
+    /// Metadata about the commit.
+    ///
+    /// Held in an `Arc`, so cloning a `CommitOrigin` does not copy the metadata.
+    pub commit_metadata: Arc<CommitMetadata>,
+    /// The path of the blob within the commit.
+    pub blob_path: String,
+}
+
+/// An extended origin with arbitrary JSON metadata.
+///
+/// This is used for sources like Jira, Confluence, Slack, etc.
+///
+/// A newtype around `serde_json::Value`. The only key this module interprets is a
+/// string-valued `"path"` (see `ExtendedOrigin::path`).
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Hash)]
+pub struct ExtendedOrigin(pub serde_json::Value);
+
+impl std::fmt::Display for ExtendedOrigin {
+    /// Delegates to the wrapped JSON value, forwarding the caller's formatter unchanged.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let ExtendedOrigin(value) = self;
+        <serde_json::Value as std::fmt::Display>::fmt(value, f)
+    }
+}
+
+impl ExtendedOrigin {
+    /// Returns the value stored under the `"path"` key as a path, provided that key exists
+    /// and holds a string.
+    pub fn path(&self) -> Option<&Path> {
+        self.0.get("path").and_then(serde_json::Value::as_str).map(Path::new)
+    }
+}
+
+/// A non-empty set of [`Origin`] entries.
+///
+/// This is used when a blob has been seen in multiple locations
+/// (e.g., the same content in multiple files or commits).
+#[derive(Debug, Clone)]
+pub struct OriginSet {
+    /// The first entry. Storing it outside the collection is what makes the set non-empty.
+    origin: Origin,
+    /// Any further entries, in order; a single extra entry fits in the `SmallVec`'s inline
+    /// storage.
+    more_provenance: SmallVec<[Origin; 1]>,
+}
+
+impl serde::Serialize for OriginSet {
+    /// Serializes the set as a flat sequence of its origins, first entry first.
+    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
+        let mut seq = s.serialize_seq(Some(self.len()))?;
+        self.iter().try_for_each(|entry| seq.serialize_element(entry))?;
+        seq.end()
+    }
+}
+
+impl JsonSchema for OriginSet {
+    /// Name under which this type appears in generated schemas.
+    fn schema_name() -> String {
+        String::from("OriginSet")
+    }
+
+    /// Starts from the schema of `Vec<Origin>`, then requires at least one item and attaches
+    /// a description.
+    fn json_schema(generator: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
+        let mut object = <Vec<Origin>>::json_schema(generator).into_object();
+        object.array().min_items = Some(1);
+        object.metadata().description = Some("A non-empty set of `Origin` entries".into());
+        schemars::schema::Schema::Object(object)
+    }
+}
+
+impl OriginSet {
+    /// Builds a set holding exactly one origin.
+    #[inline]
+    pub fn single(origin: Origin) -> Self {
+        let more_provenance = SmallVec::new();
+        Self { origin, more_provenance }
+    }
+
+    /// Builds a set from a first origin plus any number of further ones.
+    ///
+    /// A git origin without commit details is dropped when another origin for the same
+    /// repository path does carry them; everything else is kept in its original order.
+    pub fn new(origin: Origin, more_origin: Vec<Origin>) -> Self {
+        // Repository paths for which at least one origin has commit details.
+        let detailed_repos: FxHashSet<Arc<PathBuf>> = std::iter::once(&origin)
+            .chain(more_origin.iter())
+            .filter_map(|candidate| match candidate {
+                Origin::GitRepo(git) if git.first_commit.is_some() => {
+                    Some(Arc::clone(&git.repo_path))
+                }
+                _ => None,
+            })
+            .collect();
+
+        let keep = |candidate: &Origin| match candidate {
+            Origin::GitRepo(git) => {
+                git.first_commit.is_some() || !detailed_repos.contains(&git.repo_path)
+            }
+            Origin::File(_) | Origin::Extended(_) => true,
+        };
+
+        let mut kept = std::iter::once(origin).chain(more_origin).filter(keep);
+        // A bare git origin is only dropped when a detailed one for the same repository is
+        // present, and that one is always kept, so at least one entry survives.
+        let origin = kept.next().unwrap();
+        let more_provenance = kept.collect();
+        Self { origin, more_provenance }
+    }
+
+    /// Builds a set from an iterator of origins, applying the same filtering as `new`.
+    ///
+    /// Returns `None` if the iterator yields nothing.
+    #[inline]
+    pub fn try_from_iter<I>(it: I) -> Option<Self>
+    where
+        I: IntoIterator<Item = Origin>,
+    {
+        let mut rest = it.into_iter();
+        let head = rest.next()?;
+        Some(Self::new(head, rest.collect()))
+    }
+
+    /// Returns the first origin in the set.
+    #[inline]
+    pub fn first(&self) -> &Origin {
+        &self.origin
+    }
+
+    /// Returns how many origins the set holds; always at least one.
+    #[allow(clippy::len_without_is_empty)]
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.more_provenance.len() + 1
+    }
+
+    /// Iterates over every origin by reference, first entry first.
+    #[inline]
+    pub fn iter(&self) -> impl Iterator<Item = &Origin> {
+        std::iter::once(&self.origin).chain(self.more_provenance.iter())
+    }
+}
+
+impl IntoIterator for OriginSet {
+    type IntoIter =
+        std::iter::Chain<std::iter::Once<Origin>, <Vec<Origin> as IntoIterator>::IntoIter>;
+    type Item = Origin;
+
+    /// Consumes the set, yielding the first origin followed by the remaining ones.
+    #[inline]
+    fn into_iter(self) -> Self::IntoIter {
+        let Self { origin, more_provenance } = self;
+        std::iter::once(origin).chain(more_provenance.into_vec())
+    }
+}
+
+impl From<Origin> for OriginSet {
+    /// Wraps a single origin in a one-element set.
+    fn from(origin: Origin) -> Self {
+        OriginSet::single(origin)
+    }
+}
--- a/crates/kingfisher-rules/Cargo.toml
+++ b/crates/kingfisher-rules/Cargo.toml
@ -0,0 +1,64 @@
+[package]
+name = "kingfisher-rules"
+version = "0.1.0"
+description = "Rule definitions and database for Kingfisher secret scanner"
+edition.workspace = true
+rust-version.workspace = true
+license.workspace = true
+authors.workspace = true
+homepage.workspace = true
+repository.workspace = true
+publish.workspace = true
+
+[dependencies]
+# Internal dependencies
+kingfisher-core = { path = "../kingfisher-core" }
+
+# Serialization
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+serde_yaml = "0.9"
+schemars = "0.8"
+
+# Error handling
+anyhow = "1.0"
+thiserror = "1.0"
+
+# Regex
+regex = "1.12"
+lazy_static = "1.5"
+
+# Hashing
+xxhash-rust = { version = "0.8", features = ["xxh3"] }
+
+# Liquid templating (for checksum validation)
+liquid = "0.26"
+liquid-core = "0.26"
+
+# Crypto for liquid filters
+base64 = "0.22"
+crc32fast = "1.5"
+hmac = "0.12"
+sha1 = "0.10"
+sha2 = "0.10"
+percent-encoding = "2.3"
+time = "0.3"
+uuid = { version = "1.19", features = ["v4"] }
+rand = "0.9"
+
+# Vectorscan for pattern matching
+vectorscan-rs = "0.0.5"
+
+# Logging
+tracing = "0.1"
+
+# File walking
+walkdir = "2.5"
+ignore = "0.4"
+
+# Embedded rule files
+include_dir = "0.7"
+
+[dev-dependencies]
+pretty_assertions = "1.4"
+proptest = "1.6"
--- a/crates/kingfisher-rules/data/rules/adafruitio.yml
+++ b/crates/kingfisher-rules/data/rules/adafruitio.yml
--- a/crates/kingfisher-rules/data/rules/adobe.yml
+++ b/crates/kingfisher-rules/data/rules/adobe.yml
--- a/crates/kingfisher-rules/data/rules/age.yml
+++ b/crates/kingfisher-rules/data/rules/age.yml
@ -80,4 +80,4 @@ rules:
      - https://htmlpreview.github.io/?https://github.com/FiloSottile/age/blob/main/doc/age.1.html
      - https://github.com/C2SP/C2SP/blob/037e546d164a89fd7577df2c18df80bb54bd246e/age.md#the-mlkem768-x25519-ie-x-wing-hybrid-post-quantum-recipient-type
    categories:
-      - secret
+      - secret
--- a/crates/kingfisher-rules/data/rules/ai21.yml
+++ b/crates/kingfisher-rules/data/rules/ai21.yml
--- a/crates/kingfisher-rules/data/rules/airbrake.yml
+++ b/crates/kingfisher-rules/data/rules/airbrake.yml
--- a/crates/kingfisher-rules/data/rules/airtable.yml
+++ b/crates/kingfisher-rules/data/rules/airtable.yml
--- a/crates/kingfisher-rules/data/rules/aiven.yml
+++ b/crates/kingfisher-rules/data/rules/aiven.yml
--- a/crates/kingfisher-rules/data/rules/alchemy.yml
+++ b/crates/kingfisher-rules/data/rules/alchemy.yml
--- a/crates/kingfisher-rules/data/rules/algolia.yml
+++ b/crates/kingfisher-rules/data/rules/algolia.yml
--- a/crates/kingfisher-rules/data/rules/alibaba.yml
+++ b/crates/kingfisher-rules/data/rules/alibaba.yml
@ -2,9 +2,9 @@ rules:
  - name: Alibaba Access Key ID
    id: kingfisher.alibabacloud.1
    pattern: |
-      (?xi)
+      (?x)
      (
-        LTAI[a-z0-9]{17,21}
+        LTAI([a-zA-Z0-9]{12,20})
      )
      \b
    pattern_requirements:
--- a/crates/kingfisher-rules/data/rules/anthropic.yml
+++ b/crates/kingfisher-rules/data/rules/anthropic.yml
--- a/crates/kingfisher-rules/data/rules/anypoint.yml
+++ b/crates/kingfisher-rules/data/rules/anypoint.yml
--- a/crates/kingfisher-rules/data/rules/apify.yml
+++ b/crates/kingfisher-rules/data/rules/apify.yml
--- a/crates/kingfisher-rules/data/rules/apollo.yml
+++ b/crates/kingfisher-rules/data/rules/apollo.yml
--- a/crates/kingfisher-rules/data/rules/artifactory.yml
+++ b/crates/kingfisher-rules/data/rules/artifactory.yml
--- a/crates/kingfisher-rules/data/rules/asana.yml
+++ b/crates/kingfisher-rules/data/rules/asana.yml
--- a/crates/kingfisher-rules/data/rules/assemblyai.yml
+++ b/crates/kingfisher-rules/data/rules/assemblyai.yml
--- a/crates/kingfisher-rules/data/rules/atlassian.yml
+++ b/crates/kingfisher-rules/data/rules/atlassian.yml
--- a/crates/kingfisher-rules/data/rules/auth0.yml
+++ b/crates/kingfisher-rules/data/rules/auth0.yml
--- a/crates/kingfisher-rules/data/rules/authress.yml
+++ b/crates/kingfisher-rules/data/rules/authress.yml
--- a/crates/kingfisher-rules/data/rules/aws.yml
+++ b/crates/kingfisher-rules/data/rules/aws.yml
@ -63,6 +63,8 @@ rules:
          },
    validation:
      type: AWS
+    revocation:
+      type: AWS
    depends_on_rule:
      - rule_id: kingfisher.aws.1
        variable: AKID
@ -107,7 +109,7 @@ rules:

  - name: AWS Bedrock API Key (Short-lived)
    id: kingfisher.aws.6
-    pattern:  |
+    pattern: |
      (?x)
      (
        bedrock-api-key-YmVkcm9jay5hbWF6b25hd3MuY29t[A-Za-z0-9+/]+={0,2}
--- a/crates/kingfisher-rules/data/rules/azure.yml
+++ b/crates/kingfisher-rules/data/rules/azure.yml
--- a/crates/kingfisher-rules/data/rules/azuredevops.yml
+++ b/crates/kingfisher-rules/data/rules/azuredevops.yml
--- a/crates/kingfisher-rules/data/rules/azureopenai.yml
+++ b/crates/kingfisher-rules/data/rules/azureopenai.yml
--- a/crates/kingfisher-rules/data/rules/azuresearchquery.yml
+++ b/crates/kingfisher-rules/data/rules/azuresearchquery.yml
--- a/crates/kingfisher-rules/data/rules/azurestorage.yml
+++ b/crates/kingfisher-rules/data/rules/azurestorage.yml
--- a/crates/kingfisher-rules/data/rules/baremetrics.yml
+++ b/crates/kingfisher-rules/data/rules/baremetrics.yml
--- a/crates/kingfisher-rules/data/rules/baseten.yml
+++ b/crates/kingfisher-rules/data/rules/baseten.yml
--- a/crates/kingfisher-rules/data/rules/beamer.yml
+++ b/crates/kingfisher-rules/data/rules/beamer.yml
--- a/crates/kingfisher-rules/data/rules/bitbucket.yml
+++ b/crates/kingfisher-rules/data/rules/bitbucket.yml
--- a/crates/kingfisher-rules/data/rules/bitly.yml
+++ b/crates/kingfisher-rules/data/rules/bitly.yml
--- a/crates/kingfisher-rules/data/rules/blynk.yml
+++ b/crates/kingfisher-rules/data/rules/blynk.yml
--- a/crates/kingfisher-rules/data/rules/buildkite.yml
+++ b/crates/kingfisher-rules/data/rules/buildkite.yml
@ -30,4 +30,16 @@ rules:
              status: [200]
            - type: WordMatch
              words: ['"uuid"', '"user"']
+    revocation:
+      type: Http
+      content:
+        request:
+          method: DELETE
+          url: https://api.buildkite.com/v2/access-token
+          headers:
+            Authorization: "Bearer {{ TOKEN }}"
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status: [204]
  
--- a/crates/kingfisher-rules/data/rules/cerebras.yml
+++ b/crates/kingfisher-rules/data/rules/cerebras.yml
--- a/crates/kingfisher-rules/data/rules/circleci.yml
+++ b/crates/kingfisher-rules/data/rules/circleci.yml
--- a/crates/kingfisher-rules/data/rules/ciscomeraki.yml
+++ b/crates/kingfisher-rules/data/rules/ciscomeraki.yml
--- a/crates/kingfisher-rules/data/rules/clarifai.yml
+++ b/crates/kingfisher-rules/data/rules/clarifai.yml
--- a/crates/kingfisher-rules/data/rules/clay.yml
+++ b/crates/kingfisher-rules/data/rules/clay.yml
--- a/crates/kingfisher-rules/data/rules/clearbit.yml
+++ b/crates/kingfisher-rules/data/rules/clearbit.yml
--- a/crates/kingfisher-rules/data/rules/clickhouse.yml
+++ b/crates/kingfisher-rules/data/rules/clickhouse.yml
--- a/crates/kingfisher-rules/data/rules/clojars.yml
+++ b/crates/kingfisher-rules/data/rules/clojars.yml
--- a/crates/kingfisher-rules/data/rules/cloudflare.yml
+++ b/crates/kingfisher-rules/data/rules/cloudflare.yml
--- a/crates/kingfisher-rules/data/rules/cloudsight.yml
+++ b/crates/kingfisher-rules/data/rules/cloudsight.yml
--- a/crates/kingfisher-rules/data/rules/codacy.yml
+++ b/crates/kingfisher-rules/data/rules/codacy.yml
--- a/crates/kingfisher-rules/data/rules/codeclimate.yml
+++ b/crates/kingfisher-rules/data/rules/codeclimate.yml
--- a/crates/kingfisher-rules/data/rules/codecov.yml
+++ b/crates/kingfisher-rules/data/rules/codecov.yml
--- a/crates/kingfisher-rules/data/rules/coderabbit.yml
+++ b/crates/kingfisher-rules/data/rules/coderabbit.yml
--- a/crates/kingfisher-rules/data/rules/cohere.yml
+++ b/crates/kingfisher-rules/data/rules/cohere.yml
--- a/crates/kingfisher-rules/data/rules/coinbase.yml
+++ b/crates/kingfisher-rules/data/rules/coinbase.yml
--- a/crates/kingfisher-rules/data/rules/confluent.yml
+++ b/crates/kingfisher-rules/data/rules/confluent.yml
--- a/crates/kingfisher-rules/data/rules/contentful.yml
+++ b/crates/kingfisher-rules/data/rules/contentful.yml
--- a/crates/kingfisher-rules/data/rules/coveralls.yml
+++ b/crates/kingfisher-rules/data/rules/coveralls.yml
--- a/crates/kingfisher-rules/data/rules/coze.yml
+++ b/crates/kingfisher-rules/data/rules/coze.yml
--- a/crates/kingfisher-rules/data/rules/crates.io.yml
+++ b/crates/kingfisher-rules/data/rules/crates.io.yml
--- a/crates/kingfisher-rules/data/rules/credentials.yml
+++ b/crates/kingfisher-rules/data/rules/credentials.yml
--- a/crates/kingfisher-rules/data/rules/curl.yml
+++ b/crates/kingfisher-rules/data/rules/curl.yml
--- a/crates/kingfisher-rules/data/rules/cursor.yml
+++ b/crates/kingfisher-rules/data/rules/cursor.yml
--- a/crates/kingfisher-rules/data/rules/customerio.yml
+++ b/crates/kingfisher-rules/data/rules/customerio.yml
--- a/crates/kingfisher-rules/data/rules/databricks.yml
+++ b/crates/kingfisher-rules/data/rules/databricks.yml
--- a/crates/kingfisher-rules/data/rules/datadog.yml
+++ b/crates/kingfisher-rules/data/rules/datadog.yml
--- a/crates/kingfisher-rules/data/rules/datagov.yml
+++ b/crates/kingfisher-rules/data/rules/datagov.yml
--- a/crates/kingfisher-rules/data/rules/deepgram.yml
+++ b/crates/kingfisher-rules/data/rules/deepgram.yml
--- a/crates/kingfisher-rules/data/rules/deepseek.yml
+++ b/crates/kingfisher-rules/data/rules/deepseek.yml
--- a/crates/kingfisher-rules/data/rules/definednetworking.yml
+++ b/crates/kingfisher-rules/data/rules/definednetworking.yml
--- a/crates/kingfisher-rules/data/rules/dependency_track.yml
+++ b/crates/kingfisher-rules/data/rules/dependency_track.yml
--- a/crates/kingfisher-rules/data/rules/diffbot.yml
+++ b/crates/kingfisher-rules/data/rules/diffbot.yml
--- a/crates/kingfisher-rules/data/rules/digitalocean.yml
+++ b/crates/kingfisher-rules/data/rules/digitalocean.yml
--- a/crates/kingfisher-rules/data/rules/discord.yml
+++ b/crates/kingfisher-rules/data/rules/discord.yml
--- a/crates/kingfisher-rules/data/rules/disqus.yml
+++ b/crates/kingfisher-rules/data/rules/disqus.yml
--- a/crates/kingfisher-rules/data/rules/django.yml
+++ b/crates/kingfisher-rules/data/rules/django.yml
--- a/crates/kingfisher-rules/data/rules/docker.yml
+++ b/crates/kingfisher-rules/data/rules/docker.yml
--- a/crates/kingfisher-rules/data/rules/dockerhub.yml
+++ b/crates/kingfisher-rules/data/rules/dockerhub.yml
@ -2,12 +2,12 @@ rules:
  - name: Docker Hub Personal Access Token
    id: kingfisher.dockerhub.1
    pattern: |
-      (?x)
+      (?xi)
      \b
      (
-        dckr_pat_[A-Za-z0-9_-]{27}
+        dckr_pat_[A-Z0-9_-]{27}
      )
-      (?: $ | [^A-Za-z0-9_-] )
+      (?: $ | [^A-Z0-9_-] )
    pattern_requirements:
      min_digits: 2
    min_entropy: 3.3
@ -17,30 +17,62 @@ rules:
      - docker login -u gemesa -p dckr_pat_tkzBYxjNNC3R_Yg6jd_O-G8FbrJ
      - docker login -u gemesa -p dckr_pat_1q8yKET1VDJTpfCwseUDzT8vFh-
    references:
-      - https://docs.docker.com/reference/api/hub/latest/#tag/access-tokens/paths/~1v2~1access-tokens~1%7Buuid%7D/get
+      - https://docs.docker.com/reference/api/hub/latest/#tag/authentication-api/operation/AuthCreateAccessToken
+    depends_on_rule:
+      - rule_id: kingfisher.dockerhub.2
+        variable: DOCKER_USERNAME
    validation:
      type: Http
      content:
        request:
+          method: POST
+          url: https://hub.docker.com/v2/auth/token
          headers:
-            Authorization: Bearer {{ TOKEN }}
+            Content-Type: application/json
            Accept: application/json
-          method: GET
+          body: '{"identifier":"{{ DOCKER_USERNAME | json_escape }}","secret":"{{ TOKEN | json_escape }}"}'
          response_matcher:
            - report_response: true
-            - status:
-                - 200
-              type: StatusMatch
-          url: https://hub.docker.com/v2/access-tokens?page_size=1
-  - name: Docker Hub Organization Access Token
+            - type: StatusMatch
+              status: [200]
+            - type: WordMatch
+              words:
+                - '"access_token"'
+
+  - name: Docker Hub Username
    id: kingfisher.dockerhub.2
    pattern: |
-      (?x)
+      (?xi)
+      \b
+      docker
+      (?:.|[\n\r]){0,32}?
+      (?:
+        -u\s+
+        | --username[=\s]+
+        | user(?:name)?[=:\s]+["']?
+      )
+      (
+        [a-z0-9][a-z0-9_-]{2,29}
+      )
+      \b
+    min_entropy: 1.0
+    confidence: medium
+    visible: false
+    examples:
+      - docker login -u gemesa -p dckr_pat_xxx
+      - docker login --username=myuser
+      - DOCKER_USERNAME=myuser
+      - 'docker_user: "myuser"'
+
+  - name: Docker Hub Organization Access Token
+    id: kingfisher.dockerhub.3
+    pattern: |
+      (?xi)
      \b
      (
-        dckr_oat_[A-Za-z0-9_-]{32}
+        dckr_oat_[A-Z0-9_-]{32}
      )
-      (?: $ | [^A-Za-z0-9_-] )
+      (?: $ | [^A-Z0-9_-] )
    pattern_requirements:
      min_digits: 2
    min_entropy: 3.3
--- a/crates/kingfisher-rules/data/rules/doppler.yml
+++ b/crates/kingfisher-rules/data/rules/doppler.yml
--- a/crates/kingfisher-rules/data/rules/droneci.yml
+++ b/crates/kingfisher-rules/data/rules/droneci.yml
--- a/crates/kingfisher-rules/data/rules/dropbox.yml
+++ b/crates/kingfisher-rules/data/rules/dropbox.yml
--- a/crates/kingfisher-rules/data/rules/duffel.yml
+++ b/crates/kingfisher-rules/data/rules/duffel.yml
--- a/crates/kingfisher-rules/data/rules/dynatrace.yml
+++ b/crates/kingfisher-rules/data/rules/dynatrace.yml
--- a/crates/kingfisher-rules/data/rules/easypost.yml
+++ b/crates/kingfisher-rules/data/rules/easypost.yml
--- a/crates/kingfisher-rules/data/rules/elevenlabs.yml
+++ b/crates/kingfisher-rules/data/rules/elevenlabs.yml
--- a/crates/kingfisher-rules/data/rules/endorlabs.yml
+++ b/crates/kingfisher-rules/data/rules/endorlabs.yml
--- a/crates/kingfisher-rules/data/rules/eraserio.yml
+++ b/crates/kingfisher-rules/data/rules/eraserio.yml
--- a/crates/kingfisher-rules/data/rules/eventbrite.yml
+++ b/crates/kingfisher-rules/data/rules/eventbrite.yml
--- a/crates/kingfisher-rules/data/rules/exaai.yml
+++ b/crates/kingfisher-rules/data/rules/exaai.yml
--- a/crates/kingfisher-rules/data/rules/facebook.yml
+++ b/crates/kingfisher-rules/data/rules/facebook.yml
--- a/Show more
+++ b/Show more