Merge pull request #105 from mongodb/development

v1.48.0
2025-09-05 11:30:10 -07:00 · 2025-09-05 11:30:10 -07:00 · f43a26030d
commit f43a26030d
parent d541a06c9c 82caff531f
22 changed files with 440 additions and 1332 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,15 @@

 All notable changes to this project will be documented in this file.

+## [1.48.0]
+- Improved error message when self-update cannot find the current binary
+- Optimized memory usage via string interning and extensive data sharing
+- Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance
+- Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets
+- Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB
+- Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files
+- Increased max-file-size default to 64 MB (up from 25 MB)
+
 ## [1.47.0]
 - MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them
 - Improved rules: github oauth2, diffbot, mailchimp, aws
--- a/Cargo.toml
+++ b/Cargo.toml
@ -10,7 +10,7 @@ publish = false

 [package]
 name = "kingfisher"
-version = "1.47.0"
+version = "1.48.0"
 description = "MongoDB's blazingly fast secret scanning and validation tool"
 edition.workspace = true
 rust-version.workspace = true
@ -65,7 +65,7 @@ smallvec = { version = "1", features = [
 tracing = "0.1.41"
 indicatif = { version = "0.17", features = ["improved_unicode"] }
 rayon = "1.10"
-sha1 = "0.10.6"
+sha1 = { version = "0.10.6", features = ["asm"] }
 hex = "0.4.3"
 vectorscan-rs = "0.0.5"
 regex = "1.11.1"
@ -151,6 +151,7 @@ tar = "0.4.44"
 xz2 = "0.1.7"
 asar = "0.3.0"
 blake3 = "1.8.2"
+memchr = "2.7"
 memmap2 = "0.9.7"
 futures = "0.3.31"
 dashmap = "6.1.0"
--- a/README.md
+++ b/README.md
@ -36,6 +36,64 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
  <img src="docs/runtime-comparison.png" alt="Kingfisher Runtime Comparison" style="vertical-align: center;" />
 </p>

+- [Kingfisher](#kingfisher)
+  - [Key Features](#key-features)
+- [Benchmark Results](#benchmark-results)
+- [Getting Started](#getting-started)
+  - [Installation](#installation)
+    - [Run Kingfisher in Docker](#run-kingfisher-in-docker)
+- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance)
+  - [Write Custom Rules!](#write-custom-rules)
+- [Usage](#usage)
+  - [Basic Examples](#basic-examples)
+    - [Scan with secret validation](#scan-with-secret-validation)
+    - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories)
+    - [Scan a Git repository without validation](#scan-a-git-repository-without-validation)
+    - [Display only secrets confirmed active by third‑party APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis)
+    - [Output JSON and capture to a file](#output-json-and-capture-to-a-file)
+    - [Output SARIF directly to disk](#output-sarif-directly-to-disk)
+    - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--)
+    - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size)
+    - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag)
+    - [Display rule performance statistics](#display-rule-performance-statistics)
+    - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files)
+    - [Exclude specific paths](#exclude-specific-paths)
+  - [Scan an S3 bucket](#scan-an-s3-bucket)
+  - [Scanning Docker Images](#scanning-docker-images)
+  - [Scanning GitHub](#scanning-github)
+    - [Scan GitHub organisation (requires `KF_GITHUB_TOKEN`)](#scan-github-organisation-requires-kf_github_token)
+    - [Scan remote GitHub repository](#scan-remote-github-repository)
+  - [Scanning GitLab](#scanning-gitlab)
+    - [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token)
+    - [Scan GitLab user](#scan-gitlab-user)
+    - [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url)
+    - [List GitLab repositories](#list-gitlab-repositories)
+  - [Scanning Jira](#scanning-jira)
+    - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query)
+    - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues)
+  - [Scanning Confluence](#scanning-confluence)
+    - [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query)
+  - [Scanning Slack](#scanning-slack)
+    - [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query)
+  - [Environment Variables for Tokens](#environment-variables-for-tokens)
+  - [Exit Codes](#exit-codes)
+  - [Update Checks](#update-checks)
+- [Advanced Options](#advanced-options)
+  - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets)
+  - [List Builtin Rules](#list-builtin-rules)
+  - [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run)
+  - [To add your rules alongside the built‑ins:](#to-add-your-rules-alongside-the-builtins)
+  - [Other Examples](#other-examples)
+  - [Notable Scan Options](#notable-scan-options)
+  - [Understanding `--confidence`](#understanding---confidence)
+    - [Ignore known false positives](#ignore-known-false-positives)
+  - [Finding Fingerprint](#finding-fingerprint)
+  - [Rule Performance Profiling](#rule-performance-profiling)
+  - [CLI Options](#cli-options)
+  - [Origins and Divergence](#origins-and-divergence)
+- [Roadmap](#roadmap)
+- [License](#license)
+
 # Getting Started
 ## Installation

@ -225,9 +283,18 @@ cat /path/to/file.py | kingfisher scan -

 ```

+### Limit maximum file size scanned (`--max-file-size`)
+
+By default, Kingfisher skips files larger than **64 MB**. You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**.
+
+```bash
+# Scan files up to 250 MB in size
+kingfisher scan /some/file --max-file-size 250
+```
+
 ### Scan using a rule _family_ with one flag

-_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws._`)\*
+_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_

 ```bash
 # Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2)
@ -617,6 +684,14 @@ kingfisher github repos list --organization my-org
 - `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable)
 - `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable)

+## Understanding `--confidence`
+
+The `--confidence` flag sets a minimum confidence threshold, not an exact match.
+
+- If you pass `--confidence medium`, findings with **medium and higher** confidence (medium + high) will be included.
+- If you pass `--confidence low`, you’ll see **all levels** (low, medium, high).
+
+
 ### Ignore known false positives

 Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. 
--- a/data/rules/jwt.yml
+++ b/data/rules/jwt.yml
@ -5,7 +5,7 @@ rules:
      (?x)
      \b
      (
-        ey[A-Za-z0-9_-]{12,}  (?# header )
+        (?:ey|ewogIC)[A-Za-z0-9_-]{12,}  (?# header )
        \.
        ey[A-Za-z0-9_-]{12,}  (?# payload )
        \.
--- a/src/blob.rs
+++ b/src/blob.rs
@ -10,16 +10,20 @@ use anyhow::Result;
 use bstr::{BString, ByteSlice};
 use gix::ObjectId;
 use hex;
+use once_cell::sync::OnceCell;
 use parking_lot::Mutex;
 use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};
 use sha1::{Digest, Sha1};
 use smallvec::SmallVec;
+use std::sync::atomic::{AtomicU64, Ordering};

 use crate::git_commit_metadata::CommitMetadata;
 // const LARGE_FILE_THRESHOLD: u64 = 512 * 1024; // 512 KB
 const LARGE_FILE_THRESHOLD: u64 = 0; // always mmap

+static NEXT_ID: AtomicU64 = AtomicU64::new(1);
+
 /// The data of a blob, either owned (small files) or memory mapped (large files).
 pub enum BlobData<'a> {
    /// Small blobs – remains as-is.
@ -75,47 +79,68 @@ pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
 /// A Git blob, storing its SHA-1 id and its contents.

 pub struct Blob<'a> {
-    pub id: BlobId,
-    pub data: BlobData<'a>,
+    id: OnceCell<BlobId>,
+    data: BlobData<'a>,
+    temp_id: u64,
 }

 impl Blob<'_> {
    #[inline]
-
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let mut file = File::open(&path)?;
        let file_size = file.metadata()?.len();
+        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);

        if file_size > LARGE_FILE_THRESHOLD {
            // Large files: one mmap, zero extra copies.
            let mmap = unsafe { memmap2::Mmap::map(&file)? };
-            let id = BlobId::new(mmap.as_ref());
-            Ok(Blob { id, data: BlobData::Mapped(mmap) })
+            Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id })
        } else {
            // Small files: reuse the same handle and pre-allocate exact capacity
            let mut bytes = Vec::with_capacity(file_size as usize);
            file.read_to_end(&mut bytes)?;
-            let id = BlobId::new(&bytes);
-            Ok(Blob { id, data: BlobData::Owned(bytes) })
+            Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id })
        }
    }
+
    /// Returns the blob's bytes as a slice.
    #[inline]
    pub fn bytes(&self) -> &[u8] {
        self.data.as_ref()
    }

+    /// Return the blob's SHA-1 `BlobId` by value, computing and caching it on first use.
+    #[inline]
+    pub fn id(&self) -> BlobId {
+        *self.id_ref()
+    }
+
+    /// Borrow the blob's SHA-1 `BlobId`, hashing the contents on first access.
+    #[inline]
+    pub fn id_ref(&self) -> &BlobId {
+        let compute = || BlobId::new(self.bytes());
+        self.id.get_or_init(compute)
+    }
+
+    /// Return the temporary identifier assigned on blob creation.
+    ///
+    /// The value is taken from the process-wide `NEXT_ID` counter when the blob is constructed,
+    /// so it is available without hashing the blob's contents.
+    #[inline]
+    pub fn temp_id(&self) -> u64 {
+        self.temp_id
+    }
+
    /// Create a new `Blob` from a vector of bytes.
    #[inline]
    pub fn from_bytes(bytes: Vec<u8>) -> Self {
-        let id = BlobId::compute_from_bytes(&bytes);
-        Blob { id, data: BlobData::Owned(bytes) }
+        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+        Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }
    }

    /// Create a new `Blob` with the given id and data.
    #[inline]
    pub fn new(id: BlobId, bytes: Vec<u8>) -> Self {
-        Blob { id, data: BlobData::Owned(bytes) }
+        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+        let cell = OnceCell::new();
+        let _ = cell.set(id);
+        Blob { id: cell, data: BlobData::Owned(bytes), temp_id }
    }

    #[inline]
@ -269,9 +294,15 @@ impl BlobId {
     /// Create a new BlobId computed from the given input.
+    ///
+    /// The digest is SHA-1 over the header `blob <len>\0` followed by content bytes.
+    /// Inputs of at most 128 KiB (`2 * CHUNK`) are hashed in full. Longer inputs are sampled:
+    /// only the first and the last 64 KiB are fed to the hasher, while the header still
+    /// carries the full length.
+    ///
+    /// NOTE(review): for sampled inputs the id is not a hash of the whole content. Two large
+    /// inputs of equal length that differ only between the sampled head and tail receive the
+    /// same `BlobId`; confirm that every consumer keyed on `BlobId` (e.g. blob de-duplication)
+    /// tolerates this.
     #[inline]
     pub fn new(input: &[u8]) -> Self {
+        // Size of each sampled region for large inputs.
+        const CHUNK: usize = 64 * 1024; // 64KB from start and end
         let mut hasher = Sha1::new();
         write!(&mut hasher, "blob {}\0", input.len()).unwrap();
-        hasher.update(input);
+        if input.len() <= CHUNK * 2 {
+            hasher.update(input);
+        } else {
+            // Head and tail only; the bytes in between do not influence the digest.
+            hasher.update(&input[..CHUNK]);
+            hasher.update(&input[input.len() - CHUNK..]);
+        }
         BlobId(hasher.finalize().as_slice().try_into().expect("SHA-1 output size mismatch"))
     }

@ -360,9 +391,6 @@ pub struct BlobMetadata {
    /// The guessed multimedia type of the blob
    pub mime_essence: Option<String>,

-    /// The guessed charset of the blob
-    pub charset: Option<String>,
-
    /// The guessed programming language of the blob
    pub language: Option<String>,
 }
@ -384,9 +412,4 @@ impl BlobMetadata {
    pub fn mime_essence(&self) -> Option<&str> {
        self.mime_essence.as_deref()
    }
-
-    #[inline]
-    pub fn charset(&self) -> Option<&str> {
-        self.charset.as_deref()
-    }
 }
--- a/src/cli/commands/inputs.rs
+++ b/src/cli/commands/inputs.rs
@ -169,7 +169,7 @@ pub struct InputSpecifierArgs {
 #[derive(Args, Debug, Clone)]
 pub struct ContentFilteringArgs {
    /// Ignore files larger than the given size in MB
-    #[arg(long("max-file-size"), default_value_t = 25.0)]
+    #[arg(long("max-file-size"), default_value_t = 64.0)]
    pub max_file_size_mb: f64,

    // /// Use custom path-based ignore rules from the given file(s)
--- a/src/content_type.rs
+++ b/src/content_type.rs
@ -96,9 +96,8 @@ impl ContentInspector {
    #[inline]
    #[must_use]
    pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
-        String::from_utf8(bytes.to_vec()).ok().map(|_| "UTF-8".to_string())
+        std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
    }
-
    /// Guess programming language with broad coverage using `tokei`.
    ///
    /// Strategy (no disk I/O):
--- a/src/decompress.rs
+++ b/src/decompress.rs
@ -161,11 +161,8 @@ fn handle_zip_archive_streaming(
    Ok(CompressedContent::ArchiveFiles(entries_on_disk))
 }

-fn handle_asar_archive_in_memory(
-    buffer: Vec<u8>,
-    archive_path: &Path,
-) -> Result<CompressedContent> {
-    match AsarReader::new(&buffer, None) {
+fn handle_asar_archive_in_memory(buffer: &[u8], archive_path: &Path) -> Result<CompressedContent> {
+    match AsarReader::new(buffer, None) {
        Ok(reader) => {
            let mut contents = Vec::new();
            for (path_in_asar, file) in reader.files() {
@ -200,7 +197,7 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result<CompressedCon
        match ext {
            "asar" => {
                let mmap = unsafe { Mmap::map(&file)? };
-                return handle_asar_archive_in_memory(mmap.to_vec(), path);
+                return handle_asar_archive_in_memory(&mmap, path);
            }
            "tar" => {
                if let Some(base) = base_dir {
--- a/src/findings_store.rs
+++ b/src/findings_store.rs
@ -18,6 +18,7 @@ use crate::{
    matcher::Match,
    origin::{Origin, OriginSet},
    rules::rule::Rule,
+    util::intern,
 };

 // share with Arc so every blob/origin is materialised once
@ -151,7 +152,7 @@ impl FindingsStore {
                    .captures
                    .get(1)
                    .or_else(|| m.groups.captures.get(0))
-                    .map_or("", |c| c.value.as_ref());
+                    .map_or("", |c| c.value);

                let origin_kind = match origin.first() {
                    Origin::GitRepo(_) => "git",
@ -160,7 +161,7 @@ impl FindingsStore {
                };

                let key = xxh3_64(
-                    format!("{}|{}|{}", m.rule_text_id.to_uppercase(), origin_kind, snippet)
+                    format!("{}|{}|{}", m.rule.id().to_uppercase(), origin_kind, snippet)
                        .as_bytes(),
                );

@ -280,7 +281,7 @@ impl FindingsStore {
    pub fn get_summary(&self) -> FxHashMap<&'static str, usize> {
        self.matches.iter().fold(FxHashMap::default(), |mut acc, msg| {
            let (_, _, m) = &**msg;
-            *acc.entry(m.rule_name).or_insert(0) += 1; // borrow, no alloc
+            *acc.entry(intern(m.rule.name())).or_insert(0) += 1;
            acc
        })
    }
@ -342,13 +343,13 @@ impl FindingsStore {
        self.matches.iter().map(|msg| {
            let (_, _, match_item) = &**msg;
            finding_data::FindingMetadata {
-                rule_name: match_item.rule_name.to_string(),
+                rule_name: match_item.rule.name().to_string(),
                num_matches: 1,
                comment: None,
                visible: match_item.visible,
                finding_id: match_item.finding_id(),
-                rule_finding_fingerprint: match_item.rule_finding_fingerprint.to_string(),
-                rule_text_id: match_item.rule_text_id.to_string(),
+                rule_finding_fingerprint: match_item.rule.finding_sha1_fingerprint().to_string(),
+                rule_text_id: match_item.rule.id().to_string(),
            }
        })
    }
@ -362,7 +363,7 @@ impl FindingsStore {
            .iter()
            .filter(|msg| {
                let (_, _, match_item) = &***msg;
-                match_item.rule_name == metadata.rule_name
+                match_item.rule.name() == metadata.rule_name
            })
            .map(|msg| {
                let (origin, blob_metadata, match_item) = &**msg;
@ -373,7 +374,7 @@ impl FindingsStore {
                    match_id: MatchIdInt::from_str(&match_item.finding_id())?,
                    match_comment: None,
                    visible: match_item.visible,
-                    match_confidence: match_item.rule_confidence,
+                    match_confidence: match_item.rule.confidence(),
                    validation_response_body: match_item.validation_response_body.clone(),
                    validation_response_status: match_item.validation_response_status,
                    validation_success: match_item.validation_success,
--- a/src/location.rs
+++ b/src/location.rs
@ -1,4 +1,5 @@
 use core::ops::Range;
+use std::cell::RefCell;

 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
@ -87,25 +88,35 @@ impl std::fmt::Display for SourceSpan {
 }

 /// Records newline byte‑offsets to map offsets to (line, column).
-pub struct LocationMapping {
-    newline_offsets: Vec<usize>,
+pub struct LocationMapping<'a> {
+    bytes: &'a [u8],
+    newline_offsets: RefCell<Vec<usize>>,
 }

-impl LocationMapping {
-    /// Scan once for all `\n` positions.
-    pub fn new(input: &[u8]) -> Self {
-        let newline_offsets =
-            input.iter().enumerate().filter_map(|(i, &b)| (b == b'\n').then_some(i)).collect();
-        LocationMapping { newline_offsets }
+impl<'a> LocationMapping<'a> {
+    /// Create a new mapping without pre-scanning the entire input.
+    pub fn new(input: &'a [u8]) -> Self {
+        LocationMapping { bytes: input, newline_offsets: RefCell::new(Vec::new()) }
    }

-    /// Map a byte offset to a `SourcePoint`.
-    pub fn get_source_point(&self, offset: usize) -> SourcePoint {
-        let line = match self.newline_offsets.binary_search(&offset) {
-            Ok(idx) => idx + 2, // exact newline -- next line
+    /// Extend the cached newline offsets so they cover every `\n` at or before `offset`.
+    ///
+    /// Scanning resumes just after the last newline already recorded and is skipped when
+    /// `offset` lies before that point. `offset` values past the end of the input are clamped.
+    fn ensure_offsets_up_to(&self, offset: usize) {
+        let mut offsets = self.newline_offsets.borrow_mut();
+        let start = offsets.last().map_or(0, |&last| last + 1);
+        if offset < start {
+            return;
+        }
+        // The scanned range must include the byte at `offset` itself:
+        // `source_point_from_offsets` handles an offset that is exactly a newline in the `Ok`
+        // arm of its binary search. With an exclusive end, that newline was cached only if an
+        // earlier query had gone further, so the same offset could map to different points
+        // depending on call order.
+        let end = offset.saturating_add(1).min(self.bytes.len());
+        offsets.extend(memchr::memchr_iter(b'\n', &self.bytes[start..end]).map(|nl| start + nl));
+    }
+
+    fn source_point_from_offsets(offsets: &[usize], offset: usize) -> SourcePoint {
+        let line = match offsets.binary_search(&offset) {
+            Ok(idx) => idx + 2,
            Err(idx) => idx + 1,
        };
-        let column = if let Some(&last) = self.newline_offsets.get(line.saturating_sub(2)) {
+        let column = if let Some(&last) = offsets.get(line.saturating_sub(2)) {
            offset.saturating_sub(last + 1)
        } else {
            offset
@ -113,10 +124,19 @@ impl LocationMapping {
        SourcePoint { line, column }
    }

+    /// Translate a byte offset into its `SourcePoint`, scanning for newlines on demand.
+    pub fn get_source_point(&self, offset: usize) -> SourcePoint {
+        self.ensure_offsets_up_to(offset);
+        Self::source_point_from_offsets(self.newline_offsets.borrow().as_slice(), offset)
+    }
+
     /// Map an `OffsetSpan` to a `SourceSpan` (closed interval).
    pub fn get_source_span(&self, span: &OffsetSpan) -> SourceSpan {
-        let start = self.get_source_point(span.start);
-        let end = self.get_source_point(span.end.saturating_sub(1));
+        self.ensure_offsets_up_to(span.end.saturating_sub(1));
+        let offsets = self.newline_offsets.borrow();
+        let start = Self::source_point_from_offsets(&offsets, span.start);
+        let end = Self::source_point_from_offsets(&offsets, span.end.saturating_sub(1));
        SourceSpan { start, end }
    }
 }
--- a/src/matcher.rs
+++ b/src/matcher.rs
@ -10,7 +10,6 @@ use anyhow::Result;
 use base64::{engine::general_purpose, Engine};
 use bstr::BString;
 use http::StatusCode;
-use lazy_static::lazy_static;
 use regex::bytes::Regex;
 use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
 use schemars::{
@ -27,7 +26,7 @@ use xxhash_rust::xxh3::xxh3_64;
 use crate::{
    blob::{Blob, BlobId, BlobIdMap},
    entropy::calculate_shannon_entropy,
-    location::{Location, LocationMapping, OffsetSpan},
+    location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
    origin::OriginSet,
    parser,
    parser::{Checker, Language},
@ -40,6 +39,10 @@ use crate::{
    util::{intern, redact_value},
 };

+const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
+const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
+const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
+
 // -------------------------------------------------------------------------------------------------
 // RawMatch
 // -------------------------------------------------------------------------------------------------
@ -255,17 +258,28 @@ impl<'a> Matcher<'a> {
        self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64));

        self.user_data.input_len = input.len() as u64;
-        // self.vs_scanner.scan(input, |rid, from, to, _flags| {
-        self.scanner_pool.with(|scanner| {
-            scanner.scan(input, |rule_id, from, to, _flags| {
-                self.user_data.raw_matches_scratch.push(RawMatch {
-                    rule_id,
-                    start_idx: from,
-                    end_idx: to,
-                });
-                vectorscan_rs::Scan::Continue
-            })
-        })?;
+
+        let mut offset: usize = 0;
+        while offset < input.len() {
+            let end = (offset + MAX_CHUNK_SIZE).min(input.len());
+            let slice = &input[offset..end];
+            let base = offset as u64;
+            self.scanner_pool.with(|scanner| {
+                scanner.scan(slice, |rule_id, from, to, _flags| {
+                    self.user_data.raw_matches_scratch.push(RawMatch {
+                        rule_id,
+                        start_idx: from + base,
+                        end_idx: to + base,
+                    });
+                    vectorscan_rs::Scan::Continue
+                })
+            })?;
+
+            if end == input.len() {
+                break;
+            }
+            offset = end.saturating_sub(CHUNK_OVERLAP);
+        }

        Ok(())
    }
@ -288,17 +302,6 @@ impl<'a> Matcher<'a> {
        self.local_stats.blobs_scanned += 1;
        self.local_stats.bytes_scanned += blob.bytes().len() as u64;

-        // Check if blob was already seen and respect no_dedup flag
-        if !no_dedup {
-            if let Some(had_matches) = self.seen_blobs.get(&blob.id) {
-                return Ok(if had_matches {
-                    ScanResult::SeenWithMatches
-                } else {
-                    ScanResult::SeenSansMatches
-                });
-            }
-        }
-
        // Extract filename from origin
        let filename = origin
            .first()
@ -313,24 +316,19 @@ impl<'a> Matcher<'a> {
        // Opportunistically look for standalone Base64 blobs. If neither
        // the raw scan nor this check yields anything, we can return early
        // before doing any heavier work.
-        let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) };
+        let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
+            Vec::new()
+        } else {
+            get_base64_strings(blob.bytes())
+        };

        if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() {
-            // Only record in seen_blobs if deduplication is enabled
-            if !no_dedup {
-                return Ok(match self.seen_blobs.insert(blob.id, false) {
-                    None => ScanResult::New(Vec::new()),
-                    Some(true) => ScanResult::SeenWithMatches,
-                    Some(false) => ScanResult::SeenSansMatches,
-                });
-            } else {
-                return Ok(ScanResult::New(Vec::new()));
-            }
+            return Ok(ScanResult::New(Vec::new()));
        }

        let rules_db = self.rules_db;
        let mut seen_matches = FxHashSet::default();
-        let mut previous_matches = Vec::new();
+        let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
        let tree_sitter_result = if self.user_data.raw_matches_scratch.is_empty() {
            None
        } else {
@ -363,27 +361,19 @@ impl<'a> Matcher<'a> {
                })
                .collect::<Vec<_>>()
        });
-        let mut previous_raw_matches: Vec<(usize, OffsetSpan)> = Vec::new();
+        let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
        for &RawMatch { rule_id, start_idx, end_idx } in
            self.user_data.raw_matches_scratch.iter().rev()
        {
            let rule_id_usize: usize = rule_id as usize;
-            // let rule = &rules_db.rules[rule_id_usize];
            let rule = Arc::clone(&rules_db.rules[rule_id_usize]);
            let re = &rules_db.anchored_regexes[rule_id_usize];
            let start_idx_usize = start_idx as usize;
            let end_idx_usize = end_idx as usize;
            let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
-            // Skip if fully contained in a previous match
-            if previous_raw_matches.iter().any(|(prev_id, prev_span): &(usize, OffsetSpan)| {
-                *prev_id == rule_id_usize
-                    && (prev_span.fully_contains(&current_span)
-                        || current_span.fully_contains(prev_span))
-            }) {
+            if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
                continue;
            }
-            let matching_input_offset_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
-            previous_raw_matches.push((rule_id_usize, matching_input_offset_span));
            filter_match(
                blob,
                rule,
@ -472,9 +462,15 @@ impl<'a> Matcher<'a> {
            }
        }
        // Finalize
-        // Only record in seen_blobs if deduplication is enabled
-        if !no_dedup {
-            self.seen_blobs.insert(blob.id, !matches.is_empty());
+        if !no_dedup && !matches.is_empty() {
+            let blob_id = blob.id();
+            if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
+                return Ok(if had_matches {
+                    ScanResult::SeenWithMatches
+                } else {
+                    ScanResult::SeenSansMatches
+                });
+            }
        }

        // --- opportunistic capacity cap ---------------------------------
@ -501,6 +497,39 @@ fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -
    end.hash(&mut hasher);
    hasher.finish()
 }
+
+/// Try to add `span` to `spans`, a list kept ordered by `start`.
+///
+/// Returns `false` (leaving `spans` untouched) when the entry just before the insertion
+/// point, or the entry at the insertion point, fully contains `span`. Otherwise each of
+/// those two entries that `span` fully contains is removed, `span` is inserted at its
+/// sorted position, and `true` is returned.
+///
+/// NOTE(review): only the two entries adjacent to the insertion point are examined, so at
+/// most two contained spans are evicted per call. Unlike the linear filter this replaces,
+/// a new span that contains an existing one is accepted rather than rejected; confirm this
+/// is intended.
+#[inline]
+fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
+    // Insertion index by `start`; on an exact hit, reuse the index of the matching entry.
+    let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
+    if idx > 0 {
+        // Left neighbour: starts at or before `span`.
+        if spans[idx - 1].fully_contains(&span) {
+            return false;
+        }
+        if span.fully_contains(&spans[idx - 1]) {
+            spans.remove(idx - 1);
+            // Everything after the removed entry shifted left by one.
+            idx -= 1;
+        }
+    }
+    if idx < spans.len() {
+        // Entry currently occupying the insertion slot.
+        if spans[idx].fully_contains(&span) {
+            return false;
+        }
+        if span.fully_contains(&spans[idx]) {
+            spans.remove(idx);
+        }
+    }
+    spans.insert(idx, span);
+    true
+}
+
+/// Track `span` under `rule_id`, creating that rule's span list on first use.
+///
+/// Returns the result of `insert_span` for the rule's list: `true` when the span was
+/// recorded, `false` when it was rejected.
+#[inline]
+fn record_match(
+    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
+    rule_id: usize,
+    span: OffsetSpan,
+) -> bool {
+    let spans_for_rule = map.entry(rule_id).or_default();
+    insert_span(spans_for_rule, span)
+}
 fn filter_match<'b>(
    blob: &'b Blob,
    // rule: &'b Rule,
@ -509,7 +538,7 @@ fn filter_match<'b>(
    start: usize,
    end: usize,
    matches: &mut Vec<BlobMatch<'b>>,
-    previous_matches: &mut Vec<(usize, OffsetSpan)>,
+    previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
    rule_id: usize,
    seen_matches: &mut FxHashSet<u64>,
    _origin: &OriginSet,
@ -558,11 +587,7 @@ fn filter_match<'b>(
        if !seen_matches.insert(match_key) {
            continue;
        }
-        if previous_matches.iter().any(|(prev_rule_id, prev_loc)| {
-            *prev_rule_id == rule_id
-                && (prev_loc.fully_contains(&matching_input_offset_span)
-                    || matching_input_offset_span.fully_contains(prev_loc))
-        }) {
+        if !record_match(previous_matches, rule_id, matching_input_offset_span) {
            continue;
        }
        let only_matching_input =
@ -571,7 +596,7 @@ fn filter_match<'b>(
            SerializableCaptures::from_captures(&captures, byte_slice.as_ref(), re, redact);
        matches.push(BlobMatch {
            rule: Arc::clone(&rule),
-            blob_id: &blob.id,
+            blob_id: blob.id_ref(),
            matching_input: only_matching_input,
            matching_input_offset_span,
            captures: groups,
@ -581,7 +606,6 @@ fn filter_match<'b>(
            calculated_entropy,
            is_base64,
        });
-        previous_matches.push((rule_id, matching_input_offset_span));
    }
    if let Some(t) = timer.take() {
        let new_count = (matches.len() - initial_len) as u64;
@ -682,20 +706,20 @@ impl JsonSchema for Groups {
 //     pub end: usize,    // End position of the match
 //     pub value: String, // The actual captured value
 // }
-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[derive(Debug, Clone, Serialize, JsonSchema)]
 pub struct SerializableCapture {
    pub name: Option<String>,
    pub match_number: i32,
    pub start: usize,
    pub end: usize,
-    // Instead of storing an owned String, store a borrowed or interned value.
-    // Here we use Cow to allow either borrowing or owning as needed.
-    pub value: std::borrow::Cow<'static, str>,
+    /// Interned value of the capture.
+    pub value: &'static str,
 }

-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[derive(Debug, Clone, Serialize, JsonSchema)]
 pub struct SerializableCaptures {
-    pub captures: Vec<SerializableCapture>, // All captures (named and unnamed)
+    #[schemars(with = "Vec<SerializableCapture>")]
+    pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
 }
 impl SerializableCaptures {
    pub fn from_captures(
@ -704,7 +728,7 @@ impl SerializableCaptures {
        re: &Regex,
        redact: bool,
    ) -> Self {
-        let mut serialized_captures = Vec::new();
+        let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
        // Process named captures
        for name in re.capture_names().flatten() {
            if let Some(capture) = captures.name(name) {
@ -718,7 +742,7 @@ impl SerializableCaptures {
                    match_number: -1,
                    start: capture.start(),
                    end: capture.end(),
-                    value: value.into(),
+                    value: intern(&value),
                });
            }
        }
@ -735,7 +759,7 @@ impl SerializableCaptures {
                    match_number: i32::try_from(i).unwrap_or(0),
                    start: capture.start(),
                    end: capture.end(),
-                    value: value.into(),
+                    value: intern(&value),
                });
            }
        }
@ -764,16 +788,9 @@ pub struct Match {
    pub finding_fingerprint: u64,

    /// The rule that produced this match
-    pub rule_finding_fingerprint: &'static str,
-
-    /// The text identifier of the rule that produced this match
-    pub rule_text_id: &'static str,
-
-    /// The name of the rule that produced this match
-    pub rule_name: &'static str,
-
-    /// The confidence property of the rule that produced this match
-    pub rule_confidence: crate::rules::rule::Confidence,
+    #[serde(skip_serializing)]
+    #[schemars(skip)]
+    pub rule: Arc<Rule>,

    /// Validation Body
    pub validation_response_body: String,
@ -794,7 +811,7 @@ pub struct Match {
 impl Match {
    #[inline]
    pub fn convert_owned_blobmatch_to_match<'a>(
-        loc_mapping: &'a LocationMapping,
+        loc_mapping: Option<&'a LocationMapping<'a>>,
        owned_blob_match: &'a OwnedBlobMatch,
        origin_type: &'a str,
    ) -> Self {
@ -812,9 +829,11 @@ impl Match {
        // The fingerprint will be based on the content of the secret.
        let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");

-        let source_span = loc_mapping.get_source_span(&offset_span);
-        let rule_finding_fingerprint = owned_blob_match.rule.finding_sha1_fingerprint().to_owned();
-
+        let source_span =
+            loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
+                start: SourcePoint { line: 0, column: 0 },
+                end: SourcePoint { line: 0, column: 0 },
+            });
        let offset_start: u64 =
            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
@ -828,10 +847,7 @@ impl Match {

        // matching_snippet
        Match {
-            rule_finding_fingerprint: intern(&rule_finding_fingerprint),
-            rule_name: intern(owned_blob_match.rule.name()),
-            rule_confidence: owned_blob_match.rule.confidence(),
-            rule_text_id: intern(owned_blob_match.rule.id()),
+            rule: owned_blob_match.rule.clone(),
            visible: owned_blob_match.rule.visible().to_owned(),
            location: Location { offset_span, source_span: source_span.clone() },
            groups: owned_blob_match.captures.clone(),
@ -852,7 +868,7 @@ impl Match {

    pub fn finding_id(&self) -> String {
        let mut h = Sha1::new();
-        write!(&mut h, "{}\0", self.rule_finding_fingerprint)
+        write!(&mut h, "{}\0", self.rule.finding_sha1_fingerprint())
            .expect("should be able to write to memory");
        serde_json::to_writer(&mut h, &self.groups)
            .expect("should be able to serialize groups as JSON");
@ -892,33 +908,48 @@ pub struct DecodedData {
    pub pos_start: usize,
    pub pos_end: usize,
 }
+#[inline]
+fn is_base64_byte(b: u8) -> bool {
+    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/')
+}
+
 pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
-    lazy_static! {
-        // Require a reasonably long run of valid Base64 characters to reduce
-        // noise. 32 bytes corresponds to 24 decoded bytes.
-        static ref RE_BASE64: Regex = Regex::new(r"[A-Za-z0-9+/]{32,}={0,2}").unwrap();
-    }
    let mut results = Vec::new();
-    for m in RE_BASE64.find_iter(input) {
-        let base64_string = m.as_bytes();
-        // Skip candidates whose length isn't a multiple of four – they cannot
-        // be valid Base64.
-        if base64_string.len() % 4 != 0 {
-            continue;
+    let mut i = 0;
+    while i < input.len() {
+        while i < input.len() && !is_base64_byte(input[i]) {
+            i += 1;
        }
-        if let Ok(decoded) = general_purpose::STANDARD.decode(base64_string) {
-            if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
-                if decoded_str.is_ascii() {
-                    results.push(DecodedData {
-                        original: String::from_utf8_lossy(base64_string).into_owned(),
-                        decoded: decoded_str.to_string(),
-                        pos_start: m.start(),
-                        pos_end: m.end(),
-                    });
+        let start = i;
+        while i < input.len() && is_base64_byte(input[i]) {
+            i += 1;
+        }
+
+        let mut eq_count = 0;
+        while i < input.len() && input[i] == b'=' && eq_count < 2 {
+            i += 1;
+            eq_count += 1;
+        }
+        let end = i;
+
+        let len = end - start;
+        if len >= 32 && len % 4 == 0 {
+            let base64_slice = &input[start..end];
+            if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) {
+                if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
+                    if decoded_str.is_ascii() {
+                        results.push(DecodedData {
+                            original: String::from_utf8_lossy(base64_slice).into_owned(),
+                            decoded: decoded_str.to_string(),
+                            pos_start: start,
+                            pos_end: end,
+                        });
+                    }
                }
            }
        }
    }
+
    results
 }

--- a/src/reporter.rs
+++ b/src/reporter.rs
@ -216,7 +216,7 @@ impl DetailsReporter {
                    m: match_item.clone(),
                    comment: None,
                    visible: match_item.visible,
-                    match_confidence: match_item.rule_confidence,
+                    match_confidence: match_item.rule.confidence(),
                    validation_response_body: match_item.validation_response_body.clone(),
                    validation_response_status: match_item.validation_response_status,
                    validation_success: match_item.validation_success,
@ -366,13 +366,13 @@ impl DetailsReporter {

        FindingReporterRecord {
            rule: RuleMetadata {
-                name: rm.m.rule_name.to_string(),
-                id: rm.m.rule_text_id.to_string(),
+                name: rm.m.rule.name().to_string(),
+                id: rm.m.rule.id().to_string(),
            },
            finding: FindingRecordData {
                snippet,
                fingerprint: rm.m.finding_fingerprint.to_string(),
-                confidence: rm.match_confidence.to_string(),
+                confidence: rm.m.rule.confidence().to_string(),
                entropy: format!("{:.2}", rm.m.calculated_entropy),
                validation: ValidationInfo { status: validation_status, response: response_body },
                language: rm
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@ -35,6 +35,7 @@ mod tests {
    use crate::cli::commands::github::GitHistoryMode;
    use crate::cli::commands::rules::RuleSpecifierArgs;
    use crate::matcher::{SerializableCapture, SerializableCaptures};
+    use crate::rules::rule::{Confidence, Rule, RuleSyntax};
    use crate::util::intern;
    use crate::{
        blob::BlobId,
@ -49,6 +50,7 @@ mod tests {
        origin::Origin,
        reporter::styles::Styles,
    };
+    use smallvec::smallvec;
    use std::{
        io::Cursor,
        path::PathBuf,
@ -132,12 +134,21 @@ mod tests {
        }
    }

-    fn create_mock_match(
-        rule_name: &str,
-        rule_text_id: &str,
-        rule_finding_fingerprint: &str,
-        validation_success: bool,
-    ) -> Match {
+    fn create_mock_match(rule_name: &str, rule_text_id: &str, validation_success: bool) -> Match {
+        let syntax = RuleSyntax {
+            name: rule_name.to_string(),
+            id: rule_text_id.to_string(),
+            pattern: "dummy".to_string(),
+            min_entropy: 0.0,
+            confidence: Confidence::Medium,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None,
+            depends_on_rule: vec![],
+        };
+        let rule = Arc::new(Rule::new(syntax));
        Match {
            location: Location {
                offset_span: OffsetSpan { start: 10, end: 20 },
@ -147,20 +158,17 @@ mod tests {
                },
            },
            groups: SerializableCaptures {
-                captures: vec![SerializableCapture {
+                captures: smallvec![SerializableCapture {
                    name: Some("token".to_string()),
                    match_number: 1,
                    start: 10,
                    end: 20,
-                    value: "mock_token".into(),
+                    value: intern("mock_token"),
                }],
            },
            blob_id: BlobId::new(b"mock_blob"),
            finding_fingerprint: 0123,
-            rule_finding_fingerprint: intern(rule_finding_fingerprint),
-            rule_text_id: intern(rule_text_id),
-            rule_name: intern(rule_name),
-            rule_confidence: Confidence::Medium,
+            rule,
            validation_response_body: "validation response".to_string(),
            validation_response_status: 200,
            validation_success,
@ -177,7 +185,6 @@ mod tests {
                id: BlobId::new(b"mock_blob"),
                num_bytes: 1024,
                mime_essence: Some("text/plain".to_string()),
-                charset: Some("UTF-8".to_string()),
                language: Some("Rust".to_string()),
            };
            let dedup = true;
@ -204,15 +211,13 @@ mod tests {

    #[test]
    fn test_json_format() -> Result<()> {
-        let mock_match =
-            create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
+        let mock_match = create_mock_match("MockRule", "mock_rule_1", true);
        let matches = vec![ReportMatch {
            origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]),
            blob_metadata: BlobMetadata {
                id: BlobId::new(b"mock_blob"),
                num_bytes: 1024,
                mime_essence: Some("text/plain".to_string()),
-                charset: Some("UTF-8".to_string()),
                language: Some("Rust".to_string()),
            },
            m: mock_match,
@ -238,12 +243,7 @@ mod tests {
    fn test_validation_status_in_json() -> Result<()> {
        let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")];
        for (validation_success, expected_status) in test_cases {
-            let mock_match = create_mock_match(
-                "MockRule",
-                "mock_rule_1",
-                "mock_finding_fingerprint",
-                validation_success,
-            );
+            let mock_match = create_mock_match("MockRule", "mock_rule_1", validation_success);
            let matches = vec![ReportMatch {
                origin: OriginSet::new(
                    Origin::from_file(PathBuf::from("/mock/path/file.rs")),
@ -253,7 +253,6 @@ mod tests {
                    id: BlobId::new(b"mock_blob"),
                    num_bytes: 1024,
                    mime_essence: Some("text/plain".to_string()),
-                    charset: Some("UTF-8".to_string()),
                    language: Some("Rust".to_string()),
                },
                m: mock_match,
--- a/src/scanner/docker.rs
+++ b/src/scanner/docker.rs
@ -175,9 +175,11 @@ impl Docker {

        pb.set_length(layer_paths.len() as u64);
        for p in layer_paths {
-            let mut data = Vec::new();
-            File::open(&p)?.read_to_end(&mut data)?;
-            let digest = format!("{:x}", Sha256::digest(&data));
+            let mut file = File::open(&p)?;
+            let mut hasher = Sha256::new();
+            std::io::copy(&mut file, &mut hasher)?;
+            let digest = format!("{:x}", hasher.finalize());
+
            let new_path = out_dir.join(format!("layer_{digest}.tar"));
            std::fs::rename(&p, &new_path)?;
            // extract layer contents so inner filenames appear in scan results
--- a/src/scanner/enumerate.rs
+++ b/src/scanner/enumerate.rs
@ -172,10 +172,15 @@ pub fn enumerate_filesystem_inputs(
                };
                if should_skip {
                    progress.suspend(|| {
+                        let path = origin
+                            .first()
+                            .blob_path()
+                            .map(|p| p.display().to_string())
+                            .unwrap_or_else(|| blob.temp_id().to_string());
                        if is_archive {
-                            debug!("Skipping archive: {}", blob.id);
+                            debug!("Skipping archive: {path}");
                        } else {
-                            debug!("Skipping binary blob: {}", blob.id);
+                            debug!("Skipping binary blob: {path}");
                        }
                    });
                    return Ok(());
--- a/src/scanner/processing.rs
+++ b/src/scanner/processing.rs
@ -12,6 +12,8 @@ use crate::{
    Path,
 };

+const LOCATION_LIMIT_BYTES: usize = 256 * 1024 * 1024;
+
 /// A matcher along with parameters that remain constant during a single
 /// `scan` run
 pub struct BlobProcessor<'a> {
@ -27,8 +29,7 @@ impl<'a> BlobProcessor<'a> {
        redact: bool,
        no_base64: bool,
    ) -> Result<Option<DatastoreMessage>> {
-        let blob_id = blob.id.hex();
-        let _span = debug_span!("matcher", blob_id).entered();
+        let _span = debug_span!("matcher", temp_id = blob.temp_id()).entered();
        let t1 = Instant::now();
        let res = self.matcher.scan_blob(&blob, &origin, None, redact, no_dedup, no_base64)?;
        let scan_us = t1.elapsed().as_micros();
@ -43,10 +44,9 @@ impl<'a> BlobProcessor<'a> {
            ScanResult::SeenWithMatches => {
                trace!("({scan_us}us) blob already scanned with matches");
                let metadata = BlobMetadata {
-                    id: blob.id,
+                    id: blob.id(),
                    num_bytes: blob.len(),
                    mime_essence: None,
-                    charset: None,
                    language: None,
                };
                Ok(Some((origin, metadata, Vec::new())))
@ -63,10 +63,9 @@ impl<'a> BlobProcessor<'a> {
                }
                let md = MetadataResult::from_blob_and_origin(&blob, &origin);
                let metadata = BlobMetadata {
-                    id: blob.id,
+                    id: blob.id(),
                    num_bytes: blob.len(),
                    mime_essence: md.mime_essence,
-                    charset: md.charset,
                    language: md.language,
                };

@ -84,12 +83,17 @@ impl<'a> BlobProcessor<'a> {
                    }
                }

-                let loc_mapping = LocationMapping::new(&blob.bytes());
+                let bytes = blob.bytes();
+                let loc_mapping = if bytes.len() <= LOCATION_LIMIT_BYTES {
+                    Some(LocationMapping::new(bytes))
+                } else {
+                    None
+                };
                let converted_matches: Vec<(Option<f64>, Match)> = matches
                    .into_iter()
                    .map(|m| {
                        let converted_match = Match::convert_owned_blobmatch_to_match(
-                            &loc_mapping,
+                            loc_mapping.as_ref(),
                            &OwnedBlobMatch::from_blob_match(m),
                            origin_type,
                        );
@ -106,7 +110,6 @@ impl<'a> BlobProcessor<'a> {
 struct MetadataResult {
    mime_essence: Option<String>,
    language: Option<String>,
-    charset: Option<String>,
 }
 impl MetadataResult {
    fn from_blob_and_origin(blob: &Blob, origin: &OriginSet) -> MetadataResult {
@ -115,7 +118,6 @@ impl MetadataResult {
        let mime_essence = Some(tree_magic_mini::from_u8(bytes).to_string());
        let inspector = ContentInspector::default();
        let language = blob_path.and_then(|p| inspector.guess_language(p, bytes));
-        let charset = inspector.guess_charset(bytes);
-        MetadataResult { mime_essence, language, charset }
+        MetadataResult { mime_essence, language }
    }
 }
--- a/src/scanner/validation.rs
+++ b/src/scanner/validation.rs
@ -13,7 +13,7 @@ use futures::{stream, StreamExt};
 use indicatif::{ProgressBar, ProgressStyle};
 use liquid::Parser;
 use reqwest::{Client, StatusCode};
-use rustc_hash::{FxHashMap, FxHashSet};
+use rustc_hash::FxHashMap;
 use tokio::{sync::Notify, time::timeout};

 use crate::{
@ -21,7 +21,6 @@ use crate::{
    findings_store::{FindingsStore, FindingsStoreMessage},
    location::OffsetSpan,
    matcher::{Match, OwnedBlobMatch},
-    rules::rule,
    validation::{collect_variables_and_dependencies, validate_single_match, CachedResponse},
 };

@ -40,7 +39,7 @@ pub async fn run_secret_validation(
    let fail_count = Arc::new(AtomicUsize::new(0));

    // ── 2. Fetch rules + matches ────────────────────────────────────────────
-    let (all_rules, all_matches_by_blob) = {
+    let (_all_rules, all_matches_by_blob) = {
        let ds = datastore.lock().unwrap();
        let rules = ds.get_rules()?;
        let mut map: FxHashMap<BlobId, Vec<Arc<FindingsStoreMessage>>> = FxHashMap::default();
@ -51,16 +50,10 @@ pub async fn run_secret_validation(
    };

    // ── 3. Partition blobs ──────────────────────────────────────────────────
-    let rules_with_deps: FxHashSet<&str> = all_rules
-        .iter()
-        .filter(|r| !r.syntax().depends_on_rule.is_empty())
-        .map(|r| r.id())
-        .collect();
-
    let mut simple_matches = Vec::new();
    let mut dependent_blobs = FxHashMap::default(); // blob_id -- Vec<Arc<…>>
    for (blob_id, matches) in all_matches_by_blob {
-        if matches.iter().any(|m| rules_with_deps.contains(m.2.rule_text_id)) {
+        if matches.iter().any(|m| !m.2.rule.syntax().depends_on_rule.is_empty()) {
            dependent_blobs.insert(blob_id, matches);
        } else {
            simple_matches.extend(matches);
@ -80,11 +73,8 @@ pub async fn run_secret_validation(
                .captures
                .get(1)
                .or_else(|| arc_msg.2.groups.captures.get(0))
-                .map_or("", |c| c.value.as_ref());
-            groups
-                .entry(format!("{}|{}", arc_msg.2.rule_text_id, secret))
-                .or_default()
-                .push(arc_msg);
+                .map_or("", |c| c.value);
+            groups.entry(format!("{}|{}", arc_msg.2.rule.id(), secret)).or_default().push(arc_msg);
        }

        let validation_results = DashMap::<String, CachedResponse>::new();
@ -109,7 +99,6 @@ pub async fn run_secret_validation(
            let client = client.clone();
            let cache_glob = cache.clone();
            let val_res = &validation_results;
-            let rules = &all_rules;
            let success = success_count.clone();
            let fail = fail_count.clone();
            // *** FIX: Clone the progress bar for each concurrent task ***
@ -122,8 +111,8 @@ pub async fn run_secret_validation(
                    .captures
                    .get(1)
                    .or_else(|| rep_arc.2.groups.captures.get(0))
-                    .map_or("", |c| c.value.as_ref());
-                let key = format!("{}|{}", rep_arc.2.rule_text_id, secret);
+                    .map_or("", |c| c.value);
+                let key = format!("{}|{}", rep_arc.2.rule.id(), secret);

                match val_res.entry(key.clone()) {
                    dashmap::mapref::entry::Entry::Occupied(_) => return,
@ -138,8 +127,10 @@ pub async fn run_secret_validation(
                    }
                }

-                let rule = find_rule_for_match(rules, rep_arc.2.rule_text_id).unwrap();
-                let mut om = OwnedBlobMatch::convert_match_to_owned_blobmatch(&rep_arc.2, rule);
+                let mut om = OwnedBlobMatch::convert_match_to_owned_blobmatch(
+                    &rep_arc.2,
+                    rep_arc.2.rule.clone(),
+                );

                validate_single(
                    &mut om,
@ -211,7 +202,6 @@ pub async fn run_secret_validation(

        let val_cache = Arc::new(DashMap::<String, CachedResponse>::new());
        let in_flight = Arc::new(DashMap::<String, ()>::new());
-        let rules_ref = Arc::new(all_rules.clone());

        for chunk in blob_ids.chunks(chunk_size) {
            let tasks: Vec<_> = chunk
@ -225,15 +215,15 @@ pub async fn run_secret_validation(
                    let success = success_count.clone();
                    let fail = fail_count.clone();
                    let cache_glob = cache.clone();
-                    let rules = rules_ref.clone();

                    async move {
                        let owned = matches_for_blob
                            .iter()
                            .map(|arc_msg| {
-                                let rule = find_rule_for_match(&rules, arc_msg.2.rule_text_id)
-                                    .expect("rule");
-                                OwnedBlobMatch::convert_match_to_owned_blobmatch(&arc_msg.2, rule)
+                                OwnedBlobMatch::convert_match_to_owned_blobmatch(
+                                    &arc_msg.2,
+                                    arc_msg.2.rule.clone(),
+                                )
                            })
                            .collect::<Vec<_>>();

@ -338,21 +328,6 @@ pub async fn run_secret_validation(
    Ok(())
 }

-/// Returns `Some(Arc<Rule>)` if a matching rule is found; otherwise returns `None`.
-/// Callers can decide how to handle the `None` case (e.g., skip processing).
-fn find_rule_for_match(
-    all_rules: &[Arc<rule::Rule>],
-    rule_text_id: &str,
-) -> Option<Arc<rule::Rule>> {
-    match all_rules.iter().find(|r| r.syntax().id == rule_text_id).cloned() {
-        Some(rule) => Some(rule),
-        None => {
-            eprintln!("Warning: no rule found with id '{}'. Skipping.", rule_text_id);
-            None
-        }
-    }
-}
-
 // ---------------------------------------------------
 // The core validation logic, used in an async pipeline
 // ---------------------------------------------------
--- a/src/update.rs
+++ b/src/update.rs
@ -129,16 +129,29 @@ pub fn check_for_update(global_args: &GlobalArgs, base_url: Option<&str>) -> Opt
                    .apply_to(&format!("Updated to version {}", status.version()))
            ),
            Err(e) => match e {
-                UpdError::Io(ref io_err) if io_err.kind() == ErrorKind::PermissionDenied => {
-                    warn!(
-                        "{}",
-                        styles.style_finding_active_heading.apply_to(
-                            "Cannot replace the current binary - permission denied.\n\
-                             If you installed via a package manager, run its upgrade command.\n\
-                             Otherwise reinstall to a user-writable directory or re-run with sudo."
-                        )
-                    );
-                }
+                UpdError::Io(ref io_err) => match io_err.kind() {
+                    ErrorKind::PermissionDenied => {
+                        warn!(
+                            "{}",
+                            styles.style_finding_active_heading.apply_to(
+                                "Cannot replace the current binary - permission denied.\n\
+                                 If you installed via a package manager, run its upgrade command.\n\
+                                 Otherwise reinstall to a user-writable directory or re-run with sudo."
+                            )
+                        );
+                    }
+                    ErrorKind::NotFound => {
+                        warn!(
+                            "{}",
+                            styles.style_finding_active_heading.apply_to(
+                                "Cannot replace the current binary - file not found.\n\
+                                 If you installed via a package manager, run its upgrade command.\n\
+                                 Otherwise reinstall to a user-writable directory."
+                            )
+                        );
+                    }
+                    _ => error!("Failed to update: {e}"),
+                },
                _ => error!("Failed to update: {e}"),
            },
        }
--- a/src/validation.rs
+++ b/src/validation.rs
@ -910,6 +910,7 @@ mod tests {
    use crossbeam_skiplist::SkipMap;
    use http::StatusCode;
    use rustc_hash::FxHashMap;
+    use smallvec::smallvec;

    use crate::{
        blob::BlobId,
@ -920,6 +921,7 @@ mod tests {
            rule::{Confidence, Rule},
            Rules,
        },
+        util::intern,
        validation::{validate_single_match, Cache},
    };
    #[tokio::test]
@ -1016,12 +1018,12 @@ rules:
            // matching_input: token.as_bytes().to_vec(),
            matching_input_offset_span: OffsetSpan { start: 0, end: token.len() },
            captures: SerializableCaptures {
-                captures: vec![SerializableCapture {
+                captures: smallvec![SerializableCapture {
                    name: Some("TOKEN".to_string()),
                    match_number: -1,
                    start: 0,
                    end: token.len(),
-                    value: token.into(),
+                    value: intern(token),
                }],
            },
            validation_response_body: String::new(),
--- a/src/validation.rs.orig
+++ b/src/validation.rs.orig
--- a/src/validation/utils.rs
+++ b/src/validation/utils.rs
@ -22,7 +22,7 @@ pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String,
        .map(|cap| {
            let name =
                cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string());
-            (name, cap.value.clone().into_owned(), cap.start, cap.end)
+            (name, cap.value.to_string(), cap.start, cap.end)
        })
        .collect()
 }
@ -76,16 +76,17 @@ mod tests {
    use super::*;
    use crate::matcher::{SerializableCapture, SerializableCaptures};
    use pretty_assertions::assert_eq;
+    use smallvec::smallvec;

    #[test]
    fn single_unnamed_capture_is_returned() {
        let captures = SerializableCaptures {
-            captures: vec![SerializableCapture {
+            captures: smallvec![SerializableCapture {
                name: None,
                match_number: 0,
                start: 1,
                end: 4,
-                value: "abc".into(),
+                value: "abc",
            }],
        };
        let result = process_captures(&captures);
@ -95,20 +96,20 @@ mod tests {
    #[test]
    fn skips_whole_match_when_multiple() {
        let captures = SerializableCaptures {
-            captures: vec![
+            captures: smallvec![
                SerializableCapture {
                    name: None,
                    match_number: 0,
                    start: 0,
                    end: 5,
-                    value: "abcde".into(),
+                    value: "abcde",
                },
                SerializableCapture {
                    name: Some("foo".to_string()),
                    match_number: -1,
                    start: 1,
                    end: 4,
-                    value: "bcd".into(),
+                    value: "bcd",
                },
            ],
        };
@ -119,28 +120,22 @@ mod tests {
    #[test]
    fn includes_unnamed_groups_but_skips_whole_match() {
        let captures = SerializableCaptures {
-            captures: vec![
+            captures: smallvec![
                SerializableCapture {
                    name: None,
                    match_number: 0,
                    start: 0,
                    end: 6,
-                    value: "aabbcc".into(),
+                    value: "aabbcc",
                },
                SerializableCapture {
                    name: Some("foo".to_string()),
                    match_number: -1,
                    start: 0,
                    end: 2,
-                    value: "aa".into(),
-                },
-                SerializableCapture {
-                    name: None,
-                    match_number: 1,
-                    start: 4,
-                    end: 6,
-                    value: "cc".into(),
+                    value: "aa",
                },
+                SerializableCapture { name: None, match_number: 1, start: 4, end: 6, value: "cc" },
            ],
        };
        let result = process_captures(&captures);
--- a/tests/fingerprint_dedup.rs
+++ b/tests/fingerprint_dedup.rs
@ -14,11 +14,27 @@ use kingfisher::{
    matcher::{Match, SerializableCapture, SerializableCaptures},
    origin::{Origin, OriginSet},
    reporter::{styles::Styles, DetailsReporter, ReportMatch},
-    rules::rule::Confidence,
+    rules::rule::{Confidence, Rule, RuleSyntax},
+    util::intern,
 };
+use smallvec::smallvec;
 // ---- helpers -------------------------------------------------------------------------------

 fn make_match(fp: u64) -> Match {
+    let syntax = RuleSyntax {
+        name: "Example Rule".to_string(),
+        id: "RULE.1".to_string(),
+        pattern: "dummy".to_string(),
+        min_entropy: 0.0,
+        confidence: Confidence::Medium,
+        visible: true,
+        examples: vec![],
+        negative_examples: vec![],
+        references: vec![],
+        validation: None,
+        depends_on_rule: vec![],
+    };
+    let rule = Arc::new(Rule::new(syntax));
    Match {
        location: Location {
            offset_span: OffsetSpan { start: 0, end: 10 },
@ -28,20 +44,17 @@ fn make_match(fp: u64) -> Match {
            },
        },
        groups: SerializableCaptures {
-            captures: vec![SerializableCapture {
+            captures: smallvec![SerializableCapture {
                name: None,
                match_number: 0,
                start: 0,
                end: 10,
-                value: "dummy".into(),
+                value: intern("dummy"),
            }],
        },
        blob_id: BlobId::new(b"dummy"),
        finding_fingerprint: fp,
-        rule_finding_fingerprint: "structural.1".into(),
-        rule_text_id: "RULE.1".into(),
-        rule_name: "Example Rule".into(),
-        rule_confidence: Confidence::Medium,
+        rule,
        validation_response_body: String::new(),
        validation_response_status: 0,
        validation_success: false,
@ -106,7 +119,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
                id: BlobId::new(b"dummy"),
                num_bytes: 10,
                mime_essence: None,
-                charset: None,
                language: None,
            },
            m: m1,
@ -123,7 +135,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
                id: BlobId::new(b"dummy"),
                num_bytes: 10,
                mime_essence: None,
-                charset: None,
                language: None,
            },
            m: m2,