diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6e57853..fb350a4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,15 @@
 
 All notable changes to this project will be documented in this file.
 
+## [1.48.0]
+- Improved error message when self-update cannot find the current binary
+- Optimized memory usage via string interning and extensive data sharing
+- Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance
+- Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets
+- Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB
+- Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files
+- Increased max-file-size default to 64 MB (up from 25 MB)
+
 ## [1.47.0]
 - MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them
 - Improved rules: github oauth2, diffbot, mailchimp, aws
diff --git a/Cargo.toml b/Cargo.toml
index 4e774bf..797ef62 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,7 +10,7 @@ publish = false
 
 [package]
 name = "kingfisher"
-version = "1.47.0"
+version = "1.48.0"
 description = "MongoDB's blazingly fast secret scanning and validation tool"
 edition.workspace = true
 rust-version.workspace = true
@@ -65,7 +65,7 @@ smallvec = { version = "1", features = [
 tracing = "0.1.41"
 indicatif = { version = "0.17", features = ["improved_unicode"] }
 rayon = "1.10"
-sha1 = "0.10.6"
+sha1 = { version = "0.10.6", features = ["asm"] }
 hex = "0.4.3"
 vectorscan-rs = "0.0.5"
 regex = "1.11.1"
@@ -151,6 +151,7 @@ tar = "0.4.44"
 xz2 = "0.1.7"
 asar = "0.3.0"
 blake3 = "1.8.2"
+memchr = "2.7"
 memmap2 = "0.9.7"
 futures = "0.3.31"
 dashmap = "6.1.0"
diff --git a/README.md b/README.md
index cccbf34..1da58dd 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,64 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
   <img src="docs/runtime-comparison.png" alt="Kingfisher Runtime Comparison" style="vertical-align: center;" />
 </p>
 
+- [Kingfisher](#kingfisher)
+  - [Key Features](#key-features)
+- [Benchmark Results](#benchmark-results)
+- [Getting Started](#getting-started)
+  - [Installation](#installation)
+    - [Run Kingfisher in Docker](#run-kingfisher-in-docker)
+- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance)
+  - [Write Custom Rules!](#write-custom-rules)
+- [Usage](#usage)
+  - [Basic Examples](#basic-examples)
+    - [Scan with secret validation](#scan-with-secret-validation)
+    - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories)
+    - [Scan a Git repository without validation](#scan-a-git-repository-without-validation)
+    - [Display only secrets confirmed active by third‑party APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis)
+    - [Output JSON and capture to a file](#output-json-and-capture-to-a-file)
+    - [Output SARIF directly to disk](#output-sarif-directly-to-disk)
+    - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--)
+    - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size)
+    - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag)
+    - [Display rule performance statistics](#display-rule-performance-statistics)
+    - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files)
+    - [Exclude specific paths](#exclude-specific-paths)
+  - [Scan an S3 bucket](#scan-an-s3-bucket)
+  - [Scanning Docker Images](#scanning-docker-images)
+  - [Scanning GitHub](#scanning-github)
+    - [Scan GitHub organisation (requires `KF_GITHUB_TOKEN`)](#scan-github-organisation-requires-kf_github_token)
+    - [Scan remote GitHub repository](#scan-remote-github-repository)
+  - [Scanning GitLab](#scanning-gitlab)
+    - [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token)
+    - [Scan GitLab user](#scan-gitlab-user)
+    - [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url)
+    - [List GitLab repositories](#list-gitlab-repositories)
+  - [Scanning Jira](#scanning-jira)
+    - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query)
+    - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues)
+  - [Scanning Confluence](#scanning-confluence)
+    - [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query)
+  - [Scanning Slack](#scanning-slack)
+    - [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query)
+  - [Environment Variables for Tokens](#environment-variables-for-tokens)
+  - [Exit Codes](#exit-codes)
+  - [Update Checks](#update-checks)
+- [Advanced Options](#advanced-options)
+  - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets)
+  - [List Builtin Rules](#list-builtin-rules)
+  - [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run)
+  - [To add your rules alongside the built‑ins:](#to-add-your-rules-alongside-the-builtins)
+  - [Other Examples](#other-examples)
+  - [Notable Scan Options](#notable-scan-options)
+  - [Understanding `--confidence`](#understanding---confidence)
+    - [Ignore known false positives](#ignore-known-false-positives)
+  - [Finding Fingerprint](#finding-fingerprint)
+  - [Rule Performance Profiling](#rule-performance-profiling)
+  - [CLI Options](#cli-options)
+  - [Origins and Divergence](#origins-and-divergence)
+- [Roadmap](#roadmap)
+- [License](#license)
+
 # Getting Started
 ## Installation
 
@@ -225,9 +283,18 @@ cat /path/to/file.py | kingfisher scan -
 
 ```
 
+### Limit maximum file size scanned (`--max-file-size`)
+
+By default, Kingfisher skips files larger than **64 MB**. You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**.
+
+```bash
+# Scan files up to 250 MB in size
+kingfisher scan /some/file --max-file-size 250
+```
+
 ### Scan using a rule _family_ with one flag
 
-_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws._`)\*
+_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_
 
 ```bash
 # Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2)
@@ -617,6 +684,14 @@ kingfisher github repos list --organization my-org
 - `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable)
 - `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable)
 
+## Understanding `--confidence`
+
+The `--confidence` flag sets a minimum confidence threshold, not an exact match.
+
+- If you pass `--confidence medium`, findings with **medium and higher** confidence (medium + high) will be included.
+- If you pass `--confidence low`, you’ll see **all levels** (low, medium, high).
+
+
 ### Ignore known false positives
 
 Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. 
diff --git a/data/rules/jwt.yml b/data/rules/jwt.yml
index e596027..b237b4c 100644
--- a/data/rules/jwt.yml
+++ b/data/rules/jwt.yml
@@ -5,7 +5,7 @@ rules:
       (?x)
       \b
       (
-        ey[A-Za-z0-9_-]{12,}  (?# header )
+        (?:ey|ewogIC)[A-Za-z0-9_-]{12,}  (?# header )
         \.
         ey[A-Za-z0-9_-]{12,}  (?# payload )
         \.
diff --git a/src/blob.rs b/src/blob.rs
index a54a4a8..9b1aff1 100644
--- a/src/blob.rs
+++ b/src/blob.rs
@@ -10,16 +10,20 @@ use anyhow::Result;
 use bstr::{BString, ByteSlice};
 use gix::ObjectId;
 use hex;
+use once_cell::sync::OnceCell;
 use parking_lot::Mutex;
 use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};
 use sha1::{Digest, Sha1};
 use smallvec::SmallVec;
+use std::sync::atomic::{AtomicU64, Ordering};
 
 use crate::git_commit_metadata::CommitMetadata;
 // const LARGE_FILE_THRESHOLD: u64 = 512 * 1024; // 512 KB
 const LARGE_FILE_THRESHOLD: u64 = 0; // always mmap
 
+static NEXT_ID: AtomicU64 = AtomicU64::new(1);
+
 /// The data of a blob, either owned (small files) or memory mapped (large files).
 pub enum BlobData<'a> {
     /// Small blobs – remains as-is.
@@ -75,47 +79,68 @@ pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
 /// A Git blob, storing its SHA-1 id and its contents.
 
 pub struct Blob<'a> {
-    pub id: BlobId,
-    pub data: BlobData<'a>,
+    id: OnceCell<BlobId>,
+    data: BlobData<'a>,
+    temp_id: u64,
 }
 
 impl Blob<'_> {
     #[inline]
-
     pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
         let mut file = File::open(&path)?;
         let file_size = file.metadata()?.len();
+        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
 
         if file_size > LARGE_FILE_THRESHOLD {
             // Large files: one mmap, zero extra copies.
             let mmap = unsafe { memmap2::Mmap::map(&file)? };
-            let id = BlobId::new(mmap.as_ref());
-            Ok(Blob { id, data: BlobData::Mapped(mmap) })
+            Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id })
         } else {
             // Small files: reuse the same handle and pre-allocate exact capacity
             let mut bytes = Vec::with_capacity(file_size as usize);
             file.read_to_end(&mut bytes)?;
-            let id = BlobId::new(&bytes);
-            Ok(Blob { id, data: BlobData::Owned(bytes) })
+            Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id })
         }
     }
+
     /// Returns the blob's bytes as a slice.
     #[inline]
     pub fn bytes(&self) -> &[u8] {
         self.data.as_ref()
     }
 
+    /// Lazily compute and return the blob's SHA-1 `BlobId`.
+    #[inline]
+    pub fn id(&self) -> BlobId {
+        *self.id.get_or_init(|| BlobId::new(self.bytes()))
+    }
+
+    /// Get a reference to the blob's SHA-1 `BlobId`, computing it if necessary.
+    #[inline]
+    pub fn id_ref(&self) -> &BlobId {
+        self.id.get_or_init(|| BlobId::new(self.bytes()))
+    }
+
+    /// Return the temporary identifier assigned on blob creation.
+    #[inline]
+    pub fn temp_id(&self) -> u64 {
+        self.temp_id
+    }
+
     /// Create a new `Blob` from a vector of bytes.
     #[inline]
     pub fn from_bytes(bytes: Vec<u8>) -> Self {
-        let id = BlobId::compute_from_bytes(&bytes);
-        Blob { id, data: BlobData::Owned(bytes) }
+        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+        Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }
     }
 
     /// Create a new `Blob` with the given id and data.
     #[inline]
     pub fn new(id: BlobId, bytes: Vec<u8>) -> Self {
-        Blob { id, data: BlobData::Owned(bytes) }
+        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+        let cell = OnceCell::new();
+        let _ = cell.set(id);
+        Blob { id: cell, data: BlobData::Owned(bytes), temp_id }
     }
 
     #[inline]
@@ -269,9 +294,15 @@ impl BlobId {
     /// Create a new BlobId computed from the given input.
+    ///
+    /// The digest is SHA-1 over the header `blob <len>\0` followed by every byte of
+    /// `input`. The whole input is hashed on purpose: ids key the seen-blob
+    /// deduplication, so two different inputs of equal length must never share an
+    /// id just because their first and last bytes agree.
     #[inline]
     pub fn new(input: &[u8]) -> Self {
         let mut hasher = Sha1::new();
         write!(&mut hasher, "blob {}\0", input.len()).unwrap();
+        // Feed the full content to the digest, not a sample of it.
         hasher.update(input);
         BlobId(hasher.finalize().as_slice().try_into().expect("SHA-1 output size mismatch"))
     }
 
@@ -360,9 +391,6 @@ pub struct BlobMetadata {
     /// The guessed multimedia type of the blob
     pub mime_essence: Option<String>,
 
-    /// The guessed charset of the blob
-    pub charset: Option<String>,
-
     /// The guessed programming language of the blob
     pub language: Option<String>,
 }
@@ -384,9 +412,4 @@ impl BlobMetadata {
     pub fn mime_essence(&self) -> Option<&str> {
         self.mime_essence.as_deref()
     }
-
-    #[inline]
-    pub fn charset(&self) -> Option<&str> {
-        self.charset.as_deref()
-    }
 }
diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs
index f0b8fa8..4f2dc5b 100644
--- a/src/cli/commands/inputs.rs
+++ b/src/cli/commands/inputs.rs
@@ -169,7 +169,7 @@ pub struct InputSpecifierArgs {
 #[derive(Args, Debug, Clone)]
 pub struct ContentFilteringArgs {
     /// Ignore files larger than the given size in MB
-    #[arg(long("max-file-size"), default_value_t = 25.0)]
+    #[arg(long("max-file-size"), default_value_t = 64.0)]
     pub max_file_size_mb: f64,
 
     // /// Use custom path-based ignore rules from the given file(s)
diff --git a/src/content_type.rs b/src/content_type.rs
index 197258d..ee44da6 100644
--- a/src/content_type.rs
+++ b/src/content_type.rs
@@ -96,9 +96,8 @@ impl ContentInspector {
     #[inline]
     #[must_use]
     pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
-        String::from_utf8(bytes.to_vec()).ok().map(|_| "UTF-8".to_string())
+        std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
     }
-
     /// Guess programming language with broad coverage using `tokei`.
     ///
     /// Strategy (no disk I/O):
diff --git a/src/decompress.rs b/src/decompress.rs
index 49dc7ef..46ae5d5 100644
--- a/src/decompress.rs
+++ b/src/decompress.rs
@@ -161,11 +161,8 @@ fn handle_zip_archive_streaming(
     Ok(CompressedContent::ArchiveFiles(entries_on_disk))
 }
 
-fn handle_asar_archive_in_memory(
-    buffer: Vec<u8>,
-    archive_path: &Path,
-) -> Result<CompressedContent> {
-    match AsarReader::new(&buffer, None) {
+fn handle_asar_archive_in_memory(buffer: &[u8], archive_path: &Path) -> Result<CompressedContent> {
+    match AsarReader::new(buffer, None) {
         Ok(reader) => {
             let mut contents = Vec::new();
             for (path_in_asar, file) in reader.files() {
@@ -200,7 +197,7 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result<CompressedCon
         match ext {
             "asar" => {
                 let mmap = unsafe { Mmap::map(&file)? };
-                return handle_asar_archive_in_memory(mmap.to_vec(), path);
+                return handle_asar_archive_in_memory(&mmap, path);
             }
             "tar" => {
                 if let Some(base) = base_dir {
diff --git a/src/findings_store.rs b/src/findings_store.rs
index fc610f2..23d8c7b 100644
--- a/src/findings_store.rs
+++ b/src/findings_store.rs
@@ -18,6 +18,7 @@ use crate::{
     matcher::Match,
     origin::{Origin, OriginSet},
     rules::rule::Rule,
+    util::intern,
 };
 
 // share with Arc so every blob/origin is materialised once
@@ -151,7 +152,7 @@ impl FindingsStore {
                     .captures
                     .get(1)
                     .or_else(|| m.groups.captures.get(0))
-                    .map_or("", |c| c.value.as_ref());
+                    .map_or("", |c| c.value);
 
                 let origin_kind = match origin.first() {
                     Origin::GitRepo(_) => "git",
@@ -160,7 +161,7 @@ impl FindingsStore {
                 };
 
                 let key = xxh3_64(
-                    format!("{}|{}|{}", m.rule_text_id.to_uppercase(), origin_kind, snippet)
+                    format!("{}|{}|{}", m.rule.id().to_uppercase(), origin_kind, snippet)
                         .as_bytes(),
                 );
 
@@ -280,7 +281,7 @@ impl FindingsStore {
     pub fn get_summary(&self) -> FxHashMap<&'static str, usize> {
         self.matches.iter().fold(FxHashMap::default(), |mut acc, msg| {
             let (_, _, m) = &**msg;
-            *acc.entry(m.rule_name).or_insert(0) += 1; // borrow, no alloc
+            *acc.entry(intern(m.rule.name())).or_insert(0) += 1;
             acc
         })
     }
@@ -342,13 +343,13 @@ impl FindingsStore {
         self.matches.iter().map(|msg| {
             let (_, _, match_item) = &**msg;
             finding_data::FindingMetadata {
-                rule_name: match_item.rule_name.to_string(),
+                rule_name: match_item.rule.name().to_string(),
                 num_matches: 1,
                 comment: None,
                 visible: match_item.visible,
                 finding_id: match_item.finding_id(),
-                rule_finding_fingerprint: match_item.rule_finding_fingerprint.to_string(),
-                rule_text_id: match_item.rule_text_id.to_string(),
+                rule_finding_fingerprint: match_item.rule.finding_sha1_fingerprint().to_string(),
+                rule_text_id: match_item.rule.id().to_string(),
             }
         })
     }
@@ -362,7 +363,7 @@ impl FindingsStore {
             .iter()
             .filter(|msg| {
                 let (_, _, match_item) = &***msg;
-                match_item.rule_name == metadata.rule_name
+                match_item.rule.name() == metadata.rule_name
             })
             .map(|msg| {
                 let (origin, blob_metadata, match_item) = &**msg;
@@ -373,7 +374,7 @@ impl FindingsStore {
                     match_id: MatchIdInt::from_str(&match_item.finding_id())?,
                     match_comment: None,
                     visible: match_item.visible,
-                    match_confidence: match_item.rule_confidence,
+                    match_confidence: match_item.rule.confidence(),
                     validation_response_body: match_item.validation_response_body.clone(),
                     validation_response_status: match_item.validation_response_status,
                     validation_success: match_item.validation_success,
diff --git a/src/location.rs b/src/location.rs
index 4af7021..a50602a 100644
--- a/src/location.rs
+++ b/src/location.rs
@@ -1,4 +1,5 @@
 use core::ops::Range;
+use std::cell::RefCell;
 
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
@@ -87,25 +88,56 @@ impl std::fmt::Display for SourceSpan {
 }
 
 /// Records newline byte‑offsets to map offsets -- (line, column).
-pub struct LocationMapping {
-    newline_offsets: Vec<usize>,
+///
+/// The newline index is built lazily: `new` scans nothing, and each lookup only
+/// searches the part of the input that no earlier lookup has covered yet.
+pub struct LocationMapping<'a> {
+    /// Input whose byte offsets are being mapped.
+    bytes: &'a [u8],
+    /// Ascending offsets of every `\n` found in `bytes[..scanned]`.
+    newline_offsets: RefCell<Vec<usize>>,
+    /// Length of the prefix of `bytes` that has already been searched.
+    scanned: std::cell::Cell<usize>,
 }
 
-impl LocationMapping {
-    /// Scan once for all `\n` positions.
-    pub fn new(input: &[u8]) -> Self {
-        let newline_offsets =
-            input.iter().enumerate().filter_map(|(i, &b)| (b == b'\n').then_some(i)).collect();
-        LocationMapping { newline_offsets }
+impl<'a> LocationMapping<'a> {
+    /// Create a new mapping without pre-scanning the entire input.
+    pub fn new(input: &'a [u8]) -> Self {
+        LocationMapping {
+            bytes: input,
+            newline_offsets: RefCell::new(Vec::new()),
+            scanned: std::cell::Cell::new(0),
+        }
     }
 
-    /// Map a byte offset to a `SourcePoint`.
-    pub fn get_source_point(&self, offset: usize) -> SourcePoint {
-        let line = match self.newline_offsets.binary_search(&offset) {
-            Ok(idx) => idx + 2, // exact newline -- next line
+    /// Extend the newline index so that it covers `bytes[..=offset]`.
+    ///
+    /// The byte at `offset` itself is searched too. A lookup that lands exactly on
+    /// a `\n` must find that newline in the index; otherwise its result would
+    /// depend on whether a later offset had been resolved first.
+    fn ensure_offsets_up_to(&self, offset: usize) {
+        let start = self.scanned.get();
+        let end = offset.saturating_add(1).min(self.bytes.len());
+        if end <= start {
+            return;
+        }
+        // Resume where the previous search stopped rather than after the last
+        // newline, so a long newline-free stretch is never searched twice.
+        let found = memchr::memchr_iter(b'\n', &self.bytes[start..end]).map(|nl| start + nl);
+        self.newline_offsets.borrow_mut().extend(found);
+        self.scanned.set(end);
+    }
+
+    /// Resolve `offset` against an ascending newline index.
+    ///
+    /// Lines are 1-based and columns 0-based; an offset that is itself a newline
+    /// resolves to column 0 of the following line.
+    fn source_point_from_offsets(offsets: &[usize], offset: usize) -> SourcePoint {
+        let line = match offsets.binary_search(&offset) {
+            Ok(idx) => idx + 2,
             Err(idx) => idx + 1,
         };
-        let column = if let Some(&last) = self.newline_offsets.get(line.saturating_sub(2)) {
+        let column = if let Some(&last) = offsets.get(line.saturating_sub(2)) {
             offset.saturating_sub(last + 1)
         } else {
             offset
@@ -113,10 +124,19 @@ impl LocationMapping {
         SourcePoint { line, column }
     }
 
+    /// Map a byte offset to a `SourcePoint`.
+    pub fn get_source_point(&self, offset: usize) -> SourcePoint {
+        self.ensure_offsets_up_to(offset);
+        let offsets = self.newline_offsets.borrow();
+        Self::source_point_from_offsets(&offsets, offset)
+    }
+
     /// Map an `OffsetSpan` -- `SourceSpan` (closed interval).
     pub fn get_source_span(&self, span: &OffsetSpan) -> SourceSpan {
-        let start = self.get_source_point(span.start);
-        let end = self.get_source_point(span.end.saturating_sub(1));
+        self.ensure_offsets_up_to(span.end.saturating_sub(1));
+        let offsets = self.newline_offsets.borrow();
+        let start = Self::source_point_from_offsets(&offsets, span.start);
+        let end = Self::source_point_from_offsets(&offsets, span.end.saturating_sub(1));
         SourceSpan { start, end }
     }
 }
diff --git a/src/matcher.rs b/src/matcher.rs
index 1cbb16f..d30b0d8 100644
--- a/src/matcher.rs
+++ b/src/matcher.rs
@@ -10,7 +10,6 @@ use anyhow::Result;
 use base64::{engine::general_purpose, Engine};
 use bstr::BString;
 use http::StatusCode;
-use lazy_static::lazy_static;
 use regex::bytes::Regex;
 use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
 use schemars::{
@@ -27,7 +26,7 @@ use xxhash_rust::xxh3::xxh3_64;
 use crate::{
     blob::{Blob, BlobId, BlobIdMap},
     entropy::calculate_shannon_entropy,
-    location::{Location, LocationMapping, OffsetSpan},
+    location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
     origin::OriginSet,
     parser,
     parser::{Checker, Language},
@@ -40,6 +39,10 @@ use crate::{
     util::{intern, redact_value},
 };
 
+const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
+const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
+const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
+
 // -------------------------------------------------------------------------------------------------
 // RawMatch
 // -------------------------------------------------------------------------------------------------
@@ -255,17 +258,28 @@ impl<'a> Matcher<'a> {
         self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64));
 
         self.user_data.input_len = input.len() as u64;
-        // self.vs_scanner.scan(input, |rid, from, to, _flags| {
-        self.scanner_pool.with(|scanner| {
-            scanner.scan(input, |rule_id, from, to, _flags| {
-                self.user_data.raw_matches_scratch.push(RawMatch {
-                    rule_id,
-                    start_idx: from,
-                    end_idx: to,
-                });
-                vectorscan_rs::Scan::Continue
-            })
-        })?;
+
+        let mut offset: usize = 0;
+        while offset < input.len() {
+            let end = (offset + MAX_CHUNK_SIZE).min(input.len());
+            let slice = &input[offset..end];
+            let base = offset as u64;
+            self.scanner_pool.with(|scanner| {
+                scanner.scan(slice, |rule_id, from, to, _flags| {
+                    self.user_data.raw_matches_scratch.push(RawMatch {
+                        rule_id,
+                        start_idx: from + base,
+                        end_idx: to + base,
+                    });
+                    vectorscan_rs::Scan::Continue
+                })
+            })?;
+
+            if end == input.len() {
+                break;
+            }
+            offset = end.saturating_sub(CHUNK_OVERLAP);
+        }
 
         Ok(())
     }
@@ -288,17 +302,6 @@ impl<'a> Matcher<'a> {
         self.local_stats.blobs_scanned += 1;
         self.local_stats.bytes_scanned += blob.bytes().len() as u64;
 
-        // Check if blob was already seen and respect no_dedup flag
-        if !no_dedup {
-            if let Some(had_matches) = self.seen_blobs.get(&blob.id) {
-                return Ok(if had_matches {
-                    ScanResult::SeenWithMatches
-                } else {
-                    ScanResult::SeenSansMatches
-                });
-            }
-        }
-
         // Extract filename from origin
         let filename = origin
             .first()
@@ -313,24 +316,19 @@ impl<'a> Matcher<'a> {
         // Opportunistically look for standalone Base64 blobs. If neither
         // the raw scan nor this check yields anything, we can return early
         // before doing any heavier work.
-        let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) };
+        let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
+            Vec::new()
+        } else {
+            get_base64_strings(blob.bytes())
+        };
 
         if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() {
-            // Only record in seen_blobs if deduplication is enabled
-            if !no_dedup {
-                return Ok(match self.seen_blobs.insert(blob.id, false) {
-                    None => ScanResult::New(Vec::new()),
-                    Some(true) => ScanResult::SeenWithMatches,
-                    Some(false) => ScanResult::SeenSansMatches,
-                });
-            } else {
-                return Ok(ScanResult::New(Vec::new()));
-            }
+            return Ok(ScanResult::New(Vec::new()));
         }
 
         let rules_db = self.rules_db;
         let mut seen_matches = FxHashSet::default();
-        let mut previous_matches = Vec::new();
+        let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
         let tree_sitter_result = if self.user_data.raw_matches_scratch.is_empty() {
             None
         } else {
@@ -363,27 +361,19 @@ impl<'a> Matcher<'a> {
                 })
                 .collect::<Vec<_>>()
         });
-        let mut previous_raw_matches: Vec<(usize, OffsetSpan)> = Vec::new();
+        let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
         for &RawMatch { rule_id, start_idx, end_idx } in
             self.user_data.raw_matches_scratch.iter().rev()
         {
             let rule_id_usize: usize = rule_id as usize;
-            // let rule = &rules_db.rules[rule_id_usize];
             let rule = Arc::clone(&rules_db.rules[rule_id_usize]);
             let re = &rules_db.anchored_regexes[rule_id_usize];
             let start_idx_usize = start_idx as usize;
             let end_idx_usize = end_idx as usize;
             let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
-            // Skip if fully contained in a previous match
-            if previous_raw_matches.iter().any(|(prev_id, prev_span): &(usize, OffsetSpan)| {
-                *prev_id == rule_id_usize
-                    && (prev_span.fully_contains(&current_span)
-                        || current_span.fully_contains(prev_span))
-            }) {
+            if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
                 continue;
             }
-            let matching_input_offset_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
-            previous_raw_matches.push((rule_id_usize, matching_input_offset_span));
             filter_match(
                 blob,
                 rule,
@@ -472,9 +462,15 @@ impl<'a> Matcher<'a> {
             }
         }
         // Finalize
-        // Only record in seen_blobs if deduplication is enabled
-        if !no_dedup {
-            self.seen_blobs.insert(blob.id, !matches.is_empty());
+        if !no_dedup && !matches.is_empty() {
+            let blob_id = blob.id();
+            if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
+                return Ok(if had_matches {
+                    ScanResult::SeenWithMatches
+                } else {
+                    ScanResult::SeenSansMatches
+                });
+            }
         }
 
         // --- opportunistic capacity cap ---------------------------------
@@ -501,6 +497,49 @@ fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -
     end.hash(&mut hasher);
     hasher.finish()
 }
+
+/// Insert `span` into `spans`, which is kept sorted by start offset.
+///
+/// Returns `false`, leaving `spans` untouched, when a stored span already fully
+/// contains `span`. Otherwise every stored span that `span` fully contains is
+/// dropped, `span` is inserted in order, and `true` is returned.
+///
+/// Dropping all swallowed spans (not just the first) keeps the list free of
+/// nested spans, which is what lets the containment test look only at the
+/// neighbours of the insertion point.
+#[inline]
+fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
+    let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
+    let covered_by_prev = idx > 0 && spans[idx - 1].fully_contains(&span);
+    let covered_by_next = idx < spans.len() && spans[idx].fully_contains(&span);
+    if covered_by_prev || covered_by_next {
+        return false;
+    }
+    // Decide on rejection before mutating anything; only then widen the removal
+    // window to the left neighbour if `span` swallows it as well.
+    if idx > 0 && span.fully_contains(&spans[idx - 1]) {
+        idx -= 1;
+    }
+    // With no nested spans stored, everything `span` swallows forms one
+    // contiguous run starting at `idx`; remove the whole run, not only its head.
+    let swallowed = spans[idx..].iter().take_while(|s| span.fully_contains(s)).count();
+    spans.drain(idx..idx + swallowed);
+    spans.insert(idx, span);
+    true
+}
+
+/// Record `span` as matched for `rule_id`.
+///
+/// Returns `true` when the span was recorded and `false` when it was rejected
+/// because a span already recorded for the same rule fully contains it.
+#[inline]
+fn record_match(
+    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
+    rule_id: usize,
+    span: OffsetSpan,
+) -> bool {
+    insert_span(map.entry(rule_id).or_default(), span)
+}
 fn filter_match<'b>(
     blob: &'b Blob,
     // rule: &'b Rule,
@@ -509,7 +538,7 @@ fn filter_match<'b>(
     start: usize,
     end: usize,
     matches: &mut Vec<BlobMatch<'b>>,
-    previous_matches: &mut Vec<(usize, OffsetSpan)>,
+    previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
     rule_id: usize,
     seen_matches: &mut FxHashSet<u64>,
     _origin: &OriginSet,
@@ -558,11 +587,7 @@ fn filter_match<'b>(
         if !seen_matches.insert(match_key) {
             continue;
         }
-        if previous_matches.iter().any(|(prev_rule_id, prev_loc)| {
-            *prev_rule_id == rule_id
-                && (prev_loc.fully_contains(&matching_input_offset_span)
-                    || matching_input_offset_span.fully_contains(prev_loc))
-        }) {
+        if !record_match(previous_matches, rule_id, matching_input_offset_span) {
             continue;
         }
         let only_matching_input =
@@ -571,7 +596,7 @@ fn filter_match<'b>(
             SerializableCaptures::from_captures(&captures, byte_slice.as_ref(), re, redact);
         matches.push(BlobMatch {
             rule: Arc::clone(&rule),
-            blob_id: &blob.id,
+            blob_id: blob.id_ref(),
             matching_input: only_matching_input,
             matching_input_offset_span,
             captures: groups,
@@ -581,7 +606,6 @@ fn filter_match<'b>(
             calculated_entropy,
             is_base64,
         });
-        previous_matches.push((rule_id, matching_input_offset_span));
     }
     if let Some(t) = timer.take() {
         let new_count = (matches.len() - initial_len) as u64;
@@ -682,20 +706,20 @@ impl JsonSchema for Groups {
 //     pub end: usize,    // End position of the match
 //     pub value: String, // The actual captured value
 // }
-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[derive(Debug, Clone, Serialize, JsonSchema)]
 pub struct SerializableCapture {
     pub name: Option<String>,
     pub match_number: i32,
     pub start: usize,
     pub end: usize,
-    // Instead of storing an owned String, store a borrowed or interned value.
-    // Here we use Cow to allow either borrowing or owning as needed.
-    pub value: std::borrow::Cow<'static, str>,
+    /// Interned value of the capture.
+    pub value: &'static str,
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[derive(Debug, Clone, Serialize, JsonSchema)]
 pub struct SerializableCaptures {
-    pub captures: Vec<SerializableCapture>, // All captures (named and unnamed)
+    #[schemars(with = "Vec<SerializableCapture>")]
+    pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
 }
 impl SerializableCaptures {
     pub fn from_captures(
@@ -704,7 +728,7 @@ impl SerializableCaptures {
         re: &Regex,
         redact: bool,
     ) -> Self {
-        let mut serialized_captures = Vec::new();
+        let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
         // Process named captures
         for name in re.capture_names().flatten() {
             if let Some(capture) = captures.name(name) {
@@ -718,7 +742,7 @@ impl SerializableCaptures {
                     match_number: -1,
                     start: capture.start(),
                     end: capture.end(),
-                    value: value.into(),
+                    value: intern(&value),
                 });
             }
         }
@@ -735,7 +759,7 @@ impl SerializableCaptures {
                     match_number: i32::try_from(i).unwrap_or(0),
                     start: capture.start(),
                     end: capture.end(),
-                    value: value.into(),
+                    value: intern(&value),
                 });
             }
         }
@@ -764,16 +788,9 @@ pub struct Match {
     pub finding_fingerprint: u64,
 
     /// The rule that produced this match
-    pub rule_finding_fingerprint: &'static str,
-
-    /// The text identifier of the rule that produced this match
-    pub rule_text_id: &'static str,
-
-    /// The name of the rule that produced this match
-    pub rule_name: &'static str,
-
-    /// The confidence property of the rule that produced this match
-    pub rule_confidence: crate::rules::rule::Confidence,
+    #[serde(skip_serializing)]
+    #[schemars(skip)]
+    pub rule: Arc<Rule>,
 
     /// Validation Body
     pub validation_response_body: String,
@@ -794,7 +811,7 @@ pub struct Match {
 impl Match {
     #[inline]
     pub fn convert_owned_blobmatch_to_match<'a>(
-        loc_mapping: &'a LocationMapping,
+        loc_mapping: Option<&'a LocationMapping<'a>>,
         owned_blob_match: &'a OwnedBlobMatch,
         origin_type: &'a str,
     ) -> Self {
@@ -812,9 +829,11 @@ impl Match {
         // The fingerprint will be based on the content of the secret.
         let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
 
-        let source_span = loc_mapping.get_source_span(&offset_span);
-        let rule_finding_fingerprint = owned_blob_match.rule.finding_sha1_fingerprint().to_owned();
-
+        let source_span =
+            loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
+                start: SourcePoint { line: 0, column: 0 },
+                end: SourcePoint { line: 0, column: 0 },
+            });
         let offset_start: u64 =
             owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
         let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
@@ -828,10 +847,7 @@ impl Match {
 
         // matching_snippet
         Match {
-            rule_finding_fingerprint: intern(&rule_finding_fingerprint),
-            rule_name: intern(owned_blob_match.rule.name()),
-            rule_confidence: owned_blob_match.rule.confidence(),
-            rule_text_id: intern(owned_blob_match.rule.id()),
+            rule: owned_blob_match.rule.clone(),
             visible: owned_blob_match.rule.visible().to_owned(),
             location: Location { offset_span, source_span: source_span.clone() },
             groups: owned_blob_match.captures.clone(),
@@ -852,7 +868,7 @@ impl Match {
 
     pub fn finding_id(&self) -> String {
         let mut h = Sha1::new();
-        write!(&mut h, "{}\0", self.rule_finding_fingerprint)
+        write!(&mut h, "{}\0", self.rule.finding_sha1_fingerprint())
             .expect("should be able to write to memory");
         serde_json::to_writer(&mut h, &self.groups)
             .expect("should be able to serialize groups as JSON");
@@ -892,33 +908,66 @@ pub struct DecodedData {
     pub pos_start: usize,
     pub pos_end: usize,
 }
+/// Returns `true` if `b` is a character of the standard Base64 alphabet.
+///
+/// The padding byte `=` is deliberately not part of this set.
+#[inline]
+fn is_base64_byte(b: u8) -> bool {
+    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/')
+}
+
+/// Finds Base64-encoded ASCII text embedded in `input`.
+///
+/// A candidate is a maximal run of Base64 alphabet bytes followed by at most
+/// two `=` padding bytes. Candidates that decode with the standard engine to
+/// ASCII text are returned with `pos_start..pos_end` set to the byte range of
+/// the encoded text (padding included) within `input`.
 pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
-    lazy_static! {
-        // Require a reasonably long run of valid Base64 characters to reduce
-        // noise. 32 bytes corresponds to 24 decoded bytes.
-        static ref RE_BASE64: Regex = Regex::new(r"[A-Za-z0-9+/]{32,}={0,2}").unwrap();
-    }
+    // Require a reasonably long run of Base64 alphabet characters to reduce
+    // noise. 32 characters correspond to 24 decoded bytes; padding is not
+    // counted toward this minimum.
+    const MIN_RUN_LEN: usize = 32;
+
     let mut results = Vec::new();
-    for m in RE_BASE64.find_iter(input) {
-        let base64_string = m.as_bytes();
-        // Skip candidates whose length isn't a multiple of four – they cannot
-        // be valid Base64.
-        if base64_string.len() % 4 != 0 {
-            continue;
+    let mut i = 0;
+    while i < input.len() {
+        // Skip bytes that cannot start a candidate.
+        while i < input.len() && !is_base64_byte(input[i]) {
+            i += 1;
         }
-        if let Ok(decoded) = general_purpose::STANDARD.decode(base64_string) {
-            if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
-                if decoded_str.is_ascii() {
-                    results.push(DecodedData {
-                        original: String::from_utf8_lossy(base64_string).into_owned(),
-                        decoded: decoded_str.to_string(),
-                        pos_start: m.start(),
-                        pos_end: m.end(),
-                    });
+        let start = i;
+        while i < input.len() && is_base64_byte(input[i]) {
+            i += 1;
+        }
+        let run_end = i;
+
+        // Consume at most two trailing `=` padding bytes.
+        let mut eq_count = 0;
+        while i < input.len() && input[i] == b'=' && eq_count < 2 {
+            i += 1;
+            eq_count += 1;
+        }
+        let end = i;
+
+        // Padding does not count toward the minimum run length; the total
+        // length must still be a multiple of four to be valid Base64.
+        if run_end - start >= MIN_RUN_LEN && (end - start) % 4 == 0 {
+            let base64_slice = &input[start..end];
+            if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) {
+                if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
+                    if decoded_str.is_ascii() {
+                        results.push(DecodedData {
+                            original: String::from_utf8_lossy(base64_slice).into_owned(),
+                            decoded: decoded_str.to_string(),
+                            pos_start: start,
+                            pos_end: end,
+                        });
+                    }
                 }
             }
         }
     }
+
     results
 }
 
diff --git a/src/reporter.rs b/src/reporter.rs
index c9c2a62..dacf75b 100644
--- a/src/reporter.rs
+++ b/src/reporter.rs
@@ -216,7 +216,7 @@ impl DetailsReporter {
                     m: match_item.clone(),
                     comment: None,
                     visible: match_item.visible,
-                    match_confidence: match_item.rule_confidence,
+                    match_confidence: match_item.rule.confidence(),
                     validation_response_body: match_item.validation_response_body.clone(),
                     validation_response_status: match_item.validation_response_status,
                     validation_success: match_item.validation_success,
@@ -366,13 +366,13 @@ impl DetailsReporter {
 
         FindingReporterRecord {
             rule: RuleMetadata {
-                name: rm.m.rule_name.to_string(),
-                id: rm.m.rule_text_id.to_string(),
+                name: rm.m.rule.name().to_string(),
+                id: rm.m.rule.id().to_string(),
             },
             finding: FindingRecordData {
                 snippet,
                 fingerprint: rm.m.finding_fingerprint.to_string(),
-                confidence: rm.match_confidence.to_string(),
+                confidence: rm.m.rule.confidence().to_string(),
                 entropy: format!("{:.2}", rm.m.calculated_entropy),
                 validation: ValidationInfo { status: validation_status, response: response_body },
                 language: rm
diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs
index 0b562d2..1ee7fb7 100644
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@@ -35,6 +35,7 @@ mod tests {
     use crate::cli::commands::github::GitHistoryMode;
     use crate::cli::commands::rules::RuleSpecifierArgs;
     use crate::matcher::{SerializableCapture, SerializableCaptures};
+    use crate::rules::rule::{Confidence, Rule, RuleSyntax};
     use crate::util::intern;
     use crate::{
         blob::BlobId,
@@ -49,6 +50,7 @@ mod tests {
         origin::Origin,
         reporter::styles::Styles,
     };
+    use smallvec::smallvec;
     use std::{
         io::Cursor,
         path::PathBuf,
@@ -132,12 +134,21 @@ mod tests {
         }
     }
 
-    fn create_mock_match(
-        rule_name: &str,
-        rule_text_id: &str,
-        rule_finding_fingerprint: &str,
-        validation_success: bool,
-    ) -> Match {
+    fn create_mock_match(rule_name: &str, rule_text_id: &str, validation_success: bool) -> Match {
+        let syntax = RuleSyntax {
+            name: rule_name.to_string(),
+            id: rule_text_id.to_string(),
+            pattern: "dummy".to_string(),
+            min_entropy: 0.0,
+            confidence: Confidence::Medium,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None,
+            depends_on_rule: vec![],
+        };
+        let rule = Arc::new(Rule::new(syntax));
         Match {
             location: Location {
                 offset_span: OffsetSpan { start: 10, end: 20 },
@@ -147,20 +158,17 @@ mod tests {
                 },
             },
             groups: SerializableCaptures {
-                captures: vec![SerializableCapture {
+                captures: smallvec![SerializableCapture {
                     name: Some("token".to_string()),
                     match_number: 1,
                     start: 10,
                     end: 20,
-                    value: "mock_token".into(),
+                    value: intern("mock_token"),
                 }],
             },
             blob_id: BlobId::new(b"mock_blob"),
             finding_fingerprint: 0123,
-            rule_finding_fingerprint: intern(rule_finding_fingerprint),
-            rule_text_id: intern(rule_text_id),
-            rule_name: intern(rule_name),
-            rule_confidence: Confidence::Medium,
+            rule,
             validation_response_body: "validation response".to_string(),
             validation_response_status: 200,
             validation_success,
@@ -177,7 +185,6 @@ mod tests {
                 id: BlobId::new(b"mock_blob"),
                 num_bytes: 1024,
                 mime_essence: Some("text/plain".to_string()),
-                charset: Some("UTF-8".to_string()),
                 language: Some("Rust".to_string()),
             };
             let dedup = true;
@@ -204,15 +211,13 @@ mod tests {
 
     #[test]
     fn test_json_format() -> Result<()> {
-        let mock_match =
-            create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
+        let mock_match = create_mock_match("MockRule", "mock_rule_1", true);
         let matches = vec![ReportMatch {
             origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]),
             blob_metadata: BlobMetadata {
                 id: BlobId::new(b"mock_blob"),
                 num_bytes: 1024,
                 mime_essence: Some("text/plain".to_string()),
-                charset: Some("UTF-8".to_string()),
                 language: Some("Rust".to_string()),
             },
             m: mock_match,
@@ -238,12 +243,7 @@ mod tests {
     fn test_validation_status_in_json() -> Result<()> {
         let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")];
         for (validation_success, expected_status) in test_cases {
-            let mock_match = create_mock_match(
-                "MockRule",
-                "mock_rule_1",
-                "mock_finding_fingerprint",
-                validation_success,
-            );
+            let mock_match = create_mock_match("MockRule", "mock_rule_1", validation_success);
             let matches = vec![ReportMatch {
                 origin: OriginSet::new(
                     Origin::from_file(PathBuf::from("/mock/path/file.rs")),
@@ -253,7 +253,6 @@ mod tests {
                     id: BlobId::new(b"mock_blob"),
                     num_bytes: 1024,
                     mime_essence: Some("text/plain".to_string()),
-                    charset: Some("UTF-8".to_string()),
                     language: Some("Rust".to_string()),
                 },
                 m: mock_match,
diff --git a/src/scanner/docker.rs b/src/scanner/docker.rs
index b427a3b..7ebb2d4 100644
--- a/src/scanner/docker.rs
+++ b/src/scanner/docker.rs
@@ -175,9 +175,11 @@ impl Docker {
 
         pb.set_length(layer_paths.len() as u64);
         for p in layer_paths {
-            let mut data = Vec::new();
-            File::open(&p)?.read_to_end(&mut data)?;
-            let digest = format!("{:x}", Sha256::digest(&data));
+            let mut file = File::open(&p)?;
+            let mut hasher = Sha256::new();
+            std::io::copy(&mut file, &mut hasher)?;
+            let digest = format!("{:x}", hasher.finalize());
+
             let new_path = out_dir.join(format!("layer_{digest}.tar"));
             std::fs::rename(&p, &new_path)?;
             // extract layer contents so inner filenames appear in scan results
diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs
index 14861ff..970ca5c 100644
--- a/src/scanner/enumerate.rs
+++ b/src/scanner/enumerate.rs
@@ -172,10 +172,15 @@ pub fn enumerate_filesystem_inputs(
                 };
                 if should_skip {
                     progress.suspend(|| {
+                        let path = origin
+                            .first()
+                            .blob_path()
+                            .map(|p| p.display().to_string())
+                            .unwrap_or_else(|| blob.temp_id().to_string());
                         if is_archive {
-                            debug!("Skipping archive: {}", blob.id);
+                            debug!("Skipping archive: {path}");
                         } else {
-                            debug!("Skipping binary blob: {}", blob.id);
+                            debug!("Skipping binary blob: {path}");
                         }
                     });
                     return Ok(());
diff --git a/src/scanner/processing.rs b/src/scanner/processing.rs
index 9447dbf..5132209 100644
--- a/src/scanner/processing.rs
+++ b/src/scanner/processing.rs
@@ -12,6 +12,8 @@ use crate::{
     Path,
 };
 
+const LOCATION_LIMIT_BYTES: usize = 256 * 1024 * 1024;
+
 /// A matcher along with parameters that remain constant during a single
 /// `scan` run
 pub struct BlobProcessor<'a> {
@@ -27,8 +29,7 @@ impl<'a> BlobProcessor<'a> {
         redact: bool,
         no_base64: bool,
     ) -> Result<Option<DatastoreMessage>> {
-        let blob_id = blob.id.hex();
-        let _span = debug_span!("matcher", blob_id).entered();
+        let _span = debug_span!("matcher", temp_id = blob.temp_id()).entered();
         let t1 = Instant::now();
         let res = self.matcher.scan_blob(&blob, &origin, None, redact, no_dedup, no_base64)?;
         let scan_us = t1.elapsed().as_micros();
@@ -43,10 +44,9 @@ impl<'a> BlobProcessor<'a> {
             ScanResult::SeenWithMatches => {
                 trace!("({scan_us}us) blob already scanned with matches");
                 let metadata = BlobMetadata {
-                    id: blob.id,
+                    id: blob.id(),
                     num_bytes: blob.len(),
                     mime_essence: None,
-                    charset: None,
                     language: None,
                 };
                 Ok(Some((origin, metadata, Vec::new())))
@@ -63,10 +63,9 @@ impl<'a> BlobProcessor<'a> {
                 }
                 let md = MetadataResult::from_blob_and_origin(&blob, &origin);
                 let metadata = BlobMetadata {
-                    id: blob.id,
+                    id: blob.id(),
                     num_bytes: blob.len(),
                     mime_essence: md.mime_essence,
-                    charset: md.charset,
                     language: md.language,
                 };
 
@@ -84,12 +83,17 @@ impl<'a> BlobProcessor<'a> {
                     }
                 }
 
-                let loc_mapping = LocationMapping::new(&blob.bytes());
+                let bytes = blob.bytes();
+                let loc_mapping = if bytes.len() <= LOCATION_LIMIT_BYTES {
+                    Some(LocationMapping::new(bytes))
+                } else {
+                    None
+                };
                 let converted_matches: Vec<(Option<f64>, Match)> = matches
                     .into_iter()
                     .map(|m| {
                         let converted_match = Match::convert_owned_blobmatch_to_match(
-                            &loc_mapping,
+                            loc_mapping.as_ref(),
                             &OwnedBlobMatch::from_blob_match(m),
                             origin_type,
                         );
@@ -106,7 +110,6 @@ impl<'a> BlobProcessor<'a> {
 struct MetadataResult {
     mime_essence: Option<String>,
     language: Option<String>,
-    charset: Option<String>,
 }
 impl MetadataResult {
     fn from_blob_and_origin(blob: &Blob, origin: &OriginSet) -> MetadataResult {
@@ -115,7 +118,6 @@ impl MetadataResult {
         let mime_essence = Some(tree_magic_mini::from_u8(bytes).to_string());
         let inspector = ContentInspector::default();
         let language = blob_path.and_then(|p| inspector.guess_language(p, bytes));
-        let charset = inspector.guess_charset(bytes);
-        MetadataResult { mime_essence, language, charset }
+        MetadataResult { mime_essence, language }
     }
 }
diff --git a/src/scanner/validation.rs b/src/scanner/validation.rs
index 05865b2..d69dbeb 100644
--- a/src/scanner/validation.rs
+++ b/src/scanner/validation.rs
@@ -13,7 +13,7 @@ use futures::{stream, StreamExt};
 use indicatif::{ProgressBar, ProgressStyle};
 use liquid::Parser;
 use reqwest::{Client, StatusCode};
-use rustc_hash::{FxHashMap, FxHashSet};
+use rustc_hash::FxHashMap;
 use tokio::{sync::Notify, time::timeout};
 
 use crate::{
@@ -21,7 +21,6 @@ use crate::{
     findings_store::{FindingsStore, FindingsStoreMessage},
     location::OffsetSpan,
     matcher::{Match, OwnedBlobMatch},
-    rules::rule,
     validation::{collect_variables_and_dependencies, validate_single_match, CachedResponse},
 };
 
@@ -40,7 +39,7 @@ pub async fn run_secret_validation(
     let fail_count = Arc::new(AtomicUsize::new(0));
 
     // ── 2. Fetch rules + matches ────────────────────────────────────────────
-    let (all_rules, all_matches_by_blob) = {
+    let (_all_rules, all_matches_by_blob) = {
         let ds = datastore.lock().unwrap();
         let rules = ds.get_rules()?;
         let mut map: FxHashMap<BlobId, Vec<Arc<FindingsStoreMessage>>> = FxHashMap::default();
@@ -51,16 +50,10 @@ pub async fn run_secret_validation(
     };
 
     // ── 3. Partition blobs ──────────────────────────────────────────────────
-    let rules_with_deps: FxHashSet<&str> = all_rules
-        .iter()
-        .filter(|r| !r.syntax().depends_on_rule.is_empty())
-        .map(|r| r.id())
-        .collect();
-
     let mut simple_matches = Vec::new();
     let mut dependent_blobs = FxHashMap::default(); // blob_id -- Vec<Arc<…>>
     for (blob_id, matches) in all_matches_by_blob {
-        if matches.iter().any(|m| rules_with_deps.contains(m.2.rule_text_id)) {
+        if matches.iter().any(|m| !m.2.rule.syntax().depends_on_rule.is_empty()) {
             dependent_blobs.insert(blob_id, matches);
         } else {
             simple_matches.extend(matches);
@@ -80,11 +73,8 @@ pub async fn run_secret_validation(
                 .captures
                 .get(1)
                 .or_else(|| arc_msg.2.groups.captures.get(0))
-                .map_or("", |c| c.value.as_ref());
-            groups
-                .entry(format!("{}|{}", arc_msg.2.rule_text_id, secret))
-                .or_default()
-                .push(arc_msg);
+                .map_or("", |c| c.value);
+            groups.entry(format!("{}|{}", arc_msg.2.rule.id(), secret)).or_default().push(arc_msg);
         }
 
         let validation_results = DashMap::<String, CachedResponse>::new();
@@ -109,7 +99,6 @@ pub async fn run_secret_validation(
             let client = client.clone();
             let cache_glob = cache.clone();
             let val_res = &validation_results;
-            let rules = &all_rules;
             let success = success_count.clone();
             let fail = fail_count.clone();
             // *** FIX: Clone the progress bar for each concurrent task ***
@@ -122,8 +111,8 @@ pub async fn run_secret_validation(
                     .captures
                     .get(1)
                     .or_else(|| rep_arc.2.groups.captures.get(0))
-                    .map_or("", |c| c.value.as_ref());
-                let key = format!("{}|{}", rep_arc.2.rule_text_id, secret);
+                    .map_or("", |c| c.value);
+                let key = format!("{}|{}", rep_arc.2.rule.id(), secret);
 
                 match val_res.entry(key.clone()) {
                     dashmap::mapref::entry::Entry::Occupied(_) => return,
@@ -138,8 +127,10 @@ pub async fn run_secret_validation(
                     }
                 }
 
-                let rule = find_rule_for_match(rules, rep_arc.2.rule_text_id).unwrap();
-                let mut om = OwnedBlobMatch::convert_match_to_owned_blobmatch(&rep_arc.2, rule);
+                let mut om = OwnedBlobMatch::convert_match_to_owned_blobmatch(
+                    &rep_arc.2,
+                    rep_arc.2.rule.clone(),
+                );
 
                 validate_single(
                     &mut om,
@@ -211,7 +202,6 @@ pub async fn run_secret_validation(
 
         let val_cache = Arc::new(DashMap::<String, CachedResponse>::new());
         let in_flight = Arc::new(DashMap::<String, ()>::new());
-        let rules_ref = Arc::new(all_rules.clone());
 
         for chunk in blob_ids.chunks(chunk_size) {
             let tasks: Vec<_> = chunk
@@ -225,15 +215,15 @@ pub async fn run_secret_validation(
                     let success = success_count.clone();
                     let fail = fail_count.clone();
                     let cache_glob = cache.clone();
-                    let rules = rules_ref.clone();
 
                     async move {
                         let owned = matches_for_blob
                             .iter()
                             .map(|arc_msg| {
-                                let rule = find_rule_for_match(&rules, arc_msg.2.rule_text_id)
-                                    .expect("rule");
-                                OwnedBlobMatch::convert_match_to_owned_blobmatch(&arc_msg.2, rule)
+                                OwnedBlobMatch::convert_match_to_owned_blobmatch(
+                                    &arc_msg.2,
+                                    arc_msg.2.rule.clone(),
+                                )
                             })
                             .collect::<Vec<_>>();
 
@@ -338,21 +328,6 @@ pub async fn run_secret_validation(
     Ok(())
 }
 
-/// Returns `Some(Arc<Rule>)` if a matching rule is found; otherwise returns `None`.
-/// Callers can decide how to handle the `None` case (e.g., skip processing).
-fn find_rule_for_match(
-    all_rules: &[Arc<rule::Rule>],
-    rule_text_id: &str,
-) -> Option<Arc<rule::Rule>> {
-    match all_rules.iter().find(|r| r.syntax().id == rule_text_id).cloned() {
-        Some(rule) => Some(rule),
-        None => {
-            eprintln!("Warning: no rule found with id '{}'. Skipping.", rule_text_id);
-            None
-        }
-    }
-}
-
 // ---------------------------------------------------
 // The core validation logic, used in an async pipeline
 // ---------------------------------------------------
diff --git a/src/update.rs b/src/update.rs
index 76629be..3ab5208 100644
--- a/src/update.rs
+++ b/src/update.rs
@@ -129,16 +129,29 @@ pub fn check_for_update(global_args: &GlobalArgs, base_url: Option<&str>) -> Opt
                     .apply_to(&format!("Updated to version {}", status.version()))
             ),
             Err(e) => match e {
-                UpdError::Io(ref io_err) if io_err.kind() == ErrorKind::PermissionDenied => {
-                    warn!(
-                        "{}",
-                        styles.style_finding_active_heading.apply_to(
-                            "Cannot replace the current binary - permission denied.\n\
-                             If you installed via a package manager, run its upgrade command.\n\
-                             Otherwise reinstall to a user-writable directory or re-run with sudo."
-                        )
-                    );
-                }
+                UpdError::Io(ref io_err) => match io_err.kind() {
+                    ErrorKind::PermissionDenied => {
+                        warn!(
+                            "{}",
+                            styles.style_finding_active_heading.apply_to(
+                                "Cannot replace the current binary - permission denied.\n\
+                                 If you installed via a package manager, run its upgrade command.\n\
+                                 Otherwise reinstall to a user-writable directory or re-run with sudo."
+                            )
+                        );
+                    }
+                    ErrorKind::NotFound => {
+                        warn!(
+                            "{}",
+                            styles.style_finding_active_heading.apply_to(
+                                "Cannot replace the current binary - file not found.\n\
+                                 If you installed via a package manager, run its upgrade command.\n\
+                                 Otherwise reinstall to a user-writable directory."
+                            )
+                        );
+                    }
+                    _ => error!("Failed to update: {e}"),
+                },
                 _ => error!("Failed to update: {e}"),
             },
         }
diff --git a/src/validation.rs b/src/validation.rs
index 3548d33..a645ce7 100644
--- a/src/validation.rs
+++ b/src/validation.rs
@@ -910,6 +910,7 @@ mod tests {
     use crossbeam_skiplist::SkipMap;
     use http::StatusCode;
     use rustc_hash::FxHashMap;
+    use smallvec::smallvec;
 
     use crate::{
         blob::BlobId,
@@ -920,6 +921,7 @@ mod tests {
             rule::{Confidence, Rule},
             Rules,
         },
+        util::intern,
         validation::{validate_single_match, Cache},
     };
     #[tokio::test]
@@ -1016,12 +1018,12 @@ rules:
             // matching_input: token.as_bytes().to_vec(),
             matching_input_offset_span: OffsetSpan { start: 0, end: token.len() },
             captures: SerializableCaptures {
-                captures: vec![SerializableCapture {
+                captures: smallvec![SerializableCapture {
                     name: Some("TOKEN".to_string()),
                     match_number: -1,
                     start: 0,
                     end: token.len(),
-                    value: token.into(),
+                    value: intern(token),
                 }],
             },
             validation_response_body: String::new(),
diff --git a/src/validation.rs.orig b/src/validation.rs.orig
deleted file mode 100644
index ca76943..0000000
--- a/src/validation.rs.orig
+++ /dev/null
@@ -1,1052 +0,0 @@
-use std::{
-    collections::BTreeMap,
-    fs,
-    hash::{Hash, Hasher},
-    sync::Arc,
-    time::{Duration, Instant},
-};
-
-use anyhow::Result;
-use crossbeam_skiplist::SkipMap;
-use dashmap::DashMap;
-use http::StatusCode;
-use liquid::Object;
-use liquid_core::{Value, ValueView};
-use once_cell::sync::OnceCell;
-use reqwest::{header, header::HeaderValue, multipart, Client, Url};
-use rustc_hash::FxHashMap;
-use tokio::{sync::Notify, time};
-use tracing::debug;
-
-use crate::{
-    location::OffsetSpan,
-    matcher::{OwnedBlobMatch, SerializableCaptures},
-    rules::rule::Validation,
-};
-
-mod aws;
-mod azure;
-mod coinbase;
-mod gcp;
-mod httpvalidation;
-mod jwt;
-mod mongodb;
-mod postgres;
-mod utils;
-
-const VALIDATION_CACHE_SECONDS: u64 = 1200; // 20 minutes
-const MAX_VALIDATION_BODY_LEN: usize = 2048;
-
-// Use SkipMap-based cache instead of a mutex-wrapped FxHashMap.
-type Cache = Arc<SkipMap<String, CachedResponse>>;
-
-/// Returns an opaque 64-bit fingerprint for “same secret under the same rule”.
-fn secret_fingerprint(m: &OwnedBlobMatch) -> u64 {
-    let mut hasher = xxhash_rust::xxh3::Xxh3::new();
-    m.rule.syntax().id.hash(&mut hasher);
-
-    // first capture = the secret text itself
-    if let Some(c0) = m.captures.captures.get(0) {
-        c0.value.hash(&mut hasher);
-    }
-    hasher.finish()
-}
-
-static VALIDATION_CACHE: OnceCell<DashMap<u64, CachedResponse>> = OnceCell::new();
-static IN_FLIGHT: OnceCell<DashMap<u64, Arc<Notify>>> = OnceCell::new();
-
-/// Call this once near program start (e.g. in `main()`)
-pub fn init_validation_caches() {
-    VALIDATION_CACHE.set(DashMap::new()).ok();
-    IN_FLIGHT.set(DashMap::new()).ok();
-}
-
-#[derive(Clone)]
-pub struct CachedResponse {
-    pub body: String,
-    pub status: StatusCode,
-    pub is_valid: bool,
-    pub timestamp: Instant,
-}
-
-impl CachedResponse {
-    pub fn new(body: String, status: StatusCode, is_valid: bool) -> Self {
-        Self { body, status, is_valid, timestamp: Instant::now() }
-    }
-
-    pub fn is_still_valid(&self, cache_duration: Duration) -> bool {
-        self.timestamp.elapsed() < cache_duration
-    }
-}
-
-/// Collect dependent variables and missing dependencies from the provided matches.
-pub fn collect_variables_and_dependencies(
-    matches: &[OwnedBlobMatch],
-) -> (FxHashMap<String, Vec<(String, OffsetSpan)>>, FxHashMap<String, Vec<String>>) {
-    let mut variable_map: FxHashMap<String, Vec<(String, OffsetSpan)>> = FxHashMap::default();
-    let mut missing_deps: FxHashMap<String, Vec<String>> = FxHashMap::default();
-
-    for m in matches {
-        let rule_id = m.rule.syntax().id.clone();
-        for dependency in m.rule.syntax().depends_on_rule.iter().flatten() {
-            let dependency_rule_id = &dependency.rule_id;
-            // Use iterator adapter to get all matching dependencies.
-            let matching_dependencies: Vec<_> =
-                matches.iter().filter(|x| x.rule.syntax().id == *dependency_rule_id).collect();
-
-            if !matching_dependencies.is_empty() {
-                for other_match in matching_dependencies {
-                    let matching_input = other_match
-                        .captures
-                        .captures
-                        .get(1)
-                        .or_else(|| other_match.captures.captures.get(0))
-                        .expect("Expected at least one capture");
-                    variable_map
-                        .entry(dependency.variable.to_uppercase())
-                        .or_insert_with(Vec::new)
-                        .push((
-                            matching_input.value.to_string(),
-                            other_match.matching_input_offset_span,
-                        ));
-                }
-            } else {
-                missing_deps.entry(rule_id.clone()).or_default().push(dependency.rule_id.clone());
-            }
-        }
-    }
-    (variable_map, missing_deps)
-}
-
-/// Render a template and parse the resulting string as a URL.
-async fn render_and_parse_url(
-    parser: &liquid::Parser,
-    globals: &liquid::Object,
-    rule_name: &str,
-    template_url: &str,
-) -> Result<Url, String> {
-    let rendered_url_str =
-        render_template(parser, globals, rule_name, template_url).await.map_err(|e| {
-            let error_msg = format!("Error rendering URL template: <{}> {}", rule_name, e);
-            debug!("{}", error_msg);
-            error_msg
-        })?;
-
-    let url = Url::parse(&rendered_url_str).map_err(|e| {
-        let error_msg = format!("Error parsing rendered URL: {}", e);
-        debug!("{}", error_msg);
-        error_msg
-    })?;
-
-    // Check if the URL is resolvable.
-    utils::check_url_resolvable(&url).await.map_err(|e| {
-        let error_msg = format!("URL resolution failed: {}", e);
-        error_msg
-    })?;
-
-    Ok(url)
-}
-
-/// Render a template string using Liquid.
-async fn render_template(
-    parser: &liquid::Parser,
-    globals: &liquid::Object,
-    rule_name: &str,
-    template_str: &str,
-) -> Result<String, String> {
-    parser
-        .parse(template_str)
-        .map_err(|e| {
-            let msg = format!("Error parsing template for rule <{}>: {}", rule_name, e);
-            debug!("{}", msg);
-            msg
-        })
-        .and_then(|template| {
-            template.render(globals).map_err(|e| {
-                let msg = format!("Error rendering template for rule <{}>: {}", rule_name, e);
-                debug!("{}", msg);
-                msg
-            })
-        })
-}
-
-/// Validate a single match with a timeout of 60 seconds.
-pub async fn validate_single_match(
-    m: &mut OwnedBlobMatch,
-    parser: &liquid::Parser,
-    client: &Client,
-    dependent_variables: &FxHashMap<String, Vec<(String, OffsetSpan)>>,
-    missing_dependencies: &FxHashMap<String, Vec<String>>,
-    cache: &Cache,
-) {
-    let timeout_result = time::timeout(Duration::from_secs(60), async {
-        timed_validate_single_match(
-            m,
-            parser,
-            client,
-            dependent_variables,
-            missing_dependencies,
-            cache,
-        )
-        .await
-    })
-    .await;
-
-    if timeout_result.is_err() {
-        m.validation_success = false;
-        m.validation_response_body = "Validation timed out after 60 seconds".to_string();
-        m.validation_response_status = StatusCode::REQUEST_TIMEOUT;
-    }
-}
-
-/// Perform the actual validation of a match.
-/// Guarantees that each <RULE-ID>|<secret> is validated only once per process,
-/// even when `--no-dedup` is used.
-async fn timed_validate_single_match<'a>(
-    m: &mut OwnedBlobMatch,
-    parser: &liquid::Parser,
-    client: &Client,
-    dependent_variables: &FxHashMap<String, Vec<(String, OffsetSpan)>>,
-    missing_dependencies: &FxHashMap<String, Vec<String>>,
-    cache: &Cache,
-) {
-    // ──────────────────────────────────────────────────────────
-    // 1. process-wide fingerprint de-dup
-    // ──────────────────────────────────────────────────────────
-    let fp = secret_fingerprint(m);
-
-    if let Some(entry) = VALIDATION_CACHE.get_or_init(DashMap::new).get(&fp) {
-        if entry.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
-            m.validation_success = entry.is_valid;
-            m.validation_response_body = entry.body.clone();
-            m.validation_response_status = entry.status;
-            return;
-        }
-    }
-    if let Some(wait) = IN_FLIGHT.get_or_init(DashMap::new).get(&fp) {
-        wait.notified().await;
-        if let Some(entry) = VALIDATION_CACHE.get().unwrap().get(&fp) {
-            m.validation_success = entry.is_valid;
-            m.validation_response_body = entry.body.clone();
-            m.validation_response_status = entry.status;
-        }
-        return;
-    }
-    let notify = Arc::new(Notify::new());
-    IN_FLIGHT.get().unwrap().insert(fp, notify.clone());
-
-    // helper to persist result + notify waiters
-    let commit_and_return = |m: &OwnedBlobMatch| {
-        VALIDATION_CACHE.get().unwrap().insert(
-            fp,
-            CachedResponse {
-                body: m.validation_response_body.clone(),
-                status: m.validation_response_status,
-                is_valid: m.validation_success,
-                timestamp: Instant::now(),
-            },
-        );
-        IN_FLIGHT.get().unwrap().remove(&fp);
-        notify.notify_waiters();
-    };
-    // ──────────────────────────────────────────────────────────
-
-    // 2. dependency check
-    if let Some(missing) = missing_dependencies.get(&m.rule.syntax().id) {
-        if !missing.is_empty() {
-            m.validation_success = false;
-            m.validation_response_body =
-                format!("Validation skipped - missing dependent rules: {}", missing.join(", "));
-            m.validation_response_status = StatusCode::PRECONDITION_REQUIRED;
-            commit_and_return(m);
-            return;
-        }
-    }
-
-    // 3. capture processing
-    let match_re_result = m.rule.syntax().as_anchored_regex();
-    let mut captured_values: Vec<(String, String, usize, usize)> = match match_re_result {
-        Ok(_) => utils::process_captures(&m.captures),
-        Err(e) => {
-            m.validation_success = false;
-            m.validation_response_body = format!("Regex error: {}", e);
-            m.validation_response_status = StatusCode::INTERNAL_SERVER_ERROR;
-            commit_and_return(m);
-            return;
-        }
-    };
-
-    for dep in m.rule.syntax().depends_on_rule.iter().flatten() {
-        if let Some(vals) = dependent_variables.get(&dep.variable.to_uppercase()) {
-            for (val, span) in vals {
-                captured_values.push((
-                    dep.variable.to_uppercase(),
-                    val.clone(),
-                    span.start,
-                    span.end,
-                ));
-            }
-        }
-    }
-
-    let mut globals = Object::new();
-    for (k, v, ..) in &captured_values {
-        globals.insert(k.to_uppercase().into(), Value::scalar(v.clone()));
-    }
-
-    let rule_syntax = m.rule.syntax();
-
-    // ──────────────────────────────────────────────────────────
-    // 4. validator switch
-    // ──────────────────────────────────────────────────────────
-    match &rule_syntax.validation {
-        // ---------------------------------------------------- HTTP validator
-        Some(Validation::Http(http_validation)) => {
-            // render URL
-            let url = match render_and_parse_url(
-                parser,
-                &globals,
-                &rule_syntax.name,
-                &http_validation.request.url,
-            )
-            .await
-            {
-                Ok(u) => u,
-                Err(e) => {
-                    m.validation_success = false;
-                    m.validation_response_body = e;
-                    m.validation_response_status = StatusCode::BAD_REQUEST;
-                    commit_and_return(m);
-                    return;
-                }
-            };
-
-            // build request builder
-            let request_builder = match httpvalidation::build_request_builder(
-                client,
-                &http_validation.request.method,
-                &url,
-                &http_validation.request.headers,
-                &http_validation.request.body,
-                parser,
-                &globals,
-            ) {
-                Ok(rb) => rb,
-                Err(e) => {
-                    m.validation_success = false;
-                    m.validation_response_body = e;
-                    m.validation_response_status = StatusCode::BAD_REQUEST;
-                    commit_and_return(m);
-                    return;
-                }
-            };
-
-            let is_multipart = http_validation.request.multipart.is_some();
-            let mut cache_key = String::new();
-
-            // old per-request cache (optional)
-            if !is_multipart {
-                let rendered_headers = httpvalidation::process_headers(
-                    &http_validation.request.headers,
-                    parser,
-                    &globals,
-                    &url,
-                )
-                .unwrap_or_default();
-
-                let mut header_map = BTreeMap::new();
-                for (name, value) in rendered_headers.iter() {
-                    if let Ok(v) = value.to_str() {
-                        header_map.insert(name.as_str().to_string(), v.to_string());
-                    }
-                }
-                cache_key = httpvalidation::generate_http_cache_key_parts(
-                    http_validation.request.method.as_str(),
-                    &url,
-                    &header_map,
-                );
-                if let Some(cached) = cache.get(&cache_key) {
-                    let c = cached.value();
-                    if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
-                        m.validation_success = c.is_valid;
-                        m.validation_response_body = c.body.clone();
-                        m.validation_response_status = c.status;
-                        commit_and_return(m);
-                        return;
-                    }
-                }
-            }
-
-            // helper to execute single non-multipart request with retry
-            let exec_single = |builder: reqwest::RequestBuilder| async {
-                httpvalidation::retry_request(
-                    builder,
-                    1,
-                    Duration::from_millis(500),
-                    Duration::from_secs(2),
-                )
-                .await
-            };
-
-            // run request (multipart vs non-multipart)
-            let resp_res = if is_multipart {
-                // build multipart request each retry
-                let build_request = || async {
-                    let method = httpvalidation::parse_http_method(&http_validation.request.method)
-                        .unwrap_or(reqwest::Method::GET);
-
-                    let mut fresh_builder =
-                        client.request(method, url.clone()).timeout(Duration::from_secs(5));
-
-                    if let Ok(mut headers) = httpvalidation::process_headers(
-                        &http_validation.request.headers,
-                        parser,
-                        &globals,
-                        &url,
-                    ) {
-                        // add realistic UA & accept headers
-                        let ua = format!(
-                            "{} {}/{}",
-                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
-                             AppleWebKit/537.36 (KHTML, like Gecko) \
-                             Chrome/132.0.0.0 Safari/537.36",
-                            env!("CARGO_PKG_NAME"),
-                            env!("CARGO_PKG_VERSION")
-                        );
-                        let std_headers = [
-                            (header::USER_AGENT, ua.as_str()),
-                            (header::ACCEPT , "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
-                            (header::ACCEPT_LANGUAGE, "en-US,en;q=0.5"),
-                            (header::ACCEPT_ENCODING, "gzip, deflate, br"),
-                            (header::CONNECTION, "keep-alive"),
-                        ];
-                        for (hn, hv) in &std_headers {
-                            if let Ok(v) = HeaderValue::from_str(hv) {
-                                headers.insert(hn.clone(), v);
-                            }
-                        }
-                        fresh_builder = fresh_builder.headers(headers);
-                    }
-
-                    // build multipart form
-                    let mut form = multipart::Form::new();
-                    for part in http_validation.request.multipart.as_ref().unwrap().parts.iter() {
-                        match part.part_type.as_str() {
-                            "file" => {
-                                let path = render_template(
-                                    parser,
-                                    &globals,
-                                    &rule_syntax.name,
-                                    &part.content,
-                                )
-                                .await
-                                .unwrap_or_default();
-                                let bytes = fs::read(path).unwrap_or_default();
-                                let p = multipart::Part::bytes(bytes)
-                                    .mime_str(
-                                        part.content_type
-                                            .as_deref()
-                                            .unwrap_or("application/octet-stream"),
-                                    )
-                                    .unwrap_or_else(|_| multipart::Part::text("invalid"));
-                                form = form.part(part.name.clone(), p);
-                            }
-                            "text" => {
-                                let txt = render_template(
-                                    parser,
-                                    &globals,
-                                    &rule_syntax.name,
-                                    &part.content,
-                                )
-                                .await
-                                .unwrap_or_default();
-                                let p = multipart::Part::text(txt)
-                                    .mime_str(part.content_type.as_deref().unwrap_or("text/plain"))
-                                    .unwrap_or_else(|_| multipart::Part::text("invalid"));
-                                form = form.part(part.name.clone(), p);
-                            }
-                            _ => { /* ignore */ }
-                        }
-                    }
-                    fresh_builder.multipart(form)
-                };
-
-                httpvalidation::retry_multipart_request(
-                    build_request,
-                    1,
-                    Duration::from_millis(500),
-                    Duration::from_secs(2),
-                )
-                .await
-            } else {
-                exec_single(request_builder).await
-            };
-
-            // handle result
-            match resp_res {
-                Ok(resp) => {
-                    let status = resp.status();
-                    let headers = resp.headers().clone();
-                    let mut body = match resp.text().await {
-                        Ok(b) => b,
-                        Err(e) => {
-                            m.validation_success = false;
-                            m.validation_response_body = format!("Error reading response: {}", e);
-                            m.validation_response_status = StatusCode::BAD_GATEWAY;
-                            commit_and_return(m);
-                            return;
-                        }
-                    };
-                    if body.len() > MAX_VALIDATION_BODY_LEN {
-                        body.truncate(MAX_VALIDATION_BODY_LEN);
-                    }
-
-                    m.validation_response_status = status;
-                    m.validation_response_body = body.clone();
-                    let matchers = http_validation
-                        .request
-                        .response_matcher
-                        .as_ref()
-                        .expect("missing response_matcher");
-
-                    m.validation_success = httpvalidation::validate_response(
-                        matchers,
-                        &body,
-                        &status,
-                        &headers,
-                        http_validation.request.response_is_html,
-                    );
-
-                    if !is_multipart && !cache_key.is_empty() {
-                        cache.insert(
-                            cache_key,
-                            CachedResponse {
-                                body,
-                                status,
-                                is_valid: m.validation_success,
-                                timestamp: Instant::now(),
-                            },
-                        );
-                    }
-                }
-                Err(e) => {
-                    m.validation_success = false;
-                    m.validation_response_body = format!("HTTP error: {:?}", e);
-                    m.validation_response_status = StatusCode::BAD_GATEWAY;
-                }
-            }
-        }
-
-        // ---------------------------------------------------- MongoDB validator
-        Some(Validation::MongoDB) => {
-            let uri = globals
-                .get("TOKEN")
-                .and_then(|v| v.as_scalar())
-                .map(|s| s.into_owned().to_kstr().to_string())
-                .unwrap_or_default();
-
-            if uri.is_empty() {
-                m.validation_success = false;
-                m.validation_response_body = "MongoDB URI not found.".to_string();
-                m.validation_response_status = StatusCode::BAD_REQUEST;
-                commit_and_return(m);
-                return;
-            }
-
-            let cache_key = mongodb::generate_mongodb_cache_key(&uri);
-            if let Some(cached) = cache.get(&cache_key) {
-                let c = cached.value();
-                if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
-                    m.validation_success = c.is_valid;
-                    m.validation_response_body = c.body.clone();
-                    m.validation_response_status = c.status;
-                    commit_and_return(m);
-                    return;
-                }
-            }
-
-            match mongodb::validate_mongodb(&uri).await {
-                Ok((ok, msg)) => {
-                    m.validation_success = ok;
-                    m.validation_response_body = msg;
-                    m.validation_response_status =
-                        if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
-                }
-                Err(e) => {
-                    m.validation_success = false;
-                    m.validation_response_body = format!("MongoDB validation error: {}", e);
-                    m.validation_response_status = StatusCode::BAD_GATEWAY;
-                }
-            }
-        }
-
-        // ------------------------------------------------ Azure Storage validator
-        Some(Validation::AzureStorage) => {
-            let storage_key = captured_values
-                .iter()
-                .find(|(n, ..)| n == "TOKEN")
-                .map(|(_, v, ..)| v.clone())
-                .unwrap_or_default();
-            let storage_account =
-                utils::find_closest_variable(&captured_values, &storage_key, "TOKEN", "AZURENAME")
-                    .unwrap_or_default();
-
-            if storage_account.is_empty() || storage_key.is_empty() {
-                m.validation_success = false;
-                m.validation_response_body = "Missing Azure Storage account or key.".to_string();
-                m.validation_response_status = StatusCode::BAD_REQUEST;
-                commit_and_return(m);
-                return;
-            }
-
-            let creds_json = format!(
-                r#"{{"storage_account":"{}","storage_key":"{}"}}"#,
-                storage_account, storage_key
-            );
-            let cache_key = azure::generate_azure_cache_key(&creds_json);
-
-            if let Some(cached) = cache.get(&cache_key) {
-                let c = cached.value();
-                if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
-                    m.validation_success = c.is_valid;
-                    m.validation_response_body = c.body.clone();
-                    m.validation_response_status = c.status;
-                    commit_and_return(m);
-                    return;
-                }
-            }
-
-            match azure::validate_azure_storage_credentials(&creds_json, cache).await {
-                Ok((ok, msg)) => {
-                    m.validation_success = ok;
-                    m.validation_response_body = msg;
-                    m.validation_response_status =
-                        if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
-                }
-                Err(e) => {
-                    m.validation_success = false;
-                    m.validation_response_body = format!("Azure Storage error: {}", e);
-                    m.validation_response_status = StatusCode::BAD_GATEWAY;
-                }
-            }
-            cache.insert(
-                cache_key,
-                CachedResponse {
-                    body: m.validation_response_body.clone(),
-                    status: m.validation_response_status,
-                    is_valid: m.validation_success,
-                    timestamp: Instant::now(),
-                },
-            );
-        }
-
-        // ------------------------------------------------ Postgres validator
-        Some(Validation::Postgres) => {
-            let pg_url = globals
-                .get("TOKEN")
-                .and_then(|v| v.as_scalar())
-                .map(|s| s.into_owned().to_kstr().to_string())
-                .unwrap_or_default();
-
-            if pg_url.is_empty() {
-                m.validation_success = false;
-                m.validation_response_body = "Postgres URL not found.".to_string();
-                m.validation_response_status = StatusCode::BAD_REQUEST;
-                commit_and_return(m);
-                return;
-            }
-
-            let cache_key = postgres::generate_postgres_cache_key(&pg_url);
-            if let Some(cached) = cache.get(&cache_key) {
-                let c = cached.value();
-                if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
-                    m.validation_success = c.is_valid;
-                    m.validation_response_body = c.body.clone();
-                    m.validation_response_status = c.status;
-                    commit_and_return(m);
-                    return;
-                }
-            }
-
-            match postgres::validate_postgres(&pg_url).await {
-                Ok((ok, meta)) => {
-                    m.validation_success = ok;
-                    m.validation_response_body = if ok {
-                        format!("Postgres connection is valid. Metadata: {:?}", meta)
-                    } else {
-                        "Postgres connection failed.".to_string()
-                    };
-                    m.validation_response_status =
-                        if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
-                }
-                Err(e) => {
-                    m.validation_success = false;
-                    m.validation_response_body = format!("Postgres error: {}", e);
-                    m.validation_response_status = StatusCode::BAD_GATEWAY;
-                }
-            }
-            cache.insert(
-                cache_key,
-                CachedResponse {
-                    body: m.validation_response_body.clone(),
-                    status: m.validation_response_status,
-                    is_valid: m.validation_success,
-                    timestamp: Instant::now(),
-                },
-            );
-        }
-        // ---------------------------------------------------- JWT validator
-        Some(Validation::JWT) => {
-            let token = captured_values
-                .iter()
-                .find(|(n, ..)| n == "TOKEN")
-                .map(|(_, v, ..)| v.clone())
-                .unwrap_or_default();
-
-            if token.is_empty() {
-                m.validation_success = false;
-                m.validation_response_body = "JWT token not found.".to_string();
-                m.validation_response_status = StatusCode::BAD_REQUEST;
-                commit_and_return(m);
-                return;
-            }
-
-            match jwt::validate_jwt(&token).await {
-                Ok((ok, msg)) => {
-                    m.validation_success = ok;
-                    m.validation_response_body = msg;
-                    m.validation_response_status =
-                        if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
-                }
-                Err(e) => {
-                    m.validation_success = false;
-                    m.validation_response_body = format!("JWT validation error: {}", e);
-                    m.validation_response_status = StatusCode::BAD_REQUEST;
-                }
-            }
-        }
-        // ---------------------------------------------------- AWS validator
-        Some(Validation::AWS) => {
-            let secret = captured_values
-                .iter()
-                .find(|(n, ..)| n == "TOKEN")
-                .map(|(_, v, ..)| v.clone())
-                .unwrap_or_default();
-            let akid = utils::find_closest_variable(&captured_values, &secret, "TOKEN", "AKID")
-                .unwrap_or_default();
-
-            if akid.is_empty() || secret.is_empty() {
-                m.validation_success = false;
-                m.validation_response_body = "Missing AWS access-key ID or secret.".to_string();
-                m.validation_response_status = StatusCode::BAD_REQUEST;
-                commit_and_return(m);
-                return;
-            }
-
-            let cache_key = aws::generate_aws_cache_key(&akid, &secret);
-            if let Some(cached) = cache.get(&cache_key) {
-                let c = cached.value();
-                if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
-                    m.validation_success = c.is_valid;
-                    m.validation_response_body = c.body.clone();
-                    m.validation_response_status = c.status;
-                    commit_and_return(m);
-                    return;
-                }
-            }
-
-            if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
-                m.validation_success = false;
-                m.validation_response_body = format!("Invalid AWS credentials ({}): {}", akid, e);
-                m.validation_response_status = StatusCode::BAD_REQUEST;
-                commit_and_return(m);
-                return;
-            }
-
-            match aws::validate_aws_credentials(&akid, &secret, cache).await {
-                Ok((ok, arn)) => {
-                    m.validation_success = ok;
-                    m.validation_response_body = format!("{} --- ARN: {}", akid, arn);
-                    m.validation_response_status =
-                        if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
-                    if let Ok(acct) = aws::aws_key_to_account_number(&akid) {
-                        m.validation_response_body
-                            .push_str(&format!(" --- AWS Account Number: {:012}", acct));
-                    }
-                }
-                Err(e) => {
-                    m.validation_success = false;
-                    m.validation_response_body = format!("AWS validation error ({}): {}", akid, e);
-                    m.validation_response_status = StatusCode::BAD_GATEWAY;
-                }
-            }
-            cache.insert(
-                cache_key,
-                CachedResponse {
-                    body: m.validation_response_body.clone(),
-                    status: m.validation_response_status,
-                    is_valid: m.validation_success,
-                    timestamp: Instant::now(),
-                },
-            );
-        }
-
-        // ----------------------------------------------------- GCP validator
-        Some(Validation::GCP) => {
-            let gcp_json = globals
-                .get("TOKEN")
-                .and_then(|v| v.as_scalar())
-                .map(|s| s.into_owned().to_kstr().to_string())
-                .unwrap_or_default();
-
-            if gcp_json.is_empty() {
-                m.validation_success = false;
-                m.validation_response_body = "GCP JSON not found.".to_string();
-                m.validation_response_status = StatusCode::BAD_REQUEST;
-                commit_and_return(m);
-                return;
-            }
-
-            let cache_key = gcp::generate_gcp_cache_key(&gcp_json);
-            if let Some(cached) = cache.get(&cache_key) {
-                let c = cached.value();
-                if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
-                    m.validation_success = c.is_valid;
-                    m.validation_response_body = c.body.clone();
-                    m.validation_response_status = c.status;
-                    commit_and_return(m);
-                    return;
-                }
-            }
-
-            match gcp::GcpValidator::global() {
-                Ok(validator) => {
-                    match validator.validate_gcp_credentials(&gcp_json.as_bytes()).await {
-                        Ok((ok, meta)) => {
-                            m.validation_success = ok;
-                            m.validation_response_body = meta.join("\n");
-                            m.validation_response_status =
-                                if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
-                        }
-                        Err(e) => {
-                            m.validation_success = false;
-                            m.validation_response_body = format!("GCP validation error: {}", e);
-                            m.validation_response_status = StatusCode::BAD_GATEWAY;
-                        }
-                    }
-                }
-                Err(e) => {
-                    m.validation_success = false;
-                    m.validation_response_body = format!("Failed to create GCP validator: {}", e);
-                    m.validation_response_status = StatusCode::INTERNAL_SERVER_ERROR;
-                }
-            }
-            cache.insert(
-                cache_key,
-                CachedResponse {
-                    body: m.validation_response_body.clone(),
-                    status: m.validation_response_status,
-                    is_valid: m.validation_success,
-                    timestamp: Instant::now(),
-                },
-            );
-        }
-        // ----------------------------------------------------- Coinbase validator
-        Some(Validation::Coinbase) => {
-            let cred_name = globals
-                .get("CRED_NAME")
-                .and_then(|v| v.as_scalar())
-                .map(|s| s.into_owned().to_kstr().to_string())
-                .unwrap_or_default();
-            let private_key = globals
-                .get("PRIVATE_KEY")
-                .and_then(|v| v.as_scalar())
-                .map(|s| s.into_owned().to_kstr().to_string())
-                .unwrap_or_default();
-
-            if cred_name.is_empty() || private_key.is_empty() {
-                m.validation_success = false;
-                m.validation_response_body = "Missing key name or private key.".to_string();
-                m.validation_response_status = StatusCode::BAD_REQUEST;
-                commit_and_return(m);
-                return;
-            }
-
-            match coinbase::validate_cdp_api_key(&cred_name, &private_key, client, parser, cache)
-                .await
-            {
-                Ok((ok, msg)) => {
-                    m.validation_success = ok;
-                    m.validation_response_body = msg;
-                    m.validation_response_status =
-                        if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
-                }
-                Err(e) => {
-                    m.validation_success = false;
-                    m.validation_response_body = format!("Coinbase validation error: {}", e);
-                    m.validation_response_status = StatusCode::BAD_GATEWAY;
-                }
-            }
-        }
-        // --------------------------------------------------------- Raw / none
-        Some(Validation::Raw(raw)) => {
-            debug!("Raw validation not implemented: {}", raw);
-            m.validation_success = false;
-            m.validation_response_body = "Validator not implemented".to_string();
-            m.validation_response_status = StatusCode::NOT_IMPLEMENTED;
-        }
-        None => { /* no validation specified */ }
-    }
-
-    // 5. persist result for success path
-    commit_and_return(m);
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use anyhow::Result;
-    use crossbeam_skiplist::SkipMap;
-    use http::StatusCode;
-    use rustc_hash::FxHashMap;
-
-    use crate::{
-        blob::BlobId,
-        liquid_filters::register_all,
-        location::OffsetSpan,
-        matcher::{OwnedBlobMatch, SerializableCapture, SerializableCaptures},
-        rules::{
-            rule::{Confidence, Rule},
-            Rules,
-        },
-        validation::{validate_single_match, Cache},
-    };
-    #[tokio::test]
-    async fn test_actual_pypi_token_validation() -> Result<()> {
-        // Minimal PyPI YAML snippet for testing
-        let pypi_yaml = r#"
-rules:
-  - name: PyPI Upload Token
-    id: kingfisher.pypi.1
-    pattern: |
-      (?x)
-      \b
-      (
-        pypi-AgEIcHlwaS5vcmc[a-zA-Z0-9_-]{50,}
-      )
-      (?:[^a-zA-Z0-9_-]|$)
-    min_entropy: 4.0
-    confidence: medium
-    examples:
-      - '# password = pypi-AgEIcHlwaS5vcmcCJDkwNzYwNzU1LWMwOTUtNGNkOC1iYjQzLTU3OWNhZjI1NDQ1MwACJXsicGVybWCf99lvbnMiOiAidXNlciIsICJ2ZXJzaW9uIjogMX0AAAYgSpW5PAywXvchMUQnkF5H6-SolJysfUvIWopMsxE4hCM'
-      - 'password: pypi-AgEIcHlwaS5vcmcCJGExMDIxZjRhLTFhZDMtNDc4YS1iOWNmLWQwCf99OTIwZjFjNwACSHsicGVybWlzc2lvbnMiOiB7InByb2plY3RzIjogWyJkamFuZ28tY2hhbm5lbHMtanNvbnJwYyJdfSwgInZlcnNpb24iOiAxfQAABiBZg48cIBQt7HckwM4G3q-462xphsLbm7IZvjqMS4jvQw'
-    validation:
-      type: Http
-      content:
-        request:
-          method: POST
-          url: https://upload.pypi.org/legacy/
-          response_is_html: true
-          response_matcher:
-            - report_response: true
-            - type: WordMatch
-              words:
-                - "isn't allowed to upload to project"
-          headers:
-            Authorization: 'Basic {{ "__token__:" | append: TOKEN | b64enc }}'
-          multipart:
-            parts:
-              - name: name
-                type: text
-                content: "my-package"
-              - name: version
-                type: text
-                content: "0.0.1"
-              - name: filetype
-                type: text
-                content: "sdist"
-              - name: metadata_version
-                type: text
-                content: "2.1"
-              - name: summary
-                type: text
-                content: "A simple example package"
-              - name: home_page
-                type: text
-                content: "https://github.com/yourusername/my_package"
-              - name: sha256_digest
-                type: text
-                content: "0447379dd46c4ca8b8992bda56d07b358d015efb9300e6e16f224f4536e71d64"
-              - name: md5_digest
-                type: text
-                content: "9b4036ab91a71124ab9f1d32a518e2bb"
-              - name: :action
-                type: text
-                content: "file_upload"
-              - name: protocol_version
-                type: text
-                content: "1"
-              - name: content
-                type: file
-                content: "path/to/my_package-0.0.1.tar.gz"
-                content_type: "application/octet-stream"
-        "#;
-        // Use from_paths_and_contents to parse the YAML snippet into a Rules object
-        let data = vec![(std::path::Path::new("pypi_test.yaml"), pypi_yaml.as_bytes())];
-        let rules = Rules::from_paths_and_contents(data, Confidence::Low)?;
-        // Find the PyPI rule we just loaded
-        let pypi_rule_syntax = rules
-            .iter_rules()
-            .find(|r| r.id == "kingfisher.pypi.1")
-            .expect("Failed to find PyPI rule in test YAML")
-            .clone(); // Clone so we can create a `Rule` from it
-                      // Wrap that into a `Rule` object
-        let pypi_rule = Rule::new(pypi_rule_syntax);
-        //////////////////////////////////////////
-        //
-        // Your actual PyPI token to test
-        let token = "<enter_pypi_token_here>";
-        let id = BlobId::new(&pypi_yaml.as_bytes());
-        // Construct an `OwnedBlobMatch` (all fields needed):
-        let mut owned_blob_match = OwnedBlobMatch {
-            rule: pypi_rule.into(),
-            blob_id: id,
-            finding_fingerprint: 0, // dummy value
-            // matching_input: token.as_bytes().to_vec(),
-            matching_input_offset_span: OffsetSpan { start: 0, end: token.len() },
-            captures: SerializableCaptures {
-                captures: vec![SerializableCapture {
-                    name: Some("TOKEN".to_string()),
-                    match_number: -1,
-                    start: 0,
-                    end: token.len(),
-                    value: token.into(),
-                }],
-            },
-            validation_response_body: String::new(),
-            validation_response_status: StatusCode::OK,
-            validation_success: false,
-            calculated_entropy: 0.0, // or compute your own
-        };
-        let parser = register_all(liquid::ParserBuilder::with_stdlib()).build()?;
-        let client = reqwest::Client::new();
-        let cache: Cache = Arc::new(SkipMap::new());
-        let dependent_vars = FxHashMap::default();
-        let missing_deps = FxHashMap::default();
-        // Run the validation
-        validate_single_match(
-            &mut owned_blob_match,
-            &parser,
-            &client,
-            &dependent_vars,
-            &missing_deps,
-            &cache,
-        )
-        .await;
-        println!("Success? {:?}", owned_blob_match.validation_success);
-        println!("Status: {:?}", owned_blob_match.validation_response_status);
-        println!("Body: {:?}", owned_blob_match.validation_response_body);
-        Ok(())
-    }
-}
diff --git a/src/validation/utils.rs b/src/validation/utils.rs
index ee118c8..ea0b020 100644
--- a/src/validation/utils.rs
+++ b/src/validation/utils.rs
@@ -22,7 +22,7 @@ pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String,
         .map(|cap| {
             let name =
                 cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string());
-            (name, cap.value.clone().into_owned(), cap.start, cap.end)
+            (name, cap.value.to_string(), cap.start, cap.end)
         })
         .collect()
 }
@@ -76,16 +76,17 @@ mod tests {
     use super::*;
     use crate::matcher::{SerializableCapture, SerializableCaptures};
     use pretty_assertions::assert_eq;
+    use smallvec::smallvec;
 
     #[test]
     fn single_unnamed_capture_is_returned() {
         let captures = SerializableCaptures {
-            captures: vec![SerializableCapture {
+            captures: smallvec![SerializableCapture {
                 name: None,
                 match_number: 0,
                 start: 1,
                 end: 4,
-                value: "abc".into(),
+                value: "abc",
             }],
         };
         let result = process_captures(&captures);
@@ -95,20 +96,20 @@ mod tests {
     #[test]
     fn skips_whole_match_when_multiple() {
         let captures = SerializableCaptures {
-            captures: vec![
+            captures: smallvec![
                 SerializableCapture {
                     name: None,
                     match_number: 0,
                     start: 0,
                     end: 5,
-                    value: "abcde".into(),
+                    value: "abcde",
                 },
                 SerializableCapture {
                     name: Some("foo".to_string()),
                     match_number: -1,
                     start: 1,
                     end: 4,
-                    value: "bcd".into(),
+                    value: "bcd",
                 },
             ],
         };
@@ -119,28 +120,22 @@ mod tests {
     #[test]
     fn includes_unnamed_groups_but_skips_whole_match() {
         let captures = SerializableCaptures {
-            captures: vec![
+            captures: smallvec![
                 SerializableCapture {
                     name: None,
                     match_number: 0,
                     start: 0,
                     end: 6,
-                    value: "aabbcc".into(),
+                    value: "aabbcc",
                 },
                 SerializableCapture {
                     name: Some("foo".to_string()),
                     match_number: -1,
                     start: 0,
                     end: 2,
-                    value: "aa".into(),
-                },
-                SerializableCapture {
-                    name: None,
-                    match_number: 1,
-                    start: 4,
-                    end: 6,
-                    value: "cc".into(),
+                    value: "aa",
                 },
+                SerializableCapture { name: None, match_number: 1, start: 4, end: 6, value: "cc" },
             ],
         };
         let result = process_captures(&captures);
diff --git a/tests/fingerprint_dedup.rs b/tests/fingerprint_dedup.rs
index e4380ac..f458e7d 100644
--- a/tests/fingerprint_dedup.rs
+++ b/tests/fingerprint_dedup.rs
@@ -14,11 +14,27 @@ use kingfisher::{
     matcher::{Match, SerializableCapture, SerializableCaptures},
     origin::{Origin, OriginSet},
     reporter::{styles::Styles, DetailsReporter, ReportMatch},
-    rules::rule::Confidence,
+    rules::rule::{Confidence, Rule, RuleSyntax},
+    util::intern,
 };
+use smallvec::smallvec;
 // ---- helpers -------------------------------------------------------------------------------
 
 fn make_match(fp: u64) -> Match {
+    let syntax = RuleSyntax {
+        name: "Example Rule".to_string(),
+        id: "RULE.1".to_string(),
+        pattern: "dummy".to_string(),
+        min_entropy: 0.0,
+        confidence: Confidence::Medium,
+        visible: true,
+        examples: vec![],
+        negative_examples: vec![],
+        references: vec![],
+        validation: None,
+        depends_on_rule: vec![],
+    };
+    let rule = Arc::new(Rule::new(syntax));
     Match {
         location: Location {
             offset_span: OffsetSpan { start: 0, end: 10 },
@@ -28,20 +44,17 @@ fn make_match(fp: u64) -> Match {
             },
         },
         groups: SerializableCaptures {
-            captures: vec![SerializableCapture {
+            captures: smallvec![SerializableCapture {
                 name: None,
                 match_number: 0,
                 start: 0,
                 end: 10,
-                value: "dummy".into(),
+                value: intern("dummy"),
             }],
         },
         blob_id: BlobId::new(b"dummy"),
         finding_fingerprint: fp,
-        rule_finding_fingerprint: "structural.1".into(),
-        rule_text_id: "RULE.1".into(),
-        rule_name: "Example Rule".into(),
-        rule_confidence: Confidence::Medium,
+        rule,
         validation_response_body: String::new(),
         validation_response_status: 0,
         validation_success: false,
@@ -106,7 +119,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
                 id: BlobId::new(b"dummy"),
                 num_bytes: 10,
                 mime_essence: None,
-                charset: None,
                 language: None,
             },
             m: m1,
@@ -123,7 +135,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
                 id: BlobId::new(b"dummy"),
                 num_bytes: 10,
                 mime_essence: None,
-                charset: None,
                 language: None,
             },
             m: m2,