refactored code

2026-02-14 13:12:26 -08:00 · 2026-02-14 13:12:26 -08:00 · d3e659491d
commit d3e659491d
parent f62bfe103b
23 changed files with 1868 additions and 1828 deletions
--- a/crates/kingfisher-core/Cargo.toml
+++ b/crates/kingfisher-core/Cargo.toml
@ -44,5 +44,8 @@ gix = { version = "0.73", default-features = false, features = ["serde"] }
 # Console formatting
 console = "0.15"

+# Language detection for content types
+tokei = "14.0.0"
+
 [dev-dependencies]
 pretty_assertions = "1.4"
--- a/crates/kingfisher-core/src/content_type.rs
+++ b/crates/kingfisher-core/src/content_type.rs
@ -0,0 +1,232 @@
+use once_cell::sync::Lazy;
+use std::path::Path;
+use tokei::LanguageType;
+
+// Flattened table of every (shebang bytes, language) pair that `tokei` lists.
+// Built lazily on first use and ordered longest-first, so the most specific
+// shebang is always tried before any shorter one it extends.
+static SHEBANG_PREFIXES: Lazy<Vec<(&'static [u8], LanguageType)>> = Lazy::new(|| {
+    let mut table: Vec<(&'static [u8], LanguageType)> = LanguageType::list()
+        .iter()
+        .flat_map(|&(lang, shebangs)| shebangs.iter().map(move |&sb| (sb.as_bytes(), lang)))
+        .collect();
+    // Stable sort on descending length, e.g. "#!/usr/bin/env python3" lands ahead of
+    // "#!/usr/bin/env python".
+    table.sort_by_key(|&(prefix, _)| std::cmp::Reverse(prefix.len()));
+    table
+});
+
+/// Coarse classification of a byte buffer, as produced by `ContentInspector::inspect`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ContentType {
+    /// Data that tripped a threshold: too many NUL bytes or too high a control-byte ratio.
+    BINARY,
+    /// Data that stayed within both thresholds (mostly printable text).
+    TEXT,
+}
+
+/// Heuristic thresholds for text vs. binary detection (see `inspect` for how they are applied).
+pub struct ContentInspector {
+    max_null_bytes: usize, // a buffer with more NUL bytes than this is BINARY
+    max_control_ratio: f64, // control-byte share (excluding \n, \r, \t) above this is BINARY
+}
+
+impl Default for ContentInspector {
+    /// Default thresholds: at most 4 NUL bytes and a control-byte ratio of at most 0.3.
+    fn default() -> Self {
+        ContentInspector {
+            max_null_bytes: 4,
+            max_control_ratio: 0.3,
+        }
+    }
+}
+
+impl ContentInspector {
+    /// Create a new inspector with default thresholds.
+    #[inline]
+    pub fn new() -> Self {
+        Default::default()
+    }
+
+    /// Classify `bytes` as TEXT or BINARY:
+    ///
+    /// 1. If null-byte count > `max_null_bytes` → `BINARY`.
+    /// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` →
+    ///    `BINARY`.
+    /// 3. Otherwise → `TEXT`.
+    ///
+    /// NUL bytes that stay under the first limit still count as control characters in
+    /// step 2. Empty input is treated as having a control ratio of 0.
+    #[inline]
+    #[must_use]
+    pub fn inspect(&self, bytes: &[u8]) -> ContentType {
+        let nulls = bytes.iter().filter(|&&b| b == 0).count();
+        if nulls > self.max_null_bytes {
+            return ContentType::BINARY;
+        }
+        let controls =
+            bytes.iter().filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t')).count();
+        // Guard the division so an empty buffer does not produce NaN.
+        let ratio = if bytes.is_empty() { 0.0 } else { controls as f64 / bytes.len() as f64 };
+        if ratio > self.max_control_ratio {
+            ContentType::BINARY
+        } else {
+            ContentType::TEXT
+        }
+    }
+
+    /// Guess MIME type from `path` extension (compared case-insensitively).
+    ///
+    /// Returns:
+    /// - `Some(mime)` if the extension is in the small built-in table of text, web,
+    ///   document, and image types.
+    /// - `None` if there is no extension, it is not valid UTF-8, or it is unrecognized.
+    #[inline]
+    #[must_use]
+    pub fn guess_mime_type(&self, path: &Path) -> Option<String> {
+        let ext = path.extension()?.to_str()?.to_ascii_lowercase();
+        let mime = match ext.as_str() {
+            "txt" | "md" | "rst" => "text/plain",
+            "html" | "htm" => "text/html",
+            "css" => "text/css",
+            "js" => "application/javascript",
+            "json" => "application/json",
+            "xml" => "application/xml",
+            "pdf" => "application/pdf",
+            "jpg" | "jpeg" => "image/jpeg",
+            "png" => "image/png",
+            "gif" => "image/gif",
+            _ => return None,
+        };
+        Some(mime.to_string())
+    }
+
+    /// Detect UTF-8 encoding by attempting a lossless conversion.
+    ///
+    /// Returns `Some("UTF-8")` when `bytes` is valid UTF-8 and `None` otherwise; no
+    /// other charset is ever reported.
+    #[inline]
+    #[must_use]
+    pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
+        std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
+    }
+
+    /// Guess programming language with broad coverage using `tokei`.
+    ///
+    /// Strategy (no disk I/O), first hit wins:
+    /// 1) Match well-known whole filenames (e.g., Makefile, Dockerfile, CMakeLists.txt).
+    /// 2) Try the extension via `LanguageType::from_file_extension`.
+    /// 3) Match the first line of `content` against the shebangs listed by `tokei`.
+    /// 4) Minimal content markers as a last resort.
+    ///
+    /// Returns the `tokei` display name of the language (e.g., `Rust`, `Bash`, `Python`),
+    /// or `None` when no step produces a match.
+    #[inline]
+    #[must_use]
+    pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option<String> {
+        // 1) Well-known filenames (avoid from_path to keep this pure). This has to run
+        //    before the extension lookup: `CMakeLists.txt` carries a `.txt` extension,
+        //    which the extension table resolves on its own (as plain text), so the
+        //    CMake entry below would otherwise never be reached.
+        let by_name = match path.file_name().and_then(|f| f.to_str()) {
+            Some("Makefile" | "makefile") => Some(LanguageType::Makefile),
+            Some("Dockerfile" | "dockerfile") => Some(LanguageType::Dockerfile),
+            Some("CMakeLists.txt") => Some(LanguageType::CMake),
+            Some("Rakefile" | "rakefile") => Some(LanguageType::Rakefile),
+            // Common ecosystem files; map to their primary language where sensible.
+            Some("Gemfile" | "gemfile") => Some(LanguageType::Ruby),
+            _ => None,
+        };
+        if let Some(lang) = by_name {
+            return Some(lang.name().to_string());
+        }
+
+        // 2) Extension mapping (fast, no I/O). The extension is lowercased before lookup.
+        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
+            if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) {
+                return Some(lang.name().to_string());
+            }
+        }
+
+        // 3) Shebang detection (in-memory): match by longest prefix, byte-wise (no UTF-8 needed).
+        if let Some(first_line) = content.split(|&b| b == b'\n').next() {
+            if first_line.starts_with(b"#!") {
+                for (prefix, lang) in SHEBANG_PREFIXES.iter() {
+                    if first_line.starts_with(prefix) {
+                        return Some(lang.name().to_string());
+                    }
+                }
+            }
+        }
+
+        // 4) Lightweight content markers to catch a few ubiquitous cases without I/O.
+        //    These are substring checks over the whole (lossily decoded) content, so
+        //    they are deliberately tried last.
+        let s = String::from_utf8_lossy(content);
+        if s.contains("<?php") {
+            return Some(LanguageType::Php.name().to_string());
+        }
+        if s.contains("package main") {
+            return Some(LanguageType::Go.name().to_string());
+        }
+        if s.contains("public class") {
+            return Some(LanguageType::Java.name().to_string());
+        }
+        if s.contains("#!/usr/bin/env bash") || s.contains("#!/bin/bash") {
+            return Some(LanguageType::Bash.name().to_string());
+        }
+        if s.contains("#!/usr/bin/env python") {
+            return Some(LanguageType::Python.name().to_string());
+        }
+
+        None
+    }
+}
+
+/// Convenience wrapper: classify `bytes` with an inspector that uses the default thresholds.
+#[inline]
+#[must_use]
+pub fn inspect(bytes: &[u8]) -> ContentType {
+    let inspector = ContentInspector::new();
+    inspector.inspect(bytes)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::path::PathBuf;
+
+    use super::*;
+
+    #[test]
+    fn binary_vs_text() {
+        let ins = ContentInspector::default();
+        let bin = vec![0, 1, 2, 0, 0, 0, 5]; // 4 NULs do not exceed the limit; the 100% control ratio does
+        assert_eq!(ins.inspect(&bin), ContentType::BINARY);
+        let txt = b"Hello\nWorld";
+        assert_eq!(ins.inspect(txt), ContentType::TEXT);
+    }
+
+    #[test]
+    fn mime_guess() {
+        let ins = ContentInspector::default();
+        assert_eq!(ins.guess_mime_type(&PathBuf::from("a.md")), Some("text/plain".into()));
+        assert_eq!(ins.guess_mime_type(&PathBuf::from("img.png")), Some("image/png".into()));
+        assert_eq!(ins.guess_mime_type(&PathBuf::from("x.xyz")), None);
+    }
+
+    #[test]
+    fn charset_guess() {
+        let ins = ContentInspector::default();
+        assert_eq!(ins.guess_charset("ok".as_bytes()), Some("UTF-8".into()));
+        assert_eq!(ins.guess_charset(&[0xFF, 0xFE, 0xFD]), None); // 0xFF never occurs in valid UTF-8
+    }
+
+    #[test]
+    fn language_guess() {
+        let ins = ContentInspector::default();
+
+        // Lowercase the returned name so the assertions do not depend on tokei's exact casing.
+        let rust =
+            ins.guess_language(&PathBuf::from("main.rs"), b"").map(|s| s.to_ascii_lowercase());
+        assert_eq!(rust, Some("rust".into()));
+
+        let php = ins
+            .guess_language(&PathBuf::from("x"), b"<?php echo; ?>") // no extension or shebang: content marker
+            .map(|s| s.to_ascii_lowercase());
+        assert_eq!(php, Some("php".into()));
+
+        let bash = ins
+            .guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi") // shebang table, else the content marker
+            .map(|s| s.to_ascii_lowercase());
+        assert_eq!(bash, Some("bash".into()));
+    }
+}
--- a/crates/kingfisher-core/src/lib.rs
+++ b/crates/kingfisher-core/src/lib.rs
@ -9,6 +9,7 @@

 pub mod blob;
 pub mod bstring_escape;
+pub mod content_type;
 pub mod entropy;
 pub mod error;
 pub mod git_commit_metadata;
@ -20,6 +21,7 @@ pub use blob::{
    Blob, BlobAppearance, BlobAppearanceSet, BlobData, BlobId, BlobIdMap, BlobMetadata,
 };
 pub use bstring_escape::Escaped;
+pub use content_type::{ContentInspector, ContentType};
 pub use entropy::calculate_shannon_entropy;
 pub use error::{Error, Result};
 pub use git_commit_metadata::CommitMetadata;
--- a/crates/kingfisher-scanner/src/lib.rs
+++ b/crates/kingfisher-scanner/src/lib.rs
@ -52,6 +52,7 @@
 //! - **validation-all**: Enable all validation features

 mod finding;
+pub mod primitives;
 mod scanner;
 mod scanner_pool;

--- a/crates/kingfisher-scanner/src/primitives.rs
+++ b/crates/kingfisher-scanner/src/primitives.rs
@ -0,0 +1,187 @@
+//! Shared matching primitives for secret detection.
+//!
+//! These functions are used by both the high-level `Scanner` API and the
+//! binary crate's `Matcher`. Having a single canonical implementation
+//! eliminates duplicated logic across the codebase.
+
+use std::hash::{Hash, Hasher};
+
+use base64::{engine::general_purpose, Engine};
+use kingfisher_core::OffsetSpan;
+use rustc_hash::{FxHashMap, FxHasher};
+use xxhash_rust::xxh3::xxh3_64;
+
+// -------------------------------------------------------------------------------------------------
+// Base64 detection
+// -------------------------------------------------------------------------------------------------
+
+/// Decoded Base64 data with position information, as populated by `get_base64_strings`.
+#[derive(Debug, Clone)]
+pub struct DecodedData {
+    pub decoded: Vec<u8>, // the decoded payload bytes
+    pub pos_start: usize, // byte offset in the scanned input where the encoded run starts
+    pub pos_end: usize, // byte offset one past the encoded run, padding included
+}
+
+/// Returns `true` when `b` belongs to either Base64 alphabet: ASCII letters and digits,
+/// the standard extras `+` and `/`, or the URL-safe extras `-` and `_`.
+#[inline]
+pub fn is_base64_byte(b: u8) -> bool {
+    b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'-' | b'_')
+}
+
+/// Scans `input` for standalone Base64 runs and returns the ones that decode cleanly.
+///
+/// A run is a maximal stretch of Base64-alphabet bytes followed by at most two `=`
+/// padding bytes. Only runs whose total length is at least 32 and a multiple of 4 are
+/// decoded, and only payloads that decode to pure ASCII are kept. Positions are byte
+/// offsets into `input`; `pos_end` is exclusive and includes any padding.
+pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
+    let mut found = Vec::new();
+    let mut cursor = 0;
+    while cursor < input.len() {
+        // Skip separator bytes, then measure the Base64 body and its padding.
+        let start = cursor + input[cursor..].iter().take_while(|&&b| !is_base64_byte(b)).count();
+        let body_end = start + input[start..].iter().take_while(|&&b| is_base64_byte(b)).count();
+        let padding = input[body_end..].iter().take(2).take_while(|&&b| b == b'=').count();
+        let end = body_end + padding;
+        cursor = end;
+
+        let len = end - start;
+        if len < 32 || len % 4 != 0 {
+            continue;
+        }
+        let candidate = &input[start..end];
+
+        // Decoder fallback order: STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD.
+        let outcome = general_purpose::STANDARD
+            .decode(candidate)
+            .or_else(|_| general_purpose::URL_SAFE.decode(candidate))
+            .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(candidate));
+
+        match outcome {
+            Ok(decoded) if decoded.is_ascii() => {
+                found.push(DecodedData { decoded, pos_start: start, pos_end: end });
+            }
+            _ => {}
+        }
+    }
+
+    found
+}
+
+// -------------------------------------------------------------------------------------------------
+// Match deduplication
+// -------------------------------------------------------------------------------------------------
+
+/// Derives the deduplication key for a match by feeding the matched content, the rule
+/// ID, and the start/end offsets, in that order, into a single `FxHasher`.
+#[inline]
+pub fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
+    let mut state = FxHasher::default();
+    // A tuple hashes its fields left to right, so this feeds the hasher exactly as
+    // four separate `hash` calls would, still without any allocation.
+    (content, rule_id, start, end).hash(&mut state);
+    state.finish()
+}
+
+/// Inserts a span into a list of spans sorted by start offset, handling containment.
+///
+/// Returns `false`, leaving the list untouched, if the span is already contained in
+/// an existing neighbouring span (i.e., it's redundant and should be skipped).
+/// Otherwise every existing span that the new span fully contains is removed, the new
+/// span is inserted at its sorted position, and `true` is returned.
+///
+/// Only the immediate neighbours of the insertion point are checked for redundancy.
+/// That is sufficient as long as no stored span contains another, which is why all
+/// swallowed spans (not just the first) must be removed here.
+#[inline]
+pub fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
+    let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
+
+    // Redundancy checks come first so that a rejected span never mutates the list.
+    if idx > 0 && spans[idx - 1].fully_contains(&span) {
+        return false;
+    }
+    if idx < spans.len() && spans[idx].fully_contains(&span) {
+        return false;
+    }
+
+    // Drop the whole run of following spans that the new span swallows.
+    let swallowed = spans[idx..].iter().take_while(|s| span.fully_contains(s)).count();
+    spans.drain(idx..idx + swallowed);
+
+    // The predecessor can also be swallowed; removing it shifts the insertion point left.
+    if idx > 0 && span.fully_contains(&spans[idx - 1]) {
+        spans.remove(idx - 1);
+        idx -= 1;
+    }
+
+    spans.insert(idx, span);
+    true
+}
+
+/// Records `span` in the per-rule span list for `rule_id`, creating the list on first use.
+///
+/// Returns whatever `insert_span` returns: `false` means the span was redundant.
+#[inline]
+pub fn record_match(
+    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
+    rule_id: usize,
+    span: OffsetSpan,
+) -> bool {
+    let rule_spans = map.entry(rule_id).or_default();
+    insert_span(rule_spans, span)
+}
+
+// -------------------------------------------------------------------------------------------------
+// Finding fingerprint
+// -------------------------------------------------------------------------------------------------
+
+/// Computes a fingerprint for a finding from its value, its origin, and its offsets.
+///
+/// The hashed input is the plain concatenation of the value bytes, the origin bytes,
+/// and both offsets in little-endian form, run through `xxh3_64`.
+pub fn compute_finding_fingerprint(
+    finding_value: &str,
+    file_or_commit: &str,
+    offset_start: u64,
+    offset_end: u64,
+) -> u64 {
+    let start_le = offset_start.to_le_bytes();
+    let end_le = offset_end.to_le_bytes();
+    // `concat` sizes the buffer once from the four parts and copies them in order.
+    let material: Vec<u8> =
+        [finding_value.as_bytes(), file_or_commit.as_bytes(), &start_le[..], &end_le[..]].concat();
+
+    xxh3_64(&material)
+}
+
+// -------------------------------------------------------------------------------------------------
+// Secret capture selection
+// -------------------------------------------------------------------------------------------------
+
+/// Picks the capture group that holds the secret, trying in order:
+/// 1. A matched named capture called TOKEN (case-insensitive)
+/// 2. The first matched named capture
+/// 3. The first positional capture (group 1)
+/// 4. The full match (group 0)
+pub fn find_secret_capture<'a>(
+    re: &regex::bytes::Regex,
+    captures: &regex::bytes::Captures<'a>,
+) -> regex::bytes::Match<'a> {
+    // First named group that took part in the match; with `token_only` set, groups
+    // whose name is not TOKEN are ignored.
+    let first_named = |token_only: bool| {
+        re.capture_names().enumerate().find_map(|(idx, name)| {
+            let name = name?;
+            if token_only && !name.eq_ignore_ascii_case("TOKEN") {
+                return None;
+            }
+            captures.get(idx)
+        })
+    };
+
+    first_named(true)
+        .or_else(|| first_named(false))
+        .or_else(|| captures.get(1))
+        // Last resort: the full match (group 0).
+        .unwrap_or_else(|| captures.get(0).unwrap())
+}
--- a/crates/kingfisher-scanner/src/scanner.rs
+++ b/crates/kingfisher-scanner/src/scanner.rs
@ -5,15 +5,13 @@ use std::path::Path;
 use std::sync::Arc;

 use anyhow::Result;
-use base64::{engine::general_purpose, Engine};
 use kingfisher_core::{calculate_shannon_entropy, Blob, BlobIdMap, LocationMapping, OffsetSpan};
 use kingfisher_rules::RulesDatabase;
-use regex::bytes::Regex;
 use rustc_hash::{FxHashMap, FxHashSet};
 use tracing::debug;
-use xxhash_rust::xxh3::xxh3_64;

 use crate::finding::{Finding, FindingLocation};
+use crate::primitives;
 use crate::scanner_pool::ScannerPool;

 /// Configuration options for the scanner.
@ -185,21 +183,15 @@ impl Scanner {
            let current_span = OffsetSpan::from_range(start..end);

            // Check for overlapping spans
-            if !self.record_span(&mut previous_spans, rule_id, current_span) {
+            if !primitives::record_match(&mut previous_spans, rule_id, current_span) {
                continue;
            }

            let haystack = &bytes[start..end];

            for captures in anchored_regex.captures_iter(haystack) {
-                let full_capture = match captures.get(0) {
-                    Some(c) => c,
-                    None => continue,
-                };
-
                // Get the primary secret value
-                let secret_capture =
-                    self.get_secret_capture(&anchored_regex, &captures, full_capture);
+                let secret_capture = primitives::find_secret_capture(&anchored_regex, &captures);
                let secret_bytes = secret_capture.as_bytes();

                // Check entropy
@ -211,7 +203,7 @@ impl Scanner {
                }

                // Compute match key for dedup
-                let match_key = self.compute_match_key(
+                let match_key = primitives::compute_match_key(
                    secret_bytes,
                    rule.id().as_bytes(),
                    start + secret_capture.start(),
@ -242,7 +234,7 @@ impl Scanner {
                    }
                }

-                let fingerprint = self.compute_fingerprint(
+                let fingerprint = primitives::compute_finding_fingerprint(
                    &secret,
                    &blob.id().to_string(),
                    offset_span.start as u64,
@ -295,92 +287,6 @@ impl Scanner {
        // In a real implementation, you'd want to add a clear method or use a different approach
    }

-    fn get_secret_capture<'a>(
-        &self,
-        regex: &Regex,
-        captures: &regex::bytes::Captures<'a>,
-        full_capture: regex::bytes::Match<'a>,
-    ) -> regex::bytes::Match<'a> {
-        // Prefer named capture called TOKEN
-        for (i, name_opt) in regex.capture_names().enumerate() {
-            if let Some(name) = name_opt {
-                if name.eq_ignore_ascii_case("TOKEN") {
-                    if let Some(cap) = captures.get(i) {
-                        return cap;
-                    }
-                }
-            }
-        }
-
-        // Otherwise, first named capture
-        for (i, name_opt) in regex.capture_names().enumerate() {
-            if name_opt.is_some() {
-                if let Some(cap) = captures.get(i) {
-                    return cap;
-                }
-            }
-        }
-
-        // Otherwise, first positional capture (group 1)
-        if let Some(cap) = captures.get(1) {
-            return cap;
-        }
-
-        // Fall back to full match
-        full_capture
-    }
-
-    fn record_span(
-        &self,
-        map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
-        rule_id: usize,
-        span: OffsetSpan,
-    ) -> bool {
-        let spans = map.entry(rule_id).or_default();
-
-        // Binary search for insertion point
-        let idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
-
-        // Check if new span is contained in an existing one
-        if idx > 0 && spans[idx - 1].fully_contains(&span) {
-            return false;
-        }
-        if idx < spans.len() && spans[idx].fully_contains(&span) {
-            return false;
-        }
-
-        // Remove spans that the new span contains
-        let remove_idx = idx;
-        while remove_idx < spans.len() && span.fully_contains(&spans[remove_idx]) {
-            spans.remove(remove_idx);
-        }
-        if idx > 0 && span.fully_contains(&spans[idx - 1]) {
-            spans.remove(idx - 1);
-        }
-
-        spans.insert(idx.min(spans.len()), span);
-        true
-    }
-
-    fn compute_match_key(&self, content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
-        use std::hash::{Hash, Hasher};
-        let mut hasher = rustc_hash::FxHasher::default();
-        content.hash(&mut hasher);
-        rule_id.hash(&mut hasher);
-        start.hash(&mut hasher);
-        end.hash(&mut hasher);
-        hasher.finish()
-    }
-
-    fn compute_fingerprint(&self, value: &str, blob_id: &str, start: u64, end: u64) -> u64 {
-        let mut buf = Vec::with_capacity(value.len() + blob_id.len() + 16);
-        buf.extend_from_slice(value.as_bytes());
-        buf.extend_from_slice(blob_id.as_bytes());
-        buf.extend_from_slice(&start.to_le_bytes());
-        buf.extend_from_slice(&end.to_le_bytes());
-        xxh3_64(&buf)
-    }
-
    fn redact(&self, bytes: &[u8]) -> String {
        let s = String::from_utf8_lossy(bytes);
        if s.len() <= 8 {
@ -400,7 +306,7 @@ impl Scanner {
        let bytes = blob.bytes();

        // Find Base64-encoded strings
-        let b64_items = self.find_base64_strings(bytes);
+        let b64_items = primitives::get_base64_strings(bytes);

        for item in b64_items {
            // Try to match decoded content against all rules
@ -411,12 +317,7 @@ impl Scanner {
                };

                for captures in regex.captures_iter(&item.decoded) {
-                    let full_capture = match captures.get(0) {
-                        Some(c) => c,
-                        None => continue,
-                    };
-
-                    let secret_capture = self.get_secret_capture(&regex, &captures, full_capture);
+                    let secret_capture = primitives::find_secret_capture(&regex, &captures);
                    let secret_bytes = secret_capture.as_bytes();

                    let min_entropy =
@ -426,7 +327,7 @@ impl Scanner {
                        continue;
                    }

-                    let match_key = self.compute_match_key(
+                    let match_key = primitives::compute_match_key(
                        secret_bytes,
                        rule.id().as_bytes(),
                        item.pos_start,
@ -455,7 +356,7 @@ impl Scanner {
                        }
                    }

-                    let fingerprint = self.compute_fingerprint(
+                    let fingerprint = primitives::compute_finding_fingerprint(
                        &secret,
                        &blob.id().to_string(),
                        offset_span.start as u64,
@ -488,62 +389,6 @@ impl Scanner {

        findings
    }
-
-    fn find_base64_strings(&self, input: &[u8]) -> Vec<DecodedData> {
-        let mut results = Vec::new();
-        let mut i = 0;
-
-        while i < input.len() {
-            // Skip non-base64 characters
-            while i < input.len() && !Self::is_base64_byte(input[i]) {
-                i += 1;
-            }
-            let start = i;
-
-            // Collect base64 characters
-            while i < input.len() && Self::is_base64_byte(input[i]) {
-                i += 1;
-            }
-
-            // Handle padding
-            let mut eq_count = 0;
-            while i < input.len() && input[i] == b'=' && eq_count < 2 {
-                i += 1;
-                eq_count += 1;
-            }
-            let end = i;
-
-            let len = end - start;
-            if len >= 32 && len % 4 == 0 {
-                let base64_slice = &input[start..end];
-
-                // Try decoding
-                let decode_result = general_purpose::STANDARD
-                    .decode(base64_slice)
-                    .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
-                    .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));
-
-                if let Ok(decoded) = decode_result {
-                    if decoded.is_ascii() {
-                        results.push(DecodedData { decoded, pos_start: start, pos_end: end });
-                    }
-                }
-            }
-        }
-
-        results
-    }
-
-    #[inline]
-    fn is_base64_byte(b: u8) -> bool {
-        matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_')
-    }
-}
-
-struct DecodedData {
-    decoded: Vec<u8>,
-    pos_start: usize,
-    pos_end: usize,
 }

 #[cfg(test)]
--- a/src/access_map.rs
+++ b/src/access_map.rs
@ -18,6 +18,22 @@ pub(crate) mod postgres;
 mod report;
 mod slack;

+/// Trait for access map providers that turn a single token into an access map result.
+///
+/// This covers the majority of providers (GitHub, GitLab, Slack, HuggingFace,
+/// Gitea, Bitbucket). Providers with more complex credentials (AWS, GCP, Azure,
+/// Postgres, MongoDB) use their own custom interfaces.
+pub trait TokenAccessMapper: Send + Sync {
+    /// The cloud/platform name for results (e.g., `"github"`, `"slack"`); `map_token` uses it to label failures.
+    fn cloud_name(&self) -> &'static str;
+
+    /// Maps a single token to an access map result; the returned future must be `Send`.
+    fn map_access_from_token(
+        &self,
+        token: &str,
+    ) -> impl std::future::Future<Output = Result<AccessMapResult>> + Send; // the impls in this file write it as `async fn`
+}
+
 /// Run the identity mapping workflow for the selected cloud provider.
 pub async fn run(args: AccessMapArgs) -> Result<()> {
    let result = match args.provider {
@ -244,24 +260,15 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
                    .unwrap_or_else(|err| build_failed_result("azure_devops", "pat", err)),
                fingerprint,
            ),
-            AccessMapRequest::Github { token, fingerprint } => (
-                github::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("github", "token", err)),
-                fingerprint,
-            ),
-            AccessMapRequest::Gitlab { token, fingerprint } => (
-                gitlab::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("gitlab", "token", err)),
-                fingerprint,
-            ),
-            AccessMapRequest::Slack { token, fingerprint } => (
-                slack::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("slack", "token", err)),
-                fingerprint,
-            ),
+            AccessMapRequest::Github { token, fingerprint } => {
+                (map_token(&GithubMapper, &token).await, fingerprint)
+            }
+            AccessMapRequest::Gitlab { token, fingerprint } => {
+                (map_token(&GitlabMapper, &token).await, fingerprint)
+            }
+            AccessMapRequest::Slack { token, fingerprint } => {
+                (map_token(&SlackMapper, &token).await, fingerprint)
+            }
            AccessMapRequest::Postgres { uri, fingerprint } => (
                postgres::map_access_from_uri(&uri)
                    .await
@ -274,24 +281,15 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
                    .unwrap_or_else(|err| build_failed_result("mongodb", "uri", err)),
                fingerprint,
            ),
-            AccessMapRequest::HuggingFace { token, fingerprint } => (
-                huggingface::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("huggingface", "token", err)),
-                fingerprint,
-            ),
-            AccessMapRequest::Gitea { token, fingerprint } => (
-                gitea::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("gitea", "token", err)),
-                fingerprint,
-            ),
-            AccessMapRequest::Bitbucket { token, fingerprint } => (
-                bitbucket::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("bitbucket", "token", err)),
-                fingerprint,
-            ),
+            AccessMapRequest::HuggingFace { token, fingerprint } => {
+                (map_token(&HuggingFaceMapper, &token).await, fingerprint)
+            }
+            AccessMapRequest::Gitea { token, fingerprint } => {
+                (map_token(&GiteaMapper, &token).await, fingerprint)
+            }
+            AccessMapRequest::Bitbucket { token, fingerprint } => {
+                (map_token(&BitbucketMapper, &token).await, fingerprint)
+            }
        };

        mapped.fingerprint = Some(fp);
@ -301,12 +299,106 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
    results
 }

+/// Runs `mapper` on `token`; on error, substitutes a failed result labelled with the
+/// mapper's cloud name so the caller always receives an `AccessMapResult`.
+async fn map_token(mapper: &impl TokenAccessMapper, token: &str) -> AccessMapResult {
+    match mapper.map_access_from_token(token).await {
+        Ok(mapped) => mapped,
+        Err(err) => build_failed_result(mapper.cloud_name(), "token", err),
+    }
+}
+
 /// Write HTML/JSON outputs for a collection of identity map results.
 pub fn write_reports(results: &[AccessMapResult], html_out: &std::path::Path) -> Result<()> {
    report::generate_html_report_multi(results, html_out)?;
    Ok(())
 }

+// -------------------------------------------------------------------------------------------------
+// TokenAccessMapper implementations
+// -------------------------------------------------------------------------------------------------
+
+/// GitHub access mapper: a field-less struct that forwards to the `github` module.
+pub struct GithubMapper;
+
+impl TokenAccessMapper for GithubMapper {
+    fn cloud_name(&self) -> &'static str {
+        "github" // label `map_token` hands to `build_failed_result` on error
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        github::map_access_from_token(token).await // plain delegation
+    }
+}
+
+/// GitLab access mapper: a field-less struct that forwards to the `gitlab` module.
+pub struct GitlabMapper;
+
+impl TokenAccessMapper for GitlabMapper {
+    fn cloud_name(&self) -> &'static str {
+        "gitlab" // label `map_token` hands to `build_failed_result` on error
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        gitlab::map_access_from_token(token).await // plain delegation
+    }
+}
+
+/// Slack access mapper: a field-less struct that forwards to the `slack` module.
+pub struct SlackMapper;
+
+impl TokenAccessMapper for SlackMapper {
+    fn cloud_name(&self) -> &'static str {
+        "slack" // label `map_token` hands to `build_failed_result` on error
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        slack::map_access_from_token(token).await // plain delegation
+    }
+}
+
+/// HuggingFace access mapper: a field-less struct that forwards to the `huggingface` module.
+pub struct HuggingFaceMapper;
+
+impl TokenAccessMapper for HuggingFaceMapper {
+    fn cloud_name(&self) -> &'static str {
+        "huggingface" // label `map_token` hands to `build_failed_result` on error
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        huggingface::map_access_from_token(token).await // plain delegation
+    }
+}
+
+/// Gitea access mapper: a field-less struct that forwards to the `gitea` module.
+pub struct GiteaMapper;
+
+impl TokenAccessMapper for GiteaMapper {
+    fn cloud_name(&self) -> &'static str {
+        "gitea" // label `map_token` hands to `build_failed_result` on error
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        gitea::map_access_from_token(token).await // plain delegation
+    }
+}
+
+/// Bitbucket access mapper: a field-less struct that forwards to the `bitbucket` module.
+pub struct BitbucketMapper;
+
+impl TokenAccessMapper for BitbucketMapper {
+    fn cloud_name(&self) -> &'static str {
+        "bitbucket" // label `map_token` hands to `build_failed_result` on error
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        bitbucket::map_access_from_token(token).await // plain delegation
+    }
+}
+
+// -------------------------------------------------------------------------------------------------
+// Helper functions
+// -------------------------------------------------------------------------------------------------
+
 fn severity_to_str(severity: Severity) -> &'static str {
    match severity {
        Severity::Low => "low",
--- a/src/azure.rs
+++ b/src/azure.rs
@ -13,13 +13,12 @@ use std::{
 // let us opt into newer API versions as Microsoft rolls them out.

 use anyhow::{anyhow, Context, Result};
-use globset::{Glob, GlobSet, GlobSetBuilder};
 use indicatif::{ProgressBar, ProgressStyle};
 use serde::Deserialize;
 use tracing::warn;
 use url::{form_urlencoded, Url};

-use crate::{findings_store, git_url::GitUrl};
+use crate::{findings_store, git_host, git_url::GitUrl};

 const API_VERSION: &str = "7.1-preview.1";

@ -55,33 +54,6 @@ impl RepoSpecifiers {
    }
 }

-#[derive(Debug)]
-struct ExcludeMatcher {
-    exact: HashSet<String>,
-    globs: Option<GlobSet>,
-}
-
-impl ExcludeMatcher {
-    fn matches(&self, name: &str) -> bool {
-        let candidate = name.to_lowercase();
-        if self.exact.contains(&candidate) {
-            return true;
-        }
-        if let Some(globs) = &self.globs {
-            return globs.is_match(&candidate);
-        }
-        false
-    }
-
-    fn is_empty(&self) -> bool {
-        self.exact.is_empty() && self.globs.is_none()
-    }
-}
-
-fn looks_like_glob(pattern: &str) -> bool {
-    pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
-}
-
 fn encode_segment(segment: &str) -> String {
    form_urlencoded::byte_serialize(segment.as_bytes()).collect::<String>()
 }
@ -129,7 +101,7 @@ fn parse_repo_identifier_from_path(path: &str) -> Option<String> {

        let org = org.to_lowercase();
        let project_raw = project.to_string();
-        if looks_like_glob(&project_raw) {
+        if git_host::looks_like_glob(&project_raw) {
            let pattern = format!("{org}/{}/**", project_raw.to_lowercase());
            return Some(pattern);
        }
@ -197,59 +169,16 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
    parse_repo_identifier_from_path(trimmed)
 }

-fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
-    let mut exact = HashSet::new();
-    let mut glob_builder = GlobSetBuilder::new();
-    let mut has_glob = false;
-
-    for raw in exclude_repos {
-        match parse_excluded_repo(raw) {
-            Some(name) => {
-                let normalized = name.to_lowercase();
-                if looks_like_glob(&normalized) {
-                    match Glob::new(&normalized) {
-                        Ok(glob) => {
-                            glob_builder.add(glob);
-                            has_glob = true;
-                        }
-                        Err(err) => {
-                            warn!("Ignoring invalid Azure exclusion pattern '{raw}': {err}");
-                            exact.insert(normalized);
-                        }
-                    }
-                } else {
-                    exact.insert(normalized);
-                }
-            }
-            None => {
-                warn!("Ignoring invalid Azure exclusion '{raw}' (expected organization/project[/repository])");
-            }
-        }
-    }
-
-    let globs = if has_glob {
-        match glob_builder.build() {
-            Ok(set) => Some(set),
-            Err(err) => {
-                warn!("Failed to build Azure exclusion patterns: {err}");
-                None
-            }
-        }
-    } else {
-        None
-    };
-
-    ExcludeMatcher { exact, globs }
+/// Builds the Azure exclusion matcher through the shared `git_host` builder.
+///
+/// Each raw entry is parsed with `parse_excluded_repo`, and the parsed name is
+/// lowercased before it is handed to the shared builder. Warnings emitted by
+/// the builder are labelled `"Azure"`.
+fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
+    git_host::build_exclude_matcher(
+        exclude_repos,
+        |raw| parse_excluded_repo(raw).map(|name| name.to_lowercase()),
+        "Azure",
+    )
 }

-fn should_exclude_repo(repo_url: &str, excludes: &ExcludeMatcher) -> bool {
-    if excludes.is_empty() {
-        return false;
-    }
-    if let Some(name) = parse_repo_identifier_from_url(repo_url) {
-        return excludes.matches(&name);
-    }
-    false
+/// Returns `true` when `repo_url` identifies a repository covered by `excludes`.
+///
+/// The identifier is extracted with `parse_repo_identifier_from_url`; URLs that
+/// cannot be parsed are never excluded.
+fn should_exclude_repo(repo_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
+    // Exclusion patterns are lowercased when the matcher is built, and the
+    // shared matcher compares case-sensitively, so lowercase the candidate
+    // here to keep Azure exclusions case-insensitive.
+    git_host::should_exclude_repo(repo_url, excludes, |url| {
+        parse_repo_identifier_from_url(url).map(|name| name.to_lowercase())
+    })
 }

 #[derive(Debug, Deserialize, Default)]
--- a/src/bitbucket.rs
+++ b/src/bitbucket.rs
@ -6,8 +6,8 @@ use std::{
    time::Duration,
 };

+use crate::git_host;
 use anyhow::{Context, Result};
-use globset::{Glob, GlobSet, GlobSetBuilder};
 use indicatif::{ProgressBar, ProgressStyle};
 use reqwest::Url;
 use serde::Deserialize;
@ -136,32 +136,6 @@ impl BitbucketKind {
    }
 }

-#[derive(Debug)]
-struct ExcludeMatcher {
-    exact: HashSet<String>,
-    globs: Option<GlobSet>,
-}
-
-impl ExcludeMatcher {
-    fn matches(&self, name: &str) -> bool {
-        if self.exact.contains(name) {
-            return true;
-        }
-        if let Some(globs) = &self.globs {
-            return globs.is_match(name);
-        }
-        false
-    }
-
-    fn is_empty(&self) -> bool {
-        self.exact.is_empty() && self.globs.is_none()
-    }
-}
-
-fn looks_like_glob(pattern: &str) -> bool {
-    pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
-}
-
 fn normalize_repo_identifier(owner: &str, repo: &str) -> Option<String> {
    let owner = owner.trim().trim_matches('/');
    let repo = repo.trim().trim_matches('/');
@ -207,58 +181,12 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
    parse_repo_name_from_path(trimmed)
 }

-fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
-    let mut exact = HashSet::new();
-    let mut glob_builder = GlobSetBuilder::new();
-    let mut has_glob = false;
-
-    for raw in exclude_repos {
-        match parse_excluded_repo(raw) {
-            Some(name) => {
-                if looks_like_glob(&name) {
-                    match Glob::new(&name) {
-                        Ok(glob) => {
-                            glob_builder.add(glob);
-                            has_glob = true;
-                        }
-                        Err(err) => {
-                            warn!("Ignoring invalid Bitbucket exclusion pattern '{raw}': {err}");
-                            exact.insert(name);
-                        }
-                    }
-                } else {
-                    exact.insert(name);
-                }
-            }
-            None => {
-                warn!("Ignoring invalid Bitbucket exclusion '{raw}' (expected owner/repo)");
-            }
-        }
-    }
-
-    let globs = if has_glob {
-        match glob_builder.build() {
-            Ok(set) => Some(set),
-            Err(err) => {
-                warn!("Failed to build Bitbucket exclusion patterns: {err}");
-                None
-            }
-        }
-    } else {
-        None
-    };
-
-    ExcludeMatcher { exact, globs }
+/// Builds the Bitbucket exclusion matcher through the shared `git_host` builder.
+///
+/// Raw entries are parsed with `parse_excluded_repo`; warnings emitted by the
+/// builder are labelled `"Bitbucket"`.
+fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
+    git_host::build_exclude_matcher(exclude_repos, parse_excluded_repo, "Bitbucket")
 }

-fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
-    if excludes.is_empty() {
-        return false;
-    }
-    if let Some(name) = parse_repo_name_from_url(clone_url) {
-        return excludes.matches(&name);
-    }
-    false
+/// Returns `true` when `clone_url` identifies a repository covered by `excludes`.
+///
+/// The repository name is extracted with `parse_repo_name_from_url` and checked
+/// by the shared `git_host::should_exclude_repo` helper.
+fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
+    git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url)
 }

 fn repo_clone_url_from_links(links: &[CloneLink]) -> Option<String> {
@ -343,7 +271,7 @@ async fn fetch_cloud_repositories(
    owner: &str,
    auth: &AuthConfig,
    repo_filter: RepoType,
-    excludes: &ExcludeMatcher,
+    excludes: &git_host::ExcludeMatcher,
    results: &mut Vec<String>,
 ) -> Result<()> {
    let mut next = base
@ -387,7 +315,7 @@ async fn fetch_server_repositories(
    path: &str,
    auth: &AuthConfig,
    repo_filter: RepoType,
-    excludes: &ExcludeMatcher,
+    excludes: &git_host::ExcludeMatcher,
    results: &mut Vec<String>,
 ) -> Result<()> {
    let mut start = 0u64;
--- a/src/content_type.rs
+++ b/src/content_type.rs
@ -1,232 +1,2 @@
-use once_cell::sync::Lazy;
-use std::path::Path;
-use tokei::LanguageType;
-
-// Precompute all (shebang_prefix_bytes, language) pairs once.
-// Sort longest-first so more specific shebangs win.
-static SHEBANG_PREFIXES: Lazy<Vec<(&'static [u8], LanguageType)>> = Lazy::new(|| {
-    let mut v = Vec::new();
-    for &(lang, shebangs) in LanguageType::list() {
-        for &sb in shebangs {
-            v.push((sb.as_bytes(), lang));
-        }
-    }
-    // Longest prefix first to prefer e.g. "#!/usr/bin/env python3" over "#!/usr/bin/env python"
-    v.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
-    v
-});
-
-/// The type of content detected.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum ContentType {
-    /// Unprintable or control-heavy data.
-    BINARY,
-    /// Mostly printable text.
-    TEXT,
-}
-
-/// Heuristic thresholds for text vs. binary detection.
-pub struct ContentInspector {
-    max_null_bytes: usize,
-    max_control_ratio: f64,
-}
-
-impl Default for ContentInspector {
-    fn default() -> Self {
-        Self { max_null_bytes: 4, max_control_ratio: 0.3 }
-    }
-}
-
-impl ContentInspector {
-    /// Create a new inspector with default thresholds.
-    #[inline]
-    pub fn new() -> Self {
-        Default::default()
-    }
-
-    /// Classify `bytes` as TEXT or BINARY:
-    ///
-    /// 1. If null-byte count > `max_null_bytes` -- `BINARY`.
-    /// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` →
-    ///    `BINARY`.
-    /// 3. Otherwise,  `TEXT`.
-    #[inline]
-    #[must_use]
-    pub fn inspect(&self, bytes: &[u8]) -> ContentType {
-        let nulls = bytes.iter().filter(|&&b| b == 0).count();
-        if nulls > self.max_null_bytes {
-            return ContentType::BINARY;
-        }
-        let controls =
-            bytes.iter().filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t')).count();
-        let ratio = if bytes.is_empty() { 0.0 } else { controls as f64 / bytes.len() as f64 };
-        if ratio > self.max_control_ratio {
-            ContentType::BINARY
-        } else {
-            ContentType::TEXT
-        }
-    }
-
-    /// Guess MIME type from `path` extension.
-    ///
-    /// Returns:
-    /// - `Some(mime)` if the extension is one of the known text or image types.
-    /// - `None` if there is no extension or it’s unrecognized.
-    #[inline]
-    #[must_use]
-    pub fn guess_mime_type(&self, path: &Path) -> Option<String> {
-        let ext = path.extension()?.to_str()?.to_ascii_lowercase();
-        let mime = match ext.as_str() {
-            "txt" | "md" | "rst" => "text/plain",
-            "html" | "htm" => "text/html",
-            "css" => "text/css",
-            "js" => "application/javascript",
-            "json" => "application/json",
-            "xml" => "application/xml",
-            "pdf" => "application/pdf",
-            "jpg" | "jpeg" => "image/jpeg",
-            "png" => "image/png",
-            "gif" => "image/gif",
-            _ => return None,
-        };
-        Some(mime.to_string())
-    }
-
-    /// Detect UTF-8 encoding by attempting a lossless conversion.
-    #[inline]
-    #[must_use]
-    pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
-        std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
-    }
-    /// Guess programming language with broad coverage using `tokei`.
-    ///
-    /// Strategy (no disk I/O):
-    /// 1) Try extension via `LanguageType::from_file_extension`.
-    /// 2) Handle common extensionless filenames (e.g., Makefile, Dockerfile, CMakeLists.txt).
-    /// 3) Parse an in-memory shebang (first line) against `LanguageType::shebangs`.
-    /// 4) Minimal content markers as a last resort.
-    ///
-    /// Returns the canonical `tokei` language name (e.g., `Rust`, `Bash`, `Python`).
-    #[inline]
-    #[must_use]
-    pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option<String> {
-        // 1) Extension mapping (fast, no I/O).
-        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
-            if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) {
-                return Some(lang.name().to_string());
-            }
-        }
-
-        // 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure).
-        if let Some(file) = path.file_name().and_then(|f| f.to_str()) {
-            match file {
-                "Makefile" | "makefile" => {
-                    return Some(LanguageType::Makefile.name().to_string());
-                }
-                "Dockerfile" | "dockerfile" => {
-                    return Some(LanguageType::Dockerfile.name().to_string());
-                }
-                "CMakeLists.txt" => {
-                    return Some(LanguageType::CMake.name().to_string());
-                }
-                "Rakefile" | "rakefile" => {
-                    return Some(LanguageType::Rakefile.name().to_string());
-                }
-                // Common ecosystem files; map to their primary language where sensible.
-                "Gemfile" | "gemfile" => {
-                    return Some(LanguageType::Ruby.name().to_string());
-                }
-                _ => {}
-            }
-        }
-
-        // 3) Shebang detection (in-memory): match by longest prefix, byte-wise (no UTF-8 needed).
-        if let Some(first_line) = content.split(|&b| b == b'\n').next() {
-            if first_line.starts_with(b"#!") {
-                for (prefix, lang) in SHEBANG_PREFIXES.iter() {
-                    if first_line.starts_with(prefix) {
-                        return Some(lang.name().to_string());
-                    }
-                }
-            }
-        }
-
-        // 4) Lightweight content markers to catch a few ubiquitous cases without I/O.
-        let s = String::from_utf8_lossy(content);
-        if s.contains("<?php") {
-            return Some(LanguageType::Php.name().to_string());
-        }
-        if s.contains("package main") {
-            return Some(LanguageType::Go.name().to_string());
-        }
-        if s.contains("public class") {
-            return Some(LanguageType::Java.name().to_string());
-        }
-        if s.contains("#!/usr/bin/env bash") || s.contains("#!/bin/bash") {
-            return Some(LanguageType::Bash.name().to_string());
-        }
-        if s.contains("#!/usr/bin/env python") {
-            return Some(LanguageType::Python.name().to_string());
-        }
-
-        None
-    }
-}
-
-/// Shorthand: inspect with default thresholds.
-#[inline]
-#[must_use]
-pub fn inspect(bytes: &[u8]) -> ContentType {
-    ContentInspector::default().inspect(bytes)
-}
-
-#[cfg(test)]
-mod tests {
-    use std::path::PathBuf;
-
-    use super::*;
-
-    #[test]
-    fn binary_vs_text() {
-        let ins = ContentInspector::default();
-        let bin = vec![0, 1, 2, 0, 0, 0, 5];
-        assert_eq!(ins.inspect(&bin), ContentType::BINARY);
-        let txt = b"Hello\nWorld";
-        assert_eq!(ins.inspect(txt), ContentType::TEXT);
-    }
-
-    #[test]
-    fn mime_guess() {
-        let ins = ContentInspector::default();
-        assert_eq!(ins.guess_mime_type(&PathBuf::from("a.md")), Some("text/plain".into()));
-        assert_eq!(ins.guess_mime_type(&PathBuf::from("img.png")), Some("image/png".into()));
-        assert_eq!(ins.guess_mime_type(&PathBuf::from("x.xyz")), None);
-    }
-
-    #[test]
-    fn charset_guess() {
-        let ins = ContentInspector::default();
-        assert_eq!(ins.guess_charset("ok".as_bytes()), Some("UTF-8".into()));
-        assert_eq!(ins.guess_charset(&[0xFF, 0xFE, 0xFD]), None);
-    }
-
-    #[test]
-    fn language_guess() {
-        let ins = ContentInspector::default();
-
-        // Compare case-insensitively by lowercasing both sides.
-        let rust =
-            ins.guess_language(&PathBuf::from("main.rs"), b"").map(|s| s.to_ascii_lowercase());
-        assert_eq!(rust, Some("rust".into()));
-
-        let php = ins
-            .guess_language(&PathBuf::from("x"), b"<?php echo; ?>")
-            .map(|s| s.to_ascii_lowercase());
-        assert_eq!(php, Some("php".into()));
-
-        let bash = ins
-            .guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi")
-            .map(|s| s.to_ascii_lowercase());
-        assert_eq!(bash, Some("bash".into()));
-    }
-}
+// Re-export from the canonical implementation in kingfisher-core.
+pub use kingfisher_core::content_type::*;
--- a/src/git_host.rs
+++ b/src/git_host.rs
@ -0,0 +1,121 @@
+//! Shared utilities for Git hosting platform integrations.
+//!
+//! This module provides the `ExcludeMatcher` struct and related helpers
+//! used across all platform integration modules (GitHub, GitLab, Azure,
+//! Bitbucket, Gitea).
+
+use std::collections::HashSet;
+
+use globset::{Glob, GlobSet, GlobSetBuilder};
+use tracing::warn;
+
+// -------------------------------------------------------------------------------------------------
+// ExcludeMatcher
+// -------------------------------------------------------------------------------------------------
+
+/// Matches repository names against exact strings and glob patterns.
+///
+/// Used by all platform integrations to filter out excluded repositories.
+/// Matching is case-sensitive; callers that need case-insensitive behaviour
+/// must normalize both the patterns and the candidate names themselves.
+#[derive(Debug)]
+pub struct ExcludeMatcher {
+    /// Names that must match the candidate verbatim.
+    exact: HashSet<String>,
+    /// Compiled glob patterns, or `None` when no glob pattern was accepted.
+    globs: Option<GlobSet>,
+}
+
+impl ExcludeMatcher {
+    /// Reports whether the matcher holds neither exact names nor glob patterns.
+    pub fn is_empty(&self) -> bool {
+        self.globs.is_none() && self.exact.is_empty()
+    }
+
+    /// Reports whether `name` is excluded.
+    ///
+    /// An exact-name hit wins immediately; otherwise the compiled glob set (if
+    /// there is one) decides.
+    pub fn matches(&self, name: &str) -> bool {
+        self.exact.contains(name)
+            || self.globs.as_ref().map_or(false, |set| set.is_match(name))
+    }
+}
+
+/// Reports whether `pattern` contains any of the glob metacharacters `*`, `?` or `[`.
+pub fn looks_like_glob(pattern: &str) -> bool {
+    pattern.chars().any(|ch| matches!(ch, '*' | '?' | '['))
+}
+
+/// Compiles a list of raw exclusion entries into an `ExcludeMatcher`.
+///
+/// `parse_fn` turns each raw entry into a normalized repository name (for
+/// example `"owner/repo"` or `"group/project"`), or returns `None` when the
+/// entry is invalid; invalid entries are skipped with a warning.
+///
+/// Parsed names containing glob metacharacters are compiled as globs. A name
+/// that fails to compile as a glob is reported and kept as an exact name
+/// instead. All other names are stored as exact names.
+///
+/// `platform_name` only appears in warning messages (e.g. `"GitHub"`,
+/// `"GitLab"`).
+pub fn build_exclude_matcher(
+    exclude_repos: &[String],
+    parse_fn: impl Fn(&str) -> Option<String>,
+    platform_name: &str,
+) -> ExcludeMatcher {
+    let mut exact = HashSet::new();
+    let mut builder = GlobSetBuilder::new();
+    let mut saw_glob = false;
+
+    for raw in exclude_repos {
+        let name = match parse_fn(raw) {
+            Some(parsed) => parsed,
+            None => {
+                warn!("Ignoring invalid {platform_name} exclusion '{raw}'");
+                continue;
+            }
+        };
+
+        // Plain names need no compilation.
+        if !looks_like_glob(&name) {
+            exact.insert(name);
+            continue;
+        }
+
+        match Glob::new(&name) {
+            Ok(compiled) => {
+                builder.add(compiled);
+                saw_glob = true;
+            }
+            Err(err) => {
+                warn!(
+                    "Ignoring invalid {platform_name} exclusion pattern '{raw}': {err}"
+                );
+                // Fall back to treating the unparsable pattern as a literal name.
+                exact.insert(name);
+            }
+        }
+    }
+
+    // Only build the set when at least one glob was accepted.
+    let globs = match saw_glob.then(|| builder.build()) {
+        Some(Ok(set)) => Some(set),
+        Some(Err(err)) => {
+            warn!("Failed to build {platform_name} exclusion patterns: {err}");
+            None
+        }
+        None => None,
+    };
+
+    ExcludeMatcher { exact, globs }
+}
+
+/// Decides whether the repository at `repo_url` is excluded.
+///
+/// `extract_name` derives a normalized repository identifier from the URL
+/// (e.g. `"owner/repo"` from `"https://github.com/owner/repo.git"`). A URL for
+/// which it returns `None` is never excluded.
+pub fn should_exclude_repo(
+    repo_url: &str,
+    excludes: &ExcludeMatcher,
+    extract_name: impl Fn(&str) -> Option<String>,
+) -> bool {
+    // The emptiness check comes first so `extract_name` is not invoked at all
+    // when there is nothing to match against.
+    !excludes.is_empty()
+        && extract_name(repo_url).map_or(false, |name| excludes.matches(&name))
+}
--- a/src/gitea.rs
+++ b/src/gitea.rs
@ -1,14 +1,13 @@
 use std::{collections::HashSet, env, str::FromStr, time::Duration};

 use anyhow::{anyhow, Result};
-use globset::{Glob, GlobSet, GlobSetBuilder};
 use indicatif::{ProgressBar, ProgressStyle};
 use reqwest::StatusCode;
 use serde::Deserialize;
 use tracing::warn;
 use url::Url;

-use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT};
+use crate::{git_host, git_url::GitUrl, validation::GLOBAL_USER_AGENT};

 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum RepoType {
@ -55,31 +54,6 @@ struct GiteaOrganization {
    username: String,
 }

-struct ExcludeMatcher {
-    exact: HashSet<String>,
-    globs: Option<GlobSet>,
-}
-
-impl ExcludeMatcher {
-    fn matches(&self, name: &str) -> bool {
-        if self.exact.contains(name) {
-            return true;
-        }
-        if let Some(globs) = &self.globs {
-            return globs.is_match(name);
-        }
-        false
-    }
-
-    fn is_empty(&self) -> bool {
-        self.exact.is_empty() && self.globs.is_none()
-    }
-}
-
-fn looks_like_glob(pattern: &str) -> bool {
-    pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
-}
-
 fn normalize_repo_identifier(raw: &str) -> Option<String> {
    let trimmed = raw.trim().trim_matches('/');
    if trimmed.is_empty() {
@ -113,51 +87,11 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
    normalize_repo_identifier(trimmed)
 }

-fn build_exclude_matcher(excludes: &[String]) -> ExcludeMatcher {
-    let mut exact = HashSet::new();
-    let mut glob_builder = GlobSetBuilder::new();
-    let mut has_glob = false;
-
-    for raw in excludes {
-        match parse_excluded_repo(raw) {
-            Some(name) => {
-                if looks_like_glob(&name) {
-                    match Glob::new(&name) {
-                        Ok(glob) => {
-                            glob_builder.add(glob);
-                            has_glob = true;
-                        }
-                        Err(err) => {
-                            warn!("Ignoring invalid Gitea exclusion pattern '{raw}': {err}");
-                            exact.insert(name);
-                        }
-                    }
-                } else {
-                    exact.insert(name);
-                }
-            }
-            None => {
-                warn!("Ignoring invalid Gitea exclusion '{raw}' (expected owner/repo)");
-            }
-        }
-    }
-
-    let globs = if has_glob {
-        match glob_builder.build() {
-            Ok(set) => Some(set),
-            Err(err) => {
-                warn!("Failed to build Gitea exclusion patterns: {err}");
-                None
-            }
-        }
-    } else {
-        None
-    };
-
-    ExcludeMatcher { exact, globs }
+/// Builds the Gitea exclusion matcher through the shared `git_host` builder.
+///
+/// Raw entries are parsed with `parse_excluded_repo`; warnings emitted by the
+/// builder are labelled `"Gitea"`.
+fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
+    git_host::build_exclude_matcher(exclude_repos, parse_excluded_repo, "Gitea")
 }

-fn should_exclude_repo(repo: &GiteaRepository, excludes: &ExcludeMatcher) -> bool {
+fn should_exclude_repo(repo: &GiteaRepository, excludes: &git_host::ExcludeMatcher) -> bool {
    if excludes.is_empty() {
        return false;
    }
@ -169,7 +103,7 @@ async fn fetch_paginated_repos(
    token: Option<&str>,
    mut url: Url,
    repo_filter: RepoType,
-    excludes: &ExcludeMatcher,
+    excludes: &git_host::ExcludeMatcher,
    progress: Option<&ProgressBar>,
 ) -> Result<Vec<String>> {
    let mut page = 1u32;
@ -221,7 +155,7 @@ async fn fetch_user_repos(
    api_url: &Url,
    username: &str,
    repo_filter: RepoType,
-    excludes: &ExcludeMatcher,
+    excludes: &git_host::ExcludeMatcher,
    progress: Option<&ProgressBar>,
 ) -> Result<Vec<String>> {
    let endpoint = format!("users/{}/repos", username);
@ -235,7 +169,7 @@ async fn fetch_org_repos(
    api_url: &Url,
    org: &str,
    repo_filter: RepoType,
-    excludes: &ExcludeMatcher,
+    excludes: &git_host::ExcludeMatcher,
    progress: Option<&ProgressBar>,
 ) -> Result<Vec<String>> {
    let endpoint = format!("orgs/{}/repos", org);
--- a/src/github.rs
+++ b/src/github.rs
@ -7,7 +7,6 @@ use std::{
 };

 use anyhow::{Context, Result};
-use globset::{Glob, GlobSet, GlobSetBuilder};
 use indicatif::{ProgressBar, ProgressStyle};
 use octorust::{
    auth::Credentials,
@ -119,83 +118,14 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
    parse_repo_name_from_path(trimmed)
 }

-struct ExcludeMatcher {
-    exact: HashSet<String>,
-    globs: Option<GlobSet>,
+use crate::git_host;
+
+/// Builds the GitHub exclusion matcher through the shared `git_host` builder.
+///
+/// Raw entries are parsed with `parse_excluded_repo`; warnings emitted by the
+/// builder are labelled `"GitHub"`.
+fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
+    git_host::build_exclude_matcher(exclude_repos, parse_excluded_repo, "GitHub")
 }

-impl ExcludeMatcher {
-    fn is_empty(&self) -> bool {
-        self.exact.is_empty() && self.globs.is_none()
-    }
-
-    fn matches(&self, name: &str) -> bool {
-        if self.exact.contains(name) {
-            return true;
-        }
-        if let Some(globs) = &self.globs {
-            return globs.is_match(name);
-        }
-        false
-    }
-}
-
-fn looks_like_glob(pattern: &str) -> bool {
-    pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
-}
-
-fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
-    let mut exact = HashSet::new();
-    let mut glob_builder = GlobSetBuilder::new();
-    let mut has_glob = false;
-
-    for raw in exclude_repos {
-        match parse_excluded_repo(raw) {
-            Some(name) => {
-                if looks_like_glob(&name) {
-                    match Glob::new(&name) {
-                        Ok(glob) => {
-                            glob_builder.add(glob);
-                            has_glob = true;
-                        }
-                        Err(err) => {
-                            warn!("Ignoring invalid GitHub exclusion pattern '{raw}': {err}");
-                            exact.insert(name);
-                        }
-                    }
-                } else {
-                    exact.insert(name);
-                }
-            }
-            None => {
-                warn!("Ignoring invalid GitHub exclusion '{raw}' (expected owner/repo)");
-            }
-        }
-    }
-
-    let globs = if has_glob {
-        match glob_builder.build() {
-            Ok(set) => Some(set),
-            Err(err) => {
-                warn!("Failed to build GitHub exclusion patterns: {err}");
-                None
-            }
-        }
-    } else {
-        None
-    };
-
-    ExcludeMatcher { exact, globs }
-}
-
-fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
-    if excludes.is_empty() {
-        return false;
-    }
-    if let Some(name) = parse_repo_name_from_url(clone_url) {
-        return excludes.matches(&name);
-    }
-    false
+/// Returns `true` when `clone_url` identifies a repository covered by `excludes`.
+///
+/// The repository name is extracted with `parse_repo_name_from_url` and checked
+/// by the shared `git_host::should_exclude_repo` helper.
+fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
+    git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url)
 }
 fn create_github_client(github_url: &url::Url, ignore_certs: bool) -> Result<Arc<Client>> {
    // Try personal access token
--- a/src/gitlab.rs
+++ b/src/gitlab.rs
@ -16,7 +16,6 @@ use gitlab::{
    },
    Gitlab, GitlabBuilder,
 };
-use globset::{Glob, GlobSet, GlobSetBuilder};
 use indicatif::{ProgressBar, ProgressStyle};
 use reqwest::StatusCode;
 use serde::Deserialize;
@ -25,7 +24,7 @@ use tokio::task;
 use tracing::{info, warn};
 use url::{form_urlencoded, Url};

-use crate::{findings_store, git_url::GitUrl};
+use crate::{findings_store, git_host, git_url::GitUrl};
 use std::str::FromStr;

 #[derive(Deserialize)]
@ -128,83 +127,12 @@ fn parse_excluded_project(raw: &str) -> Option<String> {
    parse_project_path(trimmed)
 }

-struct ExcludeMatcher {
-    exact: HashSet<String>,
-    globs: Option<GlobSet>,
+/// Builds the GitLab exclusion matcher through the shared `git_host` builder.
+///
+/// Raw entries are parsed with `parse_excluded_project`; warnings emitted by
+/// the builder are labelled `"GitLab"`.
+fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
+    git_host::build_exclude_matcher(exclude_repos, parse_excluded_project, "GitLab")
 }

-impl ExcludeMatcher {
-    fn is_empty(&self) -> bool {
-        self.exact.is_empty() && self.globs.is_none()
-    }
-
-    fn matches(&self, name: &str) -> bool {
-        if self.exact.contains(name) {
-            return true;
-        }
-        if let Some(globs) = &self.globs {
-            return globs.is_match(name);
-        }
-        false
-    }
-}
-
-fn looks_like_glob(pattern: &str) -> bool {
-    pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
-}
-
-fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
-    let mut exact = HashSet::new();
-    let mut glob_builder = GlobSetBuilder::new();
-    let mut has_glob = false;
-
-    for raw in exclude_repos {
-        match parse_excluded_project(raw) {
-            Some(name) => {
-                if looks_like_glob(&name) {
-                    match Glob::new(&name) {
-                        Ok(glob) => {
-                            glob_builder.add(glob);
-                            has_glob = true;
-                        }
-                        Err(err) => {
-                            warn!("Ignoring invalid GitLab exclusion pattern '{raw}': {err}");
-                            exact.insert(name);
-                        }
-                    }
-                } else {
-                    exact.insert(name);
-                }
-            }
-            None => {
-                warn!("Ignoring invalid GitLab exclusion '{raw}' (expected group/project)");
-            }
-        }
-    }
-
-    let globs = if has_glob {
-        match glob_builder.build() {
-            Ok(set) => Some(set),
-            Err(err) => {
-                warn!("Failed to build GitLab exclusion patterns: {err}");
-                None
-            }
-        }
-    } else {
-        None
-    };
-
-    ExcludeMatcher { exact, globs }
-}
-
-fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
-    if excludes.is_empty() {
-        return false;
-    }
-    if let Some(name) = parse_project_path_from_url(clone_url) {
-        return excludes.matches(&name);
-    }
-    false
+/// Returns `true` when `clone_url` identifies a project covered by `excludes`.
+///
+/// The project path is extracted with `parse_project_path_from_url` and checked
+/// by the shared `git_host::should_exclude_repo` helper.
+fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
+    git_host::should_exclude_repo(clone_url, excludes, parse_project_path_from_url)
 }

 fn create_gitlab_client(gitlab_url: &Url, ignore_certs: bool) -> Result<Gitlab> {
--- a/src/lib.rs
+++ b/src/lib.rs
@ -24,6 +24,7 @@ pub mod findings_store;
 pub mod gcs;
 pub mod git_binary;
 pub mod git_commit_metadata;
+pub mod git_host;
 pub mod git_metadata_graph;
 mod git_repo_enumerator;
 pub mod git_url;
--- a/src/matcher/base64_decode.rs
+++ b/src/matcher/base64_decode.rs
@ -0,0 +1,2 @@
+// Re-export from the canonical implementation in kingfisher-scanner.
+pub use kingfisher_scanner::primitives::{get_base64_strings, DecodedData};
--- a/src/matcher/captures.rs
+++ b/src/matcher/captures.rs
@ -0,0 +1,140 @@
+use bstr::BString;
+use regex::bytes::Regex;
+use schemars::{
+    gen::SchemaGenerator,
+    schema::{ArrayValidation, InstanceType, Schema},
+    JsonSchema,
+};
+use serde::{Deserialize, Serialize};
+use smallvec::SmallVec;
+
+use crate::{snippet::Base64BString, util::intern};
+
+// -------------------------------------------------------------------------------------------------
+// Group
+// -------------------------------------------------------------------------------------------------
+// A single regex capture group, stored as an owned copy of the bytes it matched.
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
+pub struct Group(pub Base64BString);
+impl Group {
+    /// Builds a `Group` holding an owned copy of the bytes covered by `m`.
+    pub fn new(m: regex::bytes::Match<'_>) -> Self {
+        let matched_bytes = BString::from(m.as_bytes());
+        Self(Base64BString(matched_bytes))
+    }
+}
+// -------------------------------------------------------------------------------------------------
+// Groups
+// -------------------------------------------------------------------------------------------------
+/// A list of [`Group`] values.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub struct Groups(pub SmallVec<[Group; 1]>);
+impl JsonSchema for Groups {
+    /// Schema name reported for this type.
+    fn schema_name() -> String {
+        String::from("Groups")
+    }
+
+    /// Describes `Groups` as a JSON array whose items follow the `Group` schema.
+    fn json_schema(gen: &mut SchemaGenerator) -> Schema {
+        let item_schema = gen.subschema_for::<Group>();
+        let array_rules =
+            ArrayValidation { items: Some(item_schema.into()), ..Default::default() };
+        let object = schemars::schema::SchemaObject {
+            instance_type: Some(InstanceType::Array.into()),
+            array: Some(Box::new(array_rules)),
+            ..Default::default()
+        };
+        Schema::Object(object)
+    }
+}
+
+// One serialized regex capture: optional group name, group index, the start/end offsets
+// reported by the regex match, and the interned captured text.
+//
+// NOTE(review): the schema is derived here while `Serialize` is implemented by hand further
+// down in this file. The `serde` skip attributes on `value` appear to hide it from the derived
+// schema even though the manual serializer emits a `value` field (the display value) —
+// confirm this mismatch is intentional.
+#[derive(Debug, Clone, JsonSchema)]
+pub struct SerializableCapture {
+    pub name: Option<&'static str>,
+    pub match_number: i32,
+    pub start: usize,
+    pub end: usize,
+    /// Interned original (unredacted) value.
+    #[serde(skip_serializing, skip_deserializing)]
+    pub value: &'static str,
+}
+
+impl SerializableCapture {
+    /// Gives back the interned capture text exactly as it was stored.
+    pub fn raw_value(&self) -> &'static str {
+        let raw: &'static str = self.value;
+        raw
+    }
+
+    /// Gives back the text intended for user-facing output, as produced by
+    /// `crate::util::display_value` from the stored value.
+    pub fn display_value(&self) -> std::borrow::Cow<'static, str> {
+        let raw = self.value;
+        crate::util::display_value(raw)
+    }
+}
+
+impl serde::Serialize for SerializableCapture {
+    /// Emits the capture as a five-field struct; `value` carries the display value rather
+    /// than the raw captured text.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+
+        let mut out = serializer.serialize_struct("SerializableCapture", 5)?;
+        out.serialize_field("name", &self.name)?;
+        out.serialize_field("match_number", &self.match_number)?;
+        out.serialize_field("start", &self.start)?;
+        out.serialize_field("end", &self.end)?;
+        // The user-facing form is written under the `value` key.
+        let shown = self.display_value();
+        out.serialize_field("value", &shown)?;
+        out.end()
+    }
+}
+
+// Serializable list of the captures produced by one regex match.
+#[derive(Debug, Clone, Serialize, JsonSchema)]
+pub struct SerializableCaptures {
+    #[schemars(with = "Vec<SerializableCapture>")]
+    pub captures: SmallVec<[SerializableCapture; 2]>,
+}
+
+impl SerializableCaptures {
+    /// Converts regex `captures` into their serializable form.
+    ///
+    /// When the match has explicit groups, only groups `1..` that participated in the match
+    /// are kept; when only the whole match exists, group 0 is recorded as the single entry.
+    /// Group names come from `re` and are interned; `_input` is unused.
+    pub fn from_captures(captures: &regex::bytes::Captures, _input: &[u8], re: &Regex) -> Self {
+        let group_names: SmallVec<[Option<&'static str>; 4]> =
+            re.capture_names().map(|name| name.map(intern)).collect();
+
+        // Group 0 (the whole match) is only reported when it is the sole group; an empty
+        // range (no groups at all) yields an empty list.
+        let total = captures.len();
+        let first_group = if total > 1 { 1 } else { 0 };
+
+        let serialized: SmallVec<[SerializableCapture; 2]> = (first_group..total)
+            .filter_map(|idx| {
+                // Groups that did not participate in the match are skipped.
+                let cap = captures.get(idx)?;
+                let text = String::from_utf8_lossy(cap.as_bytes());
+                let value = intern(text.as_ref());
+                let name = group_names.get(idx).copied().flatten();
+
+                Some(SerializableCapture {
+                    name,
+                    match_number: i32::try_from(idx).unwrap_or(0),
+                    start: cap.start(),
+                    end: cap.end(),
+                    value,
+                })
+            })
+            .collect();
+
+        SerializableCaptures { captures: serialized }
+    }
+}
--- a/src/matcher/conversion.rs
+++ b/src/matcher/conversion.rs
@ -0,0 +1,251 @@
+use std::sync::Arc;
+
+use http::StatusCode;
+use schemars::JsonSchema;
+use serde::Serialize;
+use xxhash_rust::xxh3::xxh3_64;
+
+use crate::{
+    blob::BlobId,
+    location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
+    rules::rule::Rule,
+    validation_body::{self, ValidationResponseBody},
+};
+
+use super::{captures::SerializableCaptures, BlobMatch};
+
+use kingfisher_scanner::primitives::compute_finding_fingerprint;
+
+// -------------------------------------------------------------------------------------------------
+// OwnedBlobMatch
+// -------------------------------------------------------------------------------------------------
+
+/// Owned form of a `BlobMatch`: stores the `BlobId` by value (where `BlobMatch` holds a
+/// reference) and additionally carries a `finding_fingerprint` and `dependent_captures`.
+#[derive(Clone)]
+pub struct OwnedBlobMatch {
+    /// The rule that was matched
+    pub rule: Arc<Rule>,
+    /// Identifier of the blob in which the match was found
+    pub blob_id: BlobId,
+    /// The unique content-based identifier of this match
+    pub finding_fingerprint: u64,
+    /// Offset span of the matching input (copied from the source `BlobMatch` / `Match`)
+    pub matching_input_offset_span: OffsetSpan,
+    /// The capture groups from the match
+    pub captures: SerializableCaptures,
+    /// Body of the validation response
+    pub validation_response_body: ValidationResponseBody,
+    /// Status code of the validation response
+    pub validation_response_status: StatusCode,
+    /// Whether validation succeeded
+    pub validation_success: bool,
+    /// Entropy value carried over from the source match
+    pub calculated_entropy: f32,
+    /// Flag carried over from the source match.
+    /// NOTE(review): presumably marks matches found in base64-decoded content — confirm.
+    pub is_base64: bool,
+    /// Variables captured from dependent rules (from depends_on_rule).
+    /// Maps variable name (uppercase) to captured value.
+    pub dependent_captures: std::collections::BTreeMap<String, String>,
+}
+
+impl OwnedBlobMatch {
+    /// Rebuilds an `OwnedBlobMatch` from a `Match`, attaching the supplied `rule`.
+    ///
+    /// A stored status code that `StatusCode::from_u16` rejects falls back to
+    /// `StatusCode::CONTINUE`.
+    pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
+        let status =
+            StatusCode::from_u16(m.validation_response_status).unwrap_or(StatusCode::CONTINUE);
+        let offset_span = m.location.offset_span;
+
+        Self {
+            rule,
+            blob_id: m.blob_id,
+            finding_fingerprint: m.finding_fingerprint,
+            matching_input_offset_span: offset_span,
+            captures: m.groups.clone(),
+            validation_response_body: m.validation_response_body.clone(),
+            validation_response_status: status,
+            validation_success: m.validation_success,
+            calculated_entropy: m.calculated_entropy,
+            is_base64: m.is_base64,
+            dependent_captures: m.dependent_captures.clone(),
+        }
+    }
+
+    /// Converts a `BlobMatch` into an `OwnedBlobMatch` and computes its `finding_fingerprint`.
+    ///
+    /// The fingerprint is derived from the selected capture text, the blob id rendered as a
+    /// string, and the start/end offsets of the match. `dependent_captures` starts out empty.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a match offset cannot be represented as `u64`.
+    pub fn from_blob_match(blob_match: BlobMatch) -> Self {
+        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
+        //
+        // This indexing is intentionally different from the internal `validation_dedup_key()`
+        // (which uses get(0)) to maintain stable external fingerprints. Changing this would break:
+        // - Historical baselines that rely on fingerprint matching
+        // - Dedup entries stored in external systems
+        //
+        // For rules with nested captures like (?<REGEX>...(ABC)...), this may pick up
+        // the inner group, but that behavior is now established and must be preserved.
+        //
+        // `raw_value()` already yields an interned `&'static str`, so the text is used directly:
+        // no byte copy or UTF-8 re-validation is needed. A missing capture maps to "".
+        let finding_value: &'static str = blob_match
+            .captures
+            .captures
+            .get(1)
+            .or_else(|| blob_match.captures.captures.get(0))
+            .map(|capture| capture.raw_value())
+            .unwrap_or("");
+
+        // Use blob_id as the file/commit identifier
+        let file_or_commit = blob_match.blob_id.to_string();
+
+        let offset_span = blob_match.matching_input_offset_span;
+        let offset_start: u64 =
+            offset_span.start.try_into().expect("match start offset should fit in u64");
+        let offset_end: u64 =
+            offset_span.end.try_into().expect("match end offset should fit in u64");
+
+        let finding_fingerprint =
+            compute_finding_fingerprint(finding_value, &file_or_commit, offset_start, offset_end);
+
+        // `finding_value` does not borrow from `blob_match`, so the captures can be moved
+        // into the result instead of cloned.
+        OwnedBlobMatch {
+            rule: blob_match.rule,
+            blob_id: *blob_match.blob_id,
+            finding_fingerprint,
+            matching_input_offset_span: offset_span,
+            captures: blob_match.captures,
+            validation_response_body: blob_match.validation_response_body,
+            validation_response_status: blob_match.validation_response_status,
+            validation_success: blob_match.validation_success,
+            calculated_entropy: blob_match.calculated_entropy,
+            is_base64: blob_match.is_base64,
+            dependent_captures: std::collections::BTreeMap::new(),
+        }
+    }
+}
+
+// -------------------------------------------------------------------------------------------------
+// Match
+// -------------------------------------------------------------------------------------------------
+
+// Reporting-level record of one finding. Built from an `OwnedBlobMatch` by
+// `convert_owned_blobmatch_to_match`. Field doc comments below are picked up by the derived
+// JSON schema as descriptions, so they must describe the field they sit on.
+#[derive(Debug, Clone, Serialize, JsonSchema)]
+pub struct Match {
+    /// The location of the entire matching content
+    pub location: Location,
+
+    /// The capture groups
+    pub groups: SerializableCaptures, // Store serialized captures
+
+    /// unique identifier of file / blob where this match was found
+    pub blob_id: BlobId,
+
+    /// The unique content-based identifier of this match
+    pub finding_fingerprint: u64,
+
+    /// The rule that produced this match
+    #[serde(skip_serializing)]
+    #[schemars(skip)]
+    pub rule: Arc<Rule>,
+
+    /// Validation Body
+    #[serde(
+        default,
+        serialize_with = "validation_body::serialize",
+        deserialize_with = "validation_body::deserialize"
+    )]
+    #[schemars(schema_with = "validation_body::schema")]
+    pub validation_response_body: ValidationResponseBody,
+
+    /// Validation Status Code
+    pub validation_response_status: u16,
+
+    /// Validation Success
+    pub validation_success: bool,
+
+    /// Calculated entropy of the matched secret
+    pub calculated_entropy: f32,
+
+    // Copied from `Rule::visible()` of the producing rule when the match is built.
+    pub visible: bool,
+    #[serde(default)]
+    pub is_base64: bool,
+
+    /// Variables captured from dependent rules (from depends_on_rule).
+    /// Maps variable name (uppercase) to captured value.
+    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
+    pub dependent_captures: std::collections::BTreeMap<String, String>,
+}
+
+impl Match {
+    /// Builds a `Match` from an `OwnedBlobMatch`, recomputing the finding fingerprint.
+    ///
+    /// * `loc_mapping` - when present, used to translate the offset span into a source span;
+    ///   when absent the source span is all zeros.
+    /// * `owned_blob_match` - the match to convert; its fields are cloned or copied.
+    /// * `origin_type` - passed to `compute_finding_fingerprint` as the file/commit component.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a match offset cannot be represented as `u64`.
+    #[inline]
+    pub fn convert_owned_blobmatch_to_match<'a>(
+        loc_mapping: Option<&'a LocationMapping<'a>>,
+        owned_blob_match: &'a OwnedBlobMatch,
+        origin_type: &'a str,
+    ) -> Self {
+        let offset_span = owned_blob_match.matching_input_offset_span;
+
+        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
+        // See comment in from_blob_match() for why this differs from validation_dedup_key().
+        //
+        // The fingerprint is based on the content of the secret. `raw_value()` is already a
+        // `&str`, so it is used as-is; a missing capture maps to "".
+        let captured = &owned_blob_match.captures.captures;
+        let finding_value_for_fp: &str = captured
+            .get(1)
+            .or_else(|| captured.get(0))
+            .map(|capture| capture.raw_value())
+            .unwrap_or("");
+
+        let source_span =
+            loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
+                start: SourcePoint { line: 0, column: 0 },
+                end: SourcePoint { line: 0, column: 0 },
+            });
+
+        let offset_start: u64 =
+            offset_span.start.try_into().expect("match start offset should fit in u64");
+        let offset_end: u64 =
+            offset_span.end.try_into().expect("match end offset should fit in u64");
+
+        let finding_fingerprint = compute_finding_fingerprint(
+            finding_value_for_fp,
+            origin_type, // file_or_commit,
+            offset_start,
+            offset_end,
+        );
+
+        Match {
+            rule: owned_blob_match.rule.clone(),
+            visible: owned_blob_match.rule.visible().to_owned(),
+            // `source_span` is not needed afterwards, so it is moved rather than cloned.
+            location: Location::with_source_span(offset_span, Some(source_span)),
+            groups: owned_blob_match.captures.clone(),
+            blob_id: owned_blob_match.blob_id,
+            finding_fingerprint,
+            validation_response_body: owned_blob_match.validation_response_body.clone(),
+            validation_response_status: owned_blob_match.validation_response_status.as_u16(),
+            validation_success: owned_blob_match.validation_success,
+            calculated_entropy: owned_blob_match.calculated_entropy,
+            is_base64: owned_blob_match.is_base64,
+            dependent_captures: owned_blob_match.dependent_captures.clone(),
+        }
+    }
+
+    /// Gives back a copy of the `blob_id` this match was found in.
+    pub fn get_blob_id(&self) -> BlobId {
+        let id: BlobId = self.blob_id;
+        id
+    }
+
+    /// Derives a decimal-string id for this finding.
+    ///
+    /// The id is the xxh3-64 hash of the rule's SHA-1 fingerprint, a NUL separator, and the
+    /// JSON form of the capture groups, with the top bit cleared so the value fits in a
+    /// non-negative `i64`.
+    pub fn finding_id(&self) -> String {
+        let rule_fingerprint = self.rule.finding_sha1_fingerprint();
+
+        let mut payload = Vec::with_capacity(128);
+        payload.extend_from_slice(rule_fingerprint.as_bytes());
+        payload.push(0);
+        serde_json::to_writer(&mut payload, &self.groups)
+            .expect("should be able to serialize groups as JSON");
+
+        // Mask off the sign bit so the number is positive and within i64 range.
+        let id = xxh3_64(&payload) & 0x7FFF_FFFF_FFFF_FFFF;
+        id.to_string()
+    }
+}
+
+// -------------------------------------------------------------------------------------------------
+// MatcherStats
+// -------------------------------------------------------------------------------------------------
+
+/// Counters for blobs and bytes seen and scanned.
+#[derive(Debug, Default, Clone)]
+pub struct MatcherStats {
+    pub blobs_seen: u64,
+    pub blobs_scanned: u64,
+    pub bytes_seen: u64,
+    pub bytes_scanned: u64,
+}
+
+impl MatcherStats {
+    /// Adds every counter of `other` onto the matching counter of `self`.
+    pub fn update(&mut self, other: &Self) {
+        let &Self { blobs_seen, blobs_scanned, bytes_seen, bytes_scanned } = other;
+
+        self.blobs_seen += blobs_seen;
+        self.blobs_scanned += blobs_scanned;
+        self.bytes_seen += bytes_seen;
+        self.bytes_scanned += bytes_scanned;
+    }
+}
--- a/src/matcher/dedup.rs
+++ b/src/matcher/dedup.rs
@ -0,0 +1,2 @@
+// Re-export from the canonical implementation in kingfisher-scanner.
+pub(crate) use kingfisher_scanner::primitives::{compute_match_key, record_match};
--- a/src/matcher/filter.rs
+++ b/src/matcher/filter.rs
@ -0,0 +1,274 @@
+use std::sync::Arc;
+
+use http::StatusCode;
+use regex::bytes::Regex;
+use rustc_hash::{FxHashMap, FxHashSet};
+use tracing::debug;
+
+use crate::{
+    blob::Blob,
+    entropy::calculate_shannon_entropy,
+    inline_ignore::InlineIgnoreConfig,
+    location::OffsetSpan,
+    origin::OriginSet,
+    rule_profiling::{ConcurrentRuleProfiler, RuleTimer},
+    rules::rule::{PatternRequirementContext, PatternValidationResult, Rule, Validation},
+    safe_list::{is_safe_match, is_user_match},
+    validation::{is_parseable_mongodb_uri, is_parseable_mysql_uri, is_parseable_postgres_uri},
+};
+
+use super::{
+    captures::SerializableCaptures,
+    dedup::{compute_match_key, record_match},
+    BlobMatch,
+};
+
+// Re-use the canonical secret capture selection from kingfisher-scanner.
+use kingfisher_scanner::primitives::find_secret_capture;
+
+// -------------------------------------------------------------------------------------------------
+// Entropy and safe-list check
+// -------------------------------------------------------------------------------------------------
+
+/// Gate a candidate on its Shannon entropy and the safe lists.
+///
+/// Yields `Some(entropy)` when the candidate should be kept, and `None` when its entropy is
+/// not above `min_entropy` or when `is_safe_match` / `is_user_match` flags it.
+fn check_entropy_and_safelist(
+    entropy_bytes: &[u8],
+    full_bytes: &[u8],
+    min_entropy: f32,
+) -> Option<f32> {
+    let entropy = calculate_shannon_entropy(entropy_bytes);
+
+    // Checks run in this order and short-circuit on the first hit.
+    let rejected = entropy <= min_entropy
+        || is_safe_match(entropy_bytes)
+        || is_user_match(entropy_bytes, full_bytes);
+
+    if rejected {
+        debug!("Skipping match with entropy {} <= {} or safe match", entropy, min_entropy);
+        return None;
+    }
+
+    Some(entropy)
+}
+
+// -------------------------------------------------------------------------------------------------
+// Pattern requirements check
+// -------------------------------------------------------------------------------------------------
+
+/// Applies the rule's optional pattern requirements to a candidate match.
+///
+/// Yields `true` when the rule declares no requirements or validation passes. Every failure
+/// variant is logged at debug level and yields `false`.
+fn check_pattern_requirements(
+    rule: &Rule,
+    re: &Regex,
+    captures: &regex::bytes::Captures,
+    full_bytes: &[u8],
+    entropy_bytes: &[u8],
+    respect_ignore_if_contains: bool,
+) -> bool {
+    let requirements = match rule.pattern_requirements() {
+        Some(reqs) => reqs,
+        None => return true,
+    };
+
+    // The full match is validated when the regex has any named group or when
+    // `captures.len()` (which counts group 0) exceeds 2; otherwise only the secret bytes are.
+    let validate_full_match = re.capture_names().any(|n| n.is_some()) || captures.len() > 2;
+    let candidate = if validate_full_match { full_bytes } else { entropy_bytes };
+
+    let context = PatternRequirementContext { regex: re, captures, full_match: full_bytes };
+    let outcome = requirements.validate(candidate, Some(context), respect_ignore_if_contains);
+
+    // Only `Passed` keeps the match; the other variants differ just in what gets logged.
+    match outcome {
+        PatternValidationResult::Passed => return true,
+        PatternValidationResult::Failed => {
+            debug!(
+                "Skipping match that does not meet character requirements for rule {}",
+                rule.id()
+            );
+        }
+        PatternValidationResult::FailedChecksum { actual_len, expected_len } => {
+            debug!(
+                "Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})",
+                rule.id(),
+                actual_len,
+                expected_len
+            );
+        }
+        PatternValidationResult::IgnoredBySubstring { matched_term } => {
+            debug!(
+                "Skipping match for rule {} because it contains ignored term {matched_term}",
+                rule.id()
+            );
+        }
+    }
+    false
+}
+
+// -------------------------------------------------------------------------------------------------
+// URI validation
+// -------------------------------------------------------------------------------------------------
+
+/// Checks that a match for a database rule is a parseable connection URI.
+///
+/// Yields `true` when the rule has no validation, when its validation is not one of
+/// MongoDB / Postgres / MySQL, or when the bytes are UTF-8 and accepted by the matching
+/// `is_parseable_*_uri` helper. Otherwise the reason is logged and `false` is returned.
+fn check_uri_validation(rule: &Rule, matching_input_bytes: &[u8]) -> bool {
+    let validation = match rule.syntax.validation.as_ref() {
+        Some(v) => v,
+        None => return true,
+    };
+
+    // Decoding is deferred so it only happens for the validations that need a URI string.
+    let as_utf8 = || std::str::from_utf8(matching_input_bytes).ok();
+
+    match validation {
+        Validation::MongoDB => match as_utf8() {
+            None => {
+                debug!("Skipping match for rule {} due to non-UTF8 MongoDB URI", rule.id());
+                false
+            }
+            Some(uri) if !is_parseable_mongodb_uri(uri) => {
+                debug!("Skipping match for rule {} due to invalid MongoDB URI", rule.id());
+                false
+            }
+            Some(_) => true,
+        },
+        Validation::Postgres => match as_utf8() {
+            None => {
+                debug!("Skipping match for rule {} due to non-UTF8 Postgres URI", rule.id());
+                false
+            }
+            Some(uri) if !is_parseable_postgres_uri(uri) => {
+                debug!("Skipping match for rule {} due to invalid Postgres URI", rule.id());
+                false
+            }
+            Some(_) => true,
+        },
+        Validation::MySQL => match as_utf8() {
+            None => {
+                debug!("Skipping match for rule {} due to non-UTF8 MySQL URI", rule.id());
+                false
+            }
+            Some(uri) if !is_parseable_mysql_uri(uri) => {
+                debug!("Skipping match for rule {} due to invalid MySQL URI", rule.id());
+                false
+            }
+            Some(_) => true,
+        },
+        _ => true,
+    }
+}
+
+// -------------------------------------------------------------------------------------------------
+// filter_match — main entry point
+// -------------------------------------------------------------------------------------------------
+
+/// Re-runs `re` over one candidate region of `blob` and appends every surviving match to
+/// `matches`.
+///
+/// The haystack is `ts_match` when provided, otherwise `blob.bytes()[start..end]`. Each regex
+/// hit is dropped (in this order) if it fails the entropy/safe-list check, the rule's pattern
+/// requirements, an inline ignore directive, URI validation, or deduplication against
+/// `seen_matches` / `previous_matches`. Survivors are pushed as `BlobMatch` values with no
+/// validation result yet.
+///
+/// `_origin` and `_redact` are accepted but not used here. When `profiler` is set, a
+/// `RuleTimer` is started on entry and ended with the number of matches added.
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn filter_match<'b>(
+    blob: &'b Blob,
+    rule: Arc<Rule>,
+    re: &Regex,
+    start: usize,
+    end: usize,
+    matches: &mut Vec<BlobMatch<'b>>,
+    previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
+    rule_id: usize,
+    seen_matches: &mut FxHashSet<u64>,
+    _origin: &OriginSet,
+    ts_match: Option<&[u8]>,
+    is_base64: bool,
+    _redact: bool,
+    filename: &str,
+    profiler: Option<&Arc<ConcurrentRuleProfiler>>,
+    respect_ignore_if_contains: bool,
+    inline_ignore_config: &InlineIgnoreConfig,
+) {
+    let mut timer =
+        profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
+
+    // Remembered so the number of matches added by this call can be reported to the timer.
+    let initial_len = matches.len();
+
+    let blob_bytes = blob.bytes();
+    // NOTE(review): this slice is taken even when `ts_match` is `Some`, so `start..end` must
+    // always be a valid range into the blob or this indexing panics.
+    let default_slice = &blob_bytes[start..end];
+    let haystack = ts_match.unwrap_or(default_slice);
+
+    for captures in re.captures_iter(haystack) {
+        let full_capture = captures.get(0).unwrap();
+        let matching_input_for_entropy = find_secret_capture(re, &captures);
+
+        let min_entropy = rule.min_entropy();
+        let entropy_bytes = matching_input_for_entropy.as_bytes();
+        let full_bytes = full_capture.as_bytes();
+
+        // Check entropy and safe-listing
+        let calculated_entropy =
+            match check_entropy_and_safelist(entropy_bytes, full_bytes, min_entropy) {
+                Some(e) => e,
+                None => continue,
+            };
+
+        // Check pattern requirements
+        if !check_pattern_requirements(
+            &rule,
+            re,
+            &captures,
+            full_bytes,
+            entropy_bytes,
+            respect_ignore_if_contains,
+        ) {
+            continue;
+        }
+
+        // Use the `matching_input_for_entropy` as the span/key for the finding.
+        let matching_input = matching_input_for_entropy;
+
+        // Capture offsets are relative to the haystack; adding `start` shifts them by the
+        // region's starting offset.
+        let matching_input_offset_span = OffsetSpan::from_range(
+            (start + matching_input.start())..(start + matching_input.end()),
+        );
+
+        // Check inline ignore directives
+        if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
+            debug!("Skipping match due to inline ignore directive");
+            continue;
+        }
+
+        // Check URI validation (MongoDB, Postgres, MySQL)
+        if !check_uri_validation(&rule, matching_input.as_bytes()) {
+            continue;
+        }
+
+        // Deduplication
+        let match_key = compute_match_key(
+            matching_input.as_bytes(),
+            rule.id().as_bytes(),
+            matching_input_offset_span.start,
+            matching_input_offset_span.end,
+        );
+        // `insert` returns false when this key was already recorded.
+        if !seen_matches.insert(match_key) {
+            continue;
+        }
+        if !record_match(previous_matches, rule_id, matching_input_offset_span) {
+            continue;
+        }
+        // NOTE(review): the span is applied to the blob bytes even when the haystack was
+        // `ts_match` — confirm the offsets are valid blob offsets in that case.
+        let only_matching_input =
+            &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
+
+        // Pass the *full* capture object to from_captures
+        let groups = SerializableCaptures::from_captures(&captures, haystack, re);
+
+        matches.push(BlobMatch {
+            rule: Arc::clone(&rule),
+            blob_id: blob.id_ref(),
+            matching_input: only_matching_input,
+            matching_input_offset_span,
+            captures: groups,
+            validation_response_body: None,
+            // NOTE(review): 0 is presumably never a valid status code, so this would always
+            // take the `CONTINUE` fallback — confirm against `http::StatusCode::from_u16`.
+            validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
+            validation_success: false,
+            calculated_entropy,
+            is_base64,
+        });
+    }
+    if let Some(t) = timer.take() {
+        let new_count = (matches.len() - initial_len) as u64;
+        t.end(new_count > 0, new_count, 0);
+    }
+}
--- a/src/matcher/fingerprint.rs
+++ b/src/matcher/fingerprint.rs
@ -0,0 +1,2 @@
+// Re-export from the canonical implementation in kingfisher-scanner.
+pub use kingfisher_scanner::primitives::compute_finding_fingerprint;
--- a/src/matcher/mod.rs
+++ b/src/matcher/mod.rs
@ -1,42 +1,38 @@
-use std::{
-    hash::{Hash, Hasher},
-    str,
-    sync::{Arc, Mutex},
-};
+mod base64_decode;
+mod captures;
+mod conversion;
+mod dedup;
+mod filter;
+mod fingerprint;
+
+// Re-export public API
+pub use base64_decode::{get_base64_strings, DecodedData};
+pub use captures::{Group, Groups, SerializableCapture, SerializableCaptures};
+pub use conversion::{Match, MatcherStats, OwnedBlobMatch};
+pub use fingerprint::compute_finding_fingerprint;
+
+use std::sync::{Arc, Mutex};

 use anyhow::Result;
-use base64::{engine::general_purpose, Engine};
-use bstr::BString;
 use http::StatusCode;
-use regex::bytes::Regex;
-use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
-use schemars::{
-    gen::SchemaGenerator,
-    schema::{ArrayValidation, InstanceType, Schema},
-    JsonSchema,
-};
-use serde::{Deserialize, Serialize};
-use smallvec::SmallVec;
-use tracing::debug;
-use xxhash_rust::xxh3::xxh3_64;
+use rustc_hash::{FxHashMap, FxHashSet};

 use crate::{
    blob::{Blob, BlobId, BlobIdMap},
-    entropy::calculate_shannon_entropy,
    inline_ignore::InlineIgnoreConfig,
-    location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
+    location::OffsetSpan,
    origin::OriginSet,
    parser,
    parser::{Checker, Language},
-    rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
-    rules::rule::{PatternRequirementContext, PatternValidationResult, Rule, Validation},
+    rule_profiling::{ConcurrentRuleProfiler, RuleStats},
+    rules::rule::Rule,
    rules_database::RulesDatabase,
-    safe_list::{is_safe_match, is_user_match},
    scanner_pool::ScannerPool,
-    snippet::Base64BString,
-    util::intern,
-    validation::{is_parseable_mongodb_uri, is_parseable_mysql_uri, is_parseable_postgres_uri},
-    validation_body::{self, ValidationResponseBody},
+    validation_body::ValidationResponseBody,
+};
+
+use self::{
+    base64_decode::get_base64_strings as get_b64_strings, dedup::record_match, filter::filter_match,
 };

 const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
@ -58,94 +54,7 @@ struct RawMatch {
    start_idx: u64,
    end_idx: u64,
 }
-#[derive(Clone)]
-pub struct OwnedBlobMatch {
-    pub rule: Arc<Rule>,
-    pub blob_id: BlobId,
-    /// The unique content-based identifier of this match
-    pub finding_fingerprint: u64,
-    pub matching_input_offset_span: OffsetSpan,
-    pub captures: SerializableCaptures,
-    pub validation_response_body: ValidationResponseBody,
-    pub validation_response_status: StatusCode,
-    pub validation_success: bool,
-    pub calculated_entropy: f32,
-    pub is_base64: bool,
-    /// Variables captured from dependent rules (from depends_on_rule).
-    /// Maps variable name (uppercase) to captured value.
-    pub dependent_captures: std::collections::BTreeMap<String, String>,
-}
-impl<'a> Matcher<'a> {
-    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
-        self.profiler.as_ref().map(|p| p.generate_report())
-    }
-}
-impl OwnedBlobMatch {
-    pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
-        OwnedBlobMatch {
-            rule,
-            blob_id: m.blob_id,
-            finding_fingerprint: m.finding_fingerprint,
-            // matching_input: m.snippet.matching.0.to_vec(),
-            matching_input_offset_span: m.location.offset_span.clone(),
-            captures: m.groups.clone(),
-            validation_response_body: m.validation_response_body.clone(),
-            validation_response_status: StatusCode::from_u16(m.validation_response_status)
-                .unwrap_or(StatusCode::CONTINUE),
-            validation_success: m.validation_success,
-            calculated_entropy: m.calculated_entropy,
-            is_base64: m.is_base64,
-            dependent_captures: m.dependent_captures.clone(),
-        }
-    }

-    pub fn from_blob_match(blob_match: BlobMatch) -> Self {
-        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
-        //
-        // This indexing is intentionally different from the internal `validation_dedup_key()`
-        // (which uses get(0)) to maintain stable external fingerprints. Changing this would break:
-        // - Historical baselines that rely on fingerprint matching
-        // - Dedup entries stored in external systems
-        //
-        // For rules with nested captures like (?<REGEX>...(ABC)...), this may pick up
-        // the inner group, but that behavior is now established and must be preserved.
-        let matching_finding = blob_match
-            .captures
-            .captures
-            .get(1)
-            .or_else(|| blob_match.captures.captures.get(0))
-            .map(|capture| capture.raw_value().as_bytes().to_vec())
-            .unwrap_or_else(Vec::new);
-
-        let mut owned_blob_match = OwnedBlobMatch {
-            rule: blob_match.rule,
-            blob_id: blob_match.blob_id.clone(),
-            matching_input_offset_span: blob_match.matching_input_offset_span,
-            captures: blob_match.captures.clone(),
-            validation_response_body: blob_match.validation_response_body,
-            validation_response_status: blob_match.validation_response_status,
-            validation_success: blob_match.validation_success,
-            calculated_entropy: blob_match.calculated_entropy,
-            finding_fingerprint: 0, //default
-            is_base64: blob_match.is_base64,
-            dependent_captures: std::collections::BTreeMap::new(),
-        };
-
-        // Convert matching_finding to a &str (using lossy conversion if needed)
-        let finding_value = std::str::from_utf8(&matching_finding).unwrap_or("");
-        // Use blob_id as the file/commit identifier
-        let file_or_commit = &blob_match.blob_id.to_string();
-
-        let offset_start: u64 =
-            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
-        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
-
-        owned_blob_match.finding_fingerprint =
-            compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end);
-
-        owned_blob_match
-    }
-}
 // -------------------------------------------------------------------------------------------------
 // BlobMatch
 // -------------------------------------------------------------------------------------------------
@ -156,7 +65,7 @@ impl OwnedBlobMatch {
 /// `Match`.
 pub struct BlobMatch<'a> {
    /// The rule that was matched
-    pub rule: Arc<Rule>, // Changed from `&'a Rule` to `Arc<Rule
+    pub rule: Arc<Rule>,

    /// The blob that was matched
    pub blob_id: &'a BlobId,
@ -168,7 +77,7 @@ pub struct BlobMatch<'a> {
    pub matching_input_offset_span: OffsetSpan,

    /// The capture groups from the match
-    pub captures: SerializableCaptures, // regex::bytes::Captures<'a>,
+    pub captures: SerializableCaptures,

    pub validation_response_body: ValidationResponseBody,
    pub validation_response_status: StatusCode,
@ -177,6 +86,7 @@ pub struct BlobMatch<'a> {
    pub calculated_entropy: f32,
    pub is_base64: bool,
 }
+
 #[derive(Clone)]
 struct UserData {
    /// A scratch vector for raw matches from Vectorscan, to minimize allocation
@ -185,6 +95,7 @@ struct UserData {
    /// The length of the input being scanned
    input_len: u64,
 }
+
 // -------------------------------------------------------------------------------------------------
 // Matcher
 // -------------------------------------------------------------------------------------------------
@ -222,6 +133,7 @@ pub struct Matcher<'a> {
    /// Whether matches should honour `ignore_if_contains` requirements.
    respect_ignore_if_contains: bool,
 }
+
 /// This `Drop` implementation updates the `global_stats` with the local stats
 impl<'a> Drop for Matcher<'a> {
    fn drop(&mut self) {
@ -231,11 +143,19 @@ impl<'a> Drop for Matcher<'a> {
        }
    }
 }
+
 pub enum ScanResult<'a> {
    SeenWithMatches,
    SeenSansMatches,
    New(Vec<BlobMatch<'a>>),
 }
+
+impl<'a> Matcher<'a> {
+    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
+        self.profiler.as_ref().map(|p| p.generate_report())
+    }
+}
+
 impl<'a> Matcher<'a> {
    /// Create a new `Matcher` from the given `RulesDatabase`.
    ///
@ -255,8 +175,6 @@ impl<'a> Matcher<'a> {
        // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec
        let raw_matches_scratch = Vec::new();
        let user_data = UserData { raw_matches_scratch, input_len: 0 };
-        // let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?;
-        // pool is created once per scan run (see Scanner section below)
        let profiler = shared_profiler.or_else(|| {
            if enable_profiling {
                Some(Arc::new(ConcurrentRuleProfiler::new()))
@ -349,7 +267,7 @@ impl<'a> Matcher<'a> {
        let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
            Vec::new()
        } else {
-            get_base64_strings(blob.bytes())
+            get_b64_strings(blob.bytes())
        };

        let lang_hint = lang.as_deref();
@ -530,7 +448,7 @@ impl<'a> Matcher<'a> {
                    );
                }
                if depth + 1 < MAX_B64_DEPTH {
-                    for nested in get_base64_strings(item.decoded.as_slice()) {
+                    for nested in get_b64_strings(item.decoded.as_slice()) {
                        b64_stack.push((
                            DecodedData {
                                decoded: nested.decoded,
@ -559,268 +477,12 @@ impl<'a> Matcher<'a> {
        if self.user_data.raw_matches_scratch.capacity()
            > self.user_data.raw_matches_scratch.len() * 4
        {
-            // Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob
+            // Vec::shrink_to_fit may re-allocate, but we're about to leave scan_blob
            // so the cost is hidden off the hot path.
            self.user_data.raw_matches_scratch.shrink_to_fit();
        }

        Ok(ScanResult::New(matches))
-        // Ok(result)
-    }
-}
-
-#[inline]
-fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
-    let mut hasher = FxHasher::default();
-    // Hash each component directly without allocation
-    content.hash(&mut hasher);
-    rule_id.hash(&mut hasher);
-    start.hash(&mut hasher);
-    end.hash(&mut hasher);
-    hasher.finish()
-}
-
-#[inline]
-fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
-    let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
-    if idx > 0 {
-        if spans[idx - 1].fully_contains(&span) {
-            return false;
-        }
-        if span.fully_contains(&spans[idx - 1]) {
-            spans.remove(idx - 1);
-            idx -= 1;
-        }
-    }
-    if idx < spans.len() {
-        if spans[idx].fully_contains(&span) {
-            return false;
-        }
-        if span.fully_contains(&spans[idx]) {
-            spans.remove(idx);
-        }
-    }
-    spans.insert(idx, span);
-    true
-}
-
-#[inline]
-fn record_match(
-    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
-    rule_id: usize,
-    span: OffsetSpan,
-) -> bool {
-    insert_span(map.entry(rule_id).or_default(), span)
-}
-
-#[allow(clippy::too_many_arguments)]
-fn filter_match<'b>(
-    blob: &'b Blob,
-    rule: Arc<Rule>,
-    re: &Regex,
-    start: usize,
-    end: usize,
-    matches: &mut Vec<BlobMatch<'b>>,
-    previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
-    rule_id: usize,
-    seen_matches: &mut FxHashSet<u64>,
-    _origin: &OriginSet,
-    ts_match: Option<&[u8]>,
-    is_base64: bool,
-    _redact: bool,
-    filename: &str,
-    profiler: Option<&Arc<ConcurrentRuleProfiler>>,
-    respect_ignore_if_contains: bool,
-    inline_ignore_config: &InlineIgnoreConfig,
-) {
-    let mut timer =
-        profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
-
-    let initial_len = matches.len();
-
-    let blob_bytes = blob.bytes();
-    let default_slice = &blob_bytes[start..end];
-    let haystack = ts_match.unwrap_or(default_slice);
-
-    for captures in re.captures_iter(haystack) {
-        let full_capture = captures.get(0).unwrap();
-
-        // --- LOGIC TO FIND THE "SECRET" FOR ENTROPY/SAFE-LISTING ---
-        let matching_input_for_entropy = 'block: {
-            // 1. Prefer a named capture called TOKEN (case-insensitive).
-            if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
-                name_opt
-                    .filter(|name| name.eq_ignore_ascii_case("TOKEN"))
-                    .and_then(|_| captures.get(i))
-            }) {
-                break 'block token_cap;
-            }
-
-            // 2. Otherwise, prefer the first *matched* named capture.
-            if let Some(named_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
-                name_opt.and_then(|_| captures.get(i)) // find(i > 0 && name_opt.is_some())
-            }) {
-                break 'block named_cap;
-            }
-
-            // 3. Otherwise, fall back to the first positional capture (group 1).
-            if let Some(pos_cap) = captures.get(1) {
-                break 'block pos_cap;
-            }
-
-            // 4. Finally, fall back to the full match (group 0).
-            break 'block full_capture;
-        };
-        // --- END LOGIC ---
-
-        let min_entropy = rule.min_entropy();
-        let entropy_bytes = matching_input_for_entropy.as_bytes();
-        let full_bytes = full_capture.as_bytes();
-        let calculated_entropy = calculate_shannon_entropy(entropy_bytes);
-
-        // Check entropy and safe-listing against the *selected* secret bytes
-        if calculated_entropy <= min_entropy
-            || is_safe_match(entropy_bytes)
-            || is_user_match(entropy_bytes, full_bytes)
-        {
-            debug!(
-                "Skipping match with entropy {} <= {} or safe match",
-                calculated_entropy, min_entropy
-            );
-            continue;
-        }
-
-        // Check character requirements if specified
-        if let Some(char_reqs) = rule.pattern_requirements() {
-            let context = PatternRequirementContext {
-                regex: re,
-                captures: &captures,
-                full_match: full_bytes,
-            };
-
-            // Decide which bytes to validate:
-            // - If there are multiple capture groups OR any named captures → use full match
-            // - Otherwise → use entropy_bytes (the actual secret)
-            let use_full_match = {
-                let has_named_captures = re.capture_names().any(|n| n.is_some());
-                let capture_count = captures.len(); // includes group 0
-                has_named_captures || capture_count > 2
-            };
-
-            let validation_bytes = if use_full_match { full_bytes } else { entropy_bytes };
-
-            match char_reqs.validate(validation_bytes, Some(context), respect_ignore_if_contains) {
-                //
-                // --- END FIX ---
-                PatternValidationResult::Passed => {}
-                PatternValidationResult::Failed => {
-                    debug!(
-                        "Skipping match that does not meet character requirements for rule {}",
-                        rule.id()
-                    );
-                    continue;
-                }
-                PatternValidationResult::FailedChecksum { actual_len, expected_len } => {
-                    debug!(
-                        "Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})",
-                        rule.id(),
-                        actual_len,
-                        expected_len
-                    );
-                    continue;
-                }
-                PatternValidationResult::IgnoredBySubstring { matched_term } => {
-                    debug!(
-                        "Skipping match for rule {} because it contains ignored term {matched_term}",
-                        rule.id()
-                    );
-                    continue;
-                }
-            }
-        }
-
-        // Use the `matching_input_for_entropy` as the span/key for the finding.
-        let matching_input = matching_input_for_entropy;
-
-        let matching_input_offset_span = OffsetSpan::from_range(
-            (start + matching_input.start())..(start + matching_input.end()),
-        );
-        if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
-            debug!("Skipping match due to inline ignore directive");
-            continue;
-        }
-        if let Some(validation) = rule.syntax.validation.as_ref() {
-            match validation {
-                Validation::MongoDB => {
-                    let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
-                        debug!("Skipping match for rule {} due to non-UTF8 MongoDB URI", rule.id());
-                        continue;
-                    };
-                    if !is_parseable_mongodb_uri(uri) {
-                        debug!("Skipping match for rule {} due to invalid MongoDB URI", rule.id());
-                        continue;
-                    }
-                }
-                Validation::Postgres => {
-                    let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
-                        debug!(
-                            "Skipping match for rule {} due to non-UTF8 Postgres URI",
-                            rule.id()
-                        );
-                        continue;
-                    };
-                    if !is_parseable_postgres_uri(uri) {
-                        debug!("Skipping match for rule {} due to invalid Postgres URI", rule.id());
-                        continue;
-                    }
-                }
-                Validation::MySQL => {
-                    let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
-                        debug!("Skipping match for rule {} due to non-UTF8 MySQL URI", rule.id());
-                        continue;
-                    };
-                    if !is_parseable_mysql_uri(uri) {
-                        debug!("Skipping match for rule {} due to invalid MySQL URI", rule.id());
-                        continue;
-                    }
-                }
-                _ => {}
-            }
-        }
-        let match_key = compute_match_key(
-            matching_input.as_bytes(),
-            rule.id().as_bytes(),
-            matching_input_offset_span.start,
-            matching_input_offset_span.end,
-        );
-        if !seen_matches.insert(match_key) {
-            continue;
-        }
-        if !record_match(previous_matches, rule_id, matching_input_offset_span) {
-            continue;
-        }
-        let only_matching_input =
-            &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
-
-        // Pass the *full* capture object to from_captures
-        let groups = SerializableCaptures::from_captures(&captures, haystack, re);
-
-        matches.push(BlobMatch {
-            rule: Arc::clone(&rule),
-            blob_id: blob.id_ref(),
-            matching_input: only_matching_input,
-            matching_input_offset_span,
-            captures: groups,
-            validation_response_body: None,
-            validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
-            validation_success: false,
-            calculated_entropy,
-            is_base64,
-        });
-    }
-    if let Some(t) = timer.take() {
-        let new_count = (matches.len() - initial_len) as u64;
-        t.end(new_count > 0, new_count, 0);
    }
 }

@ -837,10 +499,6 @@ fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, S
        "javascript" | "js" => {
            Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries()))
        }
-        // "kotlin" => Some((
-        //     Language::Kotlin,
-        //     parser::queries::kotlin::get_kotlin_queries(),
-        // )),
        "php" => Some((Language::Php, parser::queries::php::get_php_queries())),
        "python" | "py" | "starlark" => {
            Some((Language::Python, parser::queries::python::get_python_queries()))
@ -855,354 +513,6 @@ fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, S
        _ => None,
    }
 }
-// -------------------------------------------------------------------------------------------------
-// MatchStats
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Default, Clone)]
-pub struct MatcherStats {
-    pub blobs_seen: u64,
-    pub blobs_scanned: u64,
-    pub bytes_seen: u64,
-    pub bytes_scanned: u64,
-    // #[cfg(feature = "rule_profiling")]
-    // pub rule_stats: crate::rule_profiling::RuleProfile,
-}
-impl MatcherStats {
-    pub fn update(&mut self, other: &Self) {
-        self.blobs_seen += other.blobs_seen;
-        self.blobs_scanned += other.blobs_scanned;
-        self.bytes_seen += other.bytes_seen;
-        self.bytes_scanned += other.bytes_scanned;
-
-        // #[cfg(feature = "rule_profiling")]
-        // self.rule_stats.update(&other.rule_stats);
-    }
-}
-// -------------------------------------------------------------------------------------------------
-// Group
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
-pub struct Group(pub Base64BString);
-impl Group {
-    pub fn new(m: regex::bytes::Match<'_>) -> Self {
-        Self(Base64BString(BString::from(m.as_bytes())))
-    }
-}
-// -------------------------------------------------------------------------------------------------
-// Groups
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
-pub struct Groups(pub SmallVec<[Group; 1]>);
-impl JsonSchema for Groups {
-    fn schema_name() -> String {
-        "Groups".to_string()
-    }
-
-    fn json_schema(gen: &mut SchemaGenerator) -> Schema {
-        let group_schema = gen.subschema_for::<Group>();
-        Schema::Object(schemars::schema::SchemaObject {
-            instance_type: Some(InstanceType::Array.into()),
-            array: Some(Box::new(ArrayValidation {
-                items: Some(group_schema.into()),
-                ..Default::default()
-            })),
-            ..Default::default()
-        })
-    }
-}
-// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
-// pub struct SerializableCapture {
-//     pub name: Option<String>, // Named group (if available)
-//     pub match_number: i32,
-//     pub start: usize,  // Start position of the match
-//     pub end: usize,    // End position of the match
-//     pub value: String, // The actual captured value
-// }
-#[derive(Debug, Clone, JsonSchema)]
-pub struct SerializableCapture {
-    pub name: Option<&'static str>,
-    pub match_number: i32,
-    pub start: usize,
-    pub end: usize,
-    /// Interned original (unredacted) value.
-    #[serde(skip_serializing, skip_deserializing)]
-    pub value: &'static str,
-}
-
-impl SerializableCapture {
-    /// Returns the original captured value.
-    pub fn raw_value(&self) -> &'static str {
-        self.value
-    }
-
-    /// Returns the value that should be shown in user-facing output.
-    pub fn display_value(&self) -> std::borrow::Cow<'static, str> {
-        crate::util::display_value(self.value)
-    }
-}
-
-impl serde::Serialize for SerializableCapture {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        use serde::ser::SerializeStruct;
-
-        let mut state = serializer.serialize_struct("SerializableCapture", 5)?;
-        state.serialize_field("name", &self.name)?;
-        state.serialize_field("match_number", &self.match_number)?;
-        state.serialize_field("start", &self.start)?;
-        state.serialize_field("end", &self.end)?;
-        let value = self.display_value();
-        state.serialize_field("value", &value)?;
-        state.end()
-    }
-}
-
-#[derive(Debug, Clone, Serialize, JsonSchema)]
-pub struct SerializableCaptures {
-    #[schemars(with = "Vec<SerializableCapture>")]
-    pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
-}
-
-impl SerializableCaptures {
-    pub fn from_captures(captures: &regex::bytes::Captures, _input: &[u8], re: &Regex) -> Self {
-        let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
-
-        let capture_names: SmallVec<[Option<&'static str>; 4]> =
-            re.capture_names().map(|name| name.map(intern)).collect();
-
-        // If there are explicit capture groups (e.g., group 1, 2, ...),
-        // only serialize those.
-        if captures.len() > 1 {
-            for i in 1..captures.len() {
-                // Start from 1
-                if let Some(cap) = captures.get(i) {
-                    let raw_value = String::from_utf8_lossy(cap.as_bytes());
-                    let raw_interned = intern(raw_value.as_ref());
-                    let name = capture_names.get(i).and_then(|opt| *opt);
-
-                    serialized_captures.push(SerializableCapture {
-                        name,
-                        match_number: i32::try_from(i).unwrap_or(0),
-                        start: cap.start(),
-                        end: cap.end(),
-                        value: raw_interned,
-                    });
-                }
-            }
-        } else if captures.len() == 1 {
-            // ELSE, if there is ONLY the full match (len == 1),
-            // serialize just that full match (group 0) as the fallback.
-            if let Some(cap) = captures.get(0) {
-                let raw_value = String::from_utf8_lossy(cap.as_bytes());
-                let raw_interned = intern(raw_value.as_ref());
-                let name = capture_names.get(0).and_then(|opt| *opt);
-
-                serialized_captures.push(SerializableCapture {
-                    name,
-                    match_number: 0,
-                    start: cap.start(),
-                    end: cap.end(),
-                    value: raw_interned,
-                });
-            }
-        }
-        // If len == 0 (no match), loop is skipped, empty vec is returned.
-
-        SerializableCaptures { captures: serialized_captures }
-    }
-}
-// -------------------------------------------------------------------------------------------------
-// Match
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Clone, Serialize, JsonSchema)]
-pub struct Match {
-    /// The location of the entire matching content
-    pub location: Location,
-
-    /// The capture groups
-    pub groups: SerializableCaptures, // Store serialized captures
-
-    /// unique identifier of file / blob where this match was found
-    pub blob_id: BlobId,
-
-    /// The unique content-based identifier of this match
-    pub finding_fingerprint: u64,
-
-    /// The rule that produced this match
-    #[serde(skip_serializing)]
-    #[schemars(skip)]
-    pub rule: Arc<Rule>,
-
-    /// Validation Body
-    #[serde(
-        default,
-        serialize_with = "validation_body::serialize",
-        deserialize_with = "validation_body::deserialize"
-    )]
-    #[schemars(schema_with = "validation_body::schema")]
-    pub validation_response_body: ValidationResponseBody,
-
-    /// Validation Status Code
-    pub validation_response_status: u16,
-
-    /// Validation Success
-    pub validation_success: bool,
-
-    /// Entropy calculated for the matched secret
-    pub calculated_entropy: f32,
-
-    pub visible: bool,
-    #[serde(default)]
-    pub is_base64: bool,
-
-    /// Variables captured from dependent rules (from depends_on_rule).
-    /// Maps variable name (uppercase) to captured value.
-    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
-    pub dependent_captures: std::collections::BTreeMap<String, String>,
-}
-impl Match {
-    #[inline]
-    pub fn convert_owned_blobmatch_to_match<'a>(
-        loc_mapping: Option<&'a LocationMapping<'a>>,
-        owned_blob_match: &'a OwnedBlobMatch,
-        origin_type: &'a str,
-    ) -> Self {
-        let offset_span = owned_blob_match.matching_input_offset_span;
-        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
-        // See comment in from_blob_match() for why this differs from validation_dedup_key().
-        let matching_finding_bytes = owned_blob_match
-            .captures
-            .captures
-            .get(1)
-            .or_else(|| owned_blob_match.captures.captures.get(0))
-            .map(|capture| capture.raw_value().as_bytes())
-            .unwrap_or_default();
-
-        // The fingerprint will be based on the content of the secret.
-        let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
-
-        let source_span =
-            loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
-                start: SourcePoint { line: 0, column: 0 },
-                end: SourcePoint { line: 0, column: 0 },
-            });
-        let offset_start: u64 =
-            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
-        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
-
-        let finding_fingerprint = compute_finding_fingerprint(
-            finding_value_for_fp,
-            origin_type, // file_or_commit,
-            offset_start,
-            offset_end,
-        );
-
-        // matching_snippet
-        Match {
-            rule: owned_blob_match.rule.clone(),
-            visible: owned_blob_match.rule.visible().to_owned(),
-            location: Location::with_source_span(offset_span, Some(source_span.clone())),
-            groups: owned_blob_match.captures.clone(),
-            blob_id: owned_blob_match.blob_id,
-            finding_fingerprint,
-            validation_response_body: owned_blob_match.validation_response_body.clone(),
-            validation_response_status: owned_blob_match.validation_response_status.as_u16(),
-            validation_success: owned_blob_match.validation_success,
-            calculated_entropy: owned_blob_match.calculated_entropy,
-            is_base64: owned_blob_match.is_base64,
-            dependent_captures: owned_blob_match.dependent_captures.clone(),
-        }
-    }
-
-    /// Returns the `blob_id` of the match.
-    pub fn get_blob_id(&self) -> BlobId {
-        self.blob_id.clone()
-    }
-
-    pub fn finding_id(&self) -> String {
-        let mut buffer = Vec::with_capacity(128);
-        buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes());
-        buffer.push(0);
-        serde_json::to_writer(&mut buffer, &self.groups)
-            .expect("should be able to serialize groups as JSON");
-        let mut num = xxh3_64(&buffer);
-        // Ensure the number is positive and within i64 range
-        num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive
-                                      // Convert to string
-        num.to_string()
-    }
-}
-#[derive(Debug, Clone)]
-pub struct DecodedData {
-    pub decoded: Vec<u8>,
-    pub pos_start: usize,
-    pub pos_end: usize,
-}
-#[inline]
-fn is_base64_byte(b: u8) -> bool {
-    // Accepts both standard base64 ('+', '/') and URL-safe base64 ('-', '_') characters.
-    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_')
-}
-
-pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
-    let mut results = Vec::new();
-    let mut i = 0;
-    while i < input.len() {
-        while i < input.len() && !is_base64_byte(input[i]) {
-            i += 1;
-        }
-        let start = i;
-        while i < input.len() && is_base64_byte(input[i]) {
-            i += 1;
-        }
-
-        let mut eq_count = 0;
-        while i < input.len() && input[i] == b'=' && eq_count < 2 {
-            i += 1;
-            eq_count += 1;
-        }
-        let end = i;
-
-        let len = end - start;
-        if len >= 32 && len % 4 == 0 {
-            let base64_slice = &input[start..end];
-
-            // Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD
-            let decode_result = general_purpose::STANDARD
-                .decode(base64_slice)
-                .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
-                .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));
-
-            if let Ok(decoded) = decode_result {
-                if decoded.is_ascii() {
-                    results.push(DecodedData { decoded, pos_start: start, pos_end: end });
-                }
-            }
-        }
-    }
-
-    results
-}
-
-pub fn compute_finding_fingerprint(
-    finding_value: &str,
-    file_or_commit: &str,
-    offset_start: u64,
-    offset_end: u64,
-) -> u64 {
-    // Combine all into a byte buffer and hash it directly:
-    let mut buf = Vec::with_capacity(
-        finding_value.len() + file_or_commit.len() + 2 * std::mem::size_of::<u64>(),
-    );
-    buf.extend_from_slice(finding_value.as_bytes());
-    buf.extend_from_slice(file_or_commit.as_bytes());
-    buf.extend_from_slice(&offset_start.to_le_bytes());
-    buf.extend_from_slice(&offset_end.to_le_bytes());
-
-    xxh3_64(&buf)
-}

 // -------------------------------------------------------------------------------------------------
 // test
@ -1220,6 +530,7 @@ mod test {
    use super::*;
    use crate::{
        blob::{Blob, BlobIdMap},
+        entropy::calculate_shannon_entropy,
        origin::{Origin, OriginSet},
        rules::rule::{
            DependsOnRule, HttpRequest, HttpValidation, PatternRequirements, RuleSyntax, Validation,
@ -1341,8 +652,6 @@ mod test {
        let input = "some test data for vectorscan";
        let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
        let enable_rule_profiling = true;
-        // let mut matcher = Matcher::new(&rules_db, &seen_blobs, None,
-        // enable_rule_profiling)?;
        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
        let mut matcher = Matcher::new(
            &rules_db,
@ -1513,7 +822,7 @@ mod test {
        assert_eq!((item.pos_start, item.pos_end), (4, 4 + base64_payload.len()));
    }

-    /// `compute_finding_fingerprint` must be stable (same input ⇒ same output)
+    /// `compute_finding_fingerprint` must be stable (same input => same output)
    /// and sensitive to any input component.
    #[test]
    fn test_finding_fingerprint_stability_and_uniqueness() {
@ -1535,7 +844,7 @@ mod test {
    /// keys as soon as *anything* changes.
    #[test]
    fn test_compute_match_key_uniqueness() {
-        use super::compute_match_key;
+        use super::dedup::compute_match_key;

        let k1 = compute_match_key(b"abc", b"rule-1", 0, 3);
        let k2 = compute_match_key(b"abc", b"rule-1", 0, 3);
@ -1721,6 +1030,8 @@ line2

    #[test]
    fn serializes_captures_in_numeric_order() {
+        use regex::bytes::Regex;
+
        let re =
            Regex::new(r"(?xi)\b(ghp_(?P<body>[A-Z0-9]{3})(?P<checksum>[A-Z0-9]{2}))").unwrap();
        let caps = re.captures(b"ghp_ABC12").expect("expected captures");
--- a/src/scanner/runner.rs
+++ b/src/scanner/runner.rs
@ -42,9 +42,18 @@ use crate::{
        AccessMapCollector,
    },
    util::set_redaction_enabled,
+    validation::CachedResponse,
    validation_rate_limit::ValidationRateLimiter,
 };

+/// Shared validation dependencies: (liquid parser, HTTP clients, validation cache, rate limiter).
+type ValidationDeps = Arc<(
+    liquid::Parser,
+    crate::validation::ValidationClients,
+    Arc<SkipMap<String, CachedResponse>>,
+    Option<Arc<ValidationRateLimiter>>,
+)>;
+
 pub async fn run_scan(
    global_args: &global::GlobalArgs,
    scan_args: &scan::ScanArgs,
@ -64,22 +73,9 @@ pub async fn run_async_scan(
    rules_db: &RulesDatabase,
    update_status: &crate::update::UpdateStatus,
 ) -> Result<()> {
-    // Ensure all provided paths exist before proceeding
-    for path in &args.input_specifier_args.path_inputs {
-        if !path.exists() {
-            error!("Specified input path does not exist: {}", path.display());
-            bail!("Invalid input: Path does not exist - {}", path.display());
-        }
-    }
-
-    // Register user-provided allow-list patterns
-    for pattern in &args.skip_regex {
-        safe_list::add_user_regex(pattern)
-            .map_err(|e| anyhow::anyhow!("Invalid skip-regex '{pattern}': {e}"))?;
-    }
-    for word in &args.skip_word {
-        safe_list::add_user_skipword(word);
-    }
+    // ── Phase 1: Input validation and environment setup ──────────────────
+    validate_inputs(args)?;
+    register_safe_list_patterns(args)?;

    let start_time = Instant::now();
    let scan_started_at = chrono::Local::now();
@ -90,123 +86,26 @@ pub async fn run_async_scan(

    set_redaction_enabled(args.redact);

-    let mut repo_urls = enumerate_github_repos(args, global_args).await?;
-    let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
-    let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
-    let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
-    let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
-    let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
-
-    // Combine repository URLs
-    repo_urls.extend(gitlab_repo_urls);
-    repo_urls.extend(gitea_repo_urls);
-    repo_urls.extend(huggingface_repo_urls);
-    repo_urls.extend(bitbucket_repo_urls);
-    repo_urls.extend(azure_repo_urls);
-
-    // Add wiki repositories for each URL when requested
-    if args.input_specifier_args.repo_artifacts {
-        let mut wiki_urls = Vec::new();
-        for url in &repo_urls {
-            if let Some(w) = github::wiki_url(url) {
-                wiki_urls.push(w);
-            }
-            if let Some(w) = gitlab::wiki_url(url) {
-                wiki_urls.push(w);
-            }
-            if let Some(w) = gitea::wiki_url(url) {
-                wiki_urls.push(w);
-            }
-            if let Some(w) = bitbucket::wiki_url(url) {
-                wiki_urls.push(w);
-            }
-            if let Some(w) = azure::wiki_url(url) {
-                wiki_urls.push(w);
-            }
-        }
-        repo_urls.extend(wiki_urls);
-    }
-
-    // just sort and dedup once
-    repo_urls.sort();
-    repo_urls.dedup();
+    // ── Phase 2: Repository enumeration ─────────────────────────────────
+    let repo_urls = enumerate_all_repos(args, global_args).await?;

    let mut input_roots = args.input_specifier_args.path_inputs.clone();
    let (repo_tx, repo_rx) = crossbeam_channel::unbounded();
-    let repo_clone_handle = if repo_urls.is_empty() {
-        None
-    } else {
-        let clone_args = args.clone();
-        let clone_globals = global_args.clone();
-        let clone_repo_urls = repo_urls.clone();
-        let clone_datastore = Arc::clone(&datastore);
-        let clone_repo_tx = repo_tx.clone();
-        Some(std::thread::spawn(move || {
-            if let Err(e) = clone_or_update_git_repos_streaming(
-                &clone_args,
-                &clone_globals,
-                &clone_repo_urls,
-                &clone_datastore,
-                |path| {
-                    let _ = clone_repo_tx.send(path);
-                },
-            ) {
-                error!("Failed to fetch one or more Git repositories: {e}");
-            }
-        }))
-    };
-    drop(repo_tx);
+    let repo_clone_handle =
+        start_repo_cloning(&repo_urls, args, global_args, &datastore, repo_tx, progress_enabled);

-    // Fetch issues, gists, and wikis if enabled
-    let bitbucket_auth = bitbucket::AuthConfig::from_env();
-    let bitbucket_host =
-        args.input_specifier_args.bitbucket_api_url.host_str().map(|s| s.to_string());
-
-    if args.input_specifier_args.repo_artifacts {
-        let repo_artifact_dirs = fetch_git_host_artifacts(
-            &repo_urls,
-            &args.input_specifier_args.bitbucket_api_url,
-            &bitbucket_auth,
-            bitbucket_host.clone(),
-            global_args,
-            &datastore,
-        )
-        .await?;
-        input_roots.extend(repo_artifact_dirs);
-    }
-    // Fetch Jira issues if requested
-    let jira_dirs = fetch_jira_issues(args, global_args, &datastore).await?;
-    input_roots.extend(jira_dirs);
-
-    // Fetch Confluence pages if requested
-    let confluence_dirs = fetch_confluence_pages(args, global_args, &datastore).await?;
-    input_roots.extend(confluence_dirs);
-
-    // Fetch Slack messages if requested
-    let slack_dirs = fetch_slack_messages(args, global_args, &datastore).await?;
-    input_roots.extend(slack_dirs);
-
-    // Save Docker images if specified
-    if !args.input_specifier_args.docker_image.is_empty() {
-        let clone_root = {
-            let ds = datastore.lock().unwrap();
-            ds.clone_root()
-        };
-        let docker_dirs = save_docker_images(
-            &args.input_specifier_args.docker_image,
-            &clone_root,
-            progress_enabled,
-        )
-        .await?;
-        for (dir, img) in docker_dirs {
-            {
-                let mut ds = datastore.lock().unwrap();
-                ds.register_docker_image(dir.clone(), img);
-            }
-            input_roots.push(dir);
-        }
-    }
+    // ── Phase 3: Artifact fetching ──────────────────────────────────────
+    fetch_all_artifacts(
+        args,
+        global_args,
+        &repo_urls,
+        &datastore,
+        &mut input_roots,
+        progress_enabled,
+    )
+    .await?;

+    // ── Phase 4: Scan configuration ─────────────────────────────────────
    let shared_profiler = Arc::new(ConcurrentRuleProfiler::new());
    let enable_profiling = args.rule_stats;
    let matcher_stats = Arc::new(Mutex::new(MatcherStats::default()));
@ -246,11 +145,257 @@ pub async fn run_async_scan(
            .unwrap_or_else(|| std::path::PathBuf::from("baseline-file.yaml")),
    );

-    let mut skip_aws_accounts = args.skip_aws_account.clone();
+    let skip_aws_accounts = load_skip_aws_accounts(args)?;
+    crate::validation::set_skip_aws_account_ids(skip_aws_accounts);

    let mut access_map_collector =
        if args.access_map { Some(AccessMapCollector::default()) } else { None };

+    let repo_roots = expand_repo_roots(&input_roots)?;
+    let git_repo_count =
+        repo_roots.iter().filter(|p| p.join(".git").is_dir()).count() + repo_urls.len();
+    let use_parallel_repo_scan = git_repo_count > 10;
+
+    let validation_rate_limiter =
+        ValidationRateLimiter::from_cli(args.validation_rps, &args.validation_rps_rule)?
+            .map(Arc::new);
+
+    let validation_deps: Option<ValidationDeps> = if !args.no_validate {
+        info!("Starting secret validation phase...");
+        Some(Arc::new((
+            register_all(liquid::ParserBuilder::with_stdlib()).build()?,
+            crate::validation::ValidationClients::new(global_args.tls_mode)?,
+            Arc::new(SkipMap::new()),
+            validation_rate_limiter.clone(),
+        )))
+    } else {
+        None
+    };
+
+    // ── Phase 5: Scanning ───────────────────────────────────────────────
+    if !use_parallel_repo_scan {
+        run_sequential_scan(
+            args,
+            global_args,
+            &datastore,
+            rules_db,
+            &mut input_roots,
+            repo_rx,
+            repo_clone_handle,
+            &shared_profiler,
+            enable_profiling,
+            &matcher_stats,
+            &baseline_path,
+            &validation_deps,
+            &mut access_map_collector,
+            progress_enabled,
+            start_time,
+            scan_started_at,
+            update_status,
+        )
+        .await?;
+        return Ok(());
+    }
+
+    run_parallel_scan(
+        args,
+        global_args,
+        &datastore,
+        rules_db,
+        &repo_roots,
+        repo_rx,
+        repo_clone_handle,
+        &shared_profiler,
+        enable_profiling,
+        &matcher_stats,
+        &baseline_path,
+        &validation_deps,
+        &mut access_map_collector,
+        progress_enabled,
+        start_time,
+        scan_started_at,
+        update_status,
+    )
+    .await
+}
+
+// =================================================================================================
+// Phase helpers
+// =================================================================================================
+
+/// Ensures every filesystem path supplied as a scan input exists.
+///
+/// The first missing path is logged at error level and reported in the returned error.
+fn validate_inputs(args: &scan::ScanArgs) -> Result<()> {
+    let missing =
+        args.input_specifier_args.path_inputs.iter().find(|candidate| !candidate.exists());
+    if let Some(path) = missing {
+        error!("Specified input path does not exist: {}", path.display());
+        bail!("Invalid input: Path does not exist - {}", path.display());
+    }
+    Ok(())
+}
+
+/// Installs the user's allow-list entries: every `skip_regex` pattern first, then every
+/// `skip_word` entry.
+///
+/// Stops at the first pattern that fails to register and returns an error naming it; in
+/// that case no skip words are registered.
+fn register_safe_list_patterns(args: &scan::ScanArgs) -> Result<()> {
+    args.skip_regex.iter().try_for_each(|pattern| {
+        safe_list::add_user_regex(pattern)
+            .map(drop)
+            .map_err(|e| anyhow::anyhow!("Invalid skip-regex '{pattern}': {e}"))
+    })?;
+    args.skip_word.iter().for_each(|word| {
+        safe_list::add_user_skipword(word);
+    });
+    Ok(())
+}
+
+/// Collects repository URLs from every supported hosting platform into one sorted,
+/// duplicate-free list.
+///
+/// Platforms are queried one after another (GitHub, GitLab, Gitea, Hugging Face, Bitbucket,
+/// Azure); the first enumeration error aborts the whole call. When `repo_artifacts` is set,
+/// wiki URLs derived from the enumerated repositories are appended before sorting.
+async fn enumerate_all_repos(
+    args: &scan::ScanArgs,
+    global_args: &global::GlobalArgs,
+) -> Result<Vec<crate::git_url::GitUrl>> {
+    let mut all_urls = enumerate_github_repos(args, global_args).await?;
+    all_urls.extend(enumerate_gitlab_repos(args, global_args).await?);
+    all_urls.extend(enumerate_gitea_repos(args, global_args).await?);
+    all_urls.extend(enumerate_huggingface_repos(args, global_args).await?);
+    all_urls.extend(enumerate_bitbucket_repos(args, global_args).await?);
+    all_urls.extend(enumerate_azure_repos(args, global_args).await?);
+
+    if args.input_specifier_args.repo_artifacts {
+        // Each helper returns an `Option`; keep whichever are `Some`, in platform order.
+        let wiki_urls: Vec<_> = all_urls
+            .iter()
+            .flat_map(|url| {
+                github::wiki_url(url)
+                    .into_iter()
+                    .chain(gitlab::wiki_url(url))
+                    .chain(gitea::wiki_url(url))
+                    .chain(bitbucket::wiki_url(url))
+                    .chain(azure::wiki_url(url))
+            })
+            .collect();
+        all_urls.extend(wiki_urls);
+    }
+
+    // `dedup` only removes adjacent duplicates, so the list is sorted first.
+    all_urls.sort();
+    all_urls.dedup();
+
+    Ok(all_urls)
+}
+
+/// Starts cloning/updating `repo_urls` on a background thread and streams each reported
+/// path into `repo_tx`.
+///
+/// Returns `None` without spawning anything when `repo_urls` is empty; otherwise returns
+/// the worker's join handle. This function's sender is released either immediately (empty
+/// input) or when the worker finishes, so it never outlives the cloning work.
+///
+/// Clone failures are logged by the worker and are not propagated to the caller.
+/// `_progress_enabled` is currently unused.
+fn start_repo_cloning(
+    repo_urls: &[crate::git_url::GitUrl],
+    args: &scan::ScanArgs,
+    global_args: &global::GlobalArgs,
+    datastore: &Arc<Mutex<FindingsStore>>,
+    repo_tx: crossbeam_channel::Sender<PathBuf>,
+    _progress_enabled: bool,
+) -> Option<std::thread::JoinHandle<()>> {
+    if repo_urls.is_empty() {
+        // Nothing will ever be sent; release the sender right away.
+        drop(repo_tx);
+        return None;
+    }
+
+    // The worker outlives this call, so it gets owned copies of everything it reads.
+    let worker_args = args.clone();
+    let worker_globals = global_args.clone();
+    let worker_urls = repo_urls.to_vec();
+    let worker_datastore = Arc::clone(datastore);
+
+    // `repo_tx` is moved into the worker (no extra clone needed) and is dropped when the
+    // worker closure returns.
+    let handle = std::thread::spawn(move || {
+        let outcome = clone_or_update_git_repos_streaming(
+            &worker_args,
+            &worker_globals,
+            &worker_urls,
+            &worker_datastore,
+            |path| {
+                // The send result is intentionally discarded.
+                let _ = repo_tx.send(path);
+            },
+        );
+        if let Err(e) = outcome {
+            error!("Failed to fetch one or more Git repositories: {e}");
+        }
+    });
+    Some(handle)
+}
+
+/// Collects non-clone scan inputs and appends their directories to `input_roots`.
+///
+/// Order of work: git-host artifacts (only when `repo_artifacts` is set), Jira issues,
+/// Confluence pages, Slack messages, then Docker images (only when any are listed). Each
+/// saved Docker image directory is also registered with the datastore. The first failing
+/// step aborts the call and its error is returned.
+async fn fetch_all_artifacts(
+    args: &scan::ScanArgs,
+    global_args: &global::GlobalArgs,
+    repo_urls: &[crate::git_url::GitUrl],
+    datastore: &Arc<Mutex<FindingsStore>>,
+    input_roots: &mut Vec<PathBuf>,
+    progress_enabled: bool,
+) -> Result<()> {
+    let inputs = &args.input_specifier_args;
+    let bitbucket_auth = bitbucket::AuthConfig::from_env();
+    let bitbucket_host = inputs.bitbucket_api_url.host_str().map(|host| host.to_string());
+
+    if inputs.repo_artifacts {
+        let artifact_dirs = fetch_git_host_artifacts(
+            repo_urls,
+            &inputs.bitbucket_api_url,
+            &bitbucket_auth,
+            bitbucket_host,
+            global_args,
+            datastore,
+        )
+        .await?;
+        input_roots.extend(artifact_dirs);
+    }
+
+    // These three are always called; whether anything is fetched is up to each helper.
+    input_roots.extend(fetch_jira_issues(args, global_args, datastore).await?);
+    input_roots.extend(fetch_confluence_pages(args, global_args, datastore).await?);
+    input_roots.extend(fetch_slack_messages(args, global_args, datastore).await?);
+
+    if inputs.docker_image.is_empty() {
+        return Ok(());
+    }
+
+    // The lock guard lives only for this statement, so it is released before the await.
+    let clone_root = datastore.lock().unwrap().clone_root();
+    let saved = save_docker_images(&inputs.docker_image, &clone_root, progress_enabled).await?;
+    for (dir, img) in saved {
+        datastore.lock().unwrap().register_docker_image(dir.clone(), img);
+        input_roots.push(dir);
+    }
+
+    Ok(())
+}
+
+/// Loads AWS account IDs to skip from CLI args and optional file.
+fn load_skip_aws_accounts(args: &scan::ScanArgs) -> Result<Vec<String>> {
+    let mut skip_aws_accounts = args.skip_aws_account.clone();
+
    if let Some(path) = args.skip_aws_account_file.as_ref() {
        let contents = fs::read_to_string(path).with_context(|| {
            format!("Failed to read --skip-aws-account-file {}", path.display())
@ -267,183 +412,214 @@ pub async fn run_async_scan(
        }
    }

-    crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
+    Ok(skip_aws_accounts)
+}

-    let repo_roots = expand_repo_roots(&input_roots)?;
-    let git_repo_count =
-        repo_roots.iter().filter(|p| p.join(".git").is_dir()).count() + repo_urls.len();
-    let use_parallel_repo_scan = git_repo_count > 10;
-
-    let validation_rate_limiter =
-        ValidationRateLimiter::from_cli(args.validation_rps, &args.validation_rps_rule)?
-            .map(Arc::new);
-
-    let validation_deps = if !args.no_validate {
-        info!("Starting secret validation phase...");
-        Some(Arc::new((
-            register_all(liquid::ParserBuilder::with_stdlib()).build()?,
-            crate::validation::ValidationClients::new(global_args.tls_mode)?,
-            Arc::new(SkipMap::new()),
-            validation_rate_limiter.clone(),
-        )))
-    } else {
-        None
-    };
-
-    if !use_parallel_repo_scan {
-        let mut streamed_roots = Vec::new();
-        if !input_roots.is_empty() {
-            let _inputs = enumerate_filesystem_inputs(
-                args,
-                datastore.clone(),
-                &input_roots,
-                progress_enabled,
-                rules_db,
-                enable_profiling,
-                Arc::clone(&shared_profiler),
-                matcher_stats.as_ref(),
-            )?;
-        }
-
-        for repo_root in repo_rx.clone().iter() {
-            enumerate_filesystem_inputs(
-                args,
-                datastore.clone(),
-                &[repo_root.clone()],
-                progress_enabled,
-                rules_db,
-                enable_profiling,
-                Arc::clone(&shared_profiler),
-                matcher_stats.as_ref(),
-            )?;
-            streamed_roots.push(repo_root);
-        }
-        input_roots.extend(streamed_roots);
-
-        if let Some(handle) = repo_clone_handle {
-            let _ = handle.join();
-        }
-
-        if !args.no_dedup {
-            let reporter = crate::reporter::DetailsReporter {
-                datastore: Arc::clone(&datastore),
-                styles: Styles::new(global_args.use_color(std::io::stdout())),
-                only_valid: args.only_valid,
-            };
-
-            let all_matches = reporter.get_unfiltered_matches(Some(false))?;
-            let deduped_matches = reporter.deduplicate_matches(all_matches, args.no_dedup);
-
-            let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
-                .into_iter()
-                .map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
-                .collect();
-            let mut ds = datastore.lock().unwrap();
-            ds.replace_matches(deduped_arcs);
-        }
-
-        if args.baseline_file.is_some() || args.manage_baseline {
-            let mut ds = datastore.lock().unwrap();
-            crate::baseline::apply_baseline(
-                &mut ds,
-                baseline_path.as_ref(),
-                args.manage_baseline,
-                &input_roots,
-            )?;
-        }
-
-        if let Some(validation) = &validation_deps {
-            let (parser, clients, cache, rate_limiter) =
-                (&validation.0, &validation.1, &validation.2, &validation.3);
-            run_secret_validation(
-                Arc::clone(&datastore),
-                parser,
-                clients,
-                cache,
-                args.num_jobs,
-                None,
-                access_map_collector.clone(),
-                rate_limiter.clone(),
-                Duration::from_secs(args.validation_timeout),
-                args.validation_retries,
-            )
-            .await?;
-        }
-
-        if let Some(collector) = access_map_collector.take() {
-            finalize_access_map(&datastore, collector, args).await?;
-        }
-
-        crate::reporter::run(global_args, Arc::clone(&datastore), args)
-            .context("Failed to run report command")?;
-        print_scan_summary(
-            start_time,
-            scan_started_at,
-            &datastore,
-            global_args,
-            args,
-            rules_db,
-            matcher_stats.as_ref(),
-            if enable_profiling { Some(shared_profiler.as_ref()) } else { None },
-            update_status,
-            None,
-            None,
-        );
-        maybe_hint_access_map(&datastore, args);
+/// Deduplicates matches in the datastore starting from `start_index`.
+///
+/// Entries before `start_index` are kept as stored; the reporter's matches from
+/// `start_index` onward are deduplicated and written back after them. Returns early
+/// without modifying the store when `args.no_dedup` is set or when the reporter yields
+/// no matches at or beyond `start_index`.
+///
+/// # Panics
+/// Panics if locking `store` fails, or if a non-zero `start_index` is out of range for
+/// the store's current match list.
+fn deduplicate_new_matches(
+    store: &Arc<Mutex<FindingsStore>>,
+    global_args: &global::GlobalArgs,
+    args: &scan::ScanArgs,
+    start_index: usize,
+) -> Result<()> {
+    if args.no_dedup {
        return Ok(());
    }

-    let deduplicate_new_matches =
-        |store: &Arc<Mutex<FindingsStore>>, start_index: usize| -> Result<()> {
-            if args.no_dedup {
-                return Ok(());
-            }
+    let reporter = crate::reporter::DetailsReporter {
+        datastore: Arc::clone(store),
+        styles: Styles::new(global_args.use_color(std::io::stdout())),
+        only_valid: args.only_valid,
+    };

-            let reporter = crate::reporter::DetailsReporter {
-                datastore: Arc::clone(store),
-                styles: Styles::new(global_args.use_color(std::io::stdout())),
-                only_valid: args.only_valid,
-            };
+    let all_matches = reporter.get_unfiltered_matches(Some(false))?;
+    if start_index >= all_matches.len() {
+        return Ok(());
+    }

-            let all_matches = reporter.get_unfiltered_matches(Some(false))?;
-            if start_index >= all_matches.len() {
-                return Ok(());
-            }
+    // When deduplicating from the beginning, hand over the whole list instead of copying it.
+    let slice = if start_index == 0 { all_matches } else { all_matches[start_index..].to_vec() };
+    let deduped_matches = reporter.deduplicate_matches(slice, args.no_dedup);

-            let deduped_matches =
-                reporter.deduplicate_matches(all_matches[start_index..].to_vec(), args.no_dedup);
+    let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
+        .into_iter()
+        .map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
+        .collect();

-            let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
-                .into_iter()
-                .map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
-                .collect();
-
-            let mut ds = store.lock().unwrap();
-            let mut preserved = ds.get_matches()[..start_index].to_vec();
-            preserved.extend(deduped_arcs);
-            ds.replace_matches(preserved);
-            Ok(())
-        };
-
-    deduplicate_new_matches(&datastore, 0)?;
+    // NOTE(review): assumes indices into the reporter's match list line up with the
+    // store's own list; confirm against `get_unfiltered_matches`.
+    let mut ds = store.lock().unwrap();
+    if start_index == 0 {
+        ds.replace_matches(deduped_arcs);
+    } else {
+        let mut preserved = ds.get_matches()[..start_index].to_vec();
+        preserved.extend(deduped_arcs);
+        ds.replace_matches(preserved);
+    }
+    Ok(())
+}

+/// Applies baseline filtering if configured.
+///
+/// Does nothing unless `args.baseline_file` is set or `args.manage_baseline` is true. When
+/// active, locks `datastore` and calls `crate::baseline::apply_baseline` with
+/// `baseline_path`, the `manage_baseline` flag, and `roots`, propagating its error.
+///
+/// # Panics
+/// Panics if locking `datastore` fails.
+fn apply_baseline_if_configured(
+    args: &scan::ScanArgs,
+    datastore: &Arc<Mutex<FindingsStore>>,
+    baseline_path: &std::path::Path,
+    roots: &[PathBuf],
+) -> Result<()> {
    if args.baseline_file.is_some() || args.manage_baseline {
        let mut ds = datastore.lock().unwrap();
-        crate::baseline::apply_baseline(
-            &mut ds,
-            baseline_path.as_ref(),
-            args.manage_baseline,
-            &repo_roots,
+        crate::baseline::apply_baseline(&mut ds, baseline_path, args.manage_baseline, roots)?;
+    }
+    Ok(())
+}
+
+/// Runs secret validation over matches in `datastore`, if validation is enabled.
+///
+/// Does nothing and returns `Ok(())` when `validation_deps` is `None`. Otherwise forwards
+/// the shared dependencies, `args.num_jobs`, `match_range`, the optional access-map
+/// collector, and the timeout/retry settings from `args` to `run_secret_validation`.
+///
+/// # Errors
+/// Propagates any error returned by `run_secret_validation`.
+async fn run_validation_phase(
+    datastore: &Arc<Mutex<FindingsStore>>,
+    validation_deps: &Option<ValidationDeps>,
+    args: &scan::ScanArgs,
+    match_range: Option<std::ops::Range<usize>>,
+    access_map_collector: Option<AccessMapCollector>,
+) -> Result<()> {
+    if let Some(validation) = validation_deps {
+        // Borrow the tuple inside the `Arc` and bind each field by reference.
+        let (parser, clients, cache, rate_limiter) = &**validation;
+        run_secret_validation(
+            Arc::clone(datastore),
+            parser,
+            clients,
+            cache,
+            args.num_jobs,
+            match_range,
+            access_map_collector,
+            rate_limiter.clone(),
+            Duration::from_secs(args.validation_timeout),
+            args.validation_retries,
+        )
+        .await?;
+    }
+    Ok(())
+}
+
+// =================================================================================================
+// Sequential scan path
+// =================================================================================================
+
+/// Scans every input into the one shared `datastore`, then post-processes and reports once.
+///
+/// Steps, in order: enumerate the already-known `input_roots`; enumerate each path received
+/// on `repo_rx` as it arrives and append it to `input_roots`; join the clone thread;
+/// deduplicate; apply the baseline if configured; run validation; finalize the access map
+/// if a collector is present; run the reporter; print the scan summary.
+///
+/// # Errors
+/// Returns the first error from enumeration, deduplication, baseline handling, validation,
+/// access-map finalization, or reporting.
+#[allow(clippy::too_many_arguments)]
+async fn run_sequential_scan(
+    args: &scan::ScanArgs,
+    global_args: &global::GlobalArgs,
+    datastore: &Arc<Mutex<FindingsStore>>,
+    rules_db: &RulesDatabase,
+    input_roots: &mut Vec<PathBuf>,
+    repo_rx: crossbeam_channel::Receiver<PathBuf>,
+    repo_clone_handle: Option<std::thread::JoinHandle<()>>,
+    shared_profiler: &Arc<ConcurrentRuleProfiler>,
+    enable_profiling: bool,
+    matcher_stats: &Arc<Mutex<MatcherStats>>,
+    baseline_path: &Arc<PathBuf>,
+    validation_deps: &Option<ValidationDeps>,
+    access_map_collector: &mut Option<AccessMapCollector>,
+    progress_enabled: bool,
+    start_time: Instant,
+    scan_started_at: chrono::DateTime<chrono::Local>,
+    update_status: &crate::update::UpdateStatus,
+) -> Result<()> {
+    let mut streamed_roots = Vec::new();
+    if !input_roots.is_empty() {
+        let _inputs = enumerate_filesystem_inputs(
+            args,
+            datastore.clone(),
+            input_roots,
+            progress_enabled,
+            rules_db,
+            enable_profiling,
+            Arc::clone(shared_profiler),
+            matcher_stats.as_ref(),
        )?;
    }

-    if let Some(validation) = &validation_deps {
+    // Each received path is scanned on its own, then remembered for the baseline step.
+    for repo_root in repo_rx.iter() {
+        enumerate_filesystem_inputs(
+            args,
+            datastore.clone(),
+            &[repo_root.clone()],
+            progress_enabled,
+            rules_db,
+            enable_profiling,
+            Arc::clone(shared_profiler),
+            matcher_stats.as_ref(),
+        )?;
+        streamed_roots.push(repo_root);
+    }
+    input_roots.extend(streamed_roots);
+
+    if let Some(handle) = repo_clone_handle {
+        // The join result is discarded.
+        let _ = handle.join();
+    }
+
+    deduplicate_new_matches(datastore, global_args, args, 0)?;
+    apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), input_roots)?;
+
+    // Validation gets a clone of the collector; the original is consumed by `take()` below.
+    run_validation_phase(datastore, validation_deps, args, None, access_map_collector.clone())
+        .await?;
+
+    if let Some(collector) = access_map_collector.take() {
+        finalize_access_map(datastore, collector, args).await?;
+    }
+
+    crate::reporter::run(global_args, Arc::clone(datastore), args)
+        .context("Failed to run report command")?;
+    print_scan_summary(
+        start_time,
+        scan_started_at,
+        datastore,
+        global_args,
+        args,
+        rules_db,
+        matcher_stats.as_ref(),
+        if enable_profiling { Some(shared_profiler.as_ref()) } else { None },
+        update_status,
+        None,
+        None,
+    );
+    maybe_hint_access_map(datastore, args);
+    Ok(())
+}
+
+// =================================================================================================
+// Parallel scan path
+// =================================================================================================
+
+#[allow(clippy::too_many_arguments)]
+async fn run_parallel_scan(
+    args: &scan::ScanArgs,
+    global_args: &global::GlobalArgs,
+    datastore: &Arc<Mutex<FindingsStore>>,
+    rules_db: &RulesDatabase,
+    repo_roots: &[PathBuf],
+    repo_rx: crossbeam_channel::Receiver<PathBuf>,
+    repo_clone_handle: Option<std::thread::JoinHandle<()>>,
+    shared_profiler: &Arc<ConcurrentRuleProfiler>,
+    enable_profiling: bool,
+    matcher_stats: &Arc<Mutex<MatcherStats>>,
+    baseline_path: &Arc<PathBuf>,
+    validation_deps: &Option<ValidationDeps>,
+    access_map_collector: &mut Option<AccessMapCollector>,
+    progress_enabled: bool,
+    start_time: Instant,
+    scan_started_at: chrono::DateTime<chrono::Local>,
+    update_status: &crate::update::UpdateStatus,
+) -> Result<()> {
+    deduplicate_new_matches(datastore, global_args, args, 0)?;
+    apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), repo_roots)?;
+
+    // Validate initial (non-repo) matches
+    if let Some(validation) = validation_deps {
        let (parser, clients, cache, rate_limiter) =
            (&validation.0, &validation.1, &validation.2, &validation.3);
        let initial_match_count = { datastore.lock().unwrap().get_matches().len() };
        if initial_match_count > 0 {
            run_secret_validation(
-                Arc::clone(&datastore),
+                Arc::clone(datastore),
                parser,
                clients,
                cache,
@ -458,6 +634,7 @@ pub async fn run_async_scan(
        }
    }

+    // Parallel per-repo scanning
    let repo_concurrency = std::cmp::max(1, args.num_jobs);
    let rt_handle = Handle::current();

@ -476,16 +653,16 @@ pub async fn run_async_scan(
            let spawn_repo_scan = |root: PathBuf| {
                let repo_rules = repo_rules.clone();
                let base_clone_root = base_clone_root.clone();
-                let baseline_path = Arc::clone(&baseline_path);
-                let shared_profiler = Arc::clone(&shared_profiler);
+                let baseline_path = Arc::clone(baseline_path);
+                let shared_profiler = Arc::clone(shared_profiler);
                let args = args.clone();
                let root = root.clone();
                let validation_deps = validation_deps.clone();
-                let matcher_stats = Arc::clone(&matcher_stats);
+                let matcher_stats = Arc::clone(matcher_stats);
                let rt_handle = rt_handle.clone();
                let ran_repo_scan = Arc::clone(&ran_repo_scan);
                let repo_errors = Arc::clone(&repo_errors);
-                let datastore = Arc::clone(&datastore);
+                let datastore = Arc::clone(datastore);
                let access_map = access_map_collector.clone();

                scope.spawn(move |_| {
@ -509,7 +686,9 @@ pub async fn run_async_scan(
                            Arc::clone(&shared_profiler),
                            &repo_matcher_stats,
                        )
-                        .and_then(|_| deduplicate_new_matches(&repo_datastore, 0))?;
+                        .and_then(|_| {
+                            deduplicate_new_matches(&repo_datastore, global_args, &args, 0)
+                        })?;

                        if args.baseline_file.is_some() || args.manage_baseline {
                            let mut ds = repo_datastore.lock().unwrap();
@ -568,11 +747,11 @@ pub async fn run_async_scan(
                });
            };

-            for root in repo_roots.clone() {
+            for root in repo_roots.iter().cloned() {
                spawn_repo_scan(root);
            }

-            for root in repo_rx.clone().iter() {
+            for root in repo_rx.iter() {
                spawn_repo_scan(root);
            }
        });
@ -586,51 +765,27 @@ pub async fn run_async_scan(
    }

    if output_to_file && ran_repo_scan.load(Ordering::Relaxed) {
-        crate::reporter::run(global_args, Arc::clone(&datastore), args)
+        crate::reporter::run(global_args, Arc::clone(datastore), args)
            .context("Failed to run report command")?;
    }

    if !ran_repo_scan.load(Ordering::Relaxed) {
-        deduplicate_new_matches(&datastore, 0)?;
+        deduplicate_new_matches(datastore, global_args, args, 0)?;
+        apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), repo_roots)?;

-        if args.baseline_file.is_some() || args.manage_baseline {
-            let mut ds = datastore.lock().unwrap();
-            crate::baseline::apply_baseline(
-                &mut ds,
-                baseline_path.as_ref(),
-                args.manage_baseline,
-                &repo_roots,
-            )?;
-        }
-
-        if let Some(validation) = &validation_deps {
-            let (parser, clients, cache, rate_limiter) =
-                (&validation.0, &validation.1, &validation.2, &validation.3);
-            run_secret_validation(
-                Arc::clone(&datastore),
-                parser,
-                clients,
-                cache,
-                args.num_jobs,
-                None,
-                access_map_collector.clone(),
-                rate_limiter.clone(),
-                Duration::from_secs(args.validation_timeout),
-                args.validation_retries,
-            )
+        run_validation_phase(datastore, validation_deps, args, None, access_map_collector.clone())
            .await?;
-        }

        if let Some(collector) = access_map_collector.take() {
-            finalize_access_map(&datastore, collector, args).await?;
+            finalize_access_map(datastore, collector, args).await?;
        }

-        crate::reporter::run(global_args, Arc::clone(&datastore), args)
+        crate::reporter::run(global_args, Arc::clone(datastore), args)
            .context("Failed to run report command")?;
    }

    let aggregate_summary = if ran_repo_scan.load(Ordering::Relaxed) {
-        let totals = compute_scan_totals(&datastore, args, matcher_stats.as_ref());
+        let totals = compute_scan_totals(datastore, args, matcher_stats.as_ref());
        let mut sorted: Vec<_> = datastore.lock().unwrap().get_summary().into_iter().collect();
        sorted.sort_by(|a, b| b.1.cmp(&a.1));
        Some((totals, sorted))
@ -641,7 +796,7 @@ pub async fn run_async_scan(
    print_scan_summary(
        start_time,
        scan_started_at,
-        &datastore,
+        datastore,
        global_args,
        args,
        rules_db,
@ -652,14 +807,18 @@ pub async fn run_async_scan(
        aggregate_summary,
    );

-    if let Some(collector) = access_map_collector {
-        finalize_access_map(&datastore, collector, args).await?;
+    if let Some(collector) = access_map_collector.take() {
+        finalize_access_map(datastore, collector, args).await?;
    } else {
-        maybe_hint_access_map(&datastore, args);
+        maybe_hint_access_map(datastore, args);
    }
    Ok(())
 }

+// =================================================================================================
+// Existing helper functions (unchanged)
+// =================================================================================================
+
 async fn finalize_access_map(
    datastore: &Arc<Mutex<FindingsStore>>,
    collector: AccessMapCollector,
@ -780,8 +939,6 @@ pub fn create_datastore_channel(
 ) {
    const BATCH_SIZE: usize = 1024;
    let channel_size = std::cmp::max(num_jobs * BATCH_SIZE, 16 * BATCH_SIZE);
-    // const BATCH_SIZE: usize = 256;
-    // let channel_size = std::cmp::max(num_jobs * BATCH_SIZE, 4096);
    crossbeam_channel::bounded(channel_size)
 }

@ -864,7 +1021,6 @@ pub fn load_and_record_rules(
 ) -> Result<RulesDatabase> {
    let init_progress =
        if use_progress { ProgressBar::new_spinner() } else { ProgressBar::hidden() };
-    // init_progress.set_message("Compiling rules...");
    let rules_db = {
        let loaded = RuleLoader::from_rule_specifiers(&args.rules)
            .load(args)
@ -876,7 +1032,6 @@ pub fn load_and_record_rules(
            .cloned()
            .map(|mut rule| {
                if let Some(min_entropy) = args.min_entropy {
-                    // rule.syntax.min_entropy = min_entropy;
                    let _ = rule.set_entropy(min_entropy);
                }
                rule