From d3e659491d3314349fc2e169350aab05759a1dc8 Mon Sep 17 00:00:00 2001
From: Mick Grove <mick.grove@mongodb.com>
Date: Sat, 14 Feb 2026 13:12:26 -0800
Subject: [PATCH] Refactor: extract shared scanner primitives and split large modules

---
 crates/kingfisher-core/Cargo.toml           |   3 +
 crates/kingfisher-core/src/content_type.rs  | 232 ++++++
 crates/kingfisher-core/src/lib.rs           |   2 +
 crates/kingfisher-scanner/src/lib.rs        |   1 +
 crates/kingfisher-scanner/src/primitives.rs | 187 +++++
 crates/kingfisher-scanner/src/scanner.rs    | 173 +----
 src/access_map.rs                           | 164 +++-
 src/azure.rs                                |  91 +--
 src/bitbucket.rs                            |  86 +-
 src/content_type.rs                         | 234 +-----
 src/git_host.rs                             | 121 +++
 src/gitea.rs                                |  80 +-
 src/github.rs                               |  82 +-
 src/gitlab.rs                               |  82 +-
 src/lib.rs                                  |   1 +
 src/matcher/base64_decode.rs                |   2 +
 src/matcher/captures.rs                     | 140 ++++
 src/matcher/conversion.rs                   | 251 ++++++
 src/matcher/dedup.rs                        |   2 +
 src/matcher/filter.rs                       | 274 +++++++
 src/matcher/fingerprint.rs                  |   2 +
 src/{matcher.rs => matcher/mod.rs}          | 777 ++----------------
 src/scanner/runner.rs                       | 821 ++++++++++++--------
 23 files changed, 1924 insertions(+), 1884 deletions(-)
 create mode 100644 crates/kingfisher-core/src/content_type.rs
 create mode 100644 crates/kingfisher-scanner/src/primitives.rs
 create mode 100644 src/git_host.rs
 create mode 100644 src/matcher/base64_decode.rs
 create mode 100644 src/matcher/captures.rs
 create mode 100644 src/matcher/conversion.rs
 create mode 100644 src/matcher/dedup.rs
 create mode 100644 src/matcher/filter.rs
 create mode 100644 src/matcher/fingerprint.rs
 rename src/{matcher.rs => matcher/mod.rs} (58%)

diff --git a/crates/kingfisher-core/Cargo.toml b/crates/kingfisher-core/Cargo.toml
index ef532f3..d8a0902 100644
--- a/crates/kingfisher-core/Cargo.toml
+++ b/crates/kingfisher-core/Cargo.toml
@@ -44,5 +44,8 @@ gix = { version = "0.73", default-features = false, features = ["serde"] }
 # Console formatting
 console = "0.15"
 
+# Language detection for content types
+tokei = "14.0.0"
+
 [dev-dependencies]
 pretty_assertions = "1.4"
diff --git a/crates/kingfisher-core/src/content_type.rs b/crates/kingfisher-core/src/content_type.rs
new file mode 100644
index 0000000..611badb
--- /dev/null
+++ b/crates/kingfisher-core/src/content_type.rs
@@ -0,0 +1,232 @@
+use once_cell::sync::Lazy;
+use std::path::Path;
+use tokei::LanguageType;
+
+// Precompute all (shebang_prefix_bytes, language) pairs once.
+// Sort longest-first so more specific shebangs win.
+static SHEBANG_PREFIXES: Lazy<Vec<(&'static [u8], LanguageType)>> = Lazy::new(|| {
+    let mut v = Vec::new();
+    for &(lang, shebangs) in LanguageType::list() {
+        for &sb in shebangs {
+            v.push((sb.as_bytes(), lang));
+        }
+    }
+    // Longest prefix first to prefer e.g. "#!/usr/bin/env python3" over "#!/usr/bin/env python"
+    v.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
+    v
+});
+
+/// The type of content detected.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ContentType {
+    /// Unprintable or control-heavy data.
+    BINARY,
+    /// Mostly printable text.
+    TEXT,
+}
+
+/// Heuristic thresholds for text vs. binary detection.
+pub struct ContentInspector {
+    max_null_bytes: usize,
+    max_control_ratio: f64,
+}
+
+impl Default for ContentInspector {
+    fn default() -> Self {
+        Self { max_null_bytes: 4, max_control_ratio: 0.3 }
+    }
+}
+
+impl ContentInspector {
+    /// Create a new inspector with default thresholds.
+    #[inline]
+    pub fn new() -> Self {
+        Default::default()
+    }
+
+    /// Classify `bytes` as TEXT or BINARY:
+    ///
+    /// 1. If null-byte count > `max_null_bytes` → `BINARY`.
+    /// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` →
+    ///    `BINARY`.
+    /// 3. Otherwise → `TEXT`.
+    #[inline]
+    #[must_use]
+    pub fn inspect(&self, bytes: &[u8]) -> ContentType {
+        let nulls = bytes.iter().filter(|&&b| b == 0).count();
+        if nulls > self.max_null_bytes {
+            return ContentType::BINARY;
+        }
+        let controls =
+            bytes.iter().filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t')).count();
+        let ratio = if bytes.is_empty() { 0.0 } else { controls as f64 / bytes.len() as f64 };
+        if ratio > self.max_control_ratio {
+            ContentType::BINARY
+        } else {
+            ContentType::TEXT
+        }
+    }
+
+    /// Guess MIME type from `path` extension.
+    ///
+    /// Returns:
+    /// - `Some(mime)` if the extension is one of the known text, application, or image types.
+    /// - `None` if there is no extension or it’s unrecognized.
+    #[inline]
+    #[must_use]
+    pub fn guess_mime_type(&self, path: &Path) -> Option<String> {
+        let ext = path.extension()?.to_str()?.to_ascii_lowercase();
+        let mime = match ext.as_str() {
+            "txt" | "md" | "rst" => "text/plain",
+            "html" | "htm" => "text/html",
+            "css" => "text/css",
+            "js" => "application/javascript",
+            "json" => "application/json",
+            "xml" => "application/xml",
+            "pdf" => "application/pdf",
+            "jpg" | "jpeg" => "image/jpeg",
+            "png" => "image/png",
+            "gif" => "image/gif",
+            _ => return None,
+        };
+        Some(mime.to_string())
+    }
+
+    /// Detect UTF-8 encoding by attempting a lossless conversion.
+    #[inline]
+    #[must_use]
+    pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
+        std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
+    }
+    /// Guess programming language with broad coverage using `tokei`.
+    ///
+    /// Strategy (no disk I/O):
+    /// 1) Try extension via `LanguageType::from_file_extension`.
+    /// 2) Handle common extensionless filenames (e.g., Makefile, Dockerfile, CMakeLists.txt).
+    /// 3) Parse an in-memory shebang (first line) against `LanguageType::shebangs`.
+    /// 4) Minimal content markers as a last resort.
+    ///
+    /// Returns the canonical `tokei` language name (e.g., `Rust`, `Bash`, `Python`).
+    #[inline]
+    #[must_use]
+    pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option<String> {
+        // 1) Extension mapping (fast, no I/O).
+        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
+            if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) {
+                return Some(lang.name().to_string());
+            }
+        }
+
+        // 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure).
+        if let Some(file) = path.file_name().and_then(|f| f.to_str()) {
+            match file {
+                "Makefile" | "makefile" => {
+                    return Some(LanguageType::Makefile.name().to_string());
+                }
+                "Dockerfile" | "dockerfile" => {
+                    return Some(LanguageType::Dockerfile.name().to_string());
+                }
+                "CMakeLists.txt" => {
+                    return Some(LanguageType::CMake.name().to_string());
+                }
+                "Rakefile" | "rakefile" => {
+                    return Some(LanguageType::Rakefile.name().to_string());
+                }
+                // Common ecosystem files; map to their primary language where sensible.
+                "Gemfile" | "gemfile" => {
+                    return Some(LanguageType::Ruby.name().to_string());
+                }
+                _ => {}
+            }
+        }
+
+        // 3) Shebang detection (in-memory): match by longest prefix, byte-wise (no UTF-8 needed).
+        if let Some(first_line) = content.split(|&b| b == b'\n').next() {
+            if first_line.starts_with(b"#!") {
+                for (prefix, lang) in SHEBANG_PREFIXES.iter() {
+                    if first_line.starts_with(prefix) {
+                        return Some(lang.name().to_string());
+                    }
+                }
+            }
+        }
+
+        // 4) Lightweight content markers to catch a few ubiquitous cases without I/O.
+        let s = String::from_utf8_lossy(content);
+        if s.contains("<?php") {
+            return Some(LanguageType::Php.name().to_string());
+        }
+        if s.contains("package main") {
+            return Some(LanguageType::Go.name().to_string());
+        }
+        if s.contains("public class") {
+            return Some(LanguageType::Java.name().to_string());
+        }
+        if s.contains("#!/usr/bin/env bash") || s.contains("#!/bin/bash") {
+            return Some(LanguageType::Bash.name().to_string());
+        }
+        if s.contains("#!/usr/bin/env python") {
+            return Some(LanguageType::Python.name().to_string());
+        }
+
+        None
+    }
+}
+
+/// Shorthand: inspect with default thresholds.
+#[inline]
+#[must_use]
+pub fn inspect(bytes: &[u8]) -> ContentType {
+    ContentInspector::default().inspect(bytes)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::path::PathBuf;
+
+    use super::*;
+
+    #[test]
+    fn binary_vs_text() {
+        let ins = ContentInspector::default();
+        let bin = vec![0, 1, 2, 0, 0, 0, 5];
+        assert_eq!(ins.inspect(&bin), ContentType::BINARY);
+        let txt = b"Hello\nWorld";
+        assert_eq!(ins.inspect(txt), ContentType::TEXT);
+    }
+
+    #[test]
+    fn mime_guess() {
+        let ins = ContentInspector::default();
+        assert_eq!(ins.guess_mime_type(&PathBuf::from("a.md")), Some("text/plain".into()));
+        assert_eq!(ins.guess_mime_type(&PathBuf::from("img.png")), Some("image/png".into()));
+        assert_eq!(ins.guess_mime_type(&PathBuf::from("x.xyz")), None);
+    }
+
+    #[test]
+    fn charset_guess() {
+        let ins = ContentInspector::default();
+        assert_eq!(ins.guess_charset("ok".as_bytes()), Some("UTF-8".into()));
+        assert_eq!(ins.guess_charset(&[0xFF, 0xFE, 0xFD]), None);
+    }
+
+    #[test]
+    fn language_guess() {
+        let ins = ContentInspector::default();
+
+        // Compare case-insensitively by lowercasing both sides.
+        let rust =
+            ins.guess_language(&PathBuf::from("main.rs"), b"").map(|s| s.to_ascii_lowercase());
+        assert_eq!(rust, Some("rust".into()));
+
+        let php = ins
+            .guess_language(&PathBuf::from("x"), b"<?php echo; ?>")
+            .map(|s| s.to_ascii_lowercase());
+        assert_eq!(php, Some("php".into()));
+
+        let bash = ins
+            .guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi")
+            .map(|s| s.to_ascii_lowercase());
+        assert_eq!(bash, Some("bash".into()));
+    }
+}
diff --git a/crates/kingfisher-core/src/lib.rs b/crates/kingfisher-core/src/lib.rs
index dc964f3..f656ea2 100644
--- a/crates/kingfisher-core/src/lib.rs
+++ b/crates/kingfisher-core/src/lib.rs
@@ -9,6 +9,7 @@
 
 pub mod blob;
 pub mod bstring_escape;
+pub mod content_type;
 pub mod entropy;
 pub mod error;
 pub mod git_commit_metadata;
@@ -20,6 +21,7 @@ pub use blob::{
     Blob, BlobAppearance, BlobAppearanceSet, BlobData, BlobId, BlobIdMap, BlobMetadata,
 };
 pub use bstring_escape::Escaped;
+pub use content_type::{ContentInspector, ContentType};
 pub use entropy::calculate_shannon_entropy;
 pub use error::{Error, Result};
 pub use git_commit_metadata::CommitMetadata;
diff --git a/crates/kingfisher-scanner/src/lib.rs b/crates/kingfisher-scanner/src/lib.rs
index 620cb61..f6976ea 100644
--- a/crates/kingfisher-scanner/src/lib.rs
+++ b/crates/kingfisher-scanner/src/lib.rs
@@ -52,6 +52,7 @@
 //! - **validation-all**: Enable all validation features
 
 mod finding;
+pub mod primitives;
 mod scanner;
 mod scanner_pool;
 
diff --git a/crates/kingfisher-scanner/src/primitives.rs b/crates/kingfisher-scanner/src/primitives.rs
new file mode 100644
index 0000000..a9e11fe
--- /dev/null
+++ b/crates/kingfisher-scanner/src/primitives.rs
@@ -0,0 +1,187 @@
+//! Shared matching primitives for secret detection.
+//!
+//! These functions are used by both the high-level `Scanner` API and the
+//! binary crate's `Matcher`. Having a single canonical implementation
+//! eliminates duplicated logic across the codebase.
+
+use std::hash::{Hash, Hasher};
+
+use base64::{engine::general_purpose, Engine};
+use kingfisher_core::OffsetSpan;
+use rustc_hash::{FxHashMap, FxHasher};
+use xxhash_rust::xxh3::xxh3_64;
+
+// -------------------------------------------------------------------------------------------------
+// Base64 detection
+// -------------------------------------------------------------------------------------------------
+
+/// Decoded Base64 data with position information.
+#[derive(Debug, Clone)]
+pub struct DecodedData {
+    pub decoded: Vec<u8>,
+    pub pos_start: usize,
+    pub pos_end: usize,
+}
+
+#[inline]
+pub fn is_base64_byte(b: u8) -> bool {
+    // Accepts both standard base64 ('+', '/') and URL-safe base64 ('-', '_') characters.
+    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_')
+}
+
+/// Finds standalone Base64-encoded strings in the input and returns decoded data
+/// with byte-offset positions.
+pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
+    let mut results = Vec::new();
+    let mut i = 0;
+    while i < input.len() {
+        while i < input.len() && !is_base64_byte(input[i]) {
+            i += 1;
+        }
+        let start = i;
+        while i < input.len() && is_base64_byte(input[i]) {
+            i += 1;
+        }
+
+        let mut eq_count = 0;
+        while i < input.len() && input[i] == b'=' && eq_count < 2 {
+            i += 1;
+            eq_count += 1;
+        }
+        let end = i;
+
+        let len = end - start;
+        if len >= 32 && len % 4 == 0 {
+            let base64_slice = &input[start..end];
+
+            // Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD
+            let decode_result = general_purpose::STANDARD
+                .decode(base64_slice)
+                .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
+                .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));
+
+            if let Ok(decoded) = decode_result {
+                if decoded.is_ascii() {
+                    results.push(DecodedData { decoded, pos_start: start, pos_end: end });
+                }
+            }
+        }
+    }
+
+    results
+}
+
+// -------------------------------------------------------------------------------------------------
+// Match deduplication
+// -------------------------------------------------------------------------------------------------
+
+/// Computes a deduplication key for a match based on content, rule ID, and span.
+#[inline]
+pub fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
+    let mut hasher = FxHasher::default();
+    // Hash each component directly without allocation
+    content.hash(&mut hasher);
+    rule_id.hash(&mut hasher);
+    start.hash(&mut hasher);
+    end.hash(&mut hasher);
+    hasher.finish()
+}
+
+/// Inserts `span` into `spans`, a list kept sorted by start offset, handling containment.
+///
+/// Returns `false` if the span is already contained in an existing neighbouring span
+/// (i.e., it's redundant and should be skipped); otherwise inserts it and returns `true`.
+#[inline]
+pub fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
+    let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
+    if idx > 0 {
+        if spans[idx - 1].fully_contains(&span) {
+            return false;
+        }
+        if span.fully_contains(&spans[idx - 1]) {
+            spans.remove(idx - 1);
+            idx -= 1;
+        }
+    }
+    if idx < spans.len() && spans[idx].fully_contains(&span) {
+        return false;
+    }
+    // Drop every following span that the new span fully contains, not only the first:
+    // one wide span can cover several earlier matches, and leaving any of them behind
+    // would keep a span nested inside another in the list.
+    let swallowed = spans[idx..].iter().take_while(|&s| span.fully_contains(s)).count();
+    spans.drain(idx..idx + swallowed);
+    spans.insert(idx, span);
+    true
+}
+
+/// Records a match span for a given rule, returning `false` if it's a duplicate.
+#[inline]
+pub fn record_match(
+    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
+    rule_id: usize,
+    span: OffsetSpan,
+) -> bool {
+    insert_span(map.entry(rule_id).or_default(), span)
+}
+
+// -------------------------------------------------------------------------------------------------
+// Finding fingerprint
+// -------------------------------------------------------------------------------------------------
+
+/// Computes a stable fingerprint for a finding based on its value, location, and origin.
+pub fn compute_finding_fingerprint(
+    finding_value: &str,
+    file_or_commit: &str,
+    offset_start: u64,
+    offset_end: u64,
+) -> u64 {
+    // Combine all into a byte buffer and hash it directly:
+    let mut buf = Vec::with_capacity(
+        finding_value.len() + file_or_commit.len() + 2 * std::mem::size_of::<u64>(),
+    );
+    buf.extend_from_slice(finding_value.as_bytes());
+    buf.extend_from_slice(file_or_commit.as_bytes());
+    buf.extend_from_slice(&offset_start.to_le_bytes());
+    buf.extend_from_slice(&offset_end.to_le_bytes());
+
+    xxh3_64(&buf)
+}
+
+// -------------------------------------------------------------------------------------------------
+// Secret capture selection
+// -------------------------------------------------------------------------------------------------
+
+/// Selects the "secret" capture from the regex match using the priority:
+/// 1. Named capture called TOKEN (case-insensitive)
+/// 2. First matched named capture
+/// 3. First positional capture (group 1)
+/// 4. Full match (group 0)
+pub fn find_secret_capture<'a>(
+    re: &regex::bytes::Regex,
+    captures: &regex::bytes::Captures<'a>,
+) -> regex::bytes::Match<'a> {
+    // 1. Prefer a named capture called TOKEN (case-insensitive).
+    if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
+        name_opt.filter(|name| name.eq_ignore_ascii_case("TOKEN")).and_then(|_| captures.get(i))
+    }) {
+        return token_cap;
+    }
+
+    // 2. Otherwise, prefer the first *matched* named capture.
+    if let Some(named_cap) = re
+        .capture_names()
+        .enumerate()
+        .find_map(|(i, name_opt)| name_opt.and_then(|_| captures.get(i)))
+    {
+        return named_cap;
+    }
+
+    // 3. Otherwise, fall back to the first positional capture (group 1).
+    if let Some(pos_cap) = captures.get(1) {
+        return pos_cap;
+    }
+
+    // 4. Finally, fall back to the full match (group 0), which every match has.
+    captures.get(0).expect("capture group 0 is always present for a successful match")
+}
diff --git a/crates/kingfisher-scanner/src/scanner.rs b/crates/kingfisher-scanner/src/scanner.rs
index 9917f26..d3d83d6 100644
--- a/crates/kingfisher-scanner/src/scanner.rs
+++ b/crates/kingfisher-scanner/src/scanner.rs
@@ -5,15 +5,13 @@ use std::path::Path;
 use std::sync::Arc;
 
 use anyhow::Result;
-use base64::{engine::general_purpose, Engine};
 use kingfisher_core::{calculate_shannon_entropy, Blob, BlobIdMap, LocationMapping, OffsetSpan};
 use kingfisher_rules::RulesDatabase;
-use regex::bytes::Regex;
 use rustc_hash::{FxHashMap, FxHashSet};
 use tracing::debug;
-use xxhash_rust::xxh3::xxh3_64;
 
 use crate::finding::{Finding, FindingLocation};
+use crate::primitives;
 use crate::scanner_pool::ScannerPool;
 
 /// Configuration options for the scanner.
@@ -185,21 +183,15 @@ impl Scanner {
             let current_span = OffsetSpan::from_range(start..end);
 
             // Check for overlapping spans
-            if !self.record_span(&mut previous_spans, rule_id, current_span) {
+            if !primitives::record_match(&mut previous_spans, rule_id, current_span) {
                 continue;
             }
 
             let haystack = &bytes[start..end];
 
             for captures in anchored_regex.captures_iter(haystack) {
-                let full_capture = match captures.get(0) {
-                    Some(c) => c,
-                    None => continue,
-                };
-
                 // Get the primary secret value
-                let secret_capture =
-                    self.get_secret_capture(&anchored_regex, &captures, full_capture);
+                let secret_capture = primitives::find_secret_capture(&anchored_regex, &captures);
                 let secret_bytes = secret_capture.as_bytes();
 
                 // Check entropy
@@ -211,7 +203,7 @@ impl Scanner {
                 }
 
                 // Compute match key for dedup
-                let match_key = self.compute_match_key(
+                let match_key = primitives::compute_match_key(
                     secret_bytes,
                     rule.id().as_bytes(),
                     start + secret_capture.start(),
@@ -242,7 +234,7 @@ impl Scanner {
                     }
                 }
 
-                let fingerprint = self.compute_fingerprint(
+                let fingerprint = primitives::compute_finding_fingerprint(
                     &secret,
                     &blob.id().to_string(),
                     offset_span.start as u64,
@@ -295,92 +287,6 @@ impl Scanner {
         // In a real implementation, you'd want to add a clear method or use a different approach
     }
 
-    fn get_secret_capture<'a>(
-        &self,
-        regex: &Regex,
-        captures: &regex::bytes::Captures<'a>,
-        full_capture: regex::bytes::Match<'a>,
-    ) -> regex::bytes::Match<'a> {
-        // Prefer named capture called TOKEN
-        for (i, name_opt) in regex.capture_names().enumerate() {
-            if let Some(name) = name_opt {
-                if name.eq_ignore_ascii_case("TOKEN") {
-                    if let Some(cap) = captures.get(i) {
-                        return cap;
-                    }
-                }
-            }
-        }
-
-        // Otherwise, first named capture
-        for (i, name_opt) in regex.capture_names().enumerate() {
-            if name_opt.is_some() {
-                if let Some(cap) = captures.get(i) {
-                    return cap;
-                }
-            }
-        }
-
-        // Otherwise, first positional capture (group 1)
-        if let Some(cap) = captures.get(1) {
-            return cap;
-        }
-
-        // Fall back to full match
-        full_capture
-    }
-
-    fn record_span(
-        &self,
-        map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
-        rule_id: usize,
-        span: OffsetSpan,
-    ) -> bool {
-        let spans = map.entry(rule_id).or_default();
-
-        // Binary search for insertion point
-        let idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
-
-        // Check if new span is contained in an existing one
-        if idx > 0 && spans[idx - 1].fully_contains(&span) {
-            return false;
-        }
-        if idx < spans.len() && spans[idx].fully_contains(&span) {
-            return false;
-        }
-
-        // Remove spans that the new span contains
-        let remove_idx = idx;
-        while remove_idx < spans.len() && span.fully_contains(&spans[remove_idx]) {
-            spans.remove(remove_idx);
-        }
-        if idx > 0 && span.fully_contains(&spans[idx - 1]) {
-            spans.remove(idx - 1);
-        }
-
-        spans.insert(idx.min(spans.len()), span);
-        true
-    }
-
-    fn compute_match_key(&self, content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
-        use std::hash::{Hash, Hasher};
-        let mut hasher = rustc_hash::FxHasher::default();
-        content.hash(&mut hasher);
-        rule_id.hash(&mut hasher);
-        start.hash(&mut hasher);
-        end.hash(&mut hasher);
-        hasher.finish()
-    }
-
-    fn compute_fingerprint(&self, value: &str, blob_id: &str, start: u64, end: u64) -> u64 {
-        let mut buf = Vec::with_capacity(value.len() + blob_id.len() + 16);
-        buf.extend_from_slice(value.as_bytes());
-        buf.extend_from_slice(blob_id.as_bytes());
-        buf.extend_from_slice(&start.to_le_bytes());
-        buf.extend_from_slice(&end.to_le_bytes());
-        xxh3_64(&buf)
-    }
-
     fn redact(&self, bytes: &[u8]) -> String {
         let s = String::from_utf8_lossy(bytes);
         if s.len() <= 8 {
@@ -400,7 +306,7 @@ impl Scanner {
         let bytes = blob.bytes();
 
         // Find Base64-encoded strings
-        let b64_items = self.find_base64_strings(bytes);
+        let b64_items = primitives::get_base64_strings(bytes);
 
         for item in b64_items {
             // Try to match decoded content against all rules
@@ -411,12 +317,7 @@ impl Scanner {
                 };
 
                 for captures in regex.captures_iter(&item.decoded) {
-                    let full_capture = match captures.get(0) {
-                        Some(c) => c,
-                        None => continue,
-                    };
-
-                    let secret_capture = self.get_secret_capture(&regex, &captures, full_capture);
+                    let secret_capture = primitives::find_secret_capture(&regex, &captures);
                     let secret_bytes = secret_capture.as_bytes();
 
                     let min_entropy =
@@ -426,7 +327,7 @@ impl Scanner {
                         continue;
                     }
 
-                    let match_key = self.compute_match_key(
+                    let match_key = primitives::compute_match_key(
                         secret_bytes,
                         rule.id().as_bytes(),
                         item.pos_start,
@@ -455,7 +356,7 @@ impl Scanner {
                         }
                     }
 
-                    let fingerprint = self.compute_fingerprint(
+                    let fingerprint = primitives::compute_finding_fingerprint(
                         &secret,
                         &blob.id().to_string(),
                         offset_span.start as u64,
@@ -488,62 +389,6 @@ impl Scanner {
 
         findings
     }
-
-    fn find_base64_strings(&self, input: &[u8]) -> Vec<DecodedData> {
-        let mut results = Vec::new();
-        let mut i = 0;
-
-        while i < input.len() {
-            // Skip non-base64 characters
-            while i < input.len() && !Self::is_base64_byte(input[i]) {
-                i += 1;
-            }
-            let start = i;
-
-            // Collect base64 characters
-            while i < input.len() && Self::is_base64_byte(input[i]) {
-                i += 1;
-            }
-
-            // Handle padding
-            let mut eq_count = 0;
-            while i < input.len() && input[i] == b'=' && eq_count < 2 {
-                i += 1;
-                eq_count += 1;
-            }
-            let end = i;
-
-            let len = end - start;
-            if len >= 32 && len % 4 == 0 {
-                let base64_slice = &input[start..end];
-
-                // Try decoding
-                let decode_result = general_purpose::STANDARD
-                    .decode(base64_slice)
-                    .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
-                    .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));
-
-                if let Ok(decoded) = decode_result {
-                    if decoded.is_ascii() {
-                        results.push(DecodedData { decoded, pos_start: start, pos_end: end });
-                    }
-                }
-            }
-        }
-
-        results
-    }
-
-    #[inline]
-    fn is_base64_byte(b: u8) -> bool {
-        matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_')
-    }
-}
-
-struct DecodedData {
-    decoded: Vec<u8>,
-    pos_start: usize,
-    pos_end: usize,
 }
 
 #[cfg(test)]
diff --git a/src/access_map.rs b/src/access_map.rs
index dc26d9f..eb07647 100644
--- a/src/access_map.rs
+++ b/src/access_map.rs
@@ -18,6 +18,22 @@ pub(crate) mod postgres;
 mod report;
 mod slack;
 
+/// Trait for access map providers that map a single token to an access profile.
+///
+/// This covers the majority of providers (GitHub, GitLab, Slack, HuggingFace,
+/// Gitea, Bitbucket). Providers with more complex credentials (AWS, GCP, Azure,
+/// Postgres, MongoDB) use their own custom interfaces.
+pub trait TokenAccessMapper: Send + Sync {
+    /// The cloud/platform name for results (e.g., `"github"`, `"slack"`).
+    fn cloud_name(&self) -> &'static str;
+
+    /// Maps a single token to an access map result.
+    fn map_access_from_token(
+        &self,
+        token: &str,
+    ) -> impl std::future::Future<Output = Result<AccessMapResult>> + Send;
+}
+
 /// Run the identity mapping workflow for the selected cloud provider.
 pub async fn run(args: AccessMapArgs) -> Result<()> {
     let result = match args.provider {
@@ -244,24 +260,15 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
                     .unwrap_or_else(|err| build_failed_result("azure_devops", "pat", err)),
                 fingerprint,
             ),
-            AccessMapRequest::Github { token, fingerprint } => (
-                github::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("github", "token", err)),
-                fingerprint,
-            ),
-            AccessMapRequest::Gitlab { token, fingerprint } => (
-                gitlab::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("gitlab", "token", err)),
-                fingerprint,
-            ),
-            AccessMapRequest::Slack { token, fingerprint } => (
-                slack::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("slack", "token", err)),
-                fingerprint,
-            ),
+            AccessMapRequest::Github { token, fingerprint } => {
+                (map_token(&GithubMapper, &token).await, fingerprint)
+            }
+            AccessMapRequest::Gitlab { token, fingerprint } => {
+                (map_token(&GitlabMapper, &token).await, fingerprint)
+            }
+            AccessMapRequest::Slack { token, fingerprint } => {
+                (map_token(&SlackMapper, &token).await, fingerprint)
+            }
             AccessMapRequest::Postgres { uri, fingerprint } => (
                 postgres::map_access_from_uri(&uri)
                     .await
@@ -274,24 +281,15 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
                     .unwrap_or_else(|err| build_failed_result("mongodb", "uri", err)),
                 fingerprint,
             ),
-            AccessMapRequest::HuggingFace { token, fingerprint } => (
-                huggingface::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("huggingface", "token", err)),
-                fingerprint,
-            ),
-            AccessMapRequest::Gitea { token, fingerprint } => (
-                gitea::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("gitea", "token", err)),
-                fingerprint,
-            ),
-            AccessMapRequest::Bitbucket { token, fingerprint } => (
-                bitbucket::map_access_from_token(&token)
-                    .await
-                    .unwrap_or_else(|err| build_failed_result("bitbucket", "token", err)),
-                fingerprint,
-            ),
+            AccessMapRequest::HuggingFace { token, fingerprint } => {
+                (map_token(&HuggingFaceMapper, &token).await, fingerprint)
+            }
+            AccessMapRequest::Gitea { token, fingerprint } => {
+                (map_token(&GiteaMapper, &token).await, fingerprint)
+            }
+            AccessMapRequest::Bitbucket { token, fingerprint } => {
+                (map_token(&BitbucketMapper, &token).await, fingerprint)
+            }
         };
 
         mapped.fingerprint = Some(fp);
@@ -301,12 +299,106 @@ pub async fn map_requests(requests: Vec<AccessMapRequest>) -> Vec<AccessMapResul
     results
 }
 
+/// Maps `token` with `mapper`, converting any error into a failed result for that cloud.
+async fn map_token(mapper: &impl TokenAccessMapper, token: &str) -> AccessMapResult {
+    mapper
+        .map_access_from_token(token)
+        .await
+        .unwrap_or_else(|err| build_failed_result(mapper.cloud_name(), "token", err))
+}
+
 /// Write HTML/JSON outputs for a collection of identity map results.
 pub fn write_reports(results: &[AccessMapResult], html_out: &std::path::Path) -> Result<()> {
     report::generate_html_report_multi(results, html_out)?;
     Ok(())
 }
 
+// -------------------------------------------------------------------------------------------------
+// TokenAccessMapper implementations
+// -------------------------------------------------------------------------------------------------
+
+/// GitHub access mapper.
+pub struct GithubMapper;
+
+impl TokenAccessMapper for GithubMapper {
+    fn cloud_name(&self) -> &'static str {
+        "github"
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        github::map_access_from_token(token).await
+    }
+}
+
+/// GitLab access mapper.
+pub struct GitlabMapper;
+
+impl TokenAccessMapper for GitlabMapper {
+    fn cloud_name(&self) -> &'static str {
+        "gitlab"
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        gitlab::map_access_from_token(token).await
+    }
+}
+
+/// Slack access mapper.
+pub struct SlackMapper;
+
+impl TokenAccessMapper for SlackMapper {
+    fn cloud_name(&self) -> &'static str {
+        "slack"
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        slack::map_access_from_token(token).await
+    }
+}
+
+/// HuggingFace access mapper.
+pub struct HuggingFaceMapper;
+
+impl TokenAccessMapper for HuggingFaceMapper {
+    fn cloud_name(&self) -> &'static str {
+        "huggingface"
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        huggingface::map_access_from_token(token).await
+    }
+}
+
+/// Gitea access mapper.
+pub struct GiteaMapper;
+
+impl TokenAccessMapper for GiteaMapper {
+    fn cloud_name(&self) -> &'static str {
+        "gitea"
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        gitea::map_access_from_token(token).await
+    }
+}
+
+/// Bitbucket access mapper.
+pub struct BitbucketMapper;
+
+impl TokenAccessMapper for BitbucketMapper {
+    fn cloud_name(&self) -> &'static str {
+        "bitbucket"
+    }
+
+    async fn map_access_from_token(&self, token: &str) -> Result<AccessMapResult> {
+        bitbucket::map_access_from_token(token).await
+    }
+}
+
+// -------------------------------------------------------------------------------------------------
+// Helper functions
+// -------------------------------------------------------------------------------------------------
+
 fn severity_to_str(severity: Severity) -> &'static str {
     match severity {
         Severity::Low => "low",
diff --git a/src/azure.rs b/src/azure.rs
index 9a3b6d5..9f2a752 100644
--- a/src/azure.rs
+++ b/src/azure.rs
@@ -13,13 +13,12 @@ use std::{
 // let us opt into newer API versions as Microsoft rolls them out.
 
 use anyhow::{anyhow, Context, Result};
-use globset::{Glob, GlobSet, GlobSetBuilder};
 use indicatif::{ProgressBar, ProgressStyle};
 use serde::Deserialize;
 use tracing::warn;
 use url::{form_urlencoded, Url};
 
-use crate::{findings_store, git_url::GitUrl};
+use crate::{findings_store, git_host, git_url::GitUrl};
 
 const API_VERSION: &str = "7.1-preview.1";
 
@@ -55,33 +54,6 @@ impl RepoSpecifiers {
     }
 }
 
-#[derive(Debug)]
-struct ExcludeMatcher {
-    exact: HashSet<String>,
-    globs: Option<GlobSet>,
-}
-
-impl ExcludeMatcher {
-    fn matches(&self, name: &str) -> bool {
-        let candidate = name.to_lowercase();
-        if self.exact.contains(&candidate) {
-            return true;
-        }
-        if let Some(globs) = &self.globs {
-            return globs.is_match(&candidate);
-        }
-        false
-    }
-
-    fn is_empty(&self) -> bool {
-        self.exact.is_empty() && self.globs.is_none()
-    }
-}
-
-fn looks_like_glob(pattern: &str) -> bool {
-    pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
-}
-
 fn encode_segment(segment: &str) -> String {
     form_urlencoded::byte_serialize(segment.as_bytes()).collect::<String>()
 }
@@ -129,7 +101,7 @@ fn parse_repo_identifier_from_path(path: &str) -> Option<String> {
 
         let org = org.to_lowercase();
         let project_raw = project.to_string();
-        if looks_like_glob(&project_raw) {
+        if git_host::looks_like_glob(&project_raw) {
             let pattern = format!("{org}/{}/**", project_raw.to_lowercase());
             return Some(pattern);
         }
@@ -197,59 +169,16 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
     parse_repo_identifier_from_path(trimmed)
 }
 
-fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
-    let mut exact = HashSet::new();
-    let mut glob_builder = GlobSetBuilder::new();
-    let mut has_glob = false;
-
-    for raw in exclude_repos {
-        match parse_excluded_repo(raw) {
-            Some(name) => {
-                let normalized = name.to_lowercase();
-                if looks_like_glob(&normalized) {
-                    match Glob::new(&normalized) {
-                        Ok(glob) => {
-                            glob_builder.add(glob);
-                            has_glob = true;
-                        }
-                        Err(err) => {
-                            warn!("Ignoring invalid Azure exclusion pattern '{raw}': {err}");
-                            exact.insert(normalized);
-                        }
-                    }
-                } else {
-                    exact.insert(normalized);
-                }
-            }
-            None => {
-                warn!("Ignoring invalid Azure exclusion '{raw}' (expected organization/project[/repository])");
-            }
-        }
-    }
-
-    let globs = if has_glob {
-        match glob_builder.build() {
-            Ok(set) => Some(set),
-            Err(err) => {
-                warn!("Failed to build Azure exclusion patterns: {err}");
-                None
-            }
-        }
-    } else {
-        None
-    };
-
-    ExcludeMatcher { exact, globs }
+fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
+    // The shared matcher is case-sensitive, so patterns are lowercased here.
+    let parse = |raw: &str| parse_excluded_repo(raw).map(|name| name.to_lowercase());
+    git_host::build_exclude_matcher(exclude_repos, parse, "Azure")
 }
 
-fn should_exclude_repo(repo_url: &str, excludes: &ExcludeMatcher) -> bool {
-    if excludes.is_empty() {
-        return false;
-    }
-    if let Some(name) = parse_repo_identifier_from_url(repo_url) {
-        return excludes.matches(&name);
-    }
-    false
+fn should_exclude_repo(repo_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
+    git_host::should_exclude_repo(repo_url, excludes, |url| {
+        parse_repo_identifier_from_url(url).map(|name| name.to_lowercase())
+    })
 }
 
 #[derive(Debug, Deserialize, Default)]
diff --git a/src/bitbucket.rs b/src/bitbucket.rs
index 12edf6c..948cf90 100644
--- a/src/bitbucket.rs
+++ b/src/bitbucket.rs
@@ -6,8 +6,8 @@ use std::{
     time::Duration,
 };
 
+use crate::git_host;
 use anyhow::{Context, Result};
-use globset::{Glob, GlobSet, GlobSetBuilder};
 use indicatif::{ProgressBar, ProgressStyle};
 use reqwest::Url;
 use serde::Deserialize;
@@ -136,32 +136,6 @@ impl BitbucketKind {
     }
 }
 
-#[derive(Debug)]
-struct ExcludeMatcher {
-    exact: HashSet<String>,
-    globs: Option<GlobSet>,
-}
-
-impl ExcludeMatcher {
-    fn matches(&self, name: &str) -> bool {
-        if self.exact.contains(name) {
-            return true;
-        }
-        if let Some(globs) = &self.globs {
-            return globs.is_match(name);
-        }
-        false
-    }
-
-    fn is_empty(&self) -> bool {
-        self.exact.is_empty() && self.globs.is_none()
-    }
-}
-
-fn looks_like_glob(pattern: &str) -> bool {
-    pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
-}
-
 fn normalize_repo_identifier(owner: &str, repo: &str) -> Option<String> {
     let owner = owner.trim().trim_matches('/');
     let repo = repo.trim().trim_matches('/');
@@ -207,58 +181,12 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
     parse_repo_name_from_path(trimmed)
 }
 
-fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
-    let mut exact = HashSet::new();
-    let mut glob_builder = GlobSetBuilder::new();
-    let mut has_glob = false;
-
-    for raw in exclude_repos {
-        match parse_excluded_repo(raw) {
-            Some(name) => {
-                if looks_like_glob(&name) {
-                    match Glob::new(&name) {
-                        Ok(glob) => {
-                            glob_builder.add(glob);
-                            has_glob = true;
-                        }
-                        Err(err) => {
-                            warn!("Ignoring invalid Bitbucket exclusion pattern '{raw}': {err}");
-                            exact.insert(name);
-                        }
-                    }
-                } else {
-                    exact.insert(name);
-                }
-            }
-            None => {
-                warn!("Ignoring invalid Bitbucket exclusion '{raw}' (expected owner/repo)");
-            }
-        }
-    }
-
-    let globs = if has_glob {
-        match glob_builder.build() {
-            Ok(set) => Some(set),
-            Err(err) => {
-                warn!("Failed to build Bitbucket exclusion patterns: {err}");
-                None
-            }
-        }
-    } else {
-        None
-    };
-
-    ExcludeMatcher { exact, globs }
+fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
+    git_host::build_exclude_matcher(exclude_repos, parse_excluded_repo, "Bitbucket")
 }
 
-fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
-    if excludes.is_empty() {
-        return false;
-    }
-    if let Some(name) = parse_repo_name_from_url(clone_url) {
-        return excludes.matches(&name);
-    }
-    false
+fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
+    git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url)
 }
 
 fn repo_clone_url_from_links(links: &[CloneLink]) -> Option<String> {
@@ -343,7 +271,7 @@ async fn fetch_cloud_repositories(
     owner: &str,
     auth: &AuthConfig,
     repo_filter: RepoType,
-    excludes: &ExcludeMatcher,
+    excludes: &git_host::ExcludeMatcher,
     results: &mut Vec<String>,
 ) -> Result<()> {
     let mut next = base
@@ -387,7 +315,7 @@ async fn fetch_server_repositories(
     path: &str,
     auth: &AuthConfig,
     repo_filter: RepoType,
-    excludes: &ExcludeMatcher,
+    excludes: &git_host::ExcludeMatcher,
     results: &mut Vec<String>,
 ) -> Result<()> {
     let mut start = 0u64;
diff --git a/src/content_type.rs b/src/content_type.rs
index 611badb..f2ffbc6 100644
--- a/src/content_type.rs
+++ b/src/content_type.rs
@@ -1,232 +1,2 @@
-use once_cell::sync::Lazy;
-use std::path::Path;
-use tokei::LanguageType;
-
-// Precompute all (shebang_prefix_bytes, language) pairs once.
-// Sort longest-first so more specific shebangs win.
-static SHEBANG_PREFIXES: Lazy<Vec<(&'static [u8], LanguageType)>> = Lazy::new(|| {
-    let mut v = Vec::new();
-    for &(lang, shebangs) in LanguageType::list() {
-        for &sb in shebangs {
-            v.push((sb.as_bytes(), lang));
-        }
-    }
-    // Longest prefix first to prefer e.g. "#!/usr/bin/env python3" over "#!/usr/bin/env python"
-    v.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
-    v
-});
-
-/// The type of content detected.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum ContentType {
-    /// Unprintable or control-heavy data.
-    BINARY,
-    /// Mostly printable text.
-    TEXT,
-}
-
-/// Heuristic thresholds for text vs. binary detection.
-pub struct ContentInspector {
-    max_null_bytes: usize,
-    max_control_ratio: f64,
-}
-
-impl Default for ContentInspector {
-    fn default() -> Self {
-        Self { max_null_bytes: 4, max_control_ratio: 0.3 }
-    }
-}
-
-impl ContentInspector {
-    /// Create a new inspector with default thresholds.
-    #[inline]
-    pub fn new() -> Self {
-        Default::default()
-    }
-
-    /// Classify `bytes` as TEXT or BINARY:
-    ///
-    /// 1. If null-byte count > `max_null_bytes` -- `BINARY`.
-    /// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` →
-    ///    `BINARY`.
-    /// 3. Otherwise,  `TEXT`.
-    #[inline]
-    #[must_use]
-    pub fn inspect(&self, bytes: &[u8]) -> ContentType {
-        let nulls = bytes.iter().filter(|&&b| b == 0).count();
-        if nulls > self.max_null_bytes {
-            return ContentType::BINARY;
-        }
-        let controls =
-            bytes.iter().filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t')).count();
-        let ratio = if bytes.is_empty() { 0.0 } else { controls as f64 / bytes.len() as f64 };
-        if ratio > self.max_control_ratio {
-            ContentType::BINARY
-        } else {
-            ContentType::TEXT
-        }
-    }
-
-    /// Guess MIME type from `path` extension.
-    ///
-    /// Returns:
-    /// - `Some(mime)` if the extension is one of the known text or image types.
-    /// - `None` if there is no extension or it’s unrecognized.
-    #[inline]
-    #[must_use]
-    pub fn guess_mime_type(&self, path: &Path) -> Option<String> {
-        let ext = path.extension()?.to_str()?.to_ascii_lowercase();
-        let mime = match ext.as_str() {
-            "txt" | "md" | "rst" => "text/plain",
-            "html" | "htm" => "text/html",
-            "css" => "text/css",
-            "js" => "application/javascript",
-            "json" => "application/json",
-            "xml" => "application/xml",
-            "pdf" => "application/pdf",
-            "jpg" | "jpeg" => "image/jpeg",
-            "png" => "image/png",
-            "gif" => "image/gif",
-            _ => return None,
-        };
-        Some(mime.to_string())
-    }
-
-    /// Detect UTF-8 encoding by attempting a lossless conversion.
-    #[inline]
-    #[must_use]
-    pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
-        std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
-    }
-    /// Guess programming language with broad coverage using `tokei`.
-    ///
-    /// Strategy (no disk I/O):
-    /// 1) Try extension via `LanguageType::from_file_extension`.
-    /// 2) Handle common extensionless filenames (e.g., Makefile, Dockerfile, CMakeLists.txt).
-    /// 3) Parse an in-memory shebang (first line) against `LanguageType::shebangs`.
-    /// 4) Minimal content markers as a last resort.
-    ///
-    /// Returns the canonical `tokei` language name (e.g., `Rust`, `Bash`, `Python`).
-    #[inline]
-    #[must_use]
-    pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option<String> {
-        // 1) Extension mapping (fast, no I/O).
-        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
-            if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) {
-                return Some(lang.name().to_string());
-            }
-        }
-
-        // 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure).
-        if let Some(file) = path.file_name().and_then(|f| f.to_str()) {
-            match file {
-                "Makefile" | "makefile" => {
-                    return Some(LanguageType::Makefile.name().to_string());
-                }
-                "Dockerfile" | "dockerfile" => {
-                    return Some(LanguageType::Dockerfile.name().to_string());
-                }
-                "CMakeLists.txt" => {
-                    return Some(LanguageType::CMake.name().to_string());
-                }
-                "Rakefile" | "rakefile" => {
-                    return Some(LanguageType::Rakefile.name().to_string());
-                }
-                // Common ecosystem files; map to their primary language where sensible.
-                "Gemfile" | "gemfile" => {
-                    return Some(LanguageType::Ruby.name().to_string());
-                }
-                _ => {}
-            }
-        }
-
-        // 3) Shebang detection (in-memory): match by longest prefix, byte-wise (no UTF-8 needed).
-        if let Some(first_line) = content.split(|&b| b == b'\n').next() {
-            if first_line.starts_with(b"#!") {
-                for (prefix, lang) in SHEBANG_PREFIXES.iter() {
-                    if first_line.starts_with(prefix) {
-                        return Some(lang.name().to_string());
-                    }
-                }
-            }
-        }
-
-        // 4) Lightweight content markers to catch a few ubiquitous cases without I/O.
-        let s = String::from_utf8_lossy(content);
-        if s.contains("<?php") {
-            return Some(LanguageType::Php.name().to_string());
-        }
-        if s.contains("package main") {
-            return Some(LanguageType::Go.name().to_string());
-        }
-        if s.contains("public class") {
-            return Some(LanguageType::Java.name().to_string());
-        }
-        if s.contains("#!/usr/bin/env bash") || s.contains("#!/bin/bash") {
-            return Some(LanguageType::Bash.name().to_string());
-        }
-        if s.contains("#!/usr/bin/env python") {
-            return Some(LanguageType::Python.name().to_string());
-        }
-
-        None
-    }
-}
-
-/// Shorthand: inspect with default thresholds.
-#[inline]
-#[must_use]
-pub fn inspect(bytes: &[u8]) -> ContentType {
-    ContentInspector::default().inspect(bytes)
-}
-
-#[cfg(test)]
-mod tests {
-    use std::path::PathBuf;
-
-    use super::*;
-
-    #[test]
-    fn binary_vs_text() {
-        let ins = ContentInspector::default();
-        let bin = vec![0, 1, 2, 0, 0, 0, 5];
-        assert_eq!(ins.inspect(&bin), ContentType::BINARY);
-        let txt = b"Hello\nWorld";
-        assert_eq!(ins.inspect(txt), ContentType::TEXT);
-    }
-
-    #[test]
-    fn mime_guess() {
-        let ins = ContentInspector::default();
-        assert_eq!(ins.guess_mime_type(&PathBuf::from("a.md")), Some("text/plain".into()));
-        assert_eq!(ins.guess_mime_type(&PathBuf::from("img.png")), Some("image/png".into()));
-        assert_eq!(ins.guess_mime_type(&PathBuf::from("x.xyz")), None);
-    }
-
-    #[test]
-    fn charset_guess() {
-        let ins = ContentInspector::default();
-        assert_eq!(ins.guess_charset("ok".as_bytes()), Some("UTF-8".into()));
-        assert_eq!(ins.guess_charset(&[0xFF, 0xFE, 0xFD]), None);
-    }
-
-    #[test]
-    fn language_guess() {
-        let ins = ContentInspector::default();
-
-        // Compare case-insensitively by lowercasing both sides.
-        let rust =
-            ins.guess_language(&PathBuf::from("main.rs"), b"").map(|s| s.to_ascii_lowercase());
-        assert_eq!(rust, Some("rust".into()));
-
-        let php = ins
-            .guess_language(&PathBuf::from("x"), b"<?php echo; ?>")
-            .map(|s| s.to_ascii_lowercase());
-        assert_eq!(php, Some("php".into()));
-
-        let bash = ins
-            .guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi")
-            .map(|s| s.to_ascii_lowercase());
-        assert_eq!(bash, Some("bash".into()));
-    }
-}
+// Re-export from the canonical implementation in kingfisher-core.
+pub use kingfisher_core::content_type::*;
diff --git a/src/git_host.rs b/src/git_host.rs
new file mode 100644
index 0000000..5f432fa
--- /dev/null
+++ b/src/git_host.rs
@@ -0,0 +1,121 @@
+//! Shared utilities for Git hosting platform integrations.
+//!
+//! This module provides the `ExcludeMatcher` struct and related helpers
+//! used across all platform integration modules (GitHub, GitLab, Azure,
+//! Bitbucket, Gitea).
+
+use std::collections::HashSet;
+
+use globset::{Glob, GlobSet, GlobSetBuilder};
+use tracing::warn;
+
+// -------------------------------------------------------------------------------------------------
+// ExcludeMatcher
+// -------------------------------------------------------------------------------------------------
+
+/// Matches repository names against exact strings and glob patterns; used by
+/// the platform integrations to filter out excluded repositories.
+#[derive(Debug)]
+pub struct ExcludeMatcher {
+    exact: HashSet<String>,
+    globs: Option<GlobSet>,
+}
+
+impl ExcludeMatcher {
+    /// Returns `true` if no exclusion patterns are configured.
+    pub fn is_empty(&self) -> bool {
+        self.exact.is_empty() && self.globs.is_none()
+    }
+
+    /// Returns `true` if `name` is an exact entry or matches a glob (case-sensitive).
+    pub fn matches(&self, name: &str) -> bool {
+        if self.exact.contains(name) {
+            return true;
+        }
+        if let Some(globs) = &self.globs {
+            return globs.is_match(name);
+        }
+        false
+    }
+}
+
+/// Returns `true` if the pattern contains glob metacharacters.
+pub fn looks_like_glob(pattern: &str) -> bool {
+    pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
+}
+
+/// Builds an `ExcludeMatcher` from a list of exclusion patterns.
+///
+/// `parse_fn` normalizes each raw entry into a repository name (e.g.,
+/// `"owner/repo"` or `"group/project"`); entries for which it returns `None`
+/// are skipped with a warning. Names containing `*`, `?` or `[` are compiled
+/// as globs, and one that fails to compile is kept as an exact-match name;
+/// all other names are matched exactly. `platform_name` (e.g., `"GitHub"`)
+/// only labels the warning messages.
+pub fn build_exclude_matcher(
+    exclude_repos: &[String],
+    parse_fn: impl Fn(&str) -> Option<String>,
+    platform_name: &str,
+) -> ExcludeMatcher {
+    let mut exact = HashSet::new();
+    let mut glob_builder = GlobSetBuilder::new();
+    let mut has_glob = false;
+
+    for raw in exclude_repos {
+        match parse_fn(raw) {
+            Some(name) => {
+                if looks_like_glob(&name) {
+                    match Glob::new(&name) {
+                        Ok(glob) => {
+                            glob_builder.add(glob);
+                            has_glob = true;
+                        }
+                        Err(err) => {
+                            warn!(
+                                "Ignoring invalid {platform_name} exclusion pattern '{raw}': {err}"
+                            );
+                            exact.insert(name);
+                        }
+                    }
+                } else {
+                    exact.insert(name);
+                }
+            }
+            None => {
+                warn!("Ignoring invalid {platform_name} exclusion '{raw}'");
+            }
+        }
+    }
+
+    let globs = if has_glob {
+        match glob_builder.build() {
+            Ok(set) => Some(set),
+            Err(err) => {
+                warn!("Failed to build {platform_name} exclusion patterns: {err}");
+                None
+            }
+        }
+    } else {
+        None
+    };
+
+    ExcludeMatcher { exact, globs }
+}
+
+/// Checks whether a repository URL should be excluded.
+///
+/// The `extract_name` function extracts a normalized repository identifier
+/// from the URL (e.g., `"owner/repo"` from `"https://github.com/owner/repo.git"`).
+pub fn should_exclude_repo(
+    repo_url: &str,
+    excludes: &ExcludeMatcher,
+    extract_name: impl Fn(&str) -> Option<String>,
+) -> bool {
+    if excludes.is_empty() {
+        return false;
+    }
+    if let Some(name) = extract_name(repo_url) {
+        return excludes.matches(&name);
+    }
+    false
+}
diff --git a/src/gitea.rs b/src/gitea.rs
index 90a00ff..d748685 100644
--- a/src/gitea.rs
+++ b/src/gitea.rs
@@ -1,14 +1,13 @@
 use std::{collections::HashSet, env, str::FromStr, time::Duration};
 
 use anyhow::{anyhow, Result};
-use globset::{Glob, GlobSet, GlobSetBuilder};
 use indicatif::{ProgressBar, ProgressStyle};
 use reqwest::StatusCode;
 use serde::Deserialize;
 use tracing::warn;
 use url::Url;
 
-use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT};
+use crate::{git_host, git_url::GitUrl, validation::GLOBAL_USER_AGENT};
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum RepoType {
@@ -55,31 +54,6 @@ struct GiteaOrganization {
     username: String,
 }
 
-struct ExcludeMatcher {
-    exact: HashSet<String>,
-    globs: Option<GlobSet>,
-}
-
-impl ExcludeMatcher {
-    fn matches(&self, name: &str) -> bool {
-        if self.exact.contains(name) {
-            return true;
-        }
-        if let Some(globs) = &self.globs {
-            return globs.is_match(name);
-        }
-        false
-    }
-
-    fn is_empty(&self) -> bool {
-        self.exact.is_empty() && self.globs.is_none()
-    }
-}
-
-fn looks_like_glob(pattern: &str) -> bool {
-    pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
-}
-
 fn normalize_repo_identifier(raw: &str) -> Option<String> {
     let trimmed = raw.trim().trim_matches('/');
     if trimmed.is_empty() {
@@ -113,51 +87,11 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
     normalize_repo_identifier(trimmed)
 }
 
-fn build_exclude_matcher(excludes: &[String]) -> ExcludeMatcher {
-    let mut exact = HashSet::new();
-    let mut glob_builder = GlobSetBuilder::new();
-    let mut has_glob = false;
-
-    for raw in excludes {
-        match parse_excluded_repo(raw) {
-            Some(name) => {
-                if looks_like_glob(&name) {
-                    match Glob::new(&name) {
-                        Ok(glob) => {
-                            glob_builder.add(glob);
-                            has_glob = true;
-                        }
-                        Err(err) => {
-                            warn!("Ignoring invalid Gitea exclusion pattern '{raw}': {err}");
-                            exact.insert(name);
-                        }
-                    }
-                } else {
-                    exact.insert(name);
-                }
-            }
-            None => {
-                warn!("Ignoring invalid Gitea exclusion '{raw}' (expected owner/repo)");
-            }
-        }
-    }
-
-    let globs = if has_glob {
-        match glob_builder.build() {
-            Ok(set) => Some(set),
-            Err(err) => {
-                warn!("Failed to build Gitea exclusion patterns: {err}");
-                None
-            }
-        }
-    } else {
-        None
-    };
-
-    ExcludeMatcher { exact, globs }
+fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
+    git_host::build_exclude_matcher(exclude_repos, parse_excluded_repo, "Gitea")
 }
 
-fn should_exclude_repo(repo: &GiteaRepository, excludes: &ExcludeMatcher) -> bool {
+fn should_exclude_repo(repo: &GiteaRepository, excludes: &git_host::ExcludeMatcher) -> bool {
     if excludes.is_empty() {
         return false;
     }
@@ -169,7 +103,7 @@ async fn fetch_paginated_repos(
     token: Option<&str>,
     mut url: Url,
     repo_filter: RepoType,
-    excludes: &ExcludeMatcher,
+    excludes: &git_host::ExcludeMatcher,
     progress: Option<&ProgressBar>,
 ) -> Result<Vec<String>> {
     let mut page = 1u32;
@@ -221,7 +155,7 @@ async fn fetch_user_repos(
     api_url: &Url,
     username: &str,
     repo_filter: RepoType,
-    excludes: &ExcludeMatcher,
+    excludes: &git_host::ExcludeMatcher,
     progress: Option<&ProgressBar>,
 ) -> Result<Vec<String>> {
     let endpoint = format!("users/{}/repos", username);
@@ -235,7 +169,7 @@ async fn fetch_org_repos(
     api_url: &Url,
     org: &str,
     repo_filter: RepoType,
-    excludes: &ExcludeMatcher,
+    excludes: &git_host::ExcludeMatcher,
     progress: Option<&ProgressBar>,
 ) -> Result<Vec<String>> {
     let endpoint = format!("orgs/{}/repos", org);
diff --git a/src/github.rs b/src/github.rs
index 092095c..ca2847d 100644
--- a/src/github.rs
+++ b/src/github.rs
@@ -7,7 +7,6 @@ use std::{
 };
 
 use anyhow::{Context, Result};
-use globset::{Glob, GlobSet, GlobSetBuilder};
 use indicatif::{ProgressBar, ProgressStyle};
 use octorust::{
     auth::Credentials,
@@ -119,83 +118,14 @@ fn parse_excluded_repo(raw: &str) -> Option<String> {
     parse_repo_name_from_path(trimmed)
 }
 
-struct ExcludeMatcher {
-    exact: HashSet<String>,
-    globs: Option<GlobSet>,
+use crate::git_host;
+
+fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
+    git_host::build_exclude_matcher(exclude_repos, parse_excluded_repo, "GitHub")
 }
 
-impl ExcludeMatcher {
-    fn is_empty(&self) -> bool {
-        self.exact.is_empty() && self.globs.is_none()
-    }
-
-    fn matches(&self, name: &str) -> bool {
-        if self.exact.contains(name) {
-            return true;
-        }
-        if let Some(globs) = &self.globs {
-            return globs.is_match(name);
-        }
-        false
-    }
-}
-
-fn looks_like_glob(pattern: &str) -> bool {
-    pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
-}
-
-fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
-    let mut exact = HashSet::new();
-    let mut glob_builder = GlobSetBuilder::new();
-    let mut has_glob = false;
-
-    for raw in exclude_repos {
-        match parse_excluded_repo(raw) {
-            Some(name) => {
-                if looks_like_glob(&name) {
-                    match Glob::new(&name) {
-                        Ok(glob) => {
-                            glob_builder.add(glob);
-                            has_glob = true;
-                        }
-                        Err(err) => {
-                            warn!("Ignoring invalid GitHub exclusion pattern '{raw}': {err}");
-                            exact.insert(name);
-                        }
-                    }
-                } else {
-                    exact.insert(name);
-                }
-            }
-            None => {
-                warn!("Ignoring invalid GitHub exclusion '{raw}' (expected owner/repo)");
-            }
-        }
-    }
-
-    let globs = if has_glob {
-        match glob_builder.build() {
-            Ok(set) => Some(set),
-            Err(err) => {
-                warn!("Failed to build GitHub exclusion patterns: {err}");
-                None
-            }
-        }
-    } else {
-        None
-    };
-
-    ExcludeMatcher { exact, globs }
-}
-
-fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
-    if excludes.is_empty() {
-        return false;
-    }
-    if let Some(name) = parse_repo_name_from_url(clone_url) {
-        return excludes.matches(&name);
-    }
-    false
+fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
+    git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url)
 }
 fn create_github_client(github_url: &url::Url, ignore_certs: bool) -> Result<Arc<Client>> {
     // Try personal access token
diff --git a/src/gitlab.rs b/src/gitlab.rs
index df394fa..4bfdefe 100644
--- a/src/gitlab.rs
+++ b/src/gitlab.rs
@@ -16,7 +16,6 @@ use gitlab::{
     },
     Gitlab, GitlabBuilder,
 };
-use globset::{Glob, GlobSet, GlobSetBuilder};
 use indicatif::{ProgressBar, ProgressStyle};
 use reqwest::StatusCode;
 use serde::Deserialize;
@@ -25,7 +24,7 @@ use tokio::task;
 use tracing::{info, warn};
 use url::{form_urlencoded, Url};
 
-use crate::{findings_store, git_url::GitUrl};
+use crate::{findings_store, git_host, git_url::GitUrl};
 use std::str::FromStr;
 
 #[derive(Deserialize)]
@@ -128,83 +127,12 @@ fn parse_excluded_project(raw: &str) -> Option<String> {
     parse_project_path(trimmed)
 }
 
-struct ExcludeMatcher {
-    exact: HashSet<String>,
-    globs: Option<GlobSet>,
+fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
+    git_host::build_exclude_matcher(exclude_repos, |raw| parse_excluded_project(raw), "GitLab")
 }
 
-impl ExcludeMatcher {
-    fn is_empty(&self) -> bool {
-        self.exact.is_empty() && self.globs.is_none()
-    }
-
-    fn matches(&self, name: &str) -> bool {
-        if self.exact.contains(name) {
-            return true;
-        }
-        if let Some(globs) = &self.globs {
-            return globs.is_match(name);
-        }
-        false
-    }
-}
-
-fn looks_like_glob(pattern: &str) -> bool {
-    pattern.contains('*') || pattern.contains('?') || pattern.contains('[')
-}
-
-fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher {
-    let mut exact = HashSet::new();
-    let mut glob_builder = GlobSetBuilder::new();
-    let mut has_glob = false;
-
-    for raw in exclude_repos {
-        match parse_excluded_project(raw) {
-            Some(name) => {
-                if looks_like_glob(&name) {
-                    match Glob::new(&name) {
-                        Ok(glob) => {
-                            glob_builder.add(glob);
-                            has_glob = true;
-                        }
-                        Err(err) => {
-                            warn!("Ignoring invalid GitLab exclusion pattern '{raw}': {err}");
-                            exact.insert(name);
-                        }
-                    }
-                } else {
-                    exact.insert(name);
-                }
-            }
-            None => {
-                warn!("Ignoring invalid GitLab exclusion '{raw}' (expected group/project)");
-            }
-        }
-    }
-
-    let globs = if has_glob {
-        match glob_builder.build() {
-            Ok(set) => Some(set),
-            Err(err) => {
-                warn!("Failed to build GitLab exclusion patterns: {err}");
-                None
-            }
-        }
-    } else {
-        None
-    };
-
-    ExcludeMatcher { exact, globs }
-}
-
-fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool {
-    if excludes.is_empty() {
-        return false;
-    }
-    if let Some(name) = parse_project_path_from_url(clone_url) {
-        return excludes.matches(&name);
-    }
-    false
+fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
+    git_host::should_exclude_repo(clone_url, excludes, parse_project_path_from_url)
 }
 
 fn create_gitlab_client(gitlab_url: &Url, ignore_certs: bool) -> Result<Gitlab> {
diff --git a/src/lib.rs b/src/lib.rs
index f1ca2a5..781d920 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -24,6 +24,7 @@ pub mod findings_store;
 pub mod gcs;
 pub mod git_binary;
 pub mod git_commit_metadata;
+pub mod git_host;
 pub mod git_metadata_graph;
 mod git_repo_enumerator;
 pub mod git_url;
diff --git a/src/matcher/base64_decode.rs b/src/matcher/base64_decode.rs
new file mode 100644
index 0000000..0ee1f26
--- /dev/null
+++ b/src/matcher/base64_decode.rs
@@ -0,0 +1,2 @@
+// Re-export from the canonical implementation in kingfisher-scanner.
+pub use kingfisher_scanner::primitives::{get_base64_strings, DecodedData};
diff --git a/src/matcher/captures.rs b/src/matcher/captures.rs
new file mode 100644
index 0000000..075f66f
--- /dev/null
+++ b/src/matcher/captures.rs
@@ -0,0 +1,140 @@
+use bstr::BString;
+use regex::bytes::Regex;
+use schemars::{
+    gen::SchemaGenerator,
+    schema::{ArrayValidation, InstanceType, Schema},
+    JsonSchema,
+};
+use serde::{Deserialize, Serialize};
+use smallvec::SmallVec;
+
+use crate::{snippet::Base64BString, util::intern};
+
+// -------------------------------------------------------------------------------------------------
+// Group
+// -------------------------------------------------------------------------------------------------
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
+pub struct Group(pub Base64BString);
+impl Group {
+    pub fn new(m: regex::bytes::Match<'_>) -> Self {
+        Self(Base64BString(BString::from(m.as_bytes())))
+    }
+}
+// -------------------------------------------------------------------------------------------------
+// Groups
+// -------------------------------------------------------------------------------------------------
+#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub struct Groups(pub SmallVec<[Group; 1]>);
+impl JsonSchema for Groups {
+    fn schema_name() -> String {
+        "Groups".to_string()
+    }
+
+    fn json_schema(gen: &mut SchemaGenerator) -> Schema {
+        let group_schema = gen.subschema_for::<Group>();
+        Schema::Object(schemars::schema::SchemaObject {
+            instance_type: Some(InstanceType::Array.into()),
+            array: Some(Box::new(ArrayValidation {
+                items: Some(group_schema.into()),
+                ..Default::default()
+            })),
+            ..Default::default()
+        })
+    }
+}
+
+#[derive(Debug, Clone, JsonSchema)]
+pub struct SerializableCapture {
+    pub name: Option<&'static str>,
+    pub match_number: i32,
+    pub start: usize,
+    pub end: usize,
+    /// Interned original (unredacted) value.
+    #[serde(skip_serializing, skip_deserializing)]
+    pub value: &'static str,
+}
+
+impl SerializableCapture {
+    /// Returns the original captured value.
+    pub fn raw_value(&self) -> &'static str {
+        self.value
+    }
+
+    /// Returns the value that should be shown in user-facing output.
+    pub fn display_value(&self) -> std::borrow::Cow<'static, str> {
+        crate::util::display_value(self.value)
+    }
+}
+
+impl serde::Serialize for SerializableCapture {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+
+        let mut state = serializer.serialize_struct("SerializableCapture", 5)?;
+        state.serialize_field("name", &self.name)?;
+        state.serialize_field("match_number", &self.match_number)?;
+        state.serialize_field("start", &self.start)?;
+        state.serialize_field("end", &self.end)?;
+        let value = self.display_value();
+        state.serialize_field("value", &value)?;
+        state.end()
+    }
+}
+
+#[derive(Debug, Clone, Serialize, JsonSchema)]
+pub struct SerializableCaptures {
+    #[schemars(with = "Vec<SerializableCapture>")]
+    pub captures: SmallVec<[SerializableCapture; 2]>,
+}
+
+impl SerializableCaptures {
+    pub fn from_captures(captures: &regex::bytes::Captures, _input: &[u8], re: &Regex) -> Self {
+        let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
+
+        let capture_names: SmallVec<[Option<&'static str>; 4]> =
+            re.capture_names().map(|name| name.map(intern)).collect();
+
+        // If there are explicit capture groups (e.g., group 1, 2, ...),
+        // only serialize those.
+        if captures.len() > 1 {
+            for i in 1..captures.len() {
+                // Start from 1
+                if let Some(cap) = captures.get(i) {
+                    let raw_value = String::from_utf8_lossy(cap.as_bytes());
+                    let raw_interned = intern(raw_value.as_ref());
+                    let name = capture_names.get(i).and_then(|opt| *opt);
+
+                    serialized_captures.push(SerializableCapture {
+                        name,
+                        match_number: i32::try_from(i).unwrap_or(0),
+                        start: cap.start(),
+                        end: cap.end(),
+                        value: raw_interned,
+                    });
+                }
+            }
+        } else if captures.len() == 1 {
+            // ELSE, if there is ONLY the full match (len == 1),
+            // serialize just that full match (group 0) as the fallback.
+            if let Some(cap) = captures.get(0) {
+                let raw_value = String::from_utf8_lossy(cap.as_bytes());
+                let raw_interned = intern(raw_value.as_ref());
+                let name = capture_names.get(0).and_then(|opt| *opt);
+
+                serialized_captures.push(SerializableCapture {
+                    name,
+                    match_number: 0,
+                    start: cap.start(),
+                    end: cap.end(),
+                    value: raw_interned,
+                });
+            }
+        }
+        // `len()` is always >= 1 (group 0); the vec stays empty only if no group participated.
+
+        SerializableCaptures { captures: serialized_captures }
+    }
+}
diff --git a/src/matcher/conversion.rs b/src/matcher/conversion.rs
new file mode 100644
index 0000000..551119c
--- /dev/null
+++ b/src/matcher/conversion.rs
@@ -0,0 +1,251 @@
+use std::sync::Arc;
+
+use http::StatusCode;
+use schemars::JsonSchema;
+use serde::Serialize;
+use xxhash_rust::xxh3::xxh3_64;
+
+use crate::{
+    blob::BlobId,
+    location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
+    rules::rule::Rule,
+    validation_body::{self, ValidationResponseBody},
+};
+
+use super::{captures::SerializableCaptures, BlobMatch};
+
+use kingfisher_scanner::primitives::compute_finding_fingerprint;
+
+// -------------------------------------------------------------------------------------------------
+// OwnedBlobMatch
+// -------------------------------------------------------------------------------------------------
+
+#[derive(Clone)]
+pub struct OwnedBlobMatch {
+    pub rule: Arc<Rule>,
+    pub blob_id: BlobId,
+    /// The unique content-based identifier of this match
+    pub finding_fingerprint: u64,
+    pub matching_input_offset_span: OffsetSpan,
+    pub captures: SerializableCaptures,
+    pub validation_response_body: ValidationResponseBody,
+    pub validation_response_status: StatusCode,
+    pub validation_success: bool,
+    pub calculated_entropy: f32,
+    pub is_base64: bool,
+    /// Variables captured from dependent rules (from depends_on_rule).
+    /// Maps variable name (uppercase) to captured value.
+    pub dependent_captures: std::collections::BTreeMap<String, String>,
+}
+
+impl OwnedBlobMatch {
+    pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
+        OwnedBlobMatch {
+            rule,
+            blob_id: m.blob_id,
+            finding_fingerprint: m.finding_fingerprint,
+            // matching_input: m.snippet.matching.0.to_vec(),
+            matching_input_offset_span: m.location.offset_span.clone(),
+            captures: m.groups.clone(),
+            validation_response_body: m.validation_response_body.clone(),
+            validation_response_status: StatusCode::from_u16(m.validation_response_status)
+                .unwrap_or(StatusCode::CONTINUE),
+            validation_success: m.validation_success,
+            calculated_entropy: m.calculated_entropy,
+            is_base64: m.is_base64,
+            dependent_captures: m.dependent_captures.clone(),
+        }
+    }
+
+    pub fn from_blob_match(blob_match: BlobMatch) -> Self {
+        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
+        //
+        // This indexing is intentionally different from the internal `validation_dedup_key()`
+        // (which uses get(0)) to maintain stable external fingerprints. Changing this would break:
+        // - Historical baselines that rely on fingerprint matching
+        // - Dedup entries stored in external systems
+        //
+        // For rules with nested captures like (?<REGEX>...(ABC)...), this may pick up
+        // the inner group, but that behavior is now established and must be preserved.
+        let matching_finding = blob_match
+            .captures
+            .captures
+            .get(1)
+            .or_else(|| blob_match.captures.captures.get(0))
+            .map(|capture| capture.raw_value().as_bytes().to_vec())
+            .unwrap_or_else(Vec::new);
+
+        let mut owned_blob_match = OwnedBlobMatch {
+            rule: blob_match.rule,
+            blob_id: blob_match.blob_id.clone(),
+            matching_input_offset_span: blob_match.matching_input_offset_span,
+            captures: blob_match.captures.clone(),
+            validation_response_body: blob_match.validation_response_body,
+            validation_response_status: blob_match.validation_response_status,
+            validation_success: blob_match.validation_success,
+            calculated_entropy: blob_match.calculated_entropy,
+            finding_fingerprint: 0, //default
+            is_base64: blob_match.is_base64,
+            dependent_captures: std::collections::BTreeMap::new(),
+        };
+
+        // View matching_finding as a &str, falling back to "" if it is not valid UTF-8
+        let finding_value = std::str::from_utf8(&matching_finding).unwrap_or("");
+        // Use blob_id as the file/commit identifier
+        let file_or_commit = &blob_match.blob_id.to_string();
+
+        let offset_start: u64 =
+            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
+        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
+
+        owned_blob_match.finding_fingerprint =
+            compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end);
+
+        owned_blob_match
+    }
+}
+
+// -------------------------------------------------------------------------------------------------
+// Match
+// -------------------------------------------------------------------------------------------------
+
+#[derive(Debug, Clone, Serialize, JsonSchema)]
+pub struct Match {
+    /// The location of the entire matching content
+    pub location: Location,
+
+    /// The capture groups
+    pub groups: SerializableCaptures, // Store serialized captures
+
+    /// unique identifier of file / blob where this match was found
+    pub blob_id: BlobId,
+
+    /// The unique content-based identifier of this match
+    pub finding_fingerprint: u64,
+
+    /// The rule that produced this match
+    #[serde(skip_serializing)]
+    #[schemars(skip)]
+    pub rule: Arc<Rule>,
+
+    /// Validation Body
+    #[serde(
+        default,
+        serialize_with = "validation_body::serialize",
+        deserialize_with = "validation_body::deserialize"
+    )]
+    #[schemars(schema_with = "validation_body::schema")]
+    pub validation_response_body: ValidationResponseBody,
+
+    /// Validation Status Code
+    pub validation_response_status: u16,
+
+    /// Validation Success
+    pub validation_success: bool,
+
+    /// Shannon entropy calculated over the matched secret bytes
+    pub calculated_entropy: f32,
+
+    pub visible: bool,
+    #[serde(default)]
+    pub is_base64: bool,
+
+    /// Variables captured from dependent rules (from depends_on_rule).
+    /// Maps variable name (uppercase) to captured value.
+    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
+    pub dependent_captures: std::collections::BTreeMap<String, String>,
+}
+
+impl Match {
+    #[inline]
+    pub fn convert_owned_blobmatch_to_match<'a>(
+        loc_mapping: Option<&'a LocationMapping<'a>>,
+        owned_blob_match: &'a OwnedBlobMatch,
+        origin_type: &'a str,
+    ) -> Self {
+        let offset_span = owned_blob_match.matching_input_offset_span;
+        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
+        // See comment in from_blob_match() for why this differs from validation_dedup_key().
+        let matching_finding_bytes = owned_blob_match
+            .captures
+            .captures
+            .get(1)
+            .or_else(|| owned_blob_match.captures.captures.get(0))
+            .map(|capture| capture.raw_value().as_bytes())
+            .unwrap_or_default();
+
+        // The fingerprint will be based on the content of the secret.
+        let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
+
+        let source_span =
+            loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
+                start: SourcePoint { line: 0, column: 0 },
+                end: SourcePoint { line: 0, column: 0 },
+            });
+        let offset_start: u64 =
+            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
+        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
+
+        let finding_fingerprint = compute_finding_fingerprint(
+            finding_value_for_fp,
+            origin_type, // file_or_commit,
+            offset_start,
+            offset_end,
+        );
+
+        // Assemble the Match from the owned blob match
+        Match {
+            rule: owned_blob_match.rule.clone(),
+            visible: owned_blob_match.rule.visible().to_owned(),
+            location: Location::with_source_span(offset_span, Some(source_span.clone())),
+            groups: owned_blob_match.captures.clone(),
+            blob_id: owned_blob_match.blob_id,
+            finding_fingerprint,
+            validation_response_body: owned_blob_match.validation_response_body.clone(),
+            validation_response_status: owned_blob_match.validation_response_status.as_u16(),
+            validation_success: owned_blob_match.validation_success,
+            calculated_entropy: owned_blob_match.calculated_entropy,
+            is_base64: owned_blob_match.is_base64,
+            dependent_captures: owned_blob_match.dependent_captures.clone(),
+        }
+    }
+
+    /// Returns the `blob_id` of the match.
+    pub fn get_blob_id(&self) -> BlobId {
+        self.blob_id.clone()
+    }
+
+    pub fn finding_id(&self) -> String {
+        let mut buffer = Vec::with_capacity(128);
+        buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes());
+        buffer.push(0);
+        serde_json::to_writer(&mut buffer, &self.groups)
+            .expect("should be able to serialize groups as JSON");
+        let mut num = xxh3_64(&buffer);
+        // Ensure the number is positive and within i64 range
+        num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive
+                                      // Convert to string
+        num.to_string()
+    }
+}
+
+// -------------------------------------------------------------------------------------------------
+// MatcherStats
+// -------------------------------------------------------------------------------------------------
+
+#[derive(Debug, Default, Clone)]
+pub struct MatcherStats {
+    pub blobs_seen: u64,
+    pub blobs_scanned: u64,
+    pub bytes_seen: u64,
+    pub bytes_scanned: u64,
+}
+
+impl MatcherStats {
+    pub fn update(&mut self, other: &Self) {
+        self.blobs_seen += other.blobs_seen;
+        self.blobs_scanned += other.blobs_scanned;
+        self.bytes_seen += other.bytes_seen;
+        self.bytes_scanned += other.bytes_scanned;
+    }
+}
diff --git a/src/matcher/dedup.rs b/src/matcher/dedup.rs
new file mode 100644
index 0000000..5755a45
--- /dev/null
+++ b/src/matcher/dedup.rs
@@ -0,0 +1,2 @@
+// Re-export from the canonical implementation in kingfisher-scanner.
+pub(crate) use kingfisher_scanner::primitives::{compute_match_key, record_match};
diff --git a/src/matcher/filter.rs b/src/matcher/filter.rs
new file mode 100644
index 0000000..96ec74b
--- /dev/null
+++ b/src/matcher/filter.rs
@@ -0,0 +1,274 @@
+use std::sync::Arc;
+
+use http::StatusCode;
+use regex::bytes::Regex;
+use rustc_hash::{FxHashMap, FxHashSet};
+use tracing::debug;
+
+use crate::{
+    blob::Blob,
+    entropy::calculate_shannon_entropy,
+    inline_ignore::InlineIgnoreConfig,
+    location::OffsetSpan,
+    origin::OriginSet,
+    rule_profiling::{ConcurrentRuleProfiler, RuleTimer},
+    rules::rule::{PatternRequirementContext, PatternValidationResult, Rule, Validation},
+    safe_list::{is_safe_match, is_user_match},
+    validation::{is_parseable_mongodb_uri, is_parseable_mysql_uri, is_parseable_postgres_uri},
+};
+
+use super::{
+    captures::SerializableCaptures,
+    dedup::{compute_match_key, record_match},
+    BlobMatch,
+};
+
+// Re-use the canonical secret capture selection from kingfisher-scanner.
+use kingfisher_scanner::primitives::find_secret_capture;
+
+// -------------------------------------------------------------------------------------------------
+// Entropy and safe-list check
+// -------------------------------------------------------------------------------------------------
+
+/// Returns `Some(entropy)` if the match passes entropy and safe-list checks,
+/// `None` if it should be skipped.
+fn check_entropy_and_safelist(
+    entropy_bytes: &[u8],
+    full_bytes: &[u8],
+    min_entropy: f32,
+) -> Option<f32> {
+    let calculated_entropy = calculate_shannon_entropy(entropy_bytes);
+    if calculated_entropy <= min_entropy
+        || is_safe_match(entropy_bytes)
+        || is_user_match(entropy_bytes, full_bytes)
+    {
+        debug!(
+            "Skipping match with entropy {} <= {} or safe match",
+            calculated_entropy, min_entropy
+        );
+        None
+    } else {
+        Some(calculated_entropy)
+    }
+}
+
+// -------------------------------------------------------------------------------------------------
+// Pattern requirements check
+// -------------------------------------------------------------------------------------------------
+
+/// Returns `true` if the match passes pattern requirements, `false` if it should be skipped.
+fn check_pattern_requirements(
+    rule: &Rule,
+    re: &Regex,
+    captures: &regex::bytes::Captures,
+    full_bytes: &[u8],
+    entropy_bytes: &[u8],
+    respect_ignore_if_contains: bool,
+) -> bool {
+    let Some(char_reqs) = rule.pattern_requirements() else {
+        return true;
+    };
+
+    let context = PatternRequirementContext { regex: re, captures, full_match: full_bytes };
+
+    // Decide which bytes to validate:
+    // - If there are multiple capture groups OR any named captures -> use full match
+    // - Otherwise -> use entropy_bytes (the actual secret)
+    let use_full_match = {
+        let has_named_captures = re.capture_names().any(|n| n.is_some());
+        let capture_count = captures.len(); // includes group 0
+        has_named_captures || capture_count > 2
+    };
+
+    let validation_bytes = if use_full_match { full_bytes } else { entropy_bytes };
+
+    match char_reqs.validate(validation_bytes, Some(context), respect_ignore_if_contains) {
+        PatternValidationResult::Passed => true,
+        PatternValidationResult::Failed => {
+            debug!(
+                "Skipping match that does not meet character requirements for rule {}",
+                rule.id()
+            );
+            false
+        }
+        PatternValidationResult::FailedChecksum { actual_len, expected_len } => {
+            debug!(
+                "Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})",
+                rule.id(),
+                actual_len,
+                expected_len
+            );
+            false
+        }
+        PatternValidationResult::IgnoredBySubstring { matched_term } => {
+            debug!(
+                "Skipping match for rule {} because it contains ignored term {matched_term}",
+                rule.id()
+            );
+            false
+        }
+    }
+}
+
+// -------------------------------------------------------------------------------------------------
+// URI validation
+// -------------------------------------------------------------------------------------------------
+
+/// Returns `true` if the match passes URI validation (for database rules), `false` if it should
+/// be skipped.
+fn check_uri_validation(rule: &Rule, matching_input_bytes: &[u8]) -> bool {
+    let Some(validation) = rule.syntax.validation.as_ref() else {
+        return true;
+    };
+
+    match validation {
+        Validation::MongoDB => {
+            let Ok(uri) = std::str::from_utf8(matching_input_bytes) else {
+                debug!("Skipping match for rule {} due to non-UTF8 MongoDB URI", rule.id());
+                return false;
+            };
+            if !is_parseable_mongodb_uri(uri) {
+                debug!("Skipping match for rule {} due to invalid MongoDB URI", rule.id());
+                return false;
+            }
+        }
+        Validation::Postgres => {
+            let Ok(uri) = std::str::from_utf8(matching_input_bytes) else {
+                debug!("Skipping match for rule {} due to non-UTF8 Postgres URI", rule.id());
+                return false;
+            };
+            if !is_parseable_postgres_uri(uri) {
+                debug!("Skipping match for rule {} due to invalid Postgres URI", rule.id());
+                return false;
+            }
+        }
+        Validation::MySQL => {
+            let Ok(uri) = std::str::from_utf8(matching_input_bytes) else {
+                debug!("Skipping match for rule {} due to non-UTF8 MySQL URI", rule.id());
+                return false;
+            };
+            if !is_parseable_mysql_uri(uri) {
+                debug!("Skipping match for rule {} due to invalid MySQL URI", rule.id());
+                return false;
+            }
+        }
+        _ => {}
+    }
+    true
+}
+
+// -------------------------------------------------------------------------------------------------
+// filter_match — main entry point
+// -------------------------------------------------------------------------------------------------
+
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn filter_match<'b>(
+    blob: &'b Blob,
+    rule: Arc<Rule>,
+    re: &Regex,
+    start: usize,
+    end: usize,
+    matches: &mut Vec<BlobMatch<'b>>,
+    previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
+    rule_id: usize,
+    seen_matches: &mut FxHashSet<u64>,
+    _origin: &OriginSet,
+    ts_match: Option<&[u8]>,
+    is_base64: bool,
+    _redact: bool,
+    filename: &str,
+    profiler: Option<&Arc<ConcurrentRuleProfiler>>,
+    respect_ignore_if_contains: bool,
+    inline_ignore_config: &InlineIgnoreConfig,
+) {
+    let mut timer =
+        profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
+
+    let initial_len = matches.len();
+
+    let blob_bytes = blob.bytes();
+    let default_slice = &blob_bytes[start..end];
+    let haystack = ts_match.unwrap_or(default_slice);
+
+    for captures in re.captures_iter(haystack) {
+        let full_capture = captures.get(0).unwrap();
+        let matching_input_for_entropy = find_secret_capture(re, &captures);
+
+        let min_entropy = rule.min_entropy();
+        let entropy_bytes = matching_input_for_entropy.as_bytes();
+        let full_bytes = full_capture.as_bytes();
+
+        // Check entropy and safe-listing
+        let calculated_entropy =
+            match check_entropy_and_safelist(entropy_bytes, full_bytes, min_entropy) {
+                Some(e) => e,
+                None => continue,
+            };
+
+        // Check pattern requirements
+        if !check_pattern_requirements(
+            &rule,
+            re,
+            &captures,
+            full_bytes,
+            entropy_bytes,
+            respect_ignore_if_contains,
+        ) {
+            continue;
+        }
+
+        // Use the `matching_input_for_entropy` as the span/key for the finding.
+        let matching_input = matching_input_for_entropy;
+
+        let matching_input_offset_span = OffsetSpan::from_range(
+            (start + matching_input.start())..(start + matching_input.end()),
+        );
+
+        // Check inline ignore directives
+        if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
+            debug!("Skipping match due to inline ignore directive");
+            continue;
+        }
+
+        // Check URI validation (MongoDB, Postgres, MySQL)
+        if !check_uri_validation(&rule, matching_input.as_bytes()) {
+            continue;
+        }
+
+        // Deduplication
+        let match_key = compute_match_key(
+            matching_input.as_bytes(),
+            rule.id().as_bytes(),
+            matching_input_offset_span.start,
+            matching_input_offset_span.end,
+        );
+        if !seen_matches.insert(match_key) {
+            continue;
+        }
+        if !record_match(previous_matches, rule_id, matching_input_offset_span) {
+            continue;
+        }
+        let only_matching_input =
+            &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
+
+        // Pass the *full* capture object to from_captures
+        let groups = SerializableCaptures::from_captures(&captures, haystack, re);
+
+        matches.push(BlobMatch {
+            rule: Arc::clone(&rule),
+            blob_id: blob.id_ref(),
+            matching_input: only_matching_input,
+            matching_input_offset_span,
+            captures: groups,
+            validation_response_body: None,
+            validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
+            validation_success: false,
+            calculated_entropy,
+            is_base64,
+        });
+    }
+    if let Some(t) = timer.take() {
+        let new_count = (matches.len() - initial_len) as u64;
+        t.end(new_count > 0, new_count, 0);
+    }
+}
diff --git a/src/matcher/fingerprint.rs b/src/matcher/fingerprint.rs
new file mode 100644
index 0000000..0a764da
--- /dev/null
+++ b/src/matcher/fingerprint.rs
@@ -0,0 +1,2 @@
+// Re-export from the canonical implementation in kingfisher-scanner.
+pub use kingfisher_scanner::primitives::compute_finding_fingerprint;
diff --git a/src/matcher.rs b/src/matcher/mod.rs
similarity index 58%
rename from src/matcher.rs
rename to src/matcher/mod.rs
index ac89bae..6324ba7 100644
--- a/src/matcher.rs
+++ b/src/matcher/mod.rs
@@ -1,42 +1,38 @@
-use std::{
-    hash::{Hash, Hasher},
-    str,
-    sync::{Arc, Mutex},
-};
+mod base64_decode;
+mod captures;
+mod conversion;
+mod dedup;
+mod filter;
+mod fingerprint;
+
+// Re-export public API
+pub use base64_decode::{get_base64_strings, DecodedData};
+pub use captures::{Group, Groups, SerializableCapture, SerializableCaptures};
+pub use conversion::{Match, MatcherStats, OwnedBlobMatch};
+pub use fingerprint::compute_finding_fingerprint;
+
+use std::sync::{Arc, Mutex};
 
 use anyhow::Result;
-use base64::{engine::general_purpose, Engine};
-use bstr::BString;
 use http::StatusCode;
-use regex::bytes::Regex;
-use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
-use schemars::{
-    gen::SchemaGenerator,
-    schema::{ArrayValidation, InstanceType, Schema},
-    JsonSchema,
-};
-use serde::{Deserialize, Serialize};
-use smallvec::SmallVec;
-use tracing::debug;
-use xxhash_rust::xxh3::xxh3_64;
+use rustc_hash::{FxHashMap, FxHashSet};
 
 use crate::{
     blob::{Blob, BlobId, BlobIdMap},
-    entropy::calculate_shannon_entropy,
     inline_ignore::InlineIgnoreConfig,
-    location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
+    location::OffsetSpan,
     origin::OriginSet,
     parser,
     parser::{Checker, Language},
-    rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
-    rules::rule::{PatternRequirementContext, PatternValidationResult, Rule, Validation},
+    rule_profiling::{ConcurrentRuleProfiler, RuleStats},
+    rules::rule::Rule,
     rules_database::RulesDatabase,
-    safe_list::{is_safe_match, is_user_match},
     scanner_pool::ScannerPool,
-    snippet::Base64BString,
-    util::intern,
-    validation::{is_parseable_mongodb_uri, is_parseable_mysql_uri, is_parseable_postgres_uri},
-    validation_body::{self, ValidationResponseBody},
+    validation_body::ValidationResponseBody,
+};
+
+use self::{
+    base64_decode::get_base64_strings as get_b64_strings, dedup::record_match, filter::filter_match,
 };
 
 const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
@@ -58,94 +54,7 @@ struct RawMatch {
     start_idx: u64,
     end_idx: u64,
 }
-#[derive(Clone)]
-pub struct OwnedBlobMatch {
-    pub rule: Arc<Rule>,
-    pub blob_id: BlobId,
-    /// The unique content-based identifier of this match
-    pub finding_fingerprint: u64,
-    pub matching_input_offset_span: OffsetSpan,
-    pub captures: SerializableCaptures,
-    pub validation_response_body: ValidationResponseBody,
-    pub validation_response_status: StatusCode,
-    pub validation_success: bool,
-    pub calculated_entropy: f32,
-    pub is_base64: bool,
-    /// Variables captured from dependent rules (from depends_on_rule).
-    /// Maps variable name (uppercase) to captured value.
-    pub dependent_captures: std::collections::BTreeMap<String, String>,
-}
-impl<'a> Matcher<'a> {
-    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
-        self.profiler.as_ref().map(|p| p.generate_report())
-    }
-}
-impl OwnedBlobMatch {
-    pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
-        OwnedBlobMatch {
-            rule,
-            blob_id: m.blob_id,
-            finding_fingerprint: m.finding_fingerprint,
-            // matching_input: m.snippet.matching.0.to_vec(),
-            matching_input_offset_span: m.location.offset_span.clone(),
-            captures: m.groups.clone(),
-            validation_response_body: m.validation_response_body.clone(),
-            validation_response_status: StatusCode::from_u16(m.validation_response_status)
-                .unwrap_or(StatusCode::CONTINUE),
-            validation_success: m.validation_success,
-            calculated_entropy: m.calculated_entropy,
-            is_base64: m.is_base64,
-            dependent_captures: m.dependent_captures.clone(),
-        }
-    }
 
-    pub fn from_blob_match(blob_match: BlobMatch) -> Self {
-        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
-        //
-        // This indexing is intentionally different from the internal `validation_dedup_key()`
-        // (which uses get(0)) to maintain stable external fingerprints. Changing this would break:
-        // - Historical baselines that rely on fingerprint matching
-        // - Dedup entries stored in external systems
-        //
-        // For rules with nested captures like (?<REGEX>...(ABC)...), this may pick up
-        // the inner group, but that behavior is now established and must be preserved.
-        let matching_finding = blob_match
-            .captures
-            .captures
-            .get(1)
-            .or_else(|| blob_match.captures.captures.get(0))
-            .map(|capture| capture.raw_value().as_bytes().to_vec())
-            .unwrap_or_else(Vec::new);
-
-        let mut owned_blob_match = OwnedBlobMatch {
-            rule: blob_match.rule,
-            blob_id: blob_match.blob_id.clone(),
-            matching_input_offset_span: blob_match.matching_input_offset_span,
-            captures: blob_match.captures.clone(),
-            validation_response_body: blob_match.validation_response_body,
-            validation_response_status: blob_match.validation_response_status,
-            validation_success: blob_match.validation_success,
-            calculated_entropy: blob_match.calculated_entropy,
-            finding_fingerprint: 0, //default
-            is_base64: blob_match.is_base64,
-            dependent_captures: std::collections::BTreeMap::new(),
-        };
-
-        // Convert matching_finding to a &str (using lossy conversion if needed)
-        let finding_value = std::str::from_utf8(&matching_finding).unwrap_or("");
-        // Use blob_id as the file/commit identifier
-        let file_or_commit = &blob_match.blob_id.to_string();
-
-        let offset_start: u64 =
-            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
-        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
-
-        owned_blob_match.finding_fingerprint =
-            compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end);
-
-        owned_blob_match
-    }
-}
 // -------------------------------------------------------------------------------------------------
 // BlobMatch
 // -------------------------------------------------------------------------------------------------
@@ -156,7 +65,7 @@ impl OwnedBlobMatch {
 /// `Match`.
 pub struct BlobMatch<'a> {
     /// The rule that was matched
-    pub rule: Arc<Rule>, // Changed from `&'a Rule` to `Arc<Rule
+    pub rule: Arc<Rule>,
 
     /// The blob that was matched
     pub blob_id: &'a BlobId,
@@ -168,7 +77,7 @@ pub struct BlobMatch<'a> {
     pub matching_input_offset_span: OffsetSpan,
 
     /// The capture groups from the match
-    pub captures: SerializableCaptures, // regex::bytes::Captures<'a>,
+    pub captures: SerializableCaptures,
 
     pub validation_response_body: ValidationResponseBody,
     pub validation_response_status: StatusCode,
@@ -177,6 +86,7 @@ pub struct BlobMatch<'a> {
     pub calculated_entropy: f32,
     pub is_base64: bool,
 }
+
 #[derive(Clone)]
 struct UserData {
     /// A scratch vector for raw matches from Vectorscan, to minimize allocation
@@ -185,6 +95,7 @@ struct UserData {
     /// The length of the input being scanned
     input_len: u64,
 }
+
 // -------------------------------------------------------------------------------------------------
 // Matcher
 // -------------------------------------------------------------------------------------------------
@@ -222,6 +133,7 @@ pub struct Matcher<'a> {
     /// Whether matches should honour `ignore_if_contains` requirements.
     respect_ignore_if_contains: bool,
 }
+
 /// This `Drop` implementation updates the `global_stats` with the local stats
 impl<'a> Drop for Matcher<'a> {
     fn drop(&mut self) {
@@ -231,11 +143,19 @@ impl<'a> Drop for Matcher<'a> {
         }
     }
 }
+
 pub enum ScanResult<'a> {
     SeenWithMatches,
     SeenSansMatches,
     New(Vec<BlobMatch<'a>>),
 }
+
+impl<'a> Matcher<'a> {
+    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
+        self.profiler.as_ref().map(|p| p.generate_report())
+    }
+}
+
 impl<'a> Matcher<'a> {
     /// Create a new `Matcher` from the given `RulesDatabase`.
     ///
@@ -255,8 +175,6 @@ impl<'a> Matcher<'a> {
         // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec
         let raw_matches_scratch = Vec::new();
         let user_data = UserData { raw_matches_scratch, input_len: 0 };
-        // let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?;
-        // pool is created once per scan run (see Scanner section below)
         let profiler = shared_profiler.or_else(|| {
             if enable_profiling {
                 Some(Arc::new(ConcurrentRuleProfiler::new()))
@@ -349,7 +267,7 @@ impl<'a> Matcher<'a> {
         let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
             Vec::new()
         } else {
-            get_base64_strings(blob.bytes())
+            get_b64_strings(blob.bytes())
         };
 
         let lang_hint = lang.as_deref();
@@ -530,7 +448,7 @@ impl<'a> Matcher<'a> {
                     );
                 }
                 if depth + 1 < MAX_B64_DEPTH {
-                    for nested in get_base64_strings(item.decoded.as_slice()) {
+                    for nested in get_b64_strings(item.decoded.as_slice()) {
                         b64_stack.push((
                             DecodedData {
                                 decoded: nested.decoded,
@@ -559,268 +477,12 @@ impl<'a> Matcher<'a> {
         if self.user_data.raw_matches_scratch.capacity()
             > self.user_data.raw_matches_scratch.len() * 4
         {
-            // Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob
+            // Vec::shrink_to_fit may re-allocate, but we're about to leave scan_blob
             // so the cost is hidden off the hot path.
             self.user_data.raw_matches_scratch.shrink_to_fit();
         }
 
         Ok(ScanResult::New(matches))
-        // Ok(result)
-    }
-}
-
-#[inline]
-fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
-    let mut hasher = FxHasher::default();
-    // Hash each component directly without allocation
-    content.hash(&mut hasher);
-    rule_id.hash(&mut hasher);
-    start.hash(&mut hasher);
-    end.hash(&mut hasher);
-    hasher.finish()
-}
-
-#[inline]
-fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
-    let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
-    if idx > 0 {
-        if spans[idx - 1].fully_contains(&span) {
-            return false;
-        }
-        if span.fully_contains(&spans[idx - 1]) {
-            spans.remove(idx - 1);
-            idx -= 1;
-        }
-    }
-    if idx < spans.len() {
-        if spans[idx].fully_contains(&span) {
-            return false;
-        }
-        if span.fully_contains(&spans[idx]) {
-            spans.remove(idx);
-        }
-    }
-    spans.insert(idx, span);
-    true
-}
-
-#[inline]
-fn record_match(
-    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
-    rule_id: usize,
-    span: OffsetSpan,
-) -> bool {
-    insert_span(map.entry(rule_id).or_default(), span)
-}
-
-#[allow(clippy::too_many_arguments)]
-fn filter_match<'b>(
-    blob: &'b Blob,
-    rule: Arc<Rule>,
-    re: &Regex,
-    start: usize,
-    end: usize,
-    matches: &mut Vec<BlobMatch<'b>>,
-    previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
-    rule_id: usize,
-    seen_matches: &mut FxHashSet<u64>,
-    _origin: &OriginSet,
-    ts_match: Option<&[u8]>,
-    is_base64: bool,
-    _redact: bool,
-    filename: &str,
-    profiler: Option<&Arc<ConcurrentRuleProfiler>>,
-    respect_ignore_if_contains: bool,
-    inline_ignore_config: &InlineIgnoreConfig,
-) {
-    let mut timer =
-        profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
-
-    let initial_len = matches.len();
-
-    let blob_bytes = blob.bytes();
-    let default_slice = &blob_bytes[start..end];
-    let haystack = ts_match.unwrap_or(default_slice);
-
-    for captures in re.captures_iter(haystack) {
-        let full_capture = captures.get(0).unwrap();
-
-        // --- LOGIC TO FIND THE "SECRET" FOR ENTROPY/SAFE-LISTING ---
-        let matching_input_for_entropy = 'block: {
-            // 1. Prefer a named capture called TOKEN (case-insensitive).
-            if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
-                name_opt
-                    .filter(|name| name.eq_ignore_ascii_case("TOKEN"))
-                    .and_then(|_| captures.get(i))
-            }) {
-                break 'block token_cap;
-            }
-
-            // 2. Otherwise, prefer the first *matched* named capture.
-            if let Some(named_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
-                name_opt.and_then(|_| captures.get(i)) // find(i > 0 && name_opt.is_some())
-            }) {
-                break 'block named_cap;
-            }
-
-            // 3. Otherwise, fall back to the first positional capture (group 1).
-            if let Some(pos_cap) = captures.get(1) {
-                break 'block pos_cap;
-            }
-
-            // 4. Finally, fall back to the full match (group 0).
-            break 'block full_capture;
-        };
-        // --- END LOGIC ---
-
-        let min_entropy = rule.min_entropy();
-        let entropy_bytes = matching_input_for_entropy.as_bytes();
-        let full_bytes = full_capture.as_bytes();
-        let calculated_entropy = calculate_shannon_entropy(entropy_bytes);
-
-        // Check entropy and safe-listing against the *selected* secret bytes
-        if calculated_entropy <= min_entropy
-            || is_safe_match(entropy_bytes)
-            || is_user_match(entropy_bytes, full_bytes)
-        {
-            debug!(
-                "Skipping match with entropy {} <= {} or safe match",
-                calculated_entropy, min_entropy
-            );
-            continue;
-        }
-
-        // Check character requirements if specified
-        if let Some(char_reqs) = rule.pattern_requirements() {
-            let context = PatternRequirementContext {
-                regex: re,
-                captures: &captures,
-                full_match: full_bytes,
-            };
-
-            // Decide which bytes to validate:
-            // - If there are multiple capture groups OR any named captures → use full match
-            // - Otherwise → use entropy_bytes (the actual secret)
-            let use_full_match = {
-                let has_named_captures = re.capture_names().any(|n| n.is_some());
-                let capture_count = captures.len(); // includes group 0
-                has_named_captures || capture_count > 2
-            };
-
-            let validation_bytes = if use_full_match { full_bytes } else { entropy_bytes };
-
-            match char_reqs.validate(validation_bytes, Some(context), respect_ignore_if_contains) {
-                //
-                // --- END FIX ---
-                PatternValidationResult::Passed => {}
-                PatternValidationResult::Failed => {
-                    debug!(
-                        "Skipping match that does not meet character requirements for rule {}",
-                        rule.id()
-                    );
-                    continue;
-                }
-                PatternValidationResult::FailedChecksum { actual_len, expected_len } => {
-                    debug!(
-                        "Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})",
-                        rule.id(),
-                        actual_len,
-                        expected_len
-                    );
-                    continue;
-                }
-                PatternValidationResult::IgnoredBySubstring { matched_term } => {
-                    debug!(
-                        "Skipping match for rule {} because it contains ignored term {matched_term}",
-                        rule.id()
-                    );
-                    continue;
-                }
-            }
-        }
-
-        // Use the `matching_input_for_entropy` as the span/key for the finding.
-        let matching_input = matching_input_for_entropy;
-
-        let matching_input_offset_span = OffsetSpan::from_range(
-            (start + matching_input.start())..(start + matching_input.end()),
-        );
-        if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
-            debug!("Skipping match due to inline ignore directive");
-            continue;
-        }
-        if let Some(validation) = rule.syntax.validation.as_ref() {
-            match validation {
-                Validation::MongoDB => {
-                    let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
-                        debug!("Skipping match for rule {} due to non-UTF8 MongoDB URI", rule.id());
-                        continue;
-                    };
-                    if !is_parseable_mongodb_uri(uri) {
-                        debug!("Skipping match for rule {} due to invalid MongoDB URI", rule.id());
-                        continue;
-                    }
-                }
-                Validation::Postgres => {
-                    let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
-                        debug!(
-                            "Skipping match for rule {} due to non-UTF8 Postgres URI",
-                            rule.id()
-                        );
-                        continue;
-                    };
-                    if !is_parseable_postgres_uri(uri) {
-                        debug!("Skipping match for rule {} due to invalid Postgres URI", rule.id());
-                        continue;
-                    }
-                }
-                Validation::MySQL => {
-                    let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
-                        debug!("Skipping match for rule {} due to non-UTF8 MySQL URI", rule.id());
-                        continue;
-                    };
-                    if !is_parseable_mysql_uri(uri) {
-                        debug!("Skipping match for rule {} due to invalid MySQL URI", rule.id());
-                        continue;
-                    }
-                }
-                _ => {}
-            }
-        }
-        let match_key = compute_match_key(
-            matching_input.as_bytes(),
-            rule.id().as_bytes(),
-            matching_input_offset_span.start,
-            matching_input_offset_span.end,
-        );
-        if !seen_matches.insert(match_key) {
-            continue;
-        }
-        if !record_match(previous_matches, rule_id, matching_input_offset_span) {
-            continue;
-        }
-        let only_matching_input =
-            &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
-
-        // Pass the *full* capture object to from_captures
-        let groups = SerializableCaptures::from_captures(&captures, haystack, re);
-
-        matches.push(BlobMatch {
-            rule: Arc::clone(&rule),
-            blob_id: blob.id_ref(),
-            matching_input: only_matching_input,
-            matching_input_offset_span,
-            captures: groups,
-            validation_response_body: None,
-            validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
-            validation_success: false,
-            calculated_entropy,
-            is_base64,
-        });
-    }
-    if let Some(t) = timer.take() {
-        let new_count = (matches.len() - initial_len) as u64;
-        t.end(new_count > 0, new_count, 0);
     }
 }
 
@@ -837,10 +499,6 @@ fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, S
         "javascript" | "js" => {
             Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries()))
         }
-        // "kotlin" => Some((
-        //     Language::Kotlin,
-        //     parser::queries::kotlin::get_kotlin_queries(),
-        // )),
         "php" => Some((Language::Php, parser::queries::php::get_php_queries())),
         "python" | "py" | "starlark" => {
             Some((Language::Python, parser::queries::python::get_python_queries()))
@@ -855,354 +513,6 @@ fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, S
         _ => None,
     }
 }
-// -------------------------------------------------------------------------------------------------
-// MatchStats
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Default, Clone)]
-pub struct MatcherStats {
-    pub blobs_seen: u64,
-    pub blobs_scanned: u64,
-    pub bytes_seen: u64,
-    pub bytes_scanned: u64,
-    // #[cfg(feature = "rule_profiling")]
-    // pub rule_stats: crate::rule_profiling::RuleProfile,
-}
-impl MatcherStats {
-    pub fn update(&mut self, other: &Self) {
-        self.blobs_seen += other.blobs_seen;
-        self.blobs_scanned += other.blobs_scanned;
-        self.bytes_seen += other.bytes_seen;
-        self.bytes_scanned += other.bytes_scanned;
-
-        // #[cfg(feature = "rule_profiling")]
-        // self.rule_stats.update(&other.rule_stats);
-    }
-}
-// -------------------------------------------------------------------------------------------------
-// Group
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
-pub struct Group(pub Base64BString);
-impl Group {
-    pub fn new(m: regex::bytes::Match<'_>) -> Self {
-        Self(Base64BString(BString::from(m.as_bytes())))
-    }
-}
-// -------------------------------------------------------------------------------------------------
-// Groups
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
-pub struct Groups(pub SmallVec<[Group; 1]>);
-impl JsonSchema for Groups {
-    fn schema_name() -> String {
-        "Groups".to_string()
-    }
-
-    fn json_schema(gen: &mut SchemaGenerator) -> Schema {
-        let group_schema = gen.subschema_for::<Group>();
-        Schema::Object(schemars::schema::SchemaObject {
-            instance_type: Some(InstanceType::Array.into()),
-            array: Some(Box::new(ArrayValidation {
-                items: Some(group_schema.into()),
-                ..Default::default()
-            })),
-            ..Default::default()
-        })
-    }
-}
-// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
-// pub struct SerializableCapture {
-//     pub name: Option<String>, // Named group (if available)
-//     pub match_number: i32,
-//     pub start: usize,  // Start position of the match
-//     pub end: usize,    // End position of the match
-//     pub value: String, // The actual captured value
-// }
-#[derive(Debug, Clone, JsonSchema)]
-pub struct SerializableCapture {
-    pub name: Option<&'static str>,
-    pub match_number: i32,
-    pub start: usize,
-    pub end: usize,
-    /// Interned original (unredacted) value.
-    #[serde(skip_serializing, skip_deserializing)]
-    pub value: &'static str,
-}
-
-impl SerializableCapture {
-    /// Returns the original captured value.
-    pub fn raw_value(&self) -> &'static str {
-        self.value
-    }
-
-    /// Returns the value that should be shown in user-facing output.
-    pub fn display_value(&self) -> std::borrow::Cow<'static, str> {
-        crate::util::display_value(self.value)
-    }
-}
-
-impl serde::Serialize for SerializableCapture {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        use serde::ser::SerializeStruct;
-
-        let mut state = serializer.serialize_struct("SerializableCapture", 5)?;
-        state.serialize_field("name", &self.name)?;
-        state.serialize_field("match_number", &self.match_number)?;
-        state.serialize_field("start", &self.start)?;
-        state.serialize_field("end", &self.end)?;
-        let value = self.display_value();
-        state.serialize_field("value", &value)?;
-        state.end()
-    }
-}
-
-#[derive(Debug, Clone, Serialize, JsonSchema)]
-pub struct SerializableCaptures {
-    #[schemars(with = "Vec<SerializableCapture>")]
-    pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
-}
-
-impl SerializableCaptures {
-    pub fn from_captures(captures: &regex::bytes::Captures, _input: &[u8], re: &Regex) -> Self {
-        let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
-
-        let capture_names: SmallVec<[Option<&'static str>; 4]> =
-            re.capture_names().map(|name| name.map(intern)).collect();
-
-        // If there are explicit capture groups (e.g., group 1, 2, ...),
-        // only serialize those.
-        if captures.len() > 1 {
-            for i in 1..captures.len() {
-                // Start from 1
-                if let Some(cap) = captures.get(i) {
-                    let raw_value = String::from_utf8_lossy(cap.as_bytes());
-                    let raw_interned = intern(raw_value.as_ref());
-                    let name = capture_names.get(i).and_then(|opt| *opt);
-
-                    serialized_captures.push(SerializableCapture {
-                        name,
-                        match_number: i32::try_from(i).unwrap_or(0),
-                        start: cap.start(),
-                        end: cap.end(),
-                        value: raw_interned,
-                    });
-                }
-            }
-        } else if captures.len() == 1 {
-            // ELSE, if there is ONLY the full match (len == 1),
-            // serialize just that full match (group 0) as the fallback.
-            if let Some(cap) = captures.get(0) {
-                let raw_value = String::from_utf8_lossy(cap.as_bytes());
-                let raw_interned = intern(raw_value.as_ref());
-                let name = capture_names.get(0).and_then(|opt| *opt);
-
-                serialized_captures.push(SerializableCapture {
-                    name,
-                    match_number: 0,
-                    start: cap.start(),
-                    end: cap.end(),
-                    value: raw_interned,
-                });
-            }
-        }
-        // If len == 0 (no match), loop is skipped, empty vec is returned.
-
-        SerializableCaptures { captures: serialized_captures }
-    }
-}
-// -------------------------------------------------------------------------------------------------
-// Match
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Clone, Serialize, JsonSchema)]
-pub struct Match {
-    /// The location of the entire matching content
-    pub location: Location,
-
-    /// The capture groups
-    pub groups: SerializableCaptures, // Store serialized captures
-
-    /// unique identifier of file / blob where this match was found
-    pub blob_id: BlobId,
-
-    /// The unique content-based identifier of this match
-    pub finding_fingerprint: u64,
-
-    /// The rule that produced this match
-    #[serde(skip_serializing)]
-    #[schemars(skip)]
-    pub rule: Arc<Rule>,
-
-    /// Validation Body
-    #[serde(
-        default,
-        serialize_with = "validation_body::serialize",
-        deserialize_with = "validation_body::deserialize"
-    )]
-    #[schemars(schema_with = "validation_body::schema")]
-    pub validation_response_body: ValidationResponseBody,
-
-    /// Validation Status Code
-    pub validation_response_status: u16,
-
-    /// Validation Success
-    pub validation_success: bool,
-
-    /// Validation Success
-    pub calculated_entropy: f32,
-
-    pub visible: bool,
-    #[serde(default)]
-    pub is_base64: bool,
-
-    /// Variables captured from dependent rules (from depends_on_rule).
-    /// Maps variable name (uppercase) to captured value.
-    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
-    pub dependent_captures: std::collections::BTreeMap<String, String>,
-}
-impl Match {
-    #[inline]
-    pub fn convert_owned_blobmatch_to_match<'a>(
-        loc_mapping: Option<&'a LocationMapping<'a>>,
-        owned_blob_match: &'a OwnedBlobMatch,
-        origin_type: &'a str,
-    ) -> Self {
-        let offset_span = owned_blob_match.matching_input_offset_span;
-        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
-        // See comment in from_blob_match() for why this differs from validation_dedup_key().
-        let matching_finding_bytes = owned_blob_match
-            .captures
-            .captures
-            .get(1)
-            .or_else(|| owned_blob_match.captures.captures.get(0))
-            .map(|capture| capture.raw_value().as_bytes())
-            .unwrap_or_default();
-
-        // The fingerprint will be based on the content of the secret.
-        let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
-
-        let source_span =
-            loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
-                start: SourcePoint { line: 0, column: 0 },
-                end: SourcePoint { line: 0, column: 0 },
-            });
-        let offset_start: u64 =
-            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
-        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
-
-        let finding_fingerprint = compute_finding_fingerprint(
-            finding_value_for_fp,
-            origin_type, // file_or_commit,
-            offset_start,
-            offset_end,
-        );
-
-        // matching_snippet
-        Match {
-            rule: owned_blob_match.rule.clone(),
-            visible: owned_blob_match.rule.visible().to_owned(),
-            location: Location::with_source_span(offset_span, Some(source_span.clone())),
-            groups: owned_blob_match.captures.clone(),
-            blob_id: owned_blob_match.blob_id,
-            finding_fingerprint,
-            validation_response_body: owned_blob_match.validation_response_body.clone(),
-            validation_response_status: owned_blob_match.validation_response_status.as_u16(),
-            validation_success: owned_blob_match.validation_success,
-            calculated_entropy: owned_blob_match.calculated_entropy,
-            is_base64: owned_blob_match.is_base64,
-            dependent_captures: owned_blob_match.dependent_captures.clone(),
-        }
-    }
-
-    /// Returns the `blob_id` of the match.
-    pub fn get_blob_id(&self) -> BlobId {
-        self.blob_id.clone()
-    }
-
-    pub fn finding_id(&self) -> String {
-        let mut buffer = Vec::with_capacity(128);
-        buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes());
-        buffer.push(0);
-        serde_json::to_writer(&mut buffer, &self.groups)
-            .expect("should be able to serialize groups as JSON");
-        let mut num = xxh3_64(&buffer);
-        // Ensure the number is positive and within i64 range
-        num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive
-                                      // Convert to string
-        num.to_string()
-    }
-}
-#[derive(Debug, Clone)]
-pub struct DecodedData {
-    pub decoded: Vec<u8>,
-    pub pos_start: usize,
-    pub pos_end: usize,
-}
-#[inline]
-fn is_base64_byte(b: u8) -> bool {
-    // Accepts both standard base64 ('+', '/') and URL-safe base64 ('-', '_') characters.
-    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_')
-}
-
-pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
-    let mut results = Vec::new();
-    let mut i = 0;
-    while i < input.len() {
-        while i < input.len() && !is_base64_byte(input[i]) {
-            i += 1;
-        }
-        let start = i;
-        while i < input.len() && is_base64_byte(input[i]) {
-            i += 1;
-        }
-
-        let mut eq_count = 0;
-        while i < input.len() && input[i] == b'=' && eq_count < 2 {
-            i += 1;
-            eq_count += 1;
-        }
-        let end = i;
-
-        let len = end - start;
-        if len >= 32 && len % 4 == 0 {
-            let base64_slice = &input[start..end];
-
-            // Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD
-            let decode_result = general_purpose::STANDARD
-                .decode(base64_slice)
-                .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
-                .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));
-
-            if let Ok(decoded) = decode_result {
-                if decoded.is_ascii() {
-                    results.push(DecodedData { decoded, pos_start: start, pos_end: end });
-                }
-            }
-        }
-    }
-
-    results
-}
-
-pub fn compute_finding_fingerprint(
-    finding_value: &str,
-    file_or_commit: &str,
-    offset_start: u64,
-    offset_end: u64,
-) -> u64 {
-    // Combine all into a byte buffer and hash it directly:
-    let mut buf = Vec::with_capacity(
-        finding_value.len() + file_or_commit.len() + 2 * std::mem::size_of::<u64>(),
-    );
-    buf.extend_from_slice(finding_value.as_bytes());
-    buf.extend_from_slice(file_or_commit.as_bytes());
-    buf.extend_from_slice(&offset_start.to_le_bytes());
-    buf.extend_from_slice(&offset_end.to_le_bytes());
-
-    xxh3_64(&buf)
-}
 
 // -------------------------------------------------------------------------------------------------
 // test
@@ -1220,6 +530,7 @@ mod test {
     use super::*;
     use crate::{
         blob::{Blob, BlobIdMap},
+        entropy::calculate_shannon_entropy,
         origin::{Origin, OriginSet},
         rules::rule::{
             DependsOnRule, HttpRequest, HttpValidation, PatternRequirements, RuleSyntax, Validation,
@@ -1341,8 +652,6 @@ mod test {
         let input = "some test data for vectorscan";
         let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
         let enable_rule_profiling = true;
-        // let mut matcher = Matcher::new(&rules_db, &seen_blobs, None,
-        // enable_rule_profiling)?;
         let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
         let mut matcher = Matcher::new(
             &rules_db,
@@ -1513,7 +822,7 @@ mod test {
         assert_eq!((item.pos_start, item.pos_end), (4, 4 + base64_payload.len()));
     }
 
-    /// `compute_finding_fingerprint` must be stable (same input ⇒ same output)
+    /// `compute_finding_fingerprint` must be stable (same input => same output)
     /// and sensitive to any input component.
     #[test]
     fn test_finding_fingerprint_stability_and_uniqueness() {
@@ -1535,7 +844,7 @@ mod test {
     /// keys as soon as *anything* changes.
     #[test]
     fn test_compute_match_key_uniqueness() {
-        use super::compute_match_key;
+        use super::dedup::compute_match_key;
 
         let k1 = compute_match_key(b"abc", b"rule-1", 0, 3);
         let k2 = compute_match_key(b"abc", b"rule-1", 0, 3);
@@ -1721,6 +1030,8 @@ line2
 
     #[test]
     fn serializes_captures_in_numeric_order() {
+        use regex::bytes::Regex;
+
         let re =
             Regex::new(r"(?xi)\b(ghp_(?P<body>[A-Z0-9]{3})(?P<checksum>[A-Z0-9]{2}))").unwrap();
         let caps = re.captures(b"ghp_ABC12").expect("expected captures");
diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs
index a7ffc87..e07cec7 100644
--- a/src/scanner/runner.rs
+++ b/src/scanner/runner.rs
@@ -42,9 +42,18 @@ use crate::{
         AccessMapCollector,
     },
     util::set_redaction_enabled,
+    validation::CachedResponse,
     validation_rate_limit::ValidationRateLimiter,
 };
 
+/// Shared validation dependencies as a positional tuple; callers read the fields as `.0`..`.3`.
+type ValidationDeps = Arc<(
+    liquid::Parser,                       // .0: liquid template parser
+    crate::validation::ValidationClients, // .1: HTTP clients
+    Arc<SkipMap<String, CachedResponse>>, // .2: validation cache of `CachedResponse` keyed by `String`
+    Option<Arc<ValidationRateLimiter>>,   // .3: optional rate limiter
+)>;
+
 pub async fn run_scan(
     global_args: &global::GlobalArgs,
     scan_args: &scan::ScanArgs,
@@ -64,22 +73,9 @@ pub async fn run_async_scan(
     rules_db: &RulesDatabase,
     update_status: &crate::update::UpdateStatus,
 ) -> Result<()> {
-    // Ensure all provided paths exist before proceeding
-    for path in &args.input_specifier_args.path_inputs {
-        if !path.exists() {
-            error!("Specified input path does not exist: {}", path.display());
-            bail!("Invalid input: Path does not exist - {}", path.display());
-        }
-    }
-
-    // Register user-provided allow-list patterns
-    for pattern in &args.skip_regex {
-        safe_list::add_user_regex(pattern)
-            .map_err(|e| anyhow::anyhow!("Invalid skip-regex '{pattern}': {e}"))?;
-    }
-    for word in &args.skip_word {
-        safe_list::add_user_skipword(word);
-    }
+    // ── Phase 1: Input validation and environment setup ──────────────────
+    validate_inputs(args)?;
+    register_safe_list_patterns(args)?;
 
     let start_time = Instant::now();
     let scan_started_at = chrono::Local::now();
@@ -90,123 +86,26 @@ pub async fn run_async_scan(
 
     set_redaction_enabled(args.redact);
 
-    let mut repo_urls = enumerate_github_repos(args, global_args).await?;
-    let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
-    let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
-    let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
-    let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
-    let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
-
-    // Combine repository URLs
-    repo_urls.extend(gitlab_repo_urls);
-    repo_urls.extend(gitea_repo_urls);
-    repo_urls.extend(huggingface_repo_urls);
-    repo_urls.extend(bitbucket_repo_urls);
-    repo_urls.extend(azure_repo_urls);
-
-    // Add wiki repositories for each URL when requested
-    if args.input_specifier_args.repo_artifacts {
-        let mut wiki_urls = Vec::new();
-        for url in &repo_urls {
-            if let Some(w) = github::wiki_url(url) {
-                wiki_urls.push(w);
-            }
-            if let Some(w) = gitlab::wiki_url(url) {
-                wiki_urls.push(w);
-            }
-            if let Some(w) = gitea::wiki_url(url) {
-                wiki_urls.push(w);
-            }
-            if let Some(w) = bitbucket::wiki_url(url) {
-                wiki_urls.push(w);
-            }
-            if let Some(w) = azure::wiki_url(url) {
-                wiki_urls.push(w);
-            }
-        }
-        repo_urls.extend(wiki_urls);
-    }
-
-    // just sort and dedup once
-    repo_urls.sort();
-    repo_urls.dedup();
+    // ── Phase 2: Repository enumeration ─────────────────────────────────
+    let repo_urls = enumerate_all_repos(args, global_args).await?;
 
     let mut input_roots = args.input_specifier_args.path_inputs.clone();
     let (repo_tx, repo_rx) = crossbeam_channel::unbounded();
-    let repo_clone_handle = if repo_urls.is_empty() {
-        None
-    } else {
-        let clone_args = args.clone();
-        let clone_globals = global_args.clone();
-        let clone_repo_urls = repo_urls.clone();
-        let clone_datastore = Arc::clone(&datastore);
-        let clone_repo_tx = repo_tx.clone();
-        Some(std::thread::spawn(move || {
-            if let Err(e) = clone_or_update_git_repos_streaming(
-                &clone_args,
-                &clone_globals,
-                &clone_repo_urls,
-                &clone_datastore,
-                |path| {
-                    let _ = clone_repo_tx.send(path);
-                },
-            ) {
-                error!("Failed to fetch one or more Git repositories: {e}");
-            }
-        }))
-    };
-    drop(repo_tx);
+    let repo_clone_handle =
+        start_repo_cloning(&repo_urls, args, global_args, &datastore, repo_tx, progress_enabled);
 
-    // Fetch issues, gists, and wikis if enabled
-    let bitbucket_auth = bitbucket::AuthConfig::from_env();
-    let bitbucket_host =
-        args.input_specifier_args.bitbucket_api_url.host_str().map(|s| s.to_string());
-
-    if args.input_specifier_args.repo_artifacts {
-        let repo_artifact_dirs = fetch_git_host_artifacts(
-            &repo_urls,
-            &args.input_specifier_args.bitbucket_api_url,
-            &bitbucket_auth,
-            bitbucket_host.clone(),
-            global_args,
-            &datastore,
-        )
-        .await?;
-        input_roots.extend(repo_artifact_dirs);
-    }
-    // Fetch Jira issues if requested
-    let jira_dirs = fetch_jira_issues(args, global_args, &datastore).await?;
-    input_roots.extend(jira_dirs);
-
-    // Fetch Confluence pages if requested
-    let confluence_dirs = fetch_confluence_pages(args, global_args, &datastore).await?;
-    input_roots.extend(confluence_dirs);
-
-    // Fetch Slack messages if requested
-    let slack_dirs = fetch_slack_messages(args, global_args, &datastore).await?;
-    input_roots.extend(slack_dirs);
-
-    // Save Docker images if specified
-    if !args.input_specifier_args.docker_image.is_empty() {
-        let clone_root = {
-            let ds = datastore.lock().unwrap();
-            ds.clone_root()
-        };
-        let docker_dirs = save_docker_images(
-            &args.input_specifier_args.docker_image,
-            &clone_root,
-            progress_enabled,
-        )
-        .await?;
-        for (dir, img) in docker_dirs {
-            {
-                let mut ds = datastore.lock().unwrap();
-                ds.register_docker_image(dir.clone(), img);
-            }
-            input_roots.push(dir);
-        }
-    }
+    // ── Phase 3: Artifact fetching ──────────────────────────────────────
+    fetch_all_artifacts(
+        args,
+        global_args,
+        &repo_urls,
+        &datastore,
+        &mut input_roots,
+        progress_enabled,
+    )
+    .await?;
 
+    // ── Phase 4: Scan configuration ─────────────────────────────────────
     let shared_profiler = Arc::new(ConcurrentRuleProfiler::new());
     let enable_profiling = args.rule_stats;
     let matcher_stats = Arc::new(Mutex::new(MatcherStats::default()));
@@ -246,11 +145,257 @@ pub async fn run_async_scan(
             .unwrap_or_else(|| std::path::PathBuf::from("baseline-file.yaml")),
     );
 
-    let mut skip_aws_accounts = args.skip_aws_account.clone();
+    let skip_aws_accounts = load_skip_aws_accounts(args)?;
+    crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
 
     let mut access_map_collector =
         if args.access_map { Some(AccessMapCollector::default()) } else { None };
 
+    let repo_roots = expand_repo_roots(&input_roots)?;
+    let git_repo_count =
+        repo_roots.iter().filter(|p| p.join(".git").is_dir()).count() + repo_urls.len();
+    let use_parallel_repo_scan = git_repo_count > 10;
+
+    let validation_rate_limiter =
+        ValidationRateLimiter::from_cli(args.validation_rps, &args.validation_rps_rule)?
+            .map(Arc::new);
+
+    let validation_deps: Option<ValidationDeps> = if !args.no_validate {
+        info!("Starting secret validation phase...");
+        Some(Arc::new((
+            register_all(liquid::ParserBuilder::with_stdlib()).build()?,
+            crate::validation::ValidationClients::new(global_args.tls_mode)?,
+            Arc::new(SkipMap::new()),
+            validation_rate_limiter.clone(),
+        )))
+    } else {
+        None
+    };
+
+    // ── Phase 5: Scanning ───────────────────────────────────────────────
+    if !use_parallel_repo_scan {
+        run_sequential_scan(
+            args,
+            global_args,
+            &datastore,
+            rules_db,
+            &mut input_roots,
+            repo_rx,
+            repo_clone_handle,
+            &shared_profiler,
+            enable_profiling,
+            &matcher_stats,
+            &baseline_path,
+            &validation_deps,
+            &mut access_map_collector,
+            progress_enabled,
+            start_time,
+            scan_started_at,
+            update_status,
+        )
+        .await?;
+        return Ok(());
+    }
+
+    run_parallel_scan(
+        args,
+        global_args,
+        &datastore,
+        rules_db,
+        &repo_roots,
+        repo_rx,
+        repo_clone_handle,
+        &shared_profiler,
+        enable_profiling,
+        &matcher_stats,
+        &baseline_path,
+        &validation_deps,
+        &mut access_map_collector,
+        progress_enabled,
+        start_time,
+        scan_started_at,
+        update_status,
+    )
+    .await
+}
+
+// =================================================================================================
+// Phase helpers
+// =================================================================================================
+
+/// Ensures every entry in `path_inputs` exists; logs and fails on the first missing path.
+fn validate_inputs(args: &scan::ScanArgs) -> Result<()> {
+    for path in &args.input_specifier_args.path_inputs {
+        if !path.exists() {
+            error!("Specified input path does not exist: {}", path.display());
+            bail!("Invalid input: Path does not exist - {}", path.display());
+        }
+    }
+    Ok(())
+}
+
+/// Registers `skip_regex` and `skip_word` entries with `safe_list`; a rejected regex is an error.
+fn register_safe_list_patterns(args: &scan::ScanArgs) -> Result<()> {
+    for pattern in &args.skip_regex {
+        safe_list::add_user_regex(pattern)
+            .map_err(|e| anyhow::anyhow!("Invalid skip-regex '{pattern}': {e}"))?;
+    }
+    for word in &args.skip_word {
+        safe_list::add_user_skipword(word);
+    }
+    Ok(())
+}
+
+/// Enumerates repos from every platform, adds wiki URLs if `repo_artifacts`, sorts and dedups.
+async fn enumerate_all_repos(
+    args: &scan::ScanArgs,
+    global_args: &global::GlobalArgs,
+) -> Result<Vec<crate::git_url::GitUrl>> {
+    let mut repo_urls = enumerate_github_repos(args, global_args).await?;
+    let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
+    let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
+    let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
+    let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
+    let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
+
+    repo_urls.extend(gitlab_repo_urls);
+    repo_urls.extend(gitea_repo_urls);
+    repo_urls.extend(huggingface_repo_urls);
+    repo_urls.extend(bitbucket_repo_urls);
+    repo_urls.extend(azure_repo_urls);
+
+    // Add wiki repositories for each URL when requested
+    if args.input_specifier_args.repo_artifacts {
+        let mut wiki_urls = Vec::new();
+        for url in &repo_urls {
+            if let Some(w) = github::wiki_url(url) {
+                wiki_urls.push(w);
+            }
+            if let Some(w) = gitlab::wiki_url(url) {
+                wiki_urls.push(w);
+            }
+            if let Some(w) = gitea::wiki_url(url) {
+                wiki_urls.push(w);
+            }
+            if let Some(w) = bitbucket::wiki_url(url) {
+                wiki_urls.push(w);
+            }
+            if let Some(w) = azure::wiki_url(url) {
+                wiki_urls.push(w);
+            }
+        }
+        repo_urls.extend(wiki_urls);
+    }
+
+    repo_urls.sort(); // sort first: `dedup` only removes consecutive duplicates
+    repo_urls.dedup();
+
+    Ok(repo_urls)
+}
+
+/// Spawns a thread that clones/updates `repo_urls`, sending each path on `repo_tx`; `None` if empty.
+fn start_repo_cloning(
+    repo_urls: &[crate::git_url::GitUrl],
+    args: &scan::ScanArgs,
+    global_args: &global::GlobalArgs,
+    datastore: &Arc<Mutex<FindingsStore>>,
+    repo_tx: crossbeam_channel::Sender<PathBuf>,
+    _progress_enabled: bool, // not read anywhere in this function
+) -> Option<std::thread::JoinHandle<()>> {
+    if repo_urls.is_empty() {
+        drop(repo_tx); // no worker will send; release the sender right away
+        return None;
+    }
+
+    let clone_args = args.clone();
+    let clone_globals = global_args.clone();
+    let clone_repo_urls = repo_urls.to_vec();
+    let clone_datastore = Arc::clone(datastore);
+    let clone_repo_tx = repo_tx.clone();
+
+    let handle = std::thread::spawn(move || {
+        if let Err(e) = clone_or_update_git_repos_streaming(
+            &clone_args,
+            &clone_globals,
+            &clone_repo_urls,
+            &clone_datastore,
+            |path| {
+                let _ = clone_repo_tx.send(path); // a failed send is ignored
+            },
+        ) {
+            error!("Failed to fetch one or more Git repositories: {e}"); // logged, not propagated
+        }
+    });
+    drop(repo_tx); // only the clone moved into the worker thread remains from this function
+    Some(handle)
+}
+
+/// Fetches repo-host artifacts, Jira, Confluence, Slack and Docker inputs into `input_roots`.
+async fn fetch_all_artifacts(
+    args: &scan::ScanArgs,
+    global_args: &global::GlobalArgs,
+    repo_urls: &[crate::git_url::GitUrl],
+    datastore: &Arc<Mutex<FindingsStore>>,
+    input_roots: &mut Vec<PathBuf>,
+    progress_enabled: bool,
+) -> Result<()> {
+    let bitbucket_auth = bitbucket::AuthConfig::from_env();
+    let bitbucket_host =
+        args.input_specifier_args.bitbucket_api_url.host_str().map(|s| s.to_string());
+
+    if args.input_specifier_args.repo_artifacts {
+        let repo_artifact_dirs = fetch_git_host_artifacts(
+            repo_urls,
+            &args.input_specifier_args.bitbucket_api_url,
+            &bitbucket_auth,
+            bitbucket_host.clone(),
+            global_args,
+            datastore,
+        )
+        .await?;
+        input_roots.extend(repo_artifact_dirs);
+    }
+
+    // Fetch Jira issues if requested
+    let jira_dirs = fetch_jira_issues(args, global_args, datastore).await?;
+    input_roots.extend(jira_dirs);
+
+    // Fetch Confluence pages if requested
+    let confluence_dirs = fetch_confluence_pages(args, global_args, datastore).await?;
+    input_roots.extend(confluence_dirs);
+
+    // Fetch Slack messages if requested
+    let slack_dirs = fetch_slack_messages(args, global_args, datastore).await?;
+    input_roots.extend(slack_dirs);
+
+    // Save Docker images if specified
+    if !args.input_specifier_args.docker_image.is_empty() {
+        let clone_root = {
+            let ds = datastore.lock().unwrap(); // guard ends with this block, before the await
+            ds.clone_root()
+        };
+        let docker_dirs = save_docker_images(
+            &args.input_specifier_args.docker_image,
+            &clone_root,
+            progress_enabled,
+        )
+        .await?;
+        for (dir, img) in docker_dirs {
+            {
+                let mut ds = datastore.lock().unwrap(); // lock scoped to this registration only
+                ds.register_docker_image(dir.clone(), img);
+            }
+            input_roots.push(dir);
+        }
+    }
+
+    Ok(())
+}
+
+/// Loads AWS account IDs to skip from CLI args and optional file.
+fn load_skip_aws_accounts(args: &scan::ScanArgs) -> Result<Vec<String>> {
+    let mut skip_aws_accounts = args.skip_aws_account.clone();
+
     if let Some(path) = args.skip_aws_account_file.as_ref() {
         let contents = fs::read_to_string(path).with_context(|| {
             format!("Failed to read --skip-aws-account-file {}", path.display())
@@ -267,183 +412,214 @@ pub async fn run_async_scan(
         }
     }
 
-    crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
+    Ok(skip_aws_accounts)
+}
 
-    let repo_roots = expand_repo_roots(&input_roots)?;
-    let git_repo_count =
-        repo_roots.iter().filter(|p| p.join(".git").is_dir()).count() + repo_urls.len();
-    let use_parallel_repo_scan = git_repo_count > 10;
-
-    let validation_rate_limiter =
-        ValidationRateLimiter::from_cli(args.validation_rps, &args.validation_rps_rule)?
-            .map(Arc::new);
-
-    let validation_deps = if !args.no_validate {
-        info!("Starting secret validation phase...");
-        Some(Arc::new((
-            register_all(liquid::ParserBuilder::with_stdlib()).build()?,
-            crate::validation::ValidationClients::new(global_args.tls_mode)?,
-            Arc::new(SkipMap::new()),
-            validation_rate_limiter.clone(),
-        )))
-    } else {
-        None
-    };
-
-    if !use_parallel_repo_scan {
-        let mut streamed_roots = Vec::new();
-        if !input_roots.is_empty() {
-            let _inputs = enumerate_filesystem_inputs(
-                args,
-                datastore.clone(),
-                &input_roots,
-                progress_enabled,
-                rules_db,
-                enable_profiling,
-                Arc::clone(&shared_profiler),
-                matcher_stats.as_ref(),
-            )?;
-        }
-
-        for repo_root in repo_rx.clone().iter() {
-            enumerate_filesystem_inputs(
-                args,
-                datastore.clone(),
-                &[repo_root.clone()],
-                progress_enabled,
-                rules_db,
-                enable_profiling,
-                Arc::clone(&shared_profiler),
-                matcher_stats.as_ref(),
-            )?;
-            streamed_roots.push(repo_root);
-        }
-        input_roots.extend(streamed_roots);
-
-        if let Some(handle) = repo_clone_handle {
-            let _ = handle.join();
-        }
-
-        if !args.no_dedup {
-            let reporter = crate::reporter::DetailsReporter {
-                datastore: Arc::clone(&datastore),
-                styles: Styles::new(global_args.use_color(std::io::stdout())),
-                only_valid: args.only_valid,
-            };
-
-            let all_matches = reporter.get_unfiltered_matches(Some(false))?;
-            let deduped_matches = reporter.deduplicate_matches(all_matches, args.no_dedup);
-
-            let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
-                .into_iter()
-                .map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
-                .collect();
-            let mut ds = datastore.lock().unwrap();
-            ds.replace_matches(deduped_arcs);
-        }
-
-        if args.baseline_file.is_some() || args.manage_baseline {
-            let mut ds = datastore.lock().unwrap();
-            crate::baseline::apply_baseline(
-                &mut ds,
-                baseline_path.as_ref(),
-                args.manage_baseline,
-                &input_roots,
-            )?;
-        }
-
-        if let Some(validation) = &validation_deps {
-            let (parser, clients, cache, rate_limiter) =
-                (&validation.0, &validation.1, &validation.2, &validation.3);
-            run_secret_validation(
-                Arc::clone(&datastore),
-                parser,
-                clients,
-                cache,
-                args.num_jobs,
-                None,
-                access_map_collector.clone(),
-                rate_limiter.clone(),
-                Duration::from_secs(args.validation_timeout),
-                args.validation_retries,
-            )
-            .await?;
-        }
-
-        if let Some(collector) = access_map_collector.take() {
-            finalize_access_map(&datastore, collector, args).await?;
-        }
-
-        crate::reporter::run(global_args, Arc::clone(&datastore), args)
-            .context("Failed to run report command")?;
-        print_scan_summary(
-            start_time,
-            scan_started_at,
-            &datastore,
-            global_args,
-            args,
-            rules_db,
-            matcher_stats.as_ref(),
-            if enable_profiling { Some(shared_profiler.as_ref()) } else { None },
-            update_status,
-            None,
-            None,
-        );
-        maybe_hint_access_map(&datastore, args);
+/// Deduplicates matches from `start_index` onward, keeping earlier ones; no-op when `no_dedup`.
+fn deduplicate_new_matches(
+    store: &Arc<Mutex<FindingsStore>>,
+    global_args: &global::GlobalArgs,
+    args: &scan::ScanArgs,
+    start_index: usize,
+) -> Result<()> {
+    if args.no_dedup {
         return Ok(());
     }
 
-    let deduplicate_new_matches =
-        |store: &Arc<Mutex<FindingsStore>>, start_index: usize| -> Result<()> {
-            if args.no_dedup {
-                return Ok(());
-            }
+    let reporter = crate::reporter::DetailsReporter {
+        datastore: Arc::clone(store),
+        styles: Styles::new(global_args.use_color(std::io::stdout())),
+        only_valid: args.only_valid,
+    };
 
-            let reporter = crate::reporter::DetailsReporter {
-                datastore: Arc::clone(store),
-                styles: Styles::new(global_args.use_color(std::io::stdout())),
-                only_valid: args.only_valid,
-            };
+    let all_matches = reporter.get_unfiltered_matches(Some(false))?;
+    if start_index >= all_matches.len() {
+        return Ok(());
+    }
 
-            let all_matches = reporter.get_unfiltered_matches(Some(false))?;
-            if start_index >= all_matches.len() {
-                return Ok(());
-            }
+    let slice = if start_index == 0 { all_matches } else { all_matches[start_index..].to_vec() };
+    let deduped_matches = reporter.deduplicate_matches(slice, args.no_dedup);
 
-            let deduped_matches =
-                reporter.deduplicate_matches(all_matches[start_index..].to_vec(), args.no_dedup);
+    let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
+        .into_iter()
+        .map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
+        .collect();
 
-            let deduped_arcs: Vec<Arc<FindingsStoreMessage>> = deduped_matches
-                .into_iter()
-                .map(|rm| Arc::new((Arc::new(rm.origin), Arc::new(rm.blob_metadata), rm.m)))
-                .collect();
-
-            let mut ds = store.lock().unwrap();
-            let mut preserved = ds.get_matches()[..start_index].to_vec();
-            preserved.extend(deduped_arcs);
-            ds.replace_matches(preserved);
-            Ok(())
-        };
-
-    deduplicate_new_matches(&datastore, 0)?;
+    let mut ds = store.lock().unwrap();
+    if start_index == 0 {
+        ds.replace_matches(deduped_arcs);
+    } else {
+        let mut preserved = ds.get_matches()[..start_index].to_vec();
+        preserved.extend(deduped_arcs);
+        ds.replace_matches(preserved);
+    }
+    Ok(())
+}
 
+/// Calls `apply_baseline` when `baseline_file` is set or `manage_baseline` is true; else no-op.
+fn apply_baseline_if_configured(
+    args: &scan::ScanArgs,
+    datastore: &Arc<Mutex<FindingsStore>>,
+    baseline_path: &std::path::Path,
+    roots: &[PathBuf],
+) -> Result<()> {
     if args.baseline_file.is_some() || args.manage_baseline {
         let mut ds = datastore.lock().unwrap();
-        crate::baseline::apply_baseline(
-            &mut ds,
-            baseline_path.as_ref(),
-            args.manage_baseline,
-            &repo_roots,
+        crate::baseline::apply_baseline(&mut ds, baseline_path, args.manage_baseline, roots)?;
+    }
+    Ok(())
+}
+
+/// Runs `run_secret_validation` over `match_range` when validation deps exist; otherwise a no-op.
+/// `access_map_collector` and the shared rate limiter are passed through to that call.
+async fn run_validation_phase(
+    datastore: &Arc<Mutex<FindingsStore>>,
+    validation_deps: &Option<ValidationDeps>,
+    args: &scan::ScanArgs,
+    match_range: Option<std::ops::Range<usize>>,
+    access_map_collector: Option<AccessMapCollector>,
+) -> Result<()> {
+    if let Some(validation) = validation_deps {
+        let (parser, clients, cache, rate_limiter) =
+            (&validation.0, &validation.1, &validation.2, &validation.3);
+        run_secret_validation(
+            Arc::clone(datastore),
+            parser,
+            clients,
+            cache,
+            args.num_jobs,
+            match_range,
+            access_map_collector,
+            rate_limiter.clone(),
+            Duration::from_secs(args.validation_timeout),
+            args.validation_retries,
+        )
+        .await?;
+    }
+    Ok(())
+}
+
+// =================================================================================================
+// Sequential scan path (taken when `use_parallel_repo_scan` is false, i.e. <= 10 git repos)
+// =================================================================================================
+
+#[allow(clippy::too_many_arguments)]
+async fn run_sequential_scan(
+    args: &scan::ScanArgs,
+    global_args: &global::GlobalArgs,
+    datastore: &Arc<Mutex<FindingsStore>>,
+    rules_db: &RulesDatabase,
+    input_roots: &mut Vec<PathBuf>,
+    repo_rx: crossbeam_channel::Receiver<PathBuf>,
+    repo_clone_handle: Option<std::thread::JoinHandle<()>>,
+    shared_profiler: &Arc<ConcurrentRuleProfiler>,
+    enable_profiling: bool,
+    matcher_stats: &Arc<Mutex<MatcherStats>>,
+    baseline_path: &Arc<PathBuf>,
+    validation_deps: &Option<ValidationDeps>,
+    access_map_collector: &mut Option<AccessMapCollector>,
+    progress_enabled: bool,
+    start_time: Instant,
+    scan_started_at: chrono::DateTime<chrono::Local>,
+    update_status: &crate::update::UpdateStatus,
+) -> Result<()> {
+    let mut streamed_roots = Vec::new();
+    if !input_roots.is_empty() {
+        let _inputs = enumerate_filesystem_inputs(
+            args,
+            datastore.clone(),
+            input_roots,
+            progress_enabled,
+            rules_db,
+            enable_profiling,
+            Arc::clone(shared_profiler),
+            matcher_stats.as_ref(),
         )?;
     }
 
-    if let Some(validation) = &validation_deps {
+    for repo_root in repo_rx.iter() {
+        enumerate_filesystem_inputs(
+            args,
+            datastore.clone(),
+            &[repo_root.clone()],
+            progress_enabled,
+            rules_db,
+            enable_profiling,
+            Arc::clone(shared_profiler),
+            matcher_stats.as_ref(),
+        )?;
+        streamed_roots.push(repo_root);
+    }
+    input_roots.extend(streamed_roots);
+
+    if let Some(handle) = repo_clone_handle {
+        let _ = handle.join();
+    }
+
+    deduplicate_new_matches(datastore, global_args, args, 0)?;
+    apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), input_roots)?;
+
+    run_validation_phase(datastore, validation_deps, args, None, access_map_collector.clone())
+        .await?;
+
+    if let Some(collector) = access_map_collector.take() {
+        finalize_access_map(datastore, collector, args).await?;
+    }
+
+    crate::reporter::run(global_args, Arc::clone(datastore), args)
+        .context("Failed to run report command")?;
+    print_scan_summary(
+        start_time,
+        scan_started_at,
+        datastore,
+        global_args,
+        args,
+        rules_db,
+        matcher_stats.as_ref(),
+        if enable_profiling { Some(shared_profiler.as_ref()) } else { None },
+        update_status,
+        None,
+        None,
+    );
+    maybe_hint_access_map(datastore, args);
+    Ok(())
+}
+
+// =================================================================================================
+// Parallel scan path (taken when `use_parallel_repo_scan` is true, i.e. > 10 git repos)
+// =================================================================================================
+
+#[allow(clippy::too_many_arguments)]
+async fn run_parallel_scan(
+    args: &scan::ScanArgs,
+    global_args: &global::GlobalArgs,
+    datastore: &Arc<Mutex<FindingsStore>>,
+    rules_db: &RulesDatabase,
+    repo_roots: &[PathBuf],
+    repo_rx: crossbeam_channel::Receiver<PathBuf>,
+    repo_clone_handle: Option<std::thread::JoinHandle<()>>,
+    shared_profiler: &Arc<ConcurrentRuleProfiler>,
+    enable_profiling: bool,
+    matcher_stats: &Arc<Mutex<MatcherStats>>,
+    baseline_path: &Arc<PathBuf>,
+    validation_deps: &Option<ValidationDeps>,
+    access_map_collector: &mut Option<AccessMapCollector>,
+    progress_enabled: bool,
+    start_time: Instant,
+    scan_started_at: chrono::DateTime<chrono::Local>,
+    update_status: &crate::update::UpdateStatus,
+) -> Result<()> {
+    deduplicate_new_matches(datastore, global_args, args, 0)?;
+    apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), repo_roots)?;
+
+    // Validate initial (non-repo) matches
+    if let Some(validation) = validation_deps {
         let (parser, clients, cache, rate_limiter) =
             (&validation.0, &validation.1, &validation.2, &validation.3);
         let initial_match_count = { datastore.lock().unwrap().get_matches().len() };
         if initial_match_count > 0 {
             run_secret_validation(
-                Arc::clone(&datastore),
+                Arc::clone(datastore),
                 parser,
                 clients,
                 cache,
@@ -458,6 +634,7 @@ pub async fn run_async_scan(
         }
     }
 
+    // Parallel per-repo scanning
     let repo_concurrency = std::cmp::max(1, args.num_jobs);
     let rt_handle = Handle::current();
 
@@ -476,16 +653,16 @@ pub async fn run_async_scan(
             let spawn_repo_scan = |root: PathBuf| {
                 let repo_rules = repo_rules.clone();
                 let base_clone_root = base_clone_root.clone();
-                let baseline_path = Arc::clone(&baseline_path);
-                let shared_profiler = Arc::clone(&shared_profiler);
+                let baseline_path = Arc::clone(baseline_path);
+                let shared_profiler = Arc::clone(shared_profiler);
                 let args = args.clone();
                 let root = root.clone();
                 let validation_deps = validation_deps.clone();
-                let matcher_stats = Arc::clone(&matcher_stats);
+                let matcher_stats = Arc::clone(matcher_stats);
                 let rt_handle = rt_handle.clone();
                 let ran_repo_scan = Arc::clone(&ran_repo_scan);
                 let repo_errors = Arc::clone(&repo_errors);
-                let datastore = Arc::clone(&datastore);
+                let datastore = Arc::clone(datastore);
                 let access_map = access_map_collector.clone();
 
                 scope.spawn(move |_| {
@@ -509,7 +686,9 @@ pub async fn run_async_scan(
                             Arc::clone(&shared_profiler),
                             &repo_matcher_stats,
                         )
-                        .and_then(|_| deduplicate_new_matches(&repo_datastore, 0))?;
+                        .and_then(|_| {
+                            deduplicate_new_matches(&repo_datastore, global_args, &args, 0)
+                        })?;
 
                         if args.baseline_file.is_some() || args.manage_baseline {
                             let mut ds = repo_datastore.lock().unwrap();
@@ -568,11 +747,11 @@ pub async fn run_async_scan(
                 });
             };
 
-            for root in repo_roots.clone() {
+            for root in repo_roots.iter().cloned() {
                 spawn_repo_scan(root);
             }
 
-            for root in repo_rx.clone().iter() {
+            for root in repo_rx.iter() {
                 spawn_repo_scan(root);
             }
         });
@@ -586,51 +765,27 @@ pub async fn run_async_scan(
     }
 
     if output_to_file && ran_repo_scan.load(Ordering::Relaxed) {
-        crate::reporter::run(global_args, Arc::clone(&datastore), args)
+        crate::reporter::run(global_args, Arc::clone(datastore), args)
             .context("Failed to run report command")?;
     }
 
     if !ran_repo_scan.load(Ordering::Relaxed) {
-        deduplicate_new_matches(&datastore, 0)?;
+        deduplicate_new_matches(datastore, global_args, args, 0)?;
+        apply_baseline_if_configured(args, datastore, baseline_path.as_ref(), repo_roots)?;
 
-        if args.baseline_file.is_some() || args.manage_baseline {
-            let mut ds = datastore.lock().unwrap();
-            crate::baseline::apply_baseline(
-                &mut ds,
-                baseline_path.as_ref(),
-                args.manage_baseline,
-                &repo_roots,
-            )?;
-        }
-
-        if let Some(validation) = &validation_deps {
-            let (parser, clients, cache, rate_limiter) =
-                (&validation.0, &validation.1, &validation.2, &validation.3);
-            run_secret_validation(
-                Arc::clone(&datastore),
-                parser,
-                clients,
-                cache,
-                args.num_jobs,
-                None,
-                access_map_collector.clone(),
-                rate_limiter.clone(),
-                Duration::from_secs(args.validation_timeout),
-                args.validation_retries,
-            )
+        run_validation_phase(datastore, validation_deps, args, None, access_map_collector.clone())
             .await?;
-        }
 
         if let Some(collector) = access_map_collector.take() {
-            finalize_access_map(&datastore, collector, args).await?;
+            finalize_access_map(datastore, collector, args).await?;
         }
 
-        crate::reporter::run(global_args, Arc::clone(&datastore), args)
+        crate::reporter::run(global_args, Arc::clone(datastore), args)
             .context("Failed to run report command")?;
     }
 
     let aggregate_summary = if ran_repo_scan.load(Ordering::Relaxed) {
-        let totals = compute_scan_totals(&datastore, args, matcher_stats.as_ref());
+        let totals = compute_scan_totals(datastore, args, matcher_stats.as_ref());
         let mut sorted: Vec<_> = datastore.lock().unwrap().get_summary().into_iter().collect();
         sorted.sort_by(|a, b| b.1.cmp(&a.1));
         Some((totals, sorted))
@@ -641,7 +796,7 @@ pub async fn run_async_scan(
     print_scan_summary(
         start_time,
         scan_started_at,
-        &datastore,
+        datastore,
         global_args,
         args,
         rules_db,
@@ -652,14 +807,18 @@ pub async fn run_async_scan(
         aggregate_summary,
     );
 
-    if let Some(collector) = access_map_collector {
-        finalize_access_map(&datastore, collector, args).await?;
+    if let Some(collector) = access_map_collector.take() {
+        finalize_access_map(datastore, collector, args).await?;
     } else {
-        maybe_hint_access_map(&datastore, args);
+        maybe_hint_access_map(datastore, args);
     }
     Ok(())
 }
 
+// =================================================================================================
+// Helper functions
+// =================================================================================================
+
 async fn finalize_access_map(
     datastore: &Arc<Mutex<FindingsStore>>,
     collector: AccessMapCollector,
@@ -780,8 +939,6 @@ pub fn create_datastore_channel(
 ) {
     const BATCH_SIZE: usize = 1024;
     let channel_size = std::cmp::max(num_jobs * BATCH_SIZE, 16 * BATCH_SIZE);
-    // const BATCH_SIZE: usize = 256;
-    // let channel_size = std::cmp::max(num_jobs * BATCH_SIZE, 4096);
     crossbeam_channel::bounded(channel_size)
 }
 
@@ -864,7 +1021,6 @@ pub fn load_and_record_rules(
 ) -> Result<RulesDatabase> {
     let init_progress =
         if use_progress { ProgressBar::new_spinner() } else { ProgressBar::hidden() };
-    // init_progress.set_message("Compiling rules...");
     let rules_db = {
         let loaded = RuleLoader::from_rule_specifiers(&args.rules)
             .load(args)
@@ -876,7 +1032,6 @@ pub fn load_and_record_rules(
             .cloned()
             .map(|mut rule| {
                 if let Some(min_entropy) = args.min_entropy {
-                    // rule.syntax.min_entropy = min_entropy;
                     let _ = rule.set_entropy(min_entropy);
                 }
                 rule