kingfisher/src/matcher.rs

use std::{
    hash::{Hash, Hasher},
    str,
    sync::{Arc, Mutex},
};

use anyhow::Result;
use base64::{engine::general_purpose, Engine};
use bstr::BString;
use http::StatusCode;
use regex::bytes::Regex;
use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
use schemars::{
    gen::SchemaGenerator,
    schema::{ArrayValidation, InstanceType, Schema},
    JsonSchema,
};
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use tracing::debug;
use xxhash_rust::xxh3::xxh3_64;

use crate::{
    blob::{Blob, BlobId, BlobIdMap},
    entropy::calculate_shannon_entropy,
    inline_ignore::InlineIgnoreConfig,
    location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
    origin::OriginSet,
    parser,
    parser::{Checker, Language},
    rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
    rules::rule::{PatternRequirementContext, PatternValidationResult, Rule, Validation},
    rules_database::RulesDatabase,
    safe_list::{is_safe_match, is_user_match},
    scanner_pool::ScannerPool,
    snippet::Base64BString,
    util::intern,
    validation::{is_parseable_mongodb_uri, is_parseable_mysql_uri, is_parseable_postgres_uri},
    validation_body::{self, ValidationResponseBody},
};

const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
const TREE_SITTER_MAX_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs <= 64 KiB
const TREE_SITTER_MIN_LIMIT: usize = 1 * 1024; // only run tree-sitter on blobs >= 1 KiB

// -------------------------------------------------------------------------------------------------
// RawMatch
// -------------------------------------------------------------------------------------------------
/// A raw match, as recorded by a callback to Vectorscan.
///
/// When matching with Vectorscan, we simply collect all matches into a
/// preallocated `Vec`, and then go through them all after scanning is complete.
#[derive(PartialEq, Eq, Debug, Clone)]
struct RawMatch {
    rule_id: u32,
    start_idx: u64,
    end_idx: u64,
}
#[derive(Clone)]
pub struct OwnedBlobMatch {
    pub rule: Arc<Rule>,
    pub blob_id: BlobId,
    /// The unique content-based identifier of this match
    pub finding_fingerprint: u64,
    pub matching_input_offset_span: OffsetSpan,
    pub captures: SerializableCaptures,
    pub validation_response_body: ValidationResponseBody,
    pub validation_response_status: StatusCode,
    pub validation_success: bool,
    pub calculated_entropy: f32,
    pub is_base64: bool,
    /// Variables captured from dependent rules (from depends_on_rule).
    /// Maps variable name (uppercase) to captured value.
    pub dependent_captures: std::collections::BTreeMap<String, String>,
}
impl<'a> Matcher<'a> {
    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
        self.profiler.as_ref().map(|p| p.generate_report())
    }
}
impl OwnedBlobMatch {
    pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
        OwnedBlobMatch {
            rule,
            blob_id: m.blob_id,
            finding_fingerprint: m.finding_fingerprint,
            // matching_input: m.snippet.matching.0.to_vec(),
            matching_input_offset_span: m.location.offset_span.clone(),
            captures: m.groups.clone(),
            validation_response_body: m.validation_response_body.clone(),
            validation_response_status: StatusCode::from_u16(m.validation_response_status)
                .unwrap_or(StatusCode::CONTINUE),
            validation_success: m.validation_success,
            calculated_entropy: m.calculated_entropy,
            is_base64: m.is_base64,
            dependent_captures: m.dependent_captures.clone(),
        }
    }

    pub fn from_blob_match(blob_match: BlobMatch) -> Self {
        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
        //
        // This indexing is intentionally different from the internal `validation_dedup_key()`
        // (which uses get(0)) to maintain stable external fingerprints. Changing this would break:
        // - Historical baselines that rely on fingerprint matching
        // - Dedup entries stored in external systems
        //
        // For rules with nested captures like (?<REGEX>...(ABC)...), this may pick up
        // the inner group, but that behavior is now established and must be preserved.
        let matching_finding = blob_match
            .captures
            .captures
            .get(1)
            .or_else(|| blob_match.captures.captures.get(0))
            .map(|capture| capture.raw_value().as_bytes().to_vec())
            .unwrap_or_else(Vec::new);

        let mut owned_blob_match = OwnedBlobMatch {
            rule: blob_match.rule,
            blob_id: blob_match.blob_id.clone(),
            matching_input_offset_span: blob_match.matching_input_offset_span,
            captures: blob_match.captures.clone(),
            validation_response_body: blob_match.validation_response_body,
            validation_response_status: blob_match.validation_response_status,
            validation_success: blob_match.validation_success,
            calculated_entropy: blob_match.calculated_entropy,
            finding_fingerprint: 0, //default
            is_base64: blob_match.is_base64,
            dependent_captures: std::collections::BTreeMap::new(),
        };

        // Convert matching_finding to a &str (using lossy conversion if needed)
        let finding_value = std::str::from_utf8(&matching_finding).unwrap_or("");
        // Use blob_id as the file/commit identifier
        let file_or_commit = &blob_match.blob_id.to_string();

        let offset_start: u64 =
            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();

        owned_blob_match.finding_fingerprint =
            compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end);

        owned_blob_match
    }
}
// -------------------------------------------------------------------------------------------------
// BlobMatch
// -------------------------------------------------------------------------------------------------
/// A `BlobMatch` is the result type from `Matcher::scan_blob`.
///
/// It is mostly made up of references and small data.
/// For a representation that is more friendly for human consumption, see
/// `Match`.
pub struct BlobMatch<'a> {
    /// The rule that was matched
    pub rule: Arc<Rule>, // Changed from `&'a Rule` to `Arc<Rule

    /// The blob that was matched
    pub blob_id: &'a BlobId,

    /// The matching input in `blob.input`
    pub matching_input: &'a [u8],

    /// The location of the matching input in `blob.input`
    pub matching_input_offset_span: OffsetSpan,

    /// The capture groups from the match
    pub captures: SerializableCaptures, // regex::bytes::Captures<'a>,

    pub validation_response_body: ValidationResponseBody,
    pub validation_response_status: StatusCode,

    pub validation_success: bool,
    pub calculated_entropy: f32,
    pub is_base64: bool,
}
#[derive(Clone)]
struct UserData {
    /// A scratch vector for raw matches from Vectorscan, to minimize allocation
    raw_matches_scratch: Vec<RawMatch>,

    /// The length of the input being scanned
    input_len: u64,
}
// -------------------------------------------------------------------------------------------------
// Matcher
// -------------------------------------------------------------------------------------------------
/// A `Matcher` is able to scan inputs for matches from rules in a
/// `RulesDatabase`.
///
/// If doing multi-threaded scanning, use a separate `Matcher` for each thread.
#[derive(Clone)]
pub struct Matcher<'a> {
    /// Thread-local pool that hands out a &mut BlockScanner
    scanner_pool: std::sync::Arc<crate::scanner_pool::ScannerPool>,

    /// The rules database used for matching
    rules_db: &'a RulesDatabase,

    /// Local statistics for this `Matcher`
    local_stats: MatcherStats,

    /// Global statistics, updated with the local statsistics when this
    /// `Matcher` is dropped
    global_stats: Option<&'a Mutex<MatcherStats>>,

    /// The set of blobs that have been seen
    seen_blobs: &'a BlobIdMap<bool>,

    /// Data passed to the Vectorscan callback
    user_data: UserData,

    /// Rule profiler for measuring performance of individual rules
    profiler: Option<Arc<ConcurrentRuleProfiler>>,

    /// Configuration that controls inline ignore directives
    inline_ignore_config: InlineIgnoreConfig,

    /// Whether matches should honour `ignore_if_contains` requirements.
    respect_ignore_if_contains: bool,
}
/// This `Drop` implementation updates the `global_stats` with the local stats
impl<'a> Drop for Matcher<'a> {
    fn drop(&mut self) {
        if let Some(global_stats) = self.global_stats {
            let mut global_stats = global_stats.lock().unwrap();
            global_stats.update(&self.local_stats);
        }
    }
}
pub enum ScanResult<'a> {
    SeenWithMatches,
    SeenSansMatches,
    New(Vec<BlobMatch<'a>>),
}
impl<'a> Matcher<'a> {
    /// Create a new `Matcher` from the given `RulesDatabase`.
    ///
    /// If `global_stats` is provided, it will be updated with the local stats
    /// from this `Matcher` when it is dropped.
    pub fn new(
        rules_db: &'a RulesDatabase,
        scanner_pool: Arc<ScannerPool>,
        seen_blobs: &'a BlobIdMap<bool>,
        global_stats: Option<&'a Mutex<MatcherStats>>,
        enable_profiling: bool,
        shared_profiler: Option<Arc<ConcurrentRuleProfiler>>,
        extra_ignore_directives: &[String],
        disable_inline_ignores: bool,
        respect_ignore_if_contains: bool,
    ) -> Result<Self> {
        // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec
        let raw_matches_scratch = Vec::new();
        let user_data = UserData { raw_matches_scratch, input_len: 0 };
        // let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?;
        // pool is created once per scan run (see Scanner section below)
        let profiler = shared_profiler.or_else(|| {
            if enable_profiling {
                Some(Arc::new(ConcurrentRuleProfiler::new()))
            } else {
                None
            }
        });
        Ok(Matcher {
            scanner_pool,
            rules_db,
            local_stats: MatcherStats::default(),
            global_stats,
            seen_blobs,
            user_data,
            profiler,
            inline_ignore_config: if disable_inline_ignores {
                InlineIgnoreConfig::disabled()
            } else {
                InlineIgnoreConfig::new(extra_ignore_directives)
            },
            respect_ignore_if_contains,
        })
    }

    fn scan_bytes_raw(&mut self, input: &[u8], _filename: &str) -> Result<()> {
        // Remember previous peak automatically
        let prev_capacity = self.user_data.raw_matches_scratch.capacity();
        self.user_data.raw_matches_scratch.clear();
        self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64));

        self.user_data.input_len = input.len() as u64;

        let mut offset: usize = 0;
        while offset < input.len() {
            let end = (offset + MAX_CHUNK_SIZE).min(input.len());
            let slice = &input[offset..end];
            let base = offset as u64;
            self.scanner_pool.with(|scanner| {
                scanner.scan(slice, |rule_id, from, to, _flags| {
                    self.user_data.raw_matches_scratch.push(RawMatch {
                        rule_id,
                        start_idx: from + base,
                        end_idx: to + base,
                    });
                    vectorscan_rs::Scan::Continue
                })
            })?;

            if end == input.len() {
                break;
            }
            offset = end.saturating_sub(CHUNK_OVERLAP);
        }

        Ok(())
    }

    pub fn scan_blob<'b>(
        &mut self,
        blob: &'b Blob,
        origin: &OriginSet,
        lang: Option<String>,
        redact: bool,
        no_dedup: bool,
        no_base64: bool,
    ) -> Result<ScanResult<'b>>
    where
        'a: 'b,
    {
        // Update local stats
        self.local_stats.blobs_seen += 1;
        self.local_stats.bytes_seen += blob.bytes().len() as u64;
        self.local_stats.blobs_scanned += 1;
        self.local_stats.bytes_scanned += blob.bytes().len() as u64;

        // Extract filename from origin
        let filename = origin
            .first()
            .blob_path()
            .and_then(|path| path.file_name())
            .and_then(|name| name.to_str())
            .unwrap_or("unknown_file")
            .to_string();
        // Perform the scan
        self.scan_bytes_raw(&blob.bytes(), &filename)?;

        // Opportunistically look for standalone Base64 blobs. If neither
        // the raw scan nor this check yields anything, we can return early
        // before doing any heavier work.
        let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
            Vec::new()
        } else {
            get_base64_strings(blob.bytes())
        };

        let lang_hint = lang.as_deref();
        let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty();
        let has_base64_items = !b64_items.is_empty();

        if !has_raw_matches && !has_base64_items {
            return Ok(ScanResult::New(Vec::new()));
        }

        let rules_db = self.rules_db;
        let mut seen_matches = FxHashSet::default();
        let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();

        let blob_len = blob.len();

        let should_run_tree_sitter = blob_len > 0
            && blob_len <= TREE_SITTER_MAX_LIMIT
            && blob_len >= TREE_SITTER_MIN_LIMIT
            && has_raw_matches
            && lang_hint.is_some()
            && !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled

        let tree_sitter_result = if should_run_tree_sitter {
            lang_hint.and_then(|lang_str| {
                get_language_and_queries(lang_str).and_then(|(language, queries)| {
                    let checker = Checker { language, rules: queries };
                    match checker.check(&blob.bytes()) {
                        Ok(results) => Some(results),
                        Err(e) => {
                            println!("Error in checker.check: {}", e);
                            None
                        }
                    }
                })
            })
        } else {
            None
        };
        //
        // Process matches
        //
        let mut matches = Vec::new();
        let owned_ts_results = tree_sitter_result.map(|ts_results| {
            ts_results
                .into_iter()
                .filter(|match_result| match_result.is_base64_decoded)
                .map(|match_result| {
                    (
                        match_result.range,
                        match_result.text,
                        match_result.is_base64_decoded,
                        match_result.original_base64,
                    )
                })
                .collect::<Vec<_>>()
        });
        let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
        for &RawMatch { rule_id, start_idx, end_idx } in
            self.user_data.raw_matches_scratch.iter().rev()
        {
            let rule_id_usize: usize = rule_id as usize;
            let rule = Arc::clone(&rules_db.rules()[rule_id_usize]);
            let re = &rules_db.anchored_regexes()[rule_id_usize];
            let start_idx_usize = start_idx as usize;
            let end_idx_usize = end_idx as usize;
            let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
            if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
                continue;
            }
            filter_match(
                blob,
                rule,
                re,
                start_idx_usize,
                end_idx_usize,
                &mut matches,
                &mut previous_matches,
                rule_id_usize,
                &mut seen_matches,
                origin,
                None,
                false,
                redact,
                &filename,
                self.profiler.as_ref(),
                self.respect_ignore_if_contains,
                &self.inline_ignore_config,
            );
        }
        // If tree-sitter produced base64-decoded matches, try them against all rules
        if let Some(ref ts_results) = owned_ts_results {
            for (ts_range, ts_match, is_base64_decoded, _original_base64) in ts_results.iter() {
                if *is_base64_decoded {
                    for (rule_id_usize, rule) in rules_db.rules().iter().enumerate() {
                        let re = &rules_db.anchored_regexes()[rule_id_usize];
                        filter_match(
                            blob,
                            rule.clone(),
                            re,
                            ts_range.start,
                            ts_range.end,
                            &mut matches,
                            &mut previous_matches,
                            rule_id_usize,
                            &mut seen_matches,
                            origin,
                            Some(ts_match.as_bytes()),
                            *is_base64_decoded,
                            redact,
                            &filename,
                            self.profiler.as_ref(),
                            self.respect_ignore_if_contains,
                            &self.inline_ignore_config,
                        );
                    }
                }
            }
        }

        if !no_base64 {
            // If the blob contains standalone Base64 blobs, decode and scan them as well
            const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep
            let mut b64_stack: Vec<(DecodedData, usize)> =
                b64_items.drain(..).map(|d| (d, 0)).collect();
            while let Some((item, depth)) = b64_stack.pop() {
                for (rule_id_usize, rule) in rules_db.rules().iter().enumerate() {
                    let re = &rules_db.anchored_regexes()[rule_id_usize];
                    filter_match(
                        blob,
                        rule.clone(),
                        re,
                        item.pos_start,
                        item.pos_end,
                        &mut matches,
                        &mut previous_matches,
                        rule_id_usize,
                        &mut seen_matches,
                        origin,
                        Some(item.decoded.as_slice()),
                        true,
                        redact,
                        &filename,
                        self.profiler.as_ref(),
                        self.respect_ignore_if_contains,
                        &self.inline_ignore_config,
                    );
                }
                if depth + 1 < MAX_B64_DEPTH {
                    for nested in get_base64_strings(item.decoded.as_slice()) {
                        b64_stack.push((
                            DecodedData {
                                decoded: nested.decoded,
                                pos_start: item.pos_start,
                                pos_end: item.pos_end,
                            },
                            depth + 1,
                        ));
                    }
                }
            }
        }
        // Finalize
        if !no_dedup && !matches.is_empty() {
            let blob_id = blob.id();
            if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
                return Ok(if had_matches {
                    ScanResult::SeenWithMatches
                } else {
                    ScanResult::SeenSansMatches
                });
            }
        }

        // --- opportunistic capacity cap ---------------------------------
        if self.user_data.raw_matches_scratch.capacity()
            > self.user_data.raw_matches_scratch.len() * 4
        {
            // Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob
            // so the cost is hidden off the hot path.
            self.user_data.raw_matches_scratch.shrink_to_fit();
        }

        Ok(ScanResult::New(matches))
        // Ok(result)
    }
}

#[inline]
fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
    let mut hasher = FxHasher::default();
    // Hash each component directly without allocation
    content.hash(&mut hasher);
    rule_id.hash(&mut hasher);
    start.hash(&mut hasher);
    end.hash(&mut hasher);
    hasher.finish()
}

#[inline]
fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
    let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
    if idx > 0 {
        if spans[idx - 1].fully_contains(&span) {
            return false;
        }
        if span.fully_contains(&spans[idx - 1]) {
            spans.remove(idx - 1);
            idx -= 1;
        }
    }
    if idx < spans.len() {
        if spans[idx].fully_contains(&span) {
            return false;
        }
        if span.fully_contains(&spans[idx]) {
            spans.remove(idx);
        }
    }
    spans.insert(idx, span);
    true
}

#[inline]
fn record_match(
    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
    rule_id: usize,
    span: OffsetSpan,
) -> bool {
    insert_span(map.entry(rule_id).or_default(), span)
}

#[allow(clippy::too_many_arguments)]
fn filter_match<'b>(
    blob: &'b Blob,
    rule: Arc<Rule>,
    re: &Regex,
    start: usize,
    end: usize,
    matches: &mut Vec<BlobMatch<'b>>,
    previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
    rule_id: usize,
    seen_matches: &mut FxHashSet<u64>,
    _origin: &OriginSet,
    ts_match: Option<&[u8]>,
    is_base64: bool,
    _redact: bool,
    filename: &str,
    profiler: Option<&Arc<ConcurrentRuleProfiler>>,
    respect_ignore_if_contains: bool,
    inline_ignore_config: &InlineIgnoreConfig,
) {
    let mut timer =
        profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));

    let initial_len = matches.len();

    let blob_bytes = blob.bytes();
    let default_slice = &blob_bytes[start..end];
    let haystack = ts_match.unwrap_or(default_slice);

    for captures in re.captures_iter(haystack) {
        let full_capture = captures.get(0).unwrap();

        // --- LOGIC TO FIND THE "SECRET" FOR ENTROPY/SAFE-LISTING ---
        let matching_input_for_entropy = 'block: {
            // 1. Prefer a named capture called TOKEN (case-insensitive).
            if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
                name_opt
                    .filter(|name| name.eq_ignore_ascii_case("TOKEN"))
                    .and_then(|_| captures.get(i))
            }) {
                break 'block token_cap;
            }

            // 2. Otherwise, prefer the first *matched* named capture.
            if let Some(named_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
                name_opt.and_then(|_| captures.get(i)) // find(i > 0 && name_opt.is_some())
            }) {
                break 'block named_cap;
            }

            // 3. Otherwise, fall back to the first positional capture (group 1).
            if let Some(pos_cap) = captures.get(1) {
                break 'block pos_cap;
            }

            // 4. Finally, fall back to the full match (group 0).
            break 'block full_capture;
        };
        // --- END LOGIC ---

        let min_entropy = rule.min_entropy();
        let entropy_bytes = matching_input_for_entropy.as_bytes();
        let full_bytes = full_capture.as_bytes();
        let calculated_entropy = calculate_shannon_entropy(entropy_bytes);

        // Check entropy and safe-listing against the *selected* secret bytes
        if calculated_entropy <= min_entropy
            || is_safe_match(entropy_bytes)
            || is_user_match(entropy_bytes, full_bytes)
        {
            debug!(
                "Skipping match with entropy {} <= {} or safe match",
                calculated_entropy, min_entropy
            );
            continue;
        }

        // Check character requirements if specified
        if let Some(char_reqs) = rule.pattern_requirements() {
            let context = PatternRequirementContext {
                regex: re,
                captures: &captures,
                full_match: full_bytes,
            };

            // Decide which bytes to validate:
            // - If there are multiple capture groups OR any named captures → use full match
            // - Otherwise → use entropy_bytes (the actual secret)
            let use_full_match = {
                let has_named_captures = re.capture_names().any(|n| n.is_some());
                let capture_count = captures.len(); // includes group 0
                has_named_captures || capture_count > 2
            };

            let validation_bytes = if use_full_match { full_bytes } else { entropy_bytes };

            match char_reqs.validate(validation_bytes, Some(context), respect_ignore_if_contains) {
                //
                // --- END FIX ---
                PatternValidationResult::Passed => {}
                PatternValidationResult::Failed => {
                    debug!(
                        "Skipping match that does not meet character requirements for rule {}",
                        rule.id()
                    );
                    continue;
                }
                PatternValidationResult::FailedChecksum { actual_len, expected_len } => {
                    debug!(
                        "Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})",
                        rule.id(),
                        actual_len,
                        expected_len
                    );
                    continue;
                }
                PatternValidationResult::IgnoredBySubstring { matched_term } => {
                    debug!(
                        "Skipping match for rule {} because it contains ignored term {matched_term}",
                        rule.id()
                    );
                    continue;
                }
            }
        }

        // Use the `matching_input_for_entropy` as the span/key for the finding.
        let matching_input = matching_input_for_entropy;

        let matching_input_offset_span = OffsetSpan::from_range(
            (start + matching_input.start())..(start + matching_input.end()),
        );
        if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
            debug!("Skipping match due to inline ignore directive");
            continue;
        }
        if let Some(validation) = rule.syntax.validation.as_ref() {
            match validation {
                Validation::MongoDB => {
                    let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
                        debug!("Skipping match for rule {} due to non-UTF8 MongoDB URI", rule.id());
                        continue;
                    };
                    if !is_parseable_mongodb_uri(uri) {
                        debug!("Skipping match for rule {} due to invalid MongoDB URI", rule.id());
                        continue;
                    }
                }
                Validation::Postgres => {
                    let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
                        debug!(
                            "Skipping match for rule {} due to non-UTF8 Postgres URI",
                            rule.id()
                        );
                        continue;
                    };
                    if !is_parseable_postgres_uri(uri) {
                        debug!("Skipping match for rule {} due to invalid Postgres URI", rule.id());
                        continue;
                    }
                }
                Validation::MySQL => {
                    let Ok(uri) = std::str::from_utf8(matching_input.as_bytes()) else {
                        debug!("Skipping match for rule {} due to non-UTF8 MySQL URI", rule.id());
                        continue;
                    };
                    if !is_parseable_mysql_uri(uri) {
                        debug!("Skipping match for rule {} due to invalid MySQL URI", rule.id());
                        continue;
                    }
                }
                _ => {}
            }
        }
        let match_key = compute_match_key(
            matching_input.as_bytes(),
            rule.id().as_bytes(),
            matching_input_offset_span.start,
            matching_input_offset_span.end,
        );
        if !seen_matches.insert(match_key) {
            continue;
        }
        if !record_match(previous_matches, rule_id, matching_input_offset_span) {
            continue;
        }
        let only_matching_input =
            &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];

        // Pass the *full* capture object to from_captures
        let groups = SerializableCaptures::from_captures(&captures, haystack, re);

        matches.push(BlobMatch {
            rule: Arc::clone(&rule),
            blob_id: blob.id_ref(),
            matching_input: only_matching_input,
            matching_input_offset_span,
            captures: groups,
            validation_response_body: None,
            validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
            validation_success: false,
            calculated_entropy,
            is_base64,
        });
    }
    if let Some(t) = timer.take() {
        let new_count = (matches.len() - initial_len) as u64;
        t.end(new_count > 0, new_count, 0);
    }
}

fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, String>)> {
    match lang.to_lowercase().as_str() {
        "bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())),
        "c" => Some((Language::C, parser::queries::c::get_c_queries())),
        "c#" | "csharp" => Some((Language::CSharp, parser::queries::csharp::get_csharp_queries())),
        "c++" | "cpp" => Some((Language::Cpp, parser::queries::cpp::get_cpp_queries())),
        "css" => Some((Language::Css, parser::queries::css::get_css_queries())),
        "go" => Some((Language::Go, parser::queries::go::get_go_queries())),
        "html" => Some((Language::Html, parser::queries::html::get_html_queries())),
        "java" => Some((Language::Java, parser::queries::java::get_java_queries())),
        "javascript" | "js" => {
            Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries()))
        }
        // "kotlin" => Some((
        //     Language::Kotlin,
        //     parser::queries::kotlin::get_kotlin_queries(),
        // )),
        "php" => Some((Language::Php, parser::queries::php::get_php_queries())),
        "python" | "py" | "starlark" => {
            Some((Language::Python, parser::queries::python::get_python_queries()))
        }
        "ruby" => Some((Language::Ruby, parser::queries::ruby::get_ruby_queries())),
        "rust" => Some((Language::Rust, parser::queries::rust::get_rust_queries())),
        "toml" => Some((Language::Toml, parser::queries::toml::get_toml_queries())),
        "typescript" | "ts" => {
            Some((Language::TypeScript, parser::queries::typescript::get_typescript_queries()))
        }
        "yaml" => Some((Language::Yaml, parser::queries::yaml::get_yaml_queries())),
        _ => None,
    }
}
// -------------------------------------------------------------------------------------------------
// MatchStats
// -------------------------------------------------------------------------------------------------
#[derive(Debug, Default, Clone)]
pub struct MatcherStats {
    pub blobs_seen: u64,
    pub blobs_scanned: u64,
    pub bytes_seen: u64,
    pub bytes_scanned: u64,
    // #[cfg(feature = "rule_profiling")]
    // pub rule_stats: crate::rule_profiling::RuleProfile,
}
impl MatcherStats {
    pub fn update(&mut self, other: &Self) {
        self.blobs_seen += other.blobs_seen;
        self.blobs_scanned += other.blobs_scanned;
        self.bytes_seen += other.bytes_seen;
        self.bytes_scanned += other.bytes_scanned;

        // #[cfg(feature = "rule_profiling")]
        // self.rule_stats.update(&other.rule_stats);
    }
}
// -------------------------------------------------------------------------------------------------
// Group
// -------------------------------------------------------------------------------------------------
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
pub struct Group(pub Base64BString);
impl Group {
    pub fn new(m: regex::bytes::Match<'_>) -> Self {
        Self(Base64BString(BString::from(m.as_bytes())))
    }
}
// -------------------------------------------------------------------------------------------------
// Groups
// -------------------------------------------------------------------------------------------------
#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct Groups(pub SmallVec<[Group; 1]>);
impl JsonSchema for Groups {
    fn schema_name() -> String {
        "Groups".to_string()
    }

    fn json_schema(gen: &mut SchemaGenerator) -> Schema {
        let group_schema = gen.subschema_for::<Group>();
        Schema::Object(schemars::schema::SchemaObject {
            instance_type: Some(InstanceType::Array.into()),
            array: Some(Box::new(ArrayValidation {
                items: Some(group_schema.into()),
                ..Default::default()
            })),
            ..Default::default()
        })
    }
}
// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
// pub struct SerializableCapture {
//     pub name: Option<String>, // Named group (if available)
//     pub match_number: i32,
//     pub start: usize,  // Start position of the match
//     pub end: usize,    // End position of the match
//     pub value: String, // The actual captured value
// }
#[derive(Debug, Clone, JsonSchema)]
pub struct SerializableCapture {
    pub name: Option<&'static str>,
    pub match_number: i32,
    pub start: usize,
    pub end: usize,
    /// Interned original (unredacted) value.
    #[serde(skip_serializing, skip_deserializing)]
    pub value: &'static str,
}

impl SerializableCapture {
    /// Returns the original captured value.
    pub fn raw_value(&self) -> &'static str {
        self.value
    }

    /// Returns the value that should be shown in user-facing output.
    pub fn display_value(&self) -> std::borrow::Cow<'static, str> {
        crate::util::display_value(self.value)
    }
}

impl serde::Serialize for SerializableCapture {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeStruct;

        let mut state = serializer.serialize_struct("SerializableCapture", 5)?;
        state.serialize_field("name", &self.name)?;
        state.serialize_field("match_number", &self.match_number)?;
        state.serialize_field("start", &self.start)?;
        state.serialize_field("end", &self.end)?;
        let value = self.display_value();
        state.serialize_field("value", &value)?;
        state.end()
    }
}

#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCaptures {
    #[schemars(with = "Vec<SerializableCapture>")]
    pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
}

impl SerializableCaptures {
    pub fn from_captures(captures: &regex::bytes::Captures, _input: &[u8], re: &Regex) -> Self {
        let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();

        let capture_names: SmallVec<[Option<&'static str>; 4]> =
            re.capture_names().map(|name| name.map(intern)).collect();

        // If there are explicit capture groups (e.g., group 1, 2, ...),
        // only serialize those.
        if captures.len() > 1 {
            for i in 1..captures.len() {
                // Start from 1
                if let Some(cap) = captures.get(i) {
                    let raw_value = String::from_utf8_lossy(cap.as_bytes());
                    let raw_interned = intern(raw_value.as_ref());
                    let name = capture_names.get(i).and_then(|opt| *opt);

                    serialized_captures.push(SerializableCapture {
                        name,
                        match_number: i32::try_from(i).unwrap_or(0),
                        start: cap.start(),
                        end: cap.end(),
                        value: raw_interned,
                    });
                }
            }
        } else if captures.len() == 1 {
            // ELSE, if there is ONLY the full match (len == 1),
            // serialize just that full match (group 0) as the fallback.
            if let Some(cap) = captures.get(0) {
                let raw_value = String::from_utf8_lossy(cap.as_bytes());
                let raw_interned = intern(raw_value.as_ref());
                let name = capture_names.get(0).and_then(|opt| *opt);

                serialized_captures.push(SerializableCapture {
                    name,
                    match_number: 0,
                    start: cap.start(),
                    end: cap.end(),
                    value: raw_interned,
                });
            }
        }
        // If len == 0 (no match), loop is skipped, empty vec is returned.

        SerializableCaptures { captures: serialized_captures }
    }
}
// -------------------------------------------------------------------------------------------------
// Match
// -------------------------------------------------------------------------------------------------
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct Match {
    /// The location of the entire matching content
    pub location: Location,

    /// The capture groups
    pub groups: SerializableCaptures, // Store serialized captures

    /// unique identifier of file / blob where this match was found
    pub blob_id: BlobId,

    /// The unique content-based identifier of this match
    pub finding_fingerprint: u64,

    /// The rule that produced this match
    #[serde(skip_serializing)]
    #[schemars(skip)]
    pub rule: Arc<Rule>,

    /// Validation Body
    #[serde(
        default,
        serialize_with = "validation_body::serialize",
        deserialize_with = "validation_body::deserialize"
    )]
    #[schemars(schema_with = "validation_body::schema")]
    pub validation_response_body: ValidationResponseBody,

    /// Validation Status Code
    pub validation_response_status: u16,

    /// Validation Success
    pub validation_success: bool,

    /// Validation Success
    pub calculated_entropy: f32,

    pub visible: bool,
    #[serde(default)]
    pub is_base64: bool,

    /// Variables captured from dependent rules (from depends_on_rule).
    /// Maps variable name (uppercase) to captured value.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub dependent_captures: std::collections::BTreeMap<String, String>,
}
impl Match {
    #[inline]
    pub fn convert_owned_blobmatch_to_match<'a>(
        loc_mapping: Option<&'a LocationMapping<'a>>,
        owned_blob_match: &'a OwnedBlobMatch,
        origin_type: &'a str,
    ) -> Self {
        let offset_span = owned_blob_match.matching_input_offset_span;
        // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility.
        // See comment in from_blob_match() for why this differs from validation_dedup_key().
        let matching_finding_bytes = owned_blob_match
            .captures
            .captures
            .get(1)
            .or_else(|| owned_blob_match.captures.captures.get(0))
            .map(|capture| capture.raw_value().as_bytes())
            .unwrap_or_default();

        // The fingerprint will be based on the content of the secret.
        let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");

        let source_span =
            loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
                start: SourcePoint { line: 0, column: 0 },
                end: SourcePoint { line: 0, column: 0 },
            });
        let offset_start: u64 =
            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();

        let finding_fingerprint = compute_finding_fingerprint(
            finding_value_for_fp,
            origin_type, // file_or_commit,
            offset_start,
            offset_end,
        );

        // matching_snippet
        Match {
            rule: owned_blob_match.rule.clone(),
            visible: owned_blob_match.rule.visible().to_owned(),
            location: Location::with_source_span(offset_span, Some(source_span.clone())),
            groups: owned_blob_match.captures.clone(),
            blob_id: owned_blob_match.blob_id,
            finding_fingerprint,
            validation_response_body: owned_blob_match.validation_response_body.clone(),
            validation_response_status: owned_blob_match.validation_response_status.as_u16(),
            validation_success: owned_blob_match.validation_success,
            calculated_entropy: owned_blob_match.calculated_entropy,
            is_base64: owned_blob_match.is_base64,
            dependent_captures: owned_blob_match.dependent_captures.clone(),
        }
    }

    /// Returns the `blob_id` of the match.
    pub fn get_blob_id(&self) -> BlobId {
        self.blob_id.clone()
    }

    pub fn finding_id(&self) -> String {
        let mut buffer = Vec::with_capacity(128);
        buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes());
        buffer.push(0);
        serde_json::to_writer(&mut buffer, &self.groups)
            .expect("should be able to serialize groups as JSON");
        let mut num = xxh3_64(&buffer);
        // Ensure the number is positive and within i64 range
        num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive
                                      // Convert to string
        num.to_string()
    }
}
#[derive(Debug, Clone)]
pub struct DecodedData {
    pub decoded: Vec<u8>,
    pub pos_start: usize,
    pub pos_end: usize,
}
#[inline]
fn is_base64_byte(b: u8) -> bool {
    // Accepts both standard base64 ('+', '/') and URL-safe base64 ('-', '_') characters.
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_')
}

pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
    let mut results = Vec::new();
    let mut i = 0;
    while i < input.len() {
        while i < input.len() && !is_base64_byte(input[i]) {
            i += 1;
        }
        let start = i;
        while i < input.len() && is_base64_byte(input[i]) {
            i += 1;
        }

        let mut eq_count = 0;
        while i < input.len() && input[i] == b'=' && eq_count < 2 {
            i += 1;
            eq_count += 1;
        }
        let end = i;

        let len = end - start;
        if len >= 32 && len % 4 == 0 {
            let base64_slice = &input[start..end];

            // Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD
            let decode_result = general_purpose::STANDARD
                .decode(base64_slice)
                .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
                .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));

            if let Ok(decoded) = decode_result {
                if decoded.is_ascii() {
                    results.push(DecodedData { decoded, pos_start: start, pos_end: end });
                }
            }
        }
    }

    results
}

pub fn compute_finding_fingerprint(
    finding_value: &str,
    file_or_commit: &str,
    offset_start: u64,
    offset_end: u64,
) -> u64 {
    // Combine all into a byte buffer and hash it directly:
    let mut buf = Vec::with_capacity(
        finding_value.len() + file_or_commit.len() + 2 * std::mem::size_of::<u64>(),
    );
    buf.extend_from_slice(finding_value.as_bytes());
    buf.extend_from_slice(file_or_commit.as_bytes());
    buf.extend_from_slice(&offset_start.to_le_bytes());
    buf.extend_from_slice(&offset_end.to_le_bytes());

    xxh3_64(&buf)
}

// -------------------------------------------------------------------------------------------------
// test
// -------------------------------------------------------------------------------------------------
#[cfg(test)]
mod test {
    use std::{collections::BTreeMap, path::PathBuf};

    use pretty_assertions::assert_eq;
    // ---------------------------------------------------------------------
    // proptest: raw-match dedup + entropy gate
    // ---------------------------------------------------------------------
    use proptest::prelude::*;

    use super::*;
    use crate::{
        blob::{Blob, BlobIdMap},
        origin::{Origin, OriginSet},
        rules::rule::{
            DependsOnRule, HttpRequest, HttpValidation, PatternRequirements, RuleSyntax, Validation,
        },
    };

    proptest! {
        #[test]
        fn prop_no_dupes_and_entropy(
            // random ASCII up to 300 bytes
            mut noise in proptest::collection::vec(any::<u8>().prop_filter("ascii", |b| b.is_ascii()), 0..300),
            // 0-4 random insertion points
            inserts in proptest::collection::vec(0usize..300, 0..5)
        ) {
            // Constant high-entropy secret token that matches the rule below
            const TOKEN: &[u8] = b"secret_abcd1234";

            // Splice the token at the requested offsets
            for &idx in &inserts {
                let pos = idx.min(noise.len());
                noise.splice(pos..pos, TOKEN.iter().copied());
            }

            // ── build a single test rule ──────────────────────────────────
            use crate::rules::rule::{RuleSyntax, Validation, Confidence};

            let rule = Rule::new(RuleSyntax {
                id: "prop.secret".into(),
                name: "prop secret".into(),
                pattern: "secret_[a-z]{4}[0-9]{4}".into(),
                confidence: Confidence::Low,
                min_entropy: 3.0,
                visible: true,
                examples: vec![],
                negative_examples: vec![],
                references: vec![],
                validation: None::<Validation>,          // no HTTP validation needed
                revocation: None,
                depends_on_rule: vec![],
                pattern_requirements: None,
            });

            let rules_db  = RulesDatabase::from_rules(vec![rule]).unwrap();
            let seen      = BlobIdMap::new();
            let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
            let mut m     = Matcher::new(
                &rules_db,
                scanner_pool,
                &seen,
                None,
                false,
                None,
                &[],
                false,
                true,
            )
            .unwrap();

            // ── run the scan ──────────────────────────────────────────────
            m.scan_bytes_raw(&noise, "buf").unwrap();

            // ── property 1: dedup – each (rule,start,end) is unique ──────

            let mut coords = FxHashSet::default();
            for RawMatch{rule_id, start_idx, end_idx} in &m.user_data.raw_matches_scratch {
                assert!(
                    coords.insert((*rule_id, *start_idx, *end_idx)),
                    "duplicate raw-match detected for coords ({rule_id},{start_idx},{end_idx})"
                );

                // ── property 2: entropy gate held ────────────────────────
                let slice = &noise[*start_idx as usize .. *end_idx as usize];
                let ent   = calculate_shannon_entropy(slice);
                assert!(ent > 3.0, "entropy {ent} ≤ min_entropy, gate failed");
            }
        }
    }

    #[test]
    pub fn test_simple() -> Result<()> {
        let rules = vec![Rule::new(RuleSyntax {
            id: "test.1".to_string(),
            name: "test".to_string(),
            pattern: "test".to_string(),
            confidence: crate::rules::rule::Confidence::Medium,
            min_entropy: 1.0,
            visible: true,
            examples: vec![],
            negative_examples: vec![],
            references: vec![],
            validation: Some(Validation::Http(HttpValidation {
                request: HttpRequest {
                    method: "GET".to_string(),
                    url: "https://example.com".to_string(),
                    headers: BTreeMap::new(),
                    body: None,
                    response_matcher: Some(vec![]),
                    multipart: None,
                    response_is_html: false,
                },
                multipart: None,
            })),
            revocation: None,
            depends_on_rule: vec![
                Some(DependsOnRule {
                    rule_id: "d8f3c34b-015f-4cd6-b411-b1366493104c".to_string(),
                    variable: "email".to_string(),
                }),
                Some(DependsOnRule {
                    rule_id: "8910f364-7718-4a27-a435-d2da13e6ba9e".to_string(),
                    variable: "domain".to_string(),
                }),
            ],
            pattern_requirements: None,
        })];
        let rules_db = RulesDatabase::from_rules(rules)?;
        let input = "some test data for vectorscan";
        let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
        let enable_rule_profiling = true;
        // let mut matcher = Matcher::new(&rules_db, &seen_blobs, None,
        // enable_rule_profiling)?;
        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
        let mut matcher = Matcher::new(
            &rules_db,
            scanner_pool,
            &seen_blobs,
            None,
            enable_rule_profiling,
            None, // Pass the shared profiler
            &[],
            false,
            true,
        )?;
        matcher.scan_bytes_raw(input.as_bytes(), "fname")?;
        assert_eq!(
            matcher.user_data.raw_matches_scratch,
            vec![RawMatch { rule_id: 0, start_idx: 0, end_idx: 9 },]
        );
        Ok(())
    }

    #[test]
    fn test_pattern_requirements_ignore_if_contains_filters_matches() -> Result<()> {
        let rules = vec![Rule::new(RuleSyntax {
            id: "test.exclude".to_string(),
            name: "exclude words".to_string(),
            pattern: "(?P<token>prefix[A-Za-z]+)".to_string(),
            confidence: crate::rules::rule::Confidence::Medium,
            min_entropy: 0.0,
            visible: true,
            examples: vec![],
            negative_examples: vec![],
            references: vec![],
            validation: None,
            revocation: None,
            depends_on_rule: vec![],
            pattern_requirements: Some(PatternRequirements {
                min_digits: None,
                min_uppercase: None,
                min_lowercase: None,
                min_special_chars: None,
                special_chars: None,
                ignore_if_contains: Some(vec!["TEST".to_string()]),
                checksum: None,
            }),
        })];

        let rules_db = RulesDatabase::from_rules(rules)?;
        let input = b"prefixgood prefixtest";
        let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
        let mut matcher = Matcher::new(
            &rules_db,
            scanner_pool,
            &seen_blobs,
            None,
            false,
            None,
            &[],
            false,
            true,
        )?;

        let blob = Blob::from_bytes(input.to_vec());
        let origin = OriginSet::from(Origin::from_file(PathBuf::from("exclude.txt")));

        let matches = match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
            ScanResult::New(matches) => matches,
            ScanResult::SeenWithMatches => {
                panic!("unexpected scan result: blob should not be considered previously seen with matches")
            }
            ScanResult::SeenSansMatches => {
                panic!("unexpected scan result: blob should not be considered previously seen without matches")
            }
        };

        assert_eq!(matches.len(), 1, "ignore_if_contains should drop filtered matches");
        assert_eq!(
            matches[0].matching_input, b"prefixgood",
            "remaining match should be the non-excluded token",
        );

        Ok(())
    }

    #[test]
    fn test_pattern_requirements_ignore_if_contains_can_be_disabled_in_matcher() -> Result<()> {
        let rules = vec![Rule::new(RuleSyntax {
            id: "test.exclude".to_string(),
            name: "exclude words".to_string(),
            pattern: "(?P<token>prefix[A-Za-z]+)".to_string(),
            confidence: crate::rules::rule::Confidence::Medium,
            min_entropy: 0.0,
            visible: true,
            examples: vec![],
            negative_examples: vec![],
            references: vec![],
            validation: None,
            revocation: None,
            depends_on_rule: vec![],
            pattern_requirements: Some(PatternRequirements {
                min_digits: None,
                min_uppercase: None,
                min_lowercase: None,
                min_special_chars: None,
                special_chars: None,
                ignore_if_contains: Some(vec!["TEST".to_string()]),
                checksum: None,
            }),
        })];

        let rules_db = RulesDatabase::from_rules(rules)?;
        let input = b"prefixgood prefixtest";
        let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
        let mut matcher = Matcher::new(
            &rules_db,
            scanner_pool,
            &seen_blobs,
            None,
            false,
            None,
            &[],
            false,
            false,
        )?;

        let blob = Blob::from_bytes(input.to_vec());
        let origin = OriginSet::from(Origin::from_file(PathBuf::from("exclude-disabled.txt")));

        let matches = match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
            ScanResult::New(matches) => matches,
            ScanResult::SeenWithMatches => {
                panic!(
                    "unexpected scan result: blob should not be considered previously seen with matches"
                )
            }
            ScanResult::SeenSansMatches => {
                panic!(
                    "unexpected scan result: blob should not be considered previously seen without matches"
                )
            }
        };

        assert_eq!(matches.len(), 2, "disabling ignore_if_contains should keep all matches");
        Ok(())
    }

    // ---------------------------------------------------------------------
    // additional deterministic unit-tests
    // ---------------------------------------------------------------------

    /// `get_base64_strings` should recognise a well-formed token, decode it,
    /// and report correct byte-offsets.
    #[test]
    fn test_get_base64_strings_basic() {
        let base64_payload = b"MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY=";
        let mut raw = b"foo ".to_vec();
        raw.extend_from_slice(base64_payload);
        raw.extend_from_slice(b" bar");
        // decodes to "0123456789abcdef0123456789abcdef"
        let hits = get_base64_strings(&raw);
        assert_eq!(hits.len(), 1);
        let item = &hits[0];
        assert_eq!(std::str::from_utf8(&item.decoded).unwrap(), "0123456789abcdef0123456789abcdef");
        // "foo␠" is 4 bytes, so the start offset is 4
        assert_eq!((item.pos_start, item.pos_end), (4, 4 + base64_payload.len()));
    }

    /// `compute_finding_fingerprint` must be stable (same input ⇒ same output)
    /// and sensitive to any input component.
    #[test]
    fn test_finding_fingerprint_stability_and_uniqueness() {
        let a = compute_finding_fingerprint("secret", "fileA", 0, 6);
        let b = compute_finding_fingerprint("secret", "fileA", 0, 6);
        assert_eq!(a, b, "fingerprint should be deterministic");

        // changing any parameter should perturb the hash
        let c = compute_finding_fingerprint("secret", "fileA", 1, 7); // offsets differ
        let d = compute_finding_fingerprint("secret", "fileB", 0, 6); // file id differs
        let e = compute_finding_fingerprint("different", "fileA", 0, 6); // content differs
        assert_ne!(a, c);
        assert_ne!(a, d);
        assert_ne!(a, e);
    }

    /// The (private) `compute_match_key` helper is the linchpin of the raw-dedup
    /// path.  It should return identical keys for identical inputs and different
    /// keys as soon as *anything* changes.
    #[test]
    fn test_compute_match_key_uniqueness() {
        use super::compute_match_key;

        let k1 = compute_match_key(b"abc", b"rule-1", 0, 3);
        let k2 = compute_match_key(b"abc", b"rule-1", 0, 3);
        assert_eq!(k1, k2);

        // mutate each component in turn
        let diff_content = compute_match_key(b"abcd", b"rule-1", 0, 4);
        let diff_rule = compute_match_key(b"abc", b"rule-2", 0, 3);
        let diff_span = compute_match_key(b"abc", b"rule-1", 1, 4);
        assert_ne!(k1, diff_content);
        assert_ne!(k1, diff_rule);
        assert_ne!(k1, diff_span);
    }

    /// Running `scan_bytes_raw` twice over the *same* input should never record
    /// duplicate entries in `raw_matches_scratch`.
    #[test]
    fn test_scan_bytes_raw_no_duplicate_raw_matches() -> Result<()> {
        // simple rule: literal "dup"
        let rule = Rule::new(RuleSyntax {
            id: "dup.check".into(),
            name: "dup".into(),
            pattern: "dup".into(),
            confidence: crate::rules::rule::Confidence::Low,
            min_entropy: 0.0,
            visible: true,
            examples: vec![],
            negative_examples: vec![],
            references: vec![],
            validation: None::<Validation>,
            revocation: None,
            depends_on_rule: vec![],
            pattern_requirements: None,
        });

        let rules_db = RulesDatabase::from_rules(vec![rule])?;
        let seen = BlobIdMap::new();
        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
        let mut m =
            Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;

        let buf = b"dup dup"; // two literal hits, same rule

        // first scan
        m.scan_bytes_raw(buf, "buf1")?;
        let first_len = m.user_data.raw_matches_scratch.len();

        // second scan over the same buffer
        m.scan_bytes_raw(buf, "buf1")?;
        let second_len = m.user_data.raw_matches_scratch.len();

        // we should still only have two unique raw matches recorded
        assert_eq!(first_len, 2);
        assert_eq!(second_len, 2);
        Ok(())
    }

    #[test]
    fn inline_comment_skips_match() -> Result<()> {
        let rule = Rule::new(RuleSyntax {
            id: "inline.ignore".into(),
            name: "inline".into(),
            pattern: "secret_token".into(),
            confidence: crate::rules::rule::Confidence::Low,
            min_entropy: 0.0,
            visible: true,
            examples: vec![],
            negative_examples: vec![],
            references: vec![],
            validation: None::<Validation>,
            revocation: None,
            depends_on_rule: vec![],
            pattern_requirements: None,
        });
        let rules_db = RulesDatabase::from_rules(vec![rule])?;
        let seen = BlobIdMap::new();
        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
        let mut matcher =
            Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;

        let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec());
        let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt")));

        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
            ScanResult::New(matches) => assert!(matches.is_empty()),
            _ => panic!("unexpected scan result"),
        }

        Ok(())
    }

    #[test]
    fn inline_comment_after_multiline_secret_skips_match() -> Result<()> {
        let rule = Rule::new(RuleSyntax {
            id: "inline.multiline".into(),
            name: "inline multiline".into(),
            pattern: "line1\\s+line2".into(),
            confidence: crate::rules::rule::Confidence::Low,
            min_entropy: 0.0,
            visible: true,
            examples: vec![],
            negative_examples: vec![],
            references: vec![],
            validation: None::<Validation>,
            revocation: None,
            depends_on_rule: vec![],
            pattern_requirements: None,
        });
        let rules_db = RulesDatabase::from_rules(vec![rule])?;
        let seen = BlobIdMap::new();
        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
        let mut matcher =
            Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;

        let blob = Blob::from_bytes(
            br#"let data = """
line1
line2
"""
# kingfisher:ignore
"#
            .to_vec(),
        );
        let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt")));

        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
            ScanResult::New(matches) => assert!(matches.is_empty()),
            _ => panic!("unexpected scan result"),
        }

        Ok(())
    }

    #[test]
    fn compat_flag_controls_external_directives() -> Result<()> {
        let rule = Rule::new(RuleSyntax {
            id: "inline.compat".into(),
            name: "inline compat".into(),
            pattern: "supersecret123".into(),
            confidence: crate::rules::rule::Confidence::Low,
            min_entropy: 0.0,
            visible: true,
            examples: vec![],
            negative_examples: vec![],
            references: vec![],
            validation: None::<Validation>,
            revocation: None,
            depends_on_rule: vec![],
            pattern_requirements: None,
        });
        let rules_db = RulesDatabase::from_rules(vec![rule])?;

        let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec());
        let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt")));

        let seen = BlobIdMap::new();
        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
        let mut matcher =
            Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;
        let matches_without_compat =
            match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
                ScanResult::New(matches) => matches.len(),
                _ => panic!("unexpected scan result"),
            };
        assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag");

        let seen = BlobIdMap::new();
        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
        let extra = vec![String::from("gitleaks:allow")];
        let mut matcher =
            Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &extra, false, true)?;
        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
            ScanResult::New(matches) => assert!(matches.is_empty()),
            _ => panic!("unexpected scan result"),
        }

        Ok(())
    }

    #[test]
    fn serializes_captures_in_numeric_order() {
        let re =
            Regex::new(r"(?xi)\b(ghp_(?P<body>[A-Z0-9]{3})(?P<checksum>[A-Z0-9]{2}))").unwrap();
        let caps = re.captures(b"ghp_ABC12").expect("expected captures");

        let serialized = SerializableCaptures::from_captures(&caps, b"", &re);
        let entries: Vec<(Option<&str>, i32, &str)> = serialized
            .captures
            .iter()
            .map(|cap| (cap.name.as_deref(), cap.match_number, cap.value))
            .collect();

        assert_eq!(entries.len(), 3);

        assert_eq!(entries[0], (None, 1, "ghp_ABC12"));
        assert_eq!(entries[1], (Some("body"), 2, "ABC"));
        assert_eq!(entries[2], (Some("checksum"), 3, "12"));
    }
}