From caf766b731985d3eb0e86a1f0db847f47ef90be8 Mon Sep 17 00:00:00 2001
From: Mick Grove <mick.grove@mongodb.com>
Date: Thu, 9 Oct 2025 17:59:10 -0700
Subject: [PATCH 1/2] - Added kingfisher:ignore (or kingfisher:allow) to
 silence a finding inline within a file - Added: to reuse existing inline
 directives from other scanners, pass --compat-ignore-comments to also accept
 NOSONAR, kics-scan ignore,  gitleaks:allow and trufflehog:ignore

---
 CHANGELOG.md                  |    4 +
 README.md                     |   18 +-
 README.md.orig                | 1036 ++++++++++++++++++++++++++
 src/cli/commands/scan.rs      |    4 +
 src/inline_ignore.rs          |  342 +++++++++
 src/lib.rs                    |    1 +
 src/main.rs                   |    1 +
 src/matcher.rs                |  138 +++-
 src/matcher.rs.orig           | 1317 +++++++++++++++++++++++++++++++++
 src/matcher.rs.rej            |   52 ++
 src/reporter.rs               |    1 +
 src/reporter/json_format.rs   |    1 +
 src/scanner/enumerate.rs      |    1 +
 src/scanner/repos.rs          |    1 +
 tests/int_allowlist.rs        |    1 +
 tests/int_bitbucket.rs        |    1 +
 tests/int_dedup.rs            |    1 +
 tests/int_github.rs           |    1 +
 tests/int_gitlab.rs           |    2 +
 tests/int_redact.rs           |    1 +
 tests/int_slack.rs            |    2 +
 tests/int_validation_cache.rs |    1 +
 tests/int_vulnerable_files.rs |    2 +
 23 files changed, 2924 insertions(+), 5 deletions(-)
 create mode 100644 README.md.orig
 create mode 100644 src/inline_ignore.rs
 create mode 100644 src/matcher.rs.orig
 create mode 100644 src/matcher.rs.rej
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1278533..03d4aca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 All notable changes to this project will be documented in this file.
 
+## [Unreleased]
+- Added kingfisher:ignore (or kingfisher:allow) to silence a finding inline within a file
+- Added `--compat-ignore-comments`, which reuses existing inline directives from other scanners by also accepting NOSONAR, kics-scan ignore, gitleaks:allow, and trufflehog:ignore
+
 ## [v1.56.0]
 - Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined
 
diff --git a/README.md b/README.md
index 14faccf..2e3bfd3 100644
--- a/README.md
+++ b/README.md
@@ -117,6 +117,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
   - [Notable Scan Options](#notable-scan-options)
   - [Understanding `--confidence`](#understanding---confidence)
     - [Ignore known false positives](#ignore-known-false-positives)
+    - [Inline ignore directives](#inline-ignore-directives)
   - [Finding Fingerprint](#finding-fingerprint)
   - [Rule Performance Profiling](#rule-performance-profiling)
   - [CLI Options](#cli-options)
@@ -962,6 +963,7 @@ leaves the default unchanged.
 - `--manage-baseline`: Create or update the baseline file with current findings
 - `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable)
 - `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable)
+- `--compat-ignore-comments`: Honor inline directives from other scanners (treat `NOSONAR`, `kics-scan ignore`, `gitleaks:allow`, and `trufflehog:ignore` like native suppressions)
 ## Understanding `--confidence`
 
 The `--confidence` flag sets a minimum confidence threshold, not an exact match.
@@ -972,7 +974,21 @@ The `--confidence` flag sets a minimum confidence threshold, not an exact match.
 
 ### Ignore known false positives
 
-Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. 
+Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context.
+
+### Inline ignore directives
+
+Add `kingfisher:ignore` (or `kingfisher:allow`) to a trailing comment on the same line as a finding to silence it. Multi-line strings may also be ignored by placing the directive on the closing delimiter line **or** on the next logical line after the string:
+
+```python
+API_KEY = """
+line 1
+line 2
+"""
+# kingfisher:ignore
+```
+
+Kingfisher recognizes comment markers for the host language, including `#`, `//`, `/* */`, `--`, and `*`-prefixed block comments, so you can use this suppression style across languages. To reuse existing inline directives from other scanners, pass `--compat-ignore-comments` to also accept `NOSONAR`, `kics-scan ignore`, `gitleaks:allow`, and `trufflehog:ignore`.
 
 With `--skip-regex`, these should be Rust compatible regular expressions, which you can test out at [regex101](https://regex101.com)
 
diff --git a/README.md.orig b/README.md.orig
new file mode 100644
index 0000000..f690e3d
--- /dev/null
+++ b/README.md.orig
@@ -0,0 +1,1036 @@
+# Kingfisher
+
+<p align="center">
+  <img src="docs/kingfisher_logo.png" alt="Kingfisher Logo" width="126" height="173" style="vertical-align: right;" />
+
+[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+
+Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production.
+</p>
+
+Originally forked from Praetorian’s Nosey Parker, Kingfisher **adds** live cloud-API validation; many more targets (GitLab, BitBucket, Gitea, S3, Docker, Jira, Confluence, Slack); compressed-file extraction and scanning; baseline and allowlist controls; language-aware detection (~20 languages); and a native Windows binary. See [Origins and Divergence](#origins-and-divergence) for details.
+## Key Features
+
+### Multiple Scan Targets
+<div align="center">
+
+| Files / Dirs | Local Git | GitHub | GitLab | Azure DevOps | Bitbucket | Gitea |
+|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|
+| <img src="./docs/assets/icons/files.svg" height="40" alt="Files / Dirs"/><br/><sub>Files / Dirs</sub> | <img src="./docs/assets/icons/local-git.svg" height="40" alt="Local Git"/><br/><sub>Local Git</sub> | <img src="./docs/assets/icons/github.svg" height="40" alt="GitHub"/><br/><sub>GitHub</sub> | <img src="./docs/assets/icons/gitlab.svg" height="40" alt="GitLab"/><br/><sub>GitLab</sub> | <img src="./docs/assets/icons/azure-devops.svg" height="40" alt="Azure DevOps"/><br/><sub>Azure DevOps</sub> | <img src="./docs/assets/icons/bitbucket.svg" height="40" alt="Bitbucket"/><br/><sub>Bitbucket</sub> | <img src="./docs/assets/icons/gitea.svg" height="40" alt="Gitea"/><br/><sub>Gitea</sub> |
+
+| Docker | Jira | Confluence | Slack | AWS S3 |
+|:------:|:----:|:-----------:|:-----:|:------:|
+| <img src="./docs/assets/icons/docker.svg" height="40" alt="Docker"/><br/><sub>Docker</sub> | <img src="./docs/assets/icons/jira.svg" height="40" alt="Jira"/><br/><sub>Jira</sub> | <img src="./docs/assets/icons/confluence.svg" height="40" alt="Confluence"/><br/><sub>Confluence</sub> | <img src="./docs/assets/icons/slack.svg" height="40" alt="Slack"/><br/><sub>Slack</sub> | <img src="./docs/assets/icons/aws-s3.svg" height="40" alt="AWS S3"/><br/><sub>AWS&nbsp;S3</sub> |
+
+</div>
+
+### Performance, Accuracy, and Hundreds of Rules
+- **Performance**: multithreaded, Hyperscan‑powered scanning built for huge codebases  
+- **Extensible rules**: hundreds of built-in detectors plus YAML-defined custom rules ([docs/RULES.md](/docs/RULES.md))  
+- **Broad AI SaaS coverage**: finds and validates tokens for OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and many more
+- **Compressed Files**: Supports extracting and scanning compressed files for secrets
+- **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md))
+
+**Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation)
+
+# Benchmark Results
+
+See ([docs/COMPARISON.md](docs/COMPARISON.md))
+
+<p align="center">
+  <img src="docs/runtime-comparison.png" alt="Kingfisher Runtime Comparison" style="vertical-align: center;" />
+</p>
+
+- [Kingfisher](#kingfisher)
+  - [Key Features](#key-features)
+    - [Multiple Scan Targets](#multiple-scan-targets)
+    - [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules)
+- [Benchmark Results](#benchmark-results)
+- [Getting Started](#getting-started)
+  - [Installation](#installation)
+    - [Run Kingfisher in Docker](#run-kingfisher-in-docker)
+- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance)
+  - [Write Custom Rules!](#write-custom-rules)
+- [Usage](#usage)
+  - [Basic Examples](#basic-examples)
+    - [Scan with secret validation](#scan-with-secret-validation)
+    - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories)
+    - [Scan a Git repository without validation](#scan-a-git-repository-without-validation)
+    - [Display only secrets confirmed active by third‑party APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis)
+    - [Output JSON and capture to a file](#output-json-and-capture-to-a-file)
+    - [Output SARIF directly to disk](#output-sarif-directly-to-disk)
+    - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--)
+    - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size)
+    - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag)
+    - [Display rule performance statistics](#display-rule-performance-statistics)
+    - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files)
+    - [Exclude specific paths](#exclude-specific-paths)
+    - [Scan changes in CI pipelines](#scan-changes-in-ci-pipelines)
+  - [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket)
+  - [ Scanning Docker Images](#-scanning-docker-images)
+  - [ Scanning GitHub](#-scanning-github)
+    - [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token)
+    - [Skip specific GitHub repositories during enumeration](#skip-specific-github-repositories-during-enumeration)
+    - [Scan remote GitHub repository](#scan-remote-github-repository)
+  - [ Scanning GitLab](#-scanning-gitlab)
+    - [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token)
+    - [Scan GitLab user](#scan-gitlab-user)
+    - [Skip specific GitLab projects during enumeration](#skip-specific-gitlab-projects-during-enumeration)
+    - [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url)
+    - [List GitLab repositories](#list-gitlab-repositories)
+  - [ Scanning Azure Repos](#-scanning-azure-repos)
+    - [Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-devops-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat)
+    - [Scan specific Azure DevOps projects](#scan-specific-azure-devops-projects)
+    - [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration)
+    - [List Azure repositories](#list-azure-repositories)
+  - [ Scanning Gitea](#-scanning-gitea)
+    - [Scan Gitea organization (requires `KF_GITEA_TOKEN`)](#scan-gitea-organization-requires-kf_gitea_token)
+    - [Scan Gitea user](#scan-gitea-user)
+    - [Skip specific Gitea repositories during enumeration](#skip-specific-gitea-repositories-during-enumeration)
+    - [Scan remote Gitea repository by URL](#scan-remote-gitea-repository-by-url)
+    - [List Gitea repositories](#list-gitea-repositories)
+  - [ Scanning Bitbucket](#-scanning-bitbucket)
+    - [Scan Bitbucket workspace](#scan-bitbucket-workspace)
+    - [Scan Bitbucket user](#scan-bitbucket-user)
+    - [Skip specific Bitbucket repositories during enumeration](#skip-specific-bitbucket-repositories-during-enumeration)
+    - [Scan remote Bitbucket repository by URL](#scan-remote-bitbucket-repository-by-url)
+    - [List Bitbucket repositories](#list-bitbucket-repositories)
+    - [Authenticate to Bitbucket](#authenticate-to-bitbucket)
+    - [Self-hosted Bitbucket Server](#self-hosted-bitbucket-server)
+  - [ Scanning Jira](#-scanning-jira)
+    - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query)
+    - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues)
+  - [ Scanning Confluence](#-scanning-confluence)
+    - [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query)
+  - [ Scanning Slack](#-scanning-slack)
+    - [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query)
+  - [Environment Variables for Tokens](#environment-variables-for-tokens)
+  - [Exit Codes](#exit-codes)
+  - [Update Checks](#update-checks)
+- [Advanced Options](#advanced-options)
+  - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets)
+  - [List Builtin Rules](#list-builtin-rules)
+  - [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run)
+  - [To add your rules alongside the built‑ins:](#to-add-your-rules-alongside-the-builtins)
+  - [Other Examples](#other-examples)
+  - [Customize the HTTP User-Agent](#customize-the-http-user-agent)
+  - [Notable Scan Options](#notable-scan-options)
+  - [Understanding `--confidence`](#understanding---confidence)
+    - [Ignore known false positives](#ignore-known-false-positives)
+      - [Inline ignore directives](#inline-ignore-directives)
+  - [Finding Fingerprint](#finding-fingerprint)
+  - [Rule Performance Profiling](#rule-performance-profiling)
+  - [CLI Options](#cli-options)
+  - [Origins and Divergence](#origins-and-divergence)
+- [Roadmap](#roadmap)
+- [License](#license)
+
+# Getting Started
+## Installation
+
+On macOS, you can simply run:
+
+```bash
+brew install kingfisher
+```
+
+Pre-built binaries are also available on the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page.
+
+You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform:
+
+```bash
+# Linux, macOS
+curl --silent --location \
+    https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \
+    sh && \
+  ubi --project mongodb/kingfisher --in "$HOME/bin"
+```
+
+```powershell
+# Windows
+powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in .
+```
+
+This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like
+systems (or the current directory on Windows).
+
+Or you may compile for your platform via `make`:
+
+```bash
+# NOTE: Requires Docker
+make linux
+
+# macOS --- must build from a macOS host
+make darwin
+
+# Windows x64 --- requires building from a Windows host with Visual Studio installed
+./buildwin.bat -force
+```
+
+```bash
+# Build all targets
+make linux-all # builds both x64 and arm64
+make darwin-all # builds both x64 and arm64
+make all # builds for every OS and architecture supported
+```
+
+### Run Kingfisher in Docker
+
+Run the dockerized Kingfisher container:
+```bash
+# GitHub Container Registry 
+docker run --rm ghcr.io/mongodb/kingfisher:latest --version
+
+# Scan the current working directory
+# (mounts your code at /src and scans it)
+docker run --rm \
+  -v "$PWD":/src \
+  ghcr.io/mongodb/kingfisher:latest scan /src
+
+
+# Scan while providing a GitHub token
+# Mounts your working dir at /proj and passes in the token:
+docker run --rm \
+  -e KF_GITHUB_TOKEN=ghp_… \
+  -v "$PWD":/proj \
+  ghcr.io/mongodb/kingfisher:latest \
+    scan --git-url https://github.com/org/private_repo.git
+
+# Scan an S3 bucket
+# Credentials can come from KF_AWS_KEY/KF_AWS_SECRET, --role-arn, or --aws-local-profile
+docker run --rm \
+  -e KF_AWS_KEY=AKIA... \
+  -e KF_AWS_SECRET=g5nYW... \
+  ghcr.io/mongodb/kingfisher:latest \
+    scan --s3-bucket bucket-name
+
+
+# Scan and write a JSON report locally
+# Here we:
+#    1. Mount $PWD → /proj
+#    2. Tell Kingfisher to write findings.json inside /proj/reports
+#    3. Ensure ./reports exists on your host so Docker can mount it
+mkdir -p reports
+
+# run and output into host’s ./reports directory
+docker run --rm \
+  -v "$PWD":/proj \
+  ghcr.io/mongodb/kingfisher:latest \
+    scan /proj \
+    --format json \
+    --output /proj/reports/findings.json
+
+
+# Tip: you can combine multiple mounts if you prefer separating source vs. output:
+# Here /src is read‑only, and /out holds your generated reports
+docker run --rm \
+  -v "$PWD":/src:ro \
+  -v "$PWD/reports":/out \
+  ghcr.io/mongodb/kingfisher:latest \
+    scan /src \
+    --format json \
+    --output /out/findings.json
+
+```
+
+# 🔐 Detection Rules at a Glance
+
+Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview:
+
+| Category | What we catch |
+|----------|---------------|
+| **AI SaaS APIs** | OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and more |
+| **Cloud Providers** | AWS, Azure, GCP, Alibaba Cloud, DigitalOcean, IBM Cloud, Cloudflare, and more |
+| **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm, PyPI, and more |
+| **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun, SendGrid, Mailchimp, and more |
+| **Databases & Data Ops** | MongoDB Atlas, PlanetScale, Postgres DSNs, Grafana Cloud, Datadog, Dynatrace, and more |
+| **Payments & Billing** | Stripe, PayPal, Square, GoCardless, and more |
+| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more |
+| **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more |
+
+## Write Custom Rules!
+
+Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential.
+
+However, you may want to add your own custom rules, or modify a detection to better suit your needs / environment.
+
+First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules.
+
+Once you've done that, you can define your custom rules in a YAML file and provide it to Kingfisher at runtime --- no recompiling required!
+
+# Usage
+
+## Basic Examples
+
+> **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory, no extra flags required.
+
+### Scan with secret validation
+
+```bash
+kingfisher scan /path/to/code
+## NOTE: This path can refer to:
+# 1. a local git repo
+# 2. a directory with many git repos
+# 3. or just a folder with files and subdirectories
+
+## To explicitly prevent scanning git commit history add:
+#   `--git-history=none`
+```
+
+### Scan a directory containing multiple Git repositories
+
+```bash
+kingfisher scan /projects/mono-repo-dir
+```
+
+### Scan a Git repository without validation
+
+```bash
+kingfisher scan ~/src/myrepo --no-validate
+```
+
+### Display only secrets confirmed active by third‑party APIs
+
+```bash
+kingfisher scan /path/to/repo --only-valid
+```
+
+### Output JSON and capture to a file
+
+```bash
+kingfisher scan . --format json | tee kingfisher.json
+```
+
+### Output SARIF directly to disk
+
+```bash
+kingfisher scan /path/to/repo --format sarif --output findings.sarif
+```
+
+### Pipe any text directly into Kingfisher by passing `-`
+
+```bash
+cat /path/to/file.py | kingfisher scan -
+
+```
+
+### Limit maximum file size scanned (`--max-file-size`)
+
+By default, Kingfisher skips files larger than **256 MB**. You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**.
+
+```bash
+# Scan files up to 500 MB in size
+kingfisher scan /some/file --max-file-size 500
+```
+
+### Scan using a rule _family_ with one flag
+
+_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_
+
+```bash
+# Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2)
+kingfisher scan /path/to/repo --rule kingfisher.aws
+```
+
+### Display rule performance statistics
+
+```bash
+kingfisher scan /path/to/repo --rule-stats
+```
+
+### Scan while ignoring likely test files
+
+`--exclude` skips any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive)
+
+```bash
+# Scan source but skip likely unit / integration tests
+kingfisher scan ./my-project \
+  --exclude='[Tt]est' \
+  --exclude='spec' \
+  --exclude='[Ff]ixture' \
+  --exclude='example' \
+  --exclude='sample'
+```
+
+### Exclude specific paths
+```bash
+# Skip all Python files and any directory named tests
+kingfisher scan ./my-project \
+  --exclude '*.py' \
+  --exclude '[Tt]ests'
+```
+
+### Scan changes in CI pipelines
+
+Limit scanning to the delta between your default branch and a pull request branch by combining `--since-commit` with `--branch` (defaults to `HEAD`). This only scans files that differ between the two references, which keeps CI runs fast while still blocking new secrets.
+
+```bash
+kingfisher scan . \
+  --since-commit origin/main \
+  --branch "$CI_BRANCH"
+```
+
+When the branch under test is already checked out, `--branch HEAD` or omitting `--branch` entirely is sufficient. Kingfisher exits with `200` when any findings are discovered and `205` when validated secrets are present, allowing CI jobs to fail automatically if new credentials slip in.
+
+The same diff-focused workflow works when cloning repositories on the fly with `--git-url`. Kingfisher automatically tries remote-tracking names like `origin/main` and `origin/feature-1`, so you can target the branches involved in a pull request without performing a local checkout first.
+
+```bash
+kingfisher scan \
+  --git-url https://github.com/org/repo.git \
+  --since-commit main \
+  --branch development
+```
+
+In CI systems that expose the base and head commits explicitly, you can pass those SHAs directly while still using `--git-url`:
+
+```bash
+kingfisher scan \
+  --git-url git@github.com:org/repo.git \
+  --since-commit "$BASE_COMMIT" \
+  --branch "$PR_HEAD_COMMIT"
+```
+
+If you want to know which files are being skipped, enable verbose debugging (`-v`) when scanning, which will report any files being skipped by the baseline file (or via `--exclude`):
+
+```bash
+# Skip all Python files and any directory named tests, and report to stderr any skipped files
+kingfisher scan ./my-project \
+  --exclude '*.py' \
+  --exclude tests \
+  -v
+```
+
+## <img alt="AWS S3" src="./docs/assets/icons/aws-s3.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning an AWS S3 Bucket
+You can scan S3 objects directly:
+
+```bash
+kingfisher scan --s3-bucket bucket-name [--s3-prefix path/]
+```
+
+Credential resolution happens in this order:
+
+1. `KF_AWS_KEY` and `KF_AWS_SECRET` environment variables
+2. `--aws-local-profile` pointing to a profile in `~/.aws/config` (works with AWS SSO)
+3. anonymous access for public buckets
+
+If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role.
+
+Examples:
+
+```bash
+# using explicit keys
+export KF_AWS_KEY=AKIA...
+export KF_AWS_SECRET=g5nYW...
+kingfisher scan --s3-bucket some-example-bucket
+
+# Above can also be run as:
+KF_AWS_KEY=AKIA... KF_AWS_SECRET=g5nYW... kingfisher scan --s3-bucket some-example-bucket
+
+# using a local profile (e.g., SSO) that exists in your AWS profile (~/.aws/config)
+kingfisher scan --s3-bucket some-example-bucket --aws-local-profile default
+
+# anonymous scan of a bucket, while providing an object prefix to scan only a subset of the S3 bucket
+kingfisher scan \
+  --s3-bucket awsglue-datasets \
+  --s3-prefix examples/us-legislators/all
+
+# assuming a role when scanning
+kingfisher scan --s3-bucket some-example-bucket \
+  --role-arn arn:aws:iam::123456789012:role/MyRole
+
+# anonymous scan of a public bucket
+kingfisher scan --s3-bucket some-example-bucket
+```
+
+Docker example:
+
+```bash
+docker run --rm \
+  -e KF_AWS_KEY=AKIA... \
+  -e KF_AWS_SECRET=g5nYW... \
+  ghcr.io/mongodb/kingfisher:latest \
+    scan --s3-bucket bucket-name
+```
+
+## <img alt="Docker" src="./docs/assets/icons/docker.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Docker Images
+
+Kingfisher will first try to use any locally available image, then fall back to pulling via OCI.  
+
+Authentication happens *in this order*:
+
+1. **`KF_DOCKER_TOKEN`** env var  
+   - If it contains `user:pass`, it’s used as Basic auth
+   - Otherwise it’s sent as a Bearer token
+2. **Docker CLI credentials**  
+   - Checks `credHelpers` (per-registry) and `credsStore` in `~/.docker/config.json`.  
+   - Falls back to the legacy `auths` → `auth` (base64) entries.  
+3. **Anonymous** (no credentials)
+
+
+```bash
+# 1) Scan public or already-pulled image
+kingfisher scan --docker-image ghcr.io/owasp/wrongsecrets/wrongsecrets-master:latest-master
+
+# 2) For private registries, explicitly set KF_DOCKER_TOKEN:
+#    - Basic auth:     "user:pass"
+#    - Bearer only:    "TOKEN"
+export KF_DOCKER_TOKEN="AWS:$(aws ecr get-login-password --region us-east-1)"
+kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws.com/base/amazonlinux2023:latest
+
+# 3) Or rely on your Docker CLI login/keychain:
+#    (e.g. aws ecr get-login-password … | docker login …)
+kingfisher scan --docker-image private.registry.example.com/my-image:tag
+```
+
+## <img alt="GitHub" src="./docs/assets/icons/github.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning GitHub
+
+### Scan GitHub organization (requires `KF_GITHUB_TOKEN`)
+
+```bash
+kingfisher scan --github-organization my-org
+```
+
+### Skip specific GitHub repositories during enumeration
+
+Repeat `--github-exclude` for every repository you want to ignore when scanning
+users or organizations. You can provide exact repositories like
+`OWNER/REPO` or gitignore-style glob patterns such as `owner/*-archive`
+(matching is case-insensitive).
+
+```bash
+kingfisher scan --github-organization my-org \
+  --github-exclude my-org/huge-repo \
+  --github-exclude my-org/*-archive
+```
+
+### Scan remote GitHub repository
+
+`--git-url` clones the repository and scans its files and history. To also inspect
+related server-side data, supply `--repo-artifacts`. This flag pulls down the
+repository's issues (including pull requests), wiki, and any public gists owned by
+the repository owner and scans them for secrets. Fetching these extras counts
+against API rate limits and private artifacts require a `KF_GITHUB_TOKEN`.
+
+```bash
+# Scan the repository only
+kingfisher scan --git-url https://github.com/org/repo.git
+
+# Include issues, wiki, and owner gists
+kingfisher scan --git-url https://github.com/org/repo.git --repo-artifacts
+
+# Private repositories or artifacts
+KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/private_repo.git --repo-artifacts
+```
+
+---
+
+## <img alt="GitLab" src="./docs/assets/icons/gitlab.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning GitLab
+
+### Scan GitLab group (requires `KF_GITLAB_TOKEN`)
+
+```bash
+kingfisher scan --gitlab-group my-group
+# include repositories from all nested subgroups
+kingfisher scan --gitlab-group my-group --gitlab-include-subgroups
+```
+
+### Scan GitLab user
+
+```bash
+kingfisher scan --gitlab-user johndoe
+```
+
+### Skip specific GitLab projects during enumeration
+
+Repeat `--gitlab-exclude` for every project path you want to ignore when scanning
+users or groups. Specify project paths as `group/project` (case-insensitive) or
+use gitignore-style glob patterns like `group/**/archive-*` to drop families of
+projects across nested subgroups.
+
+```bash
+kingfisher scan --gitlab-group my-group \
+  --gitlab-exclude my-group/huge-project \
+  --gitlab-exclude my-group/**/archive-*
+```
+
+### Scan remote GitLab repository by URL
+
+`--git-url` by itself clones the project repository. To include server-side
+artifacts owned by the project, add `--repo-artifacts`. Kingfisher will retrieve
+the project's issues, wiki, and snippets and scan them for secrets. These extra
+requests may take longer and require a `KF_GITLAB_TOKEN` for private projects.
+
+```bash
+# Scan the repository only
+kingfisher scan --git-url https://gitlab.com/group/project.git
+
+# Include issues, wiki, and snippets
+kingfisher scan --git-url https://gitlab.com/group/project.git --repo-artifacts
+
+# Private projects or artifacts
+KF_GITLAB_TOKEN="glpat-…" kingfisher scan --git-url https://gitlab.com/group/private_project.git --repo-artifacts
+```
+
+### List GitLab repositories
+
+```bash
+kingfisher gitlab repos list --group my-group
+# include repositories from all nested subgroups
+kingfisher gitlab repos list --group my-group --include-subgroups
+# skip specific projects when listing or scanning (supports glob patterns)
+kingfisher gitlab repos list --group my-group --gitlab-exclude my-group/**/legacy-*
+```
+## <img alt="Azure Repos" src="./docs/assets/icons/azure-devops.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Azure Repos
+
+### Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)
+
+```bash
+kingfisher scan --azure-organization my-org
+
+# Azure DevOps Server example
+KF_AZURE_PAT="pat" kingfisher scan --azure-organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/
+```
+
+### Scan specific Azure DevOps projects
+
+Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects.
+
+```bash
+kingfisher scan --azure-project my-org/payments --azure-project my-org/core-platform
+```
+
+### Skip specific Azure repositories during enumeration
+
+Repeat `--azure-exclude` to ignore repositories when scanning organizations or projects.
+Use identifiers like `ORGANIZATION/PROJECT/REPOSITORY`. Repositories that share the same
+name as their project can be excluded with `ORGANIZATION/PROJECT`, and gitignore-style
+patterns such as `my-org/*/archive-*` are also supported.
+
+```bash
+kingfisher scan --azure-organization my-org \
+  --azure-exclude my-org/payments/legacy-service \
+  --azure-exclude my-org/**/archive-*
+```
+
+### List Azure repositories
+
+```bash
+kingfisher azure repos list --organization my-org
+# list repositories for specific projects
+kingfisher azure repos list --project my-org/app --project my-org/api
+# skip specific repositories while listing (supports glob patterns)
+kingfisher azure repos list --organization my-org --azure-exclude my-org/**/experimental-*
+```
+## <img alt="Gitea" src="./docs/assets/icons/gitea.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Gitea
+
+### Scan Gitea organization (requires `KF_GITEA_TOKEN`)
+
+```bash
+kingfisher scan --gitea-organization my-org
+# self-hosted example
+KF_GITEA_TOKEN="gtoken" kingfisher scan --gitea-organization platform --gitea-api-url https://gitea.internal.example/api/v1/
+```
+
+### Scan Gitea user
+
+```bash
+kingfisher scan --gitea-user johndoe
+```
+
+### Skip specific Gitea repositories during enumeration
+
+Repeat `--gitea-exclude` for each repository you want to ignore when scanning users
+or organizations. Accepts `owner/repo` identifiers or gitignore-style glob patterns
+like `team/**/archive-*`.
+
+```bash
+kingfisher scan --gitea-organization my-org \
+  --gitea-exclude my-org/legacy-repo \
+  --gitea-exclude 'my-org/**/archive-*'
+```
+
+### Scan remote Gitea repository by URL
+
+`--git-url` clones the repository and scans its history. Adding `--repo-artifacts`
+also clones the repository wiki if one exists. Private repositories and wikis
+require `KF_GITEA_TOKEN` (and `KF_GITEA_USERNAME` when cloning via HTTPS).
+
+```bash
+# Scan the repository only
+kingfisher scan --git-url https://gitea.com/org/repo.git
+
+# Include the repository wiki (if present)
+KF_GITEA_TOKEN="gtoken" KF_GITEA_USERNAME="org" \
+  kingfisher scan --git-url https://gitea.com/org/repo.git --repo-artifacts
+```
+
+### List Gitea repositories
+
+```bash
+kingfisher gitea repos list --gitea-organization my-org
+# enumerate every organization visible to the authenticated user
+KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --all-gitea-organizations
+# self-hosted example
+KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/
+```
+## <img alt="Bitbucket" src="./docs/assets/icons/bitbucket.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Bitbucket
+### Scan Bitbucket workspace
+
+```bash
+kingfisher scan --bitbucket-workspace my-team
+# include Bitbucket Cloud repositories from every accessible workspace
+kingfisher scan --all-bitbucket-workspaces --bitbucket-token "$APP_PASSWORD" --bitbucket-username "$USER"
+```
+
+### Scan Bitbucket user
+
+```bash
+kingfisher scan --bitbucket-user johndoe
+```
+
+### Skip specific Bitbucket repositories during enumeration
+
+Use `--bitbucket-exclude` to ignore repositories while scanning users, workspaces,
+or projects. Patterns accept either `owner/repo` (case-insensitive) or
+gitignore-style globs such as `workspace/**/archive-*`.
+
+```bash
+kingfisher scan --bitbucket-workspace my-team \
+  --bitbucket-exclude my-team/legacy-repo \
+  --bitbucket-exclude 'my-team/**/archive-*'
+```
+
+### Scan remote Bitbucket repository by URL
+
+`--git-url` clones the repository and scans its files and history. To inspect
+Bitbucket artifacts such as issues, add `--repo-artifacts`. Private artifacts
+require credentials (see [Authenticate to Bitbucket](#authenticate-to-bitbucket)).
+
+```bash
+# Scan the repository only
+kingfisher scan --git-url https://bitbucket.org/hashashash/secretstest.git
+
+# Include repository issues
+KF_BITBUCKET_USERNAME="user" \
+KF_BITBUCKET_APP_PASSWORD="app-password" \
+  kingfisher scan --git-url https://bitbucket.org/workspace/project.git --repo-artifacts
+```
+
+### List Bitbucket repositories
+
+```bash
+kingfisher bitbucket repos list --bitbucket-workspace my-team
+# enumerate all accessible workspaces or projects
+kingfisher bitbucket repos list --all-bitbucket-workspaces --bitbucket-token "$APP_PASSWORD" --bitbucket-username "$USER"
+# filter out repositories using glob patterns
+kingfisher bitbucket repos list --bitbucket-workspace my-team --bitbucket-exclude 'my-team/**/experimental-*'
+```
+
+### Authenticate to Bitbucket
+
+Kingfisher supports Bitbucket Cloud and Bitbucket Server credentials:
+
+- **App password or server token** – set `KF_BITBUCKET_USERNAME` and either
+  `KF_BITBUCKET_APP_PASSWORD` or `KF_BITBUCKET_TOKEN`, or pass
+  `--bitbucket-username`/`--bitbucket-token` on the CLI.
+- **OAuth/PAT token** – set `KF_BITBUCKET_OAUTH_TOKEN` or supply
+  `--bitbucket-oauth-token`.
+
+These credentials match the options described in the [ghorg setup
+guide](https://github.com/gabrie30/ghorg/blob/master/README.md#bitbucket-setup).
+
+### Self-hosted Bitbucket Server
+
+Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, for example
+`https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with
+`--bitbucket-username` and `--bitbucket-token`, and pass `--ignore-certs` when
+connecting to HTTP or otherwise insecure instances.
+## <img alt="Jira" src="./docs/assets/icons/jira.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Jira
+
+### Scan Jira issues matching a JQL query
+
+```bash
+KF_JIRA_TOKEN="token" kingfisher scan \
+    --jira-url https://jira.company.com \
+    --jql "project = TEST AND status = Open" \
+    --max-results 500
+```
+
+### Scan the last 1,000 Jira issues:
+```bash
+KF_JIRA_TOKEN="token" kingfisher scan \
+  --jira-url https://jira.mongodb.org \
+  --jql 'ORDER BY created DESC' \
+  --max-results 1000
+```
+
+## <img alt="Confluence" src="./docs/assets/icons/confluence.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Confluence
+### Scan Confluence pages matching a CQL query
+
+```bash
+# Bearer token
+KF_CONFLUENCE_TOKEN="token" kingfisher scan \
+    --confluence-url https://confluence.company.com \
+    --cql "label = secret" \
+    --max-results 500
+
+# Basic auth with username and token
+KF_CONFLUENCE_USER="user@example.com" KF_CONFLUENCE_TOKEN="token" kingfisher scan \
+    --confluence-url https://confluence.company.com \
+    --cql "text ~ 'password'" \
+    --max-results 500
+```
+
+Use the base URL of your Confluence site for `--confluence-url`. Kingfisher
+automatically adds `/rest/api` to the end, so `https://example.com/wiki` and
+`https://example.com` both work depending on your server configuration.
+
+Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environment variable. By default, Kingfisher sends the token as a bearer token in the `Authorization` header.
+
+To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions.
+
+## <img alt="Slack" src="./docs/assets/icons/slack.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Slack
+### Scan Slack messages matching a search query
+
+```bash
+KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \
+    --slack-query "from:username has:link" \
+    --max-results 1000
+
+KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \
+    --slack-query "akia" \
+    --max-results 1000
+```
+*The Slack token must be a user token with the `search:read` scope. Bot tokens (those beginning with `xoxb-`) cannot call the Slack search API.*
+
+## Environment Variables for Tokens
+
+| Variable          | Purpose                      |
+| ----------------- | ---------------------------- |
+| `KF_GITHUB_TOKEN` | GitHub Personal Access Token |
+| `KF_GITLAB_TOKEN` | GitLab Personal Access Token |
+| `KF_GITEA_TOKEN` | Gitea Personal Access Token |
+| `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) |
+| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure DevOps Personal Access Token |
+| `KF_AZURE_USERNAME` | Username to use with Azure DevOps PATs (defaults to `pat` when unset) |
+| `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication |
+| `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token |
+| `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token |
+| `KF_JIRA_TOKEN`   | Jira API token               |
+| `KF_CONFLUENCE_TOKEN` | Confluence API token      |
+| `KF_SLACK_TOKEN`  | Slack API token              |
+| `KF_DOCKER_TOKEN` | Docker registry token (`user:pass` or bearer token). If unset, credentials from the Docker keychain are used |
+| `KF_AWS_KEY` and `KF_AWS_SECRET` | AWS Credentials to use with S3 bucket scanning |
+
+Set them temporarily per command:
+
+```bash
+KF_GITLAB_TOKEN="glpat-…" kingfisher scan --gitlab-group my-group
+```
+
+Or export for the session:
+
+```bash
+export KF_GITLAB_TOKEN="glpat-…"
+```
+
+To authenticate Jira requests:
+```bash
+export KF_JIRA_TOKEN="token"
+```
+
+To authenticate Confluence requests:
+```bash
+export KF_CONFLUENCE_TOKEN="token"
+```
+
+_If no token is provided Kingfisher still works for public repositories._
+
+---
+
+## Exit Codes
+
+| Code | Meaning                       |
+| ---- | ----------------------------- |
+| 0    | No findings                   |
+| 200  | Findings discovered           |
+| 205  | Validated findings discovered |
+
+## Update Checks
+
+Kingfisher automatically queries GitHub for a newer release when it starts and tells you whether an update is available.
+
+- **Hands-free updates** – Add `--self-update` to any Kingfisher command
+
+  * If a newer version exists, Kingfisher will download it, replace the running binary, and re-launch itself with the **exact same arguments**.
+  * If the update fails or no newer release is found, the current run proceeds as normal
+
+- **Manual update** – Run `kingfisher self-update` to update the binary without scanning
+
+- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely
+
+# Advanced Options
+
+## Build a Baseline / Detect New Secrets
+
+There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so future scans only report anything that is not already in that list.
+
+The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches):
+
+```bash
+kingfisher scan /path/to/code \
+  --confidence low \
+  --manage-baseline \
+  --baseline-file ./baseline-file.yml
+```
+
+Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings:
+
+```bash
+kingfisher scan /path/to/code \
+  --baseline-file ./baseline-file.yml
+```
+
+Running the scan again with `--manage-baseline` refreshes the baseline by adding new findings and pruning entries for secrets that no longer appear. See [docs/BASELINE.md](docs/BASELINE.md) for full detail.
+
+## List Builtin Rules
+
+```bash
+kingfisher rules list
+```
+
+## To scan using **only** your own `my_rules.yaml` you could run:
+
+```bash
+kingfisher scan \
+  --load-builtins=false \
+  --rules-path path/to/my_rules.yaml \
+  ./src/
+```
+
+## To add your rules alongside the built‑ins:
+
+```bash
+kingfisher scan \
+  --rules-path ./custom-rules/ \
+  --rules-path my_rules.yml \
+  ~/path/to/project-dir/
+```
+
+## Other Examples
+
+```bash
+# Check custom rules - this ensures all regular expressions compile, and can match the rule's `examples` in the YML file
+kingfisher rules check --rules-path ./my_rules.yml
+
+# List GitHub repos
+kingfisher github repos list --user my-user
+kingfisher github repos list --organization my-org
+# Skip specific repositories when listing or scanning (supports glob patterns)
+kingfisher github repos list --organization my-org --github-exclude 'my-org/*-archive'
+
+```
+
+## Customize the HTTP User-Agent
+
+Kingfisher identifies its HTTP requests with a user-agent that includes the binary name and version followed by a browser-style
+string. Some environments require extra context, such as a contact address, a change-ticket number, or a temporary test label.
+Use the global `--user-agent-suffix` flag to append this information between the Kingfisher identifier and the browser portion:
+
+```bash
+# Attach a contact email to all outbound validation requests
+kingfisher --user-agent-suffix "contact=security@example.com" scan path/
+
+# Label a one-off experiment
+kingfisher --user-agent-suffix "Sept 2025 testing" github repos list --user my-user
+```
+
+When omitted, Kingfisher defaults to `kingfisher/<version> Mozilla/5.0 ...`. The suffix is trimmed; passing an empty string
+leaves the default unchanged.
+
+## Notable Scan Options
+
+- `--no-dedup`: Report every occurrence of a finding (disable the default de-duplicate behavior)
+- `--no-base64`: By default, Kingfisher finds and decodes base64 blobs and scans them for secrets. This adds a slight performance overhead; use this flag to disable
+- `--confidence <LEVEL>`: (low|medium|high)
+- `--min-entropy <VAL>`: Override default threshold
+- `--no-binary`: Skip binary files
+- `--no-extract-archives`: Do not scan inside archives
+- `--extraction-depth <N>`: Specifies how deep nested archives should be extracted and scanned (default: 2)
+- `--redact`: Replaces discovered secrets with a one-way hash for secure output
+- `--exclude <PATTERN>`: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive)
+- `--baseline-file <FILE>`: Ignore matches listed in a baseline YAML file
+- `--manage-baseline`: Create or update the baseline file with current findings
+- `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable)
+- `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable)
+## Understanding `--confidence`
+
+The `--confidence` flag sets a minimum confidence threshold, not an exact match.
+
+- If you pass `--confidence medium`, findings with **medium and higher** confidence (medium + high) will be included.
+- If you pass `--confidence low`, you’ll see **all levels** (low, medium, high).
+
+
+### Ignore known false positives
+
+Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. 
+
+Patterns passed to `--skip-regex` must be Rust-compatible regular expressions, which you can test at [regex101](https://regex101.com).
+
+```bash
+# Skip any finding where the finding mentions TEST_KEY
+kingfisher scan --skip-regex '(?i)TEST_KEY' path/
+
+# Skip findings that contain the word "dummy" anywhere in the match
+kingfisher scan --skip-word dummy path/
+
+# Combine multiple patterns
+kingfisher scan \
+  --skip-regex 'AKIA[0-9A-Z]{16}' \
+  --skip-word placeholder \
+  --skip-word dummy \
+  path/
+```
+
+If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early.
+
+## Finding Fingerprint
+
+The document below details the four-field formula (rule SHA-1, origin label, start & end offsets) hashed with XXH3-64 to create Kingfisher’s 64-bit finding fingerprint, explains how this ID powers safe deduplication, and shows how `--no-dedup` can be used to report every raw match.
+See [docs/FINGERPRINT.md](docs/FINGERPRINT.md).
+
+## Rule Performance Profiling
+
+Use `--rule-stats` to collect timing information for every rule. After scanning, the summary prints a **Rule Performance Stats** section showing how many matches each rule produced along with its slowest and average match times. This is useful when creating or debugging rules.
+
+## CLI Options
+
+```bash
+kingfisher scan --help
+```
+
+
+## Origins and Divergence
+
+Kingfisher began as a fork of Praetorian’s Nosey Parker, as our experiment with adding live validation support and embedding that validation directly inside each rule.  
+
+Since that initial fork, it has diverged heavily from Nosey Parker:
+- Added support for live validation of discovered secrets  
+- Added hundreds of new rules  
+- Added support for analyzing compressed files  
+- Added support for building "baselines" to allow for only reporting on newly discovered secrets  
+- Added Tree-Sitter based source code parsing on top of Hyperscan for deeper language-aware detection  
+- Expanded support for new targets (GitLab, Bitbucket, Gitea, Jira, Confluence, Slack, S3, Docker, etc.)  
+- Replaced the SQLite datastore with an in-memory store + Bloom filter  
+- Collapsed the workflow into a single scan-and-report phase with direct JSON/BSON/SARIF outputs  
+- Delivered cross-platform builds, including native Windows  
+
+# Roadmap
+
+- More rules
+- More targets
+- Please file a [feature request](https://github.com/mongodb/kingfisher/issues), or open a PR, if you have features you'd like added
+
+# License
+
+[Apache2 License](LICENSE)
diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs
index 255417f..595e1ff 100644
--- a/src/cli/commands/scan.rs
+++ b/src/cli/commands/scan.rs
@@ -118,6 +118,10 @@ pub struct ScanArgs {
     /// Skipwords to allow-list secret matches (case-insensitive, repeatable)
     #[arg(long = "skip-word", value_name = "WORD")]
     pub skip_word: Vec<String>,
+
+    /// Also recognise `gitleaks:allow` and `trufflehog:ignore` inline directives
+    #[arg(long = "compat-ignore-comments", default_value_t = false)]
+    pub compat_ignore_comments: bool,
 }
 
 /// Confidence levels for findings
diff --git a/src/inline_ignore.rs b/src/inline_ignore.rs
new file mode 100644
index 0000000..8cd47c8
--- /dev/null
+++ b/src/inline_ignore.rs
@@ -0,0 +1,342 @@
+use crate::location::OffsetSpan;
+
+/// Configuration for inline ignore directives: the comment tokens that silence a finding.
+#[derive(Clone, Debug, Default)]
+pub struct InlineIgnoreConfig {
+    tokens: Vec<&'static str>, // lowercase tokens; an empty list (e.g. `Default`) ignores nothing
+}
+
+impl InlineIgnoreConfig {
+    /// Build a configuration that always recognises `kingfisher:ignore` / `kingfisher:allow`.
+    ///
+    /// * `include_external_syntax` - when true, also recognise `gitleaks:allow` and
+    ///   `trufflehog:ignore`. NOTE(review): no other scanners' directives are added here.
+    pub fn new(include_external_syntax: bool) -> Self {
+        let mut tokens = vec!["kingfisher:ignore", "kingfisher:allow"];
+        if include_external_syntax {
+            tokens.extend(["gitleaks:allow", "trufflehog:ignore"]);
+        }
+        Self { tokens }
+    }
+
+    #[inline]
+    fn has_tokens(&self) -> bool {
+        !self.tokens.is_empty() // false for a `Default`-built config, which carries no tokens
+    }
+
+    /// Returns `true` when a directive sits on the first or last line of `span`, or on a nearby
+    /// line above/below it (see the two walks in the body), so the finding is suppressed.
+    pub fn should_ignore(&self, blob_bytes: &[u8], span: &OffsetSpan) -> bool {
+        if !self.has_tokens() {
+            return false;
+        }
+
+        let (start_line_start, start_line_end) = line_bounds(blob_bytes, span.start);
+        if start_line_end > start_line_start {
+            let start_line = &blob_bytes[start_line_start..start_line_end];
+            if line_has_directive(start_line, &self.tokens) {
+                return true;
+            }
+        }
+
+        // Walk upwards from the match: a directive on an earlier line counts while every line
+        // in between is whitespace-only, a run of one quote character, or ends in a triple quote.
+        // NOTE(review): the line directly above always counts, even if its directive is trailing.
+        let mut cursor = start_line_start;
+        while cursor > 0 {
+            let previous_index = cursor.saturating_sub(1);
+            let (prev_start, prev_end) = line_bounds(blob_bytes, previous_index);
+            if prev_end <= prev_start {
+                break; // NOTE(review): a zero-length line ends this walk; the downward walk skips them
+            }
+
+            let prev_line = &blob_bytes[prev_start..prev_end];
+            if line_has_directive(prev_line, &self.tokens) {
+                return true;
+            }
+
+            if !should_skip_for_directive_search(prev_line) {
+                break;
+            }
+
+            if prev_start == 0 {
+                break;
+            }
+
+            cursor = prev_start;
+        }
+
+        let end_index = span.end.saturating_sub(1); // last matched byte, assuming `end` is exclusive
+        let (closing_line_start, closing_line_end) =
+            line_bounds(blob_bytes, end_index.min(blob_bytes.len()));
+        if closing_line_end > closing_line_start
+            && (closing_line_start != start_line_start || closing_line_end != start_line_end)
+        {
+            let closing_line = &blob_bytes[closing_line_start..closing_line_end];
+            if line_has_directive(closing_line, &self.tokens) {
+                return true;
+            }
+        }
+
+        // Walk downwards from the last matched line with the same skip rule, so a directive
+        // placed after the closing delimiter of a multi-line string is honoured.
+        // NOTE(review): the next non-empty line below also counts for single-line matches.
+        let mut cursor = closing_line_end;
+        while cursor < blob_bytes.len() {
+            if blob_bytes[cursor] == b'\n' {
+                cursor += 1;
+                continue;
+            }
+
+            let (_, next_end) = line_bounds(blob_bytes, cursor);
+            if next_end <= cursor {
+                break;
+            }
+
+            let next_line = &blob_bytes[cursor..next_end];
+            if line_has_directive(next_line, &self.tokens) {
+                return true;
+            }
+
+            if !should_skip_for_directive_search(next_line) {
+                break;
+            }
+
+            cursor = next_end;
+        }
+
+        false
+    }
+}
+
+/// Decide whether the directive search may step over `line` and keep looking further away.
+/// After trimming ASCII whitespace, a line is skippable when it is:
+/// * empty,
+/// * made up solely of one repeated quote character (`"`, `'` or a backtick), or
+/// * terminated by a triple-quote style delimiter (see `ends_with_multiline_delimiter`).
+fn should_skip_for_directive_search(line: &[u8]) -> bool {
+    match trim_ascii_whitespace(line) {
+        [] => true,
+        [quote, rest @ ..]
+            if matches!(*quote, b'"' | b'\'' | b'`') && rest.iter().all(|b| b == quote) =>
+        {
+            true
+        }
+        body => ends_with_multiline_delimiter(body),
+    }
+}
+
+/// Report whether `trimmed` finishes with at least three identical quote characters.
+///
+/// The quote may be `"`, `'` or a backtick, which covers delimiters of multi-line strings
+/// such as `"""` and `'''`.
+///
+/// The only caller in this file passes a slice already trimmed of whitespace; slices
+/// shorter than three bytes never qualify.
+fn ends_with_multiline_delimiter(trimmed: &[u8]) -> bool {
+    match trimmed {
+        // A trailing run of >= 3 equal quotes exists exactly when the last three bytes agree.
+        [.., a, b, c] => matches!(*c, b'"' | b'\'' | b'`') && a == c && b == c,
+        _ => false,
+    }
+}
+
+/// Strip leading and trailing ASCII whitespace from `line` without allocating.
+///
+/// Whitespace is whatever `u8::is_ascii_whitespace` accepts (space, tab, LF, FF, CR).
+/// An all-whitespace or empty input yields an empty slice.
+/// The returned slice borrows from `line`.
+fn trim_ascii_whitespace(line: &[u8]) -> &[u8] {
+    let is_content = |b: &u8| !b.is_ascii_whitespace();
+    // First non-whitespace byte, or the end of the slice when there is none.
+    let start = line.iter().position(is_content).unwrap_or(line.len());
+    // One past the last non-whitespace byte; falls back to `start` for blank input.
+    let end = line.iter().rposition(is_content).map_or(start, |last| last + 1);
+    &line[start..end]
+}
+
+/// Return the half-open byte range `(start, end)` of the line containing `index`.
+///
+/// `index` is clamped to `bytes.len()`. `start` is the offset just past the preceding `\n`
+/// (or 0), and `end` is the offset of the next `\n` at or after `index` (or `bytes.len()`),
+/// so the range never includes the newline. When `index` points at a `\n`, the line that
+/// newline terminates is returned. Empty input yields `(0, 0)`.
+fn line_bounds(bytes: &[u8], index: usize) -> (usize, usize) {
+    let pivot = index.min(bytes.len());
+    let (before, after) = bytes.split_at(pivot);
+    let is_newline = |b: &u8| *b == b'\n';
+    let start = before.iter().rposition(is_newline).map_or(0, |nl| nl + 1);
+    let end = after.iter().position(is_newline).map_or(bytes.len(), |off| pivot + off);
+    (start, end)
+}
+
+/// Check whether `line` carries one of `tokens` written as a comment directive.
+///
+/// The search is ASCII case-insensitive: the line is lowercased first, so `tokens` are
+/// expected to be lowercase. A hit counts only when `is_comment_prefix` accepts the text
+/// before it; any other hit is stepped over and the search resumes right after it.
+/// Returns `false` for an empty line or when no token qualifies.
+fn line_has_directive(line: &[u8], tokens: &[&'static str]) -> bool {
+    if line.is_empty() {
+        return false;
+    }
+
+    // ASCII lowercasing keeps byte offsets, so hits map straight back onto `line`.
+    let haystack = line.to_ascii_lowercase();
+    tokens.iter().any(|token| {
+        let needle = token.as_bytes();
+        let mut from = 0;
+        while let Some(rel) = memchr::memmem::find(&haystack[from..], needle) {
+            let hit = from + rel;
+            if is_comment_prefix(line, hit) {
+                return true;
+            }
+            from = hit + needle.len();
+        }
+        false
+    })
+}
+
+/// Decide whether the text before a directive token looks like the start of a comment.
+///
+/// `line` is the original (not lowercased) line and `token_index` is the byte offset at which
+/// the token begins. ASCII whitespace directly before the token is skipped; the bytes that
+/// remain must end in one of the recognised comment markers:
+///
+/// * `#`  - e.g. `secret # kingfisher:ignore`
+/// * `//` - e.g. `secret // kingfisher:ignore`
+/// * `/*` - e.g. `secret /* kingfisher:ignore */`
+/// * `*/` - a token written right after the end of a block comment
+/// * `--` - e.g. `secret -- kingfisher:ignore`
+/// * `*`  - a lone asterisk preceded only by whitespace, as on the continuation lines of a
+///   block comment (` * kingfisher:ignore`)
+///
+/// Except for the lone `*`, the marker must sit at the very start of the line or be preceded
+/// by ASCII whitespace. That is what rejects look-alikes such as `http://kingfisher:ignore`,
+/// where the `//` follows a `:`.
+///
+/// Only the byte(s) immediately before the token are inspected; the function does not try
+/// to work out whether the marker itself sits inside a string literal.
+///
+/// Returns `false` when `token_index` is 0 (nothing can precede the token), when it lies
+/// beyond the end of `line`, when only whitespace precedes the token, or when the preceding
+/// text does not end in one of the markers above.
+/// Never panics: the only slicing happens after the range check on `token_index`.
+fn is_comment_prefix(line: &[u8], token_index: usize) -> bool {
+    // An empty line is covered too: any non-zero index is then out of range.
+    if token_index == 0 || token_index > line.len() {
+        return false;
+    }
+
+    // Drop the whitespace between the comment marker and the token.
+    let lead = &line[..token_index];
+    let marker_end = lead.iter().rposition(|b| !b.is_ascii_whitespace()).map_or(0, |i| i + 1);
+
+    // A marker "opens cleanly" when nothing, or ASCII whitespace, comes right before it.
+    let opens_cleanly = |rest: &[u8]| rest.last().map_or(true, u8::is_ascii_whitespace);
+
+    match &lead[..marker_end] {
+        [rest @ .., b'#'] => opens_cleanly(rest),
+        [rest @ .., b'/', b'/']
+        | [rest @ .., b'*', b'/']
+        | [rest @ .., b'-', b'-']
+        | [rest @ .., b'/', b'*'] => opens_cleanly(rest),
+        // Must stay after the `/*` arm: a lone `*` needs the whole prefix to be blank.
+        [rest @ .., b'*'] => rest.iter().all(u8::is_ascii_whitespace),
+        // Only whitespace before the token, or an unrecognised byte: not a comment.
+        _ => false,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{
+        is_comment_prefix, line_bounds, line_has_directive, should_skip_for_directive_search,
+        trim_ascii_whitespace, InlineIgnoreConfig,
+    };
+    use crate::location::OffsetSpan;
+
+    #[test]
+    fn detects_comment_prefixes() { // second argument: byte offset where the token starts
+        assert!(is_comment_prefix(b"// kingfisher:ignore", 3));
+        assert!(is_comment_prefix(b"  # kingfisher:ignore", 4));
+        assert!(is_comment_prefix(b"value /* kingfisher:ignore */", 9));
+        assert!(is_comment_prefix(b"value -- kingfisher:ignore", 9));
+        assert!(is_comment_prefix(b" * kingfisher:ignore", 3));
+        assert!(!is_comment_prefix(b"http://kingfisher:ignore", 7)); // `//` follows `:`, not space
+    }
+
+    #[test]
+    fn bounds_cover_expected_ranges() {
+        let data = b"one\ntwo\nthree";
+        assert_eq!(line_bounds(data, 0), (0, 3));
+        assert_eq!(line_bounds(data, 4), (4, 7));
+        assert_eq!(line_bounds(data, data.len()), (8, 13));
+    }
+
+    #[test]
+    fn detects_directives_in_lines() {
+        let tokens = ["kingfisher:ignore", "kingfisher:allow"];
+        assert!(line_has_directive(b"secret # kingfisher:ignore", &tokens));
+        assert!(!line_has_directive(b"secret", &tokens));
+    }
+
+    #[test]
+    fn respects_multiline_block_comment_prefix() {
+        let tokens = ["kingfisher:ignore"];
+        assert!(line_has_directive(b" * kingfisher:ignore", &tokens));
+    }
+
+    #[test]
+    fn ignores_multi_line_string_with_trailing_comment() {
+        let blob = b"let secret = \"\"\"\nline1\nline2\n\"\"\"\n# kingfisher:ignore\n";
+        let matched = b"line1\nline2\n";
+        let start = blob
+            .windows(matched.len())
+            .position(|window| window == matched)
+            .expect("match bytes present");
+        let span = OffsetSpan::from_range(start..start + matched.len());
+        let config = InlineIgnoreConfig::new(false);
+        assert!(config.should_ignore(blob, &span));
+    }
+
+    #[test]
+    fn ignores_multiline_without_trailing_newline() {
+        let blob = b"let secret = \"\"\"\nline1\nline2\n\"\"\"\n# kingfisher:ignore\n";
+        let matched = b"line1\nline2";
+        let start = blob
+            .windows(matched.len())
+            .position(|window| window == matched)
+            .expect("match bytes present");
+        let span = OffsetSpan::from_range(start..start + matched.len());
+        let config = InlineIgnoreConfig::new(false);
+        assert!(config.should_ignore(blob, &span));
+    }
+
+    #[test]
+    fn ignores_multiline_with_directive_before_secret() {
+        let blob = b"// kingfisher:ignore\nlet secret = \"\"\"\nline1\nline2\n\"\"\"\n";
+        let matched = b"line1\nline2\n";
+        let start = blob
+            .windows(matched.len())
+            .position(|window| window == matched)
+            .expect("match bytes present");
+        let span = OffsetSpan::from_range(start..start + matched.len());
+        let config = InlineIgnoreConfig::new(false);
+        assert!(config.should_ignore(blob, &span));
+    }
+
+    #[test]
+    fn trim_ascii_whitespace_returns_inner_slice() {
+        assert_eq!(trim_ascii_whitespace(b"  abc  "), b"abc");
+        assert!(trim_ascii_whitespace(b"   ").is_empty());
+    }
+
+    #[test]
+    fn skips_lines_with_only_delimiters() {
+        assert!(should_skip_for_directive_search(b"\"\"\""));
+        assert!(should_skip_for_directive_search(b"   \"\"\"   "));
+        assert!(should_skip_for_directive_search(b"let secret = \"\"\""));
+        assert!(!should_skip_for_directive_search(b"value"));
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 3ceed02..9fc8106 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,6 +21,7 @@ pub mod git_url;
 pub mod gitea;
 pub mod github;
 pub mod gitlab;
+pub mod inline_ignore;
 pub mod jira;
 pub mod liquid_filters;
 pub mod location;
diff --git a/src/main.rs b/src/main.rs
index b6bb1fd..af89a53 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -416,6 +416,7 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
             since_commit: None,
             branch: None,
         },
+        compat_ignore_comments: false,
         content_filtering_args: ContentFilteringArgs {
             max_file_size_mb: 25.0,
             no_extract_archives: true,
diff --git a/src/matcher.rs b/src/matcher.rs
index 102820b..2688875 100644
--- a/src/matcher.rs
+++ b/src/matcher.rs
@@ -23,6 +23,7 @@ use xxhash_rust::xxh3::xxh3_64;
 use crate::{
     blob::{Blob, BlobId, BlobIdMap},
     entropy::calculate_shannon_entropy,
+    inline_ignore::InlineIgnoreConfig,
     location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
     origin::OriginSet,
     parser,
@@ -199,6 +200,9 @@ pub struct Matcher<'a> {
 
     /// Rule profiler for measuring performance of individual rules
     profiler: Option<Arc<ConcurrentRuleProfiler>>,
+
+    /// Configuration that controls inline ignore directives
+    inline_ignore_config: InlineIgnoreConfig,
 }
 /// This `Drop` implementation updates the `global_stats` with the local stats
 impl<'a> Drop for Matcher<'a> {
@@ -226,6 +230,7 @@ impl<'a> Matcher<'a> {
         global_stats: Option<&'a Mutex<MatcherStats>>,
         enable_profiling: bool,
         shared_profiler: Option<Arc<ConcurrentRuleProfiler>>,
+        include_external_ignore_syntax: bool,
     ) -> Result<Self> {
         // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec
         let raw_matches_scratch = Vec::new();
@@ -247,6 +252,7 @@ impl<'a> Matcher<'a> {
             seen_blobs,
             user_data,
             profiler,
+            inline_ignore_config: InlineIgnoreConfig::new(include_external_ignore_syntax),
         })
     }
 
@@ -403,6 +409,7 @@ impl<'a> Matcher<'a> {
                 redact,
                 &filename,
                 self.profiler.as_ref(),
+                &self.inline_ignore_config,
             );
         }
         // If tree-sitter produced base64-decoded matches, try them against all rules
@@ -427,6 +434,7 @@ impl<'a> Matcher<'a> {
                             redact,
                             &filename,
                             self.profiler.as_ref(),
+                            &self.inline_ignore_config,
                         );
                     }
                 }
@@ -457,6 +465,7 @@ impl<'a> Matcher<'a> {
                         redact,
                         &filename,
                         self.profiler.as_ref(),
+                        &self.inline_ignore_config,
                     );
                 }
                 if depth + 1 < MAX_B64_DEPTH {
@@ -560,6 +569,7 @@ fn filter_match<'b>(
     redact: bool,
     filename: &str,
     profiler: Option<&Arc<ConcurrentRuleProfiler>>,
+    inline_ignore_config: &InlineIgnoreConfig,
 ) {
     let mut timer =
         profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
@@ -590,6 +600,10 @@ fn filter_match<'b>(
         let matching_input_offset_span = OffsetSpan::from_range(
             (start + matching_input.start())..(start + matching_input.end()),
         );
+        if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
+            debug!("Skipping match due to inline ignore directive");
+            continue;
+        }
         let match_key = compute_match_key(
             matching_input.as_bytes(),
             rule.id().as_bytes(),
@@ -961,7 +975,7 @@ pub fn compute_finding_fingerprint(
 // -------------------------------------------------------------------------------------------------
 #[cfg(test)]
 mod test {
-    use std::collections::BTreeMap;
+    use std::{collections::BTreeMap, path::PathBuf};
 
     use pretty_assertions::assert_eq;
     // ---------------------------------------------------------------------
@@ -970,7 +984,11 @@ mod test {
     use proptest::prelude::*;
 
     use super::*;
-    use crate::rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation};
+    use crate::{
+        blob::{Blob, BlobIdMap},
+        origin::{Origin, OriginSet},
+        rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation},
+    };
 
     proptest! {
         #[test]
@@ -1009,7 +1027,7 @@ mod test {
             let rules_db  = RulesDatabase::from_rules(vec![rule]).unwrap();
             let seen      = BlobIdMap::new();
             let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
-            let mut m     = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None).unwrap();
+            let mut m     = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false).unwrap();
 
             // ── run the scan ──────────────────────────────────────────────
             m.scan_bytes_raw(&noise, "buf").unwrap();
@@ -1080,6 +1098,7 @@ mod test {
             None,
             enable_rule_profiling,
             None, // Pass the shared profiler
+            false,
         )?;
         matcher.scan_bytes_raw(input.as_bytes(), "fname")?;
         assert_eq!(
@@ -1167,7 +1186,7 @@ mod test {
         let rules_db = RulesDatabase::from_rules(vec![rule])?;
         let seen = BlobIdMap::new();
         let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
-        let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None)?;
+        let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
 
         let buf = b"dup dup"; // two literal hits, same rule
 
@@ -1184,4 +1203,115 @@ mod test {
         assert_eq!(second_len, 2);
         Ok(())
     }
+
+    #[test]
+    fn inline_comment_skips_match() -> Result<()> {
+        let rule = Rule::new(RuleSyntax {
+            id: "inline.ignore".into(),
+            name: "inline".into(),
+            pattern: "secret_token".into(),
+            confidence: crate::rules::rule::Confidence::Low,
+            min_entropy: 0.0,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None::<Validation>,
+            depends_on_rule: vec![],
+        });
+        let rules_db = RulesDatabase::from_rules(vec![rule])?;
+        let seen = BlobIdMap::new();
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
+
+        let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec());
+        let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt")));
+
+        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
+            ScanResult::New(matches) => assert!(matches.is_empty()),
+            _ => panic!("unexpected scan result"),
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn inline_comment_after_multiline_secret_skips_match() -> Result<()> {
+        let rule = Rule::new(RuleSyntax {
+            id: "inline.multiline".into(),
+            name: "inline multiline".into(),
+            pattern: "line1\\s+line2".into(),
+            confidence: crate::rules::rule::Confidence::Low,
+            min_entropy: 0.0,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None::<Validation>,
+            depends_on_rule: vec![],
+        });
+        let rules_db = RulesDatabase::from_rules(vec![rule])?;
+        let seen = BlobIdMap::new();
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
+
+        let blob = Blob::from_bytes(
+            br#"let data = """
+line1
+line2
+"""
+# kingfisher:ignore
+"#
+            .to_vec(),
+        );
+        let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt")));
+
+        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
+            ScanResult::New(matches) => assert!(matches.is_empty()),
+            _ => panic!("unexpected scan result"),
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn compat_flag_controls_external_directives() -> Result<()> {
+        let rule = Rule::new(RuleSyntax {
+            id: "inline.compat".into(),
+            name: "inline compat".into(),
+            pattern: "supersecret123".into(),
+            confidence: crate::rules::rule::Confidence::Low,
+            min_entropy: 0.0,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None::<Validation>,
+            depends_on_rule: vec![],
+        });
+        let rules_db = RulesDatabase::from_rules(vec![rule])?;
+
+        let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec());
+        let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt")));
+
+        let seen = BlobIdMap::new();
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
+        let matches_without_compat =
+            match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
+                ScanResult::New(matches) => matches.len(),
+                _ => panic!("unexpected scan result"),
+            };
+        assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag");
+
+        let seen = BlobIdMap::new();
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true)?;
+        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
+            ScanResult::New(matches) => assert!(matches.is_empty()),
+            _ => panic!("unexpected scan result"),
+        }
+
+        Ok(())
+    }
 }
diff --git a/src/matcher.rs.orig b/src/matcher.rs.orig
new file mode 100644
index 0000000..2688875
--- /dev/null
+++ b/src/matcher.rs.orig
@@ -0,0 +1,1317 @@
+use std::{
+    hash::{Hash, Hasher},
+    str,
+    sync::{Arc, Mutex},
+};
+
+use anyhow::Result;
+use base64::{engine::general_purpose, Engine};
+use bstr::BString;
+use http::StatusCode;
+use regex::bytes::Regex;
+use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
+use schemars::{
+    gen::SchemaGenerator,
+    schema::{ArrayValidation, InstanceType, Schema},
+    JsonSchema,
+};
+use serde::{Deserialize, Serialize};
+use smallvec::SmallVec;
+use tracing::debug;
+use xxhash_rust::xxh3::xxh3_64;
+
+use crate::{
+    blob::{Blob, BlobId, BlobIdMap},
+    entropy::calculate_shannon_entropy,
+    inline_ignore::InlineIgnoreConfig,
+    location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
+    origin::OriginSet,
+    parser,
+    parser::{Checker, Language},
+    rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
+    rules::rule::Rule,
+    rules_database::RulesDatabase,
+    safe_list::{is_safe_match, is_user_match},
+    scanner_pool::ScannerPool,
+    snippet::Base64BString,
+    util::{intern, redact_value},
+};
+
+const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
+const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
+const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
+const TREE_SITTER_MAX_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs <= 64 KiB
+const TREE_SITTER_MIN_LIMIT: usize = 1 * 1024; // only run tree-sitter on blobs >= 1 KiB
+
+// -------------------------------------------------------------------------------------------------
+// RawMatch
+// -------------------------------------------------------------------------------------------------
+/// A raw match, as recorded by a callback to Vectorscan.
+///
+/// When matching with Vectorscan, we simply collect all matches into a
+/// reusable scratch `Vec`, and then go through them all after scanning is complete.
+#[derive(PartialEq, Eq, Debug, Clone)]
+struct RawMatch {
+    rule_id: u32,
+    start_idx: u64,
+    end_idx: u64,
+}
+#[derive(Clone)]
+pub struct OwnedBlobMatch {
+    pub rule: Arc<Rule>,
+    pub blob_id: BlobId,
+    /// The unique content-based identifier of this match
+    pub finding_fingerprint: u64,
+    pub matching_input_offset_span: OffsetSpan,
+    pub captures: SerializableCaptures,
+    pub validation_response_body: String,
+    pub validation_response_status: StatusCode,
+    pub validation_success: bool,
+    pub calculated_entropy: f32,
+    pub is_base64: bool,
+}
+impl<'a> Matcher<'a> {
+    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
+        self.profiler.as_ref().map(|p| p.generate_report())
+    }
+}
+impl OwnedBlobMatch {
+    pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
+        OwnedBlobMatch {
+            rule,
+            blob_id: m.blob_id,
+            finding_fingerprint: m.finding_fingerprint,
+            // matching_input: m.snippet.matching.0.to_vec(),
+            matching_input_offset_span: m.location.offset_span.clone(),
+            captures: m.groups.clone(),
+            validation_response_body: m.validation_response_body.clone(),
+            validation_response_status: StatusCode::from_u16(m.validation_response_status)
+                .unwrap_or(StatusCode::CONTINUE),
+            validation_success: m.validation_success,
+            calculated_entropy: m.calculated_entropy,
+            is_base64: m.is_base64,
+        }
+    }
+
+    pub fn from_blob_match(blob_match: BlobMatch) -> Self {
+        // Get the matching value from capture group 1 (or 0 if not available)
+        let matching_finding = blob_match
+            .captures
+            .captures
+            .get(1)
+            .or_else(|| blob_match.captures.captures.get(0))
+            .map(|capture| capture.value.as_bytes().to_vec())
+            .unwrap_or_else(Vec::new);
+
+        let mut owned_blob_match = OwnedBlobMatch {
+            rule: blob_match.rule,
+            blob_id: blob_match.blob_id.clone(),
+            matching_input_offset_span: blob_match.matching_input_offset_span,
+            captures: blob_match.captures.clone(),
+            validation_response_body: blob_match.validation_response_body,
+            validation_response_status: blob_match.validation_response_status,
+            validation_success: blob_match.validation_success,
+            calculated_entropy: blob_match.calculated_entropy,
+            finding_fingerprint: 0, //default
+            is_base64: blob_match.is_base64,
+        };
+
+        // Convert matching_finding to a &str (falling back to "" if it is not valid UTF-8)
+        let finding_value = std::str::from_utf8(&matching_finding).unwrap_or("");
+        // Use blob_id as the file/commit identifier
+        let file_or_commit = &blob_match.blob_id.to_string();
+
+        let offset_start: u64 =
+            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
+        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
+
+        owned_blob_match.finding_fingerprint =
+            compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end);
+
+        owned_blob_match
+    }
+}
+// -------------------------------------------------------------------------------------------------
+// BlobMatch
+// -------------------------------------------------------------------------------------------------
+/// A `BlobMatch` is the result type from `Matcher::scan_blob`.
+///
+/// It is mostly made up of references and small data.
+/// For a representation that is more friendly for human consumption, see
+/// `Match`.
+pub struct BlobMatch<'a> {
+    /// The rule that was matched
+    pub rule: Arc<Rule>, // Changed from `&'a Rule` to `Arc<Rule>`
+
+    /// The blob that was matched
+    pub blob_id: &'a BlobId,
+
+    /// The matching input in `blob.input`
+    pub matching_input: &'a [u8],
+
+    /// The location of the matching input in `blob.input`
+    pub matching_input_offset_span: OffsetSpan,
+
+    /// The capture groups from the match
+    pub captures: SerializableCaptures, // regex::bytes::Captures<'a>,
+
+    pub validation_response_body: String,
+    pub validation_response_status: StatusCode,
+
+    pub validation_success: bool,
+    pub calculated_entropy: f32,
+    pub is_base64: bool,
+}
+#[derive(Clone)]
+struct UserData {
+    /// A scratch vector for raw matches from Vectorscan, to minimize allocation
+    raw_matches_scratch: Vec<RawMatch>,
+
+    /// The length of the input being scanned
+    input_len: u64,
+}
+// -------------------------------------------------------------------------------------------------
+// Matcher
+// -------------------------------------------------------------------------------------------------
+/// A `Matcher` is able to scan inputs for matches from rules in a
+/// `RulesDatabase`.
+///
+/// If doing multi-threaded scanning, use a separate `Matcher` for each thread.
+#[derive(Clone)]
+pub struct Matcher<'a> {
+    /// Thread-local pool that hands out a &mut BlockScanner
+    scanner_pool: std::sync::Arc<crate::scanner_pool::ScannerPool>,
+
+    /// The rules database used for matching
+    rules_db: &'a RulesDatabase,
+
+    /// Local statistics for this `Matcher`
+    local_stats: MatcherStats,
+
+    /// Global statistics, updated with the local statistics when this
+    /// `Matcher` is dropped
+    global_stats: Option<&'a Mutex<MatcherStats>>,
+
+    /// The set of blobs that have been seen
+    seen_blobs: &'a BlobIdMap<bool>,
+
+    /// Data passed to the Vectorscan callback
+    user_data: UserData,
+
+    /// Rule profiler for measuring performance of individual rules
+    profiler: Option<Arc<ConcurrentRuleProfiler>>,
+
+    /// Configuration that controls inline ignore directives
+    inline_ignore_config: InlineIgnoreConfig,
+}
+/// This `Drop` implementation updates the `global_stats` with the local stats
+impl<'a> Drop for Matcher<'a> {
+    fn drop(&mut self) {
+        if let Some(global_stats) = self.global_stats {
+            let mut global_stats = global_stats.lock().unwrap();
+            global_stats.update(&self.local_stats);
+        }
+    }
+}
+pub enum ScanResult<'a> {
+    SeenWithMatches,
+    SeenSansMatches,
+    New(Vec<BlobMatch<'a>>),
+}
+impl<'a> Matcher<'a> {
+    /// Create a new `Matcher` from the given `RulesDatabase`.
+    ///
+    /// If `global_stats` is provided, it will be updated with the local stats
+    /// from this `Matcher` when it is dropped.
+    pub fn new(
+        rules_db: &'a RulesDatabase,
+        scanner_pool: Arc<ScannerPool>,
+        seen_blobs: &'a BlobIdMap<bool>,
+        global_stats: Option<&'a Mutex<MatcherStats>>,
+        enable_profiling: bool,
+        shared_profiler: Option<Arc<ConcurrentRuleProfiler>>,
+        include_external_ignore_syntax: bool,
+    ) -> Result<Self> {
+        // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec
+        let raw_matches_scratch = Vec::new();
+        let user_data = UserData { raw_matches_scratch, input_len: 0 };
+        // let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?;
+        // pool is created once per scan run (see Scanner section below)
+        let profiler = shared_profiler.or_else(|| {
+            if enable_profiling {
+                Some(Arc::new(ConcurrentRuleProfiler::new()))
+            } else {
+                None
+            }
+        });
+        Ok(Matcher {
+            scanner_pool,
+            rules_db,
+            local_stats: MatcherStats::default(),
+            global_stats,
+            seen_blobs,
+            user_data,
+            profiler,
+            inline_ignore_config: InlineIgnoreConfig::new(include_external_ignore_syntax),
+        })
+    }
+
+    fn scan_bytes_raw(&mut self, input: &[u8], _filename: &str) -> Result<()> {
+        // Remember previous peak automatically
+        let prev_capacity = self.user_data.raw_matches_scratch.capacity();
+        self.user_data.raw_matches_scratch.clear();
+        self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64));
+
+        self.user_data.input_len = input.len() as u64;
+
+        let mut offset: usize = 0;
+        while offset < input.len() {
+            let end = (offset + MAX_CHUNK_SIZE).min(input.len());
+            let slice = &input[offset..end];
+            let base = offset as u64;
+            self.scanner_pool.with(|scanner| {
+                scanner.scan(slice, |rule_id, from, to, _flags| {
+                    self.user_data.raw_matches_scratch.push(RawMatch {
+                        rule_id,
+                        start_idx: from + base,
+                        end_idx: to + base,
+                    });
+                    vectorscan_rs::Scan::Continue
+                })
+            })?;
+
+            if end == input.len() {
+                break;
+            }
+            offset = end.saturating_sub(CHUNK_OVERLAP);
+        }
+
+        Ok(())
+    }
+
+    pub fn scan_blob<'b>(
+        &mut self,
+        blob: &'b Blob,
+        origin: &OriginSet,
+        lang: Option<String>,
+        redact: bool,
+        no_dedup: bool,
+        no_base64: bool,
+    ) -> Result<ScanResult<'b>>
+    where
+        'a: 'b,
+    {
+        // Update local stats
+        self.local_stats.blobs_seen += 1;
+        self.local_stats.bytes_seen += blob.bytes().len() as u64;
+        self.local_stats.blobs_scanned += 1;
+        self.local_stats.bytes_scanned += blob.bytes().len() as u64;
+
+        // Extract filename from origin
+        let filename = origin
+            .first()
+            .blob_path()
+            .and_then(|path| path.file_name())
+            .and_then(|name| name.to_str())
+            .unwrap_or("unknown_file")
+            .to_string();
+        // Perform the scan
+        self.scan_bytes_raw(&blob.bytes(), &filename)?;
+
+        // Opportunistically look for standalone Base64 blobs. If neither
+        // the raw scan nor this check yields anything, we can return early
+        // before doing any heavier work.
+        let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
+            Vec::new()
+        } else {
+            get_base64_strings(blob.bytes())
+        };
+
+        let lang_hint = lang.as_deref();
+        let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty();
+        let has_base64_items = !b64_items.is_empty();
+
+        if !has_raw_matches && !has_base64_items {
+            return Ok(ScanResult::New(Vec::new()));
+        }
+
+        let rules_db = self.rules_db;
+        let mut seen_matches = FxHashSet::default();
+        let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
+
+        let blob_len = blob.len();
+
+        let should_run_tree_sitter = blob_len > 0
+            && blob_len <= TREE_SITTER_MAX_LIMIT
+            && blob_len >= TREE_SITTER_MIN_LIMIT
+            && has_raw_matches
+            && lang_hint.is_some()
+            && !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled
+
+        let tree_sitter_result = if should_run_tree_sitter {
+            lang_hint.and_then(|lang_str| {
+                get_language_and_queries(lang_str).and_then(|(language, queries)| {
+                    let checker = Checker { language, rules: queries };
+                    match checker.check(&blob.bytes()) {
+                        Ok(results) => Some(results),
+                        Err(e) => {
+                            println!("Error in checker.check: {}", e);
+                            None
+                        }
+                    }
+                })
+            })
+        } else {
+            None
+        };
+        // Process matches
+        let mut matches = Vec::new();
+        let owned_ts_results = tree_sitter_result.map(|ts_results| {
+            ts_results
+                .into_iter()
+                .filter(|match_result| match_result.is_base64_decoded)
+                .map(|match_result| {
+                    (
+                        match_result.range,
+                        match_result.text,
+                        match_result.is_base64_decoded,
+                        match_result.original_base64,
+                    )
+                })
+                .collect::<Vec<_>>()
+        });
+        let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
+        for &RawMatch { rule_id, start_idx, end_idx } in
+            self.user_data.raw_matches_scratch.iter().rev()
+        {
+            let rule_id_usize: usize = rule_id as usize;
+            let rule = Arc::clone(&rules_db.rules[rule_id_usize]);
+            let re = &rules_db.anchored_regexes[rule_id_usize];
+            let start_idx_usize = start_idx as usize;
+            let end_idx_usize = end_idx as usize;
+            let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
+            if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
+                continue;
+            }
+            filter_match(
+                blob,
+                rule,
+                re,
+                start_idx_usize,
+                end_idx_usize,
+                &mut matches,
+                &mut previous_matches,
+                rule_id_usize,
+                &mut seen_matches,
+                origin,
+                None,
+                false,
+                redact,
+                &filename,
+                self.profiler.as_ref(),
+                &self.inline_ignore_config,
+            );
+        }
+        // If tree-sitter produced base64-decoded matches, try them against all rules
+        if let Some(ref ts_results) = owned_ts_results {
+            for (ts_range, ts_match, is_base64_decoded, _original_base64) in ts_results.iter() {
+                if *is_base64_decoded {
+                    for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() {
+                        let re = &rules_db.anchored_regexes[rule_id_usize];
+                        filter_match(
+                            blob,
+                            rule.clone(),
+                            re,
+                            ts_range.start,
+                            ts_range.end,
+                            &mut matches,
+                            &mut previous_matches,
+                            rule_id_usize,
+                            &mut seen_matches,
+                            origin,
+                            Some(ts_match.as_bytes()),
+                            *is_base64_decoded,
+                            redact,
+                            &filename,
+                            self.profiler.as_ref(),
+                            &self.inline_ignore_config,
+                        );
+                    }
+                }
+            }
+        }
+
+        if !no_base64 {
+            // If the blob contains standalone Base64 blobs, decode and scan them as well
+            const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep
+            let mut b64_stack: Vec<(DecodedData, usize)> =
+                b64_items.drain(..).map(|d| (d, 0)).collect();
+            while let Some((item, depth)) = b64_stack.pop() {
+                for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() {
+                    let re = &rules_db.anchored_regexes[rule_id_usize];
+                    filter_match(
+                        blob,
+                        rule.clone(),
+                        re,
+                        item.pos_start,
+                        item.pos_end,
+                        &mut matches,
+                        &mut previous_matches,
+                        rule_id_usize,
+                        &mut seen_matches,
+                        origin,
+                        Some(item.decoded.as_bytes()),
+                        true,
+                        redact,
+                        &filename,
+                        self.profiler.as_ref(),
+                        &self.inline_ignore_config,
+                    );
+                }
+                if depth + 1 < MAX_B64_DEPTH {
+                    for nested in get_base64_strings(item.decoded.as_bytes()) {
+                        b64_stack.push((
+                            DecodedData {
+                                original: nested.original,
+                                decoded: nested.decoded,
+                                pos_start: item.pos_start,
+                                pos_end: item.pos_end,
+                            },
+                            depth + 1,
+                        ));
+                    }
+                }
+            }
+        }
+        // Finalize
+        if !no_dedup && !matches.is_empty() {
+            let blob_id = blob.id();
+            if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
+                return Ok(if had_matches {
+                    ScanResult::SeenWithMatches
+                } else {
+                    ScanResult::SeenSansMatches
+                });
+            }
+        }
+
+        // --- opportunistic capacity cap ---------------------------------
+        if self.user_data.raw_matches_scratch.capacity()
+            > self.user_data.raw_matches_scratch.len() * 4
+        {
+            // Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob
+            // so the cost is hidden off the hot path.
+            self.user_data.raw_matches_scratch.shrink_to_fit();
+        }
+
+        Ok(ScanResult::New(matches))
+        // Ok(result)
+    }
+}
+
+#[inline]
+fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
+    let mut hasher = FxHasher::default();
+    // Hash each component directly without allocation
+    content.hash(&mut hasher);
+    rule_id.hash(&mut hasher);
+    start.hash(&mut hasher);
+    end.hash(&mut hasher);
+    hasher.finish()
+}
+
+#[inline]
+fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
+    let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
+    if idx > 0 {
+        if spans[idx - 1].fully_contains(&span) {
+            return false;
+        }
+        if span.fully_contains(&spans[idx - 1]) {
+            spans.remove(idx - 1);
+            idx -= 1;
+        }
+    }
+    if idx < spans.len() {
+        if spans[idx].fully_contains(&span) {
+            return false;
+        }
+        if span.fully_contains(&spans[idx]) {
+            spans.remove(idx);
+        }
+    }
+    spans.insert(idx, span);
+    true
+}
+
+#[inline]
+fn record_match(
+    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
+    rule_id: usize,
+    span: OffsetSpan,
+) -> bool {
+    insert_span(map.entry(rule_id).or_default(), span)
+}
+fn filter_match<'b>(
+    blob: &'b Blob,
+    // rule: &'b Rule,
+    rule: Arc<Rule>,
+    re: &Regex,
+    start: usize,
+    end: usize,
+    matches: &mut Vec<BlobMatch<'b>>,
+    previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
+    rule_id: usize,
+    seen_matches: &mut FxHashSet<u64>,
+    _origin: &OriginSet,
+    ts_match: Option<&[u8]>,
+    is_base64: bool,
+    redact: bool,
+    filename: &str,
+    profiler: Option<&Arc<ConcurrentRuleProfiler>>,
+    inline_ignore_config: &InlineIgnoreConfig,
+) {
+    let mut timer =
+        profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
+
+    let initial_len = matches.len();
+
+    let blob_bytes = blob.bytes();
+    let default_slice = &blob_bytes[start..end];
+    let haystack = ts_match.unwrap_or(default_slice);
+
+    for captures in re.captures_iter(haystack) {
+        let full_capture = captures.get(0).unwrap();
+        let matching_input = captures.get(1).unwrap_or(full_capture);
+        let min_entropy = rule.min_entropy();
+        let mi_bytes = matching_input.as_bytes();
+        let full_bytes = full_capture.as_bytes();
+        let calculated_entropy = calculate_shannon_entropy(mi_bytes);
+        if calculated_entropy <= min_entropy
+            || is_safe_match(mi_bytes)
+            || is_user_match(mi_bytes, full_bytes)
+        {
+            debug!(
+                "Skipping match with entropy {} <= {} or safe match",
+                calculated_entropy, min_entropy
+            );
+            continue;
+        }
+        let matching_input_offset_span = OffsetSpan::from_range(
+            (start + matching_input.start())..(start + matching_input.end()),
+        );
+        if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
+            debug!("Skipping match due to inline ignore directive");
+            continue;
+        }
+        let match_key = compute_match_key(
+            matching_input.as_bytes(),
+            rule.id().as_bytes(),
+            matching_input_offset_span.start,
+            matching_input_offset_span.end,
+        );
+        if !seen_matches.insert(match_key) {
+            continue;
+        }
+        if !record_match(previous_matches, rule_id, matching_input_offset_span) {
+            continue;
+        }
+        let only_matching_input =
+            &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
+        let groups = SerializableCaptures::from_captures(&captures, haystack, re, redact);
+        matches.push(BlobMatch {
+            rule: Arc::clone(&rule),
+            blob_id: blob.id_ref(),
+            matching_input: only_matching_input,
+            matching_input_offset_span,
+            captures: groups,
+            validation_response_body: String::new(),
+            validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
+            validation_success: false,
+            calculated_entropy,
+            is_base64,
+        });
+    }
+    if let Some(t) = timer.take() {
+        let new_count = (matches.len() - initial_len) as u64;
+        t.end(new_count > 0, new_count, 0);
+    }
+}
+fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, String>)> {
+    match lang.to_lowercase().as_str() {
+        "bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())),
+        "c" => Some((Language::C, parser::queries::c::get_c_queries())),
+        "c#" | "csharp" => Some((Language::CSharp, parser::queries::csharp::get_csharp_queries())),
+        "c++" | "cpp" => Some((Language::Cpp, parser::queries::cpp::get_cpp_queries())),
+        "css" => Some((Language::Css, parser::queries::css::get_css_queries())),
+        "go" => Some((Language::Go, parser::queries::go::get_go_queries())),
+        "html" => Some((Language::Html, parser::queries::html::get_html_queries())),
+        "java" => Some((Language::Java, parser::queries::java::get_java_queries())),
+        "javascript" | "js" => {
+            Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries()))
+        }
+        // "kotlin" => Some((
+        //     Language::Kotlin,
+        //     parser::queries::kotlin::get_kotlin_queries(),
+        // )),
+        "php" => Some((Language::Php, parser::queries::php::get_php_queries())),
+        "python" | "py" | "starlark" => {
+            Some((Language::Python, parser::queries::python::get_python_queries()))
+        }
+        "ruby" => Some((Language::Ruby, parser::queries::ruby::get_ruby_queries())),
+        "rust" => Some((Language::Rust, parser::queries::rust::get_rust_queries())),
+        "toml" => Some((Language::Toml, parser::queries::toml::get_toml_queries())),
+        "typescript" | "ts" => {
+            Some((Language::TypeScript, parser::queries::typescript::get_typescript_queries()))
+        }
+        "yaml" => Some((Language::Yaml, parser::queries::yaml::get_yaml_queries())),
+        _ => None,
+    }
+}
+// -------------------------------------------------------------------------------------------------
+// MatcherStats
+// -------------------------------------------------------------------------------------------------
+#[derive(Debug, Default, Clone)]
+pub struct MatcherStats {
+    pub blobs_seen: u64,
+    pub blobs_scanned: u64,
+    pub bytes_seen: u64,
+    pub bytes_scanned: u64,
+    // #[cfg(feature = "rule_profiling")]
+    // pub rule_stats: crate::rule_profiling::RuleProfile,
+}
+impl MatcherStats {
+    pub fn update(&mut self, other: &Self) {
+        self.blobs_seen += other.blobs_seen;
+        self.blobs_scanned += other.blobs_scanned;
+        self.bytes_seen += other.bytes_seen;
+        self.bytes_scanned += other.bytes_scanned;
+
+        // #[cfg(feature = "rule_profiling")]
+        // self.rule_stats.update(&other.rule_stats);
+    }
+}
+// -------------------------------------------------------------------------------------------------
+// Group
+// -------------------------------------------------------------------------------------------------
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
+pub struct Group(pub Base64BString);
+impl Group {
+    pub fn new(m: regex::bytes::Match<'_>) -> Self {
+        Self(Base64BString(BString::from(m.as_bytes())))
+    }
+}
+// -------------------------------------------------------------------------------------------------
+// Groups
+// -------------------------------------------------------------------------------------------------
+#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub struct Groups(pub SmallVec<[Group; 1]>);
+impl JsonSchema for Groups {
+    fn schema_name() -> String {
+        "Groups".to_string()
+    }
+
+    fn json_schema(gen: &mut SchemaGenerator) -> Schema {
+        let group_schema = gen.subschema_for::<Group>();
+        Schema::Object(schemars::schema::SchemaObject {
+            instance_type: Some(InstanceType::Array.into()),
+            array: Some(Box::new(ArrayValidation {
+                items: Some(group_schema.into()),
+                ..Default::default()
+            })),
+            ..Default::default()
+        })
+    }
+}
+// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+// pub struct SerializableCapture {
+//     pub name: Option<String>, // Named group (if available)
+//     pub match_number: i32,
+//     pub start: usize,  // Start position of the match
+//     pub end: usize,    // End position of the match
+//     pub value: String, // The actual captured value
+// }
+#[derive(Debug, Clone, Serialize, JsonSchema)]
+pub struct SerializableCapture {
+    pub name: Option<String>,
+    pub match_number: i32,
+    pub start: usize,
+    pub end: usize,
+    /// Interned value of the capture.
+    pub value: &'static str,
+}
+
+#[derive(Debug, Clone, Serialize, JsonSchema)]
+pub struct SerializableCaptures {
+    #[schemars(with = "Vec<SerializableCapture>")]
+    pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
+}
+impl SerializableCaptures {
+    pub fn from_captures(
+        captures: &regex::bytes::Captures,
+        _input: &[u8],
+        re: &Regex,
+        redact: bool,
+    ) -> Self {
+        let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
+        // Named groups first: each is recorded under its name with match_number = -1
+        for name in re.capture_names().flatten() {
+            if let Some(capture) = captures.name(name) {
+                let value = if redact {
+                    redact_value(&String::from_utf8_lossy(capture.as_bytes()))
+                } else {
+                    String::from_utf8_lossy(capture.as_bytes()).to_string()
+                };
+                serialized_captures.push(SerializableCapture {
+                    name: Some(name.to_string()),
+                    match_number: -1,
+                    start: capture.start(),
+                    end: capture.end(),
+                    value: intern(&value),
+                });
+            }
+        }
+        // Then every group by index (0 = whole match); named groups are recorded again here
+        for i in 0..captures.len() {
+            if let Some(capture) = captures.get(i) {
+                let value = if redact {
+                    redact_value(&String::from_utf8_lossy(capture.as_bytes()))
+                } else {
+                    String::from_utf8_lossy(capture.as_bytes()).to_string()
+                };
+                serialized_captures.push(SerializableCapture {
+                    name: None,
+                    match_number: i32::try_from(i).unwrap_or(0),
+                    start: capture.start(),
+                    end: capture.end(),
+                    value: intern(&value),
+                });
+            }
+        }
+        SerializableCaptures { captures: serialized_captures }
+    }
+}
+// -------------------------------------------------------------------------------------------------
+// Match
+// -------------------------------------------------------------------------------------------------
+#[derive(Debug, Clone, Serialize, JsonSchema)]
+pub struct Match {
+    /// The location of the entire matching content
+    pub location: Location,
+
+    /// The capture groups
+    pub groups: SerializableCaptures, // Store serialized captures
+
+    /// unique identifier of file / blob where this match was found
+    pub blob_id: BlobId,
+
+    /// The unique content-based identifier of this match
+    pub finding_fingerprint: u64,
+
+    /// The rule that produced this match
+    #[serde(skip_serializing)]
+    #[schemars(skip)]
+    pub rule: Arc<Rule>,
+
+    /// Validation Body
+    pub validation_response_body: String,
+
+    /// Validation Status Code
+    pub validation_response_status: u16,
+
+    /// Validation Success
+    pub validation_success: bool,
+
+    /// Entropy calculated for this match
+    pub calculated_entropy: f32,
+
+    pub visible: bool,
+    #[serde(default)]
+    pub is_base64: bool,
+}
+impl Match {
+    #[inline]
+    pub fn convert_owned_blobmatch_to_match<'a>(
+        loc_mapping: Option<&'a LocationMapping<'a>>,
+        owned_blob_match: &'a OwnedBlobMatch,
+        origin_type: &'a str,
+    ) -> Self {
+        let offset_span = owned_blob_match.matching_input_offset_span;
+        // Extract the matched secret content: prefer serialized capture index 1, else index 0.
+        // NOTE(review): this is a list index, not a regex group number; named groups come first.
+        let matching_finding_bytes = owned_blob_match
+            .captures
+            .captures
+            .get(1)
+            .or_else(|| owned_blob_match.captures.captures.get(0))
+            .map(|capture| capture.value.as_bytes())
+            .unwrap_or_default();
+
+        // The fingerprint will be based on the content of the secret.
+        let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
+
+        let source_span =
+            loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
+                start: SourcePoint { line: 0, column: 0 },
+                end: SourcePoint { line: 0, column: 0 },
+            });
+        let offset_start: u64 =
+            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
+        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
+
+        let finding_fingerprint = compute_finding_fingerprint(
+            finding_value_for_fp,
+            origin_type, // file_or_commit,
+            offset_start,
+            offset_end,
+        );
+
+        // Assemble the final Match from the owned blob match
+        Match {
+            rule: owned_blob_match.rule.clone(),
+            visible: owned_blob_match.rule.visible().to_owned(),
+            location: Location { offset_span, source_span: source_span.clone() },
+            groups: owned_blob_match.captures.clone(),
+            blob_id: owned_blob_match.blob_id,
+            finding_fingerprint,
+            validation_response_body: owned_blob_match.validation_response_body.clone(),
+            validation_response_status: owned_blob_match.validation_response_status.as_u16(),
+            validation_success: owned_blob_match.validation_success,
+            calculated_entropy: owned_blob_match.calculated_entropy,
+            is_base64: owned_blob_match.is_base64,
+        }
+    }
+
+    /// Returns the `blob_id` of the match.
+    pub fn get_blob_id(&self) -> BlobId {
+        self.blob_id.clone()
+    }
+
+    pub fn finding_id(&self) -> String {
+        let mut buffer = Vec::with_capacity(128);
+        buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes());
+        buffer.push(0);
+        serde_json::to_writer(&mut buffer, &self.groups)
+            .expect("should be able to serialize groups as JSON");
+        let mut num = xxh3_64(&buffer);
+        // Ensure the number is positive and within i64 range
+        num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive
+        // Render the masked value as a decimal string
+        num.to_string()
+    }
+}
+#[derive(Debug, Clone)]
+pub struct DecodedData {
+    pub original: String,
+    pub decoded: String,
+    pub pos_start: usize,
+    pub pos_end: usize,
+}
+#[inline]
+fn is_base64_byte(b: u8) -> bool {
+    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/')
+}
+
+pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
+    let mut results = Vec::new();
+    let mut i = 0;
+    while i < input.len() {
+        while i < input.len() && !is_base64_byte(input[i]) {
+            i += 1;
+        }
+        let start = i;
+        while i < input.len() && is_base64_byte(input[i]) {
+            i += 1;
+        }
+
+        let mut eq_count = 0;
+        while i < input.len() && input[i] == b'=' && eq_count < 2 {
+            i += 1;
+            eq_count += 1;
+        }
+        let end = i;
+
+        let len = end - start;
+        if len >= 32 && len % 4 == 0 {
+            let base64_slice = &input[start..end];
+            if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) {
+                if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
+                    if decoded_str.is_ascii() {
+                        results.push(DecodedData {
+                            original: String::from_utf8_lossy(base64_slice).into_owned(),
+                            decoded: decoded_str.to_string(),
+                            pos_start: start,
+                            pos_end: end,
+                        });
+                    }
+                }
+            }
+        }
+    }
+
+    results
+}
+
+pub fn compute_finding_fingerprint(
+    finding_value: &str,
+    file_or_commit: &str,
+    offset_start: u64,
+    offset_end: u64,
+) -> u64 {
+    // Combine all into a byte buffer and hash it directly:
+    let mut buf = Vec::with_capacity(
+        finding_value.len() + file_or_commit.len() + 2 * std::mem::size_of::<u64>(),
+    );
+    buf.extend_from_slice(finding_value.as_bytes());
+    buf.extend_from_slice(file_or_commit.as_bytes());
+    buf.extend_from_slice(&offset_start.to_le_bytes());
+    buf.extend_from_slice(&offset_end.to_le_bytes());
+
+    xxh3_64(&buf)
+}
+
+// -------------------------------------------------------------------------------------------------
+// test
+// -------------------------------------------------------------------------------------------------
+#[cfg(test)]
+mod test {
+    use std::{collections::BTreeMap, path::PathBuf};
+
+    use pretty_assertions::assert_eq;
+    // ---------------------------------------------------------------------
+    // proptest: raw-match dedup + entropy gate
+    // ---------------------------------------------------------------------
+    use proptest::prelude::*;
+
+    use super::*;
+    use crate::{
+        blob::{Blob, BlobIdMap},
+        origin::{Origin, OriginSet},
+        rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation},
+    };
+
+    proptest! {
+        #[test]
+        fn prop_no_dupes_and_entropy(
+            // random ASCII up to 300 bytes
+            mut noise in proptest::collection::vec(any::<u8>().prop_filter("ascii", |b| b.is_ascii()), 0..300),
+            // 0-4 random insertion points
+            inserts in proptest::collection::vec(0usize..300, 0..5)
+        ) {
+            // Constant high-entropy secret token that matches the rule below
+            const TOKEN: &[u8] = b"secret_abcd1234";
+
+            // Splice the token at the requested offsets
+            for &idx in &inserts {
+                let pos = idx.min(noise.len());
+                noise.splice(pos..pos, TOKEN.iter().copied());
+            }
+
+            // ── build a single test rule ──────────────────────────────────
+            use crate::rules::rule::{RuleSyntax, Validation, Confidence};
+
+            let rule = Rule::new(RuleSyntax {
+                id: "prop.secret".into(),
+                name: "prop secret".into(),
+                pattern: "secret_[a-z]{4}[0-9]{4}".into(),
+                confidence: Confidence::Low,
+                min_entropy: 3.0,
+                visible: true,
+                examples: vec![],
+                negative_examples: vec![],
+                references: vec![],
+                validation: None::<Validation>,          // no HTTP validation needed
+                depends_on_rule: vec![],
+            });
+
+            let rules_db  = RulesDatabase::from_rules(vec![rule]).unwrap();
+            let seen      = BlobIdMap::new();
+            let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+            let mut m     = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false).unwrap();
+
+            // ── run the scan ──────────────────────────────────────────────
+            m.scan_bytes_raw(&noise, "buf").unwrap();
+
+            // ── property 1: dedup – each (rule,start,end) is unique ──────
+
+            let mut coords = FxHashSet::default();
+            for RawMatch{rule_id, start_idx, end_idx} in &m.user_data.raw_matches_scratch {
+                assert!(
+                    coords.insert((*rule_id, *start_idx, *end_idx)),
+                    "duplicate raw-match detected for coords ({rule_id},{start_idx},{end_idx})"
+                );
+
+                // ── property 2: entropy gate held ────────────────────────
+                let slice = &noise[*start_idx as usize .. *end_idx as usize];
+                let ent   = calculate_shannon_entropy(slice);
+                assert!(ent > 3.0, "entropy {ent} ≤ min_entropy, gate failed");
+            }
+        }
+    }
+
+    #[test]
+    pub fn test_simple() -> Result<()> {
+        let rules = vec![Rule::new(RuleSyntax {
+            id: "test.1".to_string(),
+            name: "test".to_string(),
+            pattern: "test".to_string(),
+            confidence: crate::rules::rule::Confidence::Medium,
+            min_entropy: 1.0,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: Some(Validation::Http(HttpValidation {
+                request: HttpRequest {
+                    method: "GET".to_string(),
+                    url: "https://example.com".to_string(),
+                    headers: BTreeMap::new(),
+                    body: None,
+                    response_matcher: Some(vec![]),
+                    multipart: None,
+                    response_is_html: false,
+                },
+                multipart: None,
+            })),
+            depends_on_rule: vec![
+                Some(DependsOnRule {
+                    rule_id: "d8f3c34b-015f-4cd6-b411-b1366493104c".to_string(),
+                    variable: "email".to_string(),
+                }),
+                Some(DependsOnRule {
+                    rule_id: "8910f364-7718-4a27-a435-d2da13e6ba9e".to_string(),
+                    variable: "domain".to_string(),
+                }),
+            ],
+        })];
+        let rules_db = RulesDatabase::from_rules(rules)?;
+        let input = "some test data for vectorscan";
+        let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
+        let enable_rule_profiling = true;
+        // let mut matcher = Matcher::new(&rules_db, &seen_blobs, None,
+        // enable_rule_profiling)?;
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+        let mut matcher = Matcher::new(
+            &rules_db,
+            scanner_pool,
+            &seen_blobs,
+            None,
+            enable_rule_profiling,
+            None, // Pass the shared profiler
+            false,
+        )?;
+        matcher.scan_bytes_raw(input.as_bytes(), "fname")?;
+        assert_eq!(
+            matcher.user_data.raw_matches_scratch,
+            vec![RawMatch { rule_id: 0, start_idx: 0, end_idx: 9 },]
+        );
+        Ok(())
+    }
+
+    // ---------------------------------------------------------------------
+    // additional deterministic unit-tests
+    // ---------------------------------------------------------------------
+
+    /// `get_base64_strings` should recognise a well-formed token, decode it,
+    /// and report correct byte-offsets.
+    #[test]
+    fn test_get_base64_strings_basic() {
+        let raw = b"foo MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY= bar";
+        // decodes to "0123456789abcdef0123456789abcdef"
+        let hits = get_base64_strings(raw);
+        assert_eq!(hits.len(), 1);
+        let item = &hits[0];
+        assert_eq!(item.decoded, "0123456789abcdef0123456789abcdef");
+        assert_eq!(item.original, "MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY=");
+        // "foo␠" is 4 bytes, so the start offset is 4
+        assert_eq!((item.pos_start, item.pos_end), (4, 4 + item.original.len()));
+    }
+
+    /// `compute_finding_fingerprint` must be stable (same input ⇒ same output)
+    /// and sensitive to any input component.
+    #[test]
+    fn test_finding_fingerprint_stability_and_uniqueness() {
+        let a = compute_finding_fingerprint("secret", "fileA", 0, 6);
+        let b = compute_finding_fingerprint("secret", "fileA", 0, 6);
+        assert_eq!(a, b, "fingerprint should be deterministic");
+
+        // changing any parameter should perturb the hash
+        let c = compute_finding_fingerprint("secret", "fileA", 1, 7); // offsets differ
+        let d = compute_finding_fingerprint("secret", "fileB", 0, 6); // file id differs
+        let e = compute_finding_fingerprint("different", "fileA", 0, 6); // content differs
+        assert_ne!(a, c);
+        assert_ne!(a, d);
+        assert_ne!(a, e);
+    }
+
+    /// The (private) `compute_match_key` helper is the linchpin of the raw-dedup
+    /// path.  It should return identical keys for identical inputs and different
+    /// keys as soon as *anything* changes.
+    #[test]
+    fn test_compute_match_key_uniqueness() {
+        use super::compute_match_key;
+
+        let k1 = compute_match_key(b"abc", b"rule-1", 0, 3);
+        let k2 = compute_match_key(b"abc", b"rule-1", 0, 3);
+        assert_eq!(k1, k2);
+
+        // mutate each component in turn
+        let diff_content = compute_match_key(b"abcd", b"rule-1", 0, 4);
+        let diff_rule = compute_match_key(b"abc", b"rule-2", 0, 3);
+        let diff_span = compute_match_key(b"abc", b"rule-1", 1, 4);
+        assert_ne!(k1, diff_content);
+        assert_ne!(k1, diff_rule);
+        assert_ne!(k1, diff_span);
+    }
+
+    /// Running `scan_bytes_raw` twice over the *same* input should never record
+    /// duplicate entries in `raw_matches_scratch`.
+    #[test]
+    fn test_scan_bytes_raw_no_duplicate_raw_matches() -> Result<()> {
+        // simple rule: literal "dup"
+        let rule = Rule::new(RuleSyntax {
+            id: "dup.check".into(),
+            name: "dup".into(),
+            pattern: "dup".into(),
+            confidence: crate::rules::rule::Confidence::Low,
+            min_entropy: 0.0,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None::<Validation>,
+            depends_on_rule: vec![],
+        });
+
+        let rules_db = RulesDatabase::from_rules(vec![rule])?;
+        let seen = BlobIdMap::new();
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+        let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
+
+        let buf = b"dup dup"; // two literal hits, same rule
+
+        // first scan
+        m.scan_bytes_raw(buf, "buf1")?;
+        let first_len = m.user_data.raw_matches_scratch.len();
+
+        // second scan over the same buffer
+        m.scan_bytes_raw(buf, "buf1")?;
+        let second_len = m.user_data.raw_matches_scratch.len();
+
+        // we should still only have two unique raw matches recorded
+        assert_eq!(first_len, 2);
+        assert_eq!(second_len, 2);
+        Ok(())
+    }
+
+    #[test]
+    fn inline_comment_skips_match() -> Result<()> {
+        let rule = Rule::new(RuleSyntax {
+            id: "inline.ignore".into(),
+            name: "inline".into(),
+            pattern: "secret_token".into(),
+            confidence: crate::rules::rule::Confidence::Low,
+            min_entropy: 0.0,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None::<Validation>,
+            depends_on_rule: vec![],
+        });
+        let rules_db = RulesDatabase::from_rules(vec![rule])?;
+        let seen = BlobIdMap::new();
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
+
+        let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec());
+        let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt")));
+
+        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
+            ScanResult::New(matches) => assert!(matches.is_empty()),
+            _ => panic!("unexpected scan result"),
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn inline_comment_after_multiline_secret_skips_match() -> Result<()> {
+        let rule = Rule::new(RuleSyntax {
+            id: "inline.multiline".into(),
+            name: "inline multiline".into(),
+            pattern: "line1\\s+line2".into(),
+            confidence: crate::rules::rule::Confidence::Low,
+            min_entropy: 0.0,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None::<Validation>,
+            depends_on_rule: vec![],
+        });
+        let rules_db = RulesDatabase::from_rules(vec![rule])?;
+        let seen = BlobIdMap::new();
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
+
+        let blob = Blob::from_bytes(
+            br#"let data = """
+line1
+line2
+"""
+# kingfisher:ignore
+"#
+            .to_vec(),
+        );
+        let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt")));
+
+        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
+            ScanResult::New(matches) => assert!(matches.is_empty()),
+            _ => panic!("unexpected scan result"),
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn compat_flag_controls_external_directives() -> Result<()> {
+        let rule = Rule::new(RuleSyntax {
+            id: "inline.compat".into(),
+            name: "inline compat".into(),
+            pattern: "supersecret123".into(),
+            confidence: crate::rules::rule::Confidence::Low,
+            min_entropy: 0.0,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None::<Validation>,
+            depends_on_rule: vec![],
+        });
+        let rules_db = RulesDatabase::from_rules(vec![rule])?;
+
+        let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec());
+        let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt")));
+
+        let seen = BlobIdMap::new();
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
+        let matches_without_compat =
+            match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
+                ScanResult::New(matches) => matches.len(),
+                _ => panic!("unexpected scan result"),
+            };
+        assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag");
+
+        let seen = BlobIdMap::new();
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true)?;
+        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
+            ScanResult::New(matches) => assert!(matches.is_empty()),
+            _ => panic!("unexpected scan result"),
+        }
+
+        Ok(())
+    }
+}
diff --git a/src/matcher.rs.rej b/src/matcher.rs.rej
new file mode 100644
index 0000000..dc61305
--- /dev/null
+++ b/src/matcher.rs.rej
@@ -0,0 +1,52 @@
+@@ -1,50 +1,51 @@
+ use std::{
+     hash::{Hash, Hasher},
+     str,
+     sync::{Arc, Mutex},
+ };
+ 
+ use anyhow::Result;
+ use base64::{engine::general_purpose, Engine};
+ use bstr::BString;
+ use http::StatusCode;
+ use regex::bytes::Regex;
+ use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
+ use schemars::{
+     gen::SchemaGenerator,
+     schema::{ArrayValidation, InstanceType, Schema},
+     JsonSchema,
+ };
+ use serde::{Deserialize, Serialize};
+ use smallvec::SmallVec;
+ use tracing::debug;
+ use xxhash_rust::xxh3::xxh3_64;
+ 
+ use crate::{
+     blob::{Blob, BlobId, BlobIdMap},
+     entropy::calculate_shannon_entropy,
++    inline_ignore::InlineIgnoreConfig,
+     location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
+     origin::OriginSet,
+     parser,
+     parser::{Checker, Language},
+     rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
+     rules::rule::Rule,
+     rules_database::RulesDatabase,
+     safe_list::{is_safe_match, is_user_match},
+     scanner_pool::ScannerPool,
+     snippet::Base64BString,
+     util::{intern, redact_value},
+ };
+ 
+ const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
+ const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
+ const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
+ const TREE_SITTER_SCAN_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs ≤64 KiB
+ 
+ // -------------------------------------------------------------------------------------------------
+ // RawMatch
+ // -------------------------------------------------------------------------------------------------
+ /// A raw match, as recorded by a callback to Vectorscan.
+ ///
+ /// When matching with Vectorscan, we simply collect all matches into a
+ /// preallocated `Vec`, and then go through them all after scanning is complete.
diff --git a/src/reporter.rs b/src/reporter.rs
index 73bc541..f5d33f5 100644
--- a/src/reporter.rs
+++ b/src/reporter.rs
@@ -842,6 +842,7 @@ mod tests {
                 since_commit: None,
                 branch: None,
             },
+            compat_ignore_comments: false,
             content_filtering_args: ContentFilteringArgs {
                 max_file_size_mb: 256.0,
                 exclude: Vec::new(),
diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs
index 8b4f59c..941b15c 100644
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@@ -143,6 +143,7 @@ mod tests {
                 since_commit: None,
                 branch: None,
             },
+            compat_ignore_comments: false,
             content_filtering_args: ContentFilteringArgs {
                 max_file_size_mb: 25.0,
                 no_extract_archives: false,
diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs
index df56427..22d3204 100644
--- a/src/scanner/enumerate.rs
+++ b/src/scanner/enumerate.rs
@@ -144,6 +144,7 @@ pub fn enumerate_filesystem_inputs(
         Some(&matcher_stats),
         enable_profiling,
         Some(shared_profiler),
+        args.compat_ignore_comments,
     )?;
     let blob_processor_init_time = Mutex::new(t1.elapsed());
     let make_blob_processor = || -> BlobProcessor {
diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs
index eb4ad10..9b72359 100644
--- a/src/scanner/repos.rs
+++ b/src/scanner/repos.rs
@@ -622,6 +622,7 @@ pub async fn fetch_s3_objects(
         Some(matcher_stats),
         enable_profiling,
         Some(shared_profiler.clone()),
+        args.compat_ignore_comments,
     )?;
     let mut processor = BlobProcessor { matcher };
 
diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs
index 72bd950..7820b9c 100644
--- a/tests/int_allowlist.rs
+++ b/tests/int_allowlist.rs
@@ -112,6 +112,7 @@ fn run_skiplist(skip_regex: Vec<String>, skip_skipword: Vec<String>) -> Result<u
             since_commit: None,
             branch: None,
         },
+        compat_ignore_comments: false,
         content_filtering_args: ContentFilteringArgs {
             max_file_size_mb: 5.0,
             exclude: Vec::new(),
diff --git a/tests/int_bitbucket.rs b/tests/int_bitbucket.rs
index 373f11b..0cbe1e4 100644
--- a/tests/int_bitbucket.rs
+++ b/tests/int_bitbucket.rs
@@ -111,6 +111,7 @@ fn test_bitbucket_remote_scan() -> Result<()> {
             since_commit: None,
             branch: None,
         },
+        compat_ignore_comments: false,
         content_filtering_args: ContentFilteringArgs {
             max_file_size_mb: 25.0,
             no_extract_archives: false,
diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs
index cd83a7f..931f1c5 100644
--- a/tests/int_dedup.rs
+++ b/tests/int_dedup.rs
@@ -131,6 +131,7 @@ rules:
             since_commit: None,
             branch: None,
         },
+        compat_ignore_comments: false,
         content_filtering_args: ContentFilteringArgs {
             max_file_size_mb: 5.0,
             extraction_depth: 1,
diff --git a/tests/int_github.rs b/tests/int_github.rs
index 06c67a7..b158a59 100644
--- a/tests/int_github.rs
+++ b/tests/int_github.rs
@@ -118,6 +118,7 @@ fn test_github_remote_scan() -> Result<()> {
             since_commit: None,
             branch: None,
         },
+        compat_ignore_comments: false,
         content_filtering_args: ContentFilteringArgs {
             max_file_size_mb: 25.0,
             no_extract_archives: false,
diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs
index e55655a..abbfe72 100644
--- a/tests/int_gitlab.rs
+++ b/tests/int_gitlab.rs
@@ -116,6 +116,7 @@ fn test_gitlab_remote_scan() -> Result<()> {
             since_commit: None,
             branch: None,
         },
+        compat_ignore_comments: false,
         content_filtering_args: ContentFilteringArgs {
             max_file_size_mb: 25.0,
             no_extract_archives: false,
@@ -251,6 +252,7 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
             since_commit: None,
             branch: None,
         },
+        compat_ignore_comments: false,
         content_filtering_args: ContentFilteringArgs {
             max_file_size_mb: 25.0,
             no_extract_archives: false,
diff --git a/tests/int_redact.rs b/tests/int_redact.rs
index 48247af..a5a50c6 100644
--- a/tests/int_redact.rs
+++ b/tests/int_redact.rs
@@ -95,6 +95,7 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
             since_commit: None,
             branch: None,
         },
+        compat_ignore_comments: false,
         content_filtering_args: ContentFilteringArgs {
             max_file_size_mb: 25.0,
             extraction_depth: 2,
diff --git a/tests/int_slack.rs b/tests/int_slack.rs
index 2575a3c..b38f93c 100644
--- a/tests/int_slack.rs
+++ b/tests/int_slack.rs
@@ -102,6 +102,7 @@ impl TestContext {
                 since_commit: None,
                 branch: None,
             },
+            compat_ignore_comments: false,
             content_filtering_args: ContentFilteringArgs {
                 max_file_size_mb: 25.0,
                 extraction_depth: 2,
@@ -225,6 +226,7 @@ async fn test_scan_slack_messages() -> Result<()> {
             since_commit: None,
             branch: None,
         },
+        compat_ignore_comments: false,
         content_filtering_args: ContentFilteringArgs {
             max_file_size_mb: 25.0,
             extraction_depth: 2,
diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs
index ea1c809..0f66766 100644
--- a/tests/int_validation_cache.rs
+++ b/tests/int_validation_cache.rs
@@ -174,6 +174,7 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
             since_commit: None,
             branch: None,
         },
+        compat_ignore_comments: false,
         content_filtering_args: ContentFilteringArgs {
             max_file_size_mb: 25.0,
             extraction_depth: 2,
diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs
index b87d721..d79c167 100644
--- a/tests/int_vulnerable_files.rs
+++ b/tests/int_vulnerable_files.rs
@@ -117,6 +117,7 @@ impl TestContext {
                 since_commit: None,
                 branch: None,
             },
+            compat_ignore_comments: false,
             content_filtering_args: ContentFilteringArgs {
                 max_file_size_mb: 25.0,
                 extraction_depth: 2,
@@ -227,6 +228,7 @@ impl TestContext {
                 since_commit: None,
                 branch: None,
             },
+            compat_ignore_comments: false,
             content_filtering_args: ContentFilteringArgs {
                 max_file_size_mb: 25.0,
                 extraction_depth: 2,

From b2a62a9c8ad56c22d1df221c2393cf4760a032a2 Mon Sep 17 00:00:00 2001
From: Mick Grove <mick.grove@mongodb.com>
Date: Thu, 9 Oct 2025 17:59:22 -0700
Subject: [PATCH 2/2] - Added kingfisher:ignore (or kingfisher:allow) to
 silence a finding inline within a file - Added: to reuse existing inline
 directives from other scanners, pass --compat-ignore-comments to also accept
 NOSONAR, kics-scan ignore,  gitleaks:allow and trufflehog:ignore

---
 src/matcher.rs.orig | 1317 -------------------------------------------
 src/matcher.rs.rej  |   52 --
 2 files changed, 1369 deletions(-)
 delete mode 100644 src/matcher.rs.orig
 delete mode 100644 src/matcher.rs.rej

diff --git a/src/matcher.rs.orig b/src/matcher.rs.orig
deleted file mode 100644
index 2688875..0000000
--- a/src/matcher.rs.orig
+++ /dev/null
@@ -1,1317 +0,0 @@
-use std::{
-    hash::{Hash, Hasher},
-    str,
-    sync::{Arc, Mutex},
-};
-
-use anyhow::Result;
-use base64::{engine::general_purpose, Engine};
-use bstr::BString;
-use http::StatusCode;
-use regex::bytes::Regex;
-use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
-use schemars::{
-    gen::SchemaGenerator,
-    schema::{ArrayValidation, InstanceType, Schema},
-    JsonSchema,
-};
-use serde::{Deserialize, Serialize};
-use smallvec::SmallVec;
-use tracing::debug;
-use xxhash_rust::xxh3::xxh3_64;
-
-use crate::{
-    blob::{Blob, BlobId, BlobIdMap},
-    entropy::calculate_shannon_entropy,
-    inline_ignore::InlineIgnoreConfig,
-    location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
-    origin::OriginSet,
-    parser,
-    parser::{Checker, Language},
-    rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
-    rules::rule::Rule,
-    rules_database::RulesDatabase,
-    safe_list::{is_safe_match, is_user_match},
-    scanner_pool::ScannerPool,
-    snippet::Base64BString,
-    util::{intern, redact_value},
-};
-
-const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
-const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
-const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
-const TREE_SITTER_MAX_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs <= 64 KiB
-const TREE_SITTER_MIN_LIMIT: usize = 1 * 1024; // only run tree-sitter on blobs >= 1 KiB
-
-// -------------------------------------------------------------------------------------------------
-// RawMatch
-// -------------------------------------------------------------------------------------------------
-/// A raw match, as recorded by a callback to Vectorscan.
-///
-/// When matching with Vectorscan, we simply collect all matches into a
-/// preallocated `Vec`, and then go through them all after scanning is complete.
-#[derive(PartialEq, Eq, Debug, Clone)]
-struct RawMatch {
-    rule_id: u32,
-    start_idx: u64,
-    end_idx: u64,
-}
-#[derive(Clone)]
-pub struct OwnedBlobMatch {
-    pub rule: Arc<Rule>,
-    pub blob_id: BlobId,
-    /// The unique content-based identifier of this match
-    pub finding_fingerprint: u64,
-    pub matching_input_offset_span: OffsetSpan,
-    pub captures: SerializableCaptures,
-    pub validation_response_body: String,
-    pub validation_response_status: StatusCode,
-    pub validation_success: bool,
-    pub calculated_entropy: f32,
-    pub is_base64: bool,
-}
-impl<'a> Matcher<'a> {
-    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
-        self.profiler.as_ref().map(|p| p.generate_report())
-    }
-}
-impl OwnedBlobMatch {
-    pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc<Rule>) -> OwnedBlobMatch {
-        OwnedBlobMatch {
-            rule,
-            blob_id: m.blob_id,
-            finding_fingerprint: m.finding_fingerprint,
-            // matching_input: m.snippet.matching.0.to_vec(),
-            matching_input_offset_span: m.location.offset_span.clone(),
-            captures: m.groups.clone(),
-            validation_response_body: m.validation_response_body.clone(),
-            validation_response_status: StatusCode::from_u16(m.validation_response_status)
-                .unwrap_or(StatusCode::CONTINUE),
-            validation_success: m.validation_success,
-            calculated_entropy: m.calculated_entropy,
-            is_base64: m.is_base64,
-        }
-    }
-
-    pub fn from_blob_match(blob_match: BlobMatch) -> Self {
-        // Get the matching value from capture group 1 (or 0 if not available)
-        let matching_finding = blob_match
-            .captures
-            .captures
-            .get(1)
-            .or_else(|| blob_match.captures.captures.get(0))
-            .map(|capture| capture.value.as_bytes().to_vec())
-            .unwrap_or_else(Vec::new);
-
-        let mut owned_blob_match = OwnedBlobMatch {
-            rule: blob_match.rule,
-            blob_id: blob_match.blob_id.clone(),
-            matching_input_offset_span: blob_match.matching_input_offset_span,
-            captures: blob_match.captures.clone(),
-            validation_response_body: blob_match.validation_response_body,
-            validation_response_status: blob_match.validation_response_status,
-            validation_success: blob_match.validation_success,
-            calculated_entropy: blob_match.calculated_entropy,
-            finding_fingerprint: 0, //default
-            is_base64: blob_match.is_base64,
-        };
-
-        // Convert matching_finding to a &str (using lossy conversion if needed)
-        let finding_value = std::str::from_utf8(&matching_finding).unwrap_or("");
-        // Use blob_id as the file/commit identifier
-        let file_or_commit = &blob_match.blob_id.to_string();
-
-        let offset_start: u64 =
-            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
-        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
-
-        owned_blob_match.finding_fingerprint =
-            compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end);
-
-        owned_blob_match
-    }
-}
-// -------------------------------------------------------------------------------------------------
-// BlobMatch
-// -------------------------------------------------------------------------------------------------
-/// A `BlobMatch` is the result type from `Matcher::scan_blob`.
-///
-/// It is mostly made up of references and small data.
-/// For a representation that is more friendly for human consumption, see
-/// `Match`.
-pub struct BlobMatch<'a> {
-    /// The rule that was matched
-    pub rule: Arc<Rule>, // Changed from `&'a Rule` to `Arc<Rule
-
-    /// The blob that was matched
-    pub blob_id: &'a BlobId,
-
-    /// The matching input in `blob.input`
-    pub matching_input: &'a [u8],
-
-    /// The location of the matching input in `blob.input`
-    pub matching_input_offset_span: OffsetSpan,
-
-    /// The capture groups from the match
-    pub captures: SerializableCaptures, // regex::bytes::Captures<'a>,
-
-    pub validation_response_body: String,
-    pub validation_response_status: StatusCode,
-
-    pub validation_success: bool,
-    pub calculated_entropy: f32,
-    pub is_base64: bool,
-}
-#[derive(Clone)]
-struct UserData {
-    /// A scratch vector for raw matches from Vectorscan, to minimize allocation
-    raw_matches_scratch: Vec<RawMatch>,
-
-    /// The length of the input being scanned
-    input_len: u64,
-}
-// -------------------------------------------------------------------------------------------------
-// Matcher
-// -------------------------------------------------------------------------------------------------
-/// A `Matcher` is able to scan inputs for matches from rules in a
-/// `RulesDatabase`.
-///
-/// If doing multi-threaded scanning, use a separate `Matcher` for each thread.
-#[derive(Clone)]
-pub struct Matcher<'a> {
-    /// Thread-local pool that hands out a &mut BlockScanner
-    scanner_pool: std::sync::Arc<crate::scanner_pool::ScannerPool>,
-
-    /// The rules database used for matching
-    rules_db: &'a RulesDatabase,
-
-    /// Local statistics for this `Matcher`
-    local_stats: MatcherStats,
-
-    /// Global statistics, updated with the local statsistics when this
-    /// `Matcher` is dropped
-    global_stats: Option<&'a Mutex<MatcherStats>>,
-
-    /// The set of blobs that have been seen
-    seen_blobs: &'a BlobIdMap<bool>,
-
-    /// Data passed to the Vectorscan callback
-    user_data: UserData,
-
-    /// Rule profiler for measuring performance of individual rules
-    profiler: Option<Arc<ConcurrentRuleProfiler>>,
-
-    /// Configuration that controls inline ignore directives
-    inline_ignore_config: InlineIgnoreConfig,
-}
-/// This `Drop` implementation updates the `global_stats` with the local stats
-impl<'a> Drop for Matcher<'a> {
-    fn drop(&mut self) {
-        if let Some(global_stats) = self.global_stats {
-            let mut global_stats = global_stats.lock().unwrap();
-            global_stats.update(&self.local_stats);
-        }
-    }
-}
-pub enum ScanResult<'a> {
-    SeenWithMatches,
-    SeenSansMatches,
-    New(Vec<BlobMatch<'a>>),
-}
-impl<'a> Matcher<'a> {
-    /// Create a new `Matcher` from the given `RulesDatabase`.
-    ///
-    /// If `global_stats` is provided, it will be updated with the local stats
-    /// from this `Matcher` when it is dropped.
-    pub fn new(
-        rules_db: &'a RulesDatabase,
-        scanner_pool: Arc<ScannerPool>,
-        seen_blobs: &'a BlobIdMap<bool>,
-        global_stats: Option<&'a Mutex<MatcherStats>>,
-        enable_profiling: bool,
-        shared_profiler: Option<Arc<ConcurrentRuleProfiler>>,
-        include_external_ignore_syntax: bool,
-    ) -> Result<Self> {
-        // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec
-        let raw_matches_scratch = Vec::new();
-        let user_data = UserData { raw_matches_scratch, input_len: 0 };
-        // let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?;
-        // pool is created once per scan run (see Scanner section below)
-        let profiler = shared_profiler.or_else(|| {
-            if enable_profiling {
-                Some(Arc::new(ConcurrentRuleProfiler::new()))
-            } else {
-                None
-            }
-        });
-        Ok(Matcher {
-            scanner_pool,
-            rules_db,
-            local_stats: MatcherStats::default(),
-            global_stats,
-            seen_blobs,
-            user_data,
-            profiler,
-            inline_ignore_config: InlineIgnoreConfig::new(include_external_ignore_syntax),
-        })
-    }
-
-    fn scan_bytes_raw(&mut self, input: &[u8], _filename: &str) -> Result<()> {
-        // Remember previous peak automatically
-        let prev_capacity = self.user_data.raw_matches_scratch.capacity();
-        self.user_data.raw_matches_scratch.clear();
-        self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64));
-
-        self.user_data.input_len = input.len() as u64;
-
-        let mut offset: usize = 0;
-        while offset < input.len() {
-            let end = (offset + MAX_CHUNK_SIZE).min(input.len());
-            let slice = &input[offset..end];
-            let base = offset as u64;
-            self.scanner_pool.with(|scanner| {
-                scanner.scan(slice, |rule_id, from, to, _flags| {
-                    self.user_data.raw_matches_scratch.push(RawMatch {
-                        rule_id,
-                        start_idx: from + base,
-                        end_idx: to + base,
-                    });
-                    vectorscan_rs::Scan::Continue
-                })
-            })?;
-
-            if end == input.len() {
-                break;
-            }
-            offset = end.saturating_sub(CHUNK_OVERLAP);
-        }
-
-        Ok(())
-    }
-
-    pub fn scan_blob<'b>(
-        &mut self,
-        blob: &'b Blob,
-        origin: &OriginSet,
-        lang: Option<String>,
-        redact: bool,
-        no_dedup: bool,
-        no_base64: bool,
-    ) -> Result<ScanResult<'b>>
-    where
-        'a: 'b,
-    {
-        // Update local stats
-        self.local_stats.blobs_seen += 1;
-        self.local_stats.bytes_seen += blob.bytes().len() as u64;
-        self.local_stats.blobs_scanned += 1;
-        self.local_stats.bytes_scanned += blob.bytes().len() as u64;
-
-        // Extract filename from origin
-        let filename = origin
-            .first()
-            .blob_path()
-            .and_then(|path| path.file_name())
-            .and_then(|name| name.to_str())
-            .unwrap_or("unknown_file")
-            .to_string();
-        // Perform the scan
-        self.scan_bytes_raw(&blob.bytes(), &filename)?;
-
-        // Opportunistically look for standalone Base64 blobs. If neither
-        // the raw scan nor this check yields anything, we can return early
-        // before doing any heavier work.
-        let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT {
-            Vec::new()
-        } else {
-            get_base64_strings(blob.bytes())
-        };
-
-        let lang_hint = lang.as_deref();
-        let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty();
-        let has_base64_items = !b64_items.is_empty();
-
-        if !has_raw_matches && !has_base64_items {
-            return Ok(ScanResult::New(Vec::new()));
-        }
-
-        let rules_db = self.rules_db;
-        let mut seen_matches = FxHashSet::default();
-        let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
-
-        let blob_len = blob.len();
-
-        let should_run_tree_sitter = blob_len > 0
-            && blob_len <= TREE_SITTER_MAX_LIMIT
-            && blob_len >= TREE_SITTER_MIN_LIMIT
-            && has_raw_matches
-            && lang_hint.is_some()
-            && !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled
-
-        let tree_sitter_result = if should_run_tree_sitter {
-            lang_hint.and_then(|lang_str| {
-                get_language_and_queries(lang_str).and_then(|(language, queries)| {
-                    let checker = Checker { language, rules: queries };
-                    match checker.check(&blob.bytes()) {
-                        Ok(results) => Some(results),
-                        Err(e) => {
-                            println!("Error in checker.check: {}", e);
-                            None
-                        }
-                    }
-                })
-            })
-        } else {
-            None
-        };
-        // Process matches
-        let mut matches = Vec::new();
-        let owned_ts_results = tree_sitter_result.map(|ts_results| {
-            ts_results
-                .into_iter()
-                .filter(|match_result| match_result.is_base64_decoded)
-                .map(|match_result| {
-                    (
-                        match_result.range,
-                        match_result.text,
-                        match_result.is_base64_decoded,
-                        match_result.original_base64,
-                    )
-                })
-                .collect::<Vec<_>>()
-        });
-        let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
-        for &RawMatch { rule_id, start_idx, end_idx } in
-            self.user_data.raw_matches_scratch.iter().rev()
-        {
-            let rule_id_usize: usize = rule_id as usize;
-            let rule = Arc::clone(&rules_db.rules[rule_id_usize]);
-            let re = &rules_db.anchored_regexes[rule_id_usize];
-            let start_idx_usize = start_idx as usize;
-            let end_idx_usize = end_idx as usize;
-            let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
-            if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
-                continue;
-            }
-            filter_match(
-                blob,
-                rule,
-                re,
-                start_idx_usize,
-                end_idx_usize,
-                &mut matches,
-                &mut previous_matches,
-                rule_id_usize,
-                &mut seen_matches,
-                origin,
-                None,
-                false,
-                redact,
-                &filename,
-                self.profiler.as_ref(),
-                &self.inline_ignore_config,
-            );
-        }
-        // If tree-sitter produced base64-decoded matches, try them against all rules
-        if let Some(ref ts_results) = owned_ts_results {
-            for (ts_range, ts_match, is_base64_decoded, _original_base64) in ts_results.iter() {
-                if *is_base64_decoded {
-                    for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() {
-                        let re = &rules_db.anchored_regexes[rule_id_usize];
-                        filter_match(
-                            blob,
-                            rule.clone(),
-                            re,
-                            ts_range.start,
-                            ts_range.end,
-                            &mut matches,
-                            &mut previous_matches,
-                            rule_id_usize,
-                            &mut seen_matches,
-                            origin,
-                            Some(ts_match.as_bytes()),
-                            *is_base64_decoded,
-                            redact,
-                            &filename,
-                            self.profiler.as_ref(),
-                            &self.inline_ignore_config,
-                        );
-                    }
-                }
-            }
-        }
-
-        if !no_base64 {
-            // If the blob contains standalone Base64 blobs, decode and scan them as well
-            const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep
-            let mut b64_stack: Vec<(DecodedData, usize)> =
-                b64_items.drain(..).map(|d| (d, 0)).collect();
-            while let Some((item, depth)) = b64_stack.pop() {
-                for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() {
-                    let re = &rules_db.anchored_regexes[rule_id_usize];
-                    filter_match(
-                        blob,
-                        rule.clone(),
-                        re,
-                        item.pos_start,
-                        item.pos_end,
-                        &mut matches,
-                        &mut previous_matches,
-                        rule_id_usize,
-                        &mut seen_matches,
-                        origin,
-                        Some(item.decoded.as_bytes()),
-                        true,
-                        redact,
-                        &filename,
-                        self.profiler.as_ref(),
-                        &self.inline_ignore_config,
-                    );
-                }
-                if depth + 1 < MAX_B64_DEPTH {
-                    for nested in get_base64_strings(item.decoded.as_bytes()) {
-                        b64_stack.push((
-                            DecodedData {
-                                original: nested.original,
-                                decoded: nested.decoded,
-                                pos_start: item.pos_start,
-                                pos_end: item.pos_end,
-                            },
-                            depth + 1,
-                        ));
-                    }
-                }
-            }
-        }
-        // Finalize
-        if !no_dedup && !matches.is_empty() {
-            let blob_id = blob.id();
-            if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
-                return Ok(if had_matches {
-                    ScanResult::SeenWithMatches
-                } else {
-                    ScanResult::SeenSansMatches
-                });
-            }
-        }
-
-        // --- opportunistic capacity cap ---------------------------------
-        if self.user_data.raw_matches_scratch.capacity()
-            > self.user_data.raw_matches_scratch.len() * 4
-        {
-            // Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob
-            // so the cost is hidden off the hot path.
-            self.user_data.raw_matches_scratch.shrink_to_fit();
-        }
-
-        Ok(ScanResult::New(matches))
-        // Ok(result)
-    }
-}
-
-#[inline]
-fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 {
-    let mut hasher = FxHasher::default();
-    // Hash each component directly without allocation
-    content.hash(&mut hasher);
-    rule_id.hash(&mut hasher);
-    start.hash(&mut hasher);
-    end.hash(&mut hasher);
-    hasher.finish()
-}
-
-#[inline]
-fn insert_span(spans: &mut Vec<OffsetSpan>, span: OffsetSpan) -> bool {
-    let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i);
-    if idx > 0 {
-        if spans[idx - 1].fully_contains(&span) {
-            return false;
-        }
-        if span.fully_contains(&spans[idx - 1]) {
-            spans.remove(idx - 1);
-            idx -= 1;
-        }
-    }
-    if idx < spans.len() {
-        if spans[idx].fully_contains(&span) {
-            return false;
-        }
-        if span.fully_contains(&spans[idx]) {
-            spans.remove(idx);
-        }
-    }
-    spans.insert(idx, span);
-    true
-}
-
-#[inline]
-fn record_match(
-    map: &mut FxHashMap<usize, Vec<OffsetSpan>>,
-    rule_id: usize,
-    span: OffsetSpan,
-) -> bool {
-    insert_span(map.entry(rule_id).or_default(), span)
-}
-fn filter_match<'b>(
-    blob: &'b Blob,
-    // rule: &'b Rule,
-    rule: Arc<Rule>,
-    re: &Regex,
-    start: usize,
-    end: usize,
-    matches: &mut Vec<BlobMatch<'b>>,
-    previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
-    rule_id: usize,
-    seen_matches: &mut FxHashSet<u64>,
-    _origin: &OriginSet,
-    ts_match: Option<&[u8]>,
-    is_base64: bool,
-    redact: bool,
-    filename: &str,
-    profiler: Option<&Arc<ConcurrentRuleProfiler>>,
-    inline_ignore_config: &InlineIgnoreConfig,
-) {
-    let mut timer =
-        profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
-
-    let initial_len = matches.len();
-
-    let blob_bytes = blob.bytes();
-    let default_slice = &blob_bytes[start..end];
-    let haystack = ts_match.unwrap_or(default_slice);
-
-    for captures in re.captures_iter(haystack) {
-        let full_capture = captures.get(0).unwrap();
-        let matching_input = captures.get(1).unwrap_or(full_capture);
-        let min_entropy = rule.min_entropy();
-        let mi_bytes = matching_input.as_bytes();
-        let full_bytes = full_capture.as_bytes();
-        let calculated_entropy = calculate_shannon_entropy(mi_bytes);
-        if calculated_entropy <= min_entropy
-            || is_safe_match(mi_bytes)
-            || is_user_match(mi_bytes, full_bytes)
-        {
-            debug!(
-                "Skipping match with entropy {} <= {} or safe match",
-                calculated_entropy, min_entropy
-            );
-            continue;
-        }
-        let matching_input_offset_span = OffsetSpan::from_range(
-            (start + matching_input.start())..(start + matching_input.end()),
-        );
-        if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
-            debug!("Skipping match due to inline ignore directive");
-            continue;
-        }
-        let match_key = compute_match_key(
-            matching_input.as_bytes(),
-            rule.id().as_bytes(),
-            matching_input_offset_span.start,
-            matching_input_offset_span.end,
-        );
-        if !seen_matches.insert(match_key) {
-            continue;
-        }
-        if !record_match(previous_matches, rule_id, matching_input_offset_span) {
-            continue;
-        }
-        let only_matching_input =
-            &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
-        let groups = SerializableCaptures::from_captures(&captures, haystack, re, redact);
-        matches.push(BlobMatch {
-            rule: Arc::clone(&rule),
-            blob_id: blob.id_ref(),
-            matching_input: only_matching_input,
-            matching_input_offset_span,
-            captures: groups,
-            validation_response_body: String::new(),
-            validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
-            validation_success: false,
-            calculated_entropy,
-            is_base64,
-        });
-    }
-    if let Some(t) = timer.take() {
-        let new_count = (matches.len() - initial_len) as u64;
-        t.end(new_count > 0, new_count, 0);
-    }
-}
-fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, String>)> {
-    match lang.to_lowercase().as_str() {
-        "bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())),
-        "c" => Some((Language::C, parser::queries::c::get_c_queries())),
-        "c#" | "csharp" => Some((Language::CSharp, parser::queries::csharp::get_csharp_queries())),
-        "c++" | "cpp" => Some((Language::Cpp, parser::queries::cpp::get_cpp_queries())),
-        "css" => Some((Language::Css, parser::queries::css::get_css_queries())),
-        "go" => Some((Language::Go, parser::queries::go::get_go_queries())),
-        "html" => Some((Language::Html, parser::queries::html::get_html_queries())),
-        "java" => Some((Language::Java, parser::queries::java::get_java_queries())),
-        "javascript" | "js" => {
-            Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries()))
-        }
-        // "kotlin" => Some((
-        //     Language::Kotlin,
-        //     parser::queries::kotlin::get_kotlin_queries(),
-        // )),
-        "php" => Some((Language::Php, parser::queries::php::get_php_queries())),
-        "python" | "py" | "starlark" => {
-            Some((Language::Python, parser::queries::python::get_python_queries()))
-        }
-        "ruby" => Some((Language::Ruby, parser::queries::ruby::get_ruby_queries())),
-        "rust" => Some((Language::Rust, parser::queries::rust::get_rust_queries())),
-        "toml" => Some((Language::Toml, parser::queries::toml::get_toml_queries())),
-        "typescript" | "ts" => {
-            Some((Language::TypeScript, parser::queries::typescript::get_typescript_queries()))
-        }
-        "yaml" => Some((Language::Yaml, parser::queries::yaml::get_yaml_queries())),
-        _ => None,
-    }
-}
-// -------------------------------------------------------------------------------------------------
-// MatchStats
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Default, Clone)]
-pub struct MatcherStats {
-    pub blobs_seen: u64,
-    pub blobs_scanned: u64,
-    pub bytes_seen: u64,
-    pub bytes_scanned: u64,
-    // #[cfg(feature = "rule_profiling")]
-    // pub rule_stats: crate::rule_profiling::RuleProfile,
-}
-impl MatcherStats {
-    pub fn update(&mut self, other: &Self) {
-        self.blobs_seen += other.blobs_seen;
-        self.blobs_scanned += other.blobs_scanned;
-        self.bytes_seen += other.bytes_seen;
-        self.bytes_scanned += other.bytes_scanned;
-
-        // #[cfg(feature = "rule_profiling")]
-        // self.rule_stats.update(&other.rule_stats);
-    }
-}
-// -------------------------------------------------------------------------------------------------
-// Group
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)]
-pub struct Group(pub Base64BString);
-impl Group {
-    pub fn new(m: regex::bytes::Match<'_>) -> Self {
-        Self(Base64BString(BString::from(m.as_bytes())))
-    }
-}
-// -------------------------------------------------------------------------------------------------
-// Groups
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
-pub struct Groups(pub SmallVec<[Group; 1]>);
-impl JsonSchema for Groups {
-    fn schema_name() -> String {
-        "Groups".to_string()
-    }
-
-    fn json_schema(gen: &mut SchemaGenerator) -> Schema {
-        let group_schema = gen.subschema_for::<Group>();
-        Schema::Object(schemars::schema::SchemaObject {
-            instance_type: Some(InstanceType::Array.into()),
-            array: Some(Box::new(ArrayValidation {
-                items: Some(group_schema.into()),
-                ..Default::default()
-            })),
-            ..Default::default()
-        })
-    }
-}
-// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
-// pub struct SerializableCapture {
-//     pub name: Option<String>, // Named group (if available)
-//     pub match_number: i32,
-//     pub start: usize,  // Start position of the match
-//     pub end: usize,    // End position of the match
-//     pub value: String, // The actual captured value
-// }
-#[derive(Debug, Clone, Serialize, JsonSchema)]
-pub struct SerializableCapture {
-    pub name: Option<String>,
-    pub match_number: i32,
-    pub start: usize,
-    pub end: usize,
-    /// Interned value of the capture.
-    pub value: &'static str,
-}
-
-#[derive(Debug, Clone, Serialize, JsonSchema)]
-pub struct SerializableCaptures {
-    #[schemars(with = "Vec<SerializableCapture>")]
-    pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
-}
-impl SerializableCaptures {
-    pub fn from_captures(
-        captures: &regex::bytes::Captures,
-        _input: &[u8],
-        re: &Regex,
-        redact: bool,
-    ) -> Self {
-        let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
-        // Process named captures
-        for name in re.capture_names().flatten() {
-            if let Some(capture) = captures.name(name) {
-                let value = if redact {
-                    redact_value(&String::from_utf8_lossy(capture.as_bytes()))
-                } else {
-                    String::from_utf8_lossy(capture.as_bytes()).to_string()
-                };
-                serialized_captures.push(SerializableCapture {
-                    name: Some(name.to_string()),
-                    match_number: -1,
-                    start: capture.start(),
-                    end: capture.end(),
-                    value: intern(&value),
-                });
-            }
-        }
-        // Process unnamed captures (numbered groups)
-        for i in 0..captures.len() {
-            if let Some(capture) = captures.get(i) {
-                let value = if redact {
-                    redact_value(&String::from_utf8_lossy(capture.as_bytes()))
-                } else {
-                    String::from_utf8_lossy(capture.as_bytes()).to_string()
-                };
-                serialized_captures.push(SerializableCapture {
-                    name: None,
-                    match_number: i32::try_from(i).unwrap_or(0),
-                    start: capture.start(),
-                    end: capture.end(),
-                    value: intern(&value),
-                });
-            }
-        }
-        SerializableCaptures { captures: serialized_captures }
-    }
-}
-// -------------------------------------------------------------------------------------------------
-// Match
-// -------------------------------------------------------------------------------------------------
-#[derive(Debug, Clone, Serialize, JsonSchema)]
-pub struct Match {
-    /// The location of the entire matching content
-    pub location: Location,
-
-    /// The capture groups
-    pub groups: SerializableCaptures, // Store serialized captures
-
-    /// unique identifier of file / blob where this match was found
-    pub blob_id: BlobId,
-
-    /// The unique content-based identifier of this match
-    pub finding_fingerprint: u64,
-
-    /// The rule that produced this match
-    #[serde(skip_serializing)]
-    #[schemars(skip)]
-    pub rule: Arc<Rule>,
-
-    /// Validation Body
-    pub validation_response_body: String,
-
-    /// Validation Status Code
-    pub validation_response_status: u16,
-
-    /// Validation Success
-    pub validation_success: bool,
-
-    /// Validation Success
-    pub calculated_entropy: f32,
-
-    pub visible: bool,
-    #[serde(default)]
-    pub is_base64: bool,
-}
-impl Match {
-    #[inline]
-    pub fn convert_owned_blobmatch_to_match<'a>(
-        loc_mapping: Option<&'a LocationMapping<'a>>,
-        owned_blob_match: &'a OwnedBlobMatch,
-        origin_type: &'a str,
-    ) -> Self {
-        let offset_span = owned_blob_match.matching_input_offset_span;
-        // Extract the matched secret content. Use capture group 1 if it exists, otherwise fall back
-        // to group 0.
-        let matching_finding_bytes = owned_blob_match
-            .captures
-            .captures
-            .get(1)
-            .or_else(|| owned_blob_match.captures.captures.get(0))
-            .map(|capture| capture.value.as_bytes())
-            .unwrap_or_default();
-
-        // The fingerprint will be based on the content of the secret.
-        let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
-
-        let source_span =
-            loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan {
-                start: SourcePoint { line: 0, column: 0 },
-                end: SourcePoint { line: 0, column: 0 },
-            });
-        let offset_start: u64 =
-            owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
-        let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
-
-        let finding_fingerprint = compute_finding_fingerprint(
-            finding_value_for_fp,
-            origin_type, // file_or_commit,
-            offset_start,
-            offset_end,
-        );
-
-        // matching_snippet
-        Match {
-            rule: owned_blob_match.rule.clone(),
-            visible: owned_blob_match.rule.visible().to_owned(),
-            location: Location { offset_span, source_span: source_span.clone() },
-            groups: owned_blob_match.captures.clone(),
-            blob_id: owned_blob_match.blob_id,
-            finding_fingerprint,
-            validation_response_body: owned_blob_match.validation_response_body.clone(),
-            validation_response_status: owned_blob_match.validation_response_status.as_u16(),
-            validation_success: owned_blob_match.validation_success,
-            calculated_entropy: owned_blob_match.calculated_entropy,
-            is_base64: owned_blob_match.is_base64,
-        }
-    }
-
-    /// Returns the `blob_id` of the match.
-    pub fn get_blob_id(&self) -> BlobId {
-        self.blob_id.clone()
-    }
-
-    pub fn finding_id(&self) -> String {
-        let mut buffer = Vec::with_capacity(128);
-        buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes());
-        buffer.push(0);
-        serde_json::to_writer(&mut buffer, &self.groups)
-            .expect("should be able to serialize groups as JSON");
-        let mut num = xxh3_64(&buffer);
-        // Ensure the number is positive and within i64 range
-        num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive
-                                      // Convert to string
-        num.to_string()
-    }
-}
-#[derive(Debug, Clone)]
-pub struct DecodedData {
-    pub original: String,
-    pub decoded: String,
-    pub pos_start: usize,
-    pub pos_end: usize,
-}
-#[inline]
-fn is_base64_byte(b: u8) -> bool {
-    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/')
-}
-
-pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
-    let mut results = Vec::new();
-    let mut i = 0;
-    while i < input.len() {
-        while i < input.len() && !is_base64_byte(input[i]) {
-            i += 1;
-        }
-        let start = i;
-        while i < input.len() && is_base64_byte(input[i]) {
-            i += 1;
-        }
-
-        let mut eq_count = 0;
-        while i < input.len() && input[i] == b'=' && eq_count < 2 {
-            i += 1;
-            eq_count += 1;
-        }
-        let end = i;
-
-        let len = end - start;
-        if len >= 32 && len % 4 == 0 {
-            let base64_slice = &input[start..end];
-            if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) {
-                if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
-                    if decoded_str.is_ascii() {
-                        results.push(DecodedData {
-                            original: String::from_utf8_lossy(base64_slice).into_owned(),
-                            decoded: decoded_str.to_string(),
-                            pos_start: start,
-                            pos_end: end,
-                        });
-                    }
-                }
-            }
-        }
-    }
-
-    results
-}
-
-pub fn compute_finding_fingerprint(
-    finding_value: &str,
-    file_or_commit: &str,
-    offset_start: u64,
-    offset_end: u64,
-) -> u64 {
-    // Combine all into a byte buffer and hash it directly:
-    let mut buf = Vec::with_capacity(
-        finding_value.len() + file_or_commit.len() + 2 * std::mem::size_of::<u64>(),
-    );
-    buf.extend_from_slice(finding_value.as_bytes());
-    buf.extend_from_slice(file_or_commit.as_bytes());
-    buf.extend_from_slice(&offset_start.to_le_bytes());
-    buf.extend_from_slice(&offset_end.to_le_bytes());
-
-    xxh3_64(&buf)
-}
-
-// -------------------------------------------------------------------------------------------------
-// test
-// -------------------------------------------------------------------------------------------------
-#[cfg(test)]
-mod test {
-    use std::{collections::BTreeMap, path::PathBuf};
-
-    use pretty_assertions::assert_eq;
-    // ---------------------------------------------------------------------
-    // proptest: raw-match dedup + entropy gate
-    // ---------------------------------------------------------------------
-    use proptest::prelude::*;
-
-    use super::*;
-    use crate::{
-        blob::{Blob, BlobIdMap},
-        origin::{Origin, OriginSet},
-        rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation},
-    };
-
-    proptest! {
-        #[test]
-        fn prop_no_dupes_and_entropy(
-            // random ASCII up to 300 bytes
-            mut noise in proptest::collection::vec(any::<u8>().prop_filter("ascii", |b| b.is_ascii()), 0..300),
-            // 0-4 random insertion points
-            inserts in proptest::collection::vec(0usize..300, 0..5)
-        ) {
-            // Constant high-entropy secret token that matches the rule below
-            const TOKEN: &[u8] = b"secret_abcd1234";
-
-            // Splice the token at the requested offsets
-            for &idx in &inserts {
-                let pos = idx.min(noise.len());
-                noise.splice(pos..pos, TOKEN.iter().copied());
-            }
-
-            // ── build a single test rule ──────────────────────────────────
-            use crate::rules::rule::{RuleSyntax, Validation, Confidence};
-
-            let rule = Rule::new(RuleSyntax {
-                id: "prop.secret".into(),
-                name: "prop secret".into(),
-                pattern: "secret_[a-z]{4}[0-9]{4}".into(),
-                confidence: Confidence::Low,
-                min_entropy: 3.0,
-                visible: true,
-                examples: vec![],
-                negative_examples: vec![],
-                references: vec![],
-                validation: None::<Validation>,          // no HTTP validation needed
-                depends_on_rule: vec![],
-            });
-
-            let rules_db  = RulesDatabase::from_rules(vec![rule]).unwrap();
-            let seen      = BlobIdMap::new();
-            let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
-            let mut m     = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false).unwrap();
-
-            // ── run the scan ──────────────────────────────────────────────
-            m.scan_bytes_raw(&noise, "buf").unwrap();
-
-            // ── property 1: dedup – each (rule,start,end) is unique ──────
-
-            let mut coords = FxHashSet::default();
-            for RawMatch{rule_id, start_idx, end_idx} in &m.user_data.raw_matches_scratch {
-                assert!(
-                    coords.insert((*rule_id, *start_idx, *end_idx)),
-                    "duplicate raw-match detected for coords ({rule_id},{start_idx},{end_idx})"
-                );
-
-                // ── property 2: entropy gate held ────────────────────────
-                let slice = &noise[*start_idx as usize .. *end_idx as usize];
-                let ent   = calculate_shannon_entropy(slice);
-                assert!(ent > 3.0, "entropy {ent} ≤ min_entropy, gate failed");
-            }
-        }
-    }
-
-    #[test]
-    pub fn test_simple() -> Result<()> {
-        let rules = vec![Rule::new(RuleSyntax {
-            id: "test.1".to_string(),
-            name: "test".to_string(),
-            pattern: "test".to_string(),
-            confidence: crate::rules::rule::Confidence::Medium,
-            min_entropy: 1.0,
-            visible: true,
-            examples: vec![],
-            negative_examples: vec![],
-            references: vec![],
-            validation: Some(Validation::Http(HttpValidation {
-                request: HttpRequest {
-                    method: "GET".to_string(),
-                    url: "https://example.com".to_string(),
-                    headers: BTreeMap::new(),
-                    body: None,
-                    response_matcher: Some(vec![]),
-                    multipart: None,
-                    response_is_html: false,
-                },
-                multipart: None,
-            })),
-            depends_on_rule: vec![
-                Some(DependsOnRule {
-                    rule_id: "d8f3c34b-015f-4cd6-b411-b1366493104c".to_string(),
-                    variable: "email".to_string(),
-                }),
-                Some(DependsOnRule {
-                    rule_id: "8910f364-7718-4a27-a435-d2da13e6ba9e".to_string(),
-                    variable: "domain".to_string(),
-                }),
-            ],
-        })];
-        let rules_db = RulesDatabase::from_rules(rules)?;
-        let input = "some test data for vectorscan";
-        let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
-        let enable_rule_profiling = true;
-        // let mut matcher = Matcher::new(&rules_db, &seen_blobs, None,
-        // enable_rule_profiling)?;
-        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
-        let mut matcher = Matcher::new(
-            &rules_db,
-            scanner_pool,
-            &seen_blobs,
-            None,
-            enable_rule_profiling,
-            None, // Pass the shared profiler
-            false,
-        )?;
-        matcher.scan_bytes_raw(input.as_bytes(), "fname")?;
-        assert_eq!(
-            matcher.user_data.raw_matches_scratch,
-            vec![RawMatch { rule_id: 0, start_idx: 0, end_idx: 9 },]
-        );
-        Ok(())
-    }
-
-    // ---------------------------------------------------------------------
-    // additional deterministic unit-tests
-    // ---------------------------------------------------------------------
-
-    /// `get_base64_strings` should recognise a well-formed token, decode it,
-    /// and report correct byte-offsets.
-    #[test]
-    fn test_get_base64_strings_basic() {
-        let raw = b"foo MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY= bar";
-        // decodes to "0123456789abcdef0123456789abcdef"
-        let hits = get_base64_strings(raw);
-        assert_eq!(hits.len(), 1);
-        let item = &hits[0];
-        assert_eq!(item.decoded, "0123456789abcdef0123456789abcdef");
-        assert_eq!(item.original, "MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY=");
-        // "foo␠" is 4 bytes, so the start offset is 4
-        assert_eq!((item.pos_start, item.pos_end), (4, 4 + item.original.len()));
-    }
-
-    /// `compute_finding_fingerprint` must be stable (same input ⇒ same output)
-    /// and sensitive to any input component.
-    #[test]
-    fn test_finding_fingerprint_stability_and_uniqueness() {
-        let a = compute_finding_fingerprint("secret", "fileA", 0, 6);
-        let b = compute_finding_fingerprint("secret", "fileA", 0, 6);
-        assert_eq!(a, b, "fingerprint should be deterministic");
-
-        // changing any parameter should perturb the hash
-        let c = compute_finding_fingerprint("secret", "fileA", 1, 7); // offsets differ
-        let d = compute_finding_fingerprint("secret", "fileB", 0, 6); // file id differs
-        let e = compute_finding_fingerprint("different", "fileA", 0, 6); // content differs
-        assert_ne!(a, c);
-        assert_ne!(a, d);
-        assert_ne!(a, e);
-    }
-
-    /// The (private) `compute_match_key` helper is the linchpin of the raw-dedup
-    /// path.  It should return identical keys for identical inputs and different
-    /// keys as soon as *anything* changes.
-    #[test]
-    fn test_compute_match_key_uniqueness() {
-        use super::compute_match_key;
-
-        let k1 = compute_match_key(b"abc", b"rule-1", 0, 3);
-        let k2 = compute_match_key(b"abc", b"rule-1", 0, 3);
-        assert_eq!(k1, k2);
-
-        // mutate each component in turn
-        let diff_content = compute_match_key(b"abcd", b"rule-1", 0, 4);
-        let diff_rule = compute_match_key(b"abc", b"rule-2", 0, 3);
-        let diff_span = compute_match_key(b"abc", b"rule-1", 1, 4);
-        assert_ne!(k1, diff_content);
-        assert_ne!(k1, diff_rule);
-        assert_ne!(k1, diff_span);
-    }
-
-    /// Running `scan_bytes_raw` twice over the *same* input should never record
-    /// duplicate entries in `raw_matches_scratch`.
-    #[test]
-    fn test_scan_bytes_raw_no_duplicate_raw_matches() -> Result<()> {
-        // simple rule: literal "dup"
-        let rule = Rule::new(RuleSyntax {
-            id: "dup.check".into(),
-            name: "dup".into(),
-            pattern: "dup".into(),
-            confidence: crate::rules::rule::Confidence::Low,
-            min_entropy: 0.0,
-            visible: true,
-            examples: vec![],
-            negative_examples: vec![],
-            references: vec![],
-            validation: None::<Validation>,
-            depends_on_rule: vec![],
-        });
-
-        let rules_db = RulesDatabase::from_rules(vec![rule])?;
-        let seen = BlobIdMap::new();
-        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
-        let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
-
-        let buf = b"dup dup"; // two literal hits, same rule
-
-        // first scan
-        m.scan_bytes_raw(buf, "buf1")?;
-        let first_len = m.user_data.raw_matches_scratch.len();
-
-        // second scan over the same buffer
-        m.scan_bytes_raw(buf, "buf1")?;
-        let second_len = m.user_data.raw_matches_scratch.len();
-
-        // we should still only have two unique raw matches recorded
-        assert_eq!(first_len, 2);
-        assert_eq!(second_len, 2);
-        Ok(())
-    }
-
-    #[test]
-    fn inline_comment_skips_match() -> Result<()> {
-        let rule = Rule::new(RuleSyntax {
-            id: "inline.ignore".into(),
-            name: "inline".into(),
-            pattern: "secret_token".into(),
-            confidence: crate::rules::rule::Confidence::Low,
-            min_entropy: 0.0,
-            visible: true,
-            examples: vec![],
-            negative_examples: vec![],
-            references: vec![],
-            validation: None::<Validation>,
-            depends_on_rule: vec![],
-        });
-        let rules_db = RulesDatabase::from_rules(vec![rule])?;
-        let seen = BlobIdMap::new();
-        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
-        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
-
-        let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec());
-        let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt")));
-
-        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
-            ScanResult::New(matches) => assert!(matches.is_empty()),
-            _ => panic!("unexpected scan result"),
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn inline_comment_after_multiline_secret_skips_match() -> Result<()> {
-        let rule = Rule::new(RuleSyntax {
-            id: "inline.multiline".into(),
-            name: "inline multiline".into(),
-            pattern: "line1\\s+line2".into(),
-            confidence: crate::rules::rule::Confidence::Low,
-            min_entropy: 0.0,
-            visible: true,
-            examples: vec![],
-            negative_examples: vec![],
-            references: vec![],
-            validation: None::<Validation>,
-            depends_on_rule: vec![],
-        });
-        let rules_db = RulesDatabase::from_rules(vec![rule])?;
-        let seen = BlobIdMap::new();
-        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
-        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
-
-        let blob = Blob::from_bytes(
-            br#"let data = """
-line1
-line2
-"""
-# kingfisher:ignore
-"#
-            .to_vec(),
-        );
-        let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt")));
-
-        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
-            ScanResult::New(matches) => assert!(matches.is_empty()),
-            _ => panic!("unexpected scan result"),
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn compat_flag_controls_external_directives() -> Result<()> {
-        let rule = Rule::new(RuleSyntax {
-            id: "inline.compat".into(),
-            name: "inline compat".into(),
-            pattern: "supersecret123".into(),
-            confidence: crate::rules::rule::Confidence::Low,
-            min_entropy: 0.0,
-            visible: true,
-            examples: vec![],
-            negative_examples: vec![],
-            references: vec![],
-            validation: None::<Validation>,
-            depends_on_rule: vec![],
-        });
-        let rules_db = RulesDatabase::from_rules(vec![rule])?;
-
-        let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec());
-        let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt")));
-
-        let seen = BlobIdMap::new();
-        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
-        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
-        let matches_without_compat =
-            match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
-                ScanResult::New(matches) => matches.len(),
-                _ => panic!("unexpected scan result"),
-            };
-        assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag");
-
-        let seen = BlobIdMap::new();
-        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
-        let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true)?;
-        match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
-            ScanResult::New(matches) => assert!(matches.is_empty()),
-            _ => panic!("unexpected scan result"),
-        }
-
-        Ok(())
-    }
-}
diff --git a/src/matcher.rs.rej b/src/matcher.rs.rej
deleted file mode 100644
index dc61305..0000000
--- a/src/matcher.rs.rej
+++ /dev/null
@@ -1,52 +0,0 @@
-@@ -1,50 +1,51 @@
- use std::{
-     hash::{Hash, Hasher},
-     str,
-     sync::{Arc, Mutex},
- };
- 
- use anyhow::Result;
- use base64::{engine::general_purpose, Engine};
- use bstr::BString;
- use http::StatusCode;
- use regex::bytes::Regex;
- use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
- use schemars::{
-     gen::SchemaGenerator,
-     schema::{ArrayValidation, InstanceType, Schema},
-     JsonSchema,
- };
- use serde::{Deserialize, Serialize};
- use smallvec::SmallVec;
- use tracing::debug;
- use xxhash_rust::xxh3::xxh3_64;
- 
- use crate::{
-     blob::{Blob, BlobId, BlobIdMap},
-     entropy::calculate_shannon_entropy,
-+    inline_ignore::InlineIgnoreConfig,
-     location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
-     origin::OriginSet,
-     parser,
-     parser::{Checker, Language},
-     rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
-     rules::rule::Rule,
-     rules_database::RulesDatabase,
-     safe_list::{is_safe_match, is_user_match},
-     scanner_pool::ScannerPool,
-     snippet::Base64BString,
-     util::{intern, redact_value},
- };
- 
- const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
- const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
- const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
- const TREE_SITTER_SCAN_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs ≤64 KiB
- 
- // -------------------------------------------------------------------------------------------------
- // RawMatch
- // -------------------------------------------------------------------------------------------------
- /// A raw match, as recorded by a callback to Vectorscan.
- ///
- /// When matching with Vectorscan, we simply collect all matches into a
- /// preallocated `Vec`, and then go through them all after scanning is complete.