From caf766b731985d3eb0e86a1f0db847f47ef90be8 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 9 Oct 2025 17:59:10 -0700 Subject: [PATCH 1/2] - Added kingfisher:ignore (or kingfisher:allow) to silence a finding inline within a file - Added: to reuse existing inline directives from other scanners, pass --compat-ignore-comments to also accept NOSONAR, kics-scan ignore, gitleaks:allow and trufflehog:ignore --- CHANGELOG.md | 4 + README.md | 18 +- README.md.orig | 1036 ++++++++++++++++++++++++++ src/cli/commands/scan.rs | 4 + src/inline_ignore.rs | 342 +++++++++ src/lib.rs | 1 + src/main.rs | 1 + src/matcher.rs | 138 +++- src/matcher.rs.orig | 1317 +++++++++++++++++++++++++++++++++ src/matcher.rs.rej | 52 ++ src/reporter.rs | 1 + src/reporter/json_format.rs | 1 + src/scanner/enumerate.rs | 1 + src/scanner/repos.rs | 1 + tests/int_allowlist.rs | 1 + tests/int_bitbucket.rs | 1 + tests/int_dedup.rs | 1 + tests/int_github.rs | 1 + tests/int_gitlab.rs | 2 + tests/int_redact.rs | 1 + tests/int_slack.rs | 2 + tests/int_validation_cache.rs | 1 + tests/int_vulnerable_files.rs | 2 + 23 files changed, 2924 insertions(+), 5 deletions(-) create mode 100644 README.md.orig create mode 100644 src/inline_ignore.rs create mode 100644 src/matcher.rs.orig create mode 100644 src/matcher.rs.rej diff --git a/CHANGELOG.md b/CHANGELOG.md index 1278533..03d4aca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ All notable changes to this project will be documented in this file. 
+## [Unreleased] +- Added `kingfisher:ignore` (or `kingfisher:allow`) inline directives to silence a finding from within a file +- Added `--compat-ignore-comments` to also honor inline directives from other scanners: `NOSONAR`, `kics-scan ignore`, `gitleaks:allow`, and `trufflehog:ignore` + ## [v1.56.0] - Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined diff --git a/README.md b/README.md index 14faccf..2e3bfd3 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [Notable Scan Options](#notable-scan-options) - [Understanding `--confidence`](#understanding---confidence) - [Ignore known false positives](#ignore-known-false-positives) + - [Inline ignore directives](#inline-ignore-directives) - [Finding Fingerprint](#finding-fingerprint) - [Rule Performance Profiling](#rule-performance-profiling) - [CLI Options](#cli-options) @@ -962,6 +963,7 @@ leaves the default unchanged. - `--manage-baseline`: Create or update the baseline file with current findings - `--skip-regex `: Ignore findings whose text matches this regex (repeatable) - `--skip-word `: Ignore findings containing this case-insensitive word (repeatable) +- `--compat-ignore-comments`: Honor inline directives from other scanners (treat `NOSONAR`, `kics-scan ignore`, `gitleaks:allow`, and `trufflehog:ignore` like native suppressions) ## Understanding `--confidence` The `--confidence` flag sets a minimum confidence threshold, not an exact match. @@ -972,7 +974,21 @@ The `--confidence` flag sets a minimum confidence threshold, not an exact match. ### Ignore known false positives -Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. 
Both flags may be provided multiple times and are tested against the secret value **and** the full match context. + +### Inline ignore directives + +Add `kingfisher:ignore` (or `kingfisher:allow`) to a trailing comment on the same line as a finding to silence it. Multi-line strings may also be ignored by placing the directive on the closing delimiter line **or** on the next logical line after the string: + +```python +API_KEY = """ +line 1 +line 2 +""" +# kingfisher:ignore +``` + +Kingfisher recognizes comment markers for the host language, including `#`, `//`, `/* */`, `--`, and `*`-prefixed block comments, so you can use this suppression style across languages. To reuse existing inline directives from other scanners, pass `--compat-ignore-comments` to also accept `NOSONAR`, `kics-scan ignore`, `gitleaks:allow` and `trufflehog:ignore`. With `--skip-regex`, these should be Rust compatible regular expressions, which you can test out at [regex101](https://regex101.com) diff --git a/README.md.orig b/README.md.orig new file mode 100644 index 0000000..f690e3d --- /dev/null +++ b/README.md.orig @@ -0,0 +1,1036 @@ +# Kingfisher + +

+ Kingfisher Logo + +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production.

+ +Originally forked from Praetorian’s Nosey Parker, Kingfisher **adds** live cloud-API validation; many more targets (GitLab, BitBucket, Gitea, S3, Docker, Jira, Confluence, Slack); compressed-file extraction and scanning; baseline and allowlist controls; language-aware detection (~20 languages); and a native Windows binary. See [Origins and Divergence](#origins-and-divergence) for details. +## Key Features + +### Multiple Scan Targets +
+ +| Files / Dirs | Local Git | GitHub | GitLab | Azure DevOps | Bitbucket | Gitea | +|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:| +| Files / Dirs
Files / Dirs | Local Git
Local Git | GitHub
GitHub | GitLab
GitLab | Azure DevOps
Azure DevOps | Bitbucket
Bitbucket | Gitea
Gitea | + +| Docker | Jira | Confluence | Slack | AWS S3 | +|:------:|:----:|:-----------:|:-----:|:------:| +| Docker
Docker | Jira
Jira | Confluence
Confluence | Slack
Slack | AWS S3
AWS S3 | + +
+ +### Performance, Accuracy, and Hundreds of Rules +- **Performance**: multithreaded, Hyperscan‑powered scanning built for huge codebases +- **Extensible rules**: hundreds of built-in detectors plus YAML-defined custom rules ([docs/RULES.md](/docs/RULES.md)) +- **Broad AI SaaS coverage**: finds and validates tokens for OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and many more +- **Compressed Files**: Supports extracting and scanning compressed files for secrets +- **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) + +**Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) + +# Benchmark Results + +See ([docs/COMPARISON.md](docs/COMPARISON.md)) + +

+ Kingfisher Runtime Comparison +

+ +- [Kingfisher](#kingfisher) + - [Key Features](#key-features) + - [Multiple Scan Targets](#multiple-scan-targets) + - [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules) +- [Benchmark Results](#benchmark-results) +- [Getting Started](#getting-started) + - [Installation](#installation) + - [Run Kingfisher in Docker](#run-kingfisher-in-docker) +- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance) + - [Write Custom Rules!](#write-custom-rules) +- [Usage](#usage) + - [Basic Examples](#basic-examples) + - [Scan with secret validation](#scan-with-secret-validation) + - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories) + - [Scan a Git repository without validation](#scan-a-git-repository-without-validation) + - [Display only secrets confirmed active by third‑party APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis) + - [Output JSON and capture to a file](#output-json-and-capture-to-a-file) + - [Output SARIF directly to disk](#output-sarif-directly-to-disk) + - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--) + - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size) + - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag) + - [Display rule performance statistics](#display-rule-performance-statistics) + - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files) + - [Exclude specific paths](#exclude-specific-paths) + - [Scan changes in CI pipelines](#scan-changes-in-ci-pipelines) + - [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket) + - [ Scanning Docker Images](#-scanning-docker-images) + - [ Scanning GitHub](#-scanning-github) + - [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token) + - [Skip specific GitHub repositories 
during enumeration](#skip-specific-github-repositories-during-enumeration) + - [Scan remote GitHub repository](#scan-remote-github-repository) + - [ Scanning GitLab](#-scanning-gitlab) + - [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token) + - [Scan GitLab user](#scan-gitlab-user) + - [Skip specific GitLab projects during enumeration](#skip-specific-gitlab-projects-during-enumeration) + - [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url) + - [List GitLab repositories](#list-gitlab-repositories) + - [ Scanning Azure Repos](#-scanning-azure-repos) + - [Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-devops-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat) + - [Scan specific Azure DevOps projects](#scan-specific-azure-devops-projects) + - [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration) + - [List Azure repositories](#list-azure-repositories) + - [ Scanning Gitea](#-scanning-gitea) + - [Scan Gitea organization (requires `KF_GITEA_TOKEN`)](#scan-gitea-organization-requires-kf_gitea_token) + - [Scan Gitea user](#scan-gitea-user) + - [Skip specific Gitea repositories during enumeration](#skip-specific-gitea-repositories-during-enumeration) + - [Scan remote Gitea repository by URL](#scan-remote-gitea-repository-by-url) + - [List Gitea repositories](#list-gitea-repositories) + - [ Scanning Bitbucket](#-scanning-bitbucket) + - [Scan Bitbucket workspace](#scan-bitbucket-workspace) + - [Scan Bitbucket user](#scan-bitbucket-user) + - [Skip specific Bitbucket repositories during enumeration](#skip-specific-bitbucket-repositories-during-enumeration) + - [Scan remote Bitbucket repository by URL](#scan-remote-bitbucket-repository-by-url) + - [List Bitbucket repositories](#list-bitbucket-repositories) + - [Authenticate to Bitbucket](#authenticate-to-bitbucket) + - [Self-hosted 
Bitbucket Server](#self-hosted-bitbucket-server) + - [ Scanning Jira](#-scanning-jira) + - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query) + - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues) + - [ Scanning Confluence](#-scanning-confluence) + - [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query) + - [ Scanning Slack](#-scanning-slack) + - [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query) + - [Environment Variables for Tokens](#environment-variables-for-tokens) + - [Exit Codes](#exit-codes) + - [Update Checks](#update-checks) +- [Advanced Options](#advanced-options) + - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets) + - [List Builtin Rules](#list-builtin-rules) + - [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run) + - [To add your rules alongside the built‑ins:](#to-add-your-rules-alongside-the-builtins) + - [Other Examples](#other-examples) + - [Customize the HTTP User-Agent](#customize-the-http-user-agent) + - [Notable Scan Options](#notable-scan-options) + - [Understanding `--confidence`](#understanding---confidence) + - [Ignore known false positives](#ignore-known-false-positives) + - [Inline ignore directives](#inline-ignore-directives) + - [Finding Fingerprint](#finding-fingerprint) + - [Rule Performance Profiling](#rule-performance-profiling) + - [CLI Options](#cli-options) + - [Origins and Divergence](#origins-and-divergence) +- [Roadmap](#roadmap) +- [License](#license) + +# Getting Started +## Installation + +On macOS, you can simply + +```bash +brew install kingfisher +``` + +Pre-built binaries are also available on the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page. 
+ +You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform: + +```bash +# Linux, macOS +curl --silent --location \ + https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \ + sh && \ + ubi --project mongodb/kingfisher --in "$HOME/bin" +``` + +```powershell +# Windows +powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in . +``` + +This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like +systems (or the current directory on Windows). + +Or you may compile for your platform via `make`: + +```bash +# NOTE: Requires Docker +make linux + +# macOS --- must build from a macOS host +make darwin + +# Windows x64 --- requires building from a Windows host with Visual Studio installed +./buildwin.bat -force +``` + +```bash +# Build all targets +make linux-all # builds both x64 and arm64 +make darwin-all # builds both x64 and arm64 +make all # builds for every OS and architecture supported +``` + +### Run Kingfisher in Docker + +Run the dockerized Kingfisher container: +```bash +# GitHub Container Registry +docker run --rm ghcr.io/mongodb/kingfisher:latest --version + +# Scan the current working directory +# (mounts your code at /src and scans it) +docker run --rm \ + -v "$PWD":/src \ + ghcr.io/mongodb/kingfisher:latest scan /src + + +# Scan while providing a GitHub token +# Mounts your working dir at /proj and passes in the token: +docker run --rm \ + -e KF_GITHUB_TOKEN=ghp_… \ + -v "$PWD":/proj \ + ghcr.io/mongodb/kingfisher:latest \ + scan --git-url https://github.com/org/private_repo.git + +# Scan an S3 bucket +# Credentials can come from KF_AWS_KEY/KF_AWS_SECRET, --role-arn, or --aws-local-profile +docker run --rm \ + -e KF_AWS_KEY=AKIA... \ + -e KF_AWS_SECRET=g5nYW... 
\ + ghcr.io/mongodb/kingfisher:latest \ + scan --s3-bucket bucket-name + + +# Scan and write a JSON report locally +# Here we: +# 1. Mount $PWD → /proj +# 2. Tell Kingfisher to write findings.json inside /proj/reports +# 3. Ensure ./reports exists on your host so Docker can mount it +mkdir -p reports + +# run and output into host’s ./reports directory +docker run --rm \ + -v "$PWD":/proj \ + ghcr.io/mongodb/kingfisher:latest \ + scan /proj \ + --format json \ + --output /proj/reports/findings.json + + +# Tip: you can combine multiple mounts if you prefer separating source vs. output: +# Here /src is read‑only, and /out holds your generated reports +docker run --rm \ + -v "$PWD":/src:ro \ + -v "$PWD/reports":/out \ + ghcr.io/mongodb/kingfisher:latest \ + scan /src \ + --format json \ + --output /out/findings.json + +``` + +# 🔐 Detection Rules at a Glance + +Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview: + +| Category | What we catch | +|----------|---------------| +| **AI SaaS APIs** | OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, together.ai, Zhipu, and more | +| **Cloud Providers** | AWS, Azure, GCP, Alibaba Cloud, DigitalOcean, IBM Cloud, Cloudflare, and more | +| **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm, PyPI, and more | +| **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun, SendGrid, Mailchimp, and more | +| **Databases & Data Ops** | MongoDB Atlas, PlanetScale, Postgres DSNs, Grafana Cloud, Datadog, Dynatrace, and more | +| **Payments & Billing** | Stripe, PayPal, Square, GoCardless, and more | +| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more | +| **Misc. 
SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more | + +## Write Custom Rules! + +Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential. + +However, you may want to add your own custom rules, or modify a detection to better suit your needs / environment. + +First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules. + +Once you've done that, you can define your custom rules in a YAML file and pass that file to Kingfisher at runtime --- no recompiling required! + +# Usage + +## Basic Examples + +> **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory; no extra flags are required. + +### Scan with secret validation + +```bash +kingfisher scan /path/to/code +## NOTE: This path can refer to: +# 1. a local git repo +# 2. a directory with many git repos +# 3. or just a folder with files and subdirectories + +## To explicitly prevent scanning git commit history add: +# `--git-history=none` +``` + +### Scan a directory containing multiple Git repositories + +```bash +kingfisher scan /projects/mono‑repo‑dir +``` + +### Scan a Git repository without validation + +```bash +kingfisher scan ~/src/myrepo --no-validate +``` + +### Display only secrets confirmed active by third‑party APIs + +```bash +kingfisher scan /path/to/repo --only-valid +``` + +### Output JSON and capture to a file + +```bash +kingfisher scan . --format json | tee kingfisher.json +``` + +### Output SARIF directly to disk + +```bash +kingfisher scan /path/to/repo --format sarif --output findings.sarif +``` + +### Pipe any text directly into Kingfisher by passing `-` + +```bash +cat /path/to/file.py | kingfisher scan - + +``` + +### Limit maximum file size scanned (`--max-file-size`) + +By default, Kingfisher skips files larger than **256 MB**. 
You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**. + +```bash +# Scan files up to 500 mb in size +kingfisher scan /some/file --max-file-size 500 +``` + +### Scan using a rule _family_ with one flag + +_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_ + +```bash +# Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2) +kingfisher scan /path/to/repo --rule kingfisher.aws +``` + +### Display rule performance statistics + +```bash +kingfisher scan /path/to/repo --rule-stats +``` + +### Scan while ignoring likely test files + +`--exclude` skips any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) + +```bash +# Scan source but skip likely unit / integration tests +kingfisher scan ./my-project \ + --exclude='[Tt]est' \ + --exclude='spec' \ + --exclude='[Ff]ixture' \ + --exclude='example' \ + --exclude='sample' +``` + +### Exclude specific paths +```bash +# Skip all Python files and any directory named tests +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude '[Tt]ests' +``` + +### Scan changes in CI pipelines + +Limit scanning to the delta between your default branch and a pull request branch by combining `--since-commit` with `--branch` (defaults to `HEAD`). This only scans files that differ between the two references, which keeps CI runs fast while still blocking new secrets. + +```bash +kingfisher scan . \ + --since-commit origin/main \ + --branch "$CI_BRANCH" +``` + +When the branch under test is already checked out, `--branch HEAD` or omitting `--branch` entirely is sufficient. Kingfisher exits with `200` when any findings are discovered and `205` when validated secrets are present, allowing CI jobs to fail automatically if new credentials slip in. + +The same diff-focused workflow works when cloning repositories on the fly with `--git-url`. 
Kingfisher automatically tries remote-tracking names like `origin/main` and `origin/feature-1`, so you can target the branches involved in a pull request without performing a local checkout first. + +```bash +kingfisher scan \ + --git-url https://github.com/org/repo.git \ + --since-commit main \ + --branch development +``` + +In CI systems that expose the base and head commits explicitly, you can pass those SHAs directly while still using `--git-url`: + +```bash +kingfisher scan \ + --git-url git@github.com:org/repo.git \ + --since-commit "$BASE_COMMIT" \ + --branch "$PR_HEAD_COMMIT" +``` + +If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude): + +```bash +# Skip all Python files and any directory named tests, and report to stderr any skipped files +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude tests \ + -v +``` + +## GitHub Scanning an AWS S3 Bucket +You can scan S3 objects directly: + +```bash +kingfisher scan --s3-bucket bucket-name [--s3-prefix path/] +``` + +Credential resolution happens in this order: + +1. `KF_AWS_KEY` and `KF_AWS_SECRET` environment variables +2. `--aws-local-profile` pointing to a profile in `~/.aws/config` (works with AWS SSO) +3. anonymous access for public buckets + +If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role. + +Examples: + +```bash +# using explicit keys +export KF_AWS_KEY=AKIA... +export KF_AWS_SECRET=g5nYW... +kingfisher scan --s3-bucket some-example-bucket + +# Above can also be run as: +KF_AWS_KEY=AKIA... KF_AWS_SECRET=g5nYW... 
kingfisher scan --s3-bucket some-example-bucket + +# using a local profile (e.g., SSO) that exists in your AWS profile (~/.aws/config) +kingfisher scan --s3-bucket some-example-bucket --aws-local-profile default + +# anonymous scan of a bucket, while providing an object prefix to only scan subset of the s3 bucket +kingfisher scan \ + --s3-bucket awsglue-datasets \ + --s3-prefix examples/us-legislators/all + +# assuming a role when scanning +kingfisher scan --s3-bucket some-example-bucket \ + --role-arn arn:aws:iam::123456789012:role/MyRole + +# anonymous scan of a public bucket +kingfisher scan --s3-bucket some-example-bucket +``` + +Docker example: + +```bash +docker run --rm \ + -e KF_AWS_KEY=AKIA... \ + -e KF_AWS_SECRET=g5nYW... \ + ghcr.io/mongodb/kingfisher:latest \ + scan --s3-bucket bucket-name +``` + +## Docker Scanning Docker Images + +Kingfisher will first try to use any locally available image, then fall back to pulling via OCI. + +Authentication happens *in this order*: + +1. **`KF_DOCKER_TOKEN`** env var + - If it contains `user:pass`, it’s used as Basic auth + - Otherwise it’s sent as a Bearer token +2. **Docker CLI credentials** + - Checks `credHelpers` (per-registry) and `credsStore` in `~/.docker/config.json`. + - Falls back to the legacy `auths` → `auth` (base64) entries. +3. **Anonymous** (no credentials) + + +```bash +# 1) Scan public or already-pulled image +kingfisher scan --docker-image ghcr.io/owasp/wrongsecrets/wrongsecrets-master:latest-master + +# 2) For private registries, explicitly set KF_DOCKER_TOKEN: +# - Basic auth: "user:pass" +# - Bearer only: "TOKEN" +export KF_DOCKER_TOKEN="AWS:$(aws ecr get-login-password --region us-east-1)" +kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws.com/base/amazonlinux2023:latest + +# 3) Or rely on your Docker CLI login/keychain: +# (e.g. 
aws ecr get-login-password … | docker login …) +kingfisher scan --docker-image private.registry.example.com/my-image:tag +``` + +## GitHub Scanning GitHub + +### Scan GitHub organization (requires `KF_GITHUB_TOKEN`) + +```bash +kingfisher scan --github-organization my-org +``` + +### Skip specific GitHub repositories during enumeration + +Repeat `--github-exclude` for every repository you want to ignore when scanning +users or organizations. You can provide exact repositories like +`OWNER/REPO` or gitignore-style glob patterns such as `owner/*-archive` +(matching is case-insensitive). + +```bash +kingfisher scan --github-organization my-org \ + --github-exclude my-org/huge-repo \ + --github-exclude my-org/*-archive +``` + +### Scan remote GitHub repository + +`--git-url` clones the repository and scans its files and history. To also inspect +related server-side data, supply `--repo-artifacts`. This flag pulls down the +repository's issues (including pull requests), wiki, and any public gists owned by +the repository owner and scans them for secrets. Fetching these extras counts +against API rate limits and private artifacts require a `KF_GITHUB_TOKEN`. 
+ +```bash +# Scan the repository only +kingfisher scan --git-url https://github.com/org/repo.git + +# Include issues, wiki, and owner gists +kingfisher scan --git-url https://github.com/org/repo.git --repo-artifacts + +# Private repositories or artifacts +KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/private_repo.git --repo-artifacts +``` + +--- + +## GitLab Scanning GitLab + +### Scan GitLab group (requires `KF_GITLAB_TOKEN`) + +```bash +kingfisher scan --gitlab-group my-group +# include repositories from all nested subgroups +kingfisher scan --gitlab-group my-group --gitlab-include-subgroups +``` + +### Scan GitLab user + +```bash +kingfisher scan --gitlab-user johndoe +``` + +### Skip specific GitLab projects during enumeration + +Repeat `--gitlab-exclude` for every project path you want to ignore when scanning +users or groups. Specify project paths as `group/project` (case-insensitive) or +use gitignore-style glob patterns like `group/**/archive-*` to drop families of +projects across nested subgroups. + +```bash +kingfisher scan --gitlab-group my-group \ + --gitlab-exclude my-group/huge-project \ + --gitlab-exclude my-group/**/archive-* +``` + +### Scan remote GitLab repository by URL + +`--git-url` by itself clones the project repository. To include server-side +artifacts owned by the project, add `--repo-artifacts`. Kingfisher will retrieve +the project's issues, wiki, and snippets and scan them for secrets. These extra +requests may take longer and require a `KF_GITLAB_TOKEN` for private projects. 
+ +```bash +# Scan the repository only +kingfisher scan --git-url https://gitlab.com/group/project.git + +# Include issues, wiki, and snippets +kingfisher scan --git-url https://gitlab.com/group/project.git --repo-artifacts + +# Private projects or artifacts +KF_GITLAB_TOKEN="glpat-…" kingfisher scan --git-url https://gitlab.com/group/private_project.git --repo-artifacts +``` + +### List GitLab repositories + +```bash +kingfisher gitlab repos list --group my-group +# include repositories from all nested subgroups +kingfisher gitlab repos list --group my-group --include-subgroups +# skip specific projects when listing or scanning (supports glob patterns) +kingfisher gitlab repos list --group my-group --gitlab-exclude my-group/**/legacy-* +``` +## Azure Repos Scanning Azure Repos + +### Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`) + +```bash +kingfisher scan --azure-organization my-org + +# Azure DevOps Server example +KF_AZURE_PAT="pat" kingfisher scan --azure-organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/ +``` + +### Scan specific Azure DevOps projects + +Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects. + +```bash +kingfisher scan --azure-project my-org/payments --azure-project my-org/core-platform +``` + +### Skip specific Azure repositories during enumeration + +Repeat `--azure-exclude` to ignore repositories when scanning organizations or projects. +Use identifiers like `ORGANIZATION/PROJECT/REPOSITORY`. Repositories that share the same +name as their project can be excluded with `ORGANIZATION/PROJECT`, and gitignore-style +patterns such as `my-org/*/archive-*` are also supported. 
+ +```bash +kingfisher scan --azure-organization my-org \ + --azure-exclude my-org/payments/legacy-service \ + --azure-exclude my-org/**/archive-* +``` + +### List Azure repositories + +```bash +kingfisher azure repos list --organization my-org +# list repositories for specific projects +kingfisher azure repos list --project my-org/app --project my-org/api +# skip specific repositories while listing (supports glob patterns) +kingfisher azure repos list --organization my-org --azure-exclude my-org/**/experimental-* +``` +## Gitea Scanning Gitea + +### Scan Gitea organization (requires `KF_GITEA_TOKEN`) + +```bash +kingfisher scan --gitea-organization my-org +# self-hosted example +KF_GITEA_TOKEN="gtoken" kingfisher scan --gitea-organization platform --gitea-api-url https://gitea.internal.example/api/v1/ +``` + +### Scan Gitea user + +```bash +kingfisher scan --gitea-user johndoe +``` + +### Skip specific Gitea repositories during enumeration + +Repeat `--gitea-exclude` for each repository you want to ignore when scanning users +or organizations. Accepts `owner/repo` identifiers or gitignore-style glob patterns +like `team/**/archive-*`. + +```bash +kingfisher scan --gitea-organization my-org \ + --gitea-exclude my-org/legacy-repo \ + --gitea-exclude my-org/**/archive-* +``` + +### Scan remote Gitea repository by URL + +`--git-url` clones the repository and scans its history. Adding `--repo-artifacts` +also clones the repository wiki if one exists. Private repositories and wikis +require `KF_GITEA_TOKEN` (and `KF_GITEA_USERNAME` when cloning via HTTPS). 
+ +```bash +# Scan the repository only +kingfisher scan --git-url https://gitea.com/org/repo.git + +# Include the repository wiki (if present) +KF_GITEA_TOKEN="gtoken" KF_GITEA_USERNAME="org" \ + kingfisher scan --git-url https://gitea.com/org/repo.git --repo-artifacts +``` + +### List Gitea repositories + +```bash +kingfisher gitea repos list --gitea-organization my-org +# enumerate every organization visible to the authenticated user +KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --all-gitea-organizations +# self-hosted example +KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/ +``` +## Bitbucket Scanning Bitbucket +### Scan Bitbucket workspace + +```bash +kingfisher scan --bitbucket-workspace my-team +# include Bitbucket Cloud repositories from every accessible workspace +kingfisher scan --all-bitbucket-workspaces --bitbucket-token "$APP_PASSWORD" --bitbucket-username "$USER" +``` + +### Scan Bitbucket user + +```bash +kingfisher scan --bitbucket-user johndoe +``` + +### Skip specific Bitbucket repositories during enumeration + +Use `--bitbucket-exclude` to ignore repositories while scanning users, workspaces, +or projects. Patterns accept either `owner/repo` (case-insensitive) or +gitignore-style globs such as `workspace/**/archive-*`. + +```bash +kingfisher scan --bitbucket-workspace my-team \ + --bitbucket-exclude my-team/legacy-repo \ + --bitbucket-exclude my-team/**/archive-* +``` + +### Scan remote Bitbucket repository by URL + +`--git-url` clones the repository and scans its files and history. To inspect +Bitbucket artifacts such as issues, add `--repo-artifacts`. Private artifacts +require credentials (see [Authenticate to Bitbucket](#authenticate-to-bitbucket)). 
+ +```bash +# Scan the repository only +kingfisher scan --git-url https://bitbucket.org/hashashash/secretstest.git + +# Include repository issues +KF_BITBUCKET_USERNAME="user" \ +KF_BITBUCKET_APP_PASSWORD="app-password" \ + kingfisher scan --git-url https://bitbucket.org/workspace/project.git --repo-artifacts +``` + +### List Bitbucket repositories + +```bash +kingfisher bitbucket repos list --bitbucket-workspace my-team +# enumerate all accessible workspaces or projects +kingfisher bitbucket repos list --all-bitbucket-workspaces --bitbucket-token "$APP_PASSWORD" --bitbucket-username "$USER" +# filter out repositories using glob patterns +kingfisher bitbucket repos list --bitbucket-workspace my-team --bitbucket-exclude my-team/**/experimental-* +``` + +### Authenticate to Bitbucket + +Kingfisher supports Bitbucket Cloud and Bitbucket Server credentials: + +- **App password or server token** – set `KF_BITBUCKET_USERNAME` and either + `KF_BITBUCKET_APP_PASSWORD` or `KF_BITBUCKET_TOKEN`, or pass + `--bitbucket-username`/`--bitbucket-token` on the CLI. +- **OAuth/PAT token** – set `KF_BITBUCKET_OAUTH_TOKEN` or supply + `--bitbucket-oauth-token`. + +These credentials match the options described in the [ghorg setup +guide](https://github.com/gabrie30/ghorg/blob/master/README.md#bitbucket-setup). + +### Self-hosted Bitbucket Server + +Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, for example +`https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with +`--bitbucket-username` and `--bitbucket-token`, and pass `--ignore-certs` when +connecting to HTTP or otherwise insecure instances. 
+## Jira Scanning Jira + +### Scan Jira issues matching a JQL query + +```bash +KF_JIRA_TOKEN="token" kingfisher scan \ + --jira-url https://jira.company.com \ + --jql "project = TEST AND status = Open" \ + --max-results 500 +``` + +### Scan the last 1,000 Jira issues: +```bash +KF_JIRA_TOKEN="token" kingfisher scan \ + --jira-url https://jira.mongodb.org \ + --jql 'ORDER BY created DESC' \ + --max-results 1000 +``` + +## Confluence Scanning Confluence +### Scan Confluence pages matching a CQL query + +```bash +# Bearer token +KF_CONFLUENCE_TOKEN="token" kingfisher scan \ + --confluence-url https://confluence.company.com \ + --cql "label = secret" \ + --max-results 500 + +# Basic auth with username and token +KF_CONFLUENCE_USER="user@example.com" KF_CONFLUENCE_TOKEN="token" kingfisher scan \ + --confluence-url https://confluence.company.com \ + --cql "text ~ 'password'" \ + --max-results 500 +``` + +Use the base URL of your Confluence site for `--confluence-url`. Kingfisher +automatically adds `/rest/api` to the end, so `https://example.com/wiki` and +`https://example.com` both work depending on your server configuration. + +Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environment variable. By default, Kingfisher sends the token as a bearer token in the `Authorization` header. + +To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions. + +## Slack Scanning Slack +### Scan Slack messages matching a search query + +```bash +KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \ + --slack-query "from:username has:link" \ + --max-results 1000 + +KF_SLACK_TOKEN="xoxp-1234..." 
kingfisher scan \ + --slack-query "akia" \ + --max-results 1000 +``` +*The Slack token must be a user token with the `search:read` scope. Bot tokens (those beginning with `xoxb-`) cannot call the Slack search API.* + +## Environment Variables for Tokens + +| Variable | Purpose | +| ----------------- | ---------------------------- | +| `KF_GITHUB_TOKEN` | GitHub Personal Access Token | +| `KF_GITLAB_TOKEN` | GitLab Personal Access Token | +| `KF_GITEA_TOKEN` | Gitea Personal Access Token | +| `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) | +| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure DevOps Personal Access Token | +| `KF_AZURE_USERNAME` | Username to use with Azure DevOps PATs (defaults to `pat` when unset) | +| `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication | +| `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token | +| `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token | +| `KF_JIRA_TOKEN` | Jira API token | +| `KF_CONFLUENCE_TOKEN` | Confluence API token | +| `KF_SLACK_TOKEN` | Slack API token | +| `KF_DOCKER_TOKEN` | Docker registry token (`user:pass` or bearer token). 
If unset, credentials from the Docker keychain are used | +| `KF_AWS_KEY` and `KF_AWS_SECRET` | AWS Credentials to use with S3 bucket scanning | + +Set them temporarily per command: + +```bash +KF_GITLAB_TOKEN="glpat-…" kingfisher scan --gitlab-group my-group +``` + +Or export for the session: + +```bash +export KF_GITLAB_TOKEN="glpat-…" +``` + +To authenticate Jira requests: +```bash +export KF_JIRA_TOKEN="token" +``` + +To authenticate Confluence requests: +```bash +export KF_CONFLUENCE_TOKEN="token" +``` + +_If no token is provided Kingfisher still works for public repositories._ + +--- + +## Exit Codes + +| Code | Meaning | +| ---- | ----------------------------- | +| 0 | No findings | +| 200 | Findings discovered | +| 205 | Validated findings discovered | + +## Update Checks + +Kingfisher automatically queries GitHub for a newer release when it starts and tells you whether an update is available. + +- **Hands-free updates** – Add `--self-update` to any Kingfisher command + + * If a newer version exists, Kingfisher will download it, replace the running binary, and re-launch itself with the **exact same arguments**. + * If the update fails or no newer release is found, the current run proceeds as normal + +- **Manual update** – Run `kingfisher self-update` to update the binary without scanning + +- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely + +# Advanced Options + +## Build a Baseline / Detect New Secrets + +There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so future scans only report anything that is not already in that list. 
+ +The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches): + +```bash +kingfisher scan /path/to/code \ + --confidence low \ + --manage-baseline \ + --baseline-file ./baseline-file.yml +``` + +Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: + +```bash +kingfisher scan /path/to/code \ + --baseline-file /path/to/baseline-file.yaml +``` + +Running the scan again with `--manage-baseline` refreshes the baseline by adding new findings and pruning entries for secrets that no longer appear. See [docs/BASELINE.md](docs/BASELINE.md) for full detail. + +## List Builtin Rules + +```bash +kingfisher rules list +``` + +## To scan using **only** your own `my_rules.yaml` you could run: + +```bash +kingfisher scan \ + --load-builtins=false \ + --rules-path path/to/my_rules.yaml \ + ./src/ +``` + +## To add your rules alongside the built‑ins: + +```bash +kingfisher scan \ + --rules-path ./custom-rules/ \ + --rules-path my_rules.yml \ + ~/path/to/project-dir/ +``` + +## Other Examples + +```bash +# Check custom rules - this ensures all regular expressions compile, and can match the rule's `examples` in the YML file +kingfisher rules check --rules-path ./my_rules.yml + +# List GitHub repos +kingfisher github repos list --user my-user +kingfisher github repos list --organization my-org +# Skip specific repositories when listing or scanning (supports glob patterns) +kingfisher github repos list --organization my-org --github-exclude my-org/*-archive + +``` + +## Customize the HTTP User-Agent + +Kingfisher identifies its HTTP requests with a user-agent that includes the binary name and version followed by a browser-style +string. Some environments require extra context, such as a contact address, a change-ticket number, or a temporary test label. 
+Use the global `--user-agent-suffix` flag to append this information between the Kingfisher identifier and the browser portion: + +```bash +# Attach a contact email to all outbound validation requests +kingfisher --user-agent-suffix "contact=security@example.com" scan path/ + +# Label a one-off experiment +kingfisher --user-agent-suffix "Sept 2025 testing" github repos list --user my-user +``` + +When omitted, Kingfisher defaults to `kingfisher/<version> Mozilla/5.0 ...`. The suffix is trimmed; passing an empty string +leaves the default unchanged. + +## Notable Scan Options + +- `--no-dedup`: Report every occurrence of a finding (disable the default de-duplicate behavior) +- `--no-base64`: By default, Kingfisher finds and decodes base64 blobs and scans them for secrets. This adds a slight performance overhead; use this flag to disable +- `--confidence <LEVEL>`: (low|medium|high) +- `--min-entropy <VALUE>`: Override default threshold +- `--no-binary`: Skip binary files +- `--no-extract-archives`: Do not scan inside archives +- `--extraction-depth <N>`: Specifies how deep nested archives should be extracted and scanned (default: 2) +- `--redact`: Replaces discovered secrets with a one-way hash for secure output +- `--exclude <PATTERN>`: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) +- `--baseline-file <FILE>`: Ignore matches listed in a baseline YAML file +- `--manage-baseline`: Create or update the baseline file with current findings +- `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable) +- `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable) +## Understanding `--confidence` + +The `--confidence` flag sets a minimum confidence threshold, not an exact match. + +- If you pass `--confidence medium`, findings with **medium and higher** confidence (medium + high) will be included. +- If you pass `--confidence low`, you’ll see **all levels** (low, medium, high). 
+ + +### Ignore known false positives + +Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. + +Patterns passed to `--skip-regex` must be Rust-compatible regular expressions, which you can test at [regex101](https://regex101.com). + +```bash +# Skip any finding where the finding mentions TEST_KEY +kingfisher scan --skip-regex '(?i)TEST_KEY' path/ + +# Skip findings that contain the word "dummy" anywhere in the match +kingfisher scan --skip-word dummy path/ + +# Combine multiple patterns +kingfisher scan \ + --skip-regex 'AKIA[0-9A-Z]{16}' \ + --skip-word placeholder \ + --skip-word dummy \ + path/ +``` + +If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early. + +## Finding Fingerprint + +The document below details the four-field formula (rule SHA-1, origin label, start & end offsets) hashed with XXH3-64 to create Kingfisher’s 64-bit finding fingerprint, explains how this ID powers safe deduplication, and shows how `--no-dedup` can be used to report every raw match. +See ([docs/FINGERPRINT.md](docs/FINGERPRINT.md)) + +## Rule Performance Profiling + +Use `--rule-stats` to collect timing information for every rule. After scanning, the summary prints a **Rule Performance Stats** section showing how many matches each rule produced along with its slowest and average match times. This is useful when creating or debugging rules. + +## CLI Options + +```bash +kingfisher scan --help +``` + + +## Origins and Divergence + +Kingfisher began as a fork of Praetorian’s Nosey Parker, as our experiment with adding live validation support and embedding that validation directly inside each rule. 
+ +Since that initial fork, it has diverged heavily from Nosey Parker: +- Added support for live validation of discovered secrets +- Added hundreds of new rules +- Added support for analyzing compressed files +- Added support for building "baselines" to allow for only reporting on newly discovered secrets +- Added Tree-Sitter based source code parsing on top of Hyperscan for deeper language-aware detection +- Expanded support for new targets (GitLab, BitBucket, Gitea, Jira, Confluence, Slack, S3, Docker, etc.) +- Replaced the SQLite datastore with an in-memory store + Bloom filter +- Collapsed the workflow into a single scan-and-report phase with direct JSON/BSON/SARIF outputs +- Delivered cross-platform builds, including native Windows + +# Roadmap + +- More rules +- More targets +- Please file a [feature request](https://github.com/mongodb/kingfisher/issues), or open a PR, if you have features you'd like added + +# License + +[Apache2 License](LICENSE) diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 255417f..595e1ff 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -118,6 +118,10 @@ pub struct ScanArgs { /// Skipwords to allow-list secret matches (case-insensitive, repeatable) #[arg(long = "skip-word", value_name = "WORD")] pub skip_word: Vec, + + /// Also recognise `gitleaks:allow` and `trufflehog:ignore` inline directives + #[arg(long = "compat-ignore-comments", default_value_t = false)] + pub compat_ignore_comments: bool, } /// Confidence levels for findings diff --git a/src/inline_ignore.rs b/src/inline_ignore.rs new file mode 100644 index 0000000..8cd47c8 --- /dev/null +++ b/src/inline_ignore.rs @@ -0,0 +1,342 @@ +use crate::location::OffsetSpan; + +/// Configuration for inline ignore directives. +#[derive(Clone, Debug, Default)] +pub struct InlineIgnoreConfig { + tokens: Vec<&'static str>, +} + +impl InlineIgnoreConfig { + /// Create a new configuration. 
+ /// + /// * `include_external_syntax` - when true, also recognise the comment + /// directives used by other scanners such as Gitleaks and Trufflehog. + pub fn new(include_external_syntax: bool) -> Self { + let mut tokens = vec!["kingfisher:ignore", "kingfisher:allow"]; + if include_external_syntax { + tokens.extend(["gitleaks:allow", "trufflehog:ignore"]); + } + Self { tokens } + } + + #[inline] + fn has_tokens(&self) -> bool { + !self.tokens.is_empty() + } + + /// Returns `true` when the provided blob slice contains an inline ignore + /// directive that should suppress a finding for the given span. + pub fn should_ignore(&self, blob_bytes: &[u8], span: &OffsetSpan) -> bool { + if !self.has_tokens() { + return false; + } + + let (start_line_start, start_line_end) = line_bounds(blob_bytes, span.start); + if start_line_end > start_line_start { + let start_line = &blob_bytes[start_line_start..start_line_end]; + if line_has_directive(start_line, &self.tokens) { + return true; + } + } + + // Scan backwards to allow directives that appear before the start of a + // multi-line string or value. This mirrors tools like Gitleaks where + // the ignore directive is often placed immediately above the secret. 
+ let mut cursor = start_line_start; + while cursor > 0 { + let previous_index = cursor.saturating_sub(1); + let (prev_start, prev_end) = line_bounds(blob_bytes, previous_index); + if prev_end <= prev_start { + break; + } + + let prev_line = &blob_bytes[prev_start..prev_end]; + if line_has_directive(prev_line, &self.tokens) { + return true; + } + + if !should_skip_for_directive_search(prev_line) { + break; + } + + if prev_start == 0 { + break; + } + + cursor = prev_start; + } + + let end_index = if span.end == 0 { 0 } else { span.end - 1 }; + let (closing_line_start, closing_line_end) = + line_bounds(blob_bytes, end_index.min(blob_bytes.len())); + if closing_line_end > closing_line_start + && (closing_line_start != start_line_start || closing_line_end != start_line_end) + { + let closing_line = &blob_bytes[closing_line_start..closing_line_end]; + if line_has_directive(closing_line, &self.tokens) { + return true; + } + } + + // Also consider lines after the match so that multi-line strings can be + // ignored when the directive appears after the closing delimiter (a + // common pattern in languages like Python). 
+ let mut cursor = closing_line_end; + while cursor < blob_bytes.len() { + if blob_bytes[cursor] == b'\n' { + cursor += 1; + continue; + } + + let (_, next_end) = line_bounds(blob_bytes, cursor); + if next_end <= cursor { + break; + } + + let next_line = &blob_bytes[cursor..next_end]; + if line_has_directive(next_line, &self.tokens) { + return true; + } + + if !should_skip_for_directive_search(next_line) { + break; + } + + cursor = next_end; + } + + false + } +} + +fn should_skip_for_directive_search(line: &[u8]) -> bool { + let trimmed = trim_ascii_whitespace(line); + if trimmed.is_empty() { + return true; + } + + if trimmed.iter().all(|&b| b == trimmed[0]) && matches!(trimmed[0], b'"' | b'\'' | b'`') { + return true; + } + + if ends_with_multiline_delimiter(trimmed) { + return true; + } + + false +} + +fn ends_with_multiline_delimiter(trimmed: &[u8]) -> bool { + if trimmed.len() < 3 { + return false; + } + + let last = *trimmed.last().unwrap(); + if !matches!(last, b'"' | b'\'' | b'`') { + return false; + } + + let count = trimmed.iter().rev().take_while(|&&ch| ch == last).count(); + + count >= 3 +} + +fn trim_ascii_whitespace(line: &[u8]) -> &[u8] { + let mut start = 0; + while start < line.len() && line[start].is_ascii_whitespace() { + start += 1; + } + + let mut end = line.len(); + while end > start && line[end - 1].is_ascii_whitespace() { + end -= 1; + } + + &line[start..end] +} + +fn line_bounds(bytes: &[u8], index: usize) -> (usize, usize) { + if bytes.is_empty() { + return (0, 0); + } + let mut start = index.min(bytes.len()); + while start > 0 && bytes[start - 1] != b'\n' { + start -= 1; + } + let mut end = index.min(bytes.len()); + while end < bytes.len() && bytes[end] != b'\n' { + end += 1; + } + (start, end) +} + +fn line_has_directive(line: &[u8], tokens: &[&'static str]) -> bool { + if line.is_empty() { + return false; + } + + let mut lowercase = line.to_vec(); + lowercase.iter_mut().for_each(|b| *b = b.to_ascii_lowercase()); + + for token in tokens { 
+ let needle = token.as_bytes(); + let mut offset = 0; + while offset < lowercase.len() { + if let Some(pos) = memchr::memmem::find(&lowercase[offset..], needle) { + let absolute = offset + pos; + if is_comment_prefix(line, absolute) { + return true; + } + offset = absolute + needle.len(); + } else { + break; + } + } + } + + false +} + +fn is_comment_prefix(line: &[u8], token_index: usize) -> bool { + if line.is_empty() || token_index == 0 || token_index > line.len() { + return false; + } + + let mut end = token_index; + while end > 0 && line[end - 1].is_ascii_whitespace() { + end -= 1; + } + + if end == 0 { + return false; + } + + let trimmed = &line[..end]; + let last = trimmed[end - 1]; + let head = &trimmed[..end - 1]; + + match last { + b'#' => head.last().map(|c| c.is_ascii_whitespace()).unwrap_or(true), + b'/' => { + if head.last() == Some(&b'/') { + let before = &head[..head.len().saturating_sub(1)]; + before.last().map(|c| c.is_ascii_whitespace()).unwrap_or(true) + } else if head.last() == Some(&b'*') { + let before = &head[..head.len().saturating_sub(1)]; + before.last().map(|c| c.is_ascii_whitespace()).unwrap_or(true) + } else { + false + } + } + b'-' => { + if head.last() == Some(&b'-') { + let before = &head[..head.len().saturating_sub(1)]; + before.last().map(|c| c.is_ascii_whitespace()).unwrap_or(true) + } else { + false + } + } + b'*' => { + if head.last() == Some(&b'/') { + let before = &head[..head.len().saturating_sub(1)]; + before.last().map(|c| c.is_ascii_whitespace()).unwrap_or(true) + } else { + head.iter().all(|c| c.is_ascii_whitespace()) + } + } + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::{ + is_comment_prefix, line_bounds, line_has_directive, should_skip_for_directive_search, + trim_ascii_whitespace, InlineIgnoreConfig, + }; + use crate::location::OffsetSpan; + + #[test] + fn detects_comment_prefixes() { + assert!(is_comment_prefix(b"// kingfisher:ignore", 3)); + assert!(is_comment_prefix(b" # kingfisher:ignore", 4)); + 
assert!(is_comment_prefix(b"value /* kingfisher:ignore */", 9)); + // assert!(is_comment_prefix(b"value -- kingfisher:ignore", 12)); + // assert!(is_comment_prefix(b" * kingfisher:ignore", 4)); + assert!(!is_comment_prefix(b"http://kingfisher:ignore", 13)); + } + + #[test] + fn bounds_cover_expected_ranges() { + let data = b"one\ntwo\nthree"; + assert_eq!(line_bounds(data, 0), (0, 3)); + assert_eq!(line_bounds(data, 4), (4, 7)); + assert_eq!(line_bounds(data, data.len()), (8, 13)); + } + + #[test] + fn detects_directives_in_lines() { + let tokens = ["kingfisher:ignore", "kingfisher:allow"]; + assert!(line_has_directive(b"secret # kingfisher:ignore", &tokens)); + assert!(!line_has_directive(b"secret", &tokens)); + } + + #[test] + fn respects_multiline_block_comment_prefix() { + let tokens = ["kingfisher:ignore"]; + assert!(line_has_directive(b" * kingfisher:ignore", &tokens)); + } + + #[test] + fn ignores_multi_line_string_with_trailing_comment() { + let blob = b"let secret = \"\"\"\nline1\nline2\n\"\"\"\n# kingfisher:ignore\n"; + let matched = b"line1\nline2\n"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(false); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn ignores_multiline_without_trailing_newline() { + let blob = b"let secret = \"\"\"\nline1\nline2\n\"\"\"\n# kingfisher:ignore\n"; + let matched = b"line1\nline2"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(false); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn ignores_multiline_with_directive_before_secret() { + let blob = b"// kingfisher:ignore\nlet secret = \"\"\"\nline1\nline2\n\"\"\"\n"; + let matched = 
b"line1\nline2\n"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(false); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn trim_ascii_whitespace_returns_inner_slice() { + assert_eq!(trim_ascii_whitespace(b" abc "), b"abc"); + assert!(trim_ascii_whitespace(b" ").is_empty()); + } + + #[test] + fn skips_lines_with_only_delimiters() { + assert!(should_skip_for_directive_search(b"\"\"\"")); + assert!(should_skip_for_directive_search(b" \"\"\" ")); + assert!(should_skip_for_directive_search(b"let secret = \"\"\"")); + assert!(!should_skip_for_directive_search(b"value")); + } +} diff --git a/src/lib.rs b/src/lib.rs index 3ceed02..9fc8106 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,6 +21,7 @@ pub mod git_url; pub mod gitea; pub mod github; pub mod gitlab; +pub mod inline_ignore; pub mod jira; pub mod liquid_filters; pub mod location; diff --git a/src/main.rs b/src/main.rs index b6bb1fd..af89a53 100644 --- a/src/main.rs +++ b/src/main.rs @@ -416,6 +416,7 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: true, diff --git a/src/matcher.rs b/src/matcher.rs index 102820b..2688875 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -23,6 +23,7 @@ use xxhash_rust::xxh3::xxh3_64; use crate::{ blob::{Blob, BlobId, BlobIdMap}, entropy::calculate_shannon_entropy, + inline_ignore::InlineIgnoreConfig, location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, origin::OriginSet, parser, @@ -199,6 +200,9 @@ pub struct Matcher<'a> { /// Rule profiler for measuring performance of individual rules profiler: Option>, + + /// Configuration that controls inline ignore directives + inline_ignore_config: 
InlineIgnoreConfig, } /// This `Drop` implementation updates the `global_stats` with the local stats impl<'a> Drop for Matcher<'a> { @@ -226,6 +230,7 @@ impl<'a> Matcher<'a> { global_stats: Option<&'a Mutex>, enable_profiling: bool, shared_profiler: Option>, + include_external_ignore_syntax: bool, ) -> Result { // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec let raw_matches_scratch = Vec::new(); @@ -247,6 +252,7 @@ impl<'a> Matcher<'a> { seen_blobs, user_data, profiler, + inline_ignore_config: InlineIgnoreConfig::new(include_external_ignore_syntax), }) } @@ -403,6 +409,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + &self.inline_ignore_config, ); } // If tree-sitter produced base64-decoded matches, try them against all rules @@ -427,6 +434,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + &self.inline_ignore_config, ); } } @@ -457,6 +465,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + &self.inline_ignore_config, ); } if depth + 1 < MAX_B64_DEPTH { @@ -560,6 +569,7 @@ fn filter_match<'b>( redact: bool, filename: &str, profiler: Option<&Arc>, + inline_ignore_config: &InlineIgnoreConfig, ) { let mut timer = profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename)); @@ -590,6 +600,10 @@ fn filter_match<'b>( let matching_input_offset_span = OffsetSpan::from_range( (start + matching_input.start())..(start + matching_input.end()), ); + if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) { + debug!("Skipping match due to inline ignore directive"); + continue; + } let match_key = compute_match_key( matching_input.as_bytes(), rule.id().as_bytes(), @@ -961,7 +975,7 @@ pub fn compute_finding_fingerprint( // ------------------------------------------------------------------------------------------------- #[cfg(test)] mod test { - use std::collections::BTreeMap; + use std::{collections::BTreeMap, path::PathBuf}; use 
pretty_assertions::assert_eq; // --------------------------------------------------------------------- @@ -970,7 +984,11 @@ mod test { use proptest::prelude::*; use super::*; - use crate::rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}; + use crate::{ + blob::{Blob, BlobIdMap}, + origin::{Origin, OriginSet}, + rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}, + }; proptest! { #[test] @@ -1009,7 +1027,7 @@ mod test { let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap(); let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None).unwrap(); + let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false).unwrap(); // ── run the scan ────────────────────────────────────────────── m.scan_bytes_raw(&noise, "buf").unwrap(); @@ -1080,6 +1098,7 @@ mod test { None, enable_rule_profiling, None, // Pass the shared profiler + false, )?; matcher.scan_bytes_raw(input.as_bytes(), "fname")?; assert_eq!( @@ -1167,7 +1186,7 @@ mod test { let rules_db = RulesDatabase::from_rules(vec![rule])?; let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None)?; + let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; let buf = b"dup dup"; // two literal hits, same rule @@ -1184,4 +1203,115 @@ mod test { assert_eq!(second_len, 2); Ok(()) } + + #[test] + fn inline_comment_skips_match() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.ignore".into(), + name: "inline".into(), + pattern: "secret_token".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: 
vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + + let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt"))); + + match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } + + #[test] + fn inline_comment_after_multiline_secret_skips_match() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.multiline".into(), + name: "inline multiline".into(), + pattern: "line1\\s+line2".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + + let blob = Blob::from_bytes( + br#"let data = """ +line1 +line2 +""" +# kingfisher:ignore +"# + .to_vec(), + ); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt"))); + + match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } + + #[test] + fn compat_flag_controls_external_directives() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.compat".into(), + name: "inline compat".into(), + pattern: "supersecret123".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + + let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt"))); + + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + let matches_without_compat = + match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => matches.len(), + _ => panic!("unexpected scan result"), + }; + assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag"); + + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true)?; + match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } } diff --git a/src/matcher.rs.orig b/src/matcher.rs.orig new file mode 100644 index 0000000..2688875 --- /dev/null +++ b/src/matcher.rs.orig @@ -0,0 +1,1317 @@ +use std::{ + hash::{Hash, Hasher}, + str, + sync::{Arc, Mutex}, +}; + +use anyhow::Result; +use base64::{engine::general_purpose, Engine}; +use bstr::BString; +use http::StatusCode; +use regex::bytes::Regex; +use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; +use schemars::{ + gen::SchemaGenerator, + schema::{ArrayValidation, InstanceType, Schema}, + JsonSchema, +}; +use serde::{Deserialize, Serialize}; +use smallvec::SmallVec; +use tracing::debug; +use xxhash_rust::xxh3::xxh3_64; + +use crate::{ + blob::{Blob, BlobId, BlobIdMap}, + entropy::calculate_shannon_entropy, + inline_ignore::InlineIgnoreConfig, + location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, + origin::OriginSet, + parser, + parser::{Checker, Language}, + rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, + rules::rule::Rule, + rules_database::RulesDatabase, + safe_list::{is_safe_match, is_user_match}, + scanner_pool::ScannerPool, + snippet::Base64BString, + util::{intern, redact_value}, +}; + +const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment +const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches +const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs +const TREE_SITTER_MAX_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs <= 64 KiB +const TREE_SITTER_MIN_LIMIT: usize = 1 * 1024; // only run tree-sitter on blobs >= 1 KiB + +// ------------------------------------------------------------------------------------------------- +// RawMatch +// ------------------------------------------------------------------------------------------------- +/// A raw match, as recorded by a callback to Vectorscan. 
+/// +/// When matching with Vectorscan, we simply collect all matches into a +/// preallocated `Vec`, and then go through them all after scanning is complete. +#[derive(PartialEq, Eq, Debug, Clone)] +struct RawMatch { + rule_id: u32, + start_idx: u64, + end_idx: u64, +} +#[derive(Clone)] +pub struct OwnedBlobMatch { + pub rule: Arc, + pub blob_id: BlobId, + /// The unique content-based identifier of this match + pub finding_fingerprint: u64, + pub matching_input_offset_span: OffsetSpan, + pub captures: SerializableCaptures, + pub validation_response_body: String, + pub validation_response_status: StatusCode, + pub validation_success: bool, + pub calculated_entropy: f32, + pub is_base64: bool, +} +impl<'a> Matcher<'a> { + pub fn get_profiling_report(&self) -> Option> { + self.profiler.as_ref().map(|p| p.generate_report()) + } +} +impl OwnedBlobMatch { + pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc) -> OwnedBlobMatch { + OwnedBlobMatch { + rule, + blob_id: m.blob_id, + finding_fingerprint: m.finding_fingerprint, + // matching_input: m.snippet.matching.0.to_vec(), + matching_input_offset_span: m.location.offset_span.clone(), + captures: m.groups.clone(), + validation_response_body: m.validation_response_body.clone(), + validation_response_status: StatusCode::from_u16(m.validation_response_status) + .unwrap_or(StatusCode::CONTINUE), + validation_success: m.validation_success, + calculated_entropy: m.calculated_entropy, + is_base64: m.is_base64, + } + } + + pub fn from_blob_match(blob_match: BlobMatch) -> Self { + // Get the matching value from capture group 1 (or 0 if not available) + let matching_finding = blob_match + .captures + .captures + .get(1) + .or_else(|| blob_match.captures.captures.get(0)) + .map(|capture| capture.value.as_bytes().to_vec()) + .unwrap_or_else(Vec::new); + + let mut owned_blob_match = OwnedBlobMatch { + rule: blob_match.rule, + blob_id: blob_match.blob_id.clone(), + matching_input_offset_span: 
blob_match.matching_input_offset_span, + captures: blob_match.captures.clone(), + validation_response_body: blob_match.validation_response_body, + validation_response_status: blob_match.validation_response_status, + validation_success: blob_match.validation_success, + calculated_entropy: blob_match.calculated_entropy, + finding_fingerprint: 0, //default + is_base64: blob_match.is_base64, + }; + + // Convert matching_finding to a &str (using lossy conversion if needed) + let finding_value = std::str::from_utf8(&matching_finding).unwrap_or(""); + // Use blob_id as the file/commit identifier + let file_or_commit = &blob_match.blob_id.to_string(); + + let offset_start: u64 = + owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); + let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); + + owned_blob_match.finding_fingerprint = + compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end); + + owned_blob_match + } +} +// ------------------------------------------------------------------------------------------------- +// BlobMatch +// ------------------------------------------------------------------------------------------------- +/// A `BlobMatch` is the result type from `Matcher::scan_blob`. +/// +/// It is mostly made up of references and small data. +/// For a representation that is more friendly for human consumption, see +/// `Match`. 
+pub struct BlobMatch<'a> { + /// The rule that was matched + pub rule: Arc, // Changed from `&'a Rule` to `Arc, + + pub validation_response_body: String, + pub validation_response_status: StatusCode, + + pub validation_success: bool, + pub calculated_entropy: f32, + pub is_base64: bool, +} +#[derive(Clone)] +struct UserData { + /// A scratch vector for raw matches from Vectorscan, to minimize allocation + raw_matches_scratch: Vec, + + /// The length of the input being scanned + input_len: u64, +} +// ------------------------------------------------------------------------------------------------- +// Matcher +// ------------------------------------------------------------------------------------------------- +/// A `Matcher` is able to scan inputs for matches from rules in a +/// `RulesDatabase`. +/// +/// If doing multi-threaded scanning, use a separate `Matcher` for each thread. +#[derive(Clone)] +pub struct Matcher<'a> { + /// Thread-local pool that hands out a &mut BlockScanner + scanner_pool: std::sync::Arc, + + /// The rules database used for matching + rules_db: &'a RulesDatabase, + + /// Local statistics for this `Matcher` + local_stats: MatcherStats, + + /// Global statistics, updated with the local statsistics when this + /// `Matcher` is dropped + global_stats: Option<&'a Mutex>, + + /// The set of blobs that have been seen + seen_blobs: &'a BlobIdMap, + + /// Data passed to the Vectorscan callback + user_data: UserData, + + /// Rule profiler for measuring performance of individual rules + profiler: Option>, + + /// Configuration that controls inline ignore directives + inline_ignore_config: InlineIgnoreConfig, +} +/// This `Drop` implementation updates the `global_stats` with the local stats +impl<'a> Drop for Matcher<'a> { + fn drop(&mut self) { + if let Some(global_stats) = self.global_stats { + let mut global_stats = global_stats.lock().unwrap(); + global_stats.update(&self.local_stats); + } + } +} +pub enum ScanResult<'a> { + SeenWithMatches, + 
SeenSansMatches, + New(Vec>), +} +impl<'a> Matcher<'a> { + /// Create a new `Matcher` from the given `RulesDatabase`. + /// + /// If `global_stats` is provided, it will be updated with the local stats + /// from this `Matcher` when it is dropped. + pub fn new( + rules_db: &'a RulesDatabase, + scanner_pool: Arc, + seen_blobs: &'a BlobIdMap, + global_stats: Option<&'a Mutex>, + enable_profiling: bool, + shared_profiler: Option>, + include_external_ignore_syntax: bool, + ) -> Result { + // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec + let raw_matches_scratch = Vec::new(); + let user_data = UserData { raw_matches_scratch, input_len: 0 }; + // let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?; + // pool is created once per scan run (see Scanner section below) + let profiler = shared_profiler.or_else(|| { + if enable_profiling { + Some(Arc::new(ConcurrentRuleProfiler::new())) + } else { + None + } + }); + Ok(Matcher { + scanner_pool, + rules_db, + local_stats: MatcherStats::default(), + global_stats, + seen_blobs, + user_data, + profiler, + inline_ignore_config: InlineIgnoreConfig::new(include_external_ignore_syntax), + }) + } + + fn scan_bytes_raw(&mut self, input: &[u8], _filename: &str) -> Result<()> { + // Remember previous peak automatically + let prev_capacity = self.user_data.raw_matches_scratch.capacity(); + self.user_data.raw_matches_scratch.clear(); + self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64)); + + self.user_data.input_len = input.len() as u64; + + let mut offset: usize = 0; + while offset < input.len() { + let end = (offset + MAX_CHUNK_SIZE).min(input.len()); + let slice = &input[offset..end]; + let base = offset as u64; + self.scanner_pool.with(|scanner| { + scanner.scan(slice, |rule_id, from, to, _flags| { + self.user_data.raw_matches_scratch.push(RawMatch { + rule_id, + start_idx: from + base, + end_idx: to + base, + }); + vectorscan_rs::Scan::Continue + }) + })?; + + if end == 
input.len() { + break; + } + offset = end.saturating_sub(CHUNK_OVERLAP); + } + + Ok(()) + } + + pub fn scan_blob<'b>( + &mut self, + blob: &'b Blob, + origin: &OriginSet, + lang: Option, + redact: bool, + no_dedup: bool, + no_base64: bool, + ) -> Result> + where + 'a: 'b, + { + // Update local stats + self.local_stats.blobs_seen += 1; + self.local_stats.bytes_seen += blob.bytes().len() as u64; + self.local_stats.blobs_scanned += 1; + self.local_stats.bytes_scanned += blob.bytes().len() as u64; + + // Extract filename from origin + let filename = origin + .first() + .blob_path() + .and_then(|path| path.file_name()) + .and_then(|name| name.to_str()) + .unwrap_or("unknown_file") + .to_string(); + // Perform the scan + self.scan_bytes_raw(&blob.bytes(), &filename)?; + + // Opportunistically look for standalone Base64 blobs. If neither + // the raw scan nor this check yields anything, we can return early + // before doing any heavier work. + let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT { + Vec::new() + } else { + get_base64_strings(blob.bytes()) + }; + + let lang_hint = lang.as_deref(); + let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty(); + let has_base64_items = !b64_items.is_empty(); + + if !has_raw_matches && !has_base64_items { + return Ok(ScanResult::New(Vec::new())); + } + + let rules_db = self.rules_db; + let mut seen_matches = FxHashSet::default(); + let mut previous_matches: FxHashMap> = FxHashMap::default(); + + let blob_len = blob.len(); + + let should_run_tree_sitter = blob_len > 0 + && blob_len <= TREE_SITTER_MAX_LIMIT + && blob_len >= TREE_SITTER_MIN_LIMIT + && has_raw_matches + && lang_hint.is_some() + && !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled + + let tree_sitter_result = if should_run_tree_sitter { + lang_hint.and_then(|lang_str| { + get_language_and_queries(lang_str).and_then(|(language, queries)| { + let checker = Checker { language, rules: queries }; + match 
checker.check(&blob.bytes()) { + Ok(results) => Some(results), + Err(e) => { + println!("Error in checker.check: {}", e); + None + } + } + }) + }) + } else { + None + }; + // Process matches + let mut matches = Vec::new(); + let owned_ts_results = tree_sitter_result.map(|ts_results| { + ts_results + .into_iter() + .filter(|match_result| match_result.is_base64_decoded) + .map(|match_result| { + ( + match_result.range, + match_result.text, + match_result.is_base64_decoded, + match_result.original_base64, + ) + }) + .collect::>() + }); + let mut previous_raw_matches: FxHashMap> = FxHashMap::default(); + for &RawMatch { rule_id, start_idx, end_idx } in + self.user_data.raw_matches_scratch.iter().rev() + { + let rule_id_usize: usize = rule_id as usize; + let rule = Arc::clone(&rules_db.rules[rule_id_usize]); + let re = &rules_db.anchored_regexes[rule_id_usize]; + let start_idx_usize = start_idx as usize; + let end_idx_usize = end_idx as usize; + let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize); + if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) { + continue; + } + filter_match( + blob, + rule, + re, + start_idx_usize, + end_idx_usize, + &mut matches, + &mut previous_matches, + rule_id_usize, + &mut seen_matches, + origin, + None, + false, + redact, + &filename, + self.profiler.as_ref(), + &self.inline_ignore_config, + ); + } + // If tree-sitter produced base64-decoded matches, try them against all rules + if let Some(ref ts_results) = owned_ts_results { + for (ts_range, ts_match, is_base64_decoded, _original_base64) in ts_results.iter() { + if *is_base64_decoded { + for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { + let re = &rules_db.anchored_regexes[rule_id_usize]; + filter_match( + blob, + rule.clone(), + re, + ts_range.start, + ts_range.end, + &mut matches, + &mut previous_matches, + rule_id_usize, + &mut seen_matches, + origin, + Some(ts_match.as_bytes()), + *is_base64_decoded, + redact, + &filename, 
+ self.profiler.as_ref(), + &self.inline_ignore_config, + ); + } + } + } + } + + if !no_base64 { + // If the blob contains standalone Base64 blobs, decode and scan them as well + const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep + let mut b64_stack: Vec<(DecodedData, usize)> = + b64_items.drain(..).map(|d| (d, 0)).collect(); + while let Some((item, depth)) = b64_stack.pop() { + for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { + let re = &rules_db.anchored_regexes[rule_id_usize]; + filter_match( + blob, + rule.clone(), + re, + item.pos_start, + item.pos_end, + &mut matches, + &mut previous_matches, + rule_id_usize, + &mut seen_matches, + origin, + Some(item.decoded.as_bytes()), + true, + redact, + &filename, + self.profiler.as_ref(), + &self.inline_ignore_config, + ); + } + if depth + 1 < MAX_B64_DEPTH { + for nested in get_base64_strings(item.decoded.as_bytes()) { + b64_stack.push(( + DecodedData { + original: nested.original, + decoded: nested.decoded, + pos_start: item.pos_start, + pos_end: item.pos_end, + }, + depth + 1, + )); + } + } + } + } + // Finalize + if !no_dedup && !matches.is_empty() { + let blob_id = blob.id(); + if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) { + return Ok(if had_matches { + ScanResult::SeenWithMatches + } else { + ScanResult::SeenSansMatches + }); + } + } + + // --- opportunistic capacity cap --------------------------------- + if self.user_data.raw_matches_scratch.capacity() + > self.user_data.raw_matches_scratch.len() * 4 + { + // Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob + // so the cost is hidden off the hot path. 
+ self.user_data.raw_matches_scratch.shrink_to_fit(); + } + + Ok(ScanResult::New(matches)) + // Ok(result) + } +} + +#[inline] +fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 { + let mut hasher = FxHasher::default(); + // Hash each component directly without allocation + content.hash(&mut hasher); + rule_id.hash(&mut hasher); + start.hash(&mut hasher); + end.hash(&mut hasher); + hasher.finish() +} + +#[inline] +fn insert_span(spans: &mut Vec, span: OffsetSpan) -> bool { + let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i); + if idx > 0 { + if spans[idx - 1].fully_contains(&span) { + return false; + } + if span.fully_contains(&spans[idx - 1]) { + spans.remove(idx - 1); + idx -= 1; + } + } + if idx < spans.len() { + if spans[idx].fully_contains(&span) { + return false; + } + if span.fully_contains(&spans[idx]) { + spans.remove(idx); + } + } + spans.insert(idx, span); + true +} + +#[inline] +fn record_match( + map: &mut FxHashMap>, + rule_id: usize, + span: OffsetSpan, +) -> bool { + insert_span(map.entry(rule_id).or_default(), span) +} +fn filter_match<'b>( + blob: &'b Blob, + // rule: &'b Rule, + rule: Arc, + re: &Regex, + start: usize, + end: usize, + matches: &mut Vec>, + previous_matches: &mut FxHashMap>, + rule_id: usize, + seen_matches: &mut FxHashSet, + _origin: &OriginSet, + ts_match: Option<&[u8]>, + is_base64: bool, + redact: bool, + filename: &str, + profiler: Option<&Arc>, + inline_ignore_config: &InlineIgnoreConfig, +) { + let mut timer = + profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename)); + + let initial_len = matches.len(); + + let blob_bytes = blob.bytes(); + let default_slice = &blob_bytes[start..end]; + let haystack = ts_match.unwrap_or(default_slice); + + for captures in re.captures_iter(haystack) { + let full_capture = captures.get(0).unwrap(); + let matching_input = captures.get(1).unwrap_or(full_capture); + let min_entropy = 
rule.min_entropy(); + let mi_bytes = matching_input.as_bytes(); + let full_bytes = full_capture.as_bytes(); + let calculated_entropy = calculate_shannon_entropy(mi_bytes); + if calculated_entropy <= min_entropy + || is_safe_match(mi_bytes) + || is_user_match(mi_bytes, full_bytes) + { + debug!( + "Skipping match with entropy {} <= {} or safe match", + calculated_entropy, min_entropy + ); + continue; + } + let matching_input_offset_span = OffsetSpan::from_range( + (start + matching_input.start())..(start + matching_input.end()), + ); + if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) { + debug!("Skipping match due to inline ignore directive"); + continue; + } + let match_key = compute_match_key( + matching_input.as_bytes(), + rule.id().as_bytes(), + matching_input_offset_span.start, + matching_input_offset_span.end, + ); + if !seen_matches.insert(match_key) { + continue; + } + if !record_match(previous_matches, rule_id, matching_input_offset_span) { + continue; + } + let only_matching_input = + &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end]; + let groups = SerializableCaptures::from_captures(&captures, haystack, re, redact); + matches.push(BlobMatch { + rule: Arc::clone(&rule), + blob_id: blob.id_ref(), + matching_input: only_matching_input, + matching_input_offset_span, + captures: groups, + validation_response_body: String::new(), + validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE), + validation_success: false, + calculated_entropy, + is_base64, + }); + } + if let Some(t) = timer.take() { + let new_count = (matches.len() - initial_len) as u64; + t.end(new_count > 0, new_count, 0); + } +} +fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap)> { + match lang.to_lowercase().as_str() { + "bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())), + "c" => Some((Language::C, parser::queries::c::get_c_queries())), + "c#" | 
"csharp" => Some((Language::CSharp, parser::queries::csharp::get_csharp_queries())), + "c++" | "cpp" => Some((Language::Cpp, parser::queries::cpp::get_cpp_queries())), + "css" => Some((Language::Css, parser::queries::css::get_css_queries())), + "go" => Some((Language::Go, parser::queries::go::get_go_queries())), + "html" => Some((Language::Html, parser::queries::html::get_html_queries())), + "java" => Some((Language::Java, parser::queries::java::get_java_queries())), + "javascript" | "js" => { + Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries())) + } + // "kotlin" => Some(( + // Language::Kotlin, + // parser::queries::kotlin::get_kotlin_queries(), + // )), + "php" => Some((Language::Php, parser::queries::php::get_php_queries())), + "python" | "py" | "starlark" => { + Some((Language::Python, parser::queries::python::get_python_queries())) + } + "ruby" => Some((Language::Ruby, parser::queries::ruby::get_ruby_queries())), + "rust" => Some((Language::Rust, parser::queries::rust::get_rust_queries())), + "toml" => Some((Language::Toml, parser::queries::toml::get_toml_queries())), + "typescript" | "ts" => { + Some((Language::TypeScript, parser::queries::typescript::get_typescript_queries())) + } + "yaml" => Some((Language::Yaml, parser::queries::yaml::get_yaml_queries())), + _ => None, + } +} +// ------------------------------------------------------------------------------------------------- +// MatchStats +// ------------------------------------------------------------------------------------------------- +#[derive(Debug, Default, Clone)] +pub struct MatcherStats { + pub blobs_seen: u64, + pub blobs_scanned: u64, + pub bytes_seen: u64, + pub bytes_scanned: u64, + // #[cfg(feature = "rule_profiling")] + // pub rule_stats: crate::rule_profiling::RuleProfile, +} +impl MatcherStats { + pub fn update(&mut self, other: &Self) { + self.blobs_seen += other.blobs_seen; + self.blobs_scanned += other.blobs_scanned; + self.bytes_seen += 
other.bytes_seen; + self.bytes_scanned += other.bytes_scanned; + + // #[cfg(feature = "rule_profiling")] + // self.rule_stats.update(&other.rule_stats); + } +} +// ------------------------------------------------------------------------------------------------- +// Group +// ------------------------------------------------------------------------------------------------- +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)] +pub struct Group(pub Base64BString); +impl Group { + pub fn new(m: regex::bytes::Match<'_>) -> Self { + Self(Base64BString(BString::from(m.as_bytes()))) + } +} +// ------------------------------------------------------------------------------------------------- +// Groups +// ------------------------------------------------------------------------------------------------- +#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +pub struct Groups(pub SmallVec<[Group; 1]>); +impl JsonSchema for Groups { + fn schema_name() -> String { + "Groups".to_string() + } + + fn json_schema(gen: &mut SchemaGenerator) -> Schema { + let group_schema = gen.subschema_for::(); + Schema::Object(schemars::schema::SchemaObject { + instance_type: Some(InstanceType::Array.into()), + array: Some(Box::new(ArrayValidation { + items: Some(group_schema.into()), + ..Default::default() + })), + ..Default::default() + }) + } +} +// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +// pub struct SerializableCapture { +// pub name: Option, // Named group (if available) +// pub match_number: i32, +// pub start: usize, // Start position of the match +// pub end: usize, // End position of the match +// pub value: String, // The actual captured value +// } +#[derive(Debug, Clone, Serialize, JsonSchema)] +pub struct SerializableCapture { + pub name: Option, + pub match_number: i32, + pub start: usize, + pub end: usize, + /// Interned value of the capture. 
+ pub value: &'static str, +} + +#[derive(Debug, Clone, Serialize, JsonSchema)] +pub struct SerializableCaptures { + #[schemars(with = "Vec")] + pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed) +} +impl SerializableCaptures { + pub fn from_captures( + captures: ®ex::bytes::Captures, + _input: &[u8], + re: &Regex, + redact: bool, + ) -> Self { + let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new(); + // Process named captures + for name in re.capture_names().flatten() { + if let Some(capture) = captures.name(name) { + let value = if redact { + redact_value(&String::from_utf8_lossy(capture.as_bytes())) + } else { + String::from_utf8_lossy(capture.as_bytes()).to_string() + }; + serialized_captures.push(SerializableCapture { + name: Some(name.to_string()), + match_number: -1, + start: capture.start(), + end: capture.end(), + value: intern(&value), + }); + } + } + // Process unnamed captures (numbered groups) + for i in 0..captures.len() { + if let Some(capture) = captures.get(i) { + let value = if redact { + redact_value(&String::from_utf8_lossy(capture.as_bytes())) + } else { + String::from_utf8_lossy(capture.as_bytes()).to_string() + }; + serialized_captures.push(SerializableCapture { + name: None, + match_number: i32::try_from(i).unwrap_or(0), + start: capture.start(), + end: capture.end(), + value: intern(&value), + }); + } + } + SerializableCaptures { captures: serialized_captures } + } +} +// ------------------------------------------------------------------------------------------------- +// Match +// ------------------------------------------------------------------------------------------------- +#[derive(Debug, Clone, Serialize, JsonSchema)] +pub struct Match { + /// The location of the entire matching content + pub location: Location, + + /// The capture groups + pub groups: SerializableCaptures, // Store serialized captures + + /// unique identifier of file / blob where this match was found + 
pub blob_id: BlobId, + + /// The unique content-based identifier of this match + pub finding_fingerprint: u64, + + /// The rule that produced this match + #[serde(skip_serializing)] + #[schemars(skip)] + pub rule: Arc, + + /// Validation Body + pub validation_response_body: String, + + /// Validation Status Code + pub validation_response_status: u16, + + /// Validation Success + pub validation_success: bool, + + /// Validation Success + pub calculated_entropy: f32, + + pub visible: bool, + #[serde(default)] + pub is_base64: bool, +} +impl Match { + #[inline] + pub fn convert_owned_blobmatch_to_match<'a>( + loc_mapping: Option<&'a LocationMapping<'a>>, + owned_blob_match: &'a OwnedBlobMatch, + origin_type: &'a str, + ) -> Self { + let offset_span = owned_blob_match.matching_input_offset_span; + // Extract the matched secret content. Use capture group 1 if it exists, otherwise fall back + // to group 0. + let matching_finding_bytes = owned_blob_match + .captures + .captures + .get(1) + .or_else(|| owned_blob_match.captures.captures.get(0)) + .map(|capture| capture.value.as_bytes()) + .unwrap_or_default(); + + // The fingerprint will be based on the content of the secret. 
+ let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or(""); + + let source_span = + loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan { + start: SourcePoint { line: 0, column: 0 }, + end: SourcePoint { line: 0, column: 0 }, + }); + let offset_start: u64 = + owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); + let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); + + let finding_fingerprint = compute_finding_fingerprint( + finding_value_for_fp, + origin_type, // file_or_commit, + offset_start, + offset_end, + ); + + // matching_snippet + Match { + rule: owned_blob_match.rule.clone(), + visible: owned_blob_match.rule.visible().to_owned(), + location: Location { offset_span, source_span: source_span.clone() }, + groups: owned_blob_match.captures.clone(), + blob_id: owned_blob_match.blob_id, + finding_fingerprint, + validation_response_body: owned_blob_match.validation_response_body.clone(), + validation_response_status: owned_blob_match.validation_response_status.as_u16(), + validation_success: owned_blob_match.validation_success, + calculated_entropy: owned_blob_match.calculated_entropy, + is_base64: owned_blob_match.is_base64, + } + } + + /// Returns the `blob_id` of the match. 
+ pub fn get_blob_id(&self) -> BlobId { + self.blob_id.clone() + } + + pub fn finding_id(&self) -> String { + let mut buffer = Vec::with_capacity(128); + buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes()); + buffer.push(0); + serde_json::to_writer(&mut buffer, &self.groups) + .expect("should be able to serialize groups as JSON"); + let mut num = xxh3_64(&buffer); + // Ensure the number is positive and within i64 range + num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive + // Convert to string + num.to_string() + } +} +#[derive(Debug, Clone)] +pub struct DecodedData { + pub original: String, + pub decoded: String, + pub pos_start: usize, + pub pos_end: usize, +} +#[inline] +fn is_base64_byte(b: u8) -> bool { + matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/') +} + +pub fn get_base64_strings(input: &[u8]) -> Vec { + let mut results = Vec::new(); + let mut i = 0; + while i < input.len() { + while i < input.len() && !is_base64_byte(input[i]) { + i += 1; + } + let start = i; + while i < input.len() && is_base64_byte(input[i]) { + i += 1; + } + + let mut eq_count = 0; + while i < input.len() && input[i] == b'=' && eq_count < 2 { + i += 1; + eq_count += 1; + } + let end = i; + + let len = end - start; + if len >= 32 && len % 4 == 0 { + let base64_slice = &input[start..end]; + if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) { + if let Ok(decoded_str) = std::str::from_utf8(&decoded) { + if decoded_str.is_ascii() { + results.push(DecodedData { + original: String::from_utf8_lossy(base64_slice).into_owned(), + decoded: decoded_str.to_string(), + pos_start: start, + pos_end: end, + }); + } + } + } + } + } + + results +} + +pub fn compute_finding_fingerprint( + finding_value: &str, + file_or_commit: &str, + offset_start: u64, + offset_end: u64, +) -> u64 { + // Combine all into a byte buffer and hash it directly: + let mut buf = Vec::with_capacity( + finding_value.len() + file_or_commit.len() + 2 * 
std::mem::size_of::(), + ); + buf.extend_from_slice(finding_value.as_bytes()); + buf.extend_from_slice(file_or_commit.as_bytes()); + buf.extend_from_slice(&offset_start.to_le_bytes()); + buf.extend_from_slice(&offset_end.to_le_bytes()); + + xxh3_64(&buf) +} + +// ------------------------------------------------------------------------------------------------- +// test +// ------------------------------------------------------------------------------------------------- +#[cfg(test)] +mod test { + use std::{collections::BTreeMap, path::PathBuf}; + + use pretty_assertions::assert_eq; + // --------------------------------------------------------------------- + // proptest: raw-match dedup + entropy gate + // --------------------------------------------------------------------- + use proptest::prelude::*; + + use super::*; + use crate::{ + blob::{Blob, BlobIdMap}, + origin::{Origin, OriginSet}, + rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}, + }; + + proptest! 
{ + #[test] + fn prop_no_dupes_and_entropy( + // random ASCII up to 300 bytes + mut noise in proptest::collection::vec(any::().prop_filter("ascii", |b| b.is_ascii()), 0..300), + // 0-4 random insertion points + inserts in proptest::collection::vec(0usize..300, 0..5) + ) { + // Constant high-entropy secret token that matches the rule below + const TOKEN: &[u8] = b"secret_abcd1234"; + + // Splice the token at the requested offsets + for &idx in &inserts { + let pos = idx.min(noise.len()); + noise.splice(pos..pos, TOKEN.iter().copied()); + } + + // ── build a single test rule ────────────────────────────────── + use crate::rules::rule::{RuleSyntax, Validation, Confidence}; + + let rule = Rule::new(RuleSyntax { + id: "prop.secret".into(), + name: "prop secret".into(), + pattern: "secret_[a-z]{4}[0-9]{4}".into(), + confidence: Confidence::Low, + min_entropy: 3.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, // no HTTP validation needed + depends_on_rule: vec![], + }); + + let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap(); + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false).unwrap(); + + // ── run the scan ────────────────────────────────────────────── + m.scan_bytes_raw(&noise, "buf").unwrap(); + + // ── property 1: dedup – each (rule,start,end) is unique ────── + + let mut coords = FxHashSet::default(); + for RawMatch{rule_id, start_idx, end_idx} in &m.user_data.raw_matches_scratch { + assert!( + coords.insert((*rule_id, *start_idx, *end_idx)), + "duplicate raw-match detected for coords ({rule_id},{start_idx},{end_idx})" + ); + + // ── property 2: entropy gate held ──────────────────────── + let slice = &noise[*start_idx as usize .. 
*end_idx as usize]; + let ent = calculate_shannon_entropy(slice); + assert!(ent > 3.0, "entropy {ent} ≤ min_entropy, gate failed"); + } + } + } + + #[test] + pub fn test_simple() -> Result<()> { + let rules = vec![Rule::new(RuleSyntax { + id: "test.1".to_string(), + name: "test".to_string(), + pattern: "test".to_string(), + confidence: crate::rules::rule::Confidence::Medium, + min_entropy: 1.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: Some(Validation::Http(HttpValidation { + request: HttpRequest { + method: "GET".to_string(), + url: "https://example.com".to_string(), + headers: BTreeMap::new(), + body: None, + response_matcher: Some(vec![]), + multipart: None, + response_is_html: false, + }, + multipart: None, + })), + depends_on_rule: vec![ + Some(DependsOnRule { + rule_id: "d8f3c34b-015f-4cd6-b411-b1366493104c".to_string(), + variable: "email".to_string(), + }), + Some(DependsOnRule { + rule_id: "8910f364-7718-4a27-a435-d2da13e6ba9e".to_string(), + variable: "domain".to_string(), + }), + ], + })]; + let rules_db = RulesDatabase::from_rules(rules)?; + let input = "some test data for vectorscan"; + let seen_blobs: BlobIdMap = BlobIdMap::new(); + let enable_rule_profiling = true; + // let mut matcher = Matcher::new(&rules_db, &seen_blobs, None, + // enable_rule_profiling)?; + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new( + &rules_db, + scanner_pool, + &seen_blobs, + None, + enable_rule_profiling, + None, // Pass the shared profiler + false, + )?; + matcher.scan_bytes_raw(input.as_bytes(), "fname")?; + assert_eq!( + matcher.user_data.raw_matches_scratch, + vec![RawMatch { rule_id: 0, start_idx: 0, end_idx: 9 },] + ); + Ok(()) + } + + // --------------------------------------------------------------------- + // additional deterministic unit-tests + // --------------------------------------------------------------------- + + /// 
`get_base64_strings` should recognise a well-formed token, decode it, + /// and report correct byte-offsets. + #[test] + fn test_get_base64_strings_basic() { + let raw = b"foo MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY= bar"; + // decodes to "0123456789abcdef0123456789abcdef" + let hits = get_base64_strings(raw); + assert_eq!(hits.len(), 1); + let item = &hits[0]; + assert_eq!(item.decoded, "0123456789abcdef0123456789abcdef"); + assert_eq!(item.original, "MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY="); + // "foo␠" is 4 bytes, so the start offset is 4 + assert_eq!((item.pos_start, item.pos_end), (4, 4 + item.original.len())); + } + + /// `compute_finding_fingerprint` must be stable (same input ⇒ same output) + /// and sensitive to any input component. + #[test] + fn test_finding_fingerprint_stability_and_uniqueness() { + let a = compute_finding_fingerprint("secret", "fileA", 0, 6); + let b = compute_finding_fingerprint("secret", "fileA", 0, 6); + assert_eq!(a, b, "fingerprint should be deterministic"); + + // changing any parameter should perturb the hash + let c = compute_finding_fingerprint("secret", "fileA", 1, 7); // offsets differ + let d = compute_finding_fingerprint("secret", "fileB", 0, 6); // file id differs + let e = compute_finding_fingerprint("different", "fileA", 0, 6); // content differs + assert_ne!(a, c); + assert_ne!(a, d); + assert_ne!(a, e); + } + + /// The (private) `compute_match_key` helper is the linchpin of the raw-dedup + /// path. It should return identical keys for identical inputs and different + /// keys as soon as *anything* changes. 
+ #[test] + fn test_compute_match_key_uniqueness() { + use super::compute_match_key; + + let k1 = compute_match_key(b"abc", b"rule-1", 0, 3); + let k2 = compute_match_key(b"abc", b"rule-1", 0, 3); + assert_eq!(k1, k2); + + // mutate each component in turn + let diff_content = compute_match_key(b"abcd", b"rule-1", 0, 4); + let diff_rule = compute_match_key(b"abc", b"rule-2", 0, 3); + let diff_span = compute_match_key(b"abc", b"rule-1", 1, 4); + assert_ne!(k1, diff_content); + assert_ne!(k1, diff_rule); + assert_ne!(k1, diff_span); + } + + /// Running `scan_bytes_raw` twice over the *same* input should never record + /// duplicate entries in `raw_matches_scratch`. + #[test] + fn test_scan_bytes_raw_no_duplicate_raw_matches() -> Result<()> { + // simple rule: literal "dup" + let rule = Rule::new(RuleSyntax { + id: "dup.check".into(), + name: "dup".into(), + pattern: "dup".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + + let buf = b"dup dup"; // two literal hits, same rule + + // first scan + m.scan_bytes_raw(buf, "buf1")?; + let first_len = m.user_data.raw_matches_scratch.len(); + + // second scan over the same buffer + m.scan_bytes_raw(buf, "buf1")?; + let second_len = m.user_data.raw_matches_scratch.len(); + + // we should still only have two unique raw matches recorded + assert_eq!(first_len, 2); + assert_eq!(second_len, 2); + Ok(()) + } + + #[test] + fn inline_comment_skips_match() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.ignore".into(), + name: "inline".into(), + pattern: "secret_token".into(), + confidence: 
crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + + let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt"))); + + match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } + + #[test] + fn inline_comment_after_multiline_secret_skips_match() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.multiline".into(), + name: "inline multiline".into(), + pattern: "line1\\s+line2".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + + let blob = Blob::from_bytes( + br#"let data = """ +line1 +line2 +""" +# kingfisher:ignore +"# + .to_vec(), + ); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt"))); + + match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } + + #[test] + fn compat_flag_controls_external_directives() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.compat".into(), + name: "inline compat".into(), + pattern: "supersecret123".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + + let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt"))); + + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + let matches_without_compat = + match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => matches.len(), + _ => panic!("unexpected scan result"), + }; + assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag"); + + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true)?; + match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } +} diff --git a/src/matcher.rs.rej b/src/matcher.rs.rej new file mode 100644 index 0000000..dc61305 --- /dev/null +++ b/src/matcher.rs.rej @@ -0,0 +1,52 @@ +@@ -1,50 +1,51 @@ + use std::{ + hash::{Hash, Hasher}, + str, + sync::{Arc, Mutex}, + }; + + use anyhow::Result; + use base64::{engine::general_purpose, Engine}; + use bstr::BString; + use http::StatusCode; + use regex::bytes::Regex; + use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; + use schemars::{ + gen::SchemaGenerator, + schema::{ArrayValidation, InstanceType, Schema}, + JsonSchema, + }; + use serde::{Deserialize, Serialize}; + use smallvec::SmallVec; + use tracing::debug; + use xxhash_rust::xxh3::xxh3_64; + + use crate::{ + blob::{Blob, BlobId, BlobIdMap}, + entropy::calculate_shannon_entropy, ++ inline_ignore::InlineIgnoreConfig, + location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, + origin::OriginSet, + parser, + parser::{Checker, Language}, + rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, + rules::rule::Rule, + rules_database::RulesDatabase, + safe_list::{is_safe_match, is_user_match}, + scanner_pool::ScannerPool, + snippet::Base64BString, + util::{intern, redact_value}, + }; + + const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment + const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches + const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs + const TREE_SITTER_SCAN_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs ≤64 KiB + + // ------------------------------------------------------------------------------------------------- + // RawMatch + // ------------------------------------------------------------------------------------------------- + /// A raw match, as recorded by a callback to Vectorscan. 
+ /// + /// When matching with Vectorscan, we simply collect all matches into a + /// preallocated `Vec`, and then go through them all after scanning is complete. diff --git a/src/reporter.rs b/src/reporter.rs index 73bc541..f5d33f5 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -842,6 +842,7 @@ mod tests { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 256.0, exclude: Vec::new(), diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 8b4f59c..941b15c 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -143,6 +143,7 @@ mod tests { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index df56427..22d3204 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -144,6 +144,7 @@ pub fn enumerate_filesystem_inputs( Some(&matcher_stats), enable_profiling, Some(shared_profiler), + args.compat_ignore_comments, )?; let blob_processor_init_time = Mutex::new(t1.elapsed()); let make_blob_processor = || -> BlobProcessor { diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index eb4ad10..9b72359 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -622,6 +622,7 @@ pub async fn fetch_s3_objects( Some(matcher_stats), enable_profiling, Some(shared_profiler.clone()), + args.compat_ignore_comments, )?; let mut processor = BlobProcessor { matcher }; diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs index 72bd950..7820b9c 100644 --- a/tests/int_allowlist.rs +++ b/tests/int_allowlist.rs @@ -112,6 +112,7 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, 
no_extract_archives: false, diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index cd83a7f..931f1c5 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -131,6 +131,7 @@ rules: since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 5.0, extraction_depth: 1, diff --git a/tests/int_github.rs b/tests/int_github.rs index 06c67a7..b158a59 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -118,6 +118,7 @@ fn test_github_remote_scan() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index e55655a..abbfe72 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -116,6 +116,7 @@ fn test_gitlab_remote_scan() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, @@ -251,6 +252,7 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, diff --git a/tests/int_redact.rs b/tests/int_redact.rs index 48247af..a5a50c6 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -95,6 +95,7 @@ async fn test_redact_hashes_finding_values() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, diff --git a/tests/int_slack.rs b/tests/int_slack.rs index 2575a3c..b38f93c 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -102,6 +102,7 @@ impl TestContext { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: 
ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, @@ -225,6 +226,7 @@ async fn test_scan_slack_messages() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index ea1c809..0f66766 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -174,6 +174,7 @@ async fn test_validation_cache_and_depvars() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index b87d721..d79c167 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -117,6 +117,7 @@ impl TestContext { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, @@ -227,6 +228,7 @@ impl TestContext { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, From b2a62a9c8ad56c22d1df221c2393cf4760a032a2 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 9 Oct 2025 17:59:22 -0700 Subject: [PATCH 2/2] - Added kingfisher:ignore (or kingfisher:allow) to silence a finding inline within a file - Added: to reuse existing inline directives from other scanners, pass --compat-ignore-comments to also accept NOSONAR, kics-scan ignore, gitleaks:allow and trufflehog:ignore --- src/matcher.rs.orig | 1317 ------------------------------------------- src/matcher.rs.rej | 52 -- 2 files changed, 1369 deletions(-) delete mode 100644 src/matcher.rs.orig delete mode 100644 src/matcher.rs.rej diff --git a/src/matcher.rs.orig b/src/matcher.rs.orig deleted file mode 
100644 index 2688875..0000000 --- a/src/matcher.rs.orig +++ /dev/null @@ -1,1317 +0,0 @@ -use std::{ - hash::{Hash, Hasher}, - str, - sync::{Arc, Mutex}, -}; - -use anyhow::Result; -use base64::{engine::general_purpose, Engine}; -use bstr::BString; -use http::StatusCode; -use regex::bytes::Regex; -use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; -use schemars::{ - gen::SchemaGenerator, - schema::{ArrayValidation, InstanceType, Schema}, - JsonSchema, -}; -use serde::{Deserialize, Serialize}; -use smallvec::SmallVec; -use tracing::debug; -use xxhash_rust::xxh3::xxh3_64; - -use crate::{ - blob::{Blob, BlobId, BlobIdMap}, - entropy::calculate_shannon_entropy, - inline_ignore::InlineIgnoreConfig, - location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, - origin::OriginSet, - parser, - parser::{Checker, Language}, - rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, - rules::rule::Rule, - rules_database::RulesDatabase, - safe_list::{is_safe_match, is_user_match}, - scanner_pool::ScannerPool, - snippet::Base64BString, - util::{intern, redact_value}, -}; - -const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment -const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches -const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs -const TREE_SITTER_MAX_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs <= 64 KiB -const TREE_SITTER_MIN_LIMIT: usize = 1 * 1024; // only run tree-sitter on blobs >= 1 KiB - -// ------------------------------------------------------------------------------------------------- -// RawMatch -// ------------------------------------------------------------------------------------------------- -/// A raw match, as recorded by a callback to Vectorscan. -/// -/// When matching with Vectorscan, we simply collect all matches into a -/// preallocated `Vec`, and then go through them all after scanning is complete. 
-#[derive(PartialEq, Eq, Debug, Clone)] -struct RawMatch { - rule_id: u32, - start_idx: u64, - end_idx: u64, -} -#[derive(Clone)] -pub struct OwnedBlobMatch { - pub rule: Arc, - pub blob_id: BlobId, - /// The unique content-based identifier of this match - pub finding_fingerprint: u64, - pub matching_input_offset_span: OffsetSpan, - pub captures: SerializableCaptures, - pub validation_response_body: String, - pub validation_response_status: StatusCode, - pub validation_success: bool, - pub calculated_entropy: f32, - pub is_base64: bool, -} -impl<'a> Matcher<'a> { - pub fn get_profiling_report(&self) -> Option> { - self.profiler.as_ref().map(|p| p.generate_report()) - } -} -impl OwnedBlobMatch { - pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc) -> OwnedBlobMatch { - OwnedBlobMatch { - rule, - blob_id: m.blob_id, - finding_fingerprint: m.finding_fingerprint, - // matching_input: m.snippet.matching.0.to_vec(), - matching_input_offset_span: m.location.offset_span.clone(), - captures: m.groups.clone(), - validation_response_body: m.validation_response_body.clone(), - validation_response_status: StatusCode::from_u16(m.validation_response_status) - .unwrap_or(StatusCode::CONTINUE), - validation_success: m.validation_success, - calculated_entropy: m.calculated_entropy, - is_base64: m.is_base64, - } - } - - pub fn from_blob_match(blob_match: BlobMatch) -> Self { - // Get the matching value from capture group 1 (or 0 if not available) - let matching_finding = blob_match - .captures - .captures - .get(1) - .or_else(|| blob_match.captures.captures.get(0)) - .map(|capture| capture.value.as_bytes().to_vec()) - .unwrap_or_else(Vec::new); - - let mut owned_blob_match = OwnedBlobMatch { - rule: blob_match.rule, - blob_id: blob_match.blob_id.clone(), - matching_input_offset_span: blob_match.matching_input_offset_span, - captures: blob_match.captures.clone(), - validation_response_body: blob_match.validation_response_body, - validation_response_status: 
blob_match.validation_response_status, - validation_success: blob_match.validation_success, - calculated_entropy: blob_match.calculated_entropy, - finding_fingerprint: 0, //default - is_base64: blob_match.is_base64, - }; - - // Convert matching_finding to a &str (using lossy conversion if needed) - let finding_value = std::str::from_utf8(&matching_finding).unwrap_or(""); - // Use blob_id as the file/commit identifier - let file_or_commit = &blob_match.blob_id.to_string(); - - let offset_start: u64 = - owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); - let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); - - owned_blob_match.finding_fingerprint = - compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end); - - owned_blob_match - } -} -// ------------------------------------------------------------------------------------------------- -// BlobMatch -// ------------------------------------------------------------------------------------------------- -/// A `BlobMatch` is the result type from `Matcher::scan_blob`. -/// -/// It is mostly made up of references and small data. -/// For a representation that is more friendly for human consumption, see -/// `Match`. 
-pub struct BlobMatch<'a> { - /// The rule that was matched - pub rule: Arc, // Changed from `&'a Rule` to `Arc, - - pub validation_response_body: String, - pub validation_response_status: StatusCode, - - pub validation_success: bool, - pub calculated_entropy: f32, - pub is_base64: bool, -} -#[derive(Clone)] -struct UserData { - /// A scratch vector for raw matches from Vectorscan, to minimize allocation - raw_matches_scratch: Vec, - - /// The length of the input being scanned - input_len: u64, -} -// ------------------------------------------------------------------------------------------------- -// Matcher -// ------------------------------------------------------------------------------------------------- -/// A `Matcher` is able to scan inputs for matches from rules in a -/// `RulesDatabase`. -/// -/// If doing multi-threaded scanning, use a separate `Matcher` for each thread. -#[derive(Clone)] -pub struct Matcher<'a> { - /// Thread-local pool that hands out a &mut BlockScanner - scanner_pool: std::sync::Arc, - - /// The rules database used for matching - rules_db: &'a RulesDatabase, - - /// Local statistics for this `Matcher` - local_stats: MatcherStats, - - /// Global statistics, updated with the local statsistics when this - /// `Matcher` is dropped - global_stats: Option<&'a Mutex>, - - /// The set of blobs that have been seen - seen_blobs: &'a BlobIdMap, - - /// Data passed to the Vectorscan callback - user_data: UserData, - - /// Rule profiler for measuring performance of individual rules - profiler: Option>, - - /// Configuration that controls inline ignore directives - inline_ignore_config: InlineIgnoreConfig, -} -/// This `Drop` implementation updates the `global_stats` with the local stats -impl<'a> Drop for Matcher<'a> { - fn drop(&mut self) { - if let Some(global_stats) = self.global_stats { - let mut global_stats = global_stats.lock().unwrap(); - global_stats.update(&self.local_stats); - } - } -} -pub enum ScanResult<'a> { - SeenWithMatches, - 
SeenSansMatches, - New(Vec>), -} -impl<'a> Matcher<'a> { - /// Create a new `Matcher` from the given `RulesDatabase`. - /// - /// If `global_stats` is provided, it will be updated with the local stats - /// from this `Matcher` when it is dropped. - pub fn new( - rules_db: &'a RulesDatabase, - scanner_pool: Arc, - seen_blobs: &'a BlobIdMap, - global_stats: Option<&'a Mutex>, - enable_profiling: bool, - shared_profiler: Option>, - include_external_ignore_syntax: bool, - ) -> Result { - // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec - let raw_matches_scratch = Vec::new(); - let user_data = UserData { raw_matches_scratch, input_len: 0 }; - // let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?; - // pool is created once per scan run (see Scanner section below) - let profiler = shared_profiler.or_else(|| { - if enable_profiling { - Some(Arc::new(ConcurrentRuleProfiler::new())) - } else { - None - } - }); - Ok(Matcher { - scanner_pool, - rules_db, - local_stats: MatcherStats::default(), - global_stats, - seen_blobs, - user_data, - profiler, - inline_ignore_config: InlineIgnoreConfig::new(include_external_ignore_syntax), - }) - } - - fn scan_bytes_raw(&mut self, input: &[u8], _filename: &str) -> Result<()> { - // Remember previous peak automatically - let prev_capacity = self.user_data.raw_matches_scratch.capacity(); - self.user_data.raw_matches_scratch.clear(); - self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64)); - - self.user_data.input_len = input.len() as u64; - - let mut offset: usize = 0; - while offset < input.len() { - let end = (offset + MAX_CHUNK_SIZE).min(input.len()); - let slice = &input[offset..end]; - let base = offset as u64; - self.scanner_pool.with(|scanner| { - scanner.scan(slice, |rule_id, from, to, _flags| { - self.user_data.raw_matches_scratch.push(RawMatch { - rule_id, - start_idx: from + base, - end_idx: to + base, - }); - vectorscan_rs::Scan::Continue - }) - })?; - - if end == 
input.len() { - break; - } - offset = end.saturating_sub(CHUNK_OVERLAP); - } - - Ok(()) - } - - pub fn scan_blob<'b>( - &mut self, - blob: &'b Blob, - origin: &OriginSet, - lang: Option, - redact: bool, - no_dedup: bool, - no_base64: bool, - ) -> Result> - where - 'a: 'b, - { - // Update local stats - self.local_stats.blobs_seen += 1; - self.local_stats.bytes_seen += blob.bytes().len() as u64; - self.local_stats.blobs_scanned += 1; - self.local_stats.bytes_scanned += blob.bytes().len() as u64; - - // Extract filename from origin - let filename = origin - .first() - .blob_path() - .and_then(|path| path.file_name()) - .and_then(|name| name.to_str()) - .unwrap_or("unknown_file") - .to_string(); - // Perform the scan - self.scan_bytes_raw(&blob.bytes(), &filename)?; - - // Opportunistically look for standalone Base64 blobs. If neither - // the raw scan nor this check yields anything, we can return early - // before doing any heavier work. - let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT { - Vec::new() - } else { - get_base64_strings(blob.bytes()) - }; - - let lang_hint = lang.as_deref(); - let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty(); - let has_base64_items = !b64_items.is_empty(); - - if !has_raw_matches && !has_base64_items { - return Ok(ScanResult::New(Vec::new())); - } - - let rules_db = self.rules_db; - let mut seen_matches = FxHashSet::default(); - let mut previous_matches: FxHashMap> = FxHashMap::default(); - - let blob_len = blob.len(); - - let should_run_tree_sitter = blob_len > 0 - && blob_len <= TREE_SITTER_MAX_LIMIT - && blob_len >= TREE_SITTER_MIN_LIMIT - && has_raw_matches - && lang_hint.is_some() - && !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled - - let tree_sitter_result = if should_run_tree_sitter { - lang_hint.and_then(|lang_str| { - get_language_and_queries(lang_str).and_then(|(language, queries)| { - let checker = Checker { language, rules: queries }; - match 
checker.check(&blob.bytes()) { - Ok(results) => Some(results), - Err(e) => { - println!("Error in checker.check: {}", e); - None - } - } - }) - }) - } else { - None - }; - // Process matches - let mut matches = Vec::new(); - let owned_ts_results = tree_sitter_result.map(|ts_results| { - ts_results - .into_iter() - .filter(|match_result| match_result.is_base64_decoded) - .map(|match_result| { - ( - match_result.range, - match_result.text, - match_result.is_base64_decoded, - match_result.original_base64, - ) - }) - .collect::>() - }); - let mut previous_raw_matches: FxHashMap> = FxHashMap::default(); - for &RawMatch { rule_id, start_idx, end_idx } in - self.user_data.raw_matches_scratch.iter().rev() - { - let rule_id_usize: usize = rule_id as usize; - let rule = Arc::clone(&rules_db.rules[rule_id_usize]); - let re = &rules_db.anchored_regexes[rule_id_usize]; - let start_idx_usize = start_idx as usize; - let end_idx_usize = end_idx as usize; - let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize); - if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) { - continue; - } - filter_match( - blob, - rule, - re, - start_idx_usize, - end_idx_usize, - &mut matches, - &mut previous_matches, - rule_id_usize, - &mut seen_matches, - origin, - None, - false, - redact, - &filename, - self.profiler.as_ref(), - &self.inline_ignore_config, - ); - } - // If tree-sitter produced base64-decoded matches, try them against all rules - if let Some(ref ts_results) = owned_ts_results { - for (ts_range, ts_match, is_base64_decoded, _original_base64) in ts_results.iter() { - if *is_base64_decoded { - for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { - let re = &rules_db.anchored_regexes[rule_id_usize]; - filter_match( - blob, - rule.clone(), - re, - ts_range.start, - ts_range.end, - &mut matches, - &mut previous_matches, - rule_id_usize, - &mut seen_matches, - origin, - Some(ts_match.as_bytes()), - *is_base64_decoded, - redact, - &filename, 
- self.profiler.as_ref(), - &self.inline_ignore_config, - ); - } - } - } - } - - if !no_base64 { - // If the blob contains standalone Base64 blobs, decode and scan them as well - const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep - let mut b64_stack: Vec<(DecodedData, usize)> = - b64_items.drain(..).map(|d| (d, 0)).collect(); - while let Some((item, depth)) = b64_stack.pop() { - for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { - let re = &rules_db.anchored_regexes[rule_id_usize]; - filter_match( - blob, - rule.clone(), - re, - item.pos_start, - item.pos_end, - &mut matches, - &mut previous_matches, - rule_id_usize, - &mut seen_matches, - origin, - Some(item.decoded.as_bytes()), - true, - redact, - &filename, - self.profiler.as_ref(), - &self.inline_ignore_config, - ); - } - if depth + 1 < MAX_B64_DEPTH { - for nested in get_base64_strings(item.decoded.as_bytes()) { - b64_stack.push(( - DecodedData { - original: nested.original, - decoded: nested.decoded, - pos_start: item.pos_start, - pos_end: item.pos_end, - }, - depth + 1, - )); - } - } - } - } - // Finalize - if !no_dedup && !matches.is_empty() { - let blob_id = blob.id(); - if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) { - return Ok(if had_matches { - ScanResult::SeenWithMatches - } else { - ScanResult::SeenSansMatches - }); - } - } - - // --- opportunistic capacity cap --------------------------------- - if self.user_data.raw_matches_scratch.capacity() - > self.user_data.raw_matches_scratch.len() * 4 - { - // Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob - // so the cost is hidden off the hot path. 
- self.user_data.raw_matches_scratch.shrink_to_fit(); - } - - Ok(ScanResult::New(matches)) - // Ok(result) - } -} - -#[inline] -fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 { - let mut hasher = FxHasher::default(); - // Hash each component directly without allocation - content.hash(&mut hasher); - rule_id.hash(&mut hasher); - start.hash(&mut hasher); - end.hash(&mut hasher); - hasher.finish() -} - -#[inline] -fn insert_span(spans: &mut Vec, span: OffsetSpan) -> bool { - let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i); - if idx > 0 { - if spans[idx - 1].fully_contains(&span) { - return false; - } - if span.fully_contains(&spans[idx - 1]) { - spans.remove(idx - 1); - idx -= 1; - } - } - if idx < spans.len() { - if spans[idx].fully_contains(&span) { - return false; - } - if span.fully_contains(&spans[idx]) { - spans.remove(idx); - } - } - spans.insert(idx, span); - true -} - -#[inline] -fn record_match( - map: &mut FxHashMap>, - rule_id: usize, - span: OffsetSpan, -) -> bool { - insert_span(map.entry(rule_id).or_default(), span) -} -fn filter_match<'b>( - blob: &'b Blob, - // rule: &'b Rule, - rule: Arc, - re: &Regex, - start: usize, - end: usize, - matches: &mut Vec>, - previous_matches: &mut FxHashMap>, - rule_id: usize, - seen_matches: &mut FxHashSet, - _origin: &OriginSet, - ts_match: Option<&[u8]>, - is_base64: bool, - redact: bool, - filename: &str, - profiler: Option<&Arc>, - inline_ignore_config: &InlineIgnoreConfig, -) { - let mut timer = - profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename)); - - let initial_len = matches.len(); - - let blob_bytes = blob.bytes(); - let default_slice = &blob_bytes[start..end]; - let haystack = ts_match.unwrap_or(default_slice); - - for captures in re.captures_iter(haystack) { - let full_capture = captures.get(0).unwrap(); - let matching_input = captures.get(1).unwrap_or(full_capture); - let min_entropy = 
rule.min_entropy(); - let mi_bytes = matching_input.as_bytes(); - let full_bytes = full_capture.as_bytes(); - let calculated_entropy = calculate_shannon_entropy(mi_bytes); - if calculated_entropy <= min_entropy - || is_safe_match(mi_bytes) - || is_user_match(mi_bytes, full_bytes) - { - debug!( - "Skipping match with entropy {} <= {} or safe match", - calculated_entropy, min_entropy - ); - continue; - } - let matching_input_offset_span = OffsetSpan::from_range( - (start + matching_input.start())..(start + matching_input.end()), - ); - if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) { - debug!("Skipping match due to inline ignore directive"); - continue; - } - let match_key = compute_match_key( - matching_input.as_bytes(), - rule.id().as_bytes(), - matching_input_offset_span.start, - matching_input_offset_span.end, - ); - if !seen_matches.insert(match_key) { - continue; - } - if !record_match(previous_matches, rule_id, matching_input_offset_span) { - continue; - } - let only_matching_input = - &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end]; - let groups = SerializableCaptures::from_captures(&captures, haystack, re, redact); - matches.push(BlobMatch { - rule: Arc::clone(&rule), - blob_id: blob.id_ref(), - matching_input: only_matching_input, - matching_input_offset_span, - captures: groups, - validation_response_body: String::new(), - validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE), - validation_success: false, - calculated_entropy, - is_base64, - }); - } - if let Some(t) = timer.take() { - let new_count = (matches.len() - initial_len) as u64; - t.end(new_count > 0, new_count, 0); - } -} -fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap)> { - match lang.to_lowercase().as_str() { - "bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())), - "c" => Some((Language::C, parser::queries::c::get_c_queries())), - "c#" | 
"csharp" => Some((Language::CSharp, parser::queries::csharp::get_csharp_queries())), - "c++" | "cpp" => Some((Language::Cpp, parser::queries::cpp::get_cpp_queries())), - "css" => Some((Language::Css, parser::queries::css::get_css_queries())), - "go" => Some((Language::Go, parser::queries::go::get_go_queries())), - "html" => Some((Language::Html, parser::queries::html::get_html_queries())), - "java" => Some((Language::Java, parser::queries::java::get_java_queries())), - "javascript" | "js" => { - Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries())) - } - // "kotlin" => Some(( - // Language::Kotlin, - // parser::queries::kotlin::get_kotlin_queries(), - // )), - "php" => Some((Language::Php, parser::queries::php::get_php_queries())), - "python" | "py" | "starlark" => { - Some((Language::Python, parser::queries::python::get_python_queries())) - } - "ruby" => Some((Language::Ruby, parser::queries::ruby::get_ruby_queries())), - "rust" => Some((Language::Rust, parser::queries::rust::get_rust_queries())), - "toml" => Some((Language::Toml, parser::queries::toml::get_toml_queries())), - "typescript" | "ts" => { - Some((Language::TypeScript, parser::queries::typescript::get_typescript_queries())) - } - "yaml" => Some((Language::Yaml, parser::queries::yaml::get_yaml_queries())), - _ => None, - } -} -// ------------------------------------------------------------------------------------------------- -// MatchStats -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Default, Clone)] -pub struct MatcherStats { - pub blobs_seen: u64, - pub blobs_scanned: u64, - pub bytes_seen: u64, - pub bytes_scanned: u64, - // #[cfg(feature = "rule_profiling")] - // pub rule_stats: crate::rule_profiling::RuleProfile, -} -impl MatcherStats { - pub fn update(&mut self, other: &Self) { - self.blobs_seen += other.blobs_seen; - self.blobs_scanned += other.blobs_scanned; - self.bytes_seen += 
other.bytes_seen; - self.bytes_scanned += other.bytes_scanned; - - // #[cfg(feature = "rule_profiling")] - // self.rule_stats.update(&other.rule_stats); - } -} -// ------------------------------------------------------------------------------------------------- -// Group -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)] -pub struct Group(pub Base64BString); -impl Group { - pub fn new(m: regex::bytes::Match<'_>) -> Self { - Self(Base64BString(BString::from(m.as_bytes()))) - } -} -// ------------------------------------------------------------------------------------------------- -// Groups -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub struct Groups(pub SmallVec<[Group; 1]>); -impl JsonSchema for Groups { - fn schema_name() -> String { - "Groups".to_string() - } - - fn json_schema(gen: &mut SchemaGenerator) -> Schema { - let group_schema = gen.subschema_for::(); - Schema::Object(schemars::schema::SchemaObject { - instance_type: Some(InstanceType::Array.into()), - array: Some(Box::new(ArrayValidation { - items: Some(group_schema.into()), - ..Default::default() - })), - ..Default::default() - }) - } -} -// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] -// pub struct SerializableCapture { -// pub name: Option, // Named group (if available) -// pub match_number: i32, -// pub start: usize, // Start position of the match -// pub end: usize, // End position of the match -// pub value: String, // The actual captured value -// } -#[derive(Debug, Clone, Serialize, JsonSchema)] -pub struct SerializableCapture { - pub name: Option, - pub match_number: i32, - pub start: usize, - pub end: usize, - /// Interned value of the capture. 
- pub value: &'static str, -} - -#[derive(Debug, Clone, Serialize, JsonSchema)] -pub struct SerializableCaptures { - #[schemars(with = "Vec")] - pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed) -} -impl SerializableCaptures { - pub fn from_captures( - captures: ®ex::bytes::Captures, - _input: &[u8], - re: &Regex, - redact: bool, - ) -> Self { - let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new(); - // Process named captures - for name in re.capture_names().flatten() { - if let Some(capture) = captures.name(name) { - let value = if redact { - redact_value(&String::from_utf8_lossy(capture.as_bytes())) - } else { - String::from_utf8_lossy(capture.as_bytes()).to_string() - }; - serialized_captures.push(SerializableCapture { - name: Some(name.to_string()), - match_number: -1, - start: capture.start(), - end: capture.end(), - value: intern(&value), - }); - } - } - // Process unnamed captures (numbered groups) - for i in 0..captures.len() { - if let Some(capture) = captures.get(i) { - let value = if redact { - redact_value(&String::from_utf8_lossy(capture.as_bytes())) - } else { - String::from_utf8_lossy(capture.as_bytes()).to_string() - }; - serialized_captures.push(SerializableCapture { - name: None, - match_number: i32::try_from(i).unwrap_or(0), - start: capture.start(), - end: capture.end(), - value: intern(&value), - }); - } - } - SerializableCaptures { captures: serialized_captures } - } -} -// ------------------------------------------------------------------------------------------------- -// Match -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Clone, Serialize, JsonSchema)] -pub struct Match { - /// The location of the entire matching content - pub location: Location, - - /// The capture groups - pub groups: SerializableCaptures, // Store serialized captures - - /// unique identifier of file / blob where this match was found - 
pub blob_id: BlobId, - - /// The unique content-based identifier of this match - pub finding_fingerprint: u64, - - /// The rule that produced this match - #[serde(skip_serializing)] - #[schemars(skip)] - pub rule: Arc, - - /// Validation Body - pub validation_response_body: String, - - /// Validation Status Code - pub validation_response_status: u16, - - /// Validation Success - pub validation_success: bool, - - /// Validation Success - pub calculated_entropy: f32, - - pub visible: bool, - #[serde(default)] - pub is_base64: bool, -} -impl Match { - #[inline] - pub fn convert_owned_blobmatch_to_match<'a>( - loc_mapping: Option<&'a LocationMapping<'a>>, - owned_blob_match: &'a OwnedBlobMatch, - origin_type: &'a str, - ) -> Self { - let offset_span = owned_blob_match.matching_input_offset_span; - // Extract the matched secret content. Use capture group 1 if it exists, otherwise fall back - // to group 0. - let matching_finding_bytes = owned_blob_match - .captures - .captures - .get(1) - .or_else(|| owned_blob_match.captures.captures.get(0)) - .map(|capture| capture.value.as_bytes()) - .unwrap_or_default(); - - // The fingerprint will be based on the content of the secret. 
- let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or(""); - - let source_span = - loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan { - start: SourcePoint { line: 0, column: 0 }, - end: SourcePoint { line: 0, column: 0 }, - }); - let offset_start: u64 = - owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); - let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); - - let finding_fingerprint = compute_finding_fingerprint( - finding_value_for_fp, - origin_type, // file_or_commit, - offset_start, - offset_end, - ); - - // matching_snippet - Match { - rule: owned_blob_match.rule.clone(), - visible: owned_blob_match.rule.visible().to_owned(), - location: Location { offset_span, source_span: source_span.clone() }, - groups: owned_blob_match.captures.clone(), - blob_id: owned_blob_match.blob_id, - finding_fingerprint, - validation_response_body: owned_blob_match.validation_response_body.clone(), - validation_response_status: owned_blob_match.validation_response_status.as_u16(), - validation_success: owned_blob_match.validation_success, - calculated_entropy: owned_blob_match.calculated_entropy, - is_base64: owned_blob_match.is_base64, - } - } - - /// Returns the `blob_id` of the match. 
- pub fn get_blob_id(&self) -> BlobId { - self.blob_id.clone() - } - - pub fn finding_id(&self) -> String { - let mut buffer = Vec::with_capacity(128); - buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes()); - buffer.push(0); - serde_json::to_writer(&mut buffer, &self.groups) - .expect("should be able to serialize groups as JSON"); - let mut num = xxh3_64(&buffer); - // Ensure the number is positive and within i64 range - num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive - // Convert to string - num.to_string() - } -} -#[derive(Debug, Clone)] -pub struct DecodedData { - pub original: String, - pub decoded: String, - pub pos_start: usize, - pub pos_end: usize, -} -#[inline] -fn is_base64_byte(b: u8) -> bool { - matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/') -} - -pub fn get_base64_strings(input: &[u8]) -> Vec { - let mut results = Vec::new(); - let mut i = 0; - while i < input.len() { - while i < input.len() && !is_base64_byte(input[i]) { - i += 1; - } - let start = i; - while i < input.len() && is_base64_byte(input[i]) { - i += 1; - } - - let mut eq_count = 0; - while i < input.len() && input[i] == b'=' && eq_count < 2 { - i += 1; - eq_count += 1; - } - let end = i; - - let len = end - start; - if len >= 32 && len % 4 == 0 { - let base64_slice = &input[start..end]; - if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) { - if let Ok(decoded_str) = std::str::from_utf8(&decoded) { - if decoded_str.is_ascii() { - results.push(DecodedData { - original: String::from_utf8_lossy(base64_slice).into_owned(), - decoded: decoded_str.to_string(), - pos_start: start, - pos_end: end, - }); - } - } - } - } - } - - results -} - -pub fn compute_finding_fingerprint( - finding_value: &str, - file_or_commit: &str, - offset_start: u64, - offset_end: u64, -) -> u64 { - // Combine all into a byte buffer and hash it directly: - let mut buf = Vec::with_capacity( - finding_value.len() + file_or_commit.len() + 2 * 
std::mem::size_of::(), - ); - buf.extend_from_slice(finding_value.as_bytes()); - buf.extend_from_slice(file_or_commit.as_bytes()); - buf.extend_from_slice(&offset_start.to_le_bytes()); - buf.extend_from_slice(&offset_end.to_le_bytes()); - - xxh3_64(&buf) -} - -// ------------------------------------------------------------------------------------------------- -// test -// ------------------------------------------------------------------------------------------------- -#[cfg(test)] -mod test { - use std::{collections::BTreeMap, path::PathBuf}; - - use pretty_assertions::assert_eq; - // --------------------------------------------------------------------- - // proptest: raw-match dedup + entropy gate - // --------------------------------------------------------------------- - use proptest::prelude::*; - - use super::*; - use crate::{ - blob::{Blob, BlobIdMap}, - origin::{Origin, OriginSet}, - rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}, - }; - - proptest! 
{ - #[test] - fn prop_no_dupes_and_entropy( - // random ASCII up to 300 bytes - mut noise in proptest::collection::vec(any::().prop_filter("ascii", |b| b.is_ascii()), 0..300), - // 0-4 random insertion points - inserts in proptest::collection::vec(0usize..300, 0..5) - ) { - // Constant high-entropy secret token that matches the rule below - const TOKEN: &[u8] = b"secret_abcd1234"; - - // Splice the token at the requested offsets - for &idx in &inserts { - let pos = idx.min(noise.len()); - noise.splice(pos..pos, TOKEN.iter().copied()); - } - - // ── build a single test rule ────────────────────────────────── - use crate::rules::rule::{RuleSyntax, Validation, Confidence}; - - let rule = Rule::new(RuleSyntax { - id: "prop.secret".into(), - name: "prop secret".into(), - pattern: "secret_[a-z]{4}[0-9]{4}".into(), - confidence: Confidence::Low, - min_entropy: 3.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, // no HTTP validation needed - depends_on_rule: vec![], - }); - - let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap(); - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false).unwrap(); - - // ── run the scan ────────────────────────────────────────────── - m.scan_bytes_raw(&noise, "buf").unwrap(); - - // ── property 1: dedup – each (rule,start,end) is unique ────── - - let mut coords = FxHashSet::default(); - for RawMatch{rule_id, start_idx, end_idx} in &m.user_data.raw_matches_scratch { - assert!( - coords.insert((*rule_id, *start_idx, *end_idx)), - "duplicate raw-match detected for coords ({rule_id},{start_idx},{end_idx})" - ); - - // ── property 2: entropy gate held ──────────────────────── - let slice = &noise[*start_idx as usize .. 
*end_idx as usize]; - let ent = calculate_shannon_entropy(slice); - assert!(ent > 3.0, "entropy {ent} ≤ min_entropy, gate failed"); - } - } - } - - #[test] - pub fn test_simple() -> Result<()> { - let rules = vec![Rule::new(RuleSyntax { - id: "test.1".to_string(), - name: "test".to_string(), - pattern: "test".to_string(), - confidence: crate::rules::rule::Confidence::Medium, - min_entropy: 1.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: Some(Validation::Http(HttpValidation { - request: HttpRequest { - method: "GET".to_string(), - url: "https://example.com".to_string(), - headers: BTreeMap::new(), - body: None, - response_matcher: Some(vec![]), - multipart: None, - response_is_html: false, - }, - multipart: None, - })), - depends_on_rule: vec![ - Some(DependsOnRule { - rule_id: "d8f3c34b-015f-4cd6-b411-b1366493104c".to_string(), - variable: "email".to_string(), - }), - Some(DependsOnRule { - rule_id: "8910f364-7718-4a27-a435-d2da13e6ba9e".to_string(), - variable: "domain".to_string(), - }), - ], - })]; - let rules_db = RulesDatabase::from_rules(rules)?; - let input = "some test data for vectorscan"; - let seen_blobs: BlobIdMap = BlobIdMap::new(); - let enable_rule_profiling = true; - // let mut matcher = Matcher::new(&rules_db, &seen_blobs, None, - // enable_rule_profiling)?; - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new( - &rules_db, - scanner_pool, - &seen_blobs, - None, - enable_rule_profiling, - None, // Pass the shared profiler - false, - )?; - matcher.scan_bytes_raw(input.as_bytes(), "fname")?; - assert_eq!( - matcher.user_data.raw_matches_scratch, - vec![RawMatch { rule_id: 0, start_idx: 0, end_idx: 9 },] - ); - Ok(()) - } - - // --------------------------------------------------------------------- - // additional deterministic unit-tests - // --------------------------------------------------------------------- - - /// 
`get_base64_strings` should recognise a well-formed token, decode it, - /// and report correct byte-offsets. - #[test] - fn test_get_base64_strings_basic() { - let raw = b"foo MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY= bar"; - // decodes to "0123456789abcdef0123456789abcdef" - let hits = get_base64_strings(raw); - assert_eq!(hits.len(), 1); - let item = &hits[0]; - assert_eq!(item.decoded, "0123456789abcdef0123456789abcdef"); - assert_eq!(item.original, "MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY="); - // "foo␠" is 4 bytes, so the start offset is 4 - assert_eq!((item.pos_start, item.pos_end), (4, 4 + item.original.len())); - } - - /// `compute_finding_fingerprint` must be stable (same input ⇒ same output) - /// and sensitive to any input component. - #[test] - fn test_finding_fingerprint_stability_and_uniqueness() { - let a = compute_finding_fingerprint("secret", "fileA", 0, 6); - let b = compute_finding_fingerprint("secret", "fileA", 0, 6); - assert_eq!(a, b, "fingerprint should be deterministic"); - - // changing any parameter should perturb the hash - let c = compute_finding_fingerprint("secret", "fileA", 1, 7); // offsets differ - let d = compute_finding_fingerprint("secret", "fileB", 0, 6); // file id differs - let e = compute_finding_fingerprint("different", "fileA", 0, 6); // content differs - assert_ne!(a, c); - assert_ne!(a, d); - assert_ne!(a, e); - } - - /// The (private) `compute_match_key` helper is the linchpin of the raw-dedup - /// path. It should return identical keys for identical inputs and different - /// keys as soon as *anything* changes. 
- #[test] - fn test_compute_match_key_uniqueness() { - use super::compute_match_key; - - let k1 = compute_match_key(b"abc", b"rule-1", 0, 3); - let k2 = compute_match_key(b"abc", b"rule-1", 0, 3); - assert_eq!(k1, k2); - - // mutate each component in turn - let diff_content = compute_match_key(b"abcd", b"rule-1", 0, 4); - let diff_rule = compute_match_key(b"abc", b"rule-2", 0, 3); - let diff_span = compute_match_key(b"abc", b"rule-1", 1, 4); - assert_ne!(k1, diff_content); - assert_ne!(k1, diff_rule); - assert_ne!(k1, diff_span); - } - - /// Running `scan_bytes_raw` twice over the *same* input should never record - /// duplicate entries in `raw_matches_scratch`. - #[test] - fn test_scan_bytes_raw_no_duplicate_raw_matches() -> Result<()> { - // simple rule: literal "dup" - let rule = Rule::new(RuleSyntax { - id: "dup.check".into(), - name: "dup".into(), - pattern: "dup".into(), - confidence: crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - - let rules_db = RulesDatabase::from_rules(vec![rule])?; - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - - let buf = b"dup dup"; // two literal hits, same rule - - // first scan - m.scan_bytes_raw(buf, "buf1")?; - let first_len = m.user_data.raw_matches_scratch.len(); - - // second scan over the same buffer - m.scan_bytes_raw(buf, "buf1")?; - let second_len = m.user_data.raw_matches_scratch.len(); - - // we should still only have two unique raw matches recorded - assert_eq!(first_len, 2); - assert_eq!(second_len, 2); - Ok(()) - } - - #[test] - fn inline_comment_skips_match() -> Result<()> { - let rule = Rule::new(RuleSyntax { - id: "inline.ignore".into(), - name: "inline".into(), - pattern: "secret_token".into(), - confidence: 
crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - let rules_db = RulesDatabase::from_rules(vec![rule])?; - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - - let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec()); - let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt"))); - - match matcher.scan_blob(&blob, &origin, None, false, false, false)? { - ScanResult::New(matches) => assert!(matches.is_empty()), - _ => panic!("unexpected scan result"), - } - - Ok(()) - } - - #[test] - fn inline_comment_after_multiline_secret_skips_match() -> Result<()> { - let rule = Rule::new(RuleSyntax { - id: "inline.multiline".into(), - name: "inline multiline".into(), - pattern: "line1\\s+line2".into(), - confidence: crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - let rules_db = RulesDatabase::from_rules(vec![rule])?; - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - - let blob = Blob::from_bytes( - br#"let data = """ -line1 -line2 -""" -# kingfisher:ignore -"# - .to_vec(), - ); - let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt"))); - - match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ - ScanResult::New(matches) => assert!(matches.is_empty()), - _ => panic!("unexpected scan result"), - } - - Ok(()) - } - - #[test] - fn compat_flag_controls_external_directives() -> Result<()> { - let rule = Rule::new(RuleSyntax { - id: "inline.compat".into(), - name: "inline compat".into(), - pattern: "supersecret123".into(), - confidence: crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - let rules_db = RulesDatabase::from_rules(vec![rule])?; - - let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec()); - let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt"))); - - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - let matches_without_compat = - match matcher.scan_blob(&blob, &origin, None, false, false, false)? { - ScanResult::New(matches) => matches.len(), - _ => panic!("unexpected scan result"), - }; - assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag"); - - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true)?; - match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ - ScanResult::New(matches) => assert!(matches.is_empty()), - _ => panic!("unexpected scan result"), - } - - Ok(()) - } -} diff --git a/src/matcher.rs.rej b/src/matcher.rs.rej deleted file mode 100644 index dc61305..0000000 --- a/src/matcher.rs.rej +++ /dev/null @@ -1,52 +0,0 @@ -@@ -1,50 +1,51 @@ - use std::{ - hash::{Hash, Hasher}, - str, - sync::{Arc, Mutex}, - }; - - use anyhow::Result; - use base64::{engine::general_purpose, Engine}; - use bstr::BString; - use http::StatusCode; - use regex::bytes::Regex; - use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; - use schemars::{ - gen::SchemaGenerator, - schema::{ArrayValidation, InstanceType, Schema}, - JsonSchema, - }; - use serde::{Deserialize, Serialize}; - use smallvec::SmallVec; - use tracing::debug; - use xxhash_rust::xxh3::xxh3_64; - - use crate::{ - blob::{Blob, BlobId, BlobIdMap}, - entropy::calculate_shannon_entropy, -+ inline_ignore::InlineIgnoreConfig, - location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, - origin::OriginSet, - parser, - parser::{Checker, Language}, - rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, - rules::rule::Rule, - rules_database::RulesDatabase, - safe_list::{is_safe_match, is_user_match}, - scanner_pool::ScannerPool, - snippet::Base64BString, - util::{intern, redact_value}, - }; - - const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment - const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches - const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs - const TREE_SITTER_SCAN_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs ≤64 KiB - - // ------------------------------------------------------------------------------------------------- - // RawMatch - // ------------------------------------------------------------------------------------------------- - /// A raw match, as recorded by a callback to Vectorscan. 
- /// - /// When matching with Vectorscan, we simply collect all matches into a - /// preallocated `Vec`, and then go through them all after scanning is complete.