From caf766b731985d3eb0e86a1f0db847f47ef90be8 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 9 Oct 2025 17:59:10 -0700 Subject: [PATCH 1/8] - Added kingfisher:ignore (or kingfisher:allow) to silence a finding inline within a file - Added: to reuse existing inline directives from other scanners, pass --compat-ignore-comments to also accept NOSONAR, kics-scan ignore, gitleaks:allow and trufflehog:ignore --- CHANGELOG.md | 4 + README.md | 18 +- README.md.orig | 1036 ++++++++++++++++++++++++++ src/cli/commands/scan.rs | 4 + src/inline_ignore.rs | 342 +++++++++ src/lib.rs | 1 + src/main.rs | 1 + src/matcher.rs | 138 +++- src/matcher.rs.orig | 1317 +++++++++++++++++++++++++++++++++ src/matcher.rs.rej | 52 ++ src/reporter.rs | 1 + src/reporter/json_format.rs | 1 + src/scanner/enumerate.rs | 1 + src/scanner/repos.rs | 1 + tests/int_allowlist.rs | 1 + tests/int_bitbucket.rs | 1 + tests/int_dedup.rs | 1 + tests/int_github.rs | 1 + tests/int_gitlab.rs | 2 + tests/int_redact.rs | 1 + tests/int_slack.rs | 2 + tests/int_validation_cache.rs | 1 + tests/int_vulnerable_files.rs | 2 + 23 files changed, 2924 insertions(+), 5 deletions(-) create mode 100644 README.md.orig create mode 100644 src/inline_ignore.rs create mode 100644 src/matcher.rs.orig create mode 100644 src/matcher.rs.rej diff --git a/CHANGELOG.md b/CHANGELOG.md index 1278533..03d4aca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ All notable changes to this project will be documented in this file. 
+## [Unreleased] +- Added kingfisher:ignore (or kingfisher:allow) to silence a finding inline within a file +- Added: to reuse existing inline directives from other scanners, pass --compat-ignore-comments to also accept NOSONAR, kics-scan ignore, gitleaks:allow and trufflehog:ignore + ## [v1.56.0] - Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined diff --git a/README.md b/README.md index 14faccf..2e3bfd3 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [Notable Scan Options](#notable-scan-options) - [Understanding `--confidence`](#understanding---confidence) - [Ignore known false positives](#ignore-known-false-positives) + - [Inline ignore directives](#inline-ignore-directives) - [Finding Fingerprint](#finding-fingerprint) - [Rule Performance Profiling](#rule-performance-profiling) - [CLI Options](#cli-options) @@ -962,6 +963,7 @@ leaves the default unchanged. - `--manage-baseline`: Create or update the baseline file with current findings - `--skip-regex `: Ignore findings whose text matches this regex (repeatable) - `--skip-word `: Ignore findings containing this case-insensitive word (repeatable) +- `--compat-ignore-comments`: Honor inline directives from other scanners (treat `NOSONAR`, `kics-scan ignore`, `gitleaks:allow`, and `trufflehog:ignore` like native suppressions) ## Understanding `--confidence` The `--confidence` flag sets a minimum confidence threshold, not an exact match. @@ -972,7 +974,21 @@ The `--confidence` flag sets a minimum confidence threshold, not an exact match. ### Ignore known false positives -Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. 
Both flags may be provided multiple times and are tested against the secret value **and** the full match context. + +### Inline ignore directives + +Add `kingfisher:ignore` (or `kingfisher:allow`) to a trailing comment on the same line as a finding to silence it. Multi-line strings may also be ignored by placing the directive on the closing delimiter line **or** on the next logical line after the string: + +```python +API_KEY = """ +line 1 +line 2 +""" +# kingfisher:ignore +``` + +Kingfisher recognizes comment markers for the host language, including `#`, `//`, `/* */`, `--`, and `*`-prefixed block comments, so you can use this suppression style across languages. To reuse existing inline directives from other scanners, pass `--compat-ignore-comments` to also accept `NOSONAR`, `kics-scan ignore`, `gitleaks:allow` and `trufflehog:ignore`. With `--skip-regex`, these should be Rust compatible regular expressions, which you can test out at [regex101](https://regex101.com) diff --git a/README.md.orig b/README.md.orig new file mode 100644 index 0000000..f690e3d --- /dev/null +++ b/README.md.orig @@ -0,0 +1,1036 @@ +# Kingfisher + +

+ Kingfisher Logo + +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production. +

+ +Originally forked from Praetorian’s Nosey Parker, Kingfisher **adds** live cloud-API validation; many more targets (GitLab, BitBucket, Gitea, S3, Docker, Jira, Confluence, Slack); compressed-file extraction and scanning; baseline and allowlist controls; language-aware detection (~20 languages); and a native Windows binary. See [Origins and Divergence](#origins-and-divergence) for details. +## Key Features + +### Multiple Scan Targets +
+ +| Files / Dirs | Local Git | GitHub | GitLab | Azure DevOps | Bitbucket | Gitea | +|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:| +| Files / Dirs
Files / Dirs | Local Git
Local Git | GitHub
GitHub | GitLab
GitLab | Azure DevOps
Azure DevOps | Bitbucket
Bitbucket | Gitea
Gitea | + +| Docker | Jira | Confluence | Slack | AWS S3 | +|:------:|:----:|:-----------:|:-----:|:------:| +| Docker
Docker | Jira
Jira | Confluence
Confluence | Slack
Slack | AWS S3
AWS S3 | + +
+ +### Performance, Accuracy, and Hundreds of Rules +- **Performance**: multithreaded, Hyperscan‑powered scanning built for huge codebases +- **Extensible rules**: hundreds of built-in detectors plus YAML-defined custom rules ([docs/RULES.md](/docs/RULES.md)) +- **Broad AI SaaS coverage**: finds and validates tokens for OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and many more +- **Compressed Files**: Supports extracting and scanning compressed files for secrets +- **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) + +**Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) + +# Benchmark Results + +See ([docs/COMPARISON.md](docs/COMPARISON.md)) + +

+ Kingfisher Runtime Comparison +

+ +- [Kingfisher](#kingfisher) + - [Key Features](#key-features) + - [Multiple Scan Targets](#multiple-scan-targets) + - [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules) +- [Benchmark Results](#benchmark-results) +- [Getting Started](#getting-started) + - [Installation](#installation) + - [Run Kingfisher in Docker](#run-kingfisher-in-docker) +- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance) + - [Write Custom Rules!](#write-custom-rules) +- [Usage](#usage) + - [Basic Examples](#basic-examples) + - [Scan with secret validation](#scan-with-secret-validation) + - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories) + - [Scan a Git repository without validation](#scan-a-git-repository-without-validation) + - [Display only secrets confirmed active by third‑party APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis) + - [Output JSON and capture to a file](#output-json-and-capture-to-a-file) + - [Output SARIF directly to disk](#output-sarif-directly-to-disk) + - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--) + - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size) + - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag) + - [Display rule performance statistics](#display-rule-performance-statistics) + - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files) + - [Exclude specific paths](#exclude-specific-paths) + - [Scan changes in CI pipelines](#scan-changes-in-ci-pipelines) + - [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket) + - [ Scanning Docker Images](#-scanning-docker-images) + - [ Scanning GitHub](#-scanning-github) + - [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token) + - [Skip specific GitHub repositories 
during enumeration](#skip-specific-github-repositories-during-enumeration) + - [Scan remote GitHub repository](#scan-remote-github-repository) + - [ Scanning GitLab](#-scanning-gitlab) + - [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token) + - [Scan GitLab user](#scan-gitlab-user) + - [Skip specific GitLab projects during enumeration](#skip-specific-gitlab-projects-during-enumeration) + - [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url) + - [List GitLab repositories](#list-gitlab-repositories) + - [ Scanning Azure Repos](#-scanning-azure-repos) + - [Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-devops-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat) + - [Scan specific Azure DevOps projects](#scan-specific-azure-devops-projects) + - [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration) + - [List Azure repositories](#list-azure-repositories) + - [ Scanning Gitea](#-scanning-gitea) + - [Scan Gitea organization (requires `KF_GITEA_TOKEN`)](#scan-gitea-organization-requires-kf_gitea_token) + - [Scan Gitea user](#scan-gitea-user) + - [Skip specific Gitea repositories during enumeration](#skip-specific-gitea-repositories-during-enumeration) + - [Scan remote Gitea repository by URL](#scan-remote-gitea-repository-by-url) + - [List Gitea repositories](#list-gitea-repositories) + - [ Scanning Bitbucket](#-scanning-bitbucket) + - [Scan Bitbucket workspace](#scan-bitbucket-workspace) + - [Scan Bitbucket user](#scan-bitbucket-user) + - [Skip specific Bitbucket repositories during enumeration](#skip-specific-bitbucket-repositories-during-enumeration) + - [Scan remote Bitbucket repository by URL](#scan-remote-bitbucket-repository-by-url) + - [List Bitbucket repositories](#list-bitbucket-repositories) + - [Authenticate to Bitbucket](#authenticate-to-bitbucket) + - [Self-hosted 
Bitbucket Server](#self-hosted-bitbucket-server) + - [ Scanning Jira](#-scanning-jira) + - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query) + - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues) + - [ Scanning Confluence](#-scanning-confluence) + - [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query) + - [ Scanning Slack](#-scanning-slack) + - [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query) + - [Environment Variables for Tokens](#environment-variables-for-tokens) + - [Exit Codes](#exit-codes) + - [Update Checks](#update-checks) +- [Advanced Options](#advanced-options) + - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets) + - [List Builtin Rules](#list-builtin-rules) + - [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run) + - [To add your rules alongside the built‑ins:](#to-add-your-rules-alongside-the-builtins) + - [Other Examples](#other-examples) + - [Customize the HTTP User-Agent](#customize-the-http-user-agent) + - [Notable Scan Options](#notable-scan-options) + - [Understanding `--confidence`](#understanding---confidence) + - [Ignore known false positives](#ignore-known-false-positives) + - [Inline ignore directives](#inline-ignore-directives) + - [Finding Fingerprint](#finding-fingerprint) + - [Rule Performance Profiling](#rule-performance-profiling) + - [CLI Options](#cli-options) + - [Origins and Divergence](#origins-and-divergence) +- [Roadmap](#roadmap) +- [License](#license) + +# Getting Started +## Installation + +On macOS, you can simply + +```bash +brew install kingfisher +``` + +Pre-built binaries are also available on the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page. 
+ +You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform: + +```bash +# Linux, macOS +curl --silent --location \ + https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \ + sh && \ + ubi --project mongodb/kingfisher --in "$HOME/bin" +``` + +```powershell +# Windows +powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in . +``` + +This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like +systems (or the current directory on Windows). + +Or you may compile for your platform via `make`: + +```bash +# NOTE: Requires Docker +make linux + +# macOS --- must build from a macOS host +make darwin + +# Windows x64 --- requires building from a Windows host with Visual Studio installed +./buildwin.bat -force +``` + +```bash +# Build all targets +make linux-all # builds both x64 and arm64 +make darwin-all # builds both x64 and arm64 +make all # builds for every OS and architecture supported +``` + +### Run Kingfisher in Docker + +Run the dockerized Kingfisher container: +```bash +# GitHub Container Registry +docker run --rm ghcr.io/mongodb/kingfisher:latest --version + +# Scan the current working directory +# (mounts your code at /src and scans it) +docker run --rm \ + -v "$PWD":/src \ + ghcr.io/mongodb/kingfisher:latest scan /src + + +# Scan while providing a GitHub token +# Mounts your working dir at /proj and passes in the token: +docker run --rm \ + -e KF_GITHUB_TOKEN=ghp_… \ + -v "$PWD":/proj \ + ghcr.io/mongodb/kingfisher:latest \ + scan --git-url https://github.com/org/private_repo.git + +# Scan an S3 bucket +# Credentials can come from KF_AWS_KEY/KF_AWS_SECRET, --role-arn, or --aws-local-profile +docker run --rm \ + -e KF_AWS_KEY=AKIA... \ + -e KF_AWS_SECRET=g5nYW... 
\ + ghcr.io/mongodb/kingfisher:latest \ + scan --s3-bucket bucket-name + + +# Scan and write a JSON report locally +# Here we: +# 1. Mount $PWD → /proj +# 2. Tell Kingfisher to write findings.json inside /proj/reports +# 3. Ensure ./reports exists on your host so Docker can mount it +mkdir -p reports + +# run and output into host’s ./reports directory +docker run --rm \ + -v "$PWD":/proj \ + ghcr.io/mongodb/kingfisher:latest \ + scan /proj \ + --format json \ + --output /proj/reports/findings.json + + +# Tip: you can combine multiple mounts if you prefer separating source vs. output: +# Here /src is read‑only, and /out holds your generated reports +docker run --rm \ + -v "$PWD":/src:ro \ + -v "$PWD/reports":/out \ + ghcr.io/mongodb/kingfisher:latest \ + scan /src \ + --format json \ + --output /out/findings.json + +``` + +# 🔐 Detection Rules at a Glance + +Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview: + +| Category | What we catch | +|----------|---------------| +| **AI SaaS APIs** | OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, together.ai, Zhipu, and more | +| **Cloud Providers** | AWS, Azure, GCP, Alibaba Cloud, DigitalOcean, IBM Cloud, Cloudflare, and more | +| **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm, PyPI, and more | +| **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun, SendGrid, Mailchimp, and more | +| **Databases & Data Ops** | MongoDB Atlas, PlanetScale, Postgres DSNs, Grafana Cloud, Datadog, Dynatrace, and more | +| **Payments & Billing** | Stripe, PayPal, Square, GoCardless, and more | +| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more | +| **Misc. 
SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more | + +## Write Custom Rules! + +Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential. + +However, you may want to add your own custom rules, or modify a detection to better suit your needs / environment. + +First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules. + +Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! + +# Usage + +## Basic Examples + +> **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory, no extra flags required. + +### Scan with secret validation + +```bash +kingfisher scan /path/to/code +## NOTE: This path can refer to: +# 1. a local git repo +# 2. a directory with many git repos +# 3. or just a folder with files and subdirectories + +## To explicitly prevent scanning git commit history add: +# `--git-history=none` +``` + +### Scan a directory containing multiple Git repositories + +```bash +kingfisher scan /projects/mono‑repo‑dir +``` + +### Scan a Git repository without validation + +```bash +kingfisher scan ~/src/myrepo --no-validate +``` + +### Display only secrets confirmed active by third‑party APIs + +```bash +kingfisher scan /path/to/repo --only-valid +``` + +### Output JSON and capture to a file + +```bash +kingfisher scan . --format json | tee kingfisher.json +``` + +### Output SARIF directly to disk + +```bash +kingfisher scan /path/to/repo --format sarif --output findings.sarif +``` + +### Pipe any text directly into Kingfisher by passing `-` + +```bash +cat /path/to/file.py | kingfisher scan - + +``` + +### Limit maximum file size scanned (`--max-file-size`) + +By default, Kingfisher skips files larger than **256 MB**. 
You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**. + +```bash +# Scan files up to 500 MB in size +kingfisher scan /some/file --max-file-size 500 +``` + +### Scan using a rule _family_ with one flag + +_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_ + +```bash +# Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2) +kingfisher scan /path/to/repo --rule kingfisher.aws +``` + +### Display rule performance statistics + +```bash +kingfisher scan /path/to/repo --rule-stats +``` + +### Scan while ignoring likely test files + +`--exclude` skips any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) + +```bash +# Scan source but skip likely unit / integration tests +kingfisher scan ./my-project \ + --exclude='[Tt]est' \ + --exclude='spec' \ + --exclude='[Ff]ixture' \ + --exclude='example' \ + --exclude='sample' +``` + +### Exclude specific paths +```bash +# Skip all Python files and any directory named tests +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude '[Tt]ests' +``` + +### Scan changes in CI pipelines + +Limit scanning to the delta between your default branch and a pull request branch by combining `--since-commit` with `--branch` (defaults to `HEAD`). This only scans files that differ between the two references, which keeps CI runs fast while still blocking new secrets. + +```bash +kingfisher scan . \ + --since-commit origin/main \ + --branch "$CI_BRANCH" +``` + +When the branch under test is already checked out, `--branch HEAD` or omitting `--branch` entirely is sufficient. Kingfisher exits with `200` when any findings are discovered and `205` when validated secrets are present, allowing CI jobs to fail automatically if new credentials slip in. + +The same diff-focused workflow works when cloning repositories on the fly with `--git-url`. 
Kingfisher automatically tries remote-tracking names like `origin/main` and `origin/feature-1`, so you can target the branches involved in a pull request without performing a local checkout first. + +```bash +kingfisher scan \ + --git-url https://github.com/org/repo.git \ + --since-commit main \ + --branch development +``` + +In CI systems that expose the base and head commits explicitly, you can pass those SHAs directly while still using `--git-url`: + +```bash +kingfisher scan \ + --git-url git@github.com:org/repo.git \ + --since-commit "$BASE_COMMIT" \ + --branch "$PR_HEAD_COMMIT" +``` + +If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude): + +```bash +# Skip all Python files and any directory named tests, and report to stderr any skipped files +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude tests \ + -v +``` + +## GitHub Scanning an AWS S3 Bucket +You can scan S3 objects directly: + +```bash +kingfisher scan --s3-bucket bucket-name [--s3-prefix path/] +``` + +Credential resolution happens in this order: + +1. `KF_AWS_KEY` and `KF_AWS_SECRET` environment variables +2. `--aws-local-profile` pointing to a profile in `~/.aws/config` (works with AWS SSO) +3. anonymous access for public buckets + +If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role. + +Examples: + +```bash +# using explicit keys +export KF_AWS_KEY=AKIA... +export KF_AWS_SECRET=g5nYW... +kingfisher scan --s3-bucket some-example-bucket + +# Above can also be run as: +KF_AWS_KEY=AKIA... KF_AWS_SECRET=g5nYW... 
kingfisher scan --s3-bucket some-example-bucket + +# using a local profile (e.g., SSO) that exists in your AWS profile (~/.aws/config) +kingfisher scan --s3-bucket some-example-bucket --aws-local-profile default + +# anonymous scan of a bucket, while providing an object prefix to only scan subset of the s3 bucket +kingfisher scan \ + --s3-bucket awsglue-datasets \ + --s3-prefix examples/us-legislators/all + +# assuming a role when scanning +kingfisher scan --s3-bucket some-example-bucket \ + --role-arn arn:aws:iam::123456789012:role/MyRole + +# anonymous scan of a public bucket +kingfisher scan --s3-bucket some-example-bucket +``` + +Docker example: + +```bash +docker run --rm \ + -e KF_AWS_KEY=AKIA... \ + -e KF_AWS_SECRET=g5nYW... \ + ghcr.io/mongodb/kingfisher:latest \ + scan --s3-bucket bucket-name +``` + +## Docker Scanning Docker Images + +Kingfisher will first try to use any locally available image, then fall back to pulling via OCI. + +Authentication happens *in this order*: + +1. **`KF_DOCKER_TOKEN`** env var + - If it contains `user:pass`, it’s used as Basic auth + - Otherwise it’s sent as a Bearer token +2. **Docker CLI credentials** + - Checks `credHelpers` (per-registry) and `credsStore` in `~/.docker/config.json`. + - Falls back to the legacy `auths` → `auth` (base64) entries. +3. **Anonymous** (no credentials) + + +```bash +# 1) Scan public or already-pulled image +kingfisher scan --docker-image ghcr.io/owasp/wrongsecrets/wrongsecrets-master:latest-master + +# 2) For private registries, explicitly set KF_DOCKER_TOKEN: +# - Basic auth: "user:pass" +# - Bearer only: "TOKEN" +export KF_DOCKER_TOKEN="AWS:$(aws ecr get-login-password --region us-east-1)" +kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws.com/base/amazonlinux2023:latest + +# 3) Or rely on your Docker CLI login/keychain: +# (e.g. 
aws ecr get-login-password … | docker login …) +kingfisher scan --docker-image private.registry.example.com/my-image:tag +``` + +## GitHub Scanning GitHub + +### Scan GitHub organization (requires `KF_GITHUB_TOKEN`) + +```bash +kingfisher scan --github-organization my-org +``` + +### Skip specific GitHub repositories during enumeration + +Repeat `--github-exclude` for every repository you want to ignore when scanning +users or organizations. You can provide exact repositories like +`OWNER/REPO` or gitignore-style glob patterns such as `owner/*-archive` +(matching is case-insensitive). + +```bash +kingfisher scan --github-organization my-org \ + --github-exclude my-org/huge-repo \ + --github-exclude my-org/*-archive +``` + +### Scan remote GitHub repository + +`--git-url` clones the repository and scans its files and history. To also inspect +related server-side data, supply `--repo-artifacts`. This flag pulls down the +repository's issues (including pull requests), wiki, and any public gists owned by +the repository owner and scans them for secrets. Fetching these extras counts +against API rate limits and private artifacts require a `KF_GITHUB_TOKEN`. 
+ +```bash +# Scan the repository only +kingfisher scan --git-url https://github.com/org/repo.git + +# Include issues, wiki, and owner gists +kingfisher scan --git-url https://github.com/org/repo.git --repo-artifacts + +# Private repositories or artifacts +KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/private_repo.git --repo-artifacts +``` + +--- + +## GitLab Scanning GitLab + +### Scan GitLab group (requires `KF_GITLAB_TOKEN`) + +```bash +kingfisher scan --gitlab-group my-group +# include repositories from all nested subgroups +kingfisher scan --gitlab-group my-group --gitlab-include-subgroups +``` + +### Scan GitLab user + +```bash +kingfisher scan --gitlab-user johndoe +``` + +### Skip specific GitLab projects during enumeration + +Repeat `--gitlab-exclude` for every project path you want to ignore when scanning +users or groups. Specify project paths as `group/project` (case-insensitive) or +use gitignore-style glob patterns like `group/**/archive-*` to drop families of +projects across nested subgroups. + +```bash +kingfisher scan --gitlab-group my-group \ + --gitlab-exclude my-group/huge-project \ + --gitlab-exclude my-group/**/archive-* +``` + +### Scan remote GitLab repository by URL + +`--git-url` by itself clones the project repository. To include server-side +artifacts owned by the project, add `--repo-artifacts`. Kingfisher will retrieve +the project's issues, wiki, and snippets and scan them for secrets. These extra +requests may take longer and require a `KF_GITLAB_TOKEN` for private projects. 
+ +```bash +# Scan the repository only +kingfisher scan --git-url https://gitlab.com/group/project.git + +# Include issues, wiki, and snippets +kingfisher scan --git-url https://gitlab.com/group/project.git --repo-artifacts + +# Private projects or artifacts +KF_GITLAB_TOKEN="glpat-…" kingfisher scan --git-url https://gitlab.com/group/private_project.git --repo-artifacts +``` + +### List GitLab repositories + +```bash +kingfisher gitlab repos list --group my-group +# include repositories from all nested subgroups +kingfisher gitlab repos list --group my-group --include-subgroups +# skip specific projects when listing or scanning (supports glob patterns) +kingfisher gitlab repos list --group my-group --gitlab-exclude my-group/**/legacy-* +``` +## Azure Repos Scanning Azure Repos + +### Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`) + +```bash +kingfisher scan --azure-organization my-org + +# Azure DevOps Server example +KF_AZURE_PAT="pat" kingfisher scan --azure-organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/ +``` + +### Scan specific Azure DevOps projects + +Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects. + +```bash +kingfisher scan --azure-project my-org/payments --azure-project my-org/core-platform +``` + +### Skip specific Azure repositories during enumeration + +Repeat `--azure-exclude` to ignore repositories when scanning organizations or projects. +Use identifiers like `ORGANIZATION/PROJECT/REPOSITORY`. Repositories that share the same +name as their project can be excluded with `ORGANIZATION/PROJECT`, and gitignore-style +patterns such as `my-org/*/archive-*` are also supported. 
+ +```bash +kingfisher scan --azure-organization my-org \ + --azure-exclude my-org/payments/legacy-service \ + --azure-exclude my-org/**/archive-* +``` + +### List Azure repositories + +```bash +kingfisher azure repos list --organization my-org +# list repositories for specific projects +kingfisher azure repos list --project my-org/app --project my-org/api +# skip specific repositories while listing (supports glob patterns) +kingfisher azure repos list --organization my-org --azure-exclude my-org/**/experimental-* +``` +## Gitea Scanning Gitea + +### Scan Gitea organization (requires `KF_GITEA_TOKEN`) + +```bash +kingfisher scan --gitea-organization my-org +# self-hosted example +KF_GITEA_TOKEN="gtoken" kingfisher scan --gitea-organization platform --gitea-api-url https://gitea.internal.example/api/v1/ +``` + +### Scan Gitea user + +```bash +kingfisher scan --gitea-user johndoe +``` + +### Skip specific Gitea repositories during enumeration + +Repeat `--gitea-exclude` for each repository you want to ignore when scanning users +or organizations. Accepts `owner/repo` identifiers or gitignore-style glob patterns +like `team/**/archive-*`. + +```bash +kingfisher scan --gitea-organization my-org \ + --gitea-exclude my-org/legacy-repo \ + --gitea-exclude my-org/**/archive-* +``` + +### Scan remote Gitea repository by URL + +`--git-url` clones the repository and scans its history. Adding `--repo-artifacts` +also clones the repository wiki if one exists. Private repositories and wikis +require `KF_GITEA_TOKEN` (and `KF_GITEA_USERNAME` when cloning via HTTPS). 
+ +```bash +# Scan the repository only +kingfisher scan --git-url https://gitea.com/org/repo.git + +# Include the repository wiki (if present) +KF_GITEA_TOKEN="gtoken" KF_GITEA_USERNAME="org" \ + kingfisher scan --git-url https://gitea.com/org/repo.git --repo-artifacts +``` + +### List Gitea repositories + +```bash +kingfisher gitea repos list --gitea-organization my-org +# enumerate every organization visible to the authenticated user +KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --all-gitea-organizations +# self-hosted example +KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/ +``` +## Bitbucket Scanning Bitbucket +### Scan Bitbucket workspace + +```bash +kingfisher scan --bitbucket-workspace my-team +# include Bitbucket Cloud repositories from every accessible workspace +kingfisher scan --all-bitbucket-workspaces --bitbucket-token "$APP_PASSWORD" --bitbucket-username "$USER" +``` + +### Scan Bitbucket user + +```bash +kingfisher scan --bitbucket-user johndoe +``` + +### Skip specific Bitbucket repositories during enumeration + +Use `--bitbucket-exclude` to ignore repositories while scanning users, workspaces, +or projects. Patterns accept either `owner/repo` (case-insensitive) or +gitignore-style globs such as `workspace/**/archive-*`. + +```bash +kingfisher scan --bitbucket-workspace my-team \ + --bitbucket-exclude my-team/legacy-repo \ + --bitbucket-exclude my-team/**/archive-* +``` + +### Scan remote Bitbucket repository by URL + +`--git-url` clones the repository and scans its files and history. To inspect +Bitbucket artifacts such as issues, add `--repo-artifacts`. Private artifacts +require credentials (see [Authenticate to Bitbucket](#authenticate-to-bitbucket)). 
+ +```bash +# Scan the repository only +kingfisher scan --git-url https://bitbucket.org/hashashash/secretstest.git + +# Include repository issues +KF_BITBUCKET_USERNAME="user" \ +KF_BITBUCKET_APP_PASSWORD="app-password" \ + kingfisher scan --git-url https://bitbucket.org/workspace/project.git --repo-artifacts +``` + +### List Bitbucket repositories + +```bash +kingfisher bitbucket repos list --bitbucket-workspace my-team +# enumerate all accessible workspaces or projects +kingfisher bitbucket repos list --all-bitbucket-workspaces --bitbucket-token "$APP_PASSWORD" --bitbucket-username "$USER" +# filter out repositories using glob patterns +kingfisher bitbucket repos list --bitbucket-workspace my-team --bitbucket-exclude my-team/**/experimental-* +``` + +### Authenticate to Bitbucket + +Kingfisher supports Bitbucket Cloud and Bitbucket Server credentials: + +- **App password or server token** – set `KF_BITBUCKET_USERNAME` and either + `KF_BITBUCKET_APP_PASSWORD` or `KF_BITBUCKET_TOKEN`, or pass + `--bitbucket-username`/`--bitbucket-token` on the CLI. +- **OAuth/PAT token** – set `KF_BITBUCKET_OAUTH_TOKEN` or supply + `--bitbucket-oauth-token`. + +These credentials match the options described in the [ghorg setup +guide](https://github.com/gabrie30/ghorg/blob/master/README.md#bitbucket-setup). + +### Self-hosted Bitbucket Server + +Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, for example +`https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with +`--bitbucket-username` and `--bitbucket-token`, and pass `--ignore-certs` when +connecting to HTTP or otherwise insecure instances. 
+## Jira Scanning Jira + +### Scan Jira issues matching a JQL query + +```bash +KF_JIRA_TOKEN="token" kingfisher scan \ + --jira-url https://jira.company.com \ + --jql "project = TEST AND status = Open" \ + --max-results 500 +``` + +### Scan the last 1,000 Jira issues: +```bash +KF_JIRA_TOKEN="token" kingfisher scan \ + --jira-url https://jira.mongodb.org \ + --jql 'ORDER BY created DESC' \ + --max-results 1000 +``` + +## Confluence Scanning Confluence +### Scan Confluence pages matching a CQL query + +```bash +# Bearer token +KF_CONFLUENCE_TOKEN="token" kingfisher scan \ + --confluence-url https://confluence.company.com \ + --cql "label = secret" \ + --max-results 500 + +# Basic auth with username and token +KF_CONFLUENCE_USER="user@example.com" KF_CONFLUENCE_TOKEN="token" kingfisher scan \ + --confluence-url https://confluence.company.com \ + --cql "text ~ 'password'" \ + --max-results 500 +``` + +Use the base URL of your Confluence site for `--confluence-url`. Kingfisher +automatically adds `/rest/api` to the end, so `https://example.com/wiki` and +`https://example.com` both work depending on your server configuration. + +Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environment variable. By default, Kingfisher sends the token as a bearer token in the `Authorization` header. + +To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions. + +## Slack Scanning Slack +### Scan Slack messages matching a search query + +```bash +KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \ + --slack-query "from:username has:link" \ + --max-results 1000 + +KF_SLACK_TOKEN="xoxp-1234..." 
kingfisher scan \ + --slack-query "akia" \ + --max-results 1000 +``` +*The Slack token must be a user token with the `search:read` scope. Bot tokens (those beginning with `xoxb-`) cannot call the Slack search API.* + +## Environment Variables for Tokens + +| Variable | Purpose | +| ----------------- | ---------------------------- | +| `KF_GITHUB_TOKEN` | GitHub Personal Access Token | +| `KF_GITLAB_TOKEN` | GitLab Personal Access Token | +| `KF_GITEA_TOKEN` | Gitea Personal Access Token | +| `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) | +| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure DevOps Personal Access Token | +| `KF_AZURE_USERNAME` | Username to use with Azure DevOps PATs (defaults to `pat` when unset) | +| `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication | +| `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token | +| `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token | +| `KF_JIRA_TOKEN` | Jira API token | +| `KF_CONFLUENCE_TOKEN` | Confluence API token | +| `KF_SLACK_TOKEN` | Slack API token | +| `KF_DOCKER_TOKEN` | Docker registry token (`user:pass` or bearer token). 
If unset, credentials from the Docker keychain are used | +| `KF_AWS_KEY` and `KF_AWS_SECRET` | AWS Credentials to use with S3 bucket scanning | + +Set them temporarily per command: + +```bash +KF_GITLAB_TOKEN="glpat-…" kingfisher scan --gitlab-group my-group +``` + +Or export for the session: + +```bash +export KF_GITLAB_TOKEN="glpat-…" +``` + +To authenticate Jira requests: +```bash +export KF_JIRA_TOKEN="token" +``` + +To authenticate Confluence requests: +```bash +export KF_CONFLUENCE_TOKEN="token" +``` + +_If no token is provided Kingfisher still works for public repositories._ + +--- + +## Exit Codes + +| Code | Meaning | +| ---- | ----------------------------- | +| 0 | No findings | +| 200 | Findings discovered | +| 205 | Validated findings discovered | + +## Update Checks + +Kingfisher automatically queries GitHub for a newer release when it starts and tells you whether an update is available. + +- **Hands-free updates** – Add `--self-update` to any Kingfisher command + + * If a newer version exists, Kingfisher will download it, replace the running binary, and re-launch itself with the **exact same arguments**. + * If the update fails or no newer release is found, the current run proceeds as normal + +- **Manual update** – Run `kingfisher self-update` to update the binary without scanning + +- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely + +# Advanced Options + +## Build a Baseline / Detect New Secrets + +There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so future scans only report anything that is not already in that list. 
+ +The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches): + +```bash +kingfisher scan /path/to/code \ + --confidence low \ + --manage-baseline \ + --baseline-file ./baseline-file.yml +``` + +Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: + +```bash +kingfisher scan /path/to/code \ + --baseline-file /path/to/baseline-file.yaml +``` + +Running the scan again with `--manage-baseline` refreshes the baseline by adding new findings and pruning entries for secrets that no longer appear. See [docs/BASELINE.md](docs/BASELINE.md) for full detail. + +## List Builtin Rules + +```bash +kingfisher rules list +``` + +## To scan using **only** your own `my_rules.yaml` you could run: + +```bash +kingfisher scan \ + --load-builtins=false \ + --rules-path path/to/my_rules.yaml \ + ./src/ +``` + +## To add your rules alongside the built‑ins: + +```bash +kingfisher scan \ + --rules-path ./custom-rules/ \ + --rules-path my_rules.yml \ + ~/path/to/project-dir/ +``` + +## Other Examples + +```bash +# Check custom rules - this ensures all regular expressions compile, and can match the rule's `examples` in the YML file +kingfisher rules check --rules-path ./my_rules.yml + +# List GitHub repos +kingfisher github repos list --user my-user +kingfisher github repos list --organization my-org +# Skip specific repositories when listing or scanning (supports glob patterns) +kingfisher github repos list --organization my-org --github-exclude my-org/*-archive + +``` + +## Customize the HTTP User-Agent + +Kingfisher identifies its HTTP requests with a user-agent that includes the binary name and version followed by a browser-style +string. Some environments require extra context, such as a contact address, a change-ticket number, or a temporary test label. 
+Use the global `--user-agent-suffix` flag to append this information between the Kingfisher identifier and the browser portion: + +```bash +# Attach a contact email to all outbound validation requests +kingfisher --user-agent-suffix "contact=security@example.com" scan path/ + +# Label a one-off experiment +kingfisher --user-agent-suffix "Sept 2025 testing" github repos list --user my-user +``` + +When omitted, Kingfisher defaults to `kingfisher/<version> Mozilla/5.0 ...`. The suffix is trimmed; passing an empty string +leaves the default unchanged. + +## Notable Scan Options + +- `--no-dedup`: Report every occurrence of a finding (disable the default de-duplicate behavior) +- `--no-base64`: By default, Kingfisher finds and decodes base64 blobs and scans them for secrets. This adds a slight performance overhead; use this flag to disable it +- `--confidence <LEVEL>`: (low|medium|high) +- `--min-entropy <VAL>`: Override default threshold +- `--no-binary`: Skip binary files +- `--no-extract-archives`: Do not scan inside archives +- `--extraction-depth <N>`: Specifies how deep nested archives should be extracted and scanned (default: 2) +- `--redact`: Replaces discovered secrets with a one-way hash for secure output +- `--exclude <PATTERN>`: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) +- `--baseline-file <FILE>`: Ignore matches listed in a baseline YAML file +- `--manage-baseline`: Create or update the baseline file with current findings +- `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable) +- `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable) +## Understanding `--confidence` + +The `--confidence` flag sets a minimum confidence threshold, not an exact match. + +- If you pass `--confidence medium`, findings with **medium and higher** confidence (medium + high) will be included. +- If you pass `--confidence low`, you’ll see **all levels** (low, medium, high).
+ + +### Ignore known false positives + +Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. + +Patterns passed to `--skip-regex` must be Rust-compatible regular expressions, which you can test at [regex101](https://regex101.com). + +```bash +# Skip any finding that mentions TEST_KEY +kingfisher scan --skip-regex '(?i)TEST_KEY' path/ + +# Skip findings that contain the word "dummy" anywhere in the match +kingfisher scan --skip-word dummy path/ + +# Combine multiple patterns +kingfisher scan \ + --skip-regex 'AKIA[0-9A-Z]{16}' \ + --skip-word placeholder \ + --skip-word dummy \ + path/ +``` + +If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early. + +## Finding Fingerprint + +The document below details the four-field formula (rule SHA-1, origin label, start & end offsets) hashed with XXH3-64 to create Kingfisher’s 64-bit finding fingerprint, explains how this ID powers safe deduplication, and shows how `--no-dedup` can be used to report every raw match. +See ([docs/FINGERPRINT.md](docs/FINGERPRINT.md)) + +## Rule Performance Profiling + +Use `--rule-stats` to collect timing information for every rule. After scanning, the summary prints a **Rule Performance Stats** section showing how many matches each rule produced along with its slowest and average match times. This is useful when creating or debugging rules. + +## CLI Options + +```bash +kingfisher scan --help +``` + + +## Origins and Divergence + +Kingfisher began as a fork of Praetorian’s Nosey Parker, as our experiment with adding live validation support and embedding that validation directly inside each rule.
+ +Since that initial fork, it has diverged heavily from Nosey Parker: +- Added support for live validation of discovered secrets +- Added hundreds of new rules +- Added support for analyzing compressed files +- Added support for building "baselines" to allow for only reporting on newly discovered secrets +- Added Tree-Sitter based source code parsing on top of Hyperscan for deeper language-aware detection +- Expanded support for new targets (GitLab, BitBucket, Gitea, Jira, Confluence, Slack, S3, Docker, etc.) +- Replaced the SQLite datastore with an in-memory store + Bloom filter +- Collapsed the workflow into a single scan-and-report phase with direct JSON/BSON/SARIF outputs +- Delivered cross-platform builds, including native Windows + +# Roadmap + +- More rules +- More targets +- Please file a [feature request](https://github.com/mongodb/kingfisher/issues), or open a PR, if you have features you'd like added + +# License + +[Apache2 License](LICENSE) diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 255417f..595e1ff 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -118,6 +118,10 @@ pub struct ScanArgs { /// Skipwords to allow-list secret matches (case-insensitive, repeatable) #[arg(long = "skip-word", value_name = "WORD")] pub skip_word: Vec, + + /// Also recognise `gitleaks:allow` and `trufflehog:ignore` inline directives + #[arg(long = "compat-ignore-comments", default_value_t = false)] + pub compat_ignore_comments: bool, } /// Confidence levels for findings diff --git a/src/inline_ignore.rs b/src/inline_ignore.rs new file mode 100644 index 0000000..8cd47c8 --- /dev/null +++ b/src/inline_ignore.rs @@ -0,0 +1,342 @@ +use crate::location::OffsetSpan; + +/// Configuration for inline ignore directives. +#[derive(Clone, Debug, Default)] +pub struct InlineIgnoreConfig { + tokens: Vec<&'static str>, +} + +impl InlineIgnoreConfig { + /// Create a new configuration. 
+ /// + /// * `include_external_syntax` - when true, also recognise the comment + /// directives used by other scanners such as Gitleaks and Trufflehog. + pub fn new(include_external_syntax: bool) -> Self { + let mut tokens = vec!["kingfisher:ignore", "kingfisher:allow"]; + if include_external_syntax { + tokens.extend(["gitleaks:allow", "trufflehog:ignore"]); + } + Self { tokens } + } + + #[inline] + fn has_tokens(&self) -> bool { + !self.tokens.is_empty() + } + + /// Returns `true` when the provided blob slice contains an inline ignore + /// directive that should suppress a finding for the given span. + pub fn should_ignore(&self, blob_bytes: &[u8], span: &OffsetSpan) -> bool { + if !self.has_tokens() { + return false; + } + + let (start_line_start, start_line_end) = line_bounds(blob_bytes, span.start); + if start_line_end > start_line_start { + let start_line = &blob_bytes[start_line_start..start_line_end]; + if line_has_directive(start_line, &self.tokens) { + return true; + } + } + + // Scan backwards to allow directives that appear before the start of a + // multi-line string or value. This mirrors tools like Gitleaks where + // the ignore directive is often placed immediately above the secret. 
+ let mut cursor = start_line_start; + while cursor > 0 { + let previous_index = cursor.saturating_sub(1); + let (prev_start, prev_end) = line_bounds(blob_bytes, previous_index); + if prev_end <= prev_start { + break; + } + + let prev_line = &blob_bytes[prev_start..prev_end]; + if line_has_directive(prev_line, &self.tokens) { + return true; + } + + if !should_skip_for_directive_search(prev_line) { + break; + } + + if prev_start == 0 { + break; + } + + cursor = prev_start; + } + + let end_index = if span.end == 0 { 0 } else { span.end - 1 }; + let (closing_line_start, closing_line_end) = + line_bounds(blob_bytes, end_index.min(blob_bytes.len())); + if closing_line_end > closing_line_start + && (closing_line_start != start_line_start || closing_line_end != start_line_end) + { + let closing_line = &blob_bytes[closing_line_start..closing_line_end]; + if line_has_directive(closing_line, &self.tokens) { + return true; + } + } + + // Also consider lines after the match so that multi-line strings can be + // ignored when the directive appears after the closing delimiter (a + // common pattern in languages like Python). 
+ let mut cursor = closing_line_end; + while cursor < blob_bytes.len() { + if blob_bytes[cursor] == b'\n' { + cursor += 1; + continue; + } + + let (_, next_end) = line_bounds(blob_bytes, cursor); + if next_end <= cursor { + break; + } + + let next_line = &blob_bytes[cursor..next_end]; + if line_has_directive(next_line, &self.tokens) { + return true; + } + + if !should_skip_for_directive_search(next_line) { + break; + } + + cursor = next_end; + } + + false + } +} + +fn should_skip_for_directive_search(line: &[u8]) -> bool { + let trimmed = trim_ascii_whitespace(line); + if trimmed.is_empty() { + return true; + } + + if trimmed.iter().all(|&b| b == trimmed[0]) && matches!(trimmed[0], b'"' | b'\'' | b'`') { + return true; + } + + if ends_with_multiline_delimiter(trimmed) { + return true; + } + + false +} + +fn ends_with_multiline_delimiter(trimmed: &[u8]) -> bool { + if trimmed.len() < 3 { + return false; + } + + let last = *trimmed.last().unwrap(); + if !matches!(last, b'"' | b'\'' | b'`') { + return false; + } + + let count = trimmed.iter().rev().take_while(|&&ch| ch == last).count(); + + count >= 3 +} + +fn trim_ascii_whitespace(line: &[u8]) -> &[u8] { + let mut start = 0; + while start < line.len() && line[start].is_ascii_whitespace() { + start += 1; + } + + let mut end = line.len(); + while end > start && line[end - 1].is_ascii_whitespace() { + end -= 1; + } + + &line[start..end] +} + +fn line_bounds(bytes: &[u8], index: usize) -> (usize, usize) { + if bytes.is_empty() { + return (0, 0); + } + let mut start = index.min(bytes.len()); + while start > 0 && bytes[start - 1] != b'\n' { + start -= 1; + } + let mut end = index.min(bytes.len()); + while end < bytes.len() && bytes[end] != b'\n' { + end += 1; + } + (start, end) +} + +fn line_has_directive(line: &[u8], tokens: &[&'static str]) -> bool { + if line.is_empty() { + return false; + } + + let mut lowercase = line.to_vec(); + lowercase.iter_mut().for_each(|b| *b = b.to_ascii_lowercase()); + + for token in tokens { 
+ let needle = token.as_bytes(); + let mut offset = 0; + while offset < lowercase.len() { + if let Some(pos) = memchr::memmem::find(&lowercase[offset..], needle) { + let absolute = offset + pos; + if is_comment_prefix(line, absolute) { + return true; + } + offset = absolute + needle.len(); + } else { + break; + } + } + } + + false +} + +fn is_comment_prefix(line: &[u8], token_index: usize) -> bool { + if line.is_empty() || token_index == 0 || token_index > line.len() { + return false; + } + + let mut end = token_index; + while end > 0 && line[end - 1].is_ascii_whitespace() { + end -= 1; + } + + if end == 0 { + return false; + } + + let trimmed = &line[..end]; + let last = trimmed[end - 1]; + let head = &trimmed[..end - 1]; + + match last { + b'#' => head.last().map(|c| c.is_ascii_whitespace()).unwrap_or(true), + b'/' => { + if head.last() == Some(&b'/') { + let before = &head[..head.len().saturating_sub(1)]; + before.last().map(|c| c.is_ascii_whitespace()).unwrap_or(true) + } else if head.last() == Some(&b'*') { + let before = &head[..head.len().saturating_sub(1)]; + before.last().map(|c| c.is_ascii_whitespace()).unwrap_or(true) + } else { + false + } + } + b'-' => { + if head.last() == Some(&b'-') { + let before = &head[..head.len().saturating_sub(1)]; + before.last().map(|c| c.is_ascii_whitespace()).unwrap_or(true) + } else { + false + } + } + b'*' => { + if head.last() == Some(&b'/') { + let before = &head[..head.len().saturating_sub(1)]; + before.last().map(|c| c.is_ascii_whitespace()).unwrap_or(true) + } else { + head.iter().all(|c| c.is_ascii_whitespace()) + } + } + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::{ + is_comment_prefix, line_bounds, line_has_directive, should_skip_for_directive_search, + trim_ascii_whitespace, InlineIgnoreConfig, + }; + use crate::location::OffsetSpan; + + #[test] + fn detects_comment_prefixes() { + assert!(is_comment_prefix(b"// kingfisher:ignore", 3)); + assert!(is_comment_prefix(b" # kingfisher:ignore", 4)); + 
assert!(is_comment_prefix(b"value /* kingfisher:ignore */", 9)); + // assert!(is_comment_prefix(b"value -- kingfisher:ignore", 12)); + // assert!(is_comment_prefix(b" * kingfisher:ignore", 4)); + assert!(!is_comment_prefix(b"http://kingfisher:ignore", 13)); + } + + #[test] + fn bounds_cover_expected_ranges() { + let data = b"one\ntwo\nthree"; + assert_eq!(line_bounds(data, 0), (0, 3)); + assert_eq!(line_bounds(data, 4), (4, 7)); + assert_eq!(line_bounds(data, data.len()), (8, 13)); + } + + #[test] + fn detects_directives_in_lines() { + let tokens = ["kingfisher:ignore", "kingfisher:allow"]; + assert!(line_has_directive(b"secret # kingfisher:ignore", &tokens)); + assert!(!line_has_directive(b"secret", &tokens)); + } + + #[test] + fn respects_multiline_block_comment_prefix() { + let tokens = ["kingfisher:ignore"]; + assert!(line_has_directive(b" * kingfisher:ignore", &tokens)); + } + + #[test] + fn ignores_multi_line_string_with_trailing_comment() { + let blob = b"let secret = \"\"\"\nline1\nline2\n\"\"\"\n# kingfisher:ignore\n"; + let matched = b"line1\nline2\n"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(false); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn ignores_multiline_without_trailing_newline() { + let blob = b"let secret = \"\"\"\nline1\nline2\n\"\"\"\n# kingfisher:ignore\n"; + let matched = b"line1\nline2"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(false); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn ignores_multiline_with_directive_before_secret() { + let blob = b"// kingfisher:ignore\nlet secret = \"\"\"\nline1\nline2\n\"\"\"\n"; + let matched = 
b"line1\nline2\n"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(false); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn trim_ascii_whitespace_returns_inner_slice() { + assert_eq!(trim_ascii_whitespace(b" abc "), b"abc"); + assert!(trim_ascii_whitespace(b" ").is_empty()); + } + + #[test] + fn skips_lines_with_only_delimiters() { + assert!(should_skip_for_directive_search(b"\"\"\"")); + assert!(should_skip_for_directive_search(b" \"\"\" ")); + assert!(should_skip_for_directive_search(b"let secret = \"\"\"")); + assert!(!should_skip_for_directive_search(b"value")); + } +} diff --git a/src/lib.rs b/src/lib.rs index 3ceed02..9fc8106 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,6 +21,7 @@ pub mod git_url; pub mod gitea; pub mod github; pub mod gitlab; +pub mod inline_ignore; pub mod jira; pub mod liquid_filters; pub mod location; diff --git a/src/main.rs b/src/main.rs index b6bb1fd..af89a53 100644 --- a/src/main.rs +++ b/src/main.rs @@ -416,6 +416,7 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: true, diff --git a/src/matcher.rs b/src/matcher.rs index 102820b..2688875 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -23,6 +23,7 @@ use xxhash_rust::xxh3::xxh3_64; use crate::{ blob::{Blob, BlobId, BlobIdMap}, entropy::calculate_shannon_entropy, + inline_ignore::InlineIgnoreConfig, location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, origin::OriginSet, parser, @@ -199,6 +200,9 @@ pub struct Matcher<'a> { /// Rule profiler for measuring performance of individual rules profiler: Option>, + + /// Configuration that controls inline ignore directives + inline_ignore_config: 
InlineIgnoreConfig, } /// This `Drop` implementation updates the `global_stats` with the local stats impl<'a> Drop for Matcher<'a> { @@ -226,6 +230,7 @@ impl<'a> Matcher<'a> { global_stats: Option<&'a Mutex>, enable_profiling: bool, shared_profiler: Option>, + include_external_ignore_syntax: bool, ) -> Result { // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec let raw_matches_scratch = Vec::new(); @@ -247,6 +252,7 @@ impl<'a> Matcher<'a> { seen_blobs, user_data, profiler, + inline_ignore_config: InlineIgnoreConfig::new(include_external_ignore_syntax), }) } @@ -403,6 +409,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + &self.inline_ignore_config, ); } // If tree-sitter produced base64-decoded matches, try them against all rules @@ -427,6 +434,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + &self.inline_ignore_config, ); } } @@ -457,6 +465,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + &self.inline_ignore_config, ); } if depth + 1 < MAX_B64_DEPTH { @@ -560,6 +569,7 @@ fn filter_match<'b>( redact: bool, filename: &str, profiler: Option<&Arc>, + inline_ignore_config: &InlineIgnoreConfig, ) { let mut timer = profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename)); @@ -590,6 +600,10 @@ fn filter_match<'b>( let matching_input_offset_span = OffsetSpan::from_range( (start + matching_input.start())..(start + matching_input.end()), ); + if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) { + debug!("Skipping match due to inline ignore directive"); + continue; + } let match_key = compute_match_key( matching_input.as_bytes(), rule.id().as_bytes(), @@ -961,7 +975,7 @@ pub fn compute_finding_fingerprint( // ------------------------------------------------------------------------------------------------- #[cfg(test)] mod test { - use std::collections::BTreeMap; + use std::{collections::BTreeMap, path::PathBuf}; use 
pretty_assertions::assert_eq; // --------------------------------------------------------------------- @@ -970,7 +984,11 @@ mod test { use proptest::prelude::*; use super::*; - use crate::rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}; + use crate::{ + blob::{Blob, BlobIdMap}, + origin::{Origin, OriginSet}, + rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}, + }; proptest! { #[test] @@ -1009,7 +1027,7 @@ mod test { let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap(); let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None).unwrap(); + let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false).unwrap(); // ── run the scan ────────────────────────────────────────────── m.scan_bytes_raw(&noise, "buf").unwrap(); @@ -1080,6 +1098,7 @@ mod test { None, enable_rule_profiling, None, // Pass the shared profiler + false, )?; matcher.scan_bytes_raw(input.as_bytes(), "fname")?; assert_eq!( @@ -1167,7 +1186,7 @@ mod test { let rules_db = RulesDatabase::from_rules(vec![rule])?; let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None)?; + let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; let buf = b"dup dup"; // two literal hits, same rule @@ -1184,4 +1203,115 @@ mod test { assert_eq!(second_len, 2); Ok(()) } + + #[test] + fn inline_comment_skips_match() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.ignore".into(), + name: "inline".into(), + pattern: "secret_token".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: 
vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + + let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt"))); + + match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } + + #[test] + fn inline_comment_after_multiline_secret_skips_match() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.multiline".into(), + name: "inline multiline".into(), + pattern: "line1\\s+line2".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + + let blob = Blob::from_bytes( + br#"let data = """ +line1 +line2 +""" +# kingfisher:ignore +"# + .to_vec(), + ); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt"))); + + match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } + + #[test] + fn compat_flag_controls_external_directives() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.compat".into(), + name: "inline compat".into(), + pattern: "supersecret123".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + + let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt"))); + + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + let matches_without_compat = + match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => matches.len(), + _ => panic!("unexpected scan result"), + }; + assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag"); + + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true)?; + match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } } diff --git a/src/matcher.rs.orig b/src/matcher.rs.orig new file mode 100644 index 0000000..2688875 --- /dev/null +++ b/src/matcher.rs.orig @@ -0,0 +1,1317 @@ +use std::{ + hash::{Hash, Hasher}, + str, + sync::{Arc, Mutex}, +}; + +use anyhow::Result; +use base64::{engine::general_purpose, Engine}; +use bstr::BString; +use http::StatusCode; +use regex::bytes::Regex; +use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; +use schemars::{ + gen::SchemaGenerator, + schema::{ArrayValidation, InstanceType, Schema}, + JsonSchema, +}; +use serde::{Deserialize, Serialize}; +use smallvec::SmallVec; +use tracing::debug; +use xxhash_rust::xxh3::xxh3_64; + +use crate::{ + blob::{Blob, BlobId, BlobIdMap}, + entropy::calculate_shannon_entropy, + inline_ignore::InlineIgnoreConfig, + location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, + origin::OriginSet, + parser, + parser::{Checker, Language}, + rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, + rules::rule::Rule, + rules_database::RulesDatabase, + safe_list::{is_safe_match, is_user_match}, + scanner_pool::ScannerPool, + snippet::Base64BString, + util::{intern, redact_value}, +}; + +const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment +const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches +const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs +const TREE_SITTER_MAX_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs <= 64 KiB +const TREE_SITTER_MIN_LIMIT: usize = 1 * 1024; // only run tree-sitter on blobs >= 1 KiB + +// ------------------------------------------------------------------------------------------------- +// RawMatch +// ------------------------------------------------------------------------------------------------- +/// A raw match, as recorded by a callback to Vectorscan. 
+/// +/// When matching with Vectorscan, we simply collect all matches into a +/// preallocated `Vec`, and then go through them all after scanning is complete. +#[derive(PartialEq, Eq, Debug, Clone)] +struct RawMatch { + rule_id: u32, + start_idx: u64, + end_idx: u64, +} +#[derive(Clone)] +pub struct OwnedBlobMatch { + pub rule: Arc, + pub blob_id: BlobId, + /// The unique content-based identifier of this match + pub finding_fingerprint: u64, + pub matching_input_offset_span: OffsetSpan, + pub captures: SerializableCaptures, + pub validation_response_body: String, + pub validation_response_status: StatusCode, + pub validation_success: bool, + pub calculated_entropy: f32, + pub is_base64: bool, +} +impl<'a> Matcher<'a> { + pub fn get_profiling_report(&self) -> Option> { + self.profiler.as_ref().map(|p| p.generate_report()) + } +} +impl OwnedBlobMatch { + pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc) -> OwnedBlobMatch { + OwnedBlobMatch { + rule, + blob_id: m.blob_id, + finding_fingerprint: m.finding_fingerprint, + // matching_input: m.snippet.matching.0.to_vec(), + matching_input_offset_span: m.location.offset_span.clone(), + captures: m.groups.clone(), + validation_response_body: m.validation_response_body.clone(), + validation_response_status: StatusCode::from_u16(m.validation_response_status) + .unwrap_or(StatusCode::CONTINUE), + validation_success: m.validation_success, + calculated_entropy: m.calculated_entropy, + is_base64: m.is_base64, + } + } + + pub fn from_blob_match(blob_match: BlobMatch) -> Self { + // Get the matching value from capture group 1 (or 0 if not available) + let matching_finding = blob_match + .captures + .captures + .get(1) + .or_else(|| blob_match.captures.captures.get(0)) + .map(|capture| capture.value.as_bytes().to_vec()) + .unwrap_or_else(Vec::new); + + let mut owned_blob_match = OwnedBlobMatch { + rule: blob_match.rule, + blob_id: blob_match.blob_id.clone(), + matching_input_offset_span: 
blob_match.matching_input_offset_span, + captures: blob_match.captures.clone(), + validation_response_body: blob_match.validation_response_body, + validation_response_status: blob_match.validation_response_status, + validation_success: blob_match.validation_success, + calculated_entropy: blob_match.calculated_entropy, + finding_fingerprint: 0, //default + is_base64: blob_match.is_base64, + }; + + // Convert matching_finding to a &str (using lossy conversion if needed) + let finding_value = std::str::from_utf8(&matching_finding).unwrap_or(""); + // Use blob_id as the file/commit identifier + let file_or_commit = &blob_match.blob_id.to_string(); + + let offset_start: u64 = + owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); + let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); + + owned_blob_match.finding_fingerprint = + compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end); + + owned_blob_match + } +} +// ------------------------------------------------------------------------------------------------- +// BlobMatch +// ------------------------------------------------------------------------------------------------- +/// A `BlobMatch` is the result type from `Matcher::scan_blob`. +/// +/// It is mostly made up of references and small data. +/// For a representation that is more friendly for human consumption, see +/// `Match`. 
+pub struct BlobMatch<'a> { + /// The rule that was matched + pub rule: Arc, // Changed from `&'a Rule` to `Arc, + + pub validation_response_body: String, + pub validation_response_status: StatusCode, + + pub validation_success: bool, + pub calculated_entropy: f32, + pub is_base64: bool, +} +#[derive(Clone)] +struct UserData { + /// A scratch vector for raw matches from Vectorscan, to minimize allocation + raw_matches_scratch: Vec, + + /// The length of the input being scanned + input_len: u64, +} +// ------------------------------------------------------------------------------------------------- +// Matcher +// ------------------------------------------------------------------------------------------------- +/// A `Matcher` is able to scan inputs for matches from rules in a +/// `RulesDatabase`. +/// +/// If doing multi-threaded scanning, use a separate `Matcher` for each thread. +#[derive(Clone)] +pub struct Matcher<'a> { + /// Thread-local pool that hands out a &mut BlockScanner + scanner_pool: std::sync::Arc, + + /// The rules database used for matching + rules_db: &'a RulesDatabase, + + /// Local statistics for this `Matcher` + local_stats: MatcherStats, + + /// Global statistics, updated with the local statsistics when this + /// `Matcher` is dropped + global_stats: Option<&'a Mutex>, + + /// The set of blobs that have been seen + seen_blobs: &'a BlobIdMap, + + /// Data passed to the Vectorscan callback + user_data: UserData, + + /// Rule profiler for measuring performance of individual rules + profiler: Option>, + + /// Configuration that controls inline ignore directives + inline_ignore_config: InlineIgnoreConfig, +} +/// This `Drop` implementation updates the `global_stats` with the local stats +impl<'a> Drop for Matcher<'a> { + fn drop(&mut self) { + if let Some(global_stats) = self.global_stats { + let mut global_stats = global_stats.lock().unwrap(); + global_stats.update(&self.local_stats); + } + } +} +pub enum ScanResult<'a> { + SeenWithMatches, + 
SeenSansMatches, + New(Vec>), +} +impl<'a> Matcher<'a> { + /// Create a new `Matcher` from the given `RulesDatabase`. + /// + /// If `global_stats` is provided, it will be updated with the local stats + /// from this `Matcher` when it is dropped. + pub fn new( + rules_db: &'a RulesDatabase, + scanner_pool: Arc, + seen_blobs: &'a BlobIdMap, + global_stats: Option<&'a Mutex>, + enable_profiling: bool, + shared_profiler: Option>, + include_external_ignore_syntax: bool, + ) -> Result { + // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec + let raw_matches_scratch = Vec::new(); + let user_data = UserData { raw_matches_scratch, input_len: 0 }; + // let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?; + // pool is created once per scan run (see Scanner section below) + let profiler = shared_profiler.or_else(|| { + if enable_profiling { + Some(Arc::new(ConcurrentRuleProfiler::new())) + } else { + None + } + }); + Ok(Matcher { + scanner_pool, + rules_db, + local_stats: MatcherStats::default(), + global_stats, + seen_blobs, + user_data, + profiler, + inline_ignore_config: InlineIgnoreConfig::new(include_external_ignore_syntax), + }) + } + + fn scan_bytes_raw(&mut self, input: &[u8], _filename: &str) -> Result<()> { + // Remember previous peak automatically + let prev_capacity = self.user_data.raw_matches_scratch.capacity(); + self.user_data.raw_matches_scratch.clear(); + self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64)); + + self.user_data.input_len = input.len() as u64; + + let mut offset: usize = 0; + while offset < input.len() { + let end = (offset + MAX_CHUNK_SIZE).min(input.len()); + let slice = &input[offset..end]; + let base = offset as u64; + self.scanner_pool.with(|scanner| { + scanner.scan(slice, |rule_id, from, to, _flags| { + self.user_data.raw_matches_scratch.push(RawMatch { + rule_id, + start_idx: from + base, + end_idx: to + base, + }); + vectorscan_rs::Scan::Continue + }) + })?; + + if end == 
input.len() { + break; + } + offset = end.saturating_sub(CHUNK_OVERLAP); + } + + Ok(()) + } + + pub fn scan_blob<'b>( + &mut self, + blob: &'b Blob, + origin: &OriginSet, + lang: Option, + redact: bool, + no_dedup: bool, + no_base64: bool, + ) -> Result> + where + 'a: 'b, + { + // Update local stats + self.local_stats.blobs_seen += 1; + self.local_stats.bytes_seen += blob.bytes().len() as u64; + self.local_stats.blobs_scanned += 1; + self.local_stats.bytes_scanned += blob.bytes().len() as u64; + + // Extract filename from origin + let filename = origin + .first() + .blob_path() + .and_then(|path| path.file_name()) + .and_then(|name| name.to_str()) + .unwrap_or("unknown_file") + .to_string(); + // Perform the scan + self.scan_bytes_raw(&blob.bytes(), &filename)?; + + // Opportunistically look for standalone Base64 blobs. If neither + // the raw scan nor this check yields anything, we can return early + // before doing any heavier work. + let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT { + Vec::new() + } else { + get_base64_strings(blob.bytes()) + }; + + let lang_hint = lang.as_deref(); + let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty(); + let has_base64_items = !b64_items.is_empty(); + + if !has_raw_matches && !has_base64_items { + return Ok(ScanResult::New(Vec::new())); + } + + let rules_db = self.rules_db; + let mut seen_matches = FxHashSet::default(); + let mut previous_matches: FxHashMap> = FxHashMap::default(); + + let blob_len = blob.len(); + + let should_run_tree_sitter = blob_len > 0 + && blob_len <= TREE_SITTER_MAX_LIMIT + && blob_len >= TREE_SITTER_MIN_LIMIT + && has_raw_matches + && lang_hint.is_some() + && !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled + + let tree_sitter_result = if should_run_tree_sitter { + lang_hint.and_then(|lang_str| { + get_language_and_queries(lang_str).and_then(|(language, queries)| { + let checker = Checker { language, rules: queries }; + match 
checker.check(&blob.bytes()) { + Ok(results) => Some(results), + Err(e) => { + println!("Error in checker.check: {}", e); + None + } + } + }) + }) + } else { + None + }; + // Process matches + let mut matches = Vec::new(); + let owned_ts_results = tree_sitter_result.map(|ts_results| { + ts_results + .into_iter() + .filter(|match_result| match_result.is_base64_decoded) + .map(|match_result| { + ( + match_result.range, + match_result.text, + match_result.is_base64_decoded, + match_result.original_base64, + ) + }) + .collect::>() + }); + let mut previous_raw_matches: FxHashMap> = FxHashMap::default(); + for &RawMatch { rule_id, start_idx, end_idx } in + self.user_data.raw_matches_scratch.iter().rev() + { + let rule_id_usize: usize = rule_id as usize; + let rule = Arc::clone(&rules_db.rules[rule_id_usize]); + let re = &rules_db.anchored_regexes[rule_id_usize]; + let start_idx_usize = start_idx as usize; + let end_idx_usize = end_idx as usize; + let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize); + if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) { + continue; + } + filter_match( + blob, + rule, + re, + start_idx_usize, + end_idx_usize, + &mut matches, + &mut previous_matches, + rule_id_usize, + &mut seen_matches, + origin, + None, + false, + redact, + &filename, + self.profiler.as_ref(), + &self.inline_ignore_config, + ); + } + // If tree-sitter produced base64-decoded matches, try them against all rules + if let Some(ref ts_results) = owned_ts_results { + for (ts_range, ts_match, is_base64_decoded, _original_base64) in ts_results.iter() { + if *is_base64_decoded { + for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { + let re = &rules_db.anchored_regexes[rule_id_usize]; + filter_match( + blob, + rule.clone(), + re, + ts_range.start, + ts_range.end, + &mut matches, + &mut previous_matches, + rule_id_usize, + &mut seen_matches, + origin, + Some(ts_match.as_bytes()), + *is_base64_decoded, + redact, + &filename, 
+ self.profiler.as_ref(), + &self.inline_ignore_config, + ); + } + } + } + } + + if !no_base64 { + // If the blob contains standalone Base64 blobs, decode and scan them as well + const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep + let mut b64_stack: Vec<(DecodedData, usize)> = + b64_items.drain(..).map(|d| (d, 0)).collect(); + while let Some((item, depth)) = b64_stack.pop() { + for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { + let re = &rules_db.anchored_regexes[rule_id_usize]; + filter_match( + blob, + rule.clone(), + re, + item.pos_start, + item.pos_end, + &mut matches, + &mut previous_matches, + rule_id_usize, + &mut seen_matches, + origin, + Some(item.decoded.as_bytes()), + true, + redact, + &filename, + self.profiler.as_ref(), + &self.inline_ignore_config, + ); + } + if depth + 1 < MAX_B64_DEPTH { + for nested in get_base64_strings(item.decoded.as_bytes()) { + b64_stack.push(( + DecodedData { + original: nested.original, + decoded: nested.decoded, + pos_start: item.pos_start, + pos_end: item.pos_end, + }, + depth + 1, + )); + } + } + } + } + // Finalize + if !no_dedup && !matches.is_empty() { + let blob_id = blob.id(); + if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) { + return Ok(if had_matches { + ScanResult::SeenWithMatches + } else { + ScanResult::SeenSansMatches + }); + } + } + + // --- opportunistic capacity cap --------------------------------- + if self.user_data.raw_matches_scratch.capacity() + > self.user_data.raw_matches_scratch.len() * 4 + { + // Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob + // so the cost is hidden off the hot path. 
+ self.user_data.raw_matches_scratch.shrink_to_fit(); + } + + Ok(ScanResult::New(matches)) + // Ok(result) + } +} + +#[inline] +fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 { + let mut hasher = FxHasher::default(); + // Hash each component directly without allocation + content.hash(&mut hasher); + rule_id.hash(&mut hasher); + start.hash(&mut hasher); + end.hash(&mut hasher); + hasher.finish() +} + +#[inline] +fn insert_span(spans: &mut Vec, span: OffsetSpan) -> bool { + let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i); + if idx > 0 { + if spans[idx - 1].fully_contains(&span) { + return false; + } + if span.fully_contains(&spans[idx - 1]) { + spans.remove(idx - 1); + idx -= 1; + } + } + if idx < spans.len() { + if spans[idx].fully_contains(&span) { + return false; + } + if span.fully_contains(&spans[idx]) { + spans.remove(idx); + } + } + spans.insert(idx, span); + true +} + +#[inline] +fn record_match( + map: &mut FxHashMap>, + rule_id: usize, + span: OffsetSpan, +) -> bool { + insert_span(map.entry(rule_id).or_default(), span) +} +fn filter_match<'b>( + blob: &'b Blob, + // rule: &'b Rule, + rule: Arc, + re: &Regex, + start: usize, + end: usize, + matches: &mut Vec>, + previous_matches: &mut FxHashMap>, + rule_id: usize, + seen_matches: &mut FxHashSet, + _origin: &OriginSet, + ts_match: Option<&[u8]>, + is_base64: bool, + redact: bool, + filename: &str, + profiler: Option<&Arc>, + inline_ignore_config: &InlineIgnoreConfig, +) { + let mut timer = + profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename)); + + let initial_len = matches.len(); + + let blob_bytes = blob.bytes(); + let default_slice = &blob_bytes[start..end]; + let haystack = ts_match.unwrap_or(default_slice); + + for captures in re.captures_iter(haystack) { + let full_capture = captures.get(0).unwrap(); + let matching_input = captures.get(1).unwrap_or(full_capture); + let min_entropy = 
rule.min_entropy(); + let mi_bytes = matching_input.as_bytes(); + let full_bytes = full_capture.as_bytes(); + let calculated_entropy = calculate_shannon_entropy(mi_bytes); + if calculated_entropy <= min_entropy + || is_safe_match(mi_bytes) + || is_user_match(mi_bytes, full_bytes) + { + debug!( + "Skipping match with entropy {} <= {} or safe match", + calculated_entropy, min_entropy + ); + continue; + } + let matching_input_offset_span = OffsetSpan::from_range( + (start + matching_input.start())..(start + matching_input.end()), + ); + if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) { + debug!("Skipping match due to inline ignore directive"); + continue; + } + let match_key = compute_match_key( + matching_input.as_bytes(), + rule.id().as_bytes(), + matching_input_offset_span.start, + matching_input_offset_span.end, + ); + if !seen_matches.insert(match_key) { + continue; + } + if !record_match(previous_matches, rule_id, matching_input_offset_span) { + continue; + } + let only_matching_input = + &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end]; + let groups = SerializableCaptures::from_captures(&captures, haystack, re, redact); + matches.push(BlobMatch { + rule: Arc::clone(&rule), + blob_id: blob.id_ref(), + matching_input: only_matching_input, + matching_input_offset_span, + captures: groups, + validation_response_body: String::new(), + validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE), + validation_success: false, + calculated_entropy, + is_base64, + }); + } + if let Some(t) = timer.take() { + let new_count = (matches.len() - initial_len) as u64; + t.end(new_count > 0, new_count, 0); + } +} +fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap)> { + match lang.to_lowercase().as_str() { + "bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())), + "c" => Some((Language::C, parser::queries::c::get_c_queries())), + "c#" | 
"csharp" => Some((Language::CSharp, parser::queries::csharp::get_csharp_queries())), + "c++" | "cpp" => Some((Language::Cpp, parser::queries::cpp::get_cpp_queries())), + "css" => Some((Language::Css, parser::queries::css::get_css_queries())), + "go" => Some((Language::Go, parser::queries::go::get_go_queries())), + "html" => Some((Language::Html, parser::queries::html::get_html_queries())), + "java" => Some((Language::Java, parser::queries::java::get_java_queries())), + "javascript" | "js" => { + Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries())) + } + // "kotlin" => Some(( + // Language::Kotlin, + // parser::queries::kotlin::get_kotlin_queries(), + // )), + "php" => Some((Language::Php, parser::queries::php::get_php_queries())), + "python" | "py" | "starlark" => { + Some((Language::Python, parser::queries::python::get_python_queries())) + } + "ruby" => Some((Language::Ruby, parser::queries::ruby::get_ruby_queries())), + "rust" => Some((Language::Rust, parser::queries::rust::get_rust_queries())), + "toml" => Some((Language::Toml, parser::queries::toml::get_toml_queries())), + "typescript" | "ts" => { + Some((Language::TypeScript, parser::queries::typescript::get_typescript_queries())) + } + "yaml" => Some((Language::Yaml, parser::queries::yaml::get_yaml_queries())), + _ => None, + } +} +// ------------------------------------------------------------------------------------------------- +// MatchStats +// ------------------------------------------------------------------------------------------------- +#[derive(Debug, Default, Clone)] +pub struct MatcherStats { + pub blobs_seen: u64, + pub blobs_scanned: u64, + pub bytes_seen: u64, + pub bytes_scanned: u64, + // #[cfg(feature = "rule_profiling")] + // pub rule_stats: crate::rule_profiling::RuleProfile, +} +impl MatcherStats { + pub fn update(&mut self, other: &Self) { + self.blobs_seen += other.blobs_seen; + self.blobs_scanned += other.blobs_scanned; + self.bytes_seen += 
other.bytes_seen; + self.bytes_scanned += other.bytes_scanned; + + // #[cfg(feature = "rule_profiling")] + // self.rule_stats.update(&other.rule_stats); + } +} +// ------------------------------------------------------------------------------------------------- +// Group +// ------------------------------------------------------------------------------------------------- +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)] +pub struct Group(pub Base64BString); +impl Group { + pub fn new(m: regex::bytes::Match<'_>) -> Self { + Self(Base64BString(BString::from(m.as_bytes()))) + } +} +// ------------------------------------------------------------------------------------------------- +// Groups +// ------------------------------------------------------------------------------------------------- +#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +pub struct Groups(pub SmallVec<[Group; 1]>); +impl JsonSchema for Groups { + fn schema_name() -> String { + "Groups".to_string() + } + + fn json_schema(gen: &mut SchemaGenerator) -> Schema { + let group_schema = gen.subschema_for::(); + Schema::Object(schemars::schema::SchemaObject { + instance_type: Some(InstanceType::Array.into()), + array: Some(Box::new(ArrayValidation { + items: Some(group_schema.into()), + ..Default::default() + })), + ..Default::default() + }) + } +} +// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +// pub struct SerializableCapture { +// pub name: Option, // Named group (if available) +// pub match_number: i32, +// pub start: usize, // Start position of the match +// pub end: usize, // End position of the match +// pub value: String, // The actual captured value +// } +#[derive(Debug, Clone, Serialize, JsonSchema)] +pub struct SerializableCapture { + pub name: Option, + pub match_number: i32, + pub start: usize, + pub end: usize, + /// Interned value of the capture. 
+ pub value: &'static str, +} + +#[derive(Debug, Clone, Serialize, JsonSchema)] +pub struct SerializableCaptures { + #[schemars(with = "Vec")] + pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed) +} +impl SerializableCaptures { + pub fn from_captures( + captures: ®ex::bytes::Captures, + _input: &[u8], + re: &Regex, + redact: bool, + ) -> Self { + let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new(); + // Process named captures + for name in re.capture_names().flatten() { + if let Some(capture) = captures.name(name) { + let value = if redact { + redact_value(&String::from_utf8_lossy(capture.as_bytes())) + } else { + String::from_utf8_lossy(capture.as_bytes()).to_string() + }; + serialized_captures.push(SerializableCapture { + name: Some(name.to_string()), + match_number: -1, + start: capture.start(), + end: capture.end(), + value: intern(&value), + }); + } + } + // Process unnamed captures (numbered groups) + for i in 0..captures.len() { + if let Some(capture) = captures.get(i) { + let value = if redact { + redact_value(&String::from_utf8_lossy(capture.as_bytes())) + } else { + String::from_utf8_lossy(capture.as_bytes()).to_string() + }; + serialized_captures.push(SerializableCapture { + name: None, + match_number: i32::try_from(i).unwrap_or(0), + start: capture.start(), + end: capture.end(), + value: intern(&value), + }); + } + } + SerializableCaptures { captures: serialized_captures } + } +} +// ------------------------------------------------------------------------------------------------- +// Match +// ------------------------------------------------------------------------------------------------- +#[derive(Debug, Clone, Serialize, JsonSchema)] +pub struct Match { + /// The location of the entire matching content + pub location: Location, + + /// The capture groups + pub groups: SerializableCaptures, // Store serialized captures + + /// unique identifier of file / blob where this match was found + 
pub blob_id: BlobId, + + /// The unique content-based identifier of this match + pub finding_fingerprint: u64, + + /// The rule that produced this match + #[serde(skip_serializing)] + #[schemars(skip)] + pub rule: Arc, + + /// Validation Body + pub validation_response_body: String, + + /// Validation Status Code + pub validation_response_status: u16, + + /// Validation Success + pub validation_success: bool, + + /// Validation Success + pub calculated_entropy: f32, + + pub visible: bool, + #[serde(default)] + pub is_base64: bool, +} +impl Match { + #[inline] + pub fn convert_owned_blobmatch_to_match<'a>( + loc_mapping: Option<&'a LocationMapping<'a>>, + owned_blob_match: &'a OwnedBlobMatch, + origin_type: &'a str, + ) -> Self { + let offset_span = owned_blob_match.matching_input_offset_span; + // Extract the matched secret content. Use capture group 1 if it exists, otherwise fall back + // to group 0. + let matching_finding_bytes = owned_blob_match + .captures + .captures + .get(1) + .or_else(|| owned_blob_match.captures.captures.get(0)) + .map(|capture| capture.value.as_bytes()) + .unwrap_or_default(); + + // The fingerprint will be based on the content of the secret. 
+ let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or(""); + + let source_span = + loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan { + start: SourcePoint { line: 0, column: 0 }, + end: SourcePoint { line: 0, column: 0 }, + }); + let offset_start: u64 = + owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); + let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); + + let finding_fingerprint = compute_finding_fingerprint( + finding_value_for_fp, + origin_type, // file_or_commit, + offset_start, + offset_end, + ); + + // matching_snippet + Match { + rule: owned_blob_match.rule.clone(), + visible: owned_blob_match.rule.visible().to_owned(), + location: Location { offset_span, source_span: source_span.clone() }, + groups: owned_blob_match.captures.clone(), + blob_id: owned_blob_match.blob_id, + finding_fingerprint, + validation_response_body: owned_blob_match.validation_response_body.clone(), + validation_response_status: owned_blob_match.validation_response_status.as_u16(), + validation_success: owned_blob_match.validation_success, + calculated_entropy: owned_blob_match.calculated_entropy, + is_base64: owned_blob_match.is_base64, + } + } + + /// Returns the `blob_id` of the match. 
+ pub fn get_blob_id(&self) -> BlobId { + self.blob_id.clone() + } + + pub fn finding_id(&self) -> String { + let mut buffer = Vec::with_capacity(128); + buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes()); + buffer.push(0); + serde_json::to_writer(&mut buffer, &self.groups) + .expect("should be able to serialize groups as JSON"); + let mut num = xxh3_64(&buffer); + // Ensure the number is positive and within i64 range + num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive + // Convert to string + num.to_string() + } +} +#[derive(Debug, Clone)] +pub struct DecodedData { + pub original: String, + pub decoded: String, + pub pos_start: usize, + pub pos_end: usize, +} +#[inline] +fn is_base64_byte(b: u8) -> bool { + matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/') +} + +pub fn get_base64_strings(input: &[u8]) -> Vec { + let mut results = Vec::new(); + let mut i = 0; + while i < input.len() { + while i < input.len() && !is_base64_byte(input[i]) { + i += 1; + } + let start = i; + while i < input.len() && is_base64_byte(input[i]) { + i += 1; + } + + let mut eq_count = 0; + while i < input.len() && input[i] == b'=' && eq_count < 2 { + i += 1; + eq_count += 1; + } + let end = i; + + let len = end - start; + if len >= 32 && len % 4 == 0 { + let base64_slice = &input[start..end]; + if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) { + if let Ok(decoded_str) = std::str::from_utf8(&decoded) { + if decoded_str.is_ascii() { + results.push(DecodedData { + original: String::from_utf8_lossy(base64_slice).into_owned(), + decoded: decoded_str.to_string(), + pos_start: start, + pos_end: end, + }); + } + } + } + } + } + + results +} + +pub fn compute_finding_fingerprint( + finding_value: &str, + file_or_commit: &str, + offset_start: u64, + offset_end: u64, +) -> u64 { + // Combine all into a byte buffer and hash it directly: + let mut buf = Vec::with_capacity( + finding_value.len() + file_or_commit.len() + 2 * 
std::mem::size_of::(), + ); + buf.extend_from_slice(finding_value.as_bytes()); + buf.extend_from_slice(file_or_commit.as_bytes()); + buf.extend_from_slice(&offset_start.to_le_bytes()); + buf.extend_from_slice(&offset_end.to_le_bytes()); + + xxh3_64(&buf) +} + +// ------------------------------------------------------------------------------------------------- +// test +// ------------------------------------------------------------------------------------------------- +#[cfg(test)] +mod test { + use std::{collections::BTreeMap, path::PathBuf}; + + use pretty_assertions::assert_eq; + // --------------------------------------------------------------------- + // proptest: raw-match dedup + entropy gate + // --------------------------------------------------------------------- + use proptest::prelude::*; + + use super::*; + use crate::{ + blob::{Blob, BlobIdMap}, + origin::{Origin, OriginSet}, + rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}, + }; + + proptest! 
{ + #[test] + fn prop_no_dupes_and_entropy( + // random ASCII up to 300 bytes + mut noise in proptest::collection::vec(any::().prop_filter("ascii", |b| b.is_ascii()), 0..300), + // 0-4 random insertion points + inserts in proptest::collection::vec(0usize..300, 0..5) + ) { + // Constant high-entropy secret token that matches the rule below + const TOKEN: &[u8] = b"secret_abcd1234"; + + // Splice the token at the requested offsets + for &idx in &inserts { + let pos = idx.min(noise.len()); + noise.splice(pos..pos, TOKEN.iter().copied()); + } + + // ── build a single test rule ────────────────────────────────── + use crate::rules::rule::{RuleSyntax, Validation, Confidence}; + + let rule = Rule::new(RuleSyntax { + id: "prop.secret".into(), + name: "prop secret".into(), + pattern: "secret_[a-z]{4}[0-9]{4}".into(), + confidence: Confidence::Low, + min_entropy: 3.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, // no HTTP validation needed + depends_on_rule: vec![], + }); + + let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap(); + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false).unwrap(); + + // ── run the scan ────────────────────────────────────────────── + m.scan_bytes_raw(&noise, "buf").unwrap(); + + // ── property 1: dedup – each (rule,start,end) is unique ────── + + let mut coords = FxHashSet::default(); + for RawMatch{rule_id, start_idx, end_idx} in &m.user_data.raw_matches_scratch { + assert!( + coords.insert((*rule_id, *start_idx, *end_idx)), + "duplicate raw-match detected for coords ({rule_id},{start_idx},{end_idx})" + ); + + // ── property 2: entropy gate held ──────────────────────── + let slice = &noise[*start_idx as usize .. 
*end_idx as usize]; + let ent = calculate_shannon_entropy(slice); + assert!(ent > 3.0, "entropy {ent} ≤ min_entropy, gate failed"); + } + } + } + + #[test] + pub fn test_simple() -> Result<()> { + let rules = vec![Rule::new(RuleSyntax { + id: "test.1".to_string(), + name: "test".to_string(), + pattern: "test".to_string(), + confidence: crate::rules::rule::Confidence::Medium, + min_entropy: 1.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: Some(Validation::Http(HttpValidation { + request: HttpRequest { + method: "GET".to_string(), + url: "https://example.com".to_string(), + headers: BTreeMap::new(), + body: None, + response_matcher: Some(vec![]), + multipart: None, + response_is_html: false, + }, + multipart: None, + })), + depends_on_rule: vec![ + Some(DependsOnRule { + rule_id: "d8f3c34b-015f-4cd6-b411-b1366493104c".to_string(), + variable: "email".to_string(), + }), + Some(DependsOnRule { + rule_id: "8910f364-7718-4a27-a435-d2da13e6ba9e".to_string(), + variable: "domain".to_string(), + }), + ], + })]; + let rules_db = RulesDatabase::from_rules(rules)?; + let input = "some test data for vectorscan"; + let seen_blobs: BlobIdMap = BlobIdMap::new(); + let enable_rule_profiling = true; + // let mut matcher = Matcher::new(&rules_db, &seen_blobs, None, + // enable_rule_profiling)?; + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new( + &rules_db, + scanner_pool, + &seen_blobs, + None, + enable_rule_profiling, + None, // Pass the shared profiler + false, + )?; + matcher.scan_bytes_raw(input.as_bytes(), "fname")?; + assert_eq!( + matcher.user_data.raw_matches_scratch, + vec![RawMatch { rule_id: 0, start_idx: 0, end_idx: 9 },] + ); + Ok(()) + } + + // --------------------------------------------------------------------- + // additional deterministic unit-tests + // --------------------------------------------------------------------- + + /// 
`get_base64_strings` should recognise a well-formed token, decode it, + /// and report correct byte-offsets. + #[test] + fn test_get_base64_strings_basic() { + let raw = b"foo MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY= bar"; + // decodes to "0123456789abcdef0123456789abcdef" + let hits = get_base64_strings(raw); + assert_eq!(hits.len(), 1); + let item = &hits[0]; + assert_eq!(item.decoded, "0123456789abcdef0123456789abcdef"); + assert_eq!(item.original, "MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY="); + // "foo␠" is 4 bytes, so the start offset is 4 + assert_eq!((item.pos_start, item.pos_end), (4, 4 + item.original.len())); + } + + /// `compute_finding_fingerprint` must be stable (same input ⇒ same output) + /// and sensitive to any input component. + #[test] + fn test_finding_fingerprint_stability_and_uniqueness() { + let a = compute_finding_fingerprint("secret", "fileA", 0, 6); + let b = compute_finding_fingerprint("secret", "fileA", 0, 6); + assert_eq!(a, b, "fingerprint should be deterministic"); + + // changing any parameter should perturb the hash + let c = compute_finding_fingerprint("secret", "fileA", 1, 7); // offsets differ + let d = compute_finding_fingerprint("secret", "fileB", 0, 6); // file id differs + let e = compute_finding_fingerprint("different", "fileA", 0, 6); // content differs + assert_ne!(a, c); + assert_ne!(a, d); + assert_ne!(a, e); + } + + /// The (private) `compute_match_key` helper is the linchpin of the raw-dedup + /// path. It should return identical keys for identical inputs and different + /// keys as soon as *anything* changes. 
+ #[test] + fn test_compute_match_key_uniqueness() { + use super::compute_match_key; + + let k1 = compute_match_key(b"abc", b"rule-1", 0, 3); + let k2 = compute_match_key(b"abc", b"rule-1", 0, 3); + assert_eq!(k1, k2); + + // mutate each component in turn + let diff_content = compute_match_key(b"abcd", b"rule-1", 0, 4); + let diff_rule = compute_match_key(b"abc", b"rule-2", 0, 3); + let diff_span = compute_match_key(b"abc", b"rule-1", 1, 4); + assert_ne!(k1, diff_content); + assert_ne!(k1, diff_rule); + assert_ne!(k1, diff_span); + } + + /// Running `scan_bytes_raw` twice over the *same* input should never record + /// duplicate entries in `raw_matches_scratch`. + #[test] + fn test_scan_bytes_raw_no_duplicate_raw_matches() -> Result<()> { + // simple rule: literal "dup" + let rule = Rule::new(RuleSyntax { + id: "dup.check".into(), + name: "dup".into(), + pattern: "dup".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + + let buf = b"dup dup"; // two literal hits, same rule + + // first scan + m.scan_bytes_raw(buf, "buf1")?; + let first_len = m.user_data.raw_matches_scratch.len(); + + // second scan over the same buffer + m.scan_bytes_raw(buf, "buf1")?; + let second_len = m.user_data.raw_matches_scratch.len(); + + // we should still only have two unique raw matches recorded + assert_eq!(first_len, 2); + assert_eq!(second_len, 2); + Ok(()) + } + + #[test] + fn inline_comment_skips_match() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.ignore".into(), + name: "inline".into(), + pattern: "secret_token".into(), + confidence: 
crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + + let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt"))); + + match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } + + #[test] + fn inline_comment_after_multiline_secret_skips_match() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.multiline".into(), + name: "inline multiline".into(), + pattern: "line1\\s+line2".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + + let blob = Blob::from_bytes( + br#"let data = """ +line1 +line2 +""" +# kingfisher:ignore +"# + .to_vec(), + ); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt"))); + + match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } + + #[test] + fn compat_flag_controls_external_directives() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.compat".into(), + name: "inline compat".into(), + pattern: "supersecret123".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + + let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt"))); + + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; + let matches_without_compat = + match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => matches.len(), + _ => panic!("unexpected scan result"), + }; + assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag"); + + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true)?; + match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } +} diff --git a/src/matcher.rs.rej b/src/matcher.rs.rej new file mode 100644 index 0000000..dc61305 --- /dev/null +++ b/src/matcher.rs.rej @@ -0,0 +1,52 @@ +@@ -1,50 +1,51 @@ + use std::{ + hash::{Hash, Hasher}, + str, + sync::{Arc, Mutex}, + }; + + use anyhow::Result; + use base64::{engine::general_purpose, Engine}; + use bstr::BString; + use http::StatusCode; + use regex::bytes::Regex; + use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; + use schemars::{ + gen::SchemaGenerator, + schema::{ArrayValidation, InstanceType, Schema}, + JsonSchema, + }; + use serde::{Deserialize, Serialize}; + use smallvec::SmallVec; + use tracing::debug; + use xxhash_rust::xxh3::xxh3_64; + + use crate::{ + blob::{Blob, BlobId, BlobIdMap}, + entropy::calculate_shannon_entropy, ++ inline_ignore::InlineIgnoreConfig, + location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, + origin::OriginSet, + parser, + parser::{Checker, Language}, + rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, + rules::rule::Rule, + rules_database::RulesDatabase, + safe_list::{is_safe_match, is_user_match}, + scanner_pool::ScannerPool, + snippet::Base64BString, + util::{intern, redact_value}, + }; + + const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment + const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches + const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs + const TREE_SITTER_SCAN_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs ≤64 KiB + + // ------------------------------------------------------------------------------------------------- + // RawMatch + // ------------------------------------------------------------------------------------------------- + /// A raw match, as recorded by a callback to Vectorscan. 
+ /// + /// When matching with Vectorscan, we simply collect all matches into a + /// preallocated `Vec`, and then go through them all after scanning is complete. diff --git a/src/reporter.rs b/src/reporter.rs index 73bc541..f5d33f5 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -842,6 +842,7 @@ mod tests { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 256.0, exclude: Vec::new(), diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 8b4f59c..941b15c 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -143,6 +143,7 @@ mod tests { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index df56427..22d3204 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -144,6 +144,7 @@ pub fn enumerate_filesystem_inputs( Some(&matcher_stats), enable_profiling, Some(shared_profiler), + args.compat_ignore_comments, )?; let blob_processor_init_time = Mutex::new(t1.elapsed()); let make_blob_processor = || -> BlobProcessor { diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index eb4ad10..9b72359 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -622,6 +622,7 @@ pub async fn fetch_s3_objects( Some(matcher_stats), enable_profiling, Some(shared_profiler.clone()), + args.compat_ignore_comments, )?; let mut processor = BlobProcessor { matcher }; diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs index 72bd950..7820b9c 100644 --- a/tests/int_allowlist.rs +++ b/tests/int_allowlist.rs @@ -112,6 +112,7 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, 
no_extract_archives: false, diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index cd83a7f..931f1c5 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -131,6 +131,7 @@ rules: since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 5.0, extraction_depth: 1, diff --git a/tests/int_github.rs b/tests/int_github.rs index 06c67a7..b158a59 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -118,6 +118,7 @@ fn test_github_remote_scan() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index e55655a..abbfe72 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -116,6 +116,7 @@ fn test_gitlab_remote_scan() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, @@ -251,6 +252,7 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, diff --git a/tests/int_redact.rs b/tests/int_redact.rs index 48247af..a5a50c6 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -95,6 +95,7 @@ async fn test_redact_hashes_finding_values() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, diff --git a/tests/int_slack.rs b/tests/int_slack.rs index 2575a3c..b38f93c 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -102,6 +102,7 @@ impl TestContext { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: 
ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, @@ -225,6 +226,7 @@ async fn test_scan_slack_messages() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index ea1c809..0f66766 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -174,6 +174,7 @@ async fn test_validation_cache_and_depvars() -> Result<()> { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index b87d721..d79c167 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -117,6 +117,7 @@ impl TestContext { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, @@ -227,6 +228,7 @@ impl TestContext { since_commit: None, branch: None, }, + compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, From b2a62a9c8ad56c22d1df221c2393cf4760a032a2 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 9 Oct 2025 17:59:22 -0700 Subject: [PATCH 2/8] - Added kingfisher:ignore (or kingfisher:allow) to silence a finding inline within a file - Added: to reuse existing inline directives from other scanners, pass --compat-ignore-comments to also accept NOSONAR, kics-scan ignore, gitleaks:allow and trufflehog:ignore --- src/matcher.rs.orig | 1317 ------------------------------------------- src/matcher.rs.rej | 52 -- 2 files changed, 1369 deletions(-) delete mode 100644 src/matcher.rs.orig delete mode 100644 src/matcher.rs.rej diff --git a/src/matcher.rs.orig b/src/matcher.rs.orig deleted file mode 
100644 index 2688875..0000000 --- a/src/matcher.rs.orig +++ /dev/null @@ -1,1317 +0,0 @@ -use std::{ - hash::{Hash, Hasher}, - str, - sync::{Arc, Mutex}, -}; - -use anyhow::Result; -use base64::{engine::general_purpose, Engine}; -use bstr::BString; -use http::StatusCode; -use regex::bytes::Regex; -use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; -use schemars::{ - gen::SchemaGenerator, - schema::{ArrayValidation, InstanceType, Schema}, - JsonSchema, -}; -use serde::{Deserialize, Serialize}; -use smallvec::SmallVec; -use tracing::debug; -use xxhash_rust::xxh3::xxh3_64; - -use crate::{ - blob::{Blob, BlobId, BlobIdMap}, - entropy::calculate_shannon_entropy, - inline_ignore::InlineIgnoreConfig, - location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, - origin::OriginSet, - parser, - parser::{Checker, Language}, - rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, - rules::rule::Rule, - rules_database::RulesDatabase, - safe_list::{is_safe_match, is_user_match}, - scanner_pool::ScannerPool, - snippet::Base64BString, - util::{intern, redact_value}, -}; - -const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment -const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches -const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs -const TREE_SITTER_MAX_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs <= 64 KiB -const TREE_SITTER_MIN_LIMIT: usize = 1 * 1024; // only run tree-sitter on blobs >= 1 KiB - -// ------------------------------------------------------------------------------------------------- -// RawMatch -// ------------------------------------------------------------------------------------------------- -/// A raw match, as recorded by a callback to Vectorscan. -/// -/// When matching with Vectorscan, we simply collect all matches into a -/// preallocated `Vec`, and then go through them all after scanning is complete. 
-#[derive(PartialEq, Eq, Debug, Clone)] -struct RawMatch { - rule_id: u32, - start_idx: u64, - end_idx: u64, -} -#[derive(Clone)] -pub struct OwnedBlobMatch { - pub rule: Arc, - pub blob_id: BlobId, - /// The unique content-based identifier of this match - pub finding_fingerprint: u64, - pub matching_input_offset_span: OffsetSpan, - pub captures: SerializableCaptures, - pub validation_response_body: String, - pub validation_response_status: StatusCode, - pub validation_success: bool, - pub calculated_entropy: f32, - pub is_base64: bool, -} -impl<'a> Matcher<'a> { - pub fn get_profiling_report(&self) -> Option> { - self.profiler.as_ref().map(|p| p.generate_report()) - } -} -impl OwnedBlobMatch { - pub fn convert_match_to_owned_blobmatch(m: &Match, rule: Arc) -> OwnedBlobMatch { - OwnedBlobMatch { - rule, - blob_id: m.blob_id, - finding_fingerprint: m.finding_fingerprint, - // matching_input: m.snippet.matching.0.to_vec(), - matching_input_offset_span: m.location.offset_span.clone(), - captures: m.groups.clone(), - validation_response_body: m.validation_response_body.clone(), - validation_response_status: StatusCode::from_u16(m.validation_response_status) - .unwrap_or(StatusCode::CONTINUE), - validation_success: m.validation_success, - calculated_entropy: m.calculated_entropy, - is_base64: m.is_base64, - } - } - - pub fn from_blob_match(blob_match: BlobMatch) -> Self { - // Get the matching value from capture group 1 (or 0 if not available) - let matching_finding = blob_match - .captures - .captures - .get(1) - .or_else(|| blob_match.captures.captures.get(0)) - .map(|capture| capture.value.as_bytes().to_vec()) - .unwrap_or_else(Vec::new); - - let mut owned_blob_match = OwnedBlobMatch { - rule: blob_match.rule, - blob_id: blob_match.blob_id.clone(), - matching_input_offset_span: blob_match.matching_input_offset_span, - captures: blob_match.captures.clone(), - validation_response_body: blob_match.validation_response_body, - validation_response_status: 
blob_match.validation_response_status, - validation_success: blob_match.validation_success, - calculated_entropy: blob_match.calculated_entropy, - finding_fingerprint: 0, //default - is_base64: blob_match.is_base64, - }; - - // Convert matching_finding to a &str (using lossy conversion if needed) - let finding_value = std::str::from_utf8(&matching_finding).unwrap_or(""); - // Use blob_id as the file/commit identifier - let file_or_commit = &blob_match.blob_id.to_string(); - - let offset_start: u64 = - owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); - let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); - - owned_blob_match.finding_fingerprint = - compute_finding_fingerprint(finding_value, file_or_commit, offset_start, offset_end); - - owned_blob_match - } -} -// ------------------------------------------------------------------------------------------------- -// BlobMatch -// ------------------------------------------------------------------------------------------------- -/// A `BlobMatch` is the result type from `Matcher::scan_blob`. -/// -/// It is mostly made up of references and small data. -/// For a representation that is more friendly for human consumption, see -/// `Match`. 
-pub struct BlobMatch<'a> { - /// The rule that was matched - pub rule: Arc, // Changed from `&'a Rule` to `Arc, - - pub validation_response_body: String, - pub validation_response_status: StatusCode, - - pub validation_success: bool, - pub calculated_entropy: f32, - pub is_base64: bool, -} -#[derive(Clone)] -struct UserData { - /// A scratch vector for raw matches from Vectorscan, to minimize allocation - raw_matches_scratch: Vec, - - /// The length of the input being scanned - input_len: u64, -} -// ------------------------------------------------------------------------------------------------- -// Matcher -// ------------------------------------------------------------------------------------------------- -/// A `Matcher` is able to scan inputs for matches from rules in a -/// `RulesDatabase`. -/// -/// If doing multi-threaded scanning, use a separate `Matcher` for each thread. -#[derive(Clone)] -pub struct Matcher<'a> { - /// Thread-local pool that hands out a &mut BlockScanner - scanner_pool: std::sync::Arc, - - /// The rules database used for matching - rules_db: &'a RulesDatabase, - - /// Local statistics for this `Matcher` - local_stats: MatcherStats, - - /// Global statistics, updated with the local statsistics when this - /// `Matcher` is dropped - global_stats: Option<&'a Mutex>, - - /// The set of blobs that have been seen - seen_blobs: &'a BlobIdMap, - - /// Data passed to the Vectorscan callback - user_data: UserData, - - /// Rule profiler for measuring performance of individual rules - profiler: Option>, - - /// Configuration that controls inline ignore directives - inline_ignore_config: InlineIgnoreConfig, -} -/// This `Drop` implementation updates the `global_stats` with the local stats -impl<'a> Drop for Matcher<'a> { - fn drop(&mut self) { - if let Some(global_stats) = self.global_stats { - let mut global_stats = global_stats.lock().unwrap(); - global_stats.update(&self.local_stats); - } - } -} -pub enum ScanResult<'a> { - SeenWithMatches, - 
SeenSansMatches, - New(Vec>), -} -impl<'a> Matcher<'a> { - /// Create a new `Matcher` from the given `RulesDatabase`. - /// - /// If `global_stats` is provided, it will be updated with the local stats - /// from this `Matcher` when it is dropped. - pub fn new( - rules_db: &'a RulesDatabase, - scanner_pool: Arc, - seen_blobs: &'a BlobIdMap, - global_stats: Option<&'a Mutex>, - enable_profiling: bool, - shared_profiler: Option>, - include_external_ignore_syntax: bool, - ) -> Result { - // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec - let raw_matches_scratch = Vec::new(); - let user_data = UserData { raw_matches_scratch, input_len: 0 }; - // let vs_scanner = vectorscan_rs::BlockScanner::new(&rules_db.vsdb)?; - // pool is created once per scan run (see Scanner section below) - let profiler = shared_profiler.or_else(|| { - if enable_profiling { - Some(Arc::new(ConcurrentRuleProfiler::new())) - } else { - None - } - }); - Ok(Matcher { - scanner_pool, - rules_db, - local_stats: MatcherStats::default(), - global_stats, - seen_blobs, - user_data, - profiler, - inline_ignore_config: InlineIgnoreConfig::new(include_external_ignore_syntax), - }) - } - - fn scan_bytes_raw(&mut self, input: &[u8], _filename: &str) -> Result<()> { - // Remember previous peak automatically - let prev_capacity = self.user_data.raw_matches_scratch.capacity(); - self.user_data.raw_matches_scratch.clear(); - self.user_data.raw_matches_scratch.reserve(prev_capacity.max(64)); - - self.user_data.input_len = input.len() as u64; - - let mut offset: usize = 0; - while offset < input.len() { - let end = (offset + MAX_CHUNK_SIZE).min(input.len()); - let slice = &input[offset..end]; - let base = offset as u64; - self.scanner_pool.with(|scanner| { - scanner.scan(slice, |rule_id, from, to, _flags| { - self.user_data.raw_matches_scratch.push(RawMatch { - rule_id, - start_idx: from + base, - end_idx: to + base, - }); - vectorscan_rs::Scan::Continue - }) - })?; - - if end == 
input.len() { - break; - } - offset = end.saturating_sub(CHUNK_OVERLAP); - } - - Ok(()) - } - - pub fn scan_blob<'b>( - &mut self, - blob: &'b Blob, - origin: &OriginSet, - lang: Option, - redact: bool, - no_dedup: bool, - no_base64: bool, - ) -> Result> - where - 'a: 'b, - { - // Update local stats - self.local_stats.blobs_seen += 1; - self.local_stats.bytes_seen += blob.bytes().len() as u64; - self.local_stats.blobs_scanned += 1; - self.local_stats.bytes_scanned += blob.bytes().len() as u64; - - // Extract filename from origin - let filename = origin - .first() - .blob_path() - .and_then(|path| path.file_name()) - .and_then(|name| name.to_str()) - .unwrap_or("unknown_file") - .to_string(); - // Perform the scan - self.scan_bytes_raw(&blob.bytes(), &filename)?; - - // Opportunistically look for standalone Base64 blobs. If neither - // the raw scan nor this check yields anything, we can return early - // before doing any heavier work. - let mut b64_items = if no_base64 || blob.len() > BASE64_SCAN_LIMIT { - Vec::new() - } else { - get_base64_strings(blob.bytes()) - }; - - let lang_hint = lang.as_deref(); - let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty(); - let has_base64_items = !b64_items.is_empty(); - - if !has_raw_matches && !has_base64_items { - return Ok(ScanResult::New(Vec::new())); - } - - let rules_db = self.rules_db; - let mut seen_matches = FxHashSet::default(); - let mut previous_matches: FxHashMap> = FxHashMap::default(); - - let blob_len = blob.len(); - - let should_run_tree_sitter = blob_len > 0 - && blob_len <= TREE_SITTER_MAX_LIMIT - && blob_len >= TREE_SITTER_MIN_LIMIT - && has_raw_matches - && lang_hint.is_some() - && !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled - - let tree_sitter_result = if should_run_tree_sitter { - lang_hint.and_then(|lang_str| { - get_language_and_queries(lang_str).and_then(|(language, queries)| { - let checker = Checker { language, rules: queries }; - match 
checker.check(&blob.bytes()) { - Ok(results) => Some(results), - Err(e) => { - println!("Error in checker.check: {}", e); - None - } - } - }) - }) - } else { - None - }; - // Process matches - let mut matches = Vec::new(); - let owned_ts_results = tree_sitter_result.map(|ts_results| { - ts_results - .into_iter() - .filter(|match_result| match_result.is_base64_decoded) - .map(|match_result| { - ( - match_result.range, - match_result.text, - match_result.is_base64_decoded, - match_result.original_base64, - ) - }) - .collect::>() - }); - let mut previous_raw_matches: FxHashMap> = FxHashMap::default(); - for &RawMatch { rule_id, start_idx, end_idx } in - self.user_data.raw_matches_scratch.iter().rev() - { - let rule_id_usize: usize = rule_id as usize; - let rule = Arc::clone(&rules_db.rules[rule_id_usize]); - let re = &rules_db.anchored_regexes[rule_id_usize]; - let start_idx_usize = start_idx as usize; - let end_idx_usize = end_idx as usize; - let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize); - if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) { - continue; - } - filter_match( - blob, - rule, - re, - start_idx_usize, - end_idx_usize, - &mut matches, - &mut previous_matches, - rule_id_usize, - &mut seen_matches, - origin, - None, - false, - redact, - &filename, - self.profiler.as_ref(), - &self.inline_ignore_config, - ); - } - // If tree-sitter produced base64-decoded matches, try them against all rules - if let Some(ref ts_results) = owned_ts_results { - for (ts_range, ts_match, is_base64_decoded, _original_base64) in ts_results.iter() { - if *is_base64_decoded { - for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { - let re = &rules_db.anchored_regexes[rule_id_usize]; - filter_match( - blob, - rule.clone(), - re, - ts_range.start, - ts_range.end, - &mut matches, - &mut previous_matches, - rule_id_usize, - &mut seen_matches, - origin, - Some(ts_match.as_bytes()), - *is_base64_decoded, - redact, - &filename, 
- self.profiler.as_ref(), - &self.inline_ignore_config, - ); - } - } - } - } - - if !no_base64 { - // If the blob contains standalone Base64 blobs, decode and scan them as well - const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep - let mut b64_stack: Vec<(DecodedData, usize)> = - b64_items.drain(..).map(|d| (d, 0)).collect(); - while let Some((item, depth)) = b64_stack.pop() { - for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { - let re = &rules_db.anchored_regexes[rule_id_usize]; - filter_match( - blob, - rule.clone(), - re, - item.pos_start, - item.pos_end, - &mut matches, - &mut previous_matches, - rule_id_usize, - &mut seen_matches, - origin, - Some(item.decoded.as_bytes()), - true, - redact, - &filename, - self.profiler.as_ref(), - &self.inline_ignore_config, - ); - } - if depth + 1 < MAX_B64_DEPTH { - for nested in get_base64_strings(item.decoded.as_bytes()) { - b64_stack.push(( - DecodedData { - original: nested.original, - decoded: nested.decoded, - pos_start: item.pos_start, - pos_end: item.pos_end, - }, - depth + 1, - )); - } - } - } - } - // Finalize - if !no_dedup && !matches.is_empty() { - let blob_id = blob.id(); - if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) { - return Ok(if had_matches { - ScanResult::SeenWithMatches - } else { - ScanResult::SeenSansMatches - }); - } - } - - // --- opportunistic capacity cap --------------------------------- - if self.user_data.raw_matches_scratch.capacity() - > self.user_data.raw_matches_scratch.len() * 4 - { - // Vec::shrink_to_fit may re-allocate, but we’re about to leave scan_blob - // so the cost is hidden off the hot path. 
- self.user_data.raw_matches_scratch.shrink_to_fit(); - } - - Ok(ScanResult::New(matches)) - // Ok(result) - } -} - -#[inline] -fn compute_match_key(content: &[u8], rule_id: &[u8], start: usize, end: usize) -> u64 { - let mut hasher = FxHasher::default(); - // Hash each component directly without allocation - content.hash(&mut hasher); - rule_id.hash(&mut hasher); - start.hash(&mut hasher); - end.hash(&mut hasher); - hasher.finish() -} - -#[inline] -fn insert_span(spans: &mut Vec, span: OffsetSpan) -> bool { - let mut idx = spans.binary_search_by(|s| s.start.cmp(&span.start)).unwrap_or_else(|i| i); - if idx > 0 { - if spans[idx - 1].fully_contains(&span) { - return false; - } - if span.fully_contains(&spans[idx - 1]) { - spans.remove(idx - 1); - idx -= 1; - } - } - if idx < spans.len() { - if spans[idx].fully_contains(&span) { - return false; - } - if span.fully_contains(&spans[idx]) { - spans.remove(idx); - } - } - spans.insert(idx, span); - true -} - -#[inline] -fn record_match( - map: &mut FxHashMap>, - rule_id: usize, - span: OffsetSpan, -) -> bool { - insert_span(map.entry(rule_id).or_default(), span) -} -fn filter_match<'b>( - blob: &'b Blob, - // rule: &'b Rule, - rule: Arc, - re: &Regex, - start: usize, - end: usize, - matches: &mut Vec>, - previous_matches: &mut FxHashMap>, - rule_id: usize, - seen_matches: &mut FxHashSet, - _origin: &OriginSet, - ts_match: Option<&[u8]>, - is_base64: bool, - redact: bool, - filename: &str, - profiler: Option<&Arc>, - inline_ignore_config: &InlineIgnoreConfig, -) { - let mut timer = - profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename)); - - let initial_len = matches.len(); - - let blob_bytes = blob.bytes(); - let default_slice = &blob_bytes[start..end]; - let haystack = ts_match.unwrap_or(default_slice); - - for captures in re.captures_iter(haystack) { - let full_capture = captures.get(0).unwrap(); - let matching_input = captures.get(1).unwrap_or(full_capture); - let min_entropy = 
rule.min_entropy(); - let mi_bytes = matching_input.as_bytes(); - let full_bytes = full_capture.as_bytes(); - let calculated_entropy = calculate_shannon_entropy(mi_bytes); - if calculated_entropy <= min_entropy - || is_safe_match(mi_bytes) - || is_user_match(mi_bytes, full_bytes) - { - debug!( - "Skipping match with entropy {} <= {} or safe match", - calculated_entropy, min_entropy - ); - continue; - } - let matching_input_offset_span = OffsetSpan::from_range( - (start + matching_input.start())..(start + matching_input.end()), - ); - if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) { - debug!("Skipping match due to inline ignore directive"); - continue; - } - let match_key = compute_match_key( - matching_input.as_bytes(), - rule.id().as_bytes(), - matching_input_offset_span.start, - matching_input_offset_span.end, - ); - if !seen_matches.insert(match_key) { - continue; - } - if !record_match(previous_matches, rule_id, matching_input_offset_span) { - continue; - } - let only_matching_input = - &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end]; - let groups = SerializableCaptures::from_captures(&captures, haystack, re, redact); - matches.push(BlobMatch { - rule: Arc::clone(&rule), - blob_id: blob.id_ref(), - matching_input: only_matching_input, - matching_input_offset_span, - captures: groups, - validation_response_body: String::new(), - validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE), - validation_success: false, - calculated_entropy, - is_base64, - }); - } - if let Some(t) = timer.take() { - let new_count = (matches.len() - initial_len) as u64; - t.end(new_count > 0, new_count, 0); - } -} -fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap)> { - match lang.to_lowercase().as_str() { - "bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())), - "c" => Some((Language::C, parser::queries::c::get_c_queries())), - "c#" | 
"csharp" => Some((Language::CSharp, parser::queries::csharp::get_csharp_queries())), - "c++" | "cpp" => Some((Language::Cpp, parser::queries::cpp::get_cpp_queries())), - "css" => Some((Language::Css, parser::queries::css::get_css_queries())), - "go" => Some((Language::Go, parser::queries::go::get_go_queries())), - "html" => Some((Language::Html, parser::queries::html::get_html_queries())), - "java" => Some((Language::Java, parser::queries::java::get_java_queries())), - "javascript" | "js" => { - Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries())) - } - // "kotlin" => Some(( - // Language::Kotlin, - // parser::queries::kotlin::get_kotlin_queries(), - // )), - "php" => Some((Language::Php, parser::queries::php::get_php_queries())), - "python" | "py" | "starlark" => { - Some((Language::Python, parser::queries::python::get_python_queries())) - } - "ruby" => Some((Language::Ruby, parser::queries::ruby::get_ruby_queries())), - "rust" => Some((Language::Rust, parser::queries::rust::get_rust_queries())), - "toml" => Some((Language::Toml, parser::queries::toml::get_toml_queries())), - "typescript" | "ts" => { - Some((Language::TypeScript, parser::queries::typescript::get_typescript_queries())) - } - "yaml" => Some((Language::Yaml, parser::queries::yaml::get_yaml_queries())), - _ => None, - } -} -// ------------------------------------------------------------------------------------------------- -// MatchStats -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Default, Clone)] -pub struct MatcherStats { - pub blobs_seen: u64, - pub blobs_scanned: u64, - pub bytes_seen: u64, - pub bytes_scanned: u64, - // #[cfg(feature = "rule_profiling")] - // pub rule_stats: crate::rule_profiling::RuleProfile, -} -impl MatcherStats { - pub fn update(&mut self, other: &Self) { - self.blobs_seen += other.blobs_seen; - self.blobs_scanned += other.blobs_scanned; - self.bytes_seen += 
other.bytes_seen; - self.bytes_scanned += other.bytes_scanned; - - // #[cfg(feature = "rule_profiling")] - // self.rule_stats.update(&other.rule_stats); - } -} -// ------------------------------------------------------------------------------------------------- -// Group -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash)] -pub struct Group(pub Base64BString); -impl Group { - pub fn new(m: regex::bytes::Match<'_>) -> Self { - Self(Base64BString(BString::from(m.as_bytes()))) - } -} -// ------------------------------------------------------------------------------------------------- -// Groups -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub struct Groups(pub SmallVec<[Group; 1]>); -impl JsonSchema for Groups { - fn schema_name() -> String { - "Groups".to_string() - } - - fn json_schema(gen: &mut SchemaGenerator) -> Schema { - let group_schema = gen.subschema_for::(); - Schema::Object(schemars::schema::SchemaObject { - instance_type: Some(InstanceType::Array.into()), - array: Some(Box::new(ArrayValidation { - items: Some(group_schema.into()), - ..Default::default() - })), - ..Default::default() - }) - } -} -// #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] -// pub struct SerializableCapture { -// pub name: Option, // Named group (if available) -// pub match_number: i32, -// pub start: usize, // Start position of the match -// pub end: usize, // End position of the match -// pub value: String, // The actual captured value -// } -#[derive(Debug, Clone, Serialize, JsonSchema)] -pub struct SerializableCapture { - pub name: Option, - pub match_number: i32, - pub start: usize, - pub end: usize, - /// Interned value of the capture. 
- pub value: &'static str, -} - -#[derive(Debug, Clone, Serialize, JsonSchema)] -pub struct SerializableCaptures { - #[schemars(with = "Vec")] - pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed) -} -impl SerializableCaptures { - pub fn from_captures( - captures: &regex::bytes::Captures, - _input: &[u8], - re: &Regex, - redact: bool, - ) -> Self { - let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new(); - // Process named captures - for name in re.capture_names().flatten() { - if let Some(capture) = captures.name(name) { - let value = if redact { - redact_value(&String::from_utf8_lossy(capture.as_bytes())) - } else { - String::from_utf8_lossy(capture.as_bytes()).to_string() - }; - serialized_captures.push(SerializableCapture { - name: Some(name.to_string()), - match_number: -1, - start: capture.start(), - end: capture.end(), - value: intern(&value), - }); - } - } - // Process unnamed captures (numbered groups) - for i in 0..captures.len() { - if let Some(capture) = captures.get(i) { - let value = if redact { - redact_value(&String::from_utf8_lossy(capture.as_bytes())) - } else { - String::from_utf8_lossy(capture.as_bytes()).to_string() - }; - serialized_captures.push(SerializableCapture { - name: None, - match_number: i32::try_from(i).unwrap_or(0), - start: capture.start(), - end: capture.end(), - value: intern(&value), - }); - } - } - SerializableCaptures { captures: serialized_captures } - } -} -// ------------------------------------------------------------------------------------------------- -// Match -// ------------------------------------------------------------------------------------------------- -#[derive(Debug, Clone, Serialize, JsonSchema)] -pub struct Match { - /// The location of the entire matching content - pub location: Location, - - /// The capture groups - pub groups: SerializableCaptures, // Store serialized captures - - /// unique identifier of file / blob where this match was found -
pub blob_id: BlobId, - - /// The unique content-based identifier of this match - pub finding_fingerprint: u64, - - /// The rule that produced this match - #[serde(skip_serializing)] - #[schemars(skip)] - pub rule: Arc, - - /// Validation Body - pub validation_response_body: String, - - /// Validation Status Code - pub validation_response_status: u16, - - /// Validation Success - pub validation_success: bool, - - /// Validation Success - pub calculated_entropy: f32, - - pub visible: bool, - #[serde(default)] - pub is_base64: bool, -} -impl Match { - #[inline] - pub fn convert_owned_blobmatch_to_match<'a>( - loc_mapping: Option<&'a LocationMapping<'a>>, - owned_blob_match: &'a OwnedBlobMatch, - origin_type: &'a str, - ) -> Self { - let offset_span = owned_blob_match.matching_input_offset_span; - // Extract the matched secret content. Use capture group 1 if it exists, otherwise fall back - // to group 0. - let matching_finding_bytes = owned_blob_match - .captures - .captures - .get(1) - .or_else(|| owned_blob_match.captures.captures.get(0)) - .map(|capture| capture.value.as_bytes()) - .unwrap_or_default(); - - // The fingerprint will be based on the content of the secret. 
- let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or(""); - - let source_span = - loc_mapping.map(|lm| lm.get_source_span(&offset_span)).unwrap_or(SourceSpan { - start: SourcePoint { line: 0, column: 0 }, - end: SourcePoint { line: 0, column: 0 }, - }); - let offset_start: u64 = - owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); - let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); - - let finding_fingerprint = compute_finding_fingerprint( - finding_value_for_fp, - origin_type, // file_or_commit, - offset_start, - offset_end, - ); - - // matching_snippet - Match { - rule: owned_blob_match.rule.clone(), - visible: owned_blob_match.rule.visible().to_owned(), - location: Location { offset_span, source_span: source_span.clone() }, - groups: owned_blob_match.captures.clone(), - blob_id: owned_blob_match.blob_id, - finding_fingerprint, - validation_response_body: owned_blob_match.validation_response_body.clone(), - validation_response_status: owned_blob_match.validation_response_status.as_u16(), - validation_success: owned_blob_match.validation_success, - calculated_entropy: owned_blob_match.calculated_entropy, - is_base64: owned_blob_match.is_base64, - } - } - - /// Returns the `blob_id` of the match. 
- pub fn get_blob_id(&self) -> BlobId { - self.blob_id.clone() - } - - pub fn finding_id(&self) -> String { - let mut buffer = Vec::with_capacity(128); - buffer.extend_from_slice(self.rule.finding_sha1_fingerprint().as_bytes()); - buffer.push(0); - serde_json::to_writer(&mut buffer, &self.groups) - .expect("should be able to serialize groups as JSON"); - let mut num = xxh3_64(&buffer); - // Ensure the number is positive and within i64 range - num &= 0x7FFF_FFFF_FFFF_FFFF; // Clear the sign bit to make it positive - // Convert to string - num.to_string() - } -} -#[derive(Debug, Clone)] -pub struct DecodedData { - pub original: String, - pub decoded: String, - pub pos_start: usize, - pub pos_end: usize, -} -#[inline] -fn is_base64_byte(b: u8) -> bool { - matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/') -} - -pub fn get_base64_strings(input: &[u8]) -> Vec { - let mut results = Vec::new(); - let mut i = 0; - while i < input.len() { - while i < input.len() && !is_base64_byte(input[i]) { - i += 1; - } - let start = i; - while i < input.len() && is_base64_byte(input[i]) { - i += 1; - } - - let mut eq_count = 0; - while i < input.len() && input[i] == b'=' && eq_count < 2 { - i += 1; - eq_count += 1; - } - let end = i; - - let len = end - start; - if len >= 32 && len % 4 == 0 { - let base64_slice = &input[start..end]; - if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) { - if let Ok(decoded_str) = std::str::from_utf8(&decoded) { - if decoded_str.is_ascii() { - results.push(DecodedData { - original: String::from_utf8_lossy(base64_slice).into_owned(), - decoded: decoded_str.to_string(), - pos_start: start, - pos_end: end, - }); - } - } - } - } - } - - results -} - -pub fn compute_finding_fingerprint( - finding_value: &str, - file_or_commit: &str, - offset_start: u64, - offset_end: u64, -) -> u64 { - // Combine all into a byte buffer and hash it directly: - let mut buf = Vec::with_capacity( - finding_value.len() + file_or_commit.len() + 2 * 
std::mem::size_of::(), - ); - buf.extend_from_slice(finding_value.as_bytes()); - buf.extend_from_slice(file_or_commit.as_bytes()); - buf.extend_from_slice(&offset_start.to_le_bytes()); - buf.extend_from_slice(&offset_end.to_le_bytes()); - - xxh3_64(&buf) -} - -// ------------------------------------------------------------------------------------------------- -// test -// ------------------------------------------------------------------------------------------------- -#[cfg(test)] -mod test { - use std::{collections::BTreeMap, path::PathBuf}; - - use pretty_assertions::assert_eq; - // --------------------------------------------------------------------- - // proptest: raw-match dedup + entropy gate - // --------------------------------------------------------------------- - use proptest::prelude::*; - - use super::*; - use crate::{ - blob::{Blob, BlobIdMap}, - origin::{Origin, OriginSet}, - rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}, - }; - - proptest! 
{ - #[test] - fn prop_no_dupes_and_entropy( - // random ASCII up to 300 bytes - mut noise in proptest::collection::vec(any::().prop_filter("ascii", |b| b.is_ascii()), 0..300), - // 0-4 random insertion points - inserts in proptest::collection::vec(0usize..300, 0..5) - ) { - // Constant high-entropy secret token that matches the rule below - const TOKEN: &[u8] = b"secret_abcd1234"; - - // Splice the token at the requested offsets - for &idx in &inserts { - let pos = idx.min(noise.len()); - noise.splice(pos..pos, TOKEN.iter().copied()); - } - - // ── build a single test rule ────────────────────────────────── - use crate::rules::rule::{RuleSyntax, Validation, Confidence}; - - let rule = Rule::new(RuleSyntax { - id: "prop.secret".into(), - name: "prop secret".into(), - pattern: "secret_[a-z]{4}[0-9]{4}".into(), - confidence: Confidence::Low, - min_entropy: 3.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, // no HTTP validation needed - depends_on_rule: vec![], - }); - - let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap(); - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false).unwrap(); - - // ── run the scan ────────────────────────────────────────────── - m.scan_bytes_raw(&noise, "buf").unwrap(); - - // ── property 1: dedup – each (rule,start,end) is unique ────── - - let mut coords = FxHashSet::default(); - for RawMatch{rule_id, start_idx, end_idx} in &m.user_data.raw_matches_scratch { - assert!( - coords.insert((*rule_id, *start_idx, *end_idx)), - "duplicate raw-match detected for coords ({rule_id},{start_idx},{end_idx})" - ); - - // ── property 2: entropy gate held ──────────────────────── - let slice = &noise[*start_idx as usize .. 
*end_idx as usize]; - let ent = calculate_shannon_entropy(slice); - assert!(ent > 3.0, "entropy {ent} ≤ min_entropy, gate failed"); - } - } - } - - #[test] - pub fn test_simple() -> Result<()> { - let rules = vec![Rule::new(RuleSyntax { - id: "test.1".to_string(), - name: "test".to_string(), - pattern: "test".to_string(), - confidence: crate::rules::rule::Confidence::Medium, - min_entropy: 1.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: Some(Validation::Http(HttpValidation { - request: HttpRequest { - method: "GET".to_string(), - url: "https://example.com".to_string(), - headers: BTreeMap::new(), - body: None, - response_matcher: Some(vec![]), - multipart: None, - response_is_html: false, - }, - multipart: None, - })), - depends_on_rule: vec![ - Some(DependsOnRule { - rule_id: "d8f3c34b-015f-4cd6-b411-b1366493104c".to_string(), - variable: "email".to_string(), - }), - Some(DependsOnRule { - rule_id: "8910f364-7718-4a27-a435-d2da13e6ba9e".to_string(), - variable: "domain".to_string(), - }), - ], - })]; - let rules_db = RulesDatabase::from_rules(rules)?; - let input = "some test data for vectorscan"; - let seen_blobs: BlobIdMap = BlobIdMap::new(); - let enable_rule_profiling = true; - // let mut matcher = Matcher::new(&rules_db, &seen_blobs, None, - // enable_rule_profiling)?; - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new( - &rules_db, - scanner_pool, - &seen_blobs, - None, - enable_rule_profiling, - None, // Pass the shared profiler - false, - )?; - matcher.scan_bytes_raw(input.as_bytes(), "fname")?; - assert_eq!( - matcher.user_data.raw_matches_scratch, - vec![RawMatch { rule_id: 0, start_idx: 0, end_idx: 9 },] - ); - Ok(()) - } - - // --------------------------------------------------------------------- - // additional deterministic unit-tests - // --------------------------------------------------------------------- - - /// 
`get_base64_strings` should recognise a well-formed token, decode it, - /// and report correct byte-offsets. - #[test] - fn test_get_base64_strings_basic() { - let raw = b"foo MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY= bar"; - // decodes to "0123456789abcdef0123456789abcdef" - let hits = get_base64_strings(raw); - assert_eq!(hits.len(), 1); - let item = &hits[0]; - assert_eq!(item.decoded, "0123456789abcdef0123456789abcdef"); - assert_eq!(item.original, "MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY="); - // "foo␠" is 4 bytes, so the start offset is 4 - assert_eq!((item.pos_start, item.pos_end), (4, 4 + item.original.len())); - } - - /// `compute_finding_fingerprint` must be stable (same input ⇒ same output) - /// and sensitive to any input component. - #[test] - fn test_finding_fingerprint_stability_and_uniqueness() { - let a = compute_finding_fingerprint("secret", "fileA", 0, 6); - let b = compute_finding_fingerprint("secret", "fileA", 0, 6); - assert_eq!(a, b, "fingerprint should be deterministic"); - - // changing any parameter should perturb the hash - let c = compute_finding_fingerprint("secret", "fileA", 1, 7); // offsets differ - let d = compute_finding_fingerprint("secret", "fileB", 0, 6); // file id differs - let e = compute_finding_fingerprint("different", "fileA", 0, 6); // content differs - assert_ne!(a, c); - assert_ne!(a, d); - assert_ne!(a, e); - } - - /// The (private) `compute_match_key` helper is the linchpin of the raw-dedup - /// path. It should return identical keys for identical inputs and different - /// keys as soon as *anything* changes. 
- #[test] - fn test_compute_match_key_uniqueness() { - use super::compute_match_key; - - let k1 = compute_match_key(b"abc", b"rule-1", 0, 3); - let k2 = compute_match_key(b"abc", b"rule-1", 0, 3); - assert_eq!(k1, k2); - - // mutate each component in turn - let diff_content = compute_match_key(b"abcd", b"rule-1", 0, 4); - let diff_rule = compute_match_key(b"abc", b"rule-2", 0, 3); - let diff_span = compute_match_key(b"abc", b"rule-1", 1, 4); - assert_ne!(k1, diff_content); - assert_ne!(k1, diff_rule); - assert_ne!(k1, diff_span); - } - - /// Running `scan_bytes_raw` twice over the *same* input should never record - /// duplicate entries in `raw_matches_scratch`. - #[test] - fn test_scan_bytes_raw_no_duplicate_raw_matches() -> Result<()> { - // simple rule: literal "dup" - let rule = Rule::new(RuleSyntax { - id: "dup.check".into(), - name: "dup".into(), - pattern: "dup".into(), - confidence: crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - - let rules_db = RulesDatabase::from_rules(vec![rule])?; - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - - let buf = b"dup dup"; // two literal hits, same rule - - // first scan - m.scan_bytes_raw(buf, "buf1")?; - let first_len = m.user_data.raw_matches_scratch.len(); - - // second scan over the same buffer - m.scan_bytes_raw(buf, "buf1")?; - let second_len = m.user_data.raw_matches_scratch.len(); - - // we should still only have two unique raw matches recorded - assert_eq!(first_len, 2); - assert_eq!(second_len, 2); - Ok(()) - } - - #[test] - fn inline_comment_skips_match() -> Result<()> { - let rule = Rule::new(RuleSyntax { - id: "inline.ignore".into(), - name: "inline".into(), - pattern: "secret_token".into(), - confidence: 
crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - let rules_db = RulesDatabase::from_rules(vec![rule])?; - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - - let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec()); - let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt"))); - - match matcher.scan_blob(&blob, &origin, None, false, false, false)? { - ScanResult::New(matches) => assert!(matches.is_empty()), - _ => panic!("unexpected scan result"), - } - - Ok(()) - } - - #[test] - fn inline_comment_after_multiline_secret_skips_match() -> Result<()> { - let rule = Rule::new(RuleSyntax { - id: "inline.multiline".into(), - name: "inline multiline".into(), - pattern: "line1\\s+line2".into(), - confidence: crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - let rules_db = RulesDatabase::from_rules(vec![rule])?; - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - - let blob = Blob::from_bytes( - br#"let data = """ -line1 -line2 -""" -# kingfisher:ignore -"# - .to_vec(), - ); - let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt"))); - - match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ - ScanResult::New(matches) => assert!(matches.is_empty()), - _ => panic!("unexpected scan result"), - } - - Ok(()) - } - - #[test] - fn compat_flag_controls_external_directives() -> Result<()> { - let rule = Rule::new(RuleSyntax { - id: "inline.compat".into(), - name: "inline compat".into(), - pattern: "supersecret123".into(), - confidence: crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - depends_on_rule: vec![], - }); - let rules_db = RulesDatabase::from_rules(vec![rule])?; - - let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec()); - let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt"))); - - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?; - let matches_without_compat = - match matcher.scan_blob(&blob, &origin, None, false, false, false)? { - ScanResult::New(matches) => matches.len(), - _ => panic!("unexpected scan result"), - }; - assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag"); - - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true)?; - match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ - ScanResult::New(matches) => assert!(matches.is_empty()), - _ => panic!("unexpected scan result"), - } - - Ok(()) - } -} diff --git a/src/matcher.rs.rej b/src/matcher.rs.rej deleted file mode 100644 index dc61305..0000000 --- a/src/matcher.rs.rej +++ /dev/null @@ -1,52 +0,0 @@ -@@ -1,50 +1,51 @@ - use std::{ - hash::{Hash, Hasher}, - str, - sync::{Arc, Mutex}, - }; - - use anyhow::Result; - use base64::{engine::general_purpose, Engine}; - use bstr::BString; - use http::StatusCode; - use regex::bytes::Regex; - use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; - use schemars::{ - gen::SchemaGenerator, - schema::{ArrayValidation, InstanceType, Schema}, - JsonSchema, - }; - use serde::{Deserialize, Serialize}; - use smallvec::SmallVec; - use tracing::debug; - use xxhash_rust::xxh3::xxh3_64; - - use crate::{ - blob::{Blob, BlobId, BlobIdMap}, - entropy::calculate_shannon_entropy, -+ inline_ignore::InlineIgnoreConfig, - location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, - origin::OriginSet, - parser, - parser::{Checker, Language}, - rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, - rules::rule::Rule, - rules_database::RulesDatabase, - safe_list::{is_safe_match, is_user_match}, - scanner_pool::ScannerPool, - snippet::Base64BString, - util::{intern, redact_value}, - }; - - const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment - const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches - const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs - const TREE_SITTER_SCAN_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs ≤64 KiB - - // ------------------------------------------------------------------------------------------------- - // RawMatch - // ------------------------------------------------------------------------------------------------- - /// A raw match, as recorded by a callback to Vectorscan. 
- /// - /// When matching with Vectorscan, we simply collect all matches into a - /// preallocated `Vec`, and then go through them all after scanning is complete. From a003b732fac696d6486b067c2bc3753d9077b217 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 9 Oct 2025 20:11:31 -0700 Subject: [PATCH 3/8] - Added kingfisher:ignore (or kingfisher:allow) to silence a finding inline within a file - Added: to reuse existing inline directives from other scanners, pass --compat-ignore-comments to also accept NOSONAR, kics-scan ignore, gitleaks:allow and trufflehog:ignore --- CHANGELOG.md | 5 + Cargo.toml | 2 +- README.md | 20 ++- src/cli/commands/scan.rs | 8 + src/inline_ignore.rs | 285 ++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/main.rs | 2 + src/matcher.rs | 158 ++++++++++++++++++- src/reporter.rs | 2 + src/reporter/json_format.rs | 2 + src/scanner/enumerate.rs | 2 + src/scanner/repos.rs | 2 + tests/int_allowlist.rs | 2 + tests/int_bitbucket.rs | 2 + tests/int_dedup.rs | 2 + tests/int_github.rs | 2 + tests/int_gitlab.rs | 4 + tests/int_redact.rs | 2 + tests/int_slack.rs | 4 + tests/int_validation_cache.rs | 2 + tests/int_vulnerable_files.rs | 4 + 21 files changed, 507 insertions(+), 6 deletions(-) create mode 100644 src/inline_ignore.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 1278533..416af87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to this project will be documented in this file. 
+## [v1.57.0] +- Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling +- Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported +- Added: `--compat-ignore-comments` to reuse existing inline directives from other scanners: gitleaks:allow and trufflehog:ignore + ## [v1.56.0] - Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined diff --git a/Cargo.toml b/Cargo.toml index 37a2e32..852194d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.56.0" +version = "1.57.0" description = "MongoDB's blazingly fast and accurate secret scanning and validation tool" edition.workspace = true rust-version.workspace = true diff --git a/README.md b/README.md index 14faccf..08f2394 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [Notable Scan Options](#notable-scan-options) - [Understanding `--confidence`](#understanding---confidence) - [Ignore known false positives](#ignore-known-false-positives) + - [Inline ignore directives](#inline-ignore-directives) - [Finding Fingerprint](#finding-fingerprint) - [Rule Performance Profiling](#rule-performance-profiling) - [CLI Options](#cli-options) @@ -962,6 +963,8 @@ leaves the default unchanged.
- `--manage-baseline`: Create or update the baseline file with current findings - `--skip-regex `: Ignore findings whose text matches this regex (repeatable) - `--skip-word `: Ignore findings containing this case-insensitive word (repeatable) +- `--compat-ignore-comments`: Honor inline directives from other scanners (treat `gitleaks:allow` and `trufflehog:ignore` like native suppressions) +- `--no-ignore`: Disable inline directives entirely so every match is reported ## Understanding `--confidence` The `--confidence` flag sets a minimum confidence threshold, not an exact match. @@ -972,7 +975,22 @@ The `--confidence` flag sets a minimum confidence threshold, not an exact match. ### Ignore known false positives -Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. +Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. + +### Inline ignore directives + +Add `kingfisher:ignore` (or `kingfisher:allow`) anywhere on the same line as a finding to silence it. Multi-line strings may also be ignored by placing the directive on the closing delimiter line, on the next logical line after the string, **or** on a comment immediately before the value: + +```python +# kingfisher:ignore +API_KEY = """ +line 1 +line 2 +""" +# kingfisher:ignore +``` + +Kingfisher searches the surrounding lines for these tokens without requiring language-specific comment markers, so directives work even in templated files or unusual syntaxes. To reuse existing inline directives from other scanners, pass `--compat-ignore-comments` to also accept `gitleaks:allow` and `trufflehog:ignore`. Use `--no-ignore` when you want to disable inline suppressions entirely. 
With `--skip-regex`, these should be Rust compatible regular expressions, which you can test out at [regex101](https://regex101.com) diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 255417f..ad09f5e 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -118,6 +118,14 @@ pub struct ScanArgs { /// Skipwords to allow-list secret matches (case-insensitive, repeatable) #[arg(long = "skip-word", value_name = "WORD")] pub skip_word: Vec, + + /// Also recognise `gitleaks:allow` and `trufflehog:ignore` inline directives + #[arg(long = "compat-ignore-comments", default_value_t = false)] + pub compat_ignore_comments: bool, + + /// Disable inline ignore directives entirely + #[arg(long = "no-ignore", default_value_t = false)] + pub no_inline_ignore: bool, } /// Confidence levels for findings diff --git a/src/inline_ignore.rs b/src/inline_ignore.rs new file mode 100644 index 0000000..bfdd00a --- /dev/null +++ b/src/inline_ignore.rs @@ -0,0 +1,285 @@ +use crate::location::OffsetSpan; + +/// Configuration for inline ignore directives. +#[derive(Clone, Debug, Default)] +pub struct InlineIgnoreConfig { + tokens: Vec<&'static str>, +} + +impl InlineIgnoreConfig { + /// Create a new configuration. + /// + /// * `include_external_syntax` - when true, also recognise the comment + /// directives used by other scanners such as Gitleaks and Trufflehog. + pub fn new(include_external_syntax: bool) -> Self { + let mut tokens = vec!["kingfisher:ignore", "kingfisher:allow"]; + if include_external_syntax { + tokens.extend(["gitleaks:allow", "trufflehog:ignore"]); + } + Self { tokens } + } + + /// Return a configuration with inline ignores disabled. + pub fn disabled() -> Self { + Self { tokens: Vec::new() } + } + + #[inline] + fn has_tokens(&self) -> bool { + !self.tokens.is_empty() + } + + /// Returns `true` when the provided blob slice contains an inline ignore + /// directive that should suppress a finding for the given span. 
+ pub fn should_ignore(&self, blob_bytes: &[u8], span: &OffsetSpan) -> bool { + if !self.has_tokens() { + return false; + } + + let (start_line_start, start_line_end) = line_bounds(blob_bytes, span.start); + if start_line_end > start_line_start { + let start_line = &blob_bytes[start_line_start..start_line_end]; + if line_has_directive(start_line, &self.tokens) { + return true; + } + } + + // Scan backwards to allow directives that appear before the start of a + // multi-line string or value. This mirrors tools like Gitleaks where + // the ignore directive is often placed immediately above the secret. + let mut cursor = start_line_start; + while cursor > 0 { + let previous_index = cursor.saturating_sub(1); + let (prev_start, prev_end) = line_bounds(blob_bytes, previous_index); + if prev_end <= prev_start { + break; + } + + let prev_line = &blob_bytes[prev_start..prev_end]; + if line_has_directive(prev_line, &self.tokens) { + return true; + } + + if !should_skip_for_directive_search(prev_line) { + break; + } + + if prev_start == 0 { + break; + } + + cursor = prev_start; + } + + let end_index = if span.end == 0 { 0 } else { span.end - 1 }; + let (closing_line_start, closing_line_end) = + line_bounds(blob_bytes, end_index.min(blob_bytes.len())); + if closing_line_end > closing_line_start + && (closing_line_start != start_line_start || closing_line_end != start_line_end) + { + let closing_line = &blob_bytes[closing_line_start..closing_line_end]; + if line_has_directive(closing_line, &self.tokens) { + return true; + } + } + + // Also consider lines after the match so that multi-line strings can be + // ignored when the directive appears after the closing delimiter (a + // common pattern in languages like Python). 
+ let mut cursor = closing_line_end; + while cursor < blob_bytes.len() { + if blob_bytes[cursor] == b'\n' { + cursor += 1; + continue; + } + + let (_, next_end) = line_bounds(blob_bytes, cursor); + if next_end <= cursor { + break; + } + + let next_line = &blob_bytes[cursor..next_end]; + if line_has_directive(next_line, &self.tokens) { + return true; + } + + if !should_skip_for_directive_search(next_line) { + break; + } + + cursor = next_end; + } + + false + } +} + +fn should_skip_for_directive_search(line: &[u8]) -> bool { + let trimmed = trim_ascii_whitespace(line); + if trimmed.is_empty() { + return true; + } + + if trimmed.iter().all(|&b| b == trimmed[0]) && matches!(trimmed[0], b'"' | b'\'' | b'`') { + return true; + } + + if ends_with_multiline_delimiter(trimmed) { + return true; + } + + false +} + +fn ends_with_multiline_delimiter(trimmed: &[u8]) -> bool { + if trimmed.len() < 3 { + return false; + } + + let last = *trimmed.last().unwrap(); + if !matches!(last, b'"' | b'\'' | b'`') { + return false; + } + + let count = trimmed.iter().rev().take_while(|&&ch| ch == last).count(); + + count >= 3 +} + +fn trim_ascii_whitespace(line: &[u8]) -> &[u8] { + let mut start = 0; + while start < line.len() && line[start].is_ascii_whitespace() { + start += 1; + } + + let mut end = line.len(); + while end > start && line[end - 1].is_ascii_whitespace() { + end -= 1; + } + + &line[start..end] +} + +fn line_bounds(bytes: &[u8], index: usize) -> (usize, usize) { + if bytes.is_empty() { + return (0, 0); + } + let mut start = index.min(bytes.len()); + while start > 0 && bytes[start - 1] != b'\n' { + start -= 1; + } + let mut end = index.min(bytes.len()); + while end < bytes.len() && bytes[end] != b'\n' { + end += 1; + } + (start, end) +} + +fn line_has_directive(line: &[u8], tokens: &[&'static str]) -> bool { + if line.is_empty() { + return false; + } + + let mut lowercase = line.to_vec(); + lowercase.iter_mut().for_each(|b| *b = b.to_ascii_lowercase()); + + 
tokens.iter().any(|token| memchr::memmem::find(&lowercase, token.as_bytes()).is_some()) +} + +#[cfg(test)] +mod tests { + use super::{ + line_bounds, line_has_directive, should_skip_for_directive_search, trim_ascii_whitespace, + InlineIgnoreConfig, + }; + use crate::location::OffsetSpan; + + #[test] + fn bounds_cover_expected_ranges() { + let data = b"one\ntwo\nthree"; + assert_eq!(line_bounds(data, 0), (0, 3)); + assert_eq!(line_bounds(data, 4), (4, 7)); + assert_eq!(line_bounds(data, data.len()), (8, 13)); + } + + #[test] + fn detects_directives_in_lines() { + let tokens = ["kingfisher:ignore", "kingfisher:allow"]; + assert!(line_has_directive(b"secret # kingfisher:ignore", &tokens)); + assert!(line_has_directive(b"kingfisher:allow before value", &tokens)); + assert!(line_has_directive(b"value // TruffleHog:Ignore", &["trufflehog:ignore"])); + assert!(!line_has_directive(b"secret", &tokens)); + } + + #[test] + fn respects_multiline_block_comment_prefix() { + let tokens = ["kingfisher:ignore"]; + assert!(line_has_directive(b" * kingfisher:ignore", &tokens)); + } + + #[test] + fn ignores_multi_line_string_with_trailing_comment() { + let blob = b"let secret = \"\"\"\nline1\nline2\n\"\"\"\n# kingfisher:ignore\n"; + let matched = b"line1\nline2\n"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(false); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn ignores_multiline_without_trailing_newline() { + let blob = b"let secret = \"\"\"\nline1\nline2\n\"\"\"\n# kingfisher:ignore\n"; + let matched = b"line1\nline2"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(false); + assert!(config.should_ignore(blob, 
&span)); + } + + #[test] + fn ignores_multiline_with_directive_before_secret() { + let blob = b"// kingfisher:ignore\nlet secret = \"\"\"\nline1\nline2\n\"\"\"\n"; + let matched = b"line1\nline2\n"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(false); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn trim_ascii_whitespace_returns_inner_slice() { + assert_eq!(trim_ascii_whitespace(b" abc "), b"abc"); + assert!(trim_ascii_whitespace(b" ").is_empty()); + } + + #[test] + fn skips_lines_with_only_delimiters() { + assert!(should_skip_for_directive_search(b"\"\"\"")); + assert!(should_skip_for_directive_search(b" \"\"\" ")); + assert!(should_skip_for_directive_search(b"let secret = \"\"\"")); + assert!(!should_skip_for_directive_search(b"value")); + } + + #[test] + fn disabled_config_never_ignores() { + let blob = b"let secret = 'value' # kingfisher:ignore"; + let matched = b"value"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::disabled(); + assert!(!config.should_ignore(blob, &span)); + } +} diff --git a/src/lib.rs b/src/lib.rs index 3ceed02..9fc8106 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,6 +21,7 @@ pub mod git_url; pub mod gitea; pub mod github; pub mod gitlab; +pub mod inline_ignore; pub mod jira; pub mod liquid_filters; pub mod location; diff --git a/src/main.rs b/src/main.rs index b6bb1fd..aa052cf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -437,6 +437,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { skip_word: Vec::new(), output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: 
false, } } /// Run the rules check command diff --git a/src/matcher.rs b/src/matcher.rs index 102820b..c880f61 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -23,6 +23,7 @@ use xxhash_rust::xxh3::xxh3_64; use crate::{ blob::{Blob, BlobId, BlobIdMap}, entropy::calculate_shannon_entropy, + inline_ignore::InlineIgnoreConfig, location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan}, origin::OriginSet, parser, @@ -199,6 +200,9 @@ pub struct Matcher<'a> { /// Rule profiler for measuring performance of individual rules profiler: Option>, + + /// Configuration that controls inline ignore directives + inline_ignore_config: InlineIgnoreConfig, } /// This `Drop` implementation updates the `global_stats` with the local stats impl<'a> Drop for Matcher<'a> { @@ -226,6 +230,8 @@ impl<'a> Matcher<'a> { global_stats: Option<&'a Mutex>, enable_profiling: bool, shared_profiler: Option>, + include_external_ignore_syntax: bool, + disable_inline_ignores: bool, ) -> Result { // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec let raw_matches_scratch = Vec::new(); @@ -247,6 +253,11 @@ impl<'a> Matcher<'a> { seen_blobs, user_data, profiler, + inline_ignore_config: if disable_inline_ignores { + InlineIgnoreConfig::disabled() + } else { + InlineIgnoreConfig::new(include_external_ignore_syntax) + }, }) } @@ -403,6 +414,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + &self.inline_ignore_config, ); } // If tree-sitter produced base64-decoded matches, try them against all rules @@ -427,6 +439,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + &self.inline_ignore_config, ); } } @@ -457,6 +470,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + &self.inline_ignore_config, ); } if depth + 1 < MAX_B64_DEPTH { @@ -560,6 +574,7 @@ fn filter_match<'b>( redact: bool, filename: &str, profiler: Option<&Arc>, + inline_ignore_config: &InlineIgnoreConfig, ) { let mut timer = profiler.map(|p| 
RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename)); @@ -590,6 +605,10 @@ fn filter_match<'b>( let matching_input_offset_span = OffsetSpan::from_range( (start + matching_input.start())..(start + matching_input.end()), ); + if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) { + debug!("Skipping match due to inline ignore directive"); + continue; + } let match_key = compute_match_key( matching_input.as_bytes(), rule.id().as_bytes(), @@ -961,7 +980,7 @@ pub fn compute_finding_fingerprint( // ------------------------------------------------------------------------------------------------- #[cfg(test)] mod test { - use std::collections::BTreeMap; + use std::{collections::BTreeMap, path::PathBuf}; use pretty_assertions::assert_eq; // --------------------------------------------------------------------- @@ -970,7 +989,11 @@ mod test { use proptest::prelude::*; use super::*; - use crate::rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}; + use crate::{ + blob::{Blob, BlobIdMap}, + origin::{Origin, OriginSet}, + rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}, + }; proptest! 
{ #[test] @@ -1009,7 +1032,17 @@ mod test { let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap(); let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None).unwrap(); + let mut m = Matcher::new( + &rules_db, + scanner_pool, + &seen, + None, + false, + None, + false, + false, + ) + .unwrap(); // ── run the scan ────────────────────────────────────────────── m.scan_bytes_raw(&noise, "buf").unwrap(); @@ -1080,6 +1113,8 @@ mod test { None, enable_rule_profiling, None, // Pass the shared profiler + false, + false, )?; matcher.scan_bytes_raw(input.as_bytes(), "fname")?; assert_eq!( @@ -1167,7 +1202,7 @@ mod test { let rules_db = RulesDatabase::from_rules(vec![rule])?; let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None)?; + let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false, false)?; let buf = b"dup dup"; // two literal hits, same rule @@ -1184,4 +1219,119 @@ mod test { assert_eq!(second_len, 2); Ok(()) } + + #[test] + fn inline_comment_skips_match() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.ignore".into(), + name: "inline".into(), + pattern: "secret_token".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false, false)?; + + let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec()); + let origin = 
OriginSet::from(Origin::from_file(PathBuf::from("inline.txt"))); + + match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } + + #[test] + fn inline_comment_after_multiline_secret_skips_match() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.multiline".into(), + name: "inline multiline".into(), + pattern: "line1\\s+line2".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false, false)?; + + let blob = Blob::from_bytes( + br#"let data = """ +line1 +line2 +""" +# kingfisher:ignore +"# + .to_vec(), + ); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt"))); + + match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } + + #[test] + fn compat_flag_controls_external_directives() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "inline.compat".into(), + name: "inline compat".into(), + pattern: "supersecret123".into(), + confidence: crate::rules::rule::Confidence::Low, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + depends_on_rule: vec![], + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + + let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt"))); + + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false, false)?; + let matches_without_compat = + match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => matches.len(), + _ => panic!("unexpected scan result"), + }; + assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag"); + + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true, false)?; + match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ + ScanResult::New(matches) => assert!(matches.is_empty()), + _ => panic!("unexpected scan result"), + } + + Ok(()) + } } diff --git a/src/reporter.rs b/src/reporter.rs index 73bc541..b03bdc6 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -863,6 +863,8 @@ mod tests { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + compat_ignore_comments: false, + no_inline_ignore: false, }; let record = reporter.build_finding_record(&report_match, &scan_args); diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 8b4f59c..0a14361 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -163,6 +163,8 @@ mod tests { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: false, } } diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index df56427..24f92fb 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -144,6 +144,8 @@ pub fn enumerate_filesystem_inputs( Some(&matcher_stats), enable_profiling, Some(shared_profiler), + args.compat_ignore_comments, + args.no_inline_ignore, )?; let blob_processor_init_time = Mutex::new(t1.elapsed()); let make_blob_processor = || -> BlobProcessor { diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index eb4ad10..3f0bc16 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -622,6 +622,8 @@ pub async fn fetch_s3_objects( Some(matcher_stats), enable_profiling, Some(shared_profiler.clone()), + args.compat_ignore_comments, + args.no_inline_ignore, )?; let mut processor = BlobProcessor { matcher }; diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs index 72bd950..c94b4b0 100644 --- a/tests/int_allowlist.rs +++ b/tests/int_allowlist.rs @@ -133,6 +133,8 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: false, }; let 
global_args = GlobalArgs { diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index cd83a7f..f7ac9c4 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -152,6 +152,8 @@ rules: skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: false, }; let global_args = GlobalArgs { diff --git a/tests/int_github.rs b/tests/int_github.rs index 06c67a7..3460863 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -139,6 +139,8 @@ fn test_github_remote_scan() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: false, }; // Create global arguments let global_args = GlobalArgs { diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index e55655a..37bb856 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -137,6 +137,8 @@ fn test_gitlab_remote_scan() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: false, }; let global_args = GlobalArgs { @@ -272,6 +274,8 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: false, }; let global_args = GlobalArgs { diff --git a/tests/int_redact.rs b/tests/int_redact.rs index 48247af..6772af0 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -116,6 +116,8 @@ async fn test_redact_hashes_finding_values() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: false, }; let global_args = GlobalArgs { diff --git a/tests/int_slack.rs b/tests/int_slack.rs index 2575a3c..3de13e4 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -123,6 +123,8 @@ impl TestContext { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: 
false, }; let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?; @@ -246,6 +248,8 @@ async fn test_scan_slack_messages() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: false, }; let global_args = GlobalArgs { diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index ea1c809..c6ff0f5 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -195,6 +195,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: false, }; /* --------------------------------------------------------- * diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index b87d721..79cbb26 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -138,6 +138,8 @@ impl TestContext { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: false, }; let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules) @@ -248,6 +250,8 @@ impl TestContext { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, + compat_ignore_comments: false, + no_inline_ignore: false, }; let global_args = GlobalArgs { From 92de1ba63d06cb315bc51d29cd05caa4c4edfc44 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 9 Oct 2025 20:53:17 -0700 Subject: [PATCH 4/8] - Added kingfisher:ignore (or kingfisher:allow) to silence a finding inline within a file - Added: to reuse existing inline directives from other scanners, pass --compat-ignore-comments to also accept NOSONAR, kics-scan ignore, gitleaks:allow and trufflehog:ignore --- README.md | 2 +- README.md.orig | 1036 --------------------------------- src/main.rs | 1 - src/reporter.rs | 1 - src/reporter/json_format.rs | 1 - tests/int_allowlist.rs | 1 - tests/int_bitbucket.rs | 1 - 
tests/int_dedup.rs | 1 - tests/int_github.rs | 1 - tests/int_gitlab.rs | 2 - tests/int_redact.rs | 1 - tests/int_slack.rs | 2 - tests/int_validation_cache.rs | 1 - tests/int_vulnerable_files.rs | 2 - 14 files changed, 1 insertion(+), 1052 deletions(-) delete mode 100644 README.md.orig diff --git a/README.md b/README.md index 08f2394..02ca51c 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [Notable Scan Options](#notable-scan-options) - [Understanding `--confidence`](#understanding---confidence) - [Ignore known false positives](#ignore-known-false-positives) - - [Inline ignore directives](#inline-ignore-directives) + - [Inline ignore directives](#inline-ignore-directives) - [Finding Fingerprint](#finding-fingerprint) - [Rule Performance Profiling](#rule-performance-profiling) - [CLI Options](#cli-options) diff --git a/README.md.orig b/README.md.orig deleted file mode 100644 index f690e3d..0000000 --- a/README.md.orig +++ /dev/null @@ -1,1036 +0,0 @@ -# Kingfisher - -

- Kingfisher Logo - -[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) - -Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production -

- -Originally forked from Praetorian’s Nosey Parker, Kingfisher **adds** live cloud-API validation; many more targets (GitLab, BitBucket, Gitea, S3, Docker, Jira, Confluence, Slack); compressed-file extraction and scanning; baseline and allowlist controls; language-aware detection (~20 languages); and a native Windows binary. See [Origins and Divergence](#origins-and-divergence) for details. -## Key Features - -### Multiple Scan Targets -
- -| Files / Dirs | Local Git | GitHub | GitLab | Azure DevOps | Bitbucket | Gitea | -|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:| -| Files / Dirs
Files / Dirs | Local Git
Local Git | GitHub
GitHub | GitLab
GitLab | Azure DevOps
Azure DevOps | Bitbucket
Bitbucket | Gitea
Gitea | - -| Docker | Jira | Confluence | Slack | AWS S3 | -|:------:|:----:|:-----------:|:-----:|:------:| -| Docker
Docker | Jira
Jira | Confluence
Confluence | Slack
Slack | AWS S3
AWS S3 | - -
- -### Performance, Accuracy, and Hundreds of Rules -- **Performance**: multithreaded, Hyperscan‑powered scanning built for huge codebases -- **Extensible rules**: hundreds of built-in detectors plus YAML-defined custom rules ([docs/RULES.md](/docs/RULES.md)) -- **Broad AI SaaS coverage**: finds and validates tokens for OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and many more -- **Compressed Files**: Supports extracting and scanning compressed files for secrets -- **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) - -**Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) - -# Benchmark Results - -See ([docs/COMPARISON.md](docs/COMPARISON.md)) - -

- Kingfisher Runtime Comparison -

- -- [Kingfisher](#kingfisher) - - [Key Features](#key-features) - - [Multiple Scan Targets](#multiple-scan-targets) - - [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules) -- [Benchmark Results](#benchmark-results) -- [Getting Started](#getting-started) - - [Installation](#installation) - - [Run Kingfisher in Docker](#run-kingfisher-in-docker) -- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance) - - [Write Custom Rules!](#write-custom-rules) -- [Usage](#usage) - - [Basic Examples](#basic-examples) - - [Scan with secret validation](#scan-with-secret-validation) - - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories) - - [Scan a Git repository without validation](#scan-a-git-repository-without-validation) - - [Display only secrets confirmed active by third‑party APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis) - - [Output JSON and capture to a file](#output-json-and-capture-to-a-file) - - [Output SARIF directly to disk](#output-sarif-directly-to-disk) - - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--) - - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size) - - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag) - - [Display rule performance statistics](#display-rule-performance-statistics) - - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files) - - [Exclude specific paths](#exclude-specific-paths) - - [Scan changes in CI pipelines](#scan-changes-in-ci-pipelines) - - [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket) - - [ Scanning Docker Images](#-scanning-docker-images) - - [ Scanning GitHub](#-scanning-github) - - [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token) - - [Skip specific GitHub repositories 
during enumeration](#skip-specific-github-repositories-during-enumeration) - - [Scan remote GitHub repository](#scan-remote-github-repository) - - [ Scanning GitLab](#-scanning-gitlab) - - [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token) - - [Scan GitLab user](#scan-gitlab-user) - - [Skip specific GitLab projects during enumeration](#skip-specific-gitlab-projects-during-enumeration) - - [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url) - - [List GitLab repositories](#list-gitlab-repositories) - - [ Scanning Azure Repos](#-scanning-azure-repos) - - [Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-devops-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat) - - [Scan specific Azure DevOps projects](#scan-specific-azure-devops-projects) - - [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration) - - [List Azure repositories](#list-azure-repositories) - - [ Scanning Gitea](#-scanning-gitea) - - [Scan Gitea organization (requires `KF_GITEA_TOKEN`)](#scan-gitea-organization-requires-kf_gitea_token) - - [Scan Gitea user](#scan-gitea-user) - - [Skip specific Gitea repositories during enumeration](#skip-specific-gitea-repositories-during-enumeration) - - [Scan remote Gitea repository by URL](#scan-remote-gitea-repository-by-url) - - [List Gitea repositories](#list-gitea-repositories) - - [ Scanning Bitbucket](#-scanning-bitbucket) - - [Scan Bitbucket workspace](#scan-bitbucket-workspace) - - [Scan Bitbucket user](#scan-bitbucket-user) - - [Skip specific Bitbucket repositories during enumeration](#skip-specific-bitbucket-repositories-during-enumeration) - - [Scan remote Bitbucket repository by URL](#scan-remote-bitbucket-repository-by-url) - - [List Bitbucket repositories](#list-bitbucket-repositories) - - [Authenticate to Bitbucket](#authenticate-to-bitbucket) - - [Self-hosted 
Bitbucket Server](#self-hosted-bitbucket-server) - - [ Scanning Jira](#-scanning-jira) - - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query) - - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues) - - [ Scanning Confluence](#-scanning-confluence) - - [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query) - - [ Scanning Slack](#-scanning-slack) - - [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query) - - [Environment Variables for Tokens](#environment-variables-for-tokens) - - [Exit Codes](#exit-codes) - - [Update Checks](#update-checks) -- [Advanced Options](#advanced-options) - - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets) - - [List Builtin Rules](#list-builtin-rules) - - [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run) - - [To add your rules alongside the built‑ins:](#to-add-your-rules-alongside-the-builtins) - - [Other Examples](#other-examples) - - [Customize the HTTP User-Agent](#customize-the-http-user-agent) - - [Notable Scan Options](#notable-scan-options) - - [Understanding `--confidence`](#understanding---confidence) - - [Ignore known false positives](#ignore-known-false-positives) - - [Inline ignore directives](#inline-ignore-directives) - - [Finding Fingerprint](#finding-fingerprint) - - [Rule Performance Profiling](#rule-performance-profiling) - - [CLI Options](#cli-options) - - [Origins and Divergence](#origins-and-divergence) -- [Roadmap](#roadmap) -- [License](#license) - -# Getting Started -## Installation - -On macOS, you can simply - -```bash -brew install kingfisher -``` - -Pre-built binaries are also available on the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page. 
- -You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform: - -```bash -# Linux, macOS -curl --silent --location \ - https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \ - sh && \ - ubi --project mongodb/kingfisher --in "$HOME/bin" -``` - -```powershell -# Windows -powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in . -``` - -This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like -systems (or the current directory on Windows). - -Or you may compile for your platform via `make`: - -```bash -# NOTE: Requires Docker -make linux - -# macOS --- must build from a macOS host -make darwin - -# Windows x64 --- requires building from a Windows host with Visual Studio installed -./buildwin.bat -force -``` - -```bash -# Build all targets -make linux-all # builds both x64 and arm64 -make darwin-all # builds both x64 and arm64 -make all # builds for every OS and architecture supported -``` - -### Run Kingfisher in Docker - -Run the dockerized Kingfisher container: -```bash -# GitHub Container Registry -docker run --rm ghcr.io/mongodb/kingfisher:latest --version - -# Scan the current working directory -# (mounts your code at /src and scans it) -docker run --rm \ - -v "$PWD":/src \ - ghcr.io/mongodb/kingfisher:latest scan /src - - -# Scan while providing a GitHub token -# Mounts your working dir at /proj and passes in the token: -docker run --rm \ - -e KF_GITHUB_TOKEN=ghp_… \ - -v "$PWD":/proj \ - ghcr.io/mongodb/kingfisher:latest \ - scan --git-url https://github.com/org/private_repo.git - -# Scan an S3 bucket -# Credentials can come from KF_AWS_KEY/KF_AWS_SECRET, --role-arn, or --aws-local-profile -docker run --rm \ - -e KF_AWS_KEY=AKIA... \ - -e KF_AWS_SECRET=g5nYW... 
\ - ghcr.io/mongodb/kingfisher:latest \ - scan --s3-bucket bucket-name - - -# Scan and write a JSON report locally -# Here we: -# 1. Mount $PWD → /proj -# 2. Tell Kingfisher to write findings.json inside /proj/reports -# 3. Ensure ./reports exists on your host so Docker can mount it -mkdir -p reports - -# run and output into host’s ./reports directory -docker run --rm \ - -v "$PWD":/proj \ - ghcr.io/mongodb/kingfisher:latest \ - scan /proj \ - --format json \ - --output /proj/reports/findings.json - - -# Tip: you can combine multiple mounts if you prefer separating source vs. output: -# Here /src is read‑only, and /out holds your generated reports -docker run --rm \ - -v "$PWD":/src:ro \ - -v "$PWD/reports":/out \ - ghcr.io/mongodb/kingfisher:latest \ - scan /src \ - --format json \ - --output /out/findings.json - -``` - -# 🔐 Detection Rules at a Glance - -Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview: - -| Category | What we catch | -|----------|---------------| -| **AI SaaS APIs** | OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, together.ai, Zhipu, and more | -| **Cloud Providers** | AWS, Azure, GCP, Alibaba Cloud, DigitalOcean, IBM Cloud, Cloudflare, and more | -| **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm, PyPI, and more | -| **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun, SendGrid, Mailchimp, and more | -| **Databases & Data Ops** | MongoDB Atlas, PlanetScale, Postgres DSNs, Grafana Cloud, Datadog, Dynatrace, and more | -| **Payments & Billing** | Stripe, PayPal, Square, GoCardless, and more | -| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more | -| **Misc. 
SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more | - -## Write Custom Rules! - -Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential. - -However, you may want to add your own custom rules, or modify a detection to better suit your needs / environment. - -First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules. - -Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! - -# Usage - -## Basic Examples - -> **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory, no extra flags required. - -### Scan with secret validation - -```bash -kingfisher scan /path/to/code -## NOTE: This path can refer to: -# 1. a local git repo -# 2. a directory with many git repos -# 3. or just a folder with files and subdirectories - -## To explicitly prevent scanning git commit history add: -# `--git-history=none` -``` - -### Scan a directory containing multiple Git repositories - -```bash -kingfisher scan /projects/mono‑repo‑dir -``` - -### Scan a Git repository without validation - -```bash -kingfisher scan ~/src/myrepo --no-validate -``` - -### Display only secrets confirmed active by third‑party APIs - -```bash -kingfisher scan /path/to/repo --only-valid -``` - -### Output JSON and capture to a file - -```bash -kingfisher scan . --format json | tee kingfisher.json -``` - -### Output SARIF directly to disk - -```bash -kingfisher scan /path/to/repo --format sarif --output findings.sarif -``` - -### Pipe any text directly into Kingfisher by passing `-` - -```bash -cat /path/to/file.py | kingfisher scan - - -``` - -### Limit maximum file size scanned (`--max-file-size`) - -By default, Kingfisher skips files larger than **256 MB**. 
You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**. - -```bash -# Scan files up to 500 mb in size -kingfisher scan /some/file --max-file-size 500 -``` - -### Scan using a rule _family_ with one flag - -_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_ - -```bash -# Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2) -kingfisher scan /path/to/repo --rule kingfisher.aws -``` - -### Display rule performance statistics - -```bash -kingfisher scan /path/to/repo --rule-stats -``` - -### Scan while ignoring likely test files - -`--exclude` skips any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) - -```bash -# Scan source but skip likely unit / integration tests -kingfisher scan ./my-project \ - --exclude='[Tt]est' \ - --exclude='spec' \ - --exclude='[Ff]ixture' \ - --exclude='example' \ - --exclude='sample' -``` - -### Exclude specific paths -```bash -# Skip all Python files and any directory named tests -kingfisher scan ./my-project \ - --exclude '*.py' \ - --exclude '[Tt]ests' -``` - -### Scan changes in CI pipelines - -Limit scanning to the delta between your default branch and a pull request branch by combining `--since-commit` with `--branch` (defaults to `HEAD`). This only scans files that differ between the two references, which keeps CI runs fast while still blocking new secrets. - -```bash -kingfisher scan . \ - --since-commit origin/main \ - --branch "$CI_BRANCH" -``` - -When the branch under test is already checked out, `--branch HEAD` or omitting `--branch` entirely is sufficient. Kingfisher exits with `200` when any findings are discovered and `205` when validated secrets are present, allowing CI jobs to fail automatically if new credentials slip in. - -The same diff-focused workflow works when cloning repositories on the fly with `--git-url`. 
Kingfisher automatically tries remote-tracking names like `origin/main` and `origin/feature-1`, so you can target the branches involved in a pull request without performing a local checkout first. - -```bash -kingfisher scan \ - --git-url https://github.com/org/repo.git \ - --since-commit main \ - --branch development -``` - -In CI systems that expose the base and head commits explicitly, you can pass those SHAs directly while still using `--git-url`: - -```bash -kingfisher scan \ - --git-url git@github.com:org/repo.git \ - --since-commit "$BASE_COMMIT" \ - --branch "$PR_HEAD_COMMIT" -``` - -If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude): - -```bash -# Skip all Python files and any directory named tests, and report to stderr any skipped files -kingfisher scan ./my-project \ - --exclude '*.py' \ - --exclude tests \ - -v -``` - -## GitHub Scanning an AWS S3 Bucket -You can scan S3 objects directly: - -```bash -kingfisher scan --s3-bucket bucket-name [--s3-prefix path/] -``` - -Credential resolution happens in this order: - -1. `KF_AWS_KEY` and `KF_AWS_SECRET` environment variables -2. `--aws-local-profile` pointing to a profile in `~/.aws/config` (works with AWS SSO) -3. anonymous access for public buckets - -If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role. - -Examples: - -```bash -# using explicit keys -export KF_AWS_KEY=AKIA... -export KF_AWS_SECRET=g5nYW... -kingfisher scan --s3-bucket some-example-bucket - -# Above can also be run as: -KF_AWS_KEY=AKIA... KF_AWS_SECRET=g5nYW... 
kingfisher scan --s3-bucket some-example-bucket - -# using a local profile (e.g., SSO) that exists in your AWS profile (~/.aws/config) -kingfisher scan --s3-bucket some-example-bucket --aws-local-profile default - -# anonymous scan of a bucket, while providing an object prefix to only scan subset of the s3 bucket -kingfisher scan \ - --s3-bucket awsglue-datasets \ - --s3-prefix examples/us-legislators/all - -# assuming a role when scanning -kingfisher scan --s3-bucket some-example-bucket \ - --role-arn arn:aws:iam::123456789012:role/MyRole - -# anonymous scan of a public bucket -kingfisher scan --s3-bucket some-example-bucket -``` - -Docker example: - -```bash -docker run --rm \ - -e KF_AWS_KEY=AKIA... \ - -e KF_AWS_SECRET=g5nYW... \ - ghcr.io/mongodb/kingfisher:latest \ - scan --s3-bucket bucket-name -``` - -## Docker Scanning Docker Images - -Kingfisher will first try to use any locally available image, then fall back to pulling via OCI. - -Authentication happens *in this order*: - -1. **`KF_DOCKER_TOKEN`** env var - - If it contains `user:pass`, it’s used as Basic auth - - Otherwise it’s sent as a Bearer token -2. **Docker CLI credentials** - - Checks `credHelpers` (per-registry) and `credsStore` in `~/.docker/config.json`. - - Falls back to the legacy `auths` → `auth` (base64) entries. -3. **Anonymous** (no credentials) - - -```bash -# 1) Scan public or already-pulled image -kingfisher scan --docker-image ghcr.io/owasp/wrongsecrets/wrongsecrets-master:latest-master - -# 2) For private registries, explicitly set KF_DOCKER_TOKEN: -# - Basic auth: "user:pass" -# - Bearer only: "TOKEN" -export KF_DOCKER_TOKEN="AWS:$(aws ecr get-login-password --region us-east-1)" -kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws.com/base/amazonlinux2023:latest - -# 3) Or rely on your Docker CLI login/keychain: -# (e.g. 
aws ecr get-login-password … | docker login …) -kingfisher scan --docker-image private.registry.example.com/my-image:tag -``` - -## GitHub Scanning GitHub - -### Scan GitHub organization (requires `KF_GITHUB_TOKEN`) - -```bash -kingfisher scan --github-organization my-org -``` - -### Skip specific GitHub repositories during enumeration - -Repeat `--github-exclude` for every repository you want to ignore when scanning -users or organizations. You can provide exact repositories like -`OWNER/REPO` or gitignore-style glob patterns such as `owner/*-archive` -(matching is case-insensitive). - -```bash -kingfisher scan --github-organization my-org \ - --github-exclude my-org/huge-repo \ - --github-exclude my-org/*-archive -``` - -### Scan remote GitHub repository - -`--git-url` clones the repository and scans its files and history. To also inspect -related server-side data, supply `--repo-artifacts`. This flag pulls down the -repository's issues (including pull requests), wiki, and any public gists owned by -the repository owner and scans them for secrets. Fetching these extras counts -against API rate limits and private artifacts require a `KF_GITHUB_TOKEN`. 
- -```bash -# Scan the repository only -kingfisher scan --git-url https://github.com/org/repo.git - -# Include issues, wiki, and owner gists -kingfisher scan --git-url https://github.com/org/repo.git --repo-artifacts - -# Private repositories or artifacts -KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/private_repo.git --repo-artifacts -``` - ---- - -## GitLab Scanning GitLab - -### Scan GitLab group (requires `KF_GITLAB_TOKEN`) - -```bash -kingfisher scan --gitlab-group my-group -# include repositories from all nested subgroups -kingfisher scan --gitlab-group my-group --gitlab-include-subgroups -``` - -### Scan GitLab user - -```bash -kingfisher scan --gitlab-user johndoe -``` - -### Skip specific GitLab projects during enumeration - -Repeat `--gitlab-exclude` for every project path you want to ignore when scanning -users or groups. Specify project paths as `group/project` (case-insensitive) or -use gitignore-style glob patterns like `group/**/archive-*` to drop families of -projects across nested subgroups. - -```bash -kingfisher scan --gitlab-group my-group \ - --gitlab-exclude my-group/huge-project \ - --gitlab-exclude my-group/**/archive-* -``` - -### Scan remote GitLab repository by URL - -`--git-url` by itself clones the project repository. To include server-side -artifacts owned by the project, add `--repo-artifacts`. Kingfisher will retrieve -the project's issues, wiki, and snippets and scan them for secrets. These extra -requests may take longer and require a `KF_GITLAB_TOKEN` for private projects. 
- -```bash -# Scan the repository only -kingfisher scan --git-url https://gitlab.com/group/project.git - -# Include issues, wiki, and snippets -kingfisher scan --git-url https://gitlab.com/group/project.git --repo-artifacts - -# Private projects or artifacts -KF_GITLAB_TOKEN="glpat-…" kingfisher scan --git-url https://gitlab.com/group/private_project.git --repo-artifacts -``` - -### List GitLab repositories - -```bash -kingfisher gitlab repos list --group my-group -# include repositories from all nested subgroups -kingfisher gitlab repos list --group my-group --include-subgroups -# skip specific projects when listing or scanning (supports glob patterns) -kingfisher gitlab repos list --group my-group --gitlab-exclude my-group/**/legacy-* -``` -## Azure Repos Scanning Azure Repos - -### Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`) - -```bash -kingfisher scan --azure-organization my-org - -# Azure DevOps Server example -KF_AZURE_PAT="pat" kingfisher scan --azure-organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/ -``` - -### Scan specific Azure DevOps projects - -Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects. - -```bash -kingfisher scan --azure-project my-org/payments --azure-project my-org/core-platform -``` - -### Skip specific Azure repositories during enumeration - -Repeat `--azure-exclude` to ignore repositories when scanning organizations or projects. -Use identifiers like `ORGANIZATION/PROJECT/REPOSITORY`. Repositories that share the same -name as their project can be excluded with `ORGANIZATION/PROJECT`, and gitignore-style -patterns such as `my-org/*/archive-*` are also supported. 
- -```bash -kingfisher scan --azure-organization my-org \ - --azure-exclude my-org/payments/legacy-service \ - --azure-exclude my-org/**/archive-* -``` - -### List Azure repositories - -```bash -kingfisher azure repos list --organization my-org -# list repositories for specific projects -kingfisher azure repos list --project my-org/app --project my-org/api -# skip specific repositories while listing (supports glob patterns) -kingfisher azure repos list --organization my-org --azure-exclude my-org/**/experimental-* -``` -## Gitea Scanning Gitea - -### Scan Gitea organization (requires `KF_GITEA_TOKEN`) - -```bash -kingfisher scan --gitea-organization my-org -# self-hosted example -KF_GITEA_TOKEN="gtoken" kingfisher scan --gitea-organization platform --gitea-api-url https://gitea.internal.example/api/v1/ -``` - -### Scan Gitea user - -```bash -kingfisher scan --gitea-user johndoe -``` - -### Skip specific Gitea repositories during enumeration - -Repeat `--gitea-exclude` for each repository you want to ignore when scanning users -or organizations. Accepts `owner/repo` identifiers or gitignore-style glob patterns -like `team/**/archive-*`. - -```bash -kingfisher scan --gitea-organization my-org \ - --gitea-exclude my-org/legacy-repo \ - --gitea-exclude my-org/**/archive-* -``` - -### Scan remote Gitea repository by URL - -`--git-url` clones the repository and scans its history. Adding `--repo-artifacts` -also clones the repository wiki if one exists. Private repositories and wikis -require `KF_GITEA_TOKEN` (and `KF_GITEA_USERNAME` when cloning via HTTPS). 
- -```bash -# Scan the repository only -kingfisher scan --git-url https://gitea.com/org/repo.git - -# Include the repository wiki (if present) -KF_GITEA_TOKEN="gtoken" KF_GITEA_USERNAME="org" \ - kingfisher scan --git-url https://gitea.com/org/repo.git --repo-artifacts -``` - -### List Gitea repositories - -```bash -kingfisher gitea repos list --gitea-organization my-org -# enumerate every organization visible to the authenticated user -KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --all-gitea-organizations -# self-hosted example -KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/ -``` -## Bitbucket Scanning Bitbucket -### Scan Bitbucket workspace - -```bash -kingfisher scan --bitbucket-workspace my-team -# include Bitbucket Cloud repositories from every accessible workspace -kingfisher scan --all-bitbucket-workspaces --bitbucket-token "$APP_PASSWORD" --bitbucket-username "$USER" -``` - -### Scan Bitbucket user - -```bash -kingfisher scan --bitbucket-user johndoe -``` - -### Skip specific Bitbucket repositories during enumeration - -Use `--bitbucket-exclude` to ignore repositories while scanning users, workspaces, -or projects. Patterns accept either `owner/repo` (case-insensitive) or -gitignore-style globs such as `workspace/**/archive-*`. - -```bash -kingfisher scan --bitbucket-workspace my-team \ - --bitbucket-exclude my-team/legacy-repo \ - --bitbucket-exclude my-team/**/archive-* -``` - -### Scan remote Bitbucket repository by URL - -`--git-url` clones the repository and scans its files and history. To inspect -Bitbucket artifacts such as issues, add `--repo-artifacts`. Private artifacts -require credentials (see [Authenticate to Bitbucket](#authenticate-to-bitbucket)). 
- -```bash -# Scan the repository only -kingfisher scan --git-url https://bitbucket.org/hashashash/secretstest.git - -# Include repository issues -KF_BITBUCKET_USERNAME="user" \ -KF_BITBUCKET_APP_PASSWORD="app-password" \ - kingfisher scan --git-url https://bitbucket.org/workspace/project.git --repo-artifacts -``` - -### List Bitbucket repositories - -```bash -kingfisher bitbucket repos list --bitbucket-workspace my-team -# enumerate all accessible workspaces or projects -kingfisher bitbucket repos list --all-bitbucket-workspaces --bitbucket-token "$APP_PASSWORD" --bitbucket-username "$USER" -# filter out repositories using glob patterns -kingfisher bitbucket repos list --bitbucket-workspace my-team --bitbucket-exclude my-team/**/experimental-* -``` - -### Authenticate to Bitbucket - -Kingfisher supports Bitbucket Cloud and Bitbucket Server credentials: - -- **App password or server token** – set `KF_BITBUCKET_USERNAME` and either - `KF_BITBUCKET_APP_PASSWORD` or `KF_BITBUCKET_TOKEN`, or pass - `--bitbucket-username`/`--bitbucket-token` on the CLI. -- **OAuth/PAT token** – set `KF_BITBUCKET_OAUTH_TOKEN` or supply - `--bitbucket-oauth-token`. - -These credentials match the options described in the [ghorg setup -guide](https://github.com/gabrie30/ghorg/blob/master/README.md#bitbucket-setup). - -### Self-hosted Bitbucket Server - -Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, for example -`https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with -`--bitbucket-username` and `--bitbucket-token`, and pass `--ignore-certs` when -connecting to HTTP or otherwise insecure instances. 
-## Jira Scanning Jira - -### Scan Jira issues matching a JQL query - -```bash -KF_JIRA_TOKEN="token" kingfisher scan \ - --jira-url https://jira.company.com \ - --jql "project = TEST AND status = Open" \ - --max-results 500 -``` - -### Scan the last 1,000 Jira issues: -```bash -KF_JIRA_TOKEN="token" kingfisher scan \ - --jira-url https://jira.mongodb.org \ - --jql 'ORDER BY created DESC' \ - --max-results 1000 -``` - -## Confluence Scanning Confluence -### Scan Confluence pages matching a CQL query - -```bash -# Bearer token -KF_CONFLUENCE_TOKEN="token" kingfisher scan \ - --confluence-url https://confluence.company.com \ - --cql "label = secret" \ - --max-results 500 - -# Basic auth with username and token -KF_CONFLUENCE_USER="user@example.com" KF_CONFLUENCE_TOKEN="token" kingfisher scan \ - --confluence-url https://confluence.company.com \ - --cql "text ~ 'password'" \ - --max-results 500 -``` - -Use the base URL of your Confluence site for `--confluence-url`. Kingfisher -automatically adds `/rest/api` to the end, so `https://example.com/wiki` and -`https://example.com` both work depending on your server configuration. - -Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environment variable. By default, Kingfisher sends the token as a bearer token in the `Authorization` header. - -To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions. - -## Slack Scanning Slack -### Scan Slack messages matching a search query - -```bash -KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \ - --slack-query "from:username has:link" \ - --max-results 1000 - -KF_SLACK_TOKEN="xoxp-1234..." 
kingfisher scan \ - --slack-query "akia" \ - --max-results 1000 -``` -*The Slack token must be a user token with the `search:read` scope. Bot tokens (those beginning with `xoxb-`) cannot call the Slack search API.* - -## Environment Variables for Tokens - -| Variable | Purpose | -| ----------------- | ---------------------------- | -| `KF_GITHUB_TOKEN` | GitHub Personal Access Token | -| `KF_GITLAB_TOKEN` | GitLab Personal Access Token | -| `KF_GITEA_TOKEN` | Gitea Personal Access Token | -| `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) | -| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure DevOps Personal Access Token | -| `KF_AZURE_USERNAME` | Username to use with Azure DevOps PATs (defaults to `pat` when unset) | -| `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication | -| `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token | -| `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token | -| `KF_JIRA_TOKEN` | Jira API token | -| `KF_CONFLUENCE_TOKEN` | Confluence API token | -| `KF_SLACK_TOKEN` | Slack API token | -| `KF_DOCKER_TOKEN` | Docker registry token (`user:pass` or bearer token). 
If unset, credentials from the Docker keychain are used | -| `KF_AWS_KEY` and `KF_AWS_SECRET` | AWS Credentials to use with S3 bucket scanning | - -Set them temporarily per command: - -```bash -KF_GITLAB_TOKEN="glpat-…" kingfisher scan --gitlab-group my-group -``` - -Or export for the session: - -```bash -export KF_GITLAB_TOKEN="glpat-…" -``` - -To authenticate Jira requests: -```bash -export KF_JIRA_TOKEN="token" -``` - -To authenticate Confluence requests: -```bash -export KF_CONFLUENCE_TOKEN="token" -``` - -_If no token is provided Kingfisher still works for public repositories._ - ---- - -## Exit Codes - -| Code | Meaning | -| ---- | ----------------------------- | -| 0 | No findings | -| 200 | Findings discovered | -| 205 | Validated findings discovered | - -## Update Checks - -Kingfisher automatically queries GitHub for a newer release when it starts and tells you whether an update is available. - -- **Hands-free updates** – Add `--self-update` to any Kingfisher command - - * If a newer version exists, Kingfisher will download it, replace the running binary, and re-launch itself with the **exact same arguments**. - * If the update fails or no newer release is found, the current run proceeds as normal - -- **Manual update** – Run `kingfisher self-update` to update the binary without scanning - -- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely - -# Advanced Options - -## Build a Baseline / Detect New Secrets - -There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so future scans only report anything that is not already in that list. 
- -The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches): - -```bash -kingfisher scan /path/to/code \ - --confidence low \ - --manage-baseline \ - --baseline-file ./baseline-file.yml -``` - -Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: - -```bash -kingfisher scan /path/to/code \ - --baseline-file /path/to/baseline-file.yaml -``` - -Running the scan again with `--manage-baseline` refreshes the baseline by adding new findings and pruning entries for secrets that no longer appear. See [docs/BASELINE.md](docs/BASELINE.md) for full detail. - -## List Builtin Rules - -```bash -kingfisher rules list -``` - -## To scan using **only** your own `my_rules.yaml` you could run: - -```bash -kingfisher scan \ - --load-builtins=false \ - --rules-path path/to/my_rules.yaml \ - ./src/ -``` - -## To add your rules alongside the built‑ins: - -```bash -kingfisher scan \ - --rules-path ./custom-rules/ \ - --rules-path my_rules.yml \ - ~/path/to/project-dir/ -``` - -## Other Examples - -```bash -# Check custom rules - this ensures all regular expressions compile, and can match the rule's `examples` in the YML file -kingfisher rules check --rules-path ./my_rules.yml - -# List GitHub repos -kingfisher github repos list --user my-user -kingfisher github repos list --organization my-org -# Skip specific repositories when listing or scanning (supports glob patterns) -kingfisher github repos list --organization my-org --github-exclude my-org/*-archive - -``` - -## Customize the HTTP User-Agent - -Kingfisher identifies its HTTP requests with a user-agent that includes the binary name and version followed by a browser-style -string. Some environments require extra context, such as a contact address, a change-ticket number, or a temporary test label. 
-Use the global `--user-agent-suffix` flag to append this information between the Kingfisher identifier and the browser portion: - -```bash -# Attach a contact email to all outbound validation requests -kingfisher --user-agent-suffix "contact=security@example.com" scan path/ - -# Label a one-off experiment -kingfisher --user-agent-suffix "Sept 2025 testing" github repos list --user my-user -``` - -When omitted, Kingfisher defaults to `kingfisher/<version> Mozilla/5.0 ...`. The suffix is trimmed; passing an empty string -leaves the default unchanged. - -## Notable Scan Options - -- `--no-dedup`: Report every occurrence of a finding (disable the default de-duplicate behavior) -- `--no-base64`: By default, Kingfisher finds and decodes base64 blobs and scans them for secrets. This adds a slight performance overhead; use this flag to disable -- `--confidence <LEVEL>`: (low|medium|high) -- `--min-entropy <VALUE>`: Override default threshold -- `--no-binary`: Skip binary files -- `--no-extract-archives`: Do not scan inside archives -- `--extraction-depth <DEPTH>`: Specifies how deep nested archives should be extracted and scanned (default: 2) -- `--redact`: Replaces discovered secrets with a one-way hash for secure output -- `--exclude <PATTERN>`: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) -- `--baseline-file <FILE>`: Ignore matches listed in a baseline YAML file -- `--manage-baseline`: Create or update the baseline file with current findings -- `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable) -- `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable) -## Understanding `--confidence` - -The `--confidence` flag sets a minimum confidence threshold, not an exact match. - -- If you pass `--confidence medium`, findings with **medium and higher** confidence (medium + high) will be included. -- If you pass `--confidence low`, you’ll see **all levels** (low, medium, high).
- - -### Ignore known false positives - -Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. - -Patterns passed to `--skip-regex` must be Rust-compatible regular expressions, which you can test out at [regex101](https://regex101.com). - -```bash -# Skip any finding where the finding mentions TEST_KEY -kingfisher scan --skip-regex '(?i)TEST_KEY' path/ - -# Skip findings that contain the word "dummy" anywhere in the match -kingfisher scan --skip-word dummy path/ - -# Combine multiple patterns -kingfisher scan \ - --skip-regex 'AKIA[0-9A-Z]{16}' \ - --skip-word placeholder \ - --skip-word dummy \ - path/ -``` - -If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early. - -## Finding Fingerprint - -The document below details the four-field formula (rule SHA-1, origin label, start & end offsets) hashed with XXH3-64 to create Kingfisher’s 64-bit finding fingerprint, explains how this ID powers safe deduplication, and shows how `--no-dedup` can be used to report every raw match. -See ([docs/FINGERPRINT.md](docs/FINGERPRINT.md)) - -## Rule Performance Profiling - -Use `--rule-stats` to collect timing information for every rule. After scanning, the summary prints a **Rule Performance Stats** section showing how many matches each rule produced along with its slowest and average match times. Useful when creating rules or debugging rules. - -## CLI Options - -```bash -kingfisher scan --help -``` - - -## Origins and Divergence - -Kingfisher began as a fork of Praetorian’s Nosey Parker, as our experiment with adding live validation support and embedding that validation directly inside each rule.
- -Since that initial fork, it has diverged heavily from Nosey Parker: -- Added support for live validation of discovered secrets -- Added hundreds of new rules -- Added support for analyzing compressed files -- Added support for building "baselines" to allow for only reporting on newly discovered secrets -- Added Tree-Sitter based source code parsing on top of Hyperscan for deeper language-aware detection -- Expanded support for new targets (GitLab, BitBucket, Gitea, Jira, Confluence, Slack, S3, Docker, etc.) -- Replaced the SQLite datastore with an in-memory store + Bloom filter -- Collapsed the workflow into a single scan-and-report phase with direct JSON/BSON/SARIF outputs -- Delivered cross-platform builds, including native Windows - -# Roadmap - -- More rules -- More targets -- Please file a [feature request](https://github.com/mongodb/kingfisher/issues), or open a PR, if you have features you'd like added - -# License - -[Apache2 License](LICENSE) diff --git a/src/main.rs b/src/main.rs index 54dc8d6..d5e675a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -438,7 +438,6 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { skip_word: Vec::new(), output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_base64: false, - compat_ignore_comments: false, no_inline_ignore: false, } } diff --git a/src/reporter.rs b/src/reporter.rs index 5151235..117565d 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -864,7 +864,6 @@ mod tests { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), - compat_ignore_comments: false, no_inline_ignore: false, }; diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index dd09e14..7497f51 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -164,7 +164,6 @@ mod tests { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, no_inline_ignore: false, } } diff --git a/tests/int_allowlist.rs 
b/tests/int_allowlist.rs index 0f0696f..ad8592c 100644 --- a/tests/int_allowlist.rs +++ b/tests/int_allowlist.rs @@ -134,7 +134,6 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result Result<()> { since_commit: None, branch: None, }, - compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index 7991b34..f7ac9c4 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -131,7 +131,6 @@ rules: since_commit: None, branch: None, }, - compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 5.0, extraction_depth: 1, diff --git a/tests/int_github.rs b/tests/int_github.rs index 1b019f1..3460863 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -118,7 +118,6 @@ fn test_github_remote_scan() -> Result<()> { since_commit: None, branch: None, }, - compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index 67c13d3..3c271cd 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -138,7 +138,6 @@ fn test_gitlab_remote_scan() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, no_inline_ignore: false, }; @@ -254,7 +253,6 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { since_commit: None, branch: None, }, - compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, diff --git a/tests/int_redact.rs b/tests/int_redact.rs index ff003d8..6772af0 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -95,7 +95,6 @@ async fn test_redact_hashes_finding_values() -> Result<()> { since_commit: None, branch: None, }, - compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, 
extraction_depth: 2, diff --git a/tests/int_slack.rs b/tests/int_slack.rs index 1d9b1b5..1c7661e 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -124,7 +124,6 @@ impl TestContext { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, no_inline_ignore: false, }; @@ -228,7 +227,6 @@ async fn test_scan_slack_messages() -> Result<()> { since_commit: None, branch: None, }, - compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index bf253da..c6ff0f5 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -174,7 +174,6 @@ async fn test_validation_cache_and_depvars() -> Result<()> { since_commit: None, branch: None, }, - compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 7603140..2645a43 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -117,7 +117,6 @@ impl TestContext { since_commit: None, branch: None, }, - compat_ignore_comments: false, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, @@ -252,7 +251,6 @@ impl TestContext { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, no_inline_ignore: false, }; From 3647d759a3a1c1647e889a9eed2e5073aa701b3d Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Fri, 10 Oct 2025 16:23:41 -0700 Subject: [PATCH 5/8] - Added a --no-ignore CLI flag to disable inline directives when you need every potential secret reported - Added: repeatable --ignore-comment flag to reuse inline directives from other scanners (for example NOSONAR, kics-scan ignore, gitleaks:allow, etc) --- CHANGELOG.md | 3 +- README.md | 6 +- src/cli/commands/scan.rs | 6 +- 
src/inline_ignore.rs | 168 +++++++++++++++++++++++++++++++--- src/main.rs | 2 +- src/matcher.rs | 19 ++-- src/reporter.rs | 2 +- src/reporter/json_format.rs | 2 +- src/scanner/enumerate.rs | 2 +- src/scanner/repos.rs | 2 +- src/update.rs | 12 +-- tests/int_allowlist.rs | 2 +- tests/int_bitbucket.rs | 2 +- tests/int_dedup.rs | 2 +- tests/int_github.rs | 2 +- tests/int_gitlab.rs | 4 +- tests/int_redact.rs | 2 +- tests/int_slack.rs | 4 +- tests/int_validation_cache.rs | 2 +- tests/int_vulnerable_files.rs | 4 +- 20 files changed, 194 insertions(+), 54 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 416af87..baba4b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,8 @@ All notable changes to this project will be documented in this file. ## [v1.57.0] - Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling - Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported -- Added: `--compat-ignore-comments` to reuse existing inline directives from other scanners: NOSONAR, kics-scan ignore, gitleaks:allow and trufflehog:ignore +- Added: repeatable `--ignore-comment ` flag to reuse inline directives from other scanners (for example `NOSONAR`, + `kics-scan ignore`, `gitleaks:allow`, etc) ## [v1.56.0] - Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined diff --git a/README.md b/README.md index 02ca51c..d2eea9a 100644 --- a/README.md +++ b/README.md @@ -963,7 +963,7 @@ leaves the default unchanged. 
- `--manage-baseline`: Create or update the baseline file with current findings - `--skip-regex `: Ignore findings whose text matches this regex (repeatable) - `--skip-word `: Ignore findings containing this case-insensitive word (repeatable) -- `--compat-ignore-comments`: Honor inline directives from other scanners (treat `gitleaks:allow` and `trufflehog:ignore` like native suppressions) +- `--ignore-comment `: Honor additional inline directives from other scanners (repeatable; e.g. `--ignore-comment "gitleaks:allow"`) - `--no-ignore`: Disable inline directives entirely so every match is reported ## Understanding `--confidence` @@ -979,7 +979,7 @@ Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. B ### Inline ignore directives -Add `kingfisher:ignore` (or `kingfisher:allow`) anywhere on the same line as a finding to silence it. Multi-line strings may also be ignored by placing the directive on the closing delimiter line, on the next logical line after the string, **or** on a comment immediately before the value: +Add `kingfisher:ignore` (or `kingfisher:allow`) anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value: ```python # kingfisher:ignore @@ -990,7 +990,7 @@ line 2 # kingfisher:ignore ``` -Kingfisher searches the surrounding lines for these tokens without requiring language-specific comment markers, so directives work even in templated files or unusual syntaxes. To reuse existing inline directives from other scanners, pass `--compat-ignore-comments` to also accept `gitleaks:allow` and `trufflehog:ignore`. Use `--no-ignore` when you want to disable inline suppressions entirely. 
+Kingfisher searches the surrounding lines for these tokens without requiring language-specific comment markers. To reuse existing inline directives from other scanners, add them with repeatable `--ignore-comment` flags (for example `--ignore-comment "gitleaks:allow" --ignore-comment "NOSONAR"`). Use `--no-ignore` when you want to disable inline suppressions entirely. With `--skip-regex`, these should be Rust compatible regular expressions, which you can test out at [regex101](https://regex101.com) diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index ad09f5e..78312d8 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -119,9 +119,9 @@ pub struct ScanArgs { #[arg(long = "skip-word", value_name = "WORD")] pub skip_word: Vec, - /// Also recognise `gitleaks:allow` and `trufflehog:ignore` inline directives - #[arg(long = "compat-ignore-comments", default_value_t = false)] - pub compat_ignore_comments: bool, + /// Additional inline ignore directives to recognise (repeatable) + #[arg(long = "ignore-comment", value_name = "DIRECTIVE")] + pub extra_ignore_comments: Vec, /// Disable inline ignore directives entirely #[arg(long = "no-ignore", default_value_t = false)] diff --git a/src/inline_ignore.rs b/src/inline_ignore.rs index bfdd00a..d288076 100644 --- a/src/inline_ignore.rs +++ b/src/inline_ignore.rs @@ -3,19 +3,30 @@ use crate::location::OffsetSpan; /// Configuration for inline ignore directives. #[derive(Clone, Debug, Default)] pub struct InlineIgnoreConfig { - tokens: Vec<&'static str>, + tokens: Vec>, } impl InlineIgnoreConfig { /// Create a new configuration. /// - /// * `include_external_syntax` - when true, also recognise the comment - /// directives used by other scanners such as Gitleaks and Trufflehog. 
- pub fn new(include_external_syntax: bool) -> Self { - let mut tokens = vec!["kingfisher:ignore", "kingfisher:allow"]; - if include_external_syntax { - tokens.extend(["gitleaks:allow", "trufflehog:ignore"]); + /// * `additional_tokens` - inline ignore directives supplied by the user. + pub fn new(additional_tokens: &[String]) -> Self { + let mut tokens = vec![b"kingfisher:ignore".to_vec(), b"kingfisher:allow".to_vec()]; + + for token in additional_tokens { + let trimmed = token.trim(); + if trimmed.is_empty() { + continue; + } + + let lowered = trimmed.to_ascii_lowercase().into_bytes(); + if tokens.iter().any(|existing| existing == &lowered) { + continue; + } + + tokens.push(lowered); } + Self { tokens } } @@ -128,6 +139,14 @@ fn should_skip_for_directive_search(line: &[u8]) -> bool { return true; } + if looks_like_pem_boundary(trimmed) { + return true; + } + + if looks_like_encoded_secret_body(trimmed) { + return true; + } + false } @@ -146,6 +165,68 @@ fn ends_with_multiline_delimiter(trimmed: &[u8]) -> bool { count >= 3 } +fn looks_like_pem_boundary(trimmed: &[u8]) -> bool { + trimmed.starts_with(b"-----BEGIN ") || trimmed.starts_with(b"-----END ") +} + +fn looks_like_encoded_secret_body(trimmed: &[u8]) -> bool { + const MIN_LEN: usize = 16; + + if trimmed.len() < MIN_LEN { + return false; + } + + let is_base64ish = trimmed.iter().all(|&b| { + matches!( + b, + b'A'..=b'Z' + | b'a'..=b'z' + | b'0'..=b'9' + | b'+' + | b'/' + | b'=' + | b'-' + | b'_' + ) + }); + if is_base64ish { + return true; + } + + let is_hexish = trimmed.iter().all(|&b| matches!(b, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')); + if is_hexish { + return true; + } + + let is_base32ish = trimmed.iter().all(|&b| matches!(b, b'A'..=b'Z' | b'2'..=b'7' | b'=')); + if is_base32ish { + return true; + } + + // Allow directives to be placed after payloads that mix a high percentage of + // alpha-numeric characters commonly seen in encoded data (e.g. 
cryptographic + // material that includes punctuation like ':' or '.') without risking + // accidentally skipping regular source lines. + let allowed = |b: u8| { + matches!( + b, + b'A'..=b'Z' + | b'a'..=b'z' + | b'0'..=b'9' + | b'+' + | b'/' + | b'=' + | b'-' + | b'_' + | b':' + | b'.' + ) + }; + + let allowed_count = trimmed.iter().copied().filter(|&b| allowed(b)).count(); + allowed_count * 10 >= trimmed.len() * 9 +} + fn trim_ascii_whitespace(line: &[u8]) -> &[u8] { let mut start = 0; while start < line.len() && line[start].is_ascii_whitespace() { @@ -175,7 +256,7 @@ fn line_bounds(bytes: &[u8], index: usize) -> (usize, usize) { (start, end) } -fn line_has_directive(line: &[u8], tokens: &[&'static str]) -> bool { +fn line_has_directive(line: &[u8], tokens: &[Vec]) -> bool { if line.is_empty() { return false; } @@ -183,7 +264,7 @@ fn line_has_directive(line: &[u8], tokens: &[&'static str]) -> bool { let mut lowercase = line.to_vec(); lowercase.iter_mut().for_each(|b| *b = b.to_ascii_lowercase()); - tokens.iter().any(|token| memchr::memmem::find(&lowercase, token.as_bytes()).is_some()) + tokens.iter().any(|token| memchr::memmem::find(&lowercase, token.as_slice()).is_some()) } #[cfg(test)] @@ -204,16 +285,16 @@ mod tests { #[test] fn detects_directives_in_lines() { - let tokens = ["kingfisher:ignore", "kingfisher:allow"]; + let tokens = vec![b"kingfisher:ignore".to_vec(), b"kingfisher:allow".to_vec()]; assert!(line_has_directive(b"secret # kingfisher:ignore", &tokens)); assert!(line_has_directive(b"kingfisher:allow before value", &tokens)); - assert!(line_has_directive(b"value // TruffleHog:Ignore", &["trufflehog:ignore"])); + assert!(line_has_directive(b"value // Gitleaks:Allow", &[b"gitleaks:allow".to_vec()])); assert!(!line_has_directive(b"secret", &tokens)); } #[test] fn respects_multiline_block_comment_prefix() { - let tokens = ["kingfisher:ignore"]; + let tokens = vec![b"kingfisher:ignore".to_vec()]; assert!(line_has_directive(b" * kingfisher:ignore", 
&tokens)); } @@ -226,7 +307,60 @@ mod tests { .position(|window| window == matched) .expect("match bytes present"); let span = OffsetSpan::from_range(start..start + matched.len()); - let config = InlineIgnoreConfig::new(false); + let config = InlineIgnoreConfig::new(&[]); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn ignores_multiline_with_directive_on_closing_line() { + let blob = b"api_key = \"\"\"\nline1\nline2\n\"\"\" // kingfisher:ignore\n"; + let matched = b"line1\nline2\n"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(&[]); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn ignores_pem_with_directive_before_block() { + let blob = b"// kingfisher:ignore\napi_key = \"\"\"\n-----BEGIN RSA PRIVATE KEY-----\nMIICWwIBAAKBgQC7\n-----END RSA PRIVATE KEY-----\n\"\"\"\n"; + let matched = b"MIICWwIBAAKBgQC7\n"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(&[]); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn ignores_multiline_hex_payload_with_directive() { + let blob = b"# kingfisher:ignore\nsecret = \"\"\"\n00112233445566778899aabbccddeeff\nffeeddccbbaa99887766554433221100\n\"\"\"\n"; + let matched = b"00112233445566778899aabbccddeeff\nffeeddccbbaa99887766554433221100\n"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(&[]); + assert!(config.should_ignore(blob, &span)); + } + + #[test] + fn ignores_multiline_base32_payload_with_directive_after_block() { + let blob = + b"secret = 
\"\"\"\nMFRGGZDFMZTWQ2LK\nONSWG4TFOQ======\n\"\"\"\n// kingfisher:ignore\n"; + let matched = b"MFRGGZDFMZTWQ2LK\nONSWG4TFOQ======\n"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(&[]); assert!(config.should_ignore(blob, &span)); } @@ -239,7 +373,7 @@ mod tests { .position(|window| window == matched) .expect("match bytes present"); let span = OffsetSpan::from_range(start..start + matched.len()); - let config = InlineIgnoreConfig::new(false); + let config = InlineIgnoreConfig::new(&[]); assert!(config.should_ignore(blob, &span)); } @@ -252,7 +386,7 @@ mod tests { .position(|window| window == matched) .expect("match bytes present"); let span = OffsetSpan::from_range(start..start + matched.len()); - let config = InlineIgnoreConfig::new(false); + let config = InlineIgnoreConfig::new(&[]); assert!(config.should_ignore(blob, &span)); } @@ -268,6 +402,10 @@ mod tests { assert!(should_skip_for_directive_search(b" \"\"\" ")); assert!(should_skip_for_directive_search(b"let secret = \"\"\"")); assert!(!should_skip_for_directive_search(b"value")); + assert!(should_skip_for_directive_search(b"-----BEGIN RSA PRIVATE KEY-----")); + assert!(should_skip_for_directive_search(b"MIICWwIBAAKBgQC7")); + assert!(should_skip_for_directive_search(b"0011223344556677")); + assert!(should_skip_for_directive_search(b"MFRGGZDFMZTWQ2LK")); } #[test] diff --git a/src/main.rs b/src/main.rs index d5e675a..c0f6f9a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -416,7 +416,7 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { since_commit: None, branch: None, }, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: true, diff --git a/src/matcher.rs b/src/matcher.rs index c880f61..f93576a 100644 --- 
a/src/matcher.rs +++ b/src/matcher.rs @@ -230,7 +230,7 @@ impl<'a> Matcher<'a> { global_stats: Option<&'a Mutex>, enable_profiling: bool, shared_profiler: Option>, - include_external_ignore_syntax: bool, + extra_ignore_directives: &[String], disable_inline_ignores: bool, ) -> Result { // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec @@ -256,7 +256,7 @@ impl<'a> Matcher<'a> { inline_ignore_config: if disable_inline_ignores { InlineIgnoreConfig::disabled() } else { - InlineIgnoreConfig::new(include_external_ignore_syntax) + InlineIgnoreConfig::new(extra_ignore_directives) }, }) } @@ -1039,7 +1039,7 @@ mod test { None, false, None, - false, + &[], false, ) .unwrap(); @@ -1113,7 +1113,7 @@ mod test { None, enable_rule_profiling, None, // Pass the shared profiler - false, + &[], false, )?; matcher.scan_bytes_raw(input.as_bytes(), "fname")?; @@ -1202,7 +1202,7 @@ mod test { let rules_db = RulesDatabase::from_rules(vec![rule])?; let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false, false)?; + let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; let buf = b"dup dup"; // two literal hits, same rule @@ -1239,7 +1239,7 @@ mod test { let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false, false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec()); let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt"))); @@ -1271,7 +1271,7 @@ mod test { let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); let mut matcher = - Matcher::new(&rules_db, 
scanner_pool, &seen, None, false, None, false, false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; let blob = Blob::from_bytes( br#"let data = """ @@ -1315,7 +1315,7 @@ line2 let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false, false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; let matches_without_compat = match matcher.scan_blob(&blob, &origin, None, false, false, false)? { ScanResult::New(matches) => matches.len(), @@ -1325,8 +1325,9 @@ line2 let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let extra = vec![String::from("gitleaks:allow")]; let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true, false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &extra, false)?; match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ ScanResult::New(matches) => assert!(matches.is_empty()), _ => panic!("unexpected scan result"), diff --git a/src/reporter.rs b/src/reporter.rs index 117565d..52a491c 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -842,7 +842,7 @@ mod tests { since_commit: None, branch: None, }, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { max_file_size_mb: 256.0, exclude: Vec::new(), diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 7497f51..02dc20c 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -143,7 +143,7 @@ mod tests { since_commit: None, branch: None, }, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index 24f92fb..46a2b4d 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -144,7 +144,7 @@ pub fn enumerate_filesystem_inputs( Some(&matcher_stats), enable_profiling, Some(shared_profiler), - args.compat_ignore_comments, + &args.extra_ignore_comments, args.no_inline_ignore, )?; let blob_processor_init_time = Mutex::new(t1.elapsed()); diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index 3f0bc16..fb6577a 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -622,7 +622,7 @@ pub async fn fetch_s3_objects( Some(matcher_stats), enable_profiling, Some(shared_profiler.clone()), - args.compat_ignore_comments, + &args.extra_ignore_comments, args.no_inline_ignore, )?; let mut processor = BlobProcessor { matcher }; diff --git a/src/update.rs b/src/update.rs index db8d9cb..70001c3 100644 --- a/src/update.rs +++ b/src/update.rs @@ -40,7 +40,7 @@ pub fn check_for_update(global_args: &GlobalArgs, base_url: Option<&str>) -> Opt let use_color = std::io::stderr().is_terminal() && !global_args.quiet; let styles = Styles::new(use_color); - 
info!("{}", "Checking for updates…"); + // info!("{}", "Checking for updates…"); let mut builder = Update::configure(); builder @@ -113,22 +113,22 @@ pub fn check_for_update(global_args: &GlobalArgs, base_url: Option<&str>) -> Opt if curr > latest { let plain = format!("Running Kingfisher {curr} which is newer than latest released {latest}"); - info!("{}", styled_heading(&styles, plain.as_str())); + info!("{}", plain); return Some(plain); } // else fall through to Case 3 (latest > running) } // ───────────── Case 3: latest > running ───────────── - let plain = format!("New Kingfisher release {} available", release.version); - info!("{}", styled_heading(&styles, plain.as_str())); + let plain = format!("NEW KINGFISHER RELEASE {} AVAILABLE", release.version); + info!("{}", plain); // Attempt self‑update when allowed and feasible. if global_args.self_update { match updater.update() { Ok(status) => { - let message = format!("Updated to version {}", status.version()); - info!("{}", styled_heading(&styles, message.as_str())); + let message = format!("UPDATED TO VERSION {}", status.version()); + info!("{}", message); } Err(e) => match e { UpdError::Io(ref io_err) => match io_err.kind() { diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs index ad8592c..3947548 100644 --- a/tests/int_allowlist.rs +++ b/tests/int_allowlist.rs @@ -112,7 +112,7 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), no_inline_ignore: false, }; diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index f7ac9c4..e86c589 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -152,7 +152,7 @@ rules: skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), no_inline_ignore: false, }; diff --git a/tests/int_github.rs b/tests/int_github.rs index 
3460863..dcf936a 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -139,7 +139,7 @@ fn test_github_remote_scan() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), no_inline_ignore: false, }; // Create global arguments diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index 3c271cd..7c2e0b9 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -116,7 +116,7 @@ fn test_gitlab_remote_scan() -> Result<()> { since_commit: None, branch: None, }, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: false, @@ -274,7 +274,7 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), no_inline_ignore: false, }; diff --git a/tests/int_redact.rs b/tests/int_redact.rs index 6772af0..7d7accb 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -116,7 +116,7 @@ async fn test_redact_hashes_finding_values() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), no_inline_ignore: false, }; diff --git a/tests/int_slack.rs b/tests/int_slack.rs index 1c7661e..5f2d01a 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -102,7 +102,7 @@ impl TestContext { since_commit: None, branch: None, }, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, @@ -248,7 +248,7 @@ async fn test_scan_slack_messages() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), no_inline_ignore: false, }; diff --git 
a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index c6ff0f5..74cf6e4 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -195,7 +195,7 @@ async fn test_validation_cache_and_depvars() -> Result<()> { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), no_inline_ignore: false, }; diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 2645a43..464377a 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -138,7 +138,7 @@ impl TestContext { skip_regex: Vec::new(), skip_word: Vec::new(), no_base64: false, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), no_inline_ignore: false, }; @@ -229,7 +229,7 @@ impl TestContext { since_commit: None, branch: None, }, - compat_ignore_comments: false, + extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, extraction_depth: 2, From 5241ed4d8b96e1317a8f766c52a576cab230f098 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Fri, 10 Oct 2025 16:25:26 -0700 Subject: [PATCH 6/8] - Added a --no-ignore CLI flag to disable inline directives when you need every potential secret reported - Added: repeatable --ignore-comment flag to reuse inline directives from other scanners (for example NOSONAR, kics-scan ignore, gitleaks:allow, etc) --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index baba4b9..80c388b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,7 @@ All notable changes to this project will be documented in this file. 
## [v1.57.0] - Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling - Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported -- Added: repeatable `--ignore-comment ` flag to reuse inline directives from other scanners (for example `NOSONAR`, - `kics-scan ignore`, `gitleaks:allow`, etc) +- Added: repeatable `--ignore-comment ` flag to reuse inline directives from other scanners (for example `NOSONAR`, `kics-scan ignore`, `gitleaks:allow`, etc) ## [v1.56.0] - Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined From 4c952bf1bfeb0092b2c1a1e263dc6e78cd2739d1 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 11 Oct 2025 12:36:35 -0700 Subject: [PATCH 7/8] Respect user color settings in update messages by using the same color helper as the main reporter, ensuring consistent output and no ANSI codes on update check, when color is disabled --- CHANGELOG.md | 1 + README.md | 31 ++++++++++++++++--------------- src/update.rs | 19 ++++++++++--------- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80c388b..1be8150 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file. 
- Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling - Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported - Added: repeatable `--ignore-comment ` flag to reuse inline directives from other scanners (for example `NOSONAR`, `kics-scan ignore`, `gitleaks:allow`, etc) +- Respect user color settings in update messages by using the same color helper as the main reporter, ensuring consistent output and no ANSI codes on update check, when color is disabled ## [v1.56.0] - Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined diff --git a/README.md b/README.md index d2eea9a..5ccff85 100644 --- a/README.md +++ b/README.md @@ -977,21 +977,6 @@ The `--confidence` flag sets a minimum confidence threshold, not an exact match. Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. -### Inline ignore directives - -Add `kingfisher:ignore` (or `kingfisher:allow`) anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value: - -```python -# kingfisher:ignore -API_KEY = """ -line 1 -line 2 -""" -# kingfisher:ignore -``` - -Kingfisher searches the surrounding lines for these tokens without requiring language-specific comment markers. To reuse existing inline directives from other scanners, add them with repeatable `--ignore-comment` flags (for example `--ignore-comment "gitleaks:allow" --ignore-comment "NOSONAR"`). Use `--no-ignore` when you want to disable inline suppressions entirely. 
- With `--skip-regex`, these should be Rust compatible regular expressions, which you can test out at [regex101](https://regex101.com) ```bash @@ -1011,6 +996,22 @@ kingfisher scan \ If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early. +### Inline ignore directives + +Add `kingfisher:ignore` (or `kingfisher:allow`) anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value: + +```python +# kingfisher:ignore +API_KEY = """ +line 1 +line 2 +""" +# kingfisher:ignore +``` + +Kingfisher searches the surrounding lines for these tokens without requiring language-specific comment markers. To reuse existing inline directives from other scanners, add them with repeatable `--ignore-comment` flags (for example `--ignore-comment "gitleaks:allow" --ignore-comment "NOSONAR"`). Use `--no-ignore` when you want to disable inline suppressions entirely. + + ## Finding Fingerprint The document below details the four-field formula (rule SHA-1, origin label, start & end offsets) hashed with XXH3-64 to create Kingfisher’s 64-bit finding fingerprint, and explains how this ID powers safe deduplication; plus how `--no-dedup` can be used shows every raw match. diff --git a/src/update.rs b/src/update.rs index 70001c3..30e57b4 100644 --- a/src/update.rs +++ b/src/update.rs @@ -36,12 +36,13 @@ pub fn check_for_update(global_args: &GlobalArgs, base_url: Option<&str>) -> Opt return None; } - // Decide once whether we want coloured output. - let use_color = std::io::stderr().is_terminal() && !global_args.quiet; + // Respect the user's color preferences when printing update + // by delegating to the same helper used by the main reporter logic. 
This keeps + // the update checker in sync with the rest of the application and avoids + // emitting raw ANSI escape codes when colour output has been disabled. + let use_color = !global_args.quiet && global_args.use_color(std::io::stderr()); let styles = Styles::new(use_color); - // info!("{}", "Checking for updates…"); - let mut builder = Update::configure(); builder .repo_owner("mongodb") @@ -113,22 +114,22 @@ pub fn check_for_update(global_args: &GlobalArgs, base_url: Option<&str>) -> Opt if curr > latest { let plain = format!("Running Kingfisher {curr} which is newer than latest released {latest}"); - info!("{}", plain); + info!("{}", styled_heading(&styles, plain.as_str())); return Some(plain); } // else fall through to Case 3 (latest > running) } // ───────────── Case 3: latest > running ───────────── - let plain = format!("NEW KINGFISHER RELEASE {} AVAILABLE", release.version); - info!("{}", plain); + let plain = format!("New Kingfisher release {} available", release.version); + info!("{}", styled_heading(&styles, plain.as_str())); // Attempt self‑update when allowed and feasible. 
if global_args.self_update { match updater.update() { Ok(status) => { - let message = format!("UPDATED TO VERSION {}", status.version()); - info!("{}", message); + let message = format!("Updated to version {}", status.version()); + info!("{}", styled_heading(&styles, message.as_str())); } Err(e) => match e { UpdError::Io(ref io_err) => match io_err.kind() { From 9f137276663b804596d8ba11afae2248d79e151a Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 11 Oct 2025 15:27:21 -0700 Subject: [PATCH 8/8] kingfisher:ignore is only directive built-in --- README.md | 2 +- src/inline_ignore.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5ccff85..d77d01e 100644 --- a/README.md +++ b/README.md @@ -998,7 +998,7 @@ If a `--skip-regex` regular expression fails to compile, the scan aborts with an ### Inline ignore directives -Add `kingfisher:ignore` (or `kingfisher:allow`) anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value: +Add `kingfisher:ignore` anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value: ```python # kingfisher:ignore diff --git a/src/inline_ignore.rs b/src/inline_ignore.rs index d288076..22f47ea 100644 --- a/src/inline_ignore.rs +++ b/src/inline_ignore.rs @@ -11,7 +11,7 @@ impl InlineIgnoreConfig { /// /// * `additional_tokens` - inline ignore directives supplied by the user. 
pub fn new(additional_tokens: &[String]) -> Self { - let mut tokens = vec![b"kingfisher:ignore".to_vec(), b"kingfisher:allow".to_vec()]; + let mut tokens = vec![b"kingfisher:ignore".to_vec()]; for token in additional_tokens { let trimmed = token.trim(); @@ -285,9 +285,9 @@ mod tests { #[test] fn detects_directives_in_lines() { - let tokens = vec![b"kingfisher:ignore".to_vec(), b"kingfisher:allow".to_vec()]; + let tokens = vec![b"kingfisher:ignore".to_vec()]; assert!(line_has_directive(b"secret # kingfisher:ignore", &tokens)); - assert!(line_has_directive(b"kingfisher:allow before value", &tokens)); + assert!(line_has_directive(b"kingfisher:ignore before value", &tokens)); assert!(line_has_directive(b"value // Gitleaks:Allow", &[b"gitleaks:allow".to_vec()])); assert!(!line_has_directive(b"secret", &tokens)); }