Merge pull request #329 from mongodb/development

This commit is contained in:
Mick Grove 2026-04-09 22:26:14 -07:00 committed by GitHub
commit be0ce3bae0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
35 changed files with 448 additions and 358 deletions

View file

@ -3,7 +3,8 @@
All notable changes to this project will be documented in this file.
## [v1.95.0]
- Added 80+ built-in rules, bringing the bundled ruleset to 820 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more.
- Fixed scan performance regression: the rule profiler was unconditionally active even without `--rule-stats`, causing RwLock contention across scan threads. Scans are now ~15% faster than v1.94.0.
- Added 80+ built-in rules, bringing the bundled ruleset to 825 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more.
- Replaced tree-sitter with a lighter parser-based context verifier built from handwritten lexers plus `tl`/`cssparser`, preserving context-dependent matching while cutting about 19 MB from the release binary.
- Added a `validation: type: Raw` exception path for provider-specific checks, with new raw validators for Azure Batch, FTP, Kraken, LDAP, RabbitMQ, and Redis. Also added stable request-scoped template values plus new Liquid filters for HMAC-SHA384 hex output and timestamp generation.
- Expanded live validation coverage for several built-in rules, including Agora, Bitfinex, DocuSign, Dwolla, GitLab, KuCoin, RingCentral, Snowflake, Tableau, Trello, and Webex. Also tightened newly added helper regex to avoid high-match scan regressions, and made preflight-blocked raw validations report as skipped/not attempted instead of failed.

View file

@ -7,7 +7,7 @@
<img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" alt="License" style="height: 24px;" />
</a>
<a href="https://github.com/mongodb/kingfisher">
<img src="https://img.shields.io/badge/Detection%20Rules-821-2ea043.svg" alt="Detection Rules" style="height: 24px;" />
<img src="https://img.shields.io/badge/Detection%20Rules-825-2ea043.svg" alt="Detection Rules" style="height: 24px;" />
</a>
<br>
<a href="https://github.com/mongodb/kingfisher/pkgs/container/kingfisher">

View file

@ -4,7 +4,7 @@ rules:
pattern: |
(?x)
\b
(?i:client[_-]?token|access[_-]?token)
(?i:client[_-]?token)
\s*=\s*
(
akab-
@ -18,9 +18,51 @@ rules:
visible: false
examples:
- "client-token=akab-sXedJBTOf0dHl27vVOd"
- "access-token=akab-xcsuUf7DY6xJgYscoe"
- "client_token = akab-c113ntt0k3n4qtari252bfxxbsl-yvsdj"
references:
- https://developer.akamai.com/api/getting-started
- https://techdocs.akamai.com/developer/docs/set-up-authentication-credentials
- name: Akamai API Access Token
id: kingfisher.akamai.3
pattern: |
(?x)
\b
(?i:access[_-]?token)
\s*=\s*
(
akab-
[A-Za-z0-9_-]{18,36}
)
\b
pattern_requirements:
min_digits: 2
min_entropy: 3.0
confidence: medium
visible: false
examples:
- "access-token=akab-xcsuUf7DY6xJgYscoe"
- "access_token = akab-acc35t0k3nodujqunph3w7hzp7-gtm6ij"
references:
- https://techdocs.akamai.com/developer/docs/set-up-authentication-credentials
- name: Akamai API Host
id: kingfisher.akamai.4
pattern: |
(?x)
\b
(?i:host)
\s*=\s*
(
[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.akamaiapis\.net
)
\b
min_entropy: 2.0
confidence: medium
visible: false
examples:
- "host = akab-h05tnam3wl42son7nktnlnnx-kbob3i3v.luna.akamaiapis.net"
references:
- https://techdocs.akamai.com/developer/docs/set-up-authentication-credentials
- name: Akamai API Client Secret
id: kingfisher.akamai.2
@ -32,7 +74,6 @@ rules:
(
[A-Za-z0-9+/=-]{20,36}
)
\b
pattern_requirements:
min_digits: 2
min_uppercase: 2
@ -41,7 +82,43 @@ rules:
examples:
- "client-secret=431733QPkO-TgxBHBnH6alirfy"
- "client-mysecret=BcDeFgHi2345JKLM67nopqRS"
- "client_secret = C113nt53KR3TN6N90yVuAgICxIRwsObLi0E67/N8eRN="
references:
- https://developer.akamai.com/api/getting-started
# No simple validation: Akamai API uses HMAC-SHA256 EdgeGrid signing
# with the client token, secret, and access token together.
- https://techdocs.akamai.com/developer/docs/set-up-authentication-credentials
- https://techdocs.akamai.com/iam-api/reference/api-client-self
- https://techdocs.akamai.com/developer/docs/make-your-first-api-call
validation:
type: Http
content:
request:
method: GET
url: https://{{ HOST }}/identity-management/v3/api-clients/self
headers:
Accept: application/json
Authorization: |
{%- assign timestamp = "" | iso_timestamp_no_frac | replace: "-", "" | replace: "Z", "+0000" -%}
{%- assign nonce = "" | uuid -%}
{%- assign auth_data = "EG1-HMAC-SHA256 client_token=" | append: CLIENT_TOKEN | append: ";access_token=" | append: ACCESS_TOKEN | append: ";timestamp=" | append: timestamp | append: ";nonce=" | append: nonce | append: ";" -%}
{%- capture data_to_sign -%}GET https {{ HOST }} /identity-management/v3/api-clients/self {{ auth_data }}{%- endcapture -%}
{%- assign signing_key = timestamp | hmac_sha256: TOKEN -%}
{%- assign signature = data_to_sign | hmac_sha256: signing_key -%}
{{ auth_data }}signature={{ signature }}
response_matcher:
- report_response: true
- type: StatusMatch
status: [200]
- type: StatusMatch
status: [401, 403]
negative: true
- type: JsonValid
depends_on_rule:
- rule_id: kingfisher.akamai.1
variable: CLIENT_TOKEN
- rule_id: kingfisher.akamai.3
variable: ACCESS_TOKEN
- rule_id: kingfisher.akamai.4
variable: HOST
# No revocation yet: Akamai documents self-deactivation in IAM v2 and
# client-scoped deactivation in IAM v3, but the v3 docs available here do not
# clearly expose a safe single-credential self flow we can derive from the
# leaked tuple without risking broader API-client deactivation.

View file

@ -17,3 +17,19 @@ rules:
- 'BLOCK_PROTOCOL_API_KEY=b10ck5.AbCdEfGhIjKlMnOpQrStUvWxYz1234.AbCdEfGhIjKlMnOpQrStUvWxYz12345678'
references:
- https://blockprotocol.org/docs/hub/api
validation:
type: Http
content:
request:
method: GET
url: https://blockprotocol.org/api/blocks
headers:
x-api-key: "{{ TOKEN }}"
Accept: application/json
response_matcher:
- report_response: true
- type: StatusMatch
status: [200]
- type: JsonValid
# No programmatic revocation endpoint is documented; Block Protocol
# directs users to revoke keys from the Account > API Keys page.

View file

@ -1,9 +1,45 @@
rules:
- name: Blynk Cloud Host
id: kingfisher.blynk.10
pattern: |
(?xi)
https://
(
(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud
)
/
min_entropy: 2.5
confidence: medium
visible: false
examples:
- https://blynk.cloud/external/api/get?token=Ab12Cd34Ef56Gh78Ij90Kl12Mn34Op56&V1
- https://fra1.blynk.cloud/api/organization/profile
references:
- https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication
- name: Blynk OAuth Client ID
id: kingfisher.blynk.11
pattern: |
(?xi)
\b
(
oa2-client-id_[A-Z0-9_\-]{32}
)
\b
min_entropy: 3.0
confidence: medium
visible: false
examples:
- oa2-client-id_Ab12Cd34Ef56Gh78Ij90Kl12Mn34Op56
- client_id=oa2-client-id_Zx98Yw76Vu54Ts32Rq10Po98Nm76Lk54
references:
- https://docs.blynk.io/en/blynk.console/settings/developers/oauth2
- name: Blynk Device Access Token
id: kingfisher.blynk.1
pattern: |
(?xi)
https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/external/api/[A-Z0-9/]*\?token=
https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud/external/api/[A-Z0-9/]*\?token=
([A-Z0-9_\-]{32})
&
pattern_requirements:
@ -19,11 +55,30 @@ rules:
- curl "https://blynk.cloud/external/api/update/property?token=Rps15JICmtRVbFyS_95houlLbm6xIQ2L&pin=v1&isDisabled=true"
references:
- https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication
- https://docs.blynk.io/en/blynk.cloud/device-https-api/is-device-connected
depends_on_rule:
- rule_id: kingfisher.blynk.10
variable: HOST
validation:
type: Http
content:
request:
method: GET
url: https://{{ HOST }}/external/api/isHardwareConnected?token={{ TOKEN | url_encode }}
headers:
Accept: application/json
response_matcher:
- report_response: true
- type: StatusMatch
status: [200]
- type: WordMatch
words: ["true", "false"]
# No documented API endpoint is provided to revoke device auth tokens.
- name: Blynk Organization Access Token
id: kingfisher.blynk.2
pattern: |
(?xi)
https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/api/[A-Z0-9_\-\s/\\]*
https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud/api/[A-Z0-9_\-\s/\\]*
-H\s*"Authorization:\s*Bearer\s*
([A-Z0-9_\-]{40})
"
@ -40,13 +95,35 @@ rules:
-H "Authorization: Bearer eIdWHQqRfFmvP5LDDh-IGxPUzi7I27HthzCPAVmS"
references:
- https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication
- https://docs.blynk.io/en/blynk.cloud/platform-https-api/ogranization-api/get-own-organization-info
depends_on_rule:
- rule_id: kingfisher.blynk.10
variable: HOST
validation:
type: Http
content:
request:
method: GET
url: https://{{ HOST }}/api/v1/organization/profile
headers:
Authorization: Bearer {{ TOKEN }}
Accept: application/json
response_matcher:
- report_response: true
- type: StatusMatch
status: [200]
- type: JsonValid
- type: WordMatch
match_all_words: true
words: ['"id"', '"name"']
# No documented API endpoint is provided to revoke organization access tokens.
- name: Blynk Organization Access Token
id: kingfisher.blynk.3
pattern: |
(?xi)
-H\s*"Authorization:\s*Bearer\s*
([A-Z0-9_\-]{40})
"[\s\\]*https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/api
"[\s\\]*https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud/api
pattern_requirements:
min_digits: 2
min_uppercase: 1
@ -60,12 +137,34 @@ rules:
https://fra1.blynk.cloud/api/organization/profile
references:
- https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication
- https://docs.blynk.io/en/blynk.cloud/platform-https-api/ogranization-api/get-own-organization-info
depends_on_rule:
- rule_id: kingfisher.blynk.10
variable: HOST
validation:
type: Http
content:
request:
method: GET
url: https://{{ HOST }}/api/v1/organization/profile
headers:
Authorization: Bearer {{ TOKEN }}
Accept: application/json
response_matcher:
- report_response: true
- type: StatusMatch
status: [200]
- type: JsonValid
- type: WordMatch
match_all_words: true
words: ['"id"', '"name"']
# No documented API endpoint is provided to revoke organization access tokens.
- name: Blynk Organization Client Credentials
id: kingfisher.blynk.8
pattern: |
(?xi)
https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/oauth2/[A-Z0-9_\-\s/\\?=&]*
(oa2-client-id_[A-Z0-9_\-]{32})
https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud/oauth2/[A-Z0-9_\-\s/\\?=&]*
oa2-client-id_[A-Z0-9_\-]{32}
(?: : | &client_secret= )
([A-Z0-9_\-]{40})
pattern_requirements:
@ -82,14 +181,38 @@ rules:
- 'curl -X POST https://fra1.blynk.cloud/oauth2/token?grant_type=client_credentials&client_id=oa2-client-id_zmNtW-D0Toqpz4AZnBLCIlklBrz9ynU-&client_secret=5uC5Y4Mcvdl5rB56rBmxnvB4DZgiIpcyTPbOoEWp'
references:
- https://docs.blynk.io/en/blynk.console/settings/developers/oauth2
- https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication
depends_on_rule:
- rule_id: kingfisher.blynk.10
variable: HOST
- rule_id: kingfisher.blynk.11
variable: CLIENT_ID
validation:
type: Http
content:
request:
method: POST
url: https://{{ HOST }}/oauth2/token?grant_type=client_credentials
headers:
Authorization: 'Basic {{ CLIENT_ID | append: ":" | append: TOKEN | b64enc }}'
Accept: application/json
response_matcher:
- report_response: true
- type: StatusMatch
status: [200]
- type: JsonValid
- type: WordMatch
match_all_words: true
words: ['"access_token"', '"token_type"']
# No documented API endpoint is provided to revoke OAuth client credentials.
- name: Blynk Organization Client Credentials
id: kingfisher.blynk.9
pattern: |
(?xi)
\b
(oa2-client-id_[A-Z0-9_\-]{32})
oa2-client-id_[A-Z0-9_\-]{32}
:([A-Z0-9_\-]{40})
[\s\\]*https://(fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/oauth2
[\s\\]*https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud/oauth2
pattern_requirements:
min_digits: 2
min_uppercase: 1
@ -102,4 +225,28 @@ rules:
curl -X POST -u oa2-client-id_zmNtW-D0Toqpz4AZnBLCIlklBrz9ynU-:5uC5Y4Mcvdl5rB56rBmxnvB4DZgiIpcyTPbOoEWp \
https://fra1.blynk.cloud/oauth2/token?grant_type=client_credentials
references:
- https://docs.blynk.io/en/blynk.console/settings/developers/oauth2
- https://docs.blynk.io/en/blynk.console/settings/developers/oauth2
- https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication
depends_on_rule:
- rule_id: kingfisher.blynk.10
variable: HOST
- rule_id: kingfisher.blynk.11
variable: CLIENT_ID
validation:
type: Http
content:
request:
method: POST
url: https://{{ HOST }}/oauth2/token?grant_type=client_credentials
headers:
Authorization: 'Basic {{ CLIENT_ID | append: ":" | append: TOKEN | b64enc }}'
Accept: application/json
response_matcher:
- report_response: true
- type: StatusMatch
status: [200]
- type: JsonValid
- type: WordMatch
match_all_words: true
words: ['"access_token"', '"token_type"']
# No documented API endpoint is provided to revoke OAuth client credentials.

View file

@ -31,3 +31,6 @@ rules:
references:
- https://developer.paypal.com/braintree/docs/guides/authorization/tokenization-key
- https://developer.paypal.com/braintree/docs/start/overview
# No standalone validation or revocation is added: Braintree documents
# tokenization keys as publishable client credentials, and the public docs
# do not describe a programmatic tokenization-key introspection or revoke API.

View file

@ -18,3 +18,6 @@ rules:
references:
- https://www.canva.dev/docs/connect/authentication/
- https://www.canva.dev/docs/connect/guidelines/security/
# No standalone validation is added: Canva Connect client secrets are used
# in OAuth flows that require the matching client_id and PKCE flow state.
# No programmatic client-secret revocation endpoint is documented.

View file

@ -21,4 +21,6 @@ rules:
- "carto_api_key = 'eyJhbGciOiJIUzI1NiJ9.eyJhIjoiYWNfY2RlZmdoIiwianRpIjoiYWI2MTIzNDUifQ.BcDeFgHiJkLmNoPqRsTuVwXy'"
references:
- https://api-docs.carto.com
# No public validation endpoint documented for CARTO JWT tokens.
# No standalone validation or revocation is added: CARTO's documented APIs
# use account- and deployment-specific endpoints, and the public docs do not
# describe a token self-introspection or self-revocation API for these JWTs.

View file

@ -17,3 +17,6 @@ rules:
- 'sv_licenseKey "cfxk_AbCdEfGhIjKlMnOpQrStUvWxYz1234567890_abcdef"'
references:
- https://docs.fivem.net/docs/server-manual/setting-up-a-server/
# No standalone validation or revocation is added: the public FiveM/Cfx.re
# docs describe configuring server keys, but do not document a safe API for
# verifying or revoking a key using the key itself.

View file

@ -23,3 +23,6 @@ rules:
- CLAY_API_KEY=d8dfd14ec83e4e17a7d2
references:
- https://university.clay.com/docs/http-api-integration-overview
# No standalone validation or revocation is added: the public Clay docs
# referenced here do not document an authenticated API endpoint for
# introspecting or revoking these keys.

View file

@ -22,10 +22,10 @@ rules:
- name: Clojars API Token
id: kingfisher.clojars.2
pattern: |
(?xi)
(?x)
\b
(
CLOJARS_[a-z0-9]{60}
CLOJARS_[a-zA-Z0-9]{60}
)
\b
pattern_requirements:
@ -36,4 +36,7 @@ rules:
- "CLOJARS_7a0b5c647fdca75616e99a5629ba28955e53faf6391b1b4461ccd972b0e2"
references:
- https://github.com/clojars/clojars-web/wiki/API
- https://github.com/clojars/clojars-web/wiki/Tutorial-for-library-authors
- https://github.com/clojars/clojars-web/wiki/Tutorial-for-library-authors
# No standalone validation or revocation is added: Clojars deploy tokens are
# used via Basic auth for artifact uploads, and the public docs/source do not
# expose a safe read-only token introspection or token self-revocation API.

View file

@ -16,4 +16,34 @@ rules:
- 'Bearer: ciotgp8BGZBlX192iExSQPm0SrUlBunG8zd'
references:
- https://blog.rust-lang.org/2023/09/22/crates-io-usage-policy-rfc
- https://github.com/rust-lang/crates.io/tree/main/src
- https://github.com/rust-lang/crates.io/tree/main/src
- https://doc.rust-lang.org/cargo/reference/registry-web-api.html
validation:
type: Http
content:
request:
method: GET
url: https://crates.io/api/v1/me
headers:
Authorization: "{{ TOKEN }}"
Accept: application/json
response_matcher:
- report_response: true
- type: StatusMatch
status: [200]
- type: JsonValid
- type: WordMatch
match_all_words: true
words: ['"id"', '"login"']
revocation:
type: Http
content:
request:
method: DELETE
url: https://crates.io/api/v1/tokens/current
headers:
Authorization: "{{ TOKEN }}"
response_matcher:
- report_response: true
- type: StatusMatch
status: [204]

View file

@ -43,6 +43,9 @@
.kf-hero__actions .md-button {
color: var(--md-primary-bg-color);
border-color: var(--md-primary-bg-color);
font-size: 0.95rem;
font-weight: 600;
padding: 0.8em 1.4em;
}
.kf-hero__actions .md-button--primary {
@ -152,6 +155,17 @@
border-radius: 0.5rem;
}
.kf-benchmark__link {
margin-top: 1rem;
text-align: center;
}
.kf-benchmark__link a {
display: inline-block;
font-size: 0.95rem;
font-weight: 600;
}
/* Scan targets */
.kf-targets {
padding: 4rem 0;
@ -220,6 +234,12 @@
margin-top: 2rem;
}
.kf-install__cta .md-button {
font-size: 0.95rem;
font-weight: 600;
padding: 0.8em 1.4em;
}
/* ===== Rules Table ===== */
.rules-search {

View file

@ -8,7 +8,8 @@ description: "Kingfisher release history: new features, rules, bug fixes, and im
All notable changes to this project will be documented in this file.
## [v1.95.0]
- Added 80+ built-in rules, bringing the bundled ruleset to 820 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more.
- Fixed scan performance regression: the rule profiler was unconditionally active even without `--rule-stats`, causing RwLock contention across scan threads. Scans are now ~15% faster than v1.94.0.
- Added 80+ built-in rules, bringing the bundled ruleset to 825 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more.
- Replaced tree-sitter with a lighter parser-based context verifier built from handwritten lexers plus `tl`/`cssparser`, preserving context-dependent matching while cutting about 19 MB from the release binary.
- Added a `validation: type: Raw` exception path for provider-specific checks, with new raw validators for Azure Batch, FTP, Kraken, LDAP, RabbitMQ, and Redis. Also added stable request-scoped template values plus new Liquid filters for HMAC-SHA384 hex output and timestamp generation.
- Expanded live validation coverage for several built-in rules, including Agora, Bitfinex, DocuSign, Dwolla, GitLab, KuCoin, RingCentral, Snowflake, Tableau, Trello, and Webex. Also tightened newly added helper regex to avoid high-match scan regressions, and made preflight-blocked raw validations report as skipped/not attempted instead of failed.

View file

@ -1,6 +1,6 @@
---
title: "Access Map (Blast Radius)"
description: "Map the blast radius of leaked credentials. Authenticate against 39 providers to enumerate accessible resources and permissions."
description: "Map the blast radius of leaked credentials by authenticating and enumerating accessible resources and permissions."
---
# Access Map: supported tokens & credential formats

View file

@ -1,6 +1,6 @@
---
title: "Source Code Parsing"
description: "Language-aware secret detection using tree-sitter parsing for 13+ languages including Python, JavaScript, Go, Rust, and more."
description: "Language-aware secret detection using lightweight parser-based context verification across 16 supported source and config languages."
---
# Kingfisher Source Code Parsing
@ -13,7 +13,7 @@ The implementation favors lightweight extractors over full AST parsing:
- **`tl`** for HTML — attribute values, element text, and embedded `<script>` / `<style>` delegation
- **`cssparser`** for CSS — declaration parsing via Mozilla's CSS tokenizer
> **History:** Earlier versions used tree-sitter with 17 statically-linked
> **History:** Earlier parser implementations relied on 17 statically-linked
> grammar crates. This added ~20 MB to the binary and required building a
> full syntax tree just to extract assignment pairs. The current lexer-based
> approach achieves the same extraction quality with near-zero binary overhead

View file

@ -1,6 +1,6 @@
---
title: "Secret Revocation"
description: "Revoke compromised credentials directly from the CLI. Supports 20+ providers including GitHub, GitLab, Slack, AWS, and GCP."
description: "Revoke compromised credentials directly from the CLI using built-in provider-specific revocation flows."
---
# Revocation Support Matrix
@ -8,8 +8,8 @@ description: "Revoke compromised credentials directly from the CLI. Supports 20+
Kingfisher supports direct secret revocation through rule-level `revocation:` blocks.
Current coverage in built-in rules:
- `23` provider families
- `39` revocation-enabled rules
- `28` provider families
- `46` revocation-enabled rules
Use `kingfisher revoke --rule <rule-id> <secret>` to invoke these flows. See [USAGE.md](../usage/basic-scanning.md#direct-secret-revocation-with-kingfisher-revoke) for command details.
@ -22,12 +22,15 @@ Use `kingfisher revoke --rule <rule-id> <secret>` to invoke these flows. See [US
| `buildkite` | 1 | `kingfisher.buildkite.1` |
| `cloudflare` | 1 | `kingfisher.cloudflare.1` |
| `confluent` | 2 | `kingfisher.confluent.2`, `kingfisher.confluent.3` |
| `cratesio` | 1 | `kingfisher.cratesio.1` |
| `deviantart` | 1 | `kingfisher.deviantart.1` |
| `doppler` | 6 | `kingfisher.doppler.1`, `kingfisher.doppler.2`, `kingfisher.doppler.3`, `kingfisher.doppler.4`, `kingfisher.doppler.5`, `kingfisher.doppler.6` |
| `gcp` | 1 | `kingfisher.gcp.1` |
| `github` | 3 | `kingfisher.github.1`, `kingfisher.github.2`, `kingfisher.github.5` |
| `gitlab` | 2 | `kingfisher.gitlab.1`, `kingfisher.gitlab.4` |
| `google` | 2 | `kingfisher.google.4`, `kingfisher.google.oauth2.1` |
| `harness` | 1 | `kingfisher.harness.pat.1` |
| `heroku` | 2 | `kingfisher.heroku.1`, `kingfisher.heroku.2` |
| `mapbox` | 1 | `kingfisher.mapbox.2` |
| `mongodb` | 1 | `kingfisher.mongodb.1` |
| `npm` | 2 | `kingfisher.npm.1`, `kingfisher.npm.2` |
@ -40,6 +43,8 @@ Use `kingfisher revoke --rule <rule-id> <secret>` to invoke these flows. See [US
| `twitch` | 1 | `kingfisher.twitch.1` |
| `unkey` | 1 | `kingfisher.unkey.2` |
| `vercel` | 5 | `kingfisher.vercel.1`, `kingfisher.vercel.2`, `kingfisher.vercel.3`, `kingfisher.vercel.4`, `kingfisher.vercel.5` |
| `vonage` | 1 | `kingfisher.vonage.2` |
| `vultr` | 1 | `kingfisher.vultr.1` |
## Notes

View file

@ -2,7 +2,7 @@
title: Kingfisher — Open Source Secret Scanner with Live Validation
description: >-
Kingfisher is an open source secret scanner with live validation, blast radius
mapping, and credential revocation. 734+ detection rules. Built in Rust by MongoDB.
mapping, and credential revocation. 825 detection rules. Built in Rust by MongoDB.
template: home.html
hide:
- navigation

View file

@ -268,7 +268,7 @@ flowchart TD
### Loading Builtin Rules
Kingfisher comes with 800+ builtin rules for common secret types:
Kingfisher currently ships with 825 built-in rules for common secret types:
```rust
use kingfisher_rules::{get_builtin_rules, Confidence};

View file

@ -1,13 +1,13 @@
---
title: "Built-in Rules List"
description: "Complete list of all 821 built-in secret detection rules in Kingfisher. Searchable and filterable by provider, confidence level, and validation support."
description: "Complete list of all 825 built-in secret detection rules in Kingfisher. Searchable and filterable by provider, confidence level, and validation support."
---
# Built-in Rules
Kingfisher ships with **821 detection rules** across **510 providers**
(721 detectors + 100 dependent rules).
Of these, **513** include live validation and **45** support direct revocation.
Kingfisher ships with **825 detection rules** across **510 providers**
(719 detectors + 106 dependent rules).
Of these, **521** include live validation and **46** support direct revocation.
!!! tip "Search"
Use the search box below to filter rules by provider name, rule ID, or confidence level.
@ -200,6 +200,22 @@ Of these, **513** include live validation and **45** support direct revocation.
<td>Akamai API Client Secret</td>
<td><code>kingfisher.akamai.2</code></td>
<td>Medium</td>
<td>Yes</td>
<td></td>
</tr>
<tr>
<td>Akamai</td>
<td>Akamai API Access Token</td>
<td><code>kingfisher.akamai.3</code></td>
<td>Medium</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Akamai</td>
<td>Akamai API Host</td>
<td><code>kingfisher.akamai.4</code></td>
<td>Medium</td>
<td></td>
<td></td>
</tr>
@ -920,7 +936,7 @@ Of these, **513** include live validation and **45** support direct revocation.
<td>Block Protocol API Key</td>
<td><code>kingfisher.blockprotocol.1</code></td>
<td>High</td>
<td></td>
<td>Yes</td>
<td></td>
</tr>
<tr>
@ -928,6 +944,22 @@ Of these, **513** include live validation and **45** support direct revocation.
<td>Blynk Device Access Token</td>
<td><code>kingfisher.blynk.1</code></td>
<td>Medium</td>
<td>Yes</td>
<td></td>
</tr>
<tr>
<td>Blynk</td>
<td>Blynk Cloud Host</td>
<td><code>kingfisher.blynk.10</code></td>
<td>Medium</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Blynk</td>
<td>Blynk OAuth Client ID</td>
<td><code>kingfisher.blynk.11</code></td>
<td>Medium</td>
<td></td>
<td></td>
</tr>
@ -936,7 +968,7 @@ Of these, **513** include live validation and **45** support direct revocation.
<td>Blynk Organization Access Token</td>
<td><code>kingfisher.blynk.2</code></td>
<td>Medium</td>
<td></td>
<td>Yes</td>
<td></td>
</tr>
<tr>
@ -944,7 +976,7 @@ Of these, **513** include live validation and **45** support direct revocation.
<td>Blynk Organization Access Token</td>
<td><code>kingfisher.blynk.3</code></td>
<td>Medium</td>
<td></td>
<td>Yes</td>
<td></td>
</tr>
<tr>
@ -952,7 +984,7 @@ Of these, **513** include live validation and **45** support direct revocation.
<td>Blynk Organization Client Credentials</td>
<td><code>kingfisher.blynk.8</code></td>
<td>Medium</td>
<td></td>
<td>Yes</td>
<td></td>
</tr>
<tr>
@ -960,7 +992,7 @@ Of these, **513** include live validation and **45** support direct revocation.
<td>Blynk Organization Client Credentials</td>
<td><code>kingfisher.blynk.9</code></td>
<td>Medium</td>
<td></td>
<td>Yes</td>
<td></td>
</tr>
<tr>
@ -1440,8 +1472,8 @@ Of these, **513** include live validation and **45** support direct revocation.
<td>crates.io API Key</td>
<td><code>kingfisher.cratesio.1</code></td>
<td>Medium</td>
<td></td>
<td></td>
<td>Yes</td>
<td>Yes</td>
</tr>
<tr>
<td>Credentials</td>

View file

@ -300,7 +300,7 @@ kingfisher scan ./my-project \
## Custom Rules
Kingfisher ships with 800+ rules, but you may want to add your own custom rules or modify existing detection to better suit your needs.
Kingfisher currently ships with 825 built-in rules, but you may want to add your own custom rules or modify existing detection to better suit your needs.
First, review [RULES.md](../rules/overview.md) to learn how to create custom Kingfisher rules.

View file

@ -1,7 +1,7 @@
site_name: Kingfisher
site_url: https://mongodb.github.io/kingfisher
site_description: >-
Open source secret scanner with live validation. 734+ detection rules,
Open source secret scanner with live validation. 825 detection rules,
blast radius mapping, and credential revocation. Built in Rust by MongoDB.
site_author: MongoDB
repo_url: https://github.com/mongodb/kingfisher

View file

@ -36,7 +36,7 @@
<section class="kf-stats">
<div class="kf-stats__inner md-grid">
<div class="kf-stats__item">
<span class="kf-stats__number">734+</span>
<span class="kf-stats__number">825</span>
<span class="kf-stats__label">Detection Rules</span>
</div>
<div class="kf-stats__item">
@ -48,7 +48,7 @@
<span class="kf-stats__label">Scan Targets</span>
</div>
<div class="kf-stats__item">
<span class="kf-stats__number">20+</span>
<span class="kf-stats__number">28</span>
<span class="kf-stats__label">Revocation Providers</span>
</div>
</div>
@ -90,7 +90,7 @@
<div class="kf-feature">
<h3>Direct Revocation</h3>
<p>
Revoke compromised credentials directly from the CLI for 20+ providers
Revoke compromised credentials directly from the CLI for 28 provider families
including GitHub, GitLab, Slack, AWS, GCP, Heroku, and Cloudflare.
</p>
</div>
@ -107,7 +107,7 @@
<div class="kf-feature">
<h3>Built for Accuracy</h3>
<p>
Tree-sitter language-aware parsing across 13+ languages reduces false positives
Lightweight parser-based context verification across 16 supported languages reduces false positives
at the detection layer. Combined with entropy filtering, checksum verification,
and live validation, Kingfisher delivers high-signal results you can act on.
</p>

View file

@ -7,7 +7,7 @@
"@context": "https://schema.org",
"@type": "SoftwareApplication",
"name": "Kingfisher",
"description": "Open source secret scanner with live validation. 734+ detection rules, blast radius mapping, and credential revocation.",
"description": "Open source secret scanner with live validation. 825 detection rules, blast radius mapping, and credential revocation.",
"applicationCategory": "DeveloperApplication",
"operatingSystem": "Linux, macOS, Windows",
"license": "https://opensource.org/licenses/Apache-2.0",

View file

@ -48,17 +48,17 @@ DOC_MAP = {
"ACCESS_MAP.md": (
"features/access-map.md",
"Access Map (Blast Radius)",
"Map the blast radius of leaked credentials. Authenticate against 39 providers to enumerate accessible resources and permissions.",
"Map the blast radius of leaked credentials by authenticating and enumerating accessible resources and permissions.",
),
"REVOCATION_PROVIDERS.md": (
"features/revocation.md",
"Secret Revocation",
"Revoke compromised credentials directly from the CLI. Supports 20+ providers including GitHub, GitLab, Slack, AWS, and GCP.",
"Revoke compromised credentials directly from the CLI using built-in provider-specific revocation flows.",
),
"PARSING.md": (
"features/parsing.md",
"Source Code Parsing",
"Language-aware secret detection using tree-sitter parsing for 13+ languages including Python, JavaScript, Go, Rust, and more.",
"Language-aware secret detection using lightweight parser-based context verification across 16 supported source and config languages.",
),
"FINGERPRINT.md": (
"features/fingerprints.md",

View file

@ -297,7 +297,7 @@ kingfisher scan ./my-project \
## Custom Rules
Kingfisher ships with 800+ rules, but you may want to add your own custom rules or modify existing detection to better suit your needs.
Kingfisher currently ships with 825 built-in rules, but you may want to add your own custom rules or modify existing detection to better suit your needs.
First, review [RULES.md](RULES.md) to learn how to create custom Kingfisher rules.

View file

@ -4,8 +4,8 @@
Kingfisher starts with a fast regex pass powered by Vectorscan/Hyperscan. For rules classified as `ContextDependent`, it then runs a lightweight parser-based verification pass that extracts likely assignment-style snippets such as `api_key = secret`.
> **Why not tree-sitter?** Earlier versions used tree-sitter for this step.
> Statically linking 17 grammar crates added roughly 20 MB to the binary and
> **Why not a full AST parser?** Earlier implementations used statically linked
> grammar crates for this step. That added roughly 20 MB to the binary and
> required a full AST parse just to extract `key = value` pairs. The current
> approach — handwritten regex-based lexers with comment-aware stripping —
> produces the same (or better) extraction quality at a fraction of the binary

View file

@ -265,7 +265,7 @@ flowchart TD
### Loading Builtin Rules
Kingfisher comes with 800+ builtin rules for common secret types:
Kingfisher currently ships with 825 built-in rules for common secret types:
```rust
use kingfisher_rules::{get_builtin_rules, Confidence};

View file

@ -10,7 +10,7 @@ The implementation favors lightweight extractors over full AST parsing:
- **`tl`** for HTML — attribute values, element text, and embedded `<script>` / `<style>` delegation
- **`cssparser`** for CSS — declaration parsing via Mozilla's CSS tokenizer
> **History:** Earlier versions used tree-sitter with 17 statically-linked
> **History:** Earlier parser implementations relied on 17 statically-linked
> grammar crates. This added ~20 MB to the binary and required building a
> full syntax tree just to extract assignment pairs. The current lexer-based
> approach achieves the same extraction quality with near-zero binary overhead

View file

@ -3,8 +3,8 @@
Kingfisher supports direct secret revocation through rule-level `revocation:` blocks.
Current coverage in built-in rules:
- `23` provider families
- `39` revocation-enabled rules
- `28` provider families
- `46` revocation-enabled rules
Use `kingfisher revoke --rule <rule-id> <secret>` to invoke these flows. See [USAGE.md](USAGE.md#direct-secret-revocation-with-kingfisher-revoke) for command details.
@ -17,12 +17,15 @@ Use `kingfisher revoke --rule <rule-id> <secret>` to invoke these flows. See [US
| `buildkite` | 1 | `kingfisher.buildkite.1` |
| `cloudflare` | 1 | `kingfisher.cloudflare.1` |
| `confluent` | 2 | `kingfisher.confluent.2`, `kingfisher.confluent.3` |
| `cratesio` | 1 | `kingfisher.cratesio.1` |
| `deviantart` | 1 | `kingfisher.deviantart.1` |
| `doppler` | 6 | `kingfisher.doppler.1`, `kingfisher.doppler.2`, `kingfisher.doppler.3`, `kingfisher.doppler.4`, `kingfisher.doppler.5`, `kingfisher.doppler.6` |
| `gcp` | 1 | `kingfisher.gcp.1` |
| `github` | 3 | `kingfisher.github.1`, `kingfisher.github.2`, `kingfisher.github.5` |
| `gitlab` | 2 | `kingfisher.gitlab.1`, `kingfisher.gitlab.4` |
| `google` | 2 | `kingfisher.google.4`, `kingfisher.google.oauth2.1` |
| `harness` | 1 | `kingfisher.harness.pat.1` |
| `heroku` | 2 | `kingfisher.heroku.1`, `kingfisher.heroku.2` |
| `mapbox` | 1 | `kingfisher.mapbox.2` |
| `mongodb` | 1 | `kingfisher.mongodb.1` |
| `npm` | 2 | `kingfisher.npm.1`, `kingfisher.npm.2` |
@ -35,6 +38,8 @@ Use `kingfisher revoke --rule <rule-id> <secret>` to invoke these flows. See [US
| `twitch` | 1 | `kingfisher.twitch.1` |
| `unkey` | 1 | `kingfisher.unkey.2` |
| `vercel` | 5 | `kingfisher.vercel.1`, `kingfisher.vercel.2`, `kingfisher.vercel.3`, `kingfisher.vercel.4`, `kingfisher.vercel.5` |
| `vonage` | 1 | `kingfisher.vonage.2` |
| `vultr` | 1 | `kingfisher.vultr.1` |
## Notes

View file

@ -1,5 +1,4 @@
use std::{
cmp::Ordering,
hash::{Hash, Hasher},
path::PathBuf,
str::FromStr,
@ -49,7 +48,6 @@ pub struct FindingsStore {
rules: Vec<Arc<Rule>>,
matches: Vec<Arc<FindingsStoreMessage>>,
index_map: FxHashMap<(BlobId, OffsetSpan), usize>,
dedup_index_map: FxHashMap<u64, usize>,
blobs: FxHashSet<BlobId>,
clone_dir: PathBuf,
seen_bloom: Bloom<u64>,
@ -77,7 +75,6 @@ impl FindingsStore {
matches: Vec::new(),
blobs: FxHashSet::default(),
index_map: FxHashMap::default(),
dedup_index_map: FxHashMap::default(),
blob_meta: FxHashMap::default(),
origin_meta: FxHashMap::default(),
clone_dir,
@ -117,14 +114,11 @@ impl FindingsStore {
/// Swap in a new list of findings and rebuild every lookup structure derived from it.
///
/// The positional index, the dedup index, and the set of seen blob ids are emptied
/// and then repopulated from `new_matches`, so afterwards they describe exactly the
/// findings that are stored.
pub fn replace_matches(&mut self, new_matches: Vec<Arc<FindingsStoreMessage>>) {
    self.index_map.clear();
    self.dedup_index_map.clear();
    self.blobs.clear();
    self.matches = new_matches;

    for (position, entry) in self.matches.iter().enumerate() {
        let (origin, blob_md, found) = &**entry;
        let blob_id = blob_md.id;
        let dedup_key = self.dedup_key(origin.as_ref(), blob_md.as_ref(), found);

        self.index_map.insert((blob_id, found.location.offset_span), position);
        self.dedup_index_map.insert(dedup_key, position);
        self.blobs.insert(blob_id);
    }
}
@ -159,160 +153,6 @@ impl FindingsStore {
self.dependent_rule_ids.insert(dependency.rule_id.to_uppercase());
}
}
if !self.matches.is_empty() {
self.rebuild_dedup_index_map();
}
}
/// Pick the text that represents a match for deduplication purposes.
///
/// Preference order:
/// 1. the unnamed capture with `match_number == 0`,
/// 2. a capture named `TOKEN`,
/// 3. the first capture in the list,
/// 4. the empty string when the match carries no captures at all.
fn primary_snippet<'a>(m: &'a Match) -> &'a str {
    let captures = &m.groups.captures;

    let chosen = captures
        .iter()
        .find(|cap| cap.name.is_none() && cap.match_number == 0)
        .or_else(|| captures.iter().find(|cap| matches!(cap.name.as_deref(), Some("TOKEN"))))
        .or_else(|| captures.get(0));

    match chosen {
        Some(cap) => cap.raw_value(),
        None => "",
    }
}
/// Compute the 64-bit deduplication key for a finding.
///
/// The key hashes the upper-cased rule id, the kind of the first origin
/// (`git`, `file` or `ext`) and the primary snippet. For rules listed in
/// `dependent_rule_ids` the blob id (hex) is appended as well, so the same
/// snippet found in different blobs yields different keys for those rules.
fn dedup_key(&self, origin: &OriginSet, blob_md: &BlobMetadata, m: &Match) -> u64 {
    let rule_id = m.rule.id().to_uppercase();
    let snippet = Self::primary_snippet(m);
    let origin_kind = match origin.first() {
        Origin::GitRepo(_) => "git",
        Origin::File(_) => "file",
        Origin::Extended(_) => "ext",
    };

    let scoped_to_blob = self.dependent_rule_ids.contains(&rule_id);
    let key_string = if !scoped_to_blob {
        format!("{}|{}|{}", rule_id, origin_kind, snippet)
    } else {
        format!("{}|{}|{}|{}", rule_id, origin_kind, snippet, blob_md.id.hex())
    };

    xxh3_64(key_string.as_bytes())
}
/// Return `path` with every backslash turned into a forward slash, so that
/// paths written with either separator compare the same way when ordering.
fn normalize_path_for_order(path: &str) -> String {
    path.chars().map(|ch| if ch == '\\' { '/' } else { ch }).collect()
}
/// Build the sort key used to order origins deterministically.
///
/// The key is `(kind rank, normalized path, commit id)`:
/// - git origins rank `0`, use `"<repo path>/<blob path>"` and the first commit's id
///   (blob path and commit id are empty when no first commit is recorded),
/// - file origins rank `1` and use the file path,
/// - extended origins rank `2` and use their path, falling back to the string form
///   of the wrapped value when no path is available.
fn origin_order_key(origin: &Origin) -> (u8, String, String) {
    match origin {
        Origin::GitRepo(repo) => {
            let repo_path = Self::normalize_path_for_order(&repo.repo_path.to_string_lossy());
            let (blob_path, commit_id) = match repo.first_commit.as_ref() {
                Some(commit) => (
                    Self::normalize_path_for_order(&commit.blob_path),
                    commit.commit_metadata.commit_id.to_string(),
                ),
                None => (String::new(), String::new()),
            };
            (0, format!("{repo_path}/{blob_path}"), commit_id)
        }
        Origin::File(file) => {
            let file_path = Self::normalize_path_for_order(&file.path.to_string_lossy());
            (1, file_path, String::new())
        }
        Origin::Extended(ext) => {
            let ext_path = match ext.path() {
                Some(path) => Self::normalize_path_for_order(&path.to_string_lossy()),
                None => Self::normalize_path_for_order(&ext.0.to_string()),
            };
            (2, ext_path, String::new())
        }
    }
}
/// Build the key that decides which of two duplicate findings is the representative.
///
/// The key is the smallest origin order key in `origin`, followed by the match's
/// start and end offsets and the blob id in hex. If the origin set yields no
/// origins, a sentinel key with rank `u8::MAX` is used instead.
fn canonical_entry_key(
    origin: &OriginSet,
    blob_md: &BlobMetadata,
    m: &Match,
) -> ((u8, String, String), usize, usize, String) {
    let span = &m.location.offset_span;
    let smallest_origin_key = origin
        .iter()
        .map(Self::origin_order_key)
        .min()
        .unwrap_or_else(|| (u8::MAX, String::new(), String::new()));

    (smallest_origin_key, span.start, span.end, blob_md.id.hex())
}
/// Combine two origin sets into one without repeating any origin.
///
/// Origins are taken from `existing` first and then from `incoming`; an origin equal
/// to one already collected is skipped. The result is sorted by `origin_order_key`.
///
/// # Panics
/// Panics if `OriginSet::try_from_iter` rejects the merged list.
fn merge_origin_sets(existing: &OriginSet, incoming: &OriginSet) -> OriginSet {
    let mut merged: Vec<Origin> = Vec::new();
    for candidate in existing.iter().chain(incoming.iter()) {
        if !merged.contains(candidate) {
            merged.push(candidate.clone());
        }
    }

    merged.sort_by_key(Self::origin_order_key);
    OriginSet::try_from_iter(merged).expect("merged origin set is non-empty")
}
/// Fold an incoming duplicate finding into the entry already stored at `idx`.
///
/// The stored entry always receives the union of both origin sets. Its blob
/// metadata and match are replaced by the incoming ones only when the incoming
/// finding has the smaller `canonical_entry_key`. The incoming `(blob id, offset
/// span)` pair is also registered in `index_map` so it resolves to `idx`.
fn merge_duplicate(
&mut self,
idx: usize,
incoming_origin: Arc<OriginSet>,
incoming_blob: Arc<BlobMetadata>,
incoming_match: Match,
) {
// Captured up front, before `incoming_match` may be moved into the store below.
let incoming_index_key = (incoming_blob.id, incoming_match.location.offset_span);
// Scoped block: the shared borrow of `self.matches[idx]` ends before any mutation.
let (prefer_incoming, merged_origin) = {
let (existing_origin, existing_blob, existing_match) = &*self.matches[idx];
let existing_key = Self::canonical_entry_key(
existing_origin.as_ref(),
existing_blob.as_ref(),
existing_match,
);
let incoming_key = Self::canonical_entry_key(
incoming_origin.as_ref(),
incoming_blob.as_ref(),
&incoming_match,
);
(
// The incoming finding wins only when its key is strictly smaller.
incoming_key.cmp(&existing_key) == Ordering::Less,
Self::merge_origin_sets(existing_origin.as_ref(), incoming_origin.as_ref()),
)
};
// Reuse the `Arc` already registered in `origin_meta` under this fingerprint,
// or register the freshly merged set if there is none yet.
let merged_origin_arc = {
let merged_origin_arc = Arc::new(merged_origin);
let fp = origin_fp(merged_origin_arc.as_ref());
self.origin_meta.entry(fp).or_insert_with(|| merged_origin_arc.clone()).clone()
};
self.index_map.insert(incoming_index_key, idx);
let stored = &mut self.matches[idx];
// `Arc::make_mut` gives mutable access to the stored tuple, cloning it first
// if the `Arc` is shared elsewhere.
let (stored_origin, stored_blob, stored_match) = Arc::make_mut(stored);
*stored_origin = merged_origin_arc;
if prefer_incoming {
// Same reuse-or-register step as for origins, keyed by blob id in `blob_meta`.
let blob_arc = self
.blob_meta
.entry(incoming_blob.id)
.or_insert_with(|| incoming_blob.clone())
.clone();
*stored_blob = blob_arc;
*stored_match = incoming_match;
}
}
/// Recompute the dedup index from scratch so that every stored finding's dedup
/// key maps to its position in `matches`. When two findings share a key, the
/// later position wins.
fn rebuild_dedup_index_map(&mut self) {
    self.dedup_index_map = self
        .matches
        .iter()
        .enumerate()
        .map(|(position, entry)| {
            let (origin, blob_md, found) = &**entry;
            (self.dedup_key(origin.as_ref(), blob_md.as_ref(), found), position)
        })
        .collect();
}
/// Insert a batch of findings.
@ -329,17 +169,42 @@ impl FindingsStore {
1. Optional duplicate filter (unchanged)
*/
if dedup {
let dedup_key = self.dedup_key(origin.as_ref(), blob_md.as_ref(), &m);
if self.seen_bloom.check(&dedup_key) {
if let Some(&idx) = self.dedup_index_map.get(&dedup_key) {
if self.blobs.insert(blob_md.id) {
added += 1;
}
self.merge_duplicate(idx, origin, blob_md, m);
continue;
}
// Prefer the full unnamed match (index 0). Fall back to a named TOKEN capture
// before using whatever capture is available.
let snippet = m
.groups
.captures
.iter()
.find(|c| c.name.is_none() && c.match_number == 0)
.map(|c| c.raw_value())
.or_else(|| {
m.groups
.captures
.iter()
.find(|c| matches!(c.name.as_deref(), Some("TOKEN")))
.map(|c| c.raw_value())
})
.or_else(|| m.groups.captures.get(0).map(|c| c.raw_value()))
.unwrap_or("");
let origin_kind = match origin.first() {
Origin::GitRepo(_) => "git",
Origin::File(_) => "file",
Origin::Extended(_) => "ext",
};
let rule_id = m.rule.id().to_uppercase();
let key_string = if self.dependent_rule_ids.contains(&rule_id) {
format!("{}|{}|{}|{}", rule_id, origin_kind, snippet, blob_md.id.hex())
} else {
format!("{}|{}|{}", rule_id, origin_kind, snippet)
};
let key = xxh3_64(key_string.as_bytes());
if self.seen_bloom.check(&key) {
continue; // very likely a duplicate
}
self.seen_bloom.set(&dedup_key);
self.seen_bloom.set(&key);
self.bloom_items += 1;
}
@ -368,14 +233,6 @@ impl FindingsStore {
let blob_id = self.matches[idx].1.id;
let offset_span = self.matches[idx].2.location.offset_span;
self.index_map.insert((blob_id, offset_span), idx);
if dedup {
let dedup_key = self.dedup_key(
self.matches[idx].0.as_ref(),
self.matches[idx].1.as_ref(),
&self.matches[idx].2,
);
self.dedup_index_map.insert(dedup_key, idx);
}
}
/* ─────────────────────────────────────────────────────────────────── */

View file

@ -37,7 +37,7 @@ use self::{base64_decode::get_base64_strings as get_b64_strings, filter::filter_
const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
const RAW_MATCH_LOOKBACK: usize = 64 * 1024; // Re-scan a bounded suffix ending at the raw match.
const RAW_MATCH_LOOKBACK: usize = 4 * 1024; // Re-scan a bounded suffix ending at the raw match.
const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
// The old tree-sitter limit was 128 KiB due to full-AST parsing cost.
// The lightweight regex-based lexer is O(n) line-by-line, so we can afford
@ -270,8 +270,6 @@ impl<'a> Matcher<'a> {
continue;
}
// Re-scan a bounded suffix ending at the raw match and dedupe on the
// actual capture spans produced by the anchored regex.
let scan_start = end_idx_usize.saturating_sub(RAW_MATCH_LOOKBACK);
let before_len = matches.len();
filter_match(
@ -927,65 +925,6 @@ mod test {
Ok(())
}
/// Regression test: raw matches whose reported start offset is unreliable must not
/// cause an earlier real match to be dropped.
///
/// Two raw matches are injected by hand, both claiming `start_idx: 5`, with end
/// offsets 9 and 19. The blob holds two assignments, and both captured values
/// (`ABC`, `DEF`) are expected to be reported for rule index 0.
#[test]
fn bogus_raw_starts_do_not_hide_earlier_matches() -> Result<()> {
let rule = Rule::new(RuleSyntax {
id: "bogus.start".into(),
name: "bogus start".into(),
pattern: r#"key\s*=\s*"([A-Z]{3})""#.into(),
confidence: crate::rules::rule::Confidence::Low,
min_entropy: 0.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None::<Validation>,
revocation: None,
depends_on_rule: vec![],
pattern_requirements: None,
tls_mode: None,
});
let rules_db = RulesDatabase::from_rules(vec![rule])?;
let seen = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
let matcher =
Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;
let mut matcher = matcher;
// Seed the scratch buffer directly instead of running a scan. Both entries share
// the same start offset; the end offsets (9 and 19) are the byte positions right
// after the first and second assignment in the blob below.
matcher.user_data.raw_matches_scratch = vec![
RawMatch { rule_id: 0, start_idx: 5, end_idx: 9 },
RawMatch { rule_id: 0, start_idx: 5, end_idx: 19 },
];
let blob = Blob::from_bytes(b"key=\"ABC\"\nkey=\"DEF\"".to_vec());
let origin = OriginSet::from(Origin::from_file(PathBuf::from("bogus-starts.txt")));
let mut matches = Vec::new();
let mut previous_matches = FxHashMap::default();
let mut seen_matches = FxHashSet::default();
let mut match_rule_indices = Vec::new();
matcher.process_raw_matches(
&blob,
&origin,
"bogus-starts.txt",
false,
&mut matches,
&mut previous_matches,
&mut seen_matches,
&mut match_rule_indices,
);
let secrets = matches
.iter()
.map(|m| String::from_utf8_lossy(m.matching_input).to_string())
.collect::<Vec<_>>();
// Both assignments must be reported, in blob order, each attributed to rule 0.
assert_eq!(secrets, vec!["ABC", "DEF"]);
assert_eq!(match_rule_indices, vec![0, 0]);
Ok(())
}
#[test]
fn inline_comment_skips_match() -> Result<()> {
let rule = Rule::new(RuleSyntax {

View file

@ -172,7 +172,7 @@ pub fn enumerate_filesystem_inputs(
&seen_blobs,
Some(&matcher_stats),
enable_profiling,
Some(shared_profiler),
if enable_profiling { Some(shared_profiler) } else { None },
&args.extra_ignore_comments,
args.no_inline_ignore,
!args.no_ignore_if_contains,

View file

@ -867,7 +867,7 @@ pub async fn fetch_s3_objects(
&seen_blobs,
Some(matcher_stats),
enable_profiling,
Some(shared_profiler.clone()),
if enable_profiling { Some(shared_profiler.clone()) } else { None },
&args.extra_ignore_comments,
args.no_inline_ignore,
!args.no_ignore_if_contains,
@ -949,7 +949,7 @@ pub async fn fetch_gcs_objects(
&seen_blobs,
Some(matcher_stats),
enable_profiling,
Some(shared_profiler.clone()),
if enable_profiling { Some(shared_profiler.clone()) } else { None },
&args.extra_ignore_comments,
args.no_inline_ignore,
!args.no_ignore_if_contains,

View file

@ -145,60 +145,3 @@ fn dedup_still_merges_non_dependency_rules_across_blobs() -> Result<()> {
Ok(())
}
/// Checks that deduplication picks the same representative finding regardless of
/// the order in which duplicates are recorded.
///
/// The same token is recorded from `a.txt` and `z.txt` (two different blobs), once
/// in each order. Either way the store must end up with a single finding that
/// carries both origins and is represented by the `a.txt` / `blob-a` entry.
#[test]
fn dedup_uses_a_stable_canonical_representative() -> Result<()> {
let rule = make_rule("RULE.SIMPLE", vec![]);
// Fresh store per ordering so the two runs cannot influence each other.
let make_store = |rule: &Arc<Rule>| {
let mut store = FindingsStore::new(PathBuf::from("/tmp"));
store.record_rules(&[rule.clone()]);
store
};
let origin_a = Arc::new(OriginSet::single(Origin::from_file(PathBuf::from("a.txt"))));
let origin_z = Arc::new(OriginSet::single(Origin::from_file(PathBuf::from("z.txt"))));
let blob_a = Arc::new(BlobMetadata {
id: BlobId::new(b"blob-a"),
num_bytes: 10,
mime_essence: None,
language: None,
});
let blob_z = Arc::new(BlobMetadata {
id: BlobId::new(b"blob-z"),
num_bytes: 10,
mime_essence: None,
language: None,
});
// Same two findings, recorded z-then-a and a-then-z.
let forward = vec![
record_match(&origin_z, &blob_z, make_match(rule.clone(), blob_z.id, "shared_token")),
record_match(&origin_a, &blob_a, make_match(rule.clone(), blob_a.id, "shared_token")),
];
let reverse = vec![
record_match(&origin_a, &blob_a, make_match(rule.clone(), blob_a.id, "shared_token")),
record_match(&origin_z, &blob_z, make_match(rule.clone(), blob_z.id, "shared_token")),
];
let mut forward_store = make_store(&rule);
forward_store.record(forward, true);
let mut reverse_store = make_store(&rule);
reverse_store.record(reverse, true);
// Both insertion orders must converge on the identical stored finding.
for store in [&forward_store, &reverse_store] {
assert_eq!(store.get_matches().len(), 1);
let (origin, blob, matched) = &*store.get_matches()[0];
assert_eq!(origin.len(), 2, "duplicate findings should merge origins");
assert_eq!(
origin.first().full_path().as_deref(),
Some(PathBuf::from("a.txt").as_path()),
"the lexicographically smallest path should be the representative",
);
assert_eq!(blob.id, blob_a.id);
assert_eq!(matched.blob_id, blob_a.id);
}
Ok(())
}