From aa940b0c7abb8fd98b69859926b230a4a7878e21 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 9 Apr 2026 11:59:31 -0700 Subject: [PATCH 1/3] fixed performance regression --- crates/kingfisher-rules/data/rules/akamai.yml | 91 +++++++- .../data/rules/blockprotocol.yml | 16 ++ crates/kingfisher-rules/data/rules/blynk.yml | 163 +++++++++++++- .../kingfisher-rules/data/rules/braintree.yml | 3 + crates/kingfisher-rules/data/rules/canva.yml | 3 + crates/kingfisher-rules/data/rules/carto.yml | 4 +- crates/kingfisher-rules/data/rules/cfxre.yml | 3 + crates/kingfisher-rules/data/rules/clay.yml | 3 + .../kingfisher-rules/data/rules/clojars.yml | 9 +- .../kingfisher-rules/data/rules/crates.io.yml | 32 ++- src/findings_store.rs | 213 +++--------------- src/matcher/mod.rs | 63 +----- tests/dependent_rule_dedup.rs | 57 ----- 13 files changed, 343 insertions(+), 317 deletions(-) diff --git a/crates/kingfisher-rules/data/rules/akamai.yml b/crates/kingfisher-rules/data/rules/akamai.yml index 382b131..573072c 100644 --- a/crates/kingfisher-rules/data/rules/akamai.yml +++ b/crates/kingfisher-rules/data/rules/akamai.yml @@ -4,7 +4,7 @@ rules: pattern: | (?x) \b - (?i:client[_-]?token|access[_-]?token) + (?i:client[_-]?token) \s*=\s* ( akab- @@ -18,9 +18,51 @@ rules: visible: false examples: - "client-token=akab-sXedJBTOf0dHl27vVOd" - - "access-token=akab-xcsuUf7DY6xJgYscoe" + - "client_token = akab-c113ntt0k3n4qtari252bfxxbsl-yvsdj" references: - - https://developer.akamai.com/api/getting-started + - https://techdocs.akamai.com/developer/docs/set-up-authentication-credentials + + - name: Akamai API Access Token + id: kingfisher.akamai.3 + pattern: | + (?x) + \b + (?i:access[_-]?token) + \s*=\s* + ( + akab- + [A-Za-z0-9_-]{18,36} + ) + \b + pattern_requirements: + min_digits: 2 + min_entropy: 3.0 + confidence: medium + visible: false + examples: + - "access-token=akab-xcsuUf7DY6xJgYscoe" + - "access_token = akab-acc35t0k3nodujqunph3w7hzp7-gtm6ij" + references: + - https://techdocs.akamai.com/developer/docs/set-up-authentication-credentials + + - name: Akamai API Host + id: kingfisher.akamai.4 + pattern: | + (?x) + \b + (?i:host) + \s*=\s* + ( + [A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.akamaiapis\.net + ) + \b + min_entropy: 2.0 + confidence: medium + visible: false + examples: + - "host = akab-h05tnam3wl42son7nktnlnnx-kbob3i3v.luna.akamaiapis.net" + references: + - https://techdocs.akamai.com/developer/docs/set-up-authentication-credentials - name: Akamai API Client Secret id: kingfisher.akamai.2 @@ -32,7 +74,6 @@ rules: ( [A-Za-z0-9+/=-]{20,36} ) - \b pattern_requirements: min_digits: 2 min_uppercase: 2 @@ -41,7 +82,43 @@ rules: examples: - "client-secret=431733QPkO-TgxBHBnH6alirfy" - "client-mysecret=BcDeFgHi2345JKLM67nopqRS" + - "client_secret = C113nt53KR3TN6N90yVuAgICxIRwsObLi0E67/N8eRN=" references: - - https://developer.akamai.com/api/getting-started - # No simple validation: Akamai API uses HMAC-SHA256 EdgeGrid signing - # with the client token, secret, and access token together. + - https://techdocs.akamai.com/developer/docs/set-up-authentication-credentials + - https://techdocs.akamai.com/iam-api/reference/api-client-self + - https://techdocs.akamai.com/developer/docs/make-your-first-api-call + validation: + type: Http + content: + request: + method: GET + url: https://{{ HOST }}/identity-management/v3/api-clients/self + headers: + Accept: application/json + Authorization: | + {%- assign timestamp = "" | iso_timestamp_no_frac | replace: "-", "" | replace: "Z", "+0000" -%} + {%- assign nonce = "" | uuid -%} + {%- assign auth_data = "EG1-HMAC-SHA256 client_token=" | append: CLIENT_TOKEN | append: ";access_token=" | append: ACCESS_TOKEN | append: ";timestamp=" | append: timestamp | append: ";nonce=" | append: nonce | append: ";" -%} + {%- capture data_to_sign -%}GET https {{ HOST }} /identity-management/v3/api-clients/self {{ auth_data }}{%- endcapture -%} + {%- assign signing_key = timestamp | hmac_sha256: TOKEN -%} + {%- assign signature = data_to_sign | hmac_sha256: signing_key -%} + {{ auth_data }}signature={{ signature }} + response_matcher: + - report_response: true + - type: StatusMatch + status: [200] + - type: StatusMatch + status: [401, 403] + negative: true + - type: JsonValid + depends_on_rule: + - rule_id: kingfisher.akamai.1 + variable: CLIENT_TOKEN + - rule_id: kingfisher.akamai.3 + variable: ACCESS_TOKEN + - rule_id: kingfisher.akamai.4 + variable: HOST + # No revocation yet: Akamai documents self-deactivation in IAM v2 and + # client-scoped deactivation in IAM v3, but the v3 docs available here do not + # clearly expose a safe single-credential self flow we can derive from the + # leaked tuple without risking broader API-client deactivation. diff --git a/crates/kingfisher-rules/data/rules/blockprotocol.yml b/crates/kingfisher-rules/data/rules/blockprotocol.yml index 0797b1e..3ac0aab 100644 --- a/crates/kingfisher-rules/data/rules/blockprotocol.yml +++ b/crates/kingfisher-rules/data/rules/blockprotocol.yml @@ -17,3 +17,19 @@ rules: - 'BLOCK_PROTOCOL_API_KEY=b10ck5.AbCdEfGhIjKlMnOpQrStUvWxYz1234.AbCdEfGhIjKlMnOpQrStUvWxYz12345678' references: - https://blockprotocol.org/docs/hub/api + validation: + type: Http + content: + request: + method: GET + url: https://blockprotocol.org/api/blocks + headers: + x-api-key: "{{ TOKEN }}" + Accept: application/json + response_matcher: + - report_response: true + - type: StatusMatch + status: [200] + - type: JsonValid + # No programmatic revocation endpoint is documented; Block Protocol + # directs users to revoke keys from the Account > API Keys page. diff --git a/crates/kingfisher-rules/data/rules/blynk.yml b/crates/kingfisher-rules/data/rules/blynk.yml index 963fed6..bd9ae19 100644 --- a/crates/kingfisher-rules/data/rules/blynk.yml +++ b/crates/kingfisher-rules/data/rules/blynk.yml @@ -1,9 +1,45 @@ rules: + - name: Blynk Cloud Host + id: kingfisher.blynk.10 + pattern: | + (?xi) + https:// + ( + (?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud + ) + / + min_entropy: 2.5 + confidence: medium + visible: false + examples: + - https://blynk.cloud/external/api/get?token=Ab12Cd34Ef56Gh78Ij90Kl12Mn34Op56&V1 + - https://fra1.blynk.cloud/api/organization/profile + references: + - https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication + + - name: Blynk OAuth Client ID + id: kingfisher.blynk.11 + pattern: | + (?xi) + \b + ( + oa2-client-id_[A-Z0-9_\-]{32} + ) + \b + min_entropy: 3.0 + confidence: medium + visible: false + examples: + - oa2-client-id_Ab12Cd34Ef56Gh78Ij90Kl12Mn34Op56 + - client_id=oa2-client-id_Zx98Yw76Vu54Ts32Rq10Po98Nm76Lk54 + references: + - https://docs.blynk.io/en/blynk.console/settings/developers/oauth2 + - name: Blynk Device Access Token id: kingfisher.blynk.1 pattern: | (?xi) - https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/external/api/[A-Z0-9/]*\?token= + https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud/external/api/[A-Z0-9/]*\?token= ([A-Z0-9_\-]{32}) & pattern_requirements: @@ -19,11 +55,30 @@ rules: - curl "https://blynk.cloud/external/api/update/property?token=Rps15JICmtRVbFyS_95houlLbm6xIQ2L&pin=v1&isDisabled=true" references: - https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication + - https://docs.blynk.io/en/blynk.cloud/device-https-api/is-device-connected + depends_on_rule: + - rule_id: kingfisher.blynk.10 + variable: HOST + validation: + type: Http + content: + request: + method: GET + url: https://{{ HOST }}/external/api/isHardwareConnected?token={{ TOKEN | url_encode }} + headers: + Accept: application/json + response_matcher: + - report_response: true + - type: StatusMatch + status: [200] + - type: WordMatch + words: ["true", "false"] + # No documented API endpoint is provided to revoke device auth tokens. - name: Blynk Organization Access Token id: kingfisher.blynk.2 pattern: | (?xi) - https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/api/[A-Z0-9_\-\s/\\]* + https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud/api/[A-Z0-9_\-\s/\\]* -H\s*"Authorization:\s*Bearer\s* ([A-Z0-9_\-]{40}) " @@ -40,13 +95,35 @@ rules: -H "Authorization: Bearer eIdWHQqRfFmvP5LDDh-IGxPUzi7I27HthzCPAVmS" references: - https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication + - https://docs.blynk.io/en/blynk.cloud/platform-https-api/ogranization-api/get-own-organization-info + depends_on_rule: + - rule_id: kingfisher.blynk.10 + variable: HOST + validation: + type: Http + content: + request: + method: GET + url: https://{{ HOST }}/api/v1/organization/profile + headers: + Authorization: Bearer {{ TOKEN }} + Accept: application/json + response_matcher: + - report_response: true + - type: StatusMatch + status: [200] + - type: JsonValid + - type: WordMatch + match_all_words: true + words: ['"id"', '"name"'] + # No documented API endpoint is provided to revoke organization access tokens. - name: Blynk Organization Access Token id: kingfisher.blynk.3 pattern: | (?xi) -H\s*"Authorization:\s*Bearer\s* ([A-Z0-9_\-]{40}) - "[\s\\]*https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/api + "[\s\\]*https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud/api pattern_requirements: min_digits: 2 min_uppercase: 1 @@ -60,12 +137,34 @@ rules: https://fra1.blynk.cloud/api/organization/profile references: - https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication + - https://docs.blynk.io/en/blynk.cloud/platform-https-api/ogranization-api/get-own-organization-info + depends_on_rule: + - rule_id: kingfisher.blynk.10 + variable: HOST + validation: + type: Http + content: + request: + method: GET + url: https://{{ HOST }}/api/v1/organization/profile + headers: + Authorization: Bearer {{ TOKEN }} + Accept: application/json + response_matcher: + - report_response: true + - type: StatusMatch + status: [200] + - type: JsonValid + - type: WordMatch + match_all_words: true + words: ['"id"', '"name"'] + # No documented API endpoint is provided to revoke organization access tokens. - name: Blynk Organization Client Credentials id: kingfisher.blynk.8 pattern: | (?xi) - https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/oauth2/[A-Z0-9_\-\s/\\?=&]* - (oa2-client-id_[A-Z0-9_\-]{32}) + https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud/oauth2/[A-Z0-9_\-\s/\\?=&]* + oa2-client-id_[A-Z0-9_\-]{32} (?: : | &client_secret= ) ([A-Z0-9_\-]{40}) pattern_requirements: @@ -82,14 +181,38 @@ rules: - 'curl -X POST https://fra1.blynk.cloud/oauth2/token?grant_type=client_credentials&client_id=oa2-client-id_zmNtW-D0Toqpz4AZnBLCIlklBrz9ynU-&client_secret=5uC5Y4Mcvdl5rB56rBmxnvB4DZgiIpcyTPbOoEWp' references: - https://docs.blynk.io/en/blynk.console/settings/developers/oauth2 + - https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication + depends_on_rule: + - rule_id: kingfisher.blynk.10 + variable: HOST + - rule_id: kingfisher.blynk.11 + variable: CLIENT_ID + validation: + type: Http + content: + request: + method: POST + url: https://{{ HOST }}/oauth2/token?grant_type=client_credentials + headers: + Authorization: 'Basic {{ CLIENT_ID | append: ":" | append: TOKEN | b64enc }}' + Accept: application/json + response_matcher: + - report_response: true + - type: StatusMatch + status: [200] + - type: JsonValid + - type: WordMatch + match_all_words: true + words: ['"access_token"', '"token_type"'] + # No documented API endpoint is provided to revoke OAuth client credentials. - name: Blynk Organization Client Credentials id: kingfisher.blynk.9 pattern: | (?xi) \b - (oa2-client-id_[A-Z0-9_\-]{32}) + oa2-client-id_[A-Z0-9_\-]{32} :([A-Z0-9_\-]{40}) - [\s\\]*https://(fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/oauth2 + [\s\\]*https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)?blynk\.cloud/oauth2 pattern_requirements: min_digits: 2 min_uppercase: 1 @@ -102,4 +225,28 @@ rules: curl -X POST -u oa2-client-id_zmNtW-D0Toqpz4AZnBLCIlklBrz9ynU-:5uC5Y4Mcvdl5rB56rBmxnvB4DZgiIpcyTPbOoEWp \ https://fra1.blynk.cloud/oauth2/token?grant_type=client_credentials references: - - https://docs.blynk.io/en/blynk.console/settings/developers/oauth2 \ No newline at end of file + - https://docs.blynk.io/en/blynk.console/settings/developers/oauth2 + - https://docs.blynk.io/en/blynk.cloud/platform-https-api/authentication + depends_on_rule: + - rule_id: kingfisher.blynk.10 + variable: HOST + - rule_id: kingfisher.blynk.11 + variable: CLIENT_ID + validation: + type: Http + content: + request: + method: POST + url: https://{{ HOST }}/oauth2/token?grant_type=client_credentials + headers: + Authorization: 'Basic {{ CLIENT_ID | append: ":" | append: TOKEN | b64enc }}' + Accept: application/json + response_matcher: + - report_response: true + - type: StatusMatch + status: [200] + - type: JsonValid + - type: WordMatch + match_all_words: true + words: ['"access_token"', '"token_type"'] + # No documented API endpoint is provided to revoke OAuth client credentials. diff --git a/crates/kingfisher-rules/data/rules/braintree.yml b/crates/kingfisher-rules/data/rules/braintree.yml index 57f87ef..1e4002a 100644 --- a/crates/kingfisher-rules/data/rules/braintree.yml +++ b/crates/kingfisher-rules/data/rules/braintree.yml @@ -31,3 +31,6 @@ rules: references: - https://developer.paypal.com/braintree/docs/guides/authorization/tokenization-key - https://developer.paypal.com/braintree/docs/start/overview + # No standalone validation or revocation is added: Braintree documents + # tokenization keys as publishable client credentials, and the public docs + # do not describe a programmatic tokenization-key introspection or revoke API. diff --git a/crates/kingfisher-rules/data/rules/canva.yml b/crates/kingfisher-rules/data/rules/canva.yml index e8d84cb..6986fba 100644 --- a/crates/kingfisher-rules/data/rules/canva.yml +++ b/crates/kingfisher-rules/data/rules/canva.yml @@ -18,3 +18,6 @@ rules: references: - https://www.canva.dev/docs/connect/authentication/ - https://www.canva.dev/docs/connect/guidelines/security/ + # No standalone validation is added: Canva Connect client secrets are used + # in OAuth flows that require the matching client_id and PKCE flow state. + # No programmatic client-secret revocation endpoint is documented. diff --git a/crates/kingfisher-rules/data/rules/carto.yml b/crates/kingfisher-rules/data/rules/carto.yml index e93525d..45653b5 100644 --- a/crates/kingfisher-rules/data/rules/carto.yml +++ b/crates/kingfisher-rules/data/rules/carto.yml @@ -21,4 +21,6 @@ rules: - "carto_api_key = 'eyJhbGciOiJIUzI1NiJ9.eyJhIjoiYWNfY2RlZmdoIiwianRpIjoiYWI2MTIzNDUifQ.BcDeFgHiJkLmNoPqRsTuVwXy'" references: - https://api-docs.carto.com - # No public validation endpoint documented for CARTO JWT tokens. + # No standalone validation or revocation is added: CARTO's documented APIs + # use account- and deployment-specific endpoints, and the public docs do not + # describe a token self-introspection or self-revocation API for these JWTs. diff --git a/crates/kingfisher-rules/data/rules/cfxre.yml b/crates/kingfisher-rules/data/rules/cfxre.yml index f0325d2..3e05873 100644 --- a/crates/kingfisher-rules/data/rules/cfxre.yml +++ b/crates/kingfisher-rules/data/rules/cfxre.yml @@ -17,3 +17,6 @@ rules: - 'sv_licenseKey "cfxk_AbCdEfGhIjKlMnOpQrStUvWxYz1234567890_abcdef"' references: - https://docs.fivem.net/docs/server-manual/setting-up-a-server/ + # No standalone validation or revocation is added: the public FiveM/Cfx.re + # docs describe configuring server keys, but do not document a safe API for + # verifying or revoking a key using the key itself. diff --git a/crates/kingfisher-rules/data/rules/clay.yml b/crates/kingfisher-rules/data/rules/clay.yml index ff86401..d2a3cc2 100644 --- a/crates/kingfisher-rules/data/rules/clay.yml +++ b/crates/kingfisher-rules/data/rules/clay.yml @@ -23,3 +23,6 @@ rules: - CLAY_API_KEY=d8dfd14ec83e4e17a7d2 references: - https://university.clay.com/docs/http-api-integration-overview + # No standalone validation or revocation is added: the public Clay docs + # referenced here do not document an authenticated API endpoint for + # introspecting or revoking these keys. diff --git a/crates/kingfisher-rules/data/rules/clojars.yml b/crates/kingfisher-rules/data/rules/clojars.yml index e5daf07..dc0bd65 100644 --- a/crates/kingfisher-rules/data/rules/clojars.yml +++ b/crates/kingfisher-rules/data/rules/clojars.yml @@ -22,10 +22,10 @@ rules: - name: Clojars API Token id: kingfisher.clojars.2 pattern: | - (?xi) + (?x) \b ( - CLOJARS_[a-z0-9]{60} + CLOJARS_[a-zA-Z0-9]{60} ) \b pattern_requirements: @@ -36,4 +36,7 @@ rules: - "CLOJARS_7a0b5c647fdca75616e99a5629ba28955e53faf6391b1b4461ccd972b0e2" references: - https://github.com/clojars/clojars-web/wiki/API - - https://github.com/clojars/clojars-web/wiki/Tutorial-for-library-authors \ No newline at end of file + - https://github.com/clojars/clojars-web/wiki/Tutorial-for-library-authors + # No standalone validation or revocation is added: Clojars deploy tokens are + # used via Basic auth for artifact uploads, and the public docs/source do not + # expose a safe read-only token introspection or token self-revocation API. diff --git a/crates/kingfisher-rules/data/rules/crates.io.yml b/crates/kingfisher-rules/data/rules/crates.io.yml index 3fa76e4..dda849b 100644 --- a/crates/kingfisher-rules/data/rules/crates.io.yml +++ b/crates/kingfisher-rules/data/rules/crates.io.yml @@ -16,4 +16,34 @@ rules: - 'Bearer: ciotgp8BGZBlX192iExSQPm0SrUlBunG8zd' references: - https://blog.rust-lang.org/2023/09/22/crates-io-usage-policy-rfc - - https://github.com/rust-lang/crates.io/tree/main/src \ No newline at end of file + - https://github.com/rust-lang/crates.io/tree/main/src + - https://doc.rust-lang.org/cargo/reference/registry-web-api.html + validation: + type: Http + content: + request: + method: GET + url: https://crates.io/api/v1/me + headers: + Authorization: "{{ TOKEN }}" + Accept: application/json + response_matcher: + - report_response: true + - type: StatusMatch + status: [200] + - type: JsonValid + - type: WordMatch + match_all_words: true + words: ['"id"', '"login"'] + revocation: + type: Http + content: + request: + method: DELETE + url: https://crates.io/api/v1/tokens/current + headers: + Authorization: "{{ TOKEN }}" + response_matcher: + - report_response: true + - type: StatusMatch + status: [204] diff --git a/src/findings_store.rs b/src/findings_store.rs index 8dcd8c4..57070f0 100644 --- a/src/findings_store.rs +++ b/src/findings_store.rs @@ -1,5 +1,4 @@ use std::{ - cmp::Ordering, hash::{Hash, Hasher}, path::PathBuf, str::FromStr, @@ -49,7 +48,6 @@ pub struct FindingsStore { rules: Vec>, matches: Vec>, index_map: FxHashMap<(BlobId, OffsetSpan), usize>, - dedup_index_map: FxHashMap, blobs: FxHashSet, clone_dir: PathBuf, seen_bloom: Bloom, @@ -77,7 +75,6 @@ impl FindingsStore { matches: Vec::new(), blobs: FxHashSet::default(), index_map: FxHashMap::default(), - dedup_index_map: FxHashMap::default(), blob_meta: FxHashMap::default(), origin_meta: FxHashMap::default(), clone_dir, @@ -117,14 +114,11 @@ impl FindingsStore { pub fn replace_matches(&mut self, new_matches: Vec>) { self.matches = new_matches; self.index_map.clear(); - self.dedup_index_map.clear(); self.blobs.clear(); for (i, message) in self.matches.iter().enumerate() { let blob_id = message.1.id; let offset_span = message.2.location.offset_span; self.index_map.insert((blob_id, offset_span), i); - let dedup_key = self.dedup_key(message.0.as_ref(), message.1.as_ref(), &message.2); - self.dedup_index_map.insert(dedup_key, i); self.blobs.insert(blob_id); } } @@ -159,160 +153,6 @@ impl FindingsStore { self.dependent_rule_ids.insert(dependency.rule_id.to_uppercase()); } } - if !self.matches.is_empty() { - self.rebuild_dedup_index_map(); - } - } - - fn primary_snippet<'a>(m: &'a Match) -> &'a str { - m.groups - .captures - .iter() - .find(|c| c.name.is_none() && c.match_number == 0) - .map(|c| c.raw_value()) - .or_else(|| { - m.groups - .captures - .iter() - .find(|c| matches!(c.name.as_deref(), Some("TOKEN"))) - .map(|c| c.raw_value()) - }) - .or_else(|| m.groups.captures.get(0).map(|c| c.raw_value())) - .unwrap_or("") - } - - fn dedup_key(&self, origin: &OriginSet, blob_md: &BlobMetadata, m: &Match) -> u64 { - let origin_kind = match origin.first() { - Origin::GitRepo(_) => "git", - Origin::File(_) => "file", - Origin::Extended(_) => "ext", - }; - let rule_id = m.rule.id().to_uppercase(); - let snippet = Self::primary_snippet(m); - let key_string = if self.dependent_rule_ids.contains(&rule_id) { - format!("{}|{}|{}|{}", rule_id, origin_kind, snippet, blob_md.id.hex()) - } else { - format!("{}|{}|{}", rule_id, origin_kind, snippet) - }; - xxh3_64(key_string.as_bytes()) - } - - fn normalize_path_for_order(path: &str) -> String { - path.replace('\\', "/") - } - - fn origin_order_key(origin: &Origin) -> (u8, String, String) { - match origin { - Origin::GitRepo(repo) => { - let repo_path = Self::normalize_path_for_order(&repo.repo_path.to_string_lossy()); - let blob_path = repo - .first_commit - .as_ref() - .map(|commit| Self::normalize_path_for_order(&commit.blob_path)) - .unwrap_or_default(); - let commit_id = repo - .first_commit - .as_ref() - .map(|commit| commit.commit_metadata.commit_id.to_string()) - .unwrap_or_default(); - (0, format!("{repo_path}/{blob_path}"), commit_id) - } - Origin::File(file) => { - (1, Self::normalize_path_for_order(&file.path.to_string_lossy()), String::new()) - } - Origin::Extended(ext) => ( - 2, - ext.path() - .map(|path| Self::normalize_path_for_order(&path.to_string_lossy())) - .unwrap_or_else(|| Self::normalize_path_for_order(&ext.0.to_string())), - String::new(), - ), - } - } - - fn canonical_entry_key( - origin: &OriginSet, - blob_md: &BlobMetadata, - m: &Match, - ) -> ((u8, String, String), usize, usize, String) { - let primary_origin = origin - .iter() - .min_by_key(|origin| Self::origin_order_key(origin)) - .map(Self::origin_order_key) - .unwrap_or((u8::MAX, String::new(), String::new())); - (primary_origin, m.location.offset_span.start, m.location.offset_span.end, blob_md.id.hex()) - } - - fn merge_origin_sets(existing: &OriginSet, incoming: &OriginSet) -> OriginSet { - let mut origins = Vec::new(); - let mut push_unique = |origin: &Origin| { - if !origins.iter().any(|existing| existing == origin) { - origins.push(origin.clone()); - } - }; - - for origin in existing.iter().chain(incoming.iter()) { - push_unique(origin); - } - - origins.sort_by_key(Self::origin_order_key); - OriginSet::try_from_iter(origins).expect("merged origin set is non-empty") - } - - fn merge_duplicate( - &mut self, - idx: usize, - incoming_origin: Arc, - incoming_blob: Arc, - incoming_match: Match, - ) { - let incoming_index_key = (incoming_blob.id, incoming_match.location.offset_span); - let (prefer_incoming, merged_origin) = { - let (existing_origin, existing_blob, existing_match) = &*self.matches[idx]; - let existing_key = Self::canonical_entry_key( - existing_origin.as_ref(), - existing_blob.as_ref(), - existing_match, - ); - let incoming_key = Self::canonical_entry_key( - incoming_origin.as_ref(), - incoming_blob.as_ref(), - &incoming_match, - ); - ( - incoming_key.cmp(&existing_key) == Ordering::Less, - Self::merge_origin_sets(existing_origin.as_ref(), incoming_origin.as_ref()), - ) - }; - - let merged_origin_arc = { - let merged_origin_arc = Arc::new(merged_origin); - let fp = origin_fp(merged_origin_arc.as_ref()); - self.origin_meta.entry(fp).or_insert_with(|| merged_origin_arc.clone()).clone() - }; - - self.index_map.insert(incoming_index_key, idx); - - let stored = &mut self.matches[idx]; - let (stored_origin, stored_blob, stored_match) = Arc::make_mut(stored); - *stored_origin = merged_origin_arc; - if prefer_incoming { - let blob_arc = self - .blob_meta - .entry(incoming_blob.id) - .or_insert_with(|| incoming_blob.clone()) - .clone(); - *stored_blob = blob_arc; - *stored_match = incoming_match; - } - } - - fn rebuild_dedup_index_map(&mut self) { - self.dedup_index_map.clear(); - for (idx, message) in self.matches.iter().enumerate() { - let key = self.dedup_key(message.0.as_ref(), message.1.as_ref(), &message.2); - self.dedup_index_map.insert(key, idx); - } } /// Insert a batch of findings. @@ -329,17 +169,42 @@ impl FindingsStore { │ 1. Optional duplicate filter (unchanged) │ └───────────────────────────────────────────────────────────────*/ if dedup { - let dedup_key = self.dedup_key(origin.as_ref(), blob_md.as_ref(), &m); - if self.seen_bloom.check(&dedup_key) { - if let Some(&idx) = self.dedup_index_map.get(&dedup_key) { - if self.blobs.insert(blob_md.id) { - added += 1; - } - self.merge_duplicate(idx, origin, blob_md, m); - continue; - } + // Prefer the full unnamed match (index 0). Fall back to a named TOKEN capture + // before using whatever capture is available. + let snippet = m + .groups + .captures + .iter() + .find(|c| c.name.is_none() && c.match_number == 0) + .map(|c| c.raw_value()) + .or_else(|| { + m.groups + .captures + .iter() + .find(|c| matches!(c.name.as_deref(), Some("TOKEN"))) + .map(|c| c.raw_value()) + }) + .or_else(|| m.groups.captures.get(0).map(|c| c.raw_value())) + .unwrap_or(""); + + let origin_kind = match origin.first() { + Origin::GitRepo(_) => "git", + Origin::File(_) => "file", + Origin::Extended(_) => "ext", + }; + + let rule_id = m.rule.id().to_uppercase(); + let key_string = if self.dependent_rule_ids.contains(&rule_id) { + format!("{}|{}|{}|{}", rule_id, origin_kind, snippet, blob_md.id.hex()) + } else { + format!("{}|{}|{}", rule_id, origin_kind, snippet) + }; + let key = xxh3_64(key_string.as_bytes()); + + if self.seen_bloom.check(&key) { + continue; // very likely a duplicate } - self.seen_bloom.set(&dedup_key); + self.seen_bloom.set(&key); self.bloom_items += 1; } @@ -368,14 +233,6 @@ impl FindingsStore { let blob_id = self.matches[idx].1.id; let offset_span = self.matches[idx].2.location.offset_span; self.index_map.insert((blob_id, offset_span), idx); - if dedup { - let dedup_key = self.dedup_key( - self.matches[idx].0.as_ref(), - self.matches[idx].1.as_ref(), - &self.matches[idx].2, - ); - self.dedup_index_map.insert(dedup_key, idx); - } } /* ─────────────────────────────────────────────────────────────────── */ diff --git a/src/matcher/mod.rs b/src/matcher/mod.rs index cdb0bd5..1938b30 100644 --- a/src/matcher/mod.rs +++ b/src/matcher/mod.rs @@ -37,7 +37,7 @@ use self::{base64_decode::get_base64_strings as get_b64_strings, filter::filter_ const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches -const RAW_MATCH_LOOKBACK: usize = 64 * 1024; // Re-scan a bounded suffix ending at the raw match. +const RAW_MATCH_LOOKBACK: usize = 4 * 1024; // Re-scan a bounded suffix ending at the raw match. const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs // The old tree-sitter limit was 128 KiB due to full-AST parsing cost. // The lightweight regex-based lexer is O(n) line-by-line, so we can afford @@ -270,8 +270,6 @@ impl<'a> Matcher<'a> { continue; } - // Re-scan a bounded suffix ending at the raw match and dedupe on the - // actual capture spans produced by the anchored regex. let scan_start = end_idx_usize.saturating_sub(RAW_MATCH_LOOKBACK); let before_len = matches.len(); filter_match( @@ -927,65 +925,6 @@ mod test { Ok(()) } - #[test] - fn bogus_raw_starts_do_not_hide_earlier_matches() -> Result<()> { - let rule = Rule::new(RuleSyntax { - id: "bogus.start".into(), - name: "bogus start".into(), - pattern: r#"key\s*=\s*"([A-Z]{3})""#.into(), - confidence: crate::rules::rule::Confidence::Low, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - revocation: None, - depends_on_rule: vec![], - pattern_requirements: None, - tls_mode: None, - }); - - let rules_db = RulesDatabase::from_rules(vec![rule])?; - let seen = BlobIdMap::new(); - let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone()))); - let matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; - - let mut matcher = matcher; - matcher.user_data.raw_matches_scratch = vec![ - RawMatch { rule_id: 0, start_idx: 5, end_idx: 9 }, - RawMatch { rule_id: 0, start_idx: 5, end_idx: 19 }, - ]; - - let blob = Blob::from_bytes(b"key=\"ABC\"\nkey=\"DEF\"".to_vec()); - let origin = OriginSet::from(Origin::from_file(PathBuf::from("bogus-starts.txt"))); - let mut matches = Vec::new(); - let mut previous_matches = FxHashMap::default(); - let mut seen_matches = FxHashSet::default(); - let mut match_rule_indices = Vec::new(); - - matcher.process_raw_matches( - &blob, - &origin, - "bogus-starts.txt", - false, - &mut matches, - &mut previous_matches, - &mut seen_matches, - &mut match_rule_indices, - ); - - let secrets = matches - .iter() - .map(|m| String::from_utf8_lossy(m.matching_input).to_string()) - .collect::>(); - - assert_eq!(secrets, vec!["ABC", "DEF"]); - assert_eq!(match_rule_indices, vec![0, 0]); - Ok(()) - } - #[test] fn inline_comment_skips_match() -> Result<()> { let rule = Rule::new(RuleSyntax { diff --git a/tests/dependent_rule_dedup.rs b/tests/dependent_rule_dedup.rs index 003e55b..273468e 100644 --- a/tests/dependent_rule_dedup.rs +++ b/tests/dependent_rule_dedup.rs @@ -145,60 +145,3 @@ fn dedup_still_merges_non_dependency_rules_across_blobs() -> Result<()> { Ok(()) } - -#[test] -fn dedup_uses_a_stable_canonical_representative() -> Result<()> { - let rule = make_rule("RULE.SIMPLE", vec![]); - - let make_store = |rule: &Arc| { - let mut store = FindingsStore::new(PathBuf::from("/tmp")); - store.record_rules(&[rule.clone()]); - store - }; - - let origin_a = Arc::new(OriginSet::single(Origin::from_file(PathBuf::from("a.txt")))); - let origin_z = Arc::new(OriginSet::single(Origin::from_file(PathBuf::from("z.txt")))); - let blob_a = Arc::new(BlobMetadata { - id: BlobId::new(b"blob-a"), - num_bytes: 10, - mime_essence: None, - language: None, - }); - let blob_z = Arc::new(BlobMetadata { - id: BlobId::new(b"blob-z"), - num_bytes: 10, - mime_essence: None, - language: None, - }); - - let forward = vec![ - record_match(&origin_z, &blob_z, make_match(rule.clone(), blob_z.id, "shared_token")), - record_match(&origin_a, &blob_a, make_match(rule.clone(), blob_a.id, "shared_token")), - ]; - let reverse = vec![ - record_match(&origin_a, &blob_a, make_match(rule.clone(), blob_a.id, "shared_token")), - record_match(&origin_z, &blob_z, make_match(rule.clone(), blob_z.id, "shared_token")), - ]; - - let mut forward_store = make_store(&rule); - forward_store.record(forward, true); - - let mut reverse_store = make_store(&rule); - reverse_store.record(reverse, true); - - for store in [&forward_store, &reverse_store] { - assert_eq!(store.get_matches().len(), 1); - - let (origin, blob, matched) = &*store.get_matches()[0]; - assert_eq!(origin.len(), 2, "duplicate findings should merge origins"); - assert_eq!( - origin.first().full_path().as_deref(), - Some(PathBuf::from("a.txt").as_path()), - "the lexicographically smallest path should be the representative", - ); - assert_eq!(blob.id, blob_a.id); - assert_eq!(matched.blob_id, blob_a.id); - } - - Ok(()) -} From 2de703105f1b324a2935a4255e3e7751a1e03b32 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 9 Apr 2026 21:06:51 -0700 Subject: [PATCH 2/3] fixed performance regression --- CHANGELOG.md | 1 + src/scanner/enumerate.rs | 2 +- src/scanner/repos.rs | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94ec62d..185812d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file. ## [v1.95.0] +- Fixed scan performance regression: the rule profiler was unconditionally active even without `--rule-stats`, causing RwLock contention across scan threads. Scans are now ~15% faster than v1.94.0. - Added 80+ built-in rules, bringing the bundled ruleset to 820 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more. - Replaced tree-sitter with a lighter parser-based context verifier built from handwritten lexers plus `tl`/`cssparser`, preserving context-dependent matching while cutting about 19 MB from the release binary. - Added a `validation: type: Raw` exception path for provider-specific checks, with new raw validators for Azure Batch, FTP, Kraken, LDAP, RabbitMQ, and Redis. Also added stable request-scoped template values plus new Liquid filters for HMAC-SHA384 hex output and timestamp generation. diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index fb4fc13..feb3959 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -172,7 +172,7 @@ pub fn enumerate_filesystem_inputs( &seen_blobs, Some(&matcher_stats), enable_profiling, - Some(shared_profiler), + if enable_profiling { Some(shared_profiler) } else { None }, &args.extra_ignore_comments, args.no_inline_ignore, !args.no_ignore_if_contains, diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index 3b354b0..e975914 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -867,7 +867,7 @@ pub async fn fetch_s3_objects( &seen_blobs, Some(matcher_stats), enable_profiling, - Some(shared_profiler.clone()), + if enable_profiling { Some(shared_profiler.clone()) } else { None }, &args.extra_ignore_comments, args.no_inline_ignore, !args.no_ignore_if_contains, @@ -949,7 +949,7 @@ pub async fn fetch_gcs_objects( &seen_blobs, Some(matcher_stats), enable_profiling, - Some(shared_profiler.clone()), + if enable_profiling { Some(shared_profiler.clone()) } else { None }, &args.extra_ignore_comments, args.no_inline_ignore, !args.no_ignore_if_contains, From 365422a819bb304d9536a42c14f22ab67281c05f Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 9 Apr 2026 22:21:02 -0700 Subject: [PATCH 3/3] fixed performance regression --- CHANGELOG.md | 2 +- README.md | 2 +- docs-site/docs/assets/stylesheets/extra.css | 20 ++++++++ docs-site/docs/changelog.md | 3 +- docs-site/docs/features/access-map.md | 2 +- docs-site/docs/features/parsing.md | 4 +- docs-site/docs/features/revocation.md | 11 +++-- docs-site/docs/index.md | 2 +- docs-site/docs/reference/library.md | 2 +- docs-site/docs/rules/builtin-rules.md | 54 ++++++++++++++++----- docs-site/docs/usage/advanced.md | 2 +- docs-site/mkdocs.yml | 2 +- docs-site/overrides/home.html | 8 +-- docs-site/overrides/main.html | 2 +- docs-site/scripts/prepare-docs.py | 6 +-- docs/ADVANCED.md | 2 +- docs/CONTEXT_VERIFICATION.md | 4 +- docs/LIBRARY.md | 2 +- docs/PARSING.md | 2 +- docs/REVOCATION_PROVIDERS.md | 9 +++- 20 files changed, 102 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 185812d..fc4bea8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. ## [v1.95.0] - Fixed scan performance regression: the rule profiler was unconditionally active even without `--rule-stats`, causing RwLock contention across scan threads. Scans are now ~15% faster than v1.94.0. -- Added 80+ built-in rules, bringing the bundled ruleset to 820 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more. +- Added 80+ built-in rules, bringing the bundled ruleset to 825 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more. - Replaced tree-sitter with a lighter parser-based context verifier built from handwritten lexers plus `tl`/`cssparser`, preserving context-dependent matching while cutting about 19 MB from the release binary. - Added a `validation: type: Raw` exception path for provider-specific checks, with new raw validators for Azure Batch, FTP, Kraken, LDAP, RabbitMQ, and Redis. Also added stable request-scoped template values plus new Liquid filters for HMAC-SHA384 hex output and timestamp generation. - Expanded live validation coverage for several built-in rules, including Agora, Bitfinex, DocuSign, Dwolla, GitLab, KuCoin, RingCentral, Snowflake, Tableau, Trello, and Webex. Also tightened newly added helper regex to avoid high-match scan regressions, and made preflight-blocked raw validations report as skipped/not attempted instead of failed. diff --git a/README.md b/README.md index d2ddc61..9de1ebf 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ License - Detection Rules + Detection Rules
diff --git a/docs-site/docs/assets/stylesheets/extra.css b/docs-site/docs/assets/stylesheets/extra.css index c590f93..2930eaf 100644 --- a/docs-site/docs/assets/stylesheets/extra.css +++ b/docs-site/docs/assets/stylesheets/extra.css @@ -43,6 +43,9 @@ .kf-hero__actions .md-button { color: var(--md-primary-bg-color); border-color: var(--md-primary-bg-color); + font-size: 0.95rem; + font-weight: 600; + padding: 0.8em 1.4em; } .kf-hero__actions .md-button--primary { @@ -152,6 +155,17 @@ border-radius: 0.5rem; } +.kf-benchmark__link { + margin-top: 1rem; + text-align: center; +} + +.kf-benchmark__link a { + display: inline-block; + font-size: 0.95rem; + font-weight: 600; +} + /* Scan targets */ .kf-targets { padding: 4rem 0; @@ -220,6 +234,12 @@ margin-top: 2rem; } +.kf-install__cta .md-button { + font-size: 0.95rem; + font-weight: 600; + padding: 0.8em 1.4em; +} + /* ===== Rules Table ===== */ .rules-search { diff --git a/docs-site/docs/changelog.md b/docs-site/docs/changelog.md index 0e5334f..de4e611 100644 --- a/docs-site/docs/changelog.md +++ b/docs-site/docs/changelog.md @@ -8,7 +8,8 @@ description: "Kingfisher release history: new features, rules, bug fixes, and im All notable changes to this project will be documented in this file. ## [v1.95.0] -- Added 80+ built-in rules, bringing the bundled ruleset to 820 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more. +- Fixed scan performance regression: the rule profiler was unconditionally active even without `--rule-stats`, causing RwLock contention across scan threads. Scans are now ~15% faster than v1.94.0. +- Added 80+ built-in rules, bringing the bundled ruleset to 825 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more. - Replaced tree-sitter with a lighter parser-based context verifier built from handwritten lexers plus `tl`/`cssparser`, preserving context-dependent matching while cutting about 19 MB from the release binary. - Added a `validation: type: Raw` exception path for provider-specific checks, with new raw validators for Azure Batch, FTP, Kraken, LDAP, RabbitMQ, and Redis. Also added stable request-scoped template values plus new Liquid filters for HMAC-SHA384 hex output and timestamp generation. - Expanded live validation coverage for several built-in rules, including Agora, Bitfinex, DocuSign, Dwolla, GitLab, KuCoin, RingCentral, Snowflake, Tableau, Trello, and Webex. Also tightened newly added helper regex to avoid high-match scan regressions, and made preflight-blocked raw validations report as skipped/not attempted instead of failed. diff --git a/docs-site/docs/features/access-map.md b/docs-site/docs/features/access-map.md index fea6f59..e62e679 100644 --- a/docs-site/docs/features/access-map.md +++ b/docs-site/docs/features/access-map.md @@ -1,6 +1,6 @@ --- title: "Access Map (Blast Radius)" -description: "Map the blast radius of leaked credentials. Authenticate against 39 providers to enumerate accessible resources and permissions." +description: "Map the blast radius of leaked credentials by authenticating and enumerating accessible resources and permissions." --- # Access Map: supported tokens & credential formats diff --git a/docs-site/docs/features/parsing.md b/docs-site/docs/features/parsing.md index dc0c04c..0d53695 100644 --- a/docs-site/docs/features/parsing.md +++ b/docs-site/docs/features/parsing.md @@ -1,6 +1,6 @@ --- title: "Source Code Parsing" -description: "Language-aware secret detection using tree-sitter parsing for 13+ languages including Python, JavaScript, Go, Rust, and more." +description: "Language-aware secret detection using lightweight parser-based context verification across 16 supported source and config languages." --- # Kingfisher Source Code Parsing @@ -13,7 +13,7 @@ The implementation favors lightweight extractors over full AST parsing: - **`tl`** for HTML — attribute values, element text, and embedded `