changes in response to PR review

This commit is contained in:
Mick Grove 2026-04-08 16:16:31 -07:00
commit 58e9cfd585
15 changed files with 1092 additions and 143 deletions

View file

@ -7,7 +7,7 @@
<img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" alt="License" style="height: 24px;" />
</a>
<a href="https://github.com/mongodb/kingfisher">
<img src="https://img.shields.io/badge/Detection%20Rules-820-2ea043.svg" alt="Detection Rules" style="height: 24px;" />
<img src="https://img.shields.io/badge/Detection%20Rules-821-2ea043.svg" alt="Detection Rules" style="height: 24px;" />
</a>
<br>
<a href="https://github.com/mongodb/kingfisher/pkgs/container/kingfisher">

View file

@ -14,6 +14,8 @@ use crate::finding::{Finding, FindingLocation};
use crate::primitives;
use crate::scanner_pool::ScannerPool;
const RAW_MATCH_LOOKBACK: usize = 64 * 1024;
/// Configuration options for the scanner.
#[derive(Debug, Clone)]
pub struct ScannerConfig {
@ -167,9 +169,14 @@ impl Scanner {
// Process matches through regex
let mut findings = Vec::new();
let mut seen_matches: FxHashSet<u64> = FxHashSet::default();
let mut previous_spans: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
let mut seen_raw_match_ends: FxHashSet<(usize, usize)> = FxHashSet::default();
let mut previous_full_spans: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
for (rule_id, start, end) in raw_matches.into_iter().rev() {
let _ = start; // Block-mode Vectorscan reports `from` as 0 unless SOM is enabled.
if !seen_raw_match_ends.insert((rule_id, end)) {
continue;
}
let rule = match self.rules_db.get_rule(rule_id) {
Some(r) => r,
None => continue,
@ -180,16 +187,18 @@ impl Scanner {
Err(_) => continue,
};
let current_span = OffsetSpan::from_range(start..end);
// Check for overlapping spans
if !primitives::record_match(&mut previous_spans, rule_id, current_span) {
continue;
}
let haystack = &bytes[start..end];
let scan_start = end.saturating_sub(RAW_MATCH_LOOKBACK);
let haystack = &bytes[scan_start..end];
for captures in anchored_regex.captures_iter(haystack) {
let full_capture = captures.get(0).unwrap();
let full_capture_span = OffsetSpan::from_range(
(scan_start + full_capture.start())..(scan_start + full_capture.end()),
);
if !primitives::record_match(&mut previous_full_spans, rule_id, full_capture_span) {
continue;
}
// Get the primary secret value
let secret_capture = primitives::find_secret_capture(&anchored_regex, &captures);
let secret_bytes = secret_capture.as_bytes();
@ -203,20 +212,20 @@ impl Scanner {
}
// Compute match key for dedup
let offset_start = scan_start + secret_capture.start();
let offset_end = scan_start + secret_capture.end();
let match_key = primitives::compute_match_key(
secret_bytes,
rule.id().as_bytes(),
start + secret_capture.start(),
start + secret_capture.end(),
offset_start,
offset_end,
);
if !seen_matches.insert(match_key) {
continue;
}
// Build the finding
let offset_span = OffsetSpan::from_range(
(start + secret_capture.start())..(start + secret_capture.end()),
);
let offset_span = OffsetSpan::from_range(offset_start..offset_end);
let source_span = loc_mapping.get_source_span(&offset_span);
let secret = if self.config.redact_secrets {

View file

@ -1,6 +1,6 @@
---
title: "Source Code Parsing"
description: "Language-aware secret detection using lightweight lexers for 16 languages including Python, JavaScript, Go, Rust, and more."
description: "Language-aware secret detection using lightweight handwritten lexers for 16 languages including Python, JavaScript, Go, Rust, and more."
---
# Kingfisher Source Code Parsing
@ -11,7 +11,7 @@ The implementation favors lightweight extractors over full AST parsing:
- **Handwritten lexers** for common programming and config languages — comment-aware stripping followed by regex-based `key = value` extraction
- **`tl`** for HTML — attribute values, element text, and embedded `<script>` / `<style>` delegation
- **`cssparser`** for CSS — declaration parsing via Mozilla's CSS tokenizer
- **`cssparser`** for CSS — declaration parsing via Mozilla's CSS tokenizer
> **History:** Earlier versions used tree-sitter with 17 statically-linked
> grammar crates. This added ~20 MB to the binary and required building a
@ -19,9 +19,9 @@ The implementation favors lightweight extractors over full AST parsing:
> approach achieves the same extraction quality with near-zero binary overhead
> and no external grammar dependencies.
## How It's Called
## How It's Called
In the scanning phase (in the Matcher's implementation), Kingfisher does the following:
In the scanning phase (in the Matcher's implementation), Kingfisher does the following:
- **Primary Regex Pass:** Kingfisher always scans the full blob with Vectorscan/Hyperscan first.
- **Candidate Selection:** Findings from rules classified as context-dependent become parser-verification candidates.
@ -42,8 +42,8 @@ The design supports many common source code languages. The Language enum (define
Context verification is skipped in certain cases:
- **No Language Identified:** If the file isn't recognized as belonging to one of the supported languages or no language hint is provided, the context verifier isn't even constructed.
- **Non-source Files:** Binary files or files that aren't expected to contain code (or aren't extracted from archives) bypass parser-based context verification.
- **No Language Identified:** If the file isn't recognized as belonging to one of the supported languages or no language hint is provided, the context verifier isn't even constructed.
- **Non-source Files:** Binary files or files that aren't expected to contain code (or aren't extracted from archives) bypass parser-based context verification.
- **Large Blobs:** Files larger than 2 MiB skip context verification to avoid spending time on generated or minified content.
- **Verification Errors:** If extraction fails, context-dependent matches are suppressed instead of falling back to raw regex hits.

View file

@ -115,7 +115,7 @@ flowchart LR
- `src/parser.rs` and `src/parser/*`: parser-based context verification for language-aware matching, with handwritten lexers plus lightweight HTML and CSS parsers.
- `src/scanner_pool.rs`: thread-local vectorscan `BlockScanner` pool, providing safe reuse of compiled pattern databases across scan threads.
- `src/reporter.rs` and `src/reporter/*`: report rendering for pretty, JSON, BSON, TOON, SARIF, and HTML outputs, plus the data model used by the viewer.
- `src/direct_validate.rs`: direct validation of a known secret without going through pattern matching. Supports HTTP, AWS, Azure, GCP, JDBC, MongoDB, MySQL, PostgreSQL, JWT, and Coinbase validators, with Liquid template integration for custom validation logic.
- `src/direct_validate.rs`: direct validation of a known secret without going through pattern matching. Supports HTTP, gRPC, plus schema-level typed validators such as AWS, AzureStorage, GCP, JDBC, MongoDB, MySQL, PostgreSQL, JWT, and Coinbase, and delegates ad-hoc `Raw` validators to `crates/kingfisher-scanner/src/validation/raw.rs`.
- `src/direct_revoke.rs`: direct revocation of a known secret without going through the scan pipeline. Uses Liquid templates for revocation configurations and supports multi-step HTTP revocation flows.
- `src/access_map.rs` and `src/access_map/*`: standalone blast-radius mapping with 24 provider implementations including AWS, Azure, GCP, GitHub, GitLab, Slack, Bitbucket, Gitea, Hugging Face, Buildkite, Anthropic, OpenAI, and more.
@ -123,6 +123,7 @@ flowchart LR
- The main CLI scan path is implemented primarily in the application modules under `src/`, not in `kingfisher-scanner`.
- `kingfisher-scanner` is still important: it provides the embeddable scanner API plus shared validation and primitive functionality reused by the application.
- The shared validation layer in `crates/kingfisher-scanner/src/validation/` contains both reusable typed validator families and the `Raw` exception-path validators used by rule YAML.
- Direct `validate`, `revoke`, and standalone `access-map` are sibling command paths. They are not downstream stages of `FindingsStore`.
- Reporting is downstream from the datastore, which lets Kingfisher emit multiple output formats and drive the local viewer from the same finding set.
- The matching layer is intentionally hybrid: vectorscan provides high-throughput SIMD-accelerated pattern detection, while regex helpers, Base64 support, and parser-based context verification improve accuracy and reduce false positives.

View file

@ -7,6 +7,7 @@ description: "Benchmark results comparing Kingfisher performance against Truffle
## Runtime Comparison (seconds)
*Lower runtimes are better.*
| Repository | Kingfisher Runtime | TruffleHog Runtime | GitLeaks Runtime |
|------------|--------------------|--------------------|------------------|
| croc | 2.64 | 10.36 | 3.10 |
@ -67,7 +68,7 @@ Note: For GitLeaks and detect-secrets, validated/verified counts are not availab
*Smaller binaries are easier to distribute, deploy in CI, and embed in container images*
<p align="center">
<img src="../assets/images/binary-size-comparison.png" alt="Binary Size Comparison" />
<img src="./binary-size-comparison.png" alt="Binary Size Comparison" />
</p>
## Benchmark Environment

View file

@ -39,7 +39,13 @@ The `kingfisher-scanner` crate supports optional validation features:
| ------- | ----------- |
| `validation` | Core validation support (includes HTTP validation) |
| `validation-http` | HTTP-based validation for API tokens |
| `validation-raw` | Provider/protocol-specific raw validation flows for `validation: type: Raw` rules |
| `validation-aws` | AWS credential validation via STS GetCallerIdentity |
| `validation-azure` | Azure storage credential validation |
| `validation-coinbase` | Coinbase credential validation |
| `validation-gcp` | GCP credential validation |
| `validation-jwt` | JWT validation |
| `validation-database` | MongoDB, MySQL, PostgreSQL, and JDBC validation |
| `validation-all` | Enable all validation features |
## Quick Start
@ -727,9 +733,17 @@ kingfisher-scanner = { git = "https://github.com/mongodb/kingfisher", features =
| ------- | ----------- |
| `validation` | Core validation support with HTTP validation |
| `validation-http` | HTTP-based validation for API tokens |
| `validation-raw` | Provider/protocol-specific raw validation flows for `validation: type: Raw` rules |
| `validation-aws` | AWS credential validation via STS |
| `validation-azure` | Azure storage credential validation |
| `validation-coinbase` | Coinbase credential validation |
| `validation-gcp` | GCP credential validation |
| `validation-jwt` | JWT validation |
| `validation-database` | MongoDB, MySQL, PostgreSQL, and JDBC validation |
| `validation-all` | Enable all validation features |
`validation: type: Raw` is the ad-hoc validator path for provider-specific or protocol-specific checks that are not generic enough to become schema-level validator families. Typed validators such as `AWS`, `GCP`, `MongoDB`, and `JWT` remain separate validator kinds in the rule schema.
### HTTP Validation Example
```rust

File diff suppressed because it is too large Load diff

View file

@ -171,9 +171,29 @@ revocation:
| visible | false to hide nonsecret captures (e.g. IDs) |
| depends_on_rule | Chain rules: use captures from one rule in another's validation |
| pattern_requirements | Require character types and/or exclude placeholder words from matches |
| validation | Configure HTTP, AWS, GCP, etc. checks to verify live validity |
| validation | Configure `Http`, `Grpc`, typed validators (`AWS`, `GCP`, etc.), or `Raw` exception-path checks to verify live validity |
| revocation | Configure HTTP, AWS, or multi-step revocation for a detected secret |
## Validation Types
Kingfisher supports three validation buckets:
1. `Http` and `Grpc`: YAML-native validation flows. Prefer these first.
2. Typed validators: schema-level validation families already modeled in the rule schema, such as `AWS`, `AzureStorage`, `Coinbase`, `GCP`, `MongoDB`, `MySQL`, `Postgres`, `Jdbc`, and `JWT`.
3. Raw validators: provider-specific or protocol-specific exception paths dispatched through `validation: type: Raw`.
Raw validation looks like this:
```yaml
validation:
type: Raw
content: kraken
```
Use `Raw` only when the provider check cannot be expressed reliably with `Http` or `Grpc` and does not justify a new reusable validator family. Raw validator implementations live in `crates/kingfisher-scanner/src/validation/raw.rs`.
Typed validators are safer and more reusable because the validator kind is part of the schema. `Raw` validators are string-dispatched and fail at runtime if the `content` name is unknown. If you need a Rust-backed exception path for one provider, prefer `Raw`; reserve new typed validators for stable validation families that can be reused across rules.
## gRPC Validation (Grpc)
Some services (notably CLI/SDK control planes) are **gRPC-only**. For these, `validation: type: Http`
@ -473,6 +493,7 @@ Below is the complete list of Liquid filters available in Kingfisher, along with
| `hmac_sha1` | `key` (string) | Computes HMAC-SHA1 over the input, returns Base64-encoded result. | `{{ TOKEN \| hmac_sha1: "secret-key" }}` |
| `hmac_sha256` | `key` (string) | Computes HMAC-SHA256 over the input, returns Base64-encoded result. | `{{ TOKEN \| hmac_sha256: "secret-key" }}` |
| `hmac_sha384` | `key` (string) | Computes HMAC-SHA384 over the input, returns Base64-encoded result. | `{{ TOKEN \| hmac_sha384: "secret-key" }}` |
| `hmac_sha384_hex` | `key` (string) | Computes HMAC-SHA384 over the input, returns lowercase hexadecimal output. | `{{ TOKEN \| hmac_sha384_hex: "secret-key" }}` |
| `hmac_sha256_b64key` | `key` (string, base64-encoded) | Decodes the key from Base64 to raw bytes, then computes HMAC-SHA256. Returns Base64. Use for Azure SAS and other protocols where the signing key is base64-encoded. | `{{ to_sign \| hmac_sha256_b64key: TOKEN }}` |
| `random_string` | `len` (integer, optional) | Generates a cryptographically-secure random alphanumeric string of the specified length (default: 32). | `{{ "" \| random_string: 16 }}` |
| `prefix` | `len` (integer, optional) | Returns the first `len` characters from the string (default: full). | `{{ TOKEN \| prefix: 6 }}` |
@ -481,8 +502,10 @@ Below is the complete list of Liquid filters available in Kingfisher, along with
| `url_encode` | | Percent-encodes the input according to RFC 3986. | `{{ TOKEN \| url_encode }}` |
| `json_escape` | | Escapes special characters so a string can be safely injected into JSON contexts. | `{{ TOKEN \| json_escape }}` |
| `unix_timestamp` | | Returns the current Unix epoch time in seconds (UTC). | `{{ "" \| unix_timestamp }}` |
| `unix_timestamp_ms` | | Returns the current Unix epoch time in milliseconds (UTC). | `{{ "" \| unix_timestamp_ms }}` |
| `iso_timestamp` | | Returns the current UTC timestamp in full ISO-8601 format (may include fractional seconds). | `{{ "" \| iso_timestamp }}` |
| `iso_timestamp_no_frac` | | Current ISO-8601 timestamp (UTC) **without** fractional seconds. | `{{ "" \| iso_timestamp_no_frac }}` |
| `rfc1123_date` | | Returns the current RFC-1123 timestamp in GMT. | `{{ "" \| rfc1123_date }}` |
| `uuid` | | Generates a random UUIDv4 string. | `{{ "" \| uuid }}` |
| `jwt_header` | | Builds a minimal JWT header JSON (`{"typ":"JWT","alg":…}`) and Base64URL-encodes it. | `{{ "HS256" \| jwt_header }}` |
| `replace` | `from` (string), `to` (string) | Replaces every occurrence of `from` with `to` in the input string. | `{{ "hello world" \| replace: "world", "mars" }}` |
@ -497,6 +520,11 @@ Authorization: Basic {{ "api:" | append: TOKEN | b64enc }}
```
**Runtime Values:** Filters like unix_timestamp and uuid are evaluated at runtime, enabling nonces, timestamps, and unique IDs in your requests.
**Stable Request Values:** HTTP and gRPC validation requests also expose stable per-request template variables. Use these when the same generated value must appear in multiple places within one request. Currently:
- `REQUEST_RFC1123_DATE`
- `REQUEST_UNIX_MILLIS`
### How depends_on_rule Works
- **Dependency Declaration:**
@ -743,7 +771,7 @@ When writing custom rules, consider the following best practices:
1. **Multi-line Regex:** Write your regex patterns over multiple lines for clarity. Use the `(?x)` flag to enable free-spacing mode.
2. **Optimize for Performance:** Structure your regex to minimize backtracking. Use non-capturing groups where possible and keep the pattern as concise as possible.
3. **Validation Integration:** Define a `validation` section if you want to verify the detected secret. You can use Liquid templating to insert dynamic values—use the unnamed capture as `TOKEN` and any named captures in uppercase.
3. **Validation Integration:** Define a `validation` section if you want to verify the detected secret. Prefer `Http` or `Grpc`; use an existing typed validator when the rule matches a supported validator family; use `Raw` only for rare provider-specific exception paths. You can use Liquid templating to insert dynamic values where supported. Use the unnamed capture as `TOKEN` and any named captures in uppercase.
4. **Revocation Integration:** Define a `revocation` section if you want to revoke a detected secret. It uses the same HTTP request format and template variables as `validation`.
5. **Test with Examples:** Always include examples that should match and, optionally, negative examples to ensure your rule behaves as expected.
@ -920,4 +948,5 @@ rules:
words: ['"Arn"']
depends_on_rule:
- rule_id: kingfisher.alibabacloud.1
variable: AKID```
variable: AKID
```

View file

@ -21,6 +21,7 @@ use crate::{
cli::{commands::revoke::RevokeArgs, global::GlobalArgs},
liquid_filters::register_all,
rule_loader::RuleLoader,
template_vars::extract_template_vars,
validation::aws::{revoke_aws_access_key, validate_aws_credentials_input},
validation::gcp::revoke_gcp_service_account_key,
validation::httpvalidation::{build_request_builder, retry_request, validate_response},
@ -88,12 +89,6 @@ fn find_rules_by_selector<'a>(
Ok(matches)
}
/// Collects the upper-cased names of Liquid variables referenced in `text`.
///
/// Only blocks shaped like `{{ VAR }}` or `{{ VAR | filter... }}` are recognised; the
/// variable is the single identifier that directly follows the opening braces.
fn extract_template_vars(text: &str) -> BTreeSet<String> {
    let variable_re = Regex::new(r"\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*(?:\|[^}]*)?\}\}").unwrap();
    let mut names = BTreeSet::new();
    for caps in variable_re.captures_iter(text) {
        if let Some(name) = caps.get(1) {
            names.insert(name.as_str().to_uppercase());
        }
    }
    names
}
/// Extract all template variables used in a revocation configuration.
fn extract_revocation_vars(revocation: &Revocation) -> BTreeSet<String> {
let mut vars = BTreeSet::new();

View file

@ -14,7 +14,6 @@ use anyhow::{anyhow, bail, Context, Result};
use crossbeam_skiplist::SkipMap;
use liquid::Object;
use liquid_core::{Value, ValueView};
use regex::Regex;
use reqwest::Client;
use serde::Serialize;
use tracing::debug;
@ -24,6 +23,7 @@ use crate::{
liquid_filters::register_all,
rule_loader::RuleLoader,
rules::{rule::Rule, HttpValidation, Validation},
template_vars::extract_template_vars,
validation::{
aws::validate_aws_credentials,
azure::validate_azure_storage_credentials,
@ -125,15 +125,6 @@ fn get_global_var(globals: &Object, name: &str) -> Option<String> {
globals.get(name).and_then(|v| v.to_kstr().to_string().into())
}
/// Returns the set of Liquid variable names used in `text`, normalised to upper case.
///
/// Recognises `{{ VAR }}` and `{{ VAR | filter }}`; a variable name starts with a letter or
/// underscore and continues with letters, digits, or underscores.
fn extract_template_vars(text: &str) -> BTreeSet<String> {
    let re = Regex::new(r"\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*(?:\|[^}]*)?\}\}").unwrap();
    re.captures_iter(text)
        // Group 1 is the leading identifier of the block.
        .filter_map(|caps| caps.get(1))
        .map(|name| name.as_str().to_uppercase())
        .collect()
}
/// Extract all template variables used in a validation configuration.
fn extract_validation_vars(validation: &Validation) -> BTreeSet<String> {
let mut vars = BTreeSet::new();

View file

@ -54,6 +54,7 @@ pub mod slack;
pub mod snippet;
pub mod sqlite;
pub mod teams;
pub(crate) mod template_vars;
pub mod toon;
pub mod update;
pub mod util;

View file

@ -169,6 +169,7 @@ pub(crate) fn filter_match<'b>(
start: usize,
end: usize,
matches: &mut Vec<BlobMatch<'b>>,
full_matches: Option<&mut FxHashMap<usize, Vec<OffsetSpan>>>,
previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
rule_id: usize,
seen_matches: &mut FxHashSet<u64>,
@ -183,6 +184,7 @@ pub(crate) fn filter_match<'b>(
) {
let mut timer =
profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
let mut full_matches = full_matches;
let initial_len = matches.len();
@ -192,6 +194,13 @@ pub(crate) fn filter_match<'b>(
for captures in re.captures_iter(haystack) {
let full_capture = captures.get(0).unwrap();
let full_capture_offset_span =
OffsetSpan::from_range((start + full_capture.start())..(start + full_capture.end()));
if let Some(full_matches) = full_matches.as_deref_mut() {
if !record_match(full_matches, rule_id, full_capture_offset_span) {
continue;
}
}
let matching_input_for_entropy = find_secret_capture(re, &captures);
let min_entropy = rule.min_entropy();

View file

@ -33,12 +33,11 @@ use crate::{
};
use kingfisher_scanner::primitives::find_secret_capture;
use self::{
base64_decode::get_base64_strings as get_b64_strings, dedup::record_match, filter::filter_match,
};
use self::{base64_decode::get_base64_strings as get_b64_strings, filter::filter_match};
const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
const RAW_MATCH_LOOKBACK: usize = 64 * 1024; // Re-scan a bounded suffix ending at the raw match.
const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
// The old tree-sitter limit was 128 KiB due to full-AST parsing cost.
// The lightweight regex-based lexer is O(n) line-by-line, so we can afford
@ -243,6 +242,63 @@ impl<'a> Matcher<'a> {
Ok(())
}
/// Turns the raw matches stored in `self.user_data.raw_matches_scratch` into entries of
/// `matches` by calling `filter_match` once per distinct `(rule_id, end_idx)` pair.
///
/// Raw matches are visited in reverse order and their reported `start_idx` is ignored.
/// For each one, the range handed to `filter_match` starts `RAW_MATCH_LOOKBACK` bytes before
/// `end_idx` (clamped at 0) and stops at `end_idx`.
///
/// `match_rule_indices` receives one copy of the rule index for every element that the
/// `filter_match` call appended to `matches`.
fn process_raw_matches<'b>(
&self,
blob: &'b Blob,
origin: &OriginSet,
filename: &str,
redact: bool,
matches: &mut Vec<BlobMatch<'b>>,
previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
seen_matches: &mut FxHashSet<u64>,
match_rule_indices: &mut Vec<usize>,
) where
'a: 'b,
{
let rules_db = self.rules_db;
// Raw matches are keyed on (rule, end offset) only, so repeated reports for the same end
// are processed once.
let mut seen_raw_match_ends: FxHashSet<(usize, usize)> = FxHashSet::default();
// Per-rule spans of full regex captures, local to this call and passed to `filter_match`.
let mut previous_full_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
for &RawMatch { rule_id, start_idx, end_idx } in
self.user_data.raw_matches_scratch.iter().rev()
{
let rule_id_usize: usize = rule_id as usize;
let rule = Arc::clone(&rules_db.rules()[rule_id_usize]);
let re = &rules_db.anchored_regexes()[rule_id_usize];
let end_idx_usize = end_idx as usize;
let _ = start_idx; // Vectorscan block mode does not provide a reliable start offset.
if !seen_raw_match_ends.insert((rule_id_usize, end_idx_usize)) {
continue;
}
// Re-scan a bounded suffix ending at the raw match and dedupe on the
// actual capture spans produced by the anchored regex.
let scan_start = end_idx_usize.saturating_sub(RAW_MATCH_LOOKBACK);
// Remember the length so the number of matches added by this call can be computed below.
let before_len = matches.len();
filter_match(
blob,
rule,
re,
scan_start,
end_idx_usize,
matches,
Some(&mut previous_full_matches),
previous_matches,
rule_id_usize,
seen_matches,
origin,
None,
false,
redact,
filename,
self.profiler.as_ref(),
self.respect_ignore_if_contains,
&self.inline_ignore_config,
);
// One rule index per newly appended match.
match_rule_indices
.extend(std::iter::repeat_n(rule_id_usize, matches.len() - before_len));
}
}
pub fn scan_blob<'b>(
&mut self,
blob: &'b Blob,
@ -289,51 +345,25 @@ impl<'a> Matcher<'a> {
return Ok(ScanResult::New(Vec::new()));
}
let rules_db = self.rules_db;
let mut seen_matches = FxHashSet::default();
let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
let mut match_rule_indices: Vec<usize> = Vec::new();
let blob_len = blob.len();
let mut matches = Vec::new();
let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
for &RawMatch { rule_id, start_idx, end_idx } in
self.user_data.raw_matches_scratch.iter().rev()
{
let rule_id_usize: usize = rule_id as usize;
let rule = Arc::clone(&rules_db.rules()[rule_id_usize]);
let re = &rules_db.anchored_regexes()[rule_id_usize];
let start_idx_usize = start_idx as usize;
let end_idx_usize = end_idx as usize;
let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
continue;
}
let before_len = matches.len();
filter_match(
blob,
rule,
re,
start_idx_usize,
end_idx_usize,
&mut matches,
&mut previous_matches,
rule_id_usize,
&mut seen_matches,
origin,
None,
false,
redact,
&filename,
self.profiler.as_ref(),
self.respect_ignore_if_contains,
&self.inline_ignore_config,
);
match_rule_indices
.extend(std::iter::repeat_n(rule_id_usize, matches.len() - before_len));
}
self.process_raw_matches(
blob,
origin,
&filename,
redact,
&mut matches,
&mut previous_matches,
&mut seen_matches,
&mut match_rule_indices,
);
if !no_base64 {
let rules_db = self.rules_db;
// If the blob contains standalone Base64 blobs, decode and scan them as well
const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep
let mut b64_stack: Vec<(DecodedData, usize)> =
@ -349,6 +379,7 @@ impl<'a> Matcher<'a> {
item.pos_start,
item.pos_end,
&mut matches,
None,
&mut previous_matches,
rule_id_usize,
&mut seen_matches,
@ -379,7 +410,7 @@ impl<'a> Matcher<'a> {
}
}
maybe_apply_context_verification(
rules_db,
self.rules_db,
blob,
lang_hint,
blob_len,
@ -896,6 +927,65 @@ mod test {
Ok(())
}
#[test]
fn bogus_raw_starts_do_not_hide_earlier_matches() -> Result<()> {
    // A single rule that captures a three-letter upper-case value assigned to `key`.
    let rule = Rule::new(RuleSyntax {
        id: "bogus.start".into(),
        name: "bogus start".into(),
        pattern: r#"key\s*=\s*"([A-Z]{3})""#.into(),
        confidence: crate::rules::rule::Confidence::Low,
        min_entropy: 0.0,
        visible: true,
        examples: vec![],
        negative_examples: vec![],
        references: vec![],
        validation: None::<Validation>,
        revocation: None,
        depends_on_rule: vec![],
        pattern_requirements: None,
        tls_mode: None,
    });

    let rules_db = RulesDatabase::from_rules(vec![rule])?;
    let seen = BlobIdMap::new();
    let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
    let mut matcher =
        Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;

    // Two raw matches that share the same start offset but end on different lines.
    matcher.user_data.raw_matches_scratch = vec![
        RawMatch { rule_id: 0, start_idx: 5, end_idx: 9 },
        RawMatch { rule_id: 0, start_idx: 5, end_idx: 19 },
    ];

    let blob = Blob::from_bytes(b"key=\"ABC\"\nkey=\"DEF\"".to_vec());
    let origin = OriginSet::from(Origin::from_file(PathBuf::from("bogus-starts.txt")));

    let mut matches = Vec::new();
    let mut previous_matches = FxHashMap::default();
    let mut seen_matches = FxHashSet::default();
    let mut match_rule_indices = Vec::new();
    matcher.process_raw_matches(
        &blob,
        &origin,
        "bogus-starts.txt",
        false,
        &mut matches,
        &mut previous_matches,
        &mut seen_matches,
        &mut match_rule_indices,
    );

    // Both secrets must be reported, each attributed to rule 0.
    let secrets: Vec<String> = matches
        .iter()
        .map(|found| String::from_utf8_lossy(found.matching_input).into_owned())
        .collect();
    assert_eq!(secrets, vec!["ABC", "DEF"]);
    assert_eq!(match_rule_indices, vec![0, 0]);

    Ok(())
}
#[test]
fn inline_comment_skips_match() -> Result<()> {
let rule = Rule::new(RuleSyntax {

View file

@ -25,6 +25,7 @@ use crate::{
origin::{Origin, OriginSet},
rules::rule::Confidence,
rules::Revocation,
template_vars::extract_template_vars,
validation_body::{self, ValidationResponseBody},
};
mod bson_format;
@ -49,45 +50,6 @@ fn escape_for_shell(s: &str) -> String {
format!("'{}'", s.replace('\'', "'\\''"))
}
/// Matches one `{{ ... }}` block: the opening braces, optional whitespace, a captured run of
/// non-`}` characters (group 1), and the closing braces.
static TEMPLATE_BLOCK_RE: once_cell::sync::Lazy<regex::Regex> = once_cell::sync::Lazy::new(|| {
regex::Regex::new(r"\{\{\s*([^}]*)\}\}").expect("template block regex should compile")
});
/// Matches a single identifier: a letter or underscore followed by letters, digits, or
/// underscores.
static TEMPLATE_IDENT_RE: once_cell::sync::Lazy<regex::Regex> = once_cell::sync::Lazy::new(|| {
regex::Regex::new(r"[A-Za-z_][A-Za-z0-9_]*").expect("template identifier regex should compile")
});
/// Identifiers treated as filter names rather than template variables.
const TEMPLATE_FILTER_NAMES: &[&str] = &[
"append",
"b64enc",
"base62",
"crc32",
"crc32_hex",
"default",
"downcase",
"json_escape",
"prefix",
"replace",
"url_encode",
];
/// Gathers every identifier found inside `{{ ... }}` blocks of `text`, upper-cased, except
/// those that match an entry of `TEMPLATE_FILTER_NAMES` (compared ASCII-case-insensitively).
fn extract_template_vars(text: &str) -> BTreeSet<String> {
    let mut names = BTreeSet::new();
    for block in TEMPLATE_BLOCK_RE.captures_iter(text) {
        let Some(body) = block.get(1) else { continue };
        let identifiers = TEMPLATE_IDENT_RE.find_iter(body.as_str()).map(|found| found.as_str());
        names.extend(
            identifiers
                .filter(|ident| {
                    !TEMPLATE_FILTER_NAMES.iter().any(|filter| filter.eq_ignore_ascii_case(ident))
                })
                .map(str::to_uppercase),
        );
    }
    names
}
fn required_vars_for_validation(validation: &crate::rules::Validation) -> BTreeSet<String> {
use crate::rules::Validation;
let mut vars = BTreeSet::new();

151
src/template_vars.rs Normal file
View file

@ -0,0 +1,151 @@
use std::collections::BTreeSet;
/// Matches one `{{ ... }}` block: the opening braces, optional whitespace, a captured run of
/// non-`}` characters (group 1), and the closing braces.
static TEMPLATE_BLOCK_RE: once_cell::sync::Lazy<regex::Regex> = once_cell::sync::Lazy::new(|| {
regex::Regex::new(r"\{\{\s*([^}]*)\}\}").expect("template block regex should compile")
});
/// Bare words that are never reported as template variables.
const LIQUID_LITERAL_NAMES: &[&str] = &["blank", "empty", "false", "nil", "null", "true"];
/// Returns the upper-cased names of all variables referenced inside `{{ ... }}` blocks of `text`.
///
/// Each block body is split into `|`-separated segments. The first segment is scanned as-is;
/// in every later segment the leading identifier is treated as the filter name and skipped.
pub(crate) fn extract_template_vars(text: &str) -> BTreeSet<String> {
    let mut found = BTreeSet::new();
    let block_bodies = TEMPLATE_BLOCK_RE
        .captures_iter(text)
        .map(|caps| caps.get(1).map_or("", |body| body.as_str()));
    for body in block_bodies {
        // Only segments after the first `|` begin with a filter name.
        let mut is_filter_segment = false;
        for segment in split_filter_segments(body) {
            collect_segment_vars(segment, is_filter_segment, &mut found);
            is_filter_segment = true;
        }
    }
    found
}
/// Splits the body of a template block on `|` characters that are outside string literals.
///
/// Single- and double-quoted strings are honoured, and a backslash inside a string shields
/// the character that follows it. The result always contains at least one segment; segments
/// are returned untrimmed.
fn split_filter_segments(inner: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    let mut piece_start = 0;
    // `Some(q)` while inside a string literal opened with quote character `q`.
    let mut open_quote: Option<char> = None;
    let mut skip_next = false;

    for (pos, c) in inner.char_indices() {
        if skip_next {
            skip_next = false;
            continue;
        }
        match (open_quote, c) {
            (Some(_), '\\') => skip_next = true,
            (Some(quote), _) if c == quote => open_quote = None,
            (Some(_), _) => {}
            (None, '\'' | '"') => open_quote = Some(c),
            (None, '|') => {
                pieces.push(&inner[piece_start..pos]);
                // '|' is a single byte in UTF-8.
                piece_start = pos + 1;
            }
            (None, _) => {}
        }
    }

    pieces.push(&inner[piece_start..]);
    pieces
}
/// Adds every variable-like identifier found in `segment` to `vars`, upper-cased.
///
/// Text inside single- or double-quoted strings is ignored, with a backslash inside a string
/// shielding the next character. When `skip_first_ident` is true, the first identifier
/// encountered is taken to be the filter name and is not recorded. Identifiers listed in
/// `LIQUID_LITERAL_NAMES` (compared ASCII-case-insensitively) are not recorded either.
fn collect_segment_vars(segment: &str, skip_first_ident: bool, vars: &mut BTreeSet<String>) {
    let mut cursor = segment.char_indices().peekable();
    // `Some(q)` while inside a string literal opened with quote character `q`.
    let mut open_quote: Option<char> = None;
    let mut filter_name_pending = skip_first_ident;

    while let Some((start, c)) = cursor.next() {
        if let Some(quote) = open_quote {
            if c == '\\' {
                // Drop the escaped character so it cannot close the string.
                cursor.next();
            } else if c == quote {
                open_quote = None;
            }
            continue;
        }
        if c == '\'' || c == '"' {
            open_quote = Some(c);
            continue;
        }
        if !is_ident_start(c) {
            continue;
        }

        // Consume the remainder of the identifier; it ends where the next unread char begins.
        while cursor.next_if(|&(_, next)| is_ident_continue(next)).is_some() {}
        let end = cursor.peek().map_or(segment.len(), |&(next_start, _)| next_start);
        let ident = &segment[start..end];

        if filter_name_pending {
            filter_name_pending = false;
            continue;
        }
        if LIQUID_LITERAL_NAMES.iter().any(|literal| literal.eq_ignore_ascii_case(ident)) {
            continue;
        }
        vars.insert(ident.to_ascii_uppercase());
    }
}
/// Returns true when `ch` may begin an identifier: an ASCII letter or an underscore.
fn is_ident_start(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_')
}
/// Returns true when `ch` may appear after the first character of an identifier: an ASCII
/// letter, an ASCII digit, or an underscore.
fn is_ident_continue(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
}
#[cfg(test)]
mod tests {
    use super::extract_template_vars;
    use std::collections::BTreeSet;

    /// Builds the expected variable set from a list of names.
    fn expected(names: &[&str]) -> BTreeSet<String> {
        names.iter().map(|name| name.to_string()).collect()
    }

    /// Filter names are dropped, but variables passed as filter arguments are kept.
    #[test]
    fn ignores_filter_names_but_keeps_filter_argument_vars() {
        let template = "{{ NEXT_PUBLIC_VERCEL_APP_CLIENT_ID | default: VERCEL_APP_CLIENT_ID | append: ':' | append: VERCEL_APP_CLIENT_SECRET | b64enc }}";

        assert_eq!(
            extract_template_vars(template),
            expected(&[
                "NEXT_PUBLIC_VERCEL_APP_CLIENT_ID",
                "VERCEL_APP_CLIENT_ID",
                "VERCEL_APP_CLIENT_SECRET",
            ])
        );
    }

    /// Quoted strings contribute nothing, and filter names outside any fixed list are skipped.
    #[test]
    fn ignores_literal_strings_and_new_filter_names() {
        let template = r#"{{ "" | unix_timestamp_ms }} {{ "" | rfc1123_date }} {{ TOKEN | hmac_sha384_hex: SECRET }} {{ "https://example.com/oauth/callback" | url_encode }}"#;

        assert_eq!(extract_template_vars(template), expected(&["SECRET", "TOKEN"]));
    }

    /// Liquid literals such as `blank` used as arguments are not variables.
    #[test]
    fn ignores_liquid_literal_arguments() {
        let template = r#"{{ TOKEN | default: blank | append: FALLBACK }}"#;

        assert_eq!(extract_template_vars(template), expected(&["FALLBACK", "TOKEN"]));
    }
}