changes in response to PR review

2026-04-08 16:16:31 -07:00 · 2026-04-08 16:16:31 -07:00 · 58e9cfd585
commit 58e9cfd585
parent a0934737dc
15 changed files with 1092 additions and 143 deletions
--- a/README.md
+++ b/README.md
@ -7,7 +7,7 @@
    <img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" alt="License" style="height: 24px;" />
  </a>
  <a href="https://github.com/mongodb/kingfisher">
-    <img src="https://img.shields.io/badge/Detection%20Rules-820-2ea043.svg" alt="Detection Rules" style="height: 24px;" />
+    <img src="https://img.shields.io/badge/Detection%20Rules-821-2ea043.svg" alt="Detection Rules" style="height: 24px;" />
  </a>
  <br>
  <a href="https://github.com/mongodb/kingfisher/pkgs/container/kingfisher">
--- a/crates/kingfisher-scanner/src/scanner.rs
+++ b/crates/kingfisher-scanner/src/scanner.rs
@ -14,6 +14,8 @@ use crate::finding::{Finding, FindingLocation};
 use crate::primitives;
 use crate::scanner_pool::ScannerPool;

+const RAW_MATCH_LOOKBACK: usize = 64 * 1024;
+
 /// Configuration options for the scanner.
 #[derive(Debug, Clone)]
 pub struct ScannerConfig {
@ -167,9 +169,14 @@ impl Scanner {
        // Process matches through regex
        let mut findings = Vec::new();
        let mut seen_matches: FxHashSet<u64> = FxHashSet::default();
-        let mut previous_spans: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
+        let mut seen_raw_match_ends: FxHashSet<(usize, usize)> = FxHashSet::default();
+        let mut previous_full_spans: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();

        for (rule_id, start, end) in raw_matches.into_iter().rev() {
+            let _ = start; // Block-mode Vectorscan reports `from` as 0 unless SOM is enabled.
+            if !seen_raw_match_ends.insert((rule_id, end)) {
+                continue;
+            }
            let rule = match self.rules_db.get_rule(rule_id) {
                Some(r) => r,
                None => continue,
@ -180,16 +187,18 @@ impl Scanner {
                Err(_) => continue,
            };

-            let current_span = OffsetSpan::from_range(start..end);
-
-            // Check for overlapping spans
-            if !primitives::record_match(&mut previous_spans, rule_id, current_span) {
-                continue;
-            }
-
-            let haystack = &bytes[start..end];
+            let scan_start = end.saturating_sub(RAW_MATCH_LOOKBACK);
+            let haystack = &bytes[scan_start..end];

            for captures in anchored_regex.captures_iter(haystack) {
+                let full_capture = captures.get(0).unwrap();
+                let full_capture_span = OffsetSpan::from_range(
+                    (scan_start + full_capture.start())..(scan_start + full_capture.end()),
+                );
+                if !primitives::record_match(&mut previous_full_spans, rule_id, full_capture_span) {
+                    continue;
+                }
+
                // Get the primary secret value
                let secret_capture = primitives::find_secret_capture(&anchored_regex, &captures);
                let secret_bytes = secret_capture.as_bytes();
@ -203,20 +212,20 @@ impl Scanner {
                }

                // Compute match key for dedup
+                let offset_start = scan_start + secret_capture.start();
+                let offset_end = scan_start + secret_capture.end();
                let match_key = primitives::compute_match_key(
                    secret_bytes,
                    rule.id().as_bytes(),
-                    start + secret_capture.start(),
-                    start + secret_capture.end(),
+                    offset_start,
+                    offset_end,
                );
                if !seen_matches.insert(match_key) {
                    continue;
                }

                // Build the finding
-                let offset_span = OffsetSpan::from_range(
-                    (start + secret_capture.start())..(start + secret_capture.end()),
-                );
+                let offset_span = OffsetSpan::from_range(offset_start..offset_end);
                let source_span = loc_mapping.get_source_span(&offset_span);

                let secret = if self.config.redact_secrets {
--- a/docs-site/docs/features/parsing.md
+++ b/docs-site/docs/features/parsing.md
@ -1,6 +1,6 @@
 ---
 title: "Source Code Parsing"
-description: "Language-aware secret detection using lightweight lexers for 16 languages including Python, JavaScript, Go, Rust, and more."
+description: "Language-aware secret detection using lightweight handwritten lexers for 16 languages including Python, JavaScript, Go, Rust, and more."
 ---

 # Kingfisher Source Code Parsing
@ -11,7 +11,7 @@ The implementation favors lightweight extractors over full AST parsing:

 - **Handwritten lexers** for common programming and config languages — comment-aware stripping followed by regex-based `key = value` extraction
 - **`tl`** for HTML — attribute values, element text, and embedded `<script>` / `<style>` delegation
- **`cssparser`** for CSS — declaration parsing via Mozilla's CSS tokenizer
+- **`cssparser`** for CSS — declaration parsing via Mozilla’s CSS tokenizer

 > **History:** Earlier versions used tree-sitter with 17 statically-linked
 > grammar crates. This added ~20 MB to the binary and required building a
@ -19,9 +19,9 @@ The implementation favors lightweight extractors over full AST parsing:
 > approach achieves the same extraction quality with near-zero binary overhead
 > and no external grammar dependencies.

-## How It's Called
+## How It’s Called

-In the scanning phase (in the Matcher's implementation), Kingfisher does the following:
+In the scanning phase (in the Matcher’s implementation), Kingfisher does the following:

 - **Primary Regex Pass:** Kingfisher always scans the full blob with Vectorscan/Hyperscan first.
 - **Candidate Selection:** Findings from rules classified as context-dependent become parser-verification candidates.
@ -42,8 +42,8 @@ The design supports many common source code languages. The Language enum (define

 Context verification is skipped in certain cases:

- **No Language Identified:** If the file isn't recognized as belonging to one of the supported languages or no language hint is provided, the context verifier isn't even constructed.
- **Non-source Files:** Binary files or files that aren't expected to contain code (or aren't extracted from archives) bypass parser-based context verification.
+- **No Language Identified:** If the file isn’t recognized as belonging to one of the supported languages or no language hint is provided, the context verifier isn’t even constructed.
+- **Non-source Files:** Binary files or files that aren’t expected to contain code (or aren’t extracted from archives) bypass parser-based context verification.
 - **Large Blobs:** Files larger than 2 MiB skip context verification to avoid spending time on generated or minified content.
 - **Verification Errors:** If extraction fails, context-dependent matches are suppressed instead of falling back to raw regex hits.

--- a/docs-site/docs/reference/architecture.md
+++ b/docs-site/docs/reference/architecture.md
@ -115,7 +115,7 @@ flowchart LR
 - `src/parser.rs` and `src/parser/*`: parser-based context verification for language-aware matching, with handwritten lexers plus lightweight HTML and CSS parsers.
 - `src/scanner_pool.rs`: thread-local vectorscan `BlockScanner` pool, providing safe reuse of compiled pattern databases across scan threads.
 - `src/reporter.rs` and `src/reporter/*`: report rendering for pretty, JSON, BSON, TOON, SARIF, and HTML outputs, plus the data model used by the viewer.
- `src/direct_validate.rs`: direct validation of a known secret without going through pattern matching. Supports HTTP, AWS, Azure, GCP, JDBC, MongoDB, MySQL, PostgreSQL, JWT, and Coinbase validators, with Liquid template integration for custom validation logic.
+- `src/direct_validate.rs`: direct validation of a known secret without going through pattern matching. Supports HTTP and gRPC validation as well as schema-level typed validators such as AWS, AzureStorage, GCP, JDBC, MongoDB, MySQL, PostgreSQL, JWT, and Coinbase, and delegates ad-hoc `Raw` validators to `crates/kingfisher-scanner/src/validation/raw.rs`.
 - `src/direct_revoke.rs`: direct revocation of a known secret without going through the scan pipeline. Uses Liquid templates for revocation configurations and supports multi-step HTTP revocation flows.
 - `src/access_map.rs` and `src/access_map/*`: standalone blast-radius mapping with 24 provider implementations including AWS, Azure, GCP, GitHub, GitLab, Slack, Bitbucket, Gitea, Hugging Face, Buildkite, Anthropic, OpenAI, and more.

@ -123,6 +123,7 @@ flowchart LR

 - The main CLI scan path is implemented primarily in the application modules under `src/`, not in `kingfisher-scanner`.
 - `kingfisher-scanner` is still important: it provides the embeddable scanner API plus shared validation and primitive functionality reused by the application.
+- The shared validation layer in `crates/kingfisher-scanner/src/validation/` contains both reusable typed validator families and the `Raw` exception-path validators used by rule YAML.
 - Direct `validate`, `revoke`, and standalone `access-map` are sibling command paths. They are not downstream stages of `FindingsStore`.
 - Reporting is downstream from the datastore, which lets Kingfisher emit multiple output formats and drive the local viewer from the same finding set.
 - The matching layer is intentionally hybrid: vectorscan provides high-throughput SIMD-accelerated pattern detection, while regex helpers, Base64 support, and parser-based context verification improve accuracy and reduce false positives.
--- a/docs-site/docs/reference/comparison.md
+++ b/docs-site/docs/reference/comparison.md
@ -7,6 +7,7 @@ description: "Benchmark results comparing Kingfisher performance against Truffle

 ## Runtime Comparison (seconds)
 *Lower runtimes are better.*
+
 | Repository | Kingfisher Runtime | TruffleHog Runtime | GitLeaks Runtime |
 |------------|--------------------|--------------------|------------------|
 | croc | 2.64 | 10.36 | 3.10 |
@ -67,7 +68,7 @@ Note: For GitLeaks and detect-secrets, validated/verified counts are not availab
 *Smaller binaries are easier to distribute, deploy in CI, and embed in container images*

 <p align="center">
-  <img src="../assets/images/binary-size-comparison.png" alt="Binary Size Comparison" />
+  <img src="./binary-size-comparison.png" alt="Binary Size Comparison" />
 </p>

 ## Benchmark Environment
--- a/docs-site/docs/reference/library.md
+++ b/docs-site/docs/reference/library.md
@ -39,7 +39,13 @@ The `kingfisher-scanner` crate supports optional validation features:
 | ------- | ----------- |
 | `validation` | Core validation support (includes HTTP validation) |
 | `validation-http` | HTTP-based validation for API tokens |
+| `validation-raw` | Provider/protocol-specific raw validation flows for `validation: type: Raw` rules |
 | `validation-aws` | AWS credential validation via STS GetCallerIdentity |
+| `validation-azure` | Azure storage credential validation |
+| `validation-coinbase` | Coinbase credential validation |
+| `validation-gcp` | GCP credential validation |
+| `validation-jwt` | JWT validation |
+| `validation-database` | MongoDB, MySQL, PostgreSQL, and JDBC validation |
 | `validation-all` | Enable all validation features |

 ## Quick Start
@ -727,9 +733,17 @@ kingfisher-scanner = { git = "https://github.com/mongodb/kingfisher", features =
 | ------- | ----------- |
 | `validation` | Core validation support with HTTP validation |
 | `validation-http` | HTTP-based validation for API tokens |
+| `validation-raw` | Provider/protocol-specific raw validation flows for `validation: type: Raw` rules |
 | `validation-aws` | AWS credential validation via STS |
+| `validation-azure` | Azure storage credential validation |
+| `validation-coinbase` | Coinbase credential validation |
+| `validation-gcp` | GCP credential validation |
+| `validation-jwt` | JWT validation |
+| `validation-database` | MongoDB, MySQL, PostgreSQL, and JDBC validation |
 | `validation-all` | Enable all validation features |

+`validation: type: Raw` is the ad-hoc validator path for provider-specific or protocol-specific checks that are not generic enough to become schema-level validator families. Typed validators such as `AWS`, `GCP`, `MongoDB`, and `JWT` remain separate validator kinds in the rule schema.
+
 ### HTTP Validation Example

 ```rust
--- a/docs-site/docs/rules/builtin-rules.md
+++ b/docs-site/docs/rules/builtin-rules.md
--- a/docs-site/docs/rules/overview.md
+++ b/docs-site/docs/rules/overview.md
@ -171,9 +171,29 @@ revocation:
 | visible                 | false to hide non‑secret captures (e.g. IDs)                         |
 | depends_on_rule         | Chain rules: use captures from one rule in another's validation      |
 | pattern_requirements  | Require character types and/or exclude placeholder words from matches |
-| validation              | Configure HTTP, AWS, GCP, etc. checks to verify live validity        |
+| validation              | Configure `Http`, `Grpc`, typed validators (`AWS`, `GCP`, etc.), or `Raw` exception-path checks to verify live validity |
 | revocation              | Configure HTTP, AWS, or multi-step revocation for a detected secret  |

+## Validation Types
+
+Kingfisher supports three validation buckets:
+
+1. `Http` and `Grpc`: YAML-native validation flows. Prefer these first.
+2. Typed validators: schema-level validation families already modeled in the rule schema, such as `AWS`, `AzureStorage`, `Coinbase`, `GCP`, `MongoDB`, `MySQL`, `Postgres`, `Jdbc`, and `JWT`.
+3. Raw validators: provider-specific or protocol-specific exception paths dispatched through `validation: type: Raw`.
+
+Raw validation looks like this:
+
+```yaml
+validation:
+  type: Raw
+  content: kraken
+```
+
+Use `Raw` only when the provider check cannot be expressed reliably with `Http` or `Grpc` and does not justify a new reusable validator family. Raw validator implementations live in `crates/kingfisher-scanner/src/validation/raw.rs`.
+
+Typed validators are safer and more reusable because the validator kind is part of the schema. `Raw` validators are string-dispatched and fail at runtime if the `content` name is unknown. If you need a Rust-backed exception path for one provider, prefer `Raw`; reserve new typed validators for stable validation families that can be reused across rules.
+
 ## gRPC Validation (Grpc)

 Some services (notably CLI/SDK control planes) are **gRPC-only**. For these, `validation: type: Http`
@ -473,6 +493,7 @@ Below is the complete list of Liquid filters available in Kingfisher, along with
 | `hmac_sha1`           | `key` (string)                               | Computes HMAC-SHA1 over the input, returns Base64-encoded result.                                              | `{{ TOKEN \| hmac_sha1: "secret-key" }}`                             |
 | `hmac_sha256`         | `key` (string)                               | Computes HMAC-SHA256 over the input, returns Base64-encoded result.                                            | `{{ TOKEN \| hmac_sha256: "secret-key" }}`                           |
 | `hmac_sha384`         | `key` (string)                               | Computes HMAC-SHA384 over the input, returns Base64-encoded result.                                            | `{{ TOKEN \| hmac_sha384: "secret-key" }}`                           |
+| `hmac_sha384_hex`     | `key` (string)                               | Computes HMAC-SHA384 over the input, returns lowercase hexadecimal output.                                     | `{{ TOKEN \| hmac_sha384_hex: "secret-key" }}`                       |
 | `hmac_sha256_b64key`  | `key` (string, base64-encoded)               | Decodes the key from Base64 to raw bytes, then computes HMAC-SHA256. Returns Base64. Use for Azure SAS and other protocols where the signing key is base64-encoded. | `{{ to_sign \| hmac_sha256_b64key: TOKEN }}`                         |
 | `random_string`       | `len` (integer, optional)                    | Generates a cryptographically-secure random alphanumeric string of the specified length (default: 32).        | `{{ "" \| random_string: 16 }}`                                      |
 | `prefix`              | `len` (integer, optional)                    | Returns the first `len` characters from the string (default: full).                                            | `{{ TOKEN \| prefix: 6 }}`                                           |
@ -481,8 +502,10 @@ Below is the complete list of Liquid filters available in Kingfisher, along with
 | `url_encode`          | –                                            | Percent-encodes the input according to RFC 3986.                                                                | `{{ TOKEN \| url_encode }}`                                          |
 | `json_escape`         | –                                            | Escapes special characters so a string can be safely injected into JSON contexts.                              | `{{ TOKEN \| json_escape }}`                                         |
 | `unix_timestamp`      | –                                            | Returns the current Unix epoch time in seconds (UTC).                                                          | `{{ "" \| unix_timestamp }}`                                         |
+| `unix_timestamp_ms`   | –                                            | Returns the current Unix epoch time in milliseconds (UTC).                                                     | `{{ "" \| unix_timestamp_ms }}`                                      |
 | `iso_timestamp`       | –                                            | Returns the current UTC timestamp in full ISO-8601 format (may include fractional seconds).                    | `{{ "" \| iso_timestamp }}`                                          |
 | `iso_timestamp_no_frac` | –                                          | Current ISO-8601 timestamp (UTC) **without** fractional seconds.                                               | `{{ "" \| iso_timestamp_no_frac }}`                                  |
+| `rfc1123_date`        | –                                            | Returns the current RFC-1123 timestamp in GMT.                                                                 | `{{ "" \| rfc1123_date }}`                                           |
 | `uuid`                | –                                            | Generates a random UUIDv4 string.                                                                              | `{{ "" \| uuid }}`                                                   |
 | `jwt_header`          | –                                            | Builds a minimal JWT header JSON (`{"typ":"JWT","alg":…}`) and Base64URL-encodes it.                           | `{{ "HS256" \| jwt_header }}`                                        |
 | `replace`             | `from` (string), `to` (string)               | Replaces every occurrence of `from` with `to` in the input string.                                             | `{{ "hello world" \| replace: "world", "mars" }}`                    |
@ -497,6 +520,11 @@ Authorization: Basic {{ "api:" | append: TOKEN | b64enc }}
 ```

 **Runtime Values:** Filters like unix_timestamp and uuid are evaluated at runtime, enabling nonces, timestamps, and unique IDs in your requests.
+
+**Stable Request Values:** HTTP and gRPC validation requests also expose stable per-request template variables. Use these when the same generated value must appear in multiple places within one request. The currently available variables are:
+- `REQUEST_RFC1123_DATE`
+- `REQUEST_UNIX_MILLIS`
+
 ### How depends_on_rule Works

 - **Dependency Declaration:**  
@ -743,7 +771,7 @@ When writing custom rules, consider the following best practices:

 1. **Multi-line Regex:** Write your regex patterns over multiple lines for clarity. Use the `(?x)` flag to enable free-spacing mode.
 2. **Optimize for Performance:** Structure your regex to minimize backtracking. Use non-capturing groups where possible and keep the pattern as concise as possible.
-3. **Validation Integration:** Define a `validation` section if you want to verify the detected secret. You can use Liquid templating to insert dynamic values—use the unnamed capture as `TOKEN` and any named captures in uppercase.
+3. **Validation Integration:** Define a `validation` section if you want to verify the detected secret. Prefer `Http` or `Grpc`; use an existing typed validator when the rule matches a supported validator family; use `Raw` only for rare provider-specific exception paths. You can use Liquid templating to insert dynamic values where supported. Use the unnamed capture as `TOKEN` and any named captures in uppercase.
 4. **Revocation Integration:** Define a `revocation` section if you want to revoke a detected secret. It uses the same HTTP request format and template variables as `validation`.
 5. **Test with Examples:** Always include examples that should match and, optionally, negative examples to ensure your rule behaves as expected.

@ -920,4 +948,5 @@ rules:
              words: ['"Arn"']
    depends_on_rule:
      - rule_id: kingfisher.alibabacloud.1
-        variable: AKID```
+        variable: AKID
+```
--- a/src/direct_revoke.rs
+++ b/src/direct_revoke.rs
@ -21,6 +21,7 @@ use crate::{
    cli::{commands::revoke::RevokeArgs, global::GlobalArgs},
    liquid_filters::register_all,
    rule_loader::RuleLoader,
+    template_vars::extract_template_vars,
    validation::aws::{revoke_aws_access_key, validate_aws_credentials_input},
    validation::gcp::revoke_gcp_service_account_key,
    validation::httpvalidation::{build_request_builder, retry_request, validate_response},
@ -88,12 +89,6 @@ fn find_rules_by_selector<'a>(
    Ok(matches)
 }

-/// Extract Liquid template variable names from a string.
-fn extract_template_vars(text: &str) -> BTreeSet<String> {
-    let re = Regex::new(r"\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*(?:\|[^}]*)?\}\}").unwrap();
-    re.captures_iter(text).filter_map(|cap| cap.get(1).map(|m| m.as_str().to_uppercase())).collect()
-}
-
 /// Extract all template variables used in a revocation configuration.
 fn extract_revocation_vars(revocation: &Revocation) -> BTreeSet<String> {
    let mut vars = BTreeSet::new();
--- a/src/direct_validate.rs
+++ b/src/direct_validate.rs
@ -14,7 +14,6 @@ use anyhow::{anyhow, bail, Context, Result};
 use crossbeam_skiplist::SkipMap;
 use liquid::Object;
 use liquid_core::{Value, ValueView};
-use regex::Regex;
 use reqwest::Client;
 use serde::Serialize;
 use tracing::debug;
@ -24,6 +23,7 @@ use crate::{
    liquid_filters::register_all,
    rule_loader::RuleLoader,
    rules::{rule::Rule, HttpValidation, Validation},
+    template_vars::extract_template_vars,
    validation::{
        aws::validate_aws_credentials,
        azure::validate_azure_storage_credentials,
@ -125,15 +125,6 @@ fn get_global_var(globals: &Object, name: &str) -> Option<String> {
    globals.get(name).and_then(|v| v.to_kstr().to_string().into())
 }

-/// Extract Liquid template variable names from a string.
-/// Matches patterns like {{ VAR }} or {{ VAR | filter }}.
-fn extract_template_vars(text: &str) -> BTreeSet<String> {
-    // Match {{ VAR }} or {{ VAR | filter }} patterns
-    // Variable names are alphanumeric with underscores
-    let re = Regex::new(r"\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*(?:\|[^}]*)?\}\}").unwrap();
-    re.captures_iter(text).filter_map(|cap| cap.get(1).map(|m| m.as_str().to_uppercase())).collect()
-}
-
 /// Extract all template variables used in a validation configuration.
 fn extract_validation_vars(validation: &Validation) -> BTreeSet<String> {
    let mut vars = BTreeSet::new();
--- a/src/lib.rs
+++ b/src/lib.rs
@ -54,6 +54,7 @@ pub mod slack;
 pub mod snippet;
 pub mod sqlite;
 pub mod teams;
+pub(crate) mod template_vars;
 pub mod toon;
 pub mod update;
 pub mod util;
--- a/src/matcher/filter.rs
+++ b/src/matcher/filter.rs
@ -169,6 +169,7 @@ pub(crate) fn filter_match<'b>(
    start: usize,
    end: usize,
    matches: &mut Vec<BlobMatch<'b>>,
+    full_matches: Option<&mut FxHashMap<usize, Vec<OffsetSpan>>>,
    previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
    rule_id: usize,
    seen_matches: &mut FxHashSet<u64>,
@ -183,6 +184,7 @@ pub(crate) fn filter_match<'b>(
 ) {
    let mut timer =
        profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
+    let mut full_matches = full_matches;

    let initial_len = matches.len();

@ -192,6 +194,13 @@ pub(crate) fn filter_match<'b>(

    for captures in re.captures_iter(haystack) {
        let full_capture = captures.get(0).unwrap();
+        let full_capture_offset_span =
+            OffsetSpan::from_range((start + full_capture.start())..(start + full_capture.end()));
+        if let Some(full_matches) = full_matches.as_deref_mut() {
+            if !record_match(full_matches, rule_id, full_capture_offset_span) {
+                continue;
+            }
+        }
        let matching_input_for_entropy = find_secret_capture(re, &captures);

        let min_entropy = rule.min_entropy();
--- a/src/matcher/mod.rs
+++ b/src/matcher/mod.rs
@ -33,12 +33,11 @@ use crate::{
 };
 use kingfisher_scanner::primitives::find_secret_capture;

-use self::{
-    base64_decode::get_base64_strings as get_b64_strings, dedup::record_match, filter::filter_match,
-};
+use self::{base64_decode::get_base64_strings as get_b64_strings, filter::filter_match};

 const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
 const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
+const RAW_MATCH_LOOKBACK: usize = 64 * 1024; // Re-scan a bounded suffix ending at the raw match.
 const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
                                                   // The old tree-sitter limit was 128 KiB due to full-AST parsing cost.
                                                   // The lightweight regex-based lexer is O(n) line-by-line, so we can afford
@ -243,6 +242,63 @@ impl<'a> Matcher<'a> {
        Ok(())
    }

+    fn process_raw_matches<'b>(
+        &self,
+        blob: &'b Blob,
+        origin: &OriginSet,
+        filename: &str,
+        redact: bool,
+        matches: &mut Vec<BlobMatch<'b>>,
+        previous_matches: &mut FxHashMap<usize, Vec<OffsetSpan>>,
+        seen_matches: &mut FxHashSet<u64>,
+        match_rule_indices: &mut Vec<usize>,
+    ) where
+        'a: 'b,
+    {
+        let rules_db = self.rules_db;
+        let mut seen_raw_match_ends: FxHashSet<(usize, usize)> = FxHashSet::default();
+        let mut previous_full_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
+        for &RawMatch { rule_id, start_idx, end_idx } in
+            self.user_data.raw_matches_scratch.iter().rev()
+        {
+            let rule_id_usize: usize = rule_id as usize;
+            let rule = Arc::clone(&rules_db.rules()[rule_id_usize]);
+            let re = &rules_db.anchored_regexes()[rule_id_usize];
+            let end_idx_usize = end_idx as usize;
+            let _ = start_idx; // Vectorscan block mode does not provide a reliable start offset.
+            if !seen_raw_match_ends.insert((rule_id_usize, end_idx_usize)) {
+                continue;
+            }
+
+            // Re-scan a bounded suffix ending at the raw match and dedupe on the
+            // actual capture spans produced by the anchored regex.
+            let scan_start = end_idx_usize.saturating_sub(RAW_MATCH_LOOKBACK);
+            let before_len = matches.len();
+            filter_match(
+                blob,
+                rule,
+                re,
+                scan_start,
+                end_idx_usize,
+                matches,
+                Some(&mut previous_full_matches),
+                previous_matches,
+                rule_id_usize,
+                seen_matches,
+                origin,
+                None,
+                false,
+                redact,
+                filename,
+                self.profiler.as_ref(),
+                self.respect_ignore_if_contains,
+                &self.inline_ignore_config,
+            );
+            match_rule_indices
+                .extend(std::iter::repeat_n(rule_id_usize, matches.len() - before_len));
+        }
+    }
+
    pub fn scan_blob<'b>(
        &mut self,
        blob: &'b Blob,
@ -289,51 +345,25 @@ impl<'a> Matcher<'a> {
            return Ok(ScanResult::New(Vec::new()));
        }

-        let rules_db = self.rules_db;
        let mut seen_matches = FxHashSet::default();
        let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
        let mut match_rule_indices: Vec<usize> = Vec::new();

        let blob_len = blob.len();
        let mut matches = Vec::new();
-        let mut previous_raw_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
-        for &RawMatch { rule_id, start_idx, end_idx } in
-            self.user_data.raw_matches_scratch.iter().rev()
-        {
-            let rule_id_usize: usize = rule_id as usize;
-            let rule = Arc::clone(&rules_db.rules()[rule_id_usize]);
-            let re = &rules_db.anchored_regexes()[rule_id_usize];
-            let start_idx_usize = start_idx as usize;
-            let end_idx_usize = end_idx as usize;
-            let current_span = OffsetSpan::from_range(start_idx_usize..end_idx_usize);
-            if !record_match(&mut previous_raw_matches, rule_id_usize, current_span) {
-                continue;
-            }
-            let before_len = matches.len();
-            filter_match(
-                blob,
-                rule,
-                re,
-                start_idx_usize,
-                end_idx_usize,
-                &mut matches,
-                &mut previous_matches,
-                rule_id_usize,
-                &mut seen_matches,
-                origin,
-                None,
-                false,
-                redact,
-                &filename,
-                self.profiler.as_ref(),
-                self.respect_ignore_if_contains,
-                &self.inline_ignore_config,
-            );
-            match_rule_indices
-                .extend(std::iter::repeat_n(rule_id_usize, matches.len() - before_len));
-        }
+        self.process_raw_matches(
+            blob,
+            origin,
+            &filename,
+            redact,
+            &mut matches,
+            &mut previous_matches,
+            &mut seen_matches,
+            &mut match_rule_indices,
+        );

        if !no_base64 {
+            let rules_db = self.rules_db;
            // If the blob contains standalone Base64 blobs, decode and scan them as well
            const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep
            let mut b64_stack: Vec<(DecodedData, usize)> =
@ -349,6 +379,7 @@ impl<'a> Matcher<'a> {
                        item.pos_start,
                        item.pos_end,
                        &mut matches,
+                        None,
                        &mut previous_matches,
                        rule_id_usize,
                        &mut seen_matches,
@ -379,7 +410,7 @@ impl<'a> Matcher<'a> {
            }
        }
        maybe_apply_context_verification(
-            rules_db,
+            self.rules_db,
            blob,
            lang_hint,
            blob_len,
@ -896,6 +927,65 @@ mod test {
        Ok(())
    }

+    #[test]
+    fn bogus_raw_starts_do_not_hide_earlier_matches() -> Result<()> { // regression: two raw matches sharing one start offset must both be reported
+        let rule = Rule::new(RuleSyntax {
+            id: "bogus.start".into(),
+            name: "bogus start".into(),
+            pattern: r#"key\s*=\s*"([A-Z]{3})""#.into(), // group 1 is the three-letter value asserted on below
+            confidence: crate::rules::rule::Confidence::Low,
+            min_entropy: 0.0,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None::<Validation>,
+            revocation: None,
+            depends_on_rule: vec![],
+            pattern_requirements: None,
+            tls_mode: None,
+        });
+
+        let rules_db = RulesDatabase::from_rules(vec![rule])?;
+        let seen = BlobIdMap::new();
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone())));
+        let matcher =
+            Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;
+
+        let mut matcher = matcher; // rebind mutably so the scratch buffer can be overwritten
+        matcher.user_data.raw_matches_scratch = vec![ // hand-written raw matches; presumably what process_raw_matches consumes (verify)
+            RawMatch { rule_id: 0, start_idx: 5, end_idx: 9 }, // end 9 = one past the first `key="ABC"`; start 5 is not where that match begins
+            RawMatch { rule_id: 0, start_idx: 5, end_idx: 19 }, // end 19 = blob length; same start offset as the entry above
+        ];
+
+        let blob = Blob::from_bytes(b"key=\"ABC\"\nkey=\"DEF\"".to_vec()); // two 9-byte assignments separated by one newline
+        let origin = OriginSet::from(Origin::from_file(PathBuf::from("bogus-starts.txt")));
+        let mut matches = Vec::new();
+        let mut previous_matches = FxHashMap::default();
+        let mut seen_matches = FxHashSet::default();
+        let mut match_rule_indices = Vec::new();
+
+        matcher.process_raw_matches(
+            &blob,
+            &origin,
+            "bogus-starts.txt",
+            false,
+            &mut matches,
+            &mut previous_matches,
+            &mut seen_matches,
+            &mut match_rule_indices,
+        );
+
+        let secrets = matches
+            .iter()
+            .map(|m| String::from_utf8_lossy(m.matching_input).to_string()) // lossy UTF-8 so the bytes compare against &str literals
+            .collect::<Vec<_>>();
+
+        assert_eq!(secrets, vec!["ABC", "DEF"]); // both values must appear, in this order
+        assert_eq!(match_rule_indices, vec![0, 0]); // one rule index (rule 0) recorded per match
+        Ok(())
+    }
+
    #[test]
    fn inline_comment_skips_match() -> Result<()> {
        let rule = Rule::new(RuleSyntax {
--- a/src/reporter.rs
+++ b/src/reporter.rs
@ -25,6 +25,7 @@ use crate::{
    origin::{Origin, OriginSet},
    rules::rule::Confidence,
    rules::Revocation,
+    template_vars::extract_template_vars,
    validation_body::{self, ValidationResponseBody},
 };
 mod bson_format;
@ -49,45 +50,6 @@ fn escape_for_shell(s: &str) -> String {
    format!("'{}'", s.replace('\'', "'\\''"))
 }

-static TEMPLATE_BLOCK_RE: once_cell::sync::Lazy<regex::Regex> = once_cell::sync::Lazy::new(|| {
-    regex::Regex::new(r"\{\{\s*([^}]*)\}\}").expect("template block regex should compile")
-});
-
-static TEMPLATE_IDENT_RE: once_cell::sync::Lazy<regex::Regex> = once_cell::sync::Lazy::new(|| {
-    regex::Regex::new(r"[A-Za-z_][A-Za-z0-9_]*").expect("template identifier regex should compile")
-});
-
-const TEMPLATE_FILTER_NAMES: &[&str] = &[
-    "append",
-    "b64enc",
-    "base62",
-    "crc32",
-    "crc32_hex",
-    "default",
-    "downcase",
-    "json_escape",
-    "prefix",
-    "replace",
-    "url_encode",
-];
-
-fn extract_template_vars(text: &str) -> BTreeSet<String> {
-    let mut vars = BTreeSet::new();
-
-    for block_cap in TEMPLATE_BLOCK_RE.captures_iter(text) {
-        let inner = block_cap.get(1).map(|m| m.as_str()).unwrap_or_default();
-        for ident_cap in TEMPLATE_IDENT_RE.captures_iter(inner) {
-            let ident = ident_cap.get(0).map(|m| m.as_str()).unwrap_or_default();
-            if TEMPLATE_FILTER_NAMES.iter().any(|f| f.eq_ignore_ascii_case(ident)) {
-                continue;
-            }
-            vars.insert(ident.to_uppercase());
-        }
-    }
-
-    vars
-}
-
 fn required_vars_for_validation(validation: &crate::rules::Validation) -> BTreeSet<String> {
    use crate::rules::Validation;
    let mut vars = BTreeSet::new();
--- a/src/template_vars.rs
+++ b/src/template_vars.rs
@ -0,0 +1,151 @@
+use std::collections::BTreeSet;
+
+static TEMPLATE_BLOCK_RE: once_cell::sync::Lazy<regex::Regex> = once_cell::sync::Lazy::new(|| {
+    regex::Regex::new(r"\{\{\s*([^}]*)\}\}").expect("template block regex should compile")
+}); // matches one `{{ ... }}` block; group 1 is the inner text, which cannot contain `}`
+
+const LIQUID_LITERAL_NAMES: &[&str] = &["blank", "empty", "false", "nil", "null", "true"]; // bare words never reported as variables (compared case-insensitively)
+
+pub(crate) fn extract_template_vars(text: &str) -> BTreeSet<String> { // returns the upper-cased identifiers referenced inside every `{{ ... }}` block of `text`
+    let mut vars = BTreeSet::new();
+
+    for block_cap in TEMPLATE_BLOCK_RE.captures_iter(text) {
+        let inner = block_cap.get(1).map(|m| m.as_str()).unwrap_or_default(); // group 1 = text between the braces
+        for (segment_index, segment) in split_filter_segments(inner).into_iter().enumerate() { // segments are separated by unquoted `|`
+            collect_segment_vars(segment, segment_index != 0, &mut vars); // every segment after the first begins with a filter name, which is skipped
+        }
+    }
+
+    vars
+}
+
+fn split_filter_segments(inner: &str) -> Vec<&str> { // splits `inner` on `|` characters that lie outside quoted strings; always yields at least one slice
+    let mut segments = Vec::new();
+    let mut start = 0; // byte offset where the current segment begins
+    let mut in_single = false;
+    let mut in_double = false;
+    let mut escaped = false; // set when the previous char was a backslash inside quotes
+
+    for (idx, ch) in inner.char_indices() {
+        if escaped { // the char after a backslash is skipped, so an escaped quote cannot close the string
+            escaped = false;
+            continue;
+        }
+
+        match ch {
+            '\\' if in_single || in_double => escaped = true, // backslashes only count as escapes inside quotes
+            '\'' if !in_double => in_single = !in_single, // a `'` inside a double-quoted string is plain text
+            '"' if !in_single => in_double = !in_double, // a `"` inside a single-quoted string is plain text
+            '|' if !in_single && !in_double => { // unquoted pipe ends the current segment
+                segments.push(&inner[start..idx]);
+                start = idx + ch.len_utf8(); // resume just past the `|`
+            }
+            _ => {}
+        }
+    }
+
+    segments.push(&inner[start..]); // trailing segment; the whole input when no unquoted `|` was seen
+    segments
+}
+
+fn collect_segment_vars(segment: &str, skip_first_ident: bool, vars: &mut BTreeSet<String>) { // inserts each unquoted identifier of `segment` into `vars`, upper-cased; optionally drops the first one
+    let mut chars = segment.char_indices().peekable();
+    let mut in_single = false;
+    let mut in_double = false;
+    let mut escaped = false; // set when the previous char was a backslash inside quotes
+    let mut skipped_filter_name = !skip_first_ident; // false only while the leading filter name is still to be dropped
+
+    while let Some((idx, ch)) = chars.next() {
+        if escaped { // char following a backslash is ignored entirely
+            escaped = false;
+            continue;
+        }
+
+        match ch {
+            '\\' if in_single || in_double => { // backslashes only count as escapes inside quotes
+                escaped = true;
+                continue;
+            }
+            '\'' if !in_double => {
+                in_single = !in_single;
+                continue;
+            }
+            '"' if !in_single => {
+                in_double = !in_double;
+                continue;
+            }
+            _ => {}
+        }
+
+        if in_single || in_double || !is_ident_start(ch) { // NOTE(review): a letter right after digits (e.g. `abc` in `9abc`) also starts a token here; confirm intended
+            continue;
+        }
+
+        let mut end = idx + ch.len_utf8(); // exclusive byte end of the identifier so far
+        while let Some(&(next_idx, next_ch)) = chars.peek() { // peek so the char that ends the identifier is left for the outer loop
+            if !is_ident_continue(next_ch) {
+                break;
+            }
+            chars.next();
+            end = next_idx + next_ch.len_utf8();
+        }
+
+        let ident = &segment[idx..end];
+        if !skipped_filter_name { // first identifier of a filter segment is the filter's own name
+            skipped_filter_name = true;
+            continue;
+        }
+
+        if LIQUID_LITERAL_NAMES.iter().any(|name| name.eq_ignore_ascii_case(ident)) { // literal keywords are not variables
+            continue;
+        }
+
+        vars.insert(ident.to_ascii_uppercase()); // names are stored upper-cased, so lookups are effectively case-insensitive
+    }
+}
+
+fn is_ident_start(ch: char) -> bool { // true when `ch` may begin an identifier: an ASCII letter or `_`
+    ch.is_ascii_alphabetic() || ch == '_'
+}
+
+fn is_ident_continue(ch: char) -> bool { // true when `ch` may appear after the first identifier char: ASCII letter, ASCII digit, or `_`
+    ch.is_ascii_alphanumeric() || ch == '_'
+}
+
+#[cfg(test)]
+mod tests { // unit tests for extract_template_vars
+    use super::extract_template_vars;
+    use std::collections::BTreeSet;
+
+    #[test]
+    fn ignores_filter_names_but_keeps_filter_argument_vars() { // `default`, `append`, `b64enc` are dropped; identifiers passed as their arguments are kept
+        let vars = extract_template_vars(
+            "{{ NEXT_PUBLIC_VERCEL_APP_CLIENT_ID | default: VERCEL_APP_CLIENT_ID | append: ':' | append: VERCEL_APP_CLIENT_SECRET | b64enc }}",
+        );
+
+        assert_eq!(
+            vars,
+            BTreeSet::from([
+                "NEXT_PUBLIC_VERCEL_APP_CLIENT_ID".to_string(),
+                "VERCEL_APP_CLIENT_ID".to_string(),
+                "VERCEL_APP_CLIENT_SECRET".to_string(),
+            ])
+        );
+    }
+
+    #[test]
+    fn ignores_literal_strings_and_new_filter_names() { // quoted strings contribute nothing; filter names are skipped by position, not by a name list
+        let vars = extract_template_vars(
+            r#"{{ "" | unix_timestamp_ms }} {{ "" | rfc1123_date }} {{ TOKEN | hmac_sha384_hex: SECRET }} {{ "https://example.com/oauth/callback" | url_encode }}"#,
+        );
+
+        assert_eq!(vars, BTreeSet::from(["SECRET".to_string(), "TOKEN".to_string()]));
+    }
+
+    #[test]
+    fn ignores_liquid_literal_arguments() { // `blank` is listed in LIQUID_LITERAL_NAMES, so it is not reported as a variable
+        let vars = extract_template_vars(r#"{{ TOKEN | default: blank | append: FALLBACK }}"#);
+
+        assert_eq!(vars, BTreeSet::from(["FALLBACK".to_string(), "TOKEN".to_string()]));
+    }
+}