diff --git a/README.md b/README.md
index c89ea8f..d2ddc61 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
-
+
diff --git a/crates/kingfisher-scanner/src/scanner.rs b/crates/kingfisher-scanner/src/scanner.rs
index 55dff65..cca0ecb 100644
--- a/crates/kingfisher-scanner/src/scanner.rs
+++ b/crates/kingfisher-scanner/src/scanner.rs
@@ -14,6 +14,8 @@ use crate::finding::{Finding, FindingLocation};
use crate::primitives;
use crate::scanner_pool::ScannerPool;
+const RAW_MATCH_LOOKBACK: usize = 64 * 1024;
+
/// Configuration options for the scanner.
#[derive(Debug, Clone)]
pub struct ScannerConfig {
@@ -167,9 +169,14 @@ impl Scanner {
// Process matches through regex
let mut findings = Vec::new();
let mut seen_matches: FxHashSet = FxHashSet::default();
- let mut previous_spans: FxHashMap> = FxHashMap::default();
+ let mut seen_raw_match_ends: FxHashSet<(usize, usize)> = FxHashSet::default();
+ let mut previous_full_spans: FxHashMap> = FxHashMap::default();
for (rule_id, start, end) in raw_matches.into_iter().rev() {
+ let _ = start; // Block-mode Vectorscan reports `from` as 0 unless SOM is enabled.
+ if !seen_raw_match_ends.insert((rule_id, end)) {
+ continue;
+ }
let rule = match self.rules_db.get_rule(rule_id) {
Some(r) => r,
None => continue,
@@ -180,16 +187,18 @@ impl Scanner {
Err(_) => continue,
};
- let current_span = OffsetSpan::from_range(start..end);
-
- // Check for overlapping spans
- if !primitives::record_match(&mut previous_spans, rule_id, current_span) {
- continue;
- }
-
- let haystack = &bytes[start..end];
+ let scan_start = end.saturating_sub(RAW_MATCH_LOOKBACK);
+ let haystack = &bytes[scan_start..end];
for captures in anchored_regex.captures_iter(haystack) {
+ let full_capture = captures.get(0).unwrap();
+ let full_capture_span = OffsetSpan::from_range(
+ (scan_start + full_capture.start())..(scan_start + full_capture.end()),
+ );
+ if !primitives::record_match(&mut previous_full_spans, rule_id, full_capture_span) {
+ continue;
+ }
+
// Get the primary secret value
let secret_capture = primitives::find_secret_capture(&anchored_regex, &captures);
let secret_bytes = secret_capture.as_bytes();
@@ -203,20 +212,20 @@ impl Scanner {
}
// Compute match key for dedup
+ let offset_start = scan_start + secret_capture.start();
+ let offset_end = scan_start + secret_capture.end();
let match_key = primitives::compute_match_key(
secret_bytes,
rule.id().as_bytes(),
- start + secret_capture.start(),
- start + secret_capture.end(),
+ offset_start,
+ offset_end,
);
if !seen_matches.insert(match_key) {
continue;
}
// Build the finding
- let offset_span = OffsetSpan::from_range(
- (start + secret_capture.start())..(start + secret_capture.end()),
- );
+ let offset_span = OffsetSpan::from_range(offset_start..offset_end);
let source_span = loc_mapping.get_source_span(&offset_span);
let secret = if self.config.redact_secrets {
diff --git a/docs-site/docs/features/parsing.md b/docs-site/docs/features/parsing.md
index da71121..dc0c04c 100644
--- a/docs-site/docs/features/parsing.md
+++ b/docs-site/docs/features/parsing.md
@@ -1,6 +1,6 @@
---
title: "Source Code Parsing"
-description: "Language-aware secret detection using lightweight lexers for 16 languages including Python, JavaScript, Go, Rust, and more."
+description: "Language-aware secret detection using tree-sitter parsing for 13+ languages including Python, JavaScript, Go, Rust, and more."
---
# Kingfisher Source Code Parsing
@@ -11,7 +11,7 @@ The implementation favors lightweight extractors over full AST parsing:
- **Handwritten lexers** for common programming and config languages — comment-aware stripping followed by regex-based `key = value` extraction
- **`tl`** for HTML — attribute values, element text, and embedded `