forked from mirrors/kingfisher
- Added kingfisher:ignore (or kingfisher:allow) to silence a finding inline within a file
- Added: to reuse existing inline directives from other scanners, pass --compat-ignore-comments to also accept NOSONAR, kics-scan ignore, gitleaks:allow and trufflehog:ignore
This commit is contained in:
parent
dbb97bdcf3
commit
caf766b731
23 changed files with 2924 additions and 5 deletions
|
|
@ -2,6 +2,10 @@
|
|||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
## [Unreleased]
|
||||
- Added inline ignore directives: put `kingfisher:ignore` (or `kingfisher:allow`) in a comment to silence a finding inline within a file
|
||||
- Added `--compat-ignore-comments` to reuse existing inline directives from other scanners; when passed, `NOSONAR`, `kics-scan ignore`, `gitleaks:allow` and `trufflehog:ignore` are also accepted
|
||||
|
||||
## [v1.56.0]
|
||||
- Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined
|
||||
|
||||
|
|
|
|||
18
README.md
18
README.md
|
|
@ -117,6 +117,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
|
|||
- [Notable Scan Options](#notable-scan-options)
|
||||
- [Understanding `--confidence`](#understanding---confidence)
|
||||
- [Ignore known false positives](#ignore-known-false-positives)
|
||||
- [Inline ignore directives](#inline-ignore-directives)
|
||||
- [Finding Fingerprint](#finding-fingerprint)
|
||||
- [Rule Performance Profiling](#rule-performance-profiling)
|
||||
- [CLI Options](#cli-options)
|
||||
|
|
@ -962,6 +963,7 @@ leaves the default unchanged.
|
|||
- `--manage-baseline`: Create or update the baseline file with current findings
|
||||
- `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable)
|
||||
- `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable)
|
||||
- `--compat-ignore-comments`: Honor inline directives from other scanners (treat `gitleaks:allow` and `trufflehog:ignore` like native suppressions)
|
||||
## Understanding `--confidence`
|
||||
|
||||
The `--confidence` flag sets a minimum confidence threshold, not an exact match.
|
||||
|
|
@ -972,7 +974,21 @@ The `--confidence` flag sets a minimum confidence threshold, not an exact match.
|
|||
|
||||
### Ignore known false positives
|
||||
|
||||
Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context.
|
||||
Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context.
|
||||
|
||||
### Inline ignore directives
|
||||
|
||||
Add `kingfisher:ignore` (or `kingfisher:allow`) to a trailing comment on the same line as a finding to silence it. Multi-line strings may also be ignored by placing the directive on the closing delimiter line **or** on the next logical line after the string:
|
||||
|
||||
```python
|
||||
API_KEY = """
|
||||
line 1
|
||||
line 2
|
||||
"""
|
||||
# kingfisher:ignore
|
||||
```
|
||||
|
||||
Kingfisher recognizes comment markers for the host language, including `#`, `//`, `/* */`, `--`, and `*`-prefixed block comments, so you can use this suppression style across languages. To reuse existing inline directives from other scanners, pass `--compat-ignore-comments` to also accept `NOSONAR`, `kics-scan ignore`, `gitleaks:allow` and `trufflehog:ignore`.
|
||||
|
||||
Patterns passed to `--skip-regex` must be Rust-compatible regular expressions; you can test them at [regex101](https://regex101.com).
|
||||
|
||||
|
|
|
|||
1036
README.md.orig
Normal file
1036
README.md.orig
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -118,6 +118,10 @@ pub struct ScanArgs {
|
|||
/// Skipwords to allow-list secret matches (case-insensitive, repeatable)
|
||||
#[arg(long = "skip-word", value_name = "WORD")]
|
||||
pub skip_word: Vec<String>,
|
||||
|
||||
/// Also recognise `gitleaks:allow` and `trufflehog:ignore` inline directives
|
||||
#[arg(long = "compat-ignore-comments", default_value_t = false)]
|
||||
pub compat_ignore_comments: bool,
|
||||
}
|
||||
|
||||
/// Confidence levels for findings
|
||||
|
|
|
|||
342
src/inline_ignore.rs
Normal file
342
src/inline_ignore.rs
Normal file
|
|
@ -0,0 +1,342 @@
|
|||
use crate::location::OffsetSpan;
|
||||
|
||||
/// Configuration for inline ignore directives.
///
/// Stores the directive tokens that suppress a finding when they appear in a
/// comment next to it. The derived `Default` value holds no tokens at all.
#[derive(Debug, Default, Clone)]
pub struct InlineIgnoreConfig {
    /// Directive tokens to search for. They are compared against an
    /// ASCII-lowercased copy of each line, so they must be lowercase.
    tokens: Vec<&'static str>,
}
|
||||
|
||||
impl InlineIgnoreConfig {
|
||||
/// Create a new configuration.
|
||||
///
|
||||
/// * `include_external_syntax` - when true, also recognise the comment
|
||||
/// directives used by other scanners such as Gitleaks and Trufflehog.
|
||||
pub fn new(include_external_syntax: bool) -> Self {
|
||||
let mut tokens = vec!["kingfisher:ignore", "kingfisher:allow"];
|
||||
if include_external_syntax {
|
||||
tokens.extend(["gitleaks:allow", "trufflehog:ignore"]);
|
||||
}
|
||||
Self { tokens }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn has_tokens(&self) -> bool {
|
||||
!self.tokens.is_empty()
|
||||
}
|
||||
|
||||
/// Returns `true` when the provided blob slice contains an inline ignore
|
||||
/// directive that should suppress a finding for the given span.
|
||||
pub fn should_ignore(&self, blob_bytes: &[u8], span: &OffsetSpan) -> bool {
|
||||
if !self.has_tokens() {
|
||||
return false;
|
||||
}
|
||||
|
||||
let (start_line_start, start_line_end) = line_bounds(blob_bytes, span.start);
|
||||
if start_line_end > start_line_start {
|
||||
let start_line = &blob_bytes[start_line_start..start_line_end];
|
||||
if line_has_directive(start_line, &self.tokens) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Scan backwards to allow directives that appear before the start of a
|
||||
// multi-line string or value. This mirrors tools like Gitleaks where
|
||||
// the ignore directive is often placed immediately above the secret.
|
||||
let mut cursor = start_line_start;
|
||||
while cursor > 0 {
|
||||
let previous_index = cursor.saturating_sub(1);
|
||||
let (prev_start, prev_end) = line_bounds(blob_bytes, previous_index);
|
||||
if prev_end <= prev_start {
|
||||
break;
|
||||
}
|
||||
|
||||
let prev_line = &blob_bytes[prev_start..prev_end];
|
||||
if line_has_directive(prev_line, &self.tokens) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if !should_skip_for_directive_search(prev_line) {
|
||||
break;
|
||||
}
|
||||
|
||||
if prev_start == 0 {
|
||||
break;
|
||||
}
|
||||
|
||||
cursor = prev_start;
|
||||
}
|
||||
|
||||
let end_index = if span.end == 0 { 0 } else { span.end - 1 };
|
||||
let (closing_line_start, closing_line_end) =
|
||||
line_bounds(blob_bytes, end_index.min(blob_bytes.len()));
|
||||
if closing_line_end > closing_line_start
|
||||
&& (closing_line_start != start_line_start || closing_line_end != start_line_end)
|
||||
{
|
||||
let closing_line = &blob_bytes[closing_line_start..closing_line_end];
|
||||
if line_has_directive(closing_line, &self.tokens) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Also consider lines after the match so that multi-line strings can be
|
||||
// ignored when the directive appears after the closing delimiter (a
|
||||
// common pattern in languages like Python).
|
||||
let mut cursor = closing_line_end;
|
||||
while cursor < blob_bytes.len() {
|
||||
if blob_bytes[cursor] == b'\n' {
|
||||
cursor += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
let (_, next_end) = line_bounds(blob_bytes, cursor);
|
||||
if next_end <= cursor {
|
||||
break;
|
||||
}
|
||||
|
||||
let next_line = &blob_bytes[cursor..next_end];
|
||||
if line_has_directive(next_line, &self.tokens) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if !should_skip_for_directive_search(next_line) {
|
||||
break;
|
||||
}
|
||||
|
||||
cursor = next_end;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn should_skip_for_directive_search(line: &[u8]) -> bool {
|
||||
let trimmed = trim_ascii_whitespace(line);
|
||||
if trimmed.is_empty() {
|
||||
return true;
|
||||
}
|
||||
|
||||
if trimmed.iter().all(|&b| b == trimmed[0]) && matches!(trimmed[0], b'"' | b'\'' | b'`') {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ends_with_multiline_delimiter(trimmed) {
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Reports whether `trimmed` ends with three or more copies of the same quote
/// character (`"`, `'` or `` ` ``), e.g. the `"""` that opens or closes a
/// multi-line string.
fn ends_with_multiline_delimiter(trimmed: &[u8]) -> bool {
    match trimmed.last() {
        Some(&quote) if matches!(quote, b'"' | b'\'' | b'`') => {
            // Length of the trailing run of that quote character; inputs
            // shorter than three bytes can never reach the threshold.
            let run = trimmed.iter().rev().take_while(|&&byte| byte == quote).count();
            run >= 3
        }
        _ => false,
    }
}
|
||||
|
||||
/// Returns the sub-slice of `line` with leading and trailing ASCII whitespace
/// removed. A line made only of whitespace yields an empty slice.
fn trim_ascii_whitespace(line: &[u8]) -> &[u8] {
    match line.iter().position(|byte| !byte.is_ascii_whitespace()) {
        // Nothing but whitespace (or nothing at all).
        None => &line[line.len()..],
        Some(first) => {
            // A non-whitespace byte exists, so the reverse search finds one too.
            let last = line
                .iter()
                .rposition(|byte| !byte.is_ascii_whitespace())
                .unwrap_or(first);
            &line[first..=last]
        }
    }
}
|
||||
|
||||
/// Returns the `(start, end)` byte offsets of the line containing `index`.
///
/// `start` is the offset just after the preceding `\n` (or 0) and `end` is the
/// offset of the next `\n` at or after `index` (or `bytes.len()`), so the
/// range never includes the line terminator. `index` is clamped to
/// `bytes.len()`.
fn line_bounds(bytes: &[u8], index: usize) -> (usize, usize) {
    let pivot = index.min(bytes.len());
    let (before, from_pivot) = bytes.split_at(pivot);

    let start = before
        .iter()
        .rposition(|&byte| byte == b'\n')
        .map_or(0, |newline| newline + 1);
    let end = from_pivot
        .iter()
        .position(|&byte| byte == b'\n')
        .map_or(bytes.len(), |offset| pivot + offset);

    (start, end)
}
|
||||
|
||||
fn line_has_directive(line: &[u8], tokens: &[&'static str]) -> bool {
|
||||
if line.is_empty() {
|
||||
return false;
|
||||
}
|
||||
|
||||
let mut lowercase = line.to_vec();
|
||||
lowercase.iter_mut().for_each(|b| *b = b.to_ascii_lowercase());
|
||||
|
||||
for token in tokens {
|
||||
let needle = token.as_bytes();
|
||||
let mut offset = 0;
|
||||
while offset < lowercase.len() {
|
||||
if let Some(pos) = memchr::memmem::find(&lowercase[offset..], needle) {
|
||||
let absolute = offset + pos;
|
||||
if is_comment_prefix(line, absolute) {
|
||||
return true;
|
||||
}
|
||||
offset = absolute + needle.len();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Reports whether the text immediately before `token_index` in `line` is a
/// comment marker, ignoring whitespace between the marker and the token.
///
/// Accepted markers are `#`, `//`, `--`, `/*` and `*/` when they sit at the
/// start of the line or follow whitespace, plus a lone `*` preceded only by
/// whitespace (a block-comment continuation line). A token at offset 0, or an
/// offset past the end of the line, is never accepted.
fn is_comment_prefix(line: &[u8], token_index: usize) -> bool {
    if token_index == 0 || token_index > line.len() {
        return false;
    }

    // Drop the whitespace that separates the marker from the token.
    let marker_end = match line[..token_index]
        .iter()
        .rposition(|byte| !byte.is_ascii_whitespace())
    {
        Some(last_non_space) => last_non_space + 1,
        None => return false,
    };
    let marker = &line[..marker_end];

    // True when nothing, or whitespace, precedes the marker.
    let starts_cleanly =
        |prefix: &[u8]| prefix.last().map_or(true, |byte| byte.is_ascii_whitespace());

    match marker {
        [rest @ .., b'#']
        | [rest @ .., b'/', b'/']
        | [rest @ .., b'*', b'/']
        | [rest @ .., b'-', b'-']
        | [rest @ .., b'/', b'*'] => starts_cleanly(rest),
        // Leading `*` of a block-comment continuation line.
        [rest @ .., b'*'] => rest.iter().all(|byte| byte.is_ascii_whitespace()),
        _ => false,
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::{
        is_comment_prefix, line_bounds, line_has_directive, should_skip_for_directive_search,
        trim_ascii_whitespace, InlineIgnoreConfig,
    };
    use crate::location::OffsetSpan;

    /// Byte offset of the `kingfisher:ignore` token inside `line`.
    ///
    /// Deriving the offset from the literal keeps the assertions correct even
    /// when the surrounding whitespace in a test line is edited.
    fn token_offset(line: &[u8]) -> usize {
        const TOKEN: &[u8] = b"kingfisher:ignore";
        line.windows(TOKEN.len())
            .position(|window| window == TOKEN)
            .expect("test line contains the directive token")
    }

    /// Span covering the first occurrence of `matched` inside `blob`.
    fn span_of(blob: &[u8], matched: &[u8]) -> OffsetSpan {
        let start = blob
            .windows(matched.len())
            .position(|window| window == matched)
            .expect("match bytes present");
        OffsetSpan::from_range(start..start + matched.len())
    }

    #[test]
    fn detects_comment_prefixes() {
        let accepts = |line: &[u8]| is_comment_prefix(line, token_offset(line));

        assert!(accepts(b"// kingfisher:ignore"));
        assert!(accepts(b" # kingfisher:ignore"));
        assert!(accepts(b"value /* kingfisher:ignore */"));
        assert!(accepts(b"value -- kingfisher:ignore"));
        assert!(accepts(b" * kingfisher:ignore"));
        // `//` glued to a URL scheme is not a comment marker.
        assert!(!accepts(b"http://kingfisher:ignore"));
    }

    #[test]
    fn bounds_cover_expected_ranges() {
        let data = b"one\ntwo\nthree";
        assert_eq!(line_bounds(data, 0), (0, 3));
        assert_eq!(line_bounds(data, 4), (4, 7));
        assert_eq!(line_bounds(data, data.len()), (8, 13));
    }

    #[test]
    fn detects_directives_in_lines() {
        let tokens = ["kingfisher:ignore", "kingfisher:allow"];
        assert!(line_has_directive(b"secret # kingfisher:ignore", &tokens));
        assert!(!line_has_directive(b"secret", &tokens));
    }

    #[test]
    fn respects_multiline_block_comment_prefix() {
        let tokens = ["kingfisher:ignore"];
        assert!(line_has_directive(b" * kingfisher:ignore", &tokens));
    }

    #[test]
    fn ignores_multi_line_string_with_trailing_comment() {
        let blob = b"let secret = \"\"\"\nline1\nline2\n\"\"\"\n# kingfisher:ignore\n";
        let span = span_of(blob, b"line1\nline2\n");
        let config = InlineIgnoreConfig::new(false);
        assert!(config.should_ignore(blob, &span));
    }

    #[test]
    fn ignores_multiline_without_trailing_newline() {
        let blob = b"let secret = \"\"\"\nline1\nline2\n\"\"\"\n# kingfisher:ignore\n";
        let span = span_of(blob, b"line1\nline2");
        let config = InlineIgnoreConfig::new(false);
        assert!(config.should_ignore(blob, &span));
    }

    #[test]
    fn ignores_multiline_with_directive_before_secret() {
        let blob = b"// kingfisher:ignore\nlet secret = \"\"\"\nline1\nline2\n\"\"\"\n";
        let span = span_of(blob, b"line1\nline2\n");
        let config = InlineIgnoreConfig::new(false);
        assert!(config.should_ignore(blob, &span));
    }

    #[test]
    fn trim_ascii_whitespace_returns_inner_slice() {
        assert_eq!(trim_ascii_whitespace(b" abc "), b"abc");
        assert!(trim_ascii_whitespace(b" ").is_empty());
    }

    #[test]
    fn skips_lines_with_only_delimiters() {
        assert!(should_skip_for_directive_search(b"\"\"\""));
        assert!(should_skip_for_directive_search(b" \"\"\" "));
        assert!(should_skip_for_directive_search(b"let secret = \"\"\""));
        assert!(!should_skip_for_directive_search(b"value"));
    }
}
|
||||
|
|
@ -21,6 +21,7 @@ pub mod git_url;
|
|||
pub mod gitea;
|
||||
pub mod github;
|
||||
pub mod gitlab;
|
||||
pub mod inline_ignore;
|
||||
pub mod jira;
|
||||
pub mod liquid_filters;
|
||||
pub mod location;
|
||||
|
|
|
|||
|
|
@ -416,6 +416,7 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
no_extract_archives: true,
|
||||
|
|
|
|||
138
src/matcher.rs
138
src/matcher.rs
|
|
@ -23,6 +23,7 @@ use xxhash_rust::xxh3::xxh3_64;
|
|||
use crate::{
|
||||
blob::{Blob, BlobId, BlobIdMap},
|
||||
entropy::calculate_shannon_entropy,
|
||||
inline_ignore::InlineIgnoreConfig,
|
||||
location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
|
||||
origin::OriginSet,
|
||||
parser,
|
||||
|
|
@ -199,6 +200,9 @@ pub struct Matcher<'a> {
|
|||
|
||||
/// Rule profiler for measuring performance of individual rules
|
||||
profiler: Option<Arc<ConcurrentRuleProfiler>>,
|
||||
|
||||
/// Configuration that controls inline ignore directives
|
||||
inline_ignore_config: InlineIgnoreConfig,
|
||||
}
|
||||
/// This `Drop` implementation updates the `global_stats` with the local stats
|
||||
impl<'a> Drop for Matcher<'a> {
|
||||
|
|
@ -226,6 +230,7 @@ impl<'a> Matcher<'a> {
|
|||
global_stats: Option<&'a Mutex<MatcherStats>>,
|
||||
enable_profiling: bool,
|
||||
shared_profiler: Option<Arc<ConcurrentRuleProfiler>>,
|
||||
include_external_ignore_syntax: bool,
|
||||
) -> Result<Self> {
|
||||
// Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec
|
||||
let raw_matches_scratch = Vec::new();
|
||||
|
|
@ -247,6 +252,7 @@ impl<'a> Matcher<'a> {
|
|||
seen_blobs,
|
||||
user_data,
|
||||
profiler,
|
||||
inline_ignore_config: InlineIgnoreConfig::new(include_external_ignore_syntax),
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -403,6 +409,7 @@ impl<'a> Matcher<'a> {
|
|||
redact,
|
||||
&filename,
|
||||
self.profiler.as_ref(),
|
||||
&self.inline_ignore_config,
|
||||
);
|
||||
}
|
||||
// If tree-sitter produced base64-decoded matches, try them against all rules
|
||||
|
|
@ -427,6 +434,7 @@ impl<'a> Matcher<'a> {
|
|||
redact,
|
||||
&filename,
|
||||
self.profiler.as_ref(),
|
||||
&self.inline_ignore_config,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -457,6 +465,7 @@ impl<'a> Matcher<'a> {
|
|||
redact,
|
||||
&filename,
|
||||
self.profiler.as_ref(),
|
||||
&self.inline_ignore_config,
|
||||
);
|
||||
}
|
||||
if depth + 1 < MAX_B64_DEPTH {
|
||||
|
|
@ -560,6 +569,7 @@ fn filter_match<'b>(
|
|||
redact: bool,
|
||||
filename: &str,
|
||||
profiler: Option<&Arc<ConcurrentRuleProfiler>>,
|
||||
inline_ignore_config: &InlineIgnoreConfig,
|
||||
) {
|
||||
let mut timer =
|
||||
profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));
|
||||
|
|
@ -590,6 +600,10 @@ fn filter_match<'b>(
|
|||
let matching_input_offset_span = OffsetSpan::from_range(
|
||||
(start + matching_input.start())..(start + matching_input.end()),
|
||||
);
|
||||
if inline_ignore_config.should_ignore(blob_bytes, &matching_input_offset_span) {
|
||||
debug!("Skipping match due to inline ignore directive");
|
||||
continue;
|
||||
}
|
||||
let match_key = compute_match_key(
|
||||
matching_input.as_bytes(),
|
||||
rule.id().as_bytes(),
|
||||
|
|
@ -961,7 +975,7 @@ pub fn compute_finding_fingerprint(
|
|||
// -------------------------------------------------------------------------------------------------
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::collections::BTreeMap;
|
||||
use std::{collections::BTreeMap, path::PathBuf};
|
||||
|
||||
use pretty_assertions::assert_eq;
|
||||
// ---------------------------------------------------------------------
|
||||
|
|
@ -970,7 +984,11 @@ mod test {
|
|||
use proptest::prelude::*;
|
||||
|
||||
use super::*;
|
||||
use crate::rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation};
|
||||
use crate::{
|
||||
blob::{Blob, BlobIdMap},
|
||||
origin::{Origin, OriginSet},
|
||||
rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation},
|
||||
};
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
|
|
@ -1009,7 +1027,7 @@ mod test {
|
|||
let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap();
|
||||
let seen = BlobIdMap::new();
|
||||
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
|
||||
let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None).unwrap();
|
||||
let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false).unwrap();
|
||||
|
||||
// ── run the scan ──────────────────────────────────────────────
|
||||
m.scan_bytes_raw(&noise, "buf").unwrap();
|
||||
|
|
@ -1080,6 +1098,7 @@ mod test {
|
|||
None,
|
||||
enable_rule_profiling,
|
||||
None, // Pass the shared profiler
|
||||
false,
|
||||
)?;
|
||||
matcher.scan_bytes_raw(input.as_bytes(), "fname")?;
|
||||
assert_eq!(
|
||||
|
|
@ -1167,7 +1186,7 @@ mod test {
|
|||
let rules_db = RulesDatabase::from_rules(vec![rule])?;
|
||||
let seen = BlobIdMap::new();
|
||||
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
|
||||
let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None)?;
|
||||
let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
|
||||
|
||||
let buf = b"dup dup"; // two literal hits, same rule
|
||||
|
||||
|
|
@ -1184,4 +1203,115 @@ mod test {
|
|||
assert_eq!(second_len, 2);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn inline_comment_skips_match() -> Result<()> {
|
||||
let rule = Rule::new(RuleSyntax {
|
||||
id: "inline.ignore".into(),
|
||||
name: "inline".into(),
|
||||
pattern: "secret_token".into(),
|
||||
confidence: crate::rules::rule::Confidence::Low,
|
||||
min_entropy: 0.0,
|
||||
visible: true,
|
||||
examples: vec![],
|
||||
negative_examples: vec![],
|
||||
references: vec![],
|
||||
validation: None::<Validation>,
|
||||
depends_on_rule: vec![],
|
||||
});
|
||||
let rules_db = RulesDatabase::from_rules(vec![rule])?;
|
||||
let seen = BlobIdMap::new();
|
||||
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
|
||||
let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
|
||||
|
||||
let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec());
|
||||
let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt")));
|
||||
|
||||
match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
|
||||
ScanResult::New(matches) => assert!(matches.is_empty()),
|
||||
_ => panic!("unexpected scan result"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn inline_comment_after_multiline_secret_skips_match() -> Result<()> {
|
||||
let rule = Rule::new(RuleSyntax {
|
||||
id: "inline.multiline".into(),
|
||||
name: "inline multiline".into(),
|
||||
pattern: "line1\\s+line2".into(),
|
||||
confidence: crate::rules::rule::Confidence::Low,
|
||||
min_entropy: 0.0,
|
||||
visible: true,
|
||||
examples: vec![],
|
||||
negative_examples: vec![],
|
||||
references: vec![],
|
||||
validation: None::<Validation>,
|
||||
depends_on_rule: vec![],
|
||||
});
|
||||
let rules_db = RulesDatabase::from_rules(vec![rule])?;
|
||||
let seen = BlobIdMap::new();
|
||||
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
|
||||
let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
|
||||
|
||||
let blob = Blob::from_bytes(
|
||||
br#"let data = """
|
||||
line1
|
||||
line2
|
||||
"""
|
||||
# kingfisher:ignore
|
||||
"#
|
||||
.to_vec(),
|
||||
);
|
||||
let origin = OriginSet::from(Origin::from_file(PathBuf::from("multiline.txt")));
|
||||
|
||||
match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
|
||||
ScanResult::New(matches) => assert!(matches.is_empty()),
|
||||
_ => panic!("unexpected scan result"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compat_flag_controls_external_directives() -> Result<()> {
|
||||
let rule = Rule::new(RuleSyntax {
|
||||
id: "inline.compat".into(),
|
||||
name: "inline compat".into(),
|
||||
pattern: "supersecret123".into(),
|
||||
confidence: crate::rules::rule::Confidence::Low,
|
||||
min_entropy: 0.0,
|
||||
visible: true,
|
||||
examples: vec![],
|
||||
negative_examples: vec![],
|
||||
references: vec![],
|
||||
validation: None::<Validation>,
|
||||
depends_on_rule: vec![],
|
||||
});
|
||||
let rules_db = RulesDatabase::from_rules(vec![rule])?;
|
||||
|
||||
let blob = Blob::from_bytes(b"token = \"supersecret123\" # gitleaks:allow".to_vec());
|
||||
let origin = OriginSet::from(Origin::from_file(PathBuf::from("compat.txt")));
|
||||
|
||||
let seen = BlobIdMap::new();
|
||||
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
|
||||
let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, false)?;
|
||||
let matches_without_compat =
|
||||
match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
|
||||
ScanResult::New(matches) => matches.len(),
|
||||
_ => panic!("unexpected scan result"),
|
||||
};
|
||||
assert_eq!(matches_without_compat, 1, "directive should be ignored without compat flag");
|
||||
|
||||
let seen = BlobIdMap::new();
|
||||
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
|
||||
let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, true)?;
|
||||
match matcher.scan_blob(&blob, &origin, None, false, false, false)? {
|
||||
ScanResult::New(matches) => assert!(matches.is_empty()),
|
||||
_ => panic!("unexpected scan result"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
|
|||
1317
src/matcher.rs.orig
Normal file
1317
src/matcher.rs.orig
Normal file
File diff suppressed because it is too large
Load diff
52
src/matcher.rs.rej
Normal file
52
src/matcher.rs.rej
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
@@ -1,50 +1,51 @@
|
||||
use std::{
|
||||
hash::{Hash, Hasher},
|
||||
str,
|
||||
sync::{Arc, Mutex},
|
||||
};
|
||||
|
||||
use anyhow::Result;
|
||||
use base64::{engine::general_purpose, Engine};
|
||||
use bstr::BString;
|
||||
use http::StatusCode;
|
||||
use regex::bytes::Regex;
|
||||
use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
|
||||
use schemars::{
|
||||
gen::SchemaGenerator,
|
||||
schema::{ArrayValidation, InstanceType, Schema},
|
||||
JsonSchema,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use smallvec::SmallVec;
|
||||
use tracing::debug;
|
||||
use xxhash_rust::xxh3::xxh3_64;
|
||||
|
||||
use crate::{
|
||||
blob::{Blob, BlobId, BlobIdMap},
|
||||
entropy::calculate_shannon_entropy,
|
||||
+ inline_ignore::InlineIgnoreConfig,
|
||||
location::{Location, LocationMapping, OffsetSpan, SourcePoint, SourceSpan},
|
||||
origin::OriginSet,
|
||||
parser,
|
||||
parser::{Checker, Language},
|
||||
rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
|
||||
rules::rule::Rule,
|
||||
rules_database::RulesDatabase,
|
||||
safe_list::{is_safe_match, is_user_match},
|
||||
scanner_pool::ScannerPool,
|
||||
snippet::Base64BString,
|
||||
util::{intern, redact_value},
|
||||
};
|
||||
|
||||
const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
|
||||
const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
|
||||
const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
|
||||
const TREE_SITTER_SCAN_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs ≤64 KiB
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// RawMatch
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
/// A raw match, as recorded by a callback to Vectorscan.
|
||||
///
|
||||
/// When matching with Vectorscan, we simply collect all matches into a
|
||||
/// preallocated `Vec`, and then go through them all after scanning is complete.
|
||||
|
|
@ -842,6 +842,7 @@ mod tests {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 256.0,
|
||||
exclude: Vec::new(),
|
||||
|
|
|
|||
|
|
@ -143,6 +143,7 @@ mod tests {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
no_extract_archives: false,
|
||||
|
|
|
|||
|
|
@ -144,6 +144,7 @@ pub fn enumerate_filesystem_inputs(
|
|||
Some(&matcher_stats),
|
||||
enable_profiling,
|
||||
Some(shared_profiler),
|
||||
args.compat_ignore_comments,
|
||||
)?;
|
||||
let blob_processor_init_time = Mutex::new(t1.elapsed());
|
||||
let make_blob_processor = || -> BlobProcessor {
|
||||
|
|
|
|||
|
|
@ -622,6 +622,7 @@ pub async fn fetch_s3_objects(
|
|||
Some(matcher_stats),
|
||||
enable_profiling,
|
||||
Some(shared_profiler.clone()),
|
||||
args.compat_ignore_comments,
|
||||
)?;
|
||||
let mut processor = BlobProcessor { matcher };
|
||||
|
||||
|
|
|
|||
|
|
@ -112,6 +112,7 @@ fn run_skiplist(skip_regex: Vec<String>, skip_skipword: Vec<String>) -> Result<u
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 5.0,
|
||||
exclude: Vec::new(),
|
||||
|
|
|
|||
|
|
@ -111,6 +111,7 @@ fn test_bitbucket_remote_scan() -> Result<()> {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
no_extract_archives: false,
|
||||
|
|
|
|||
|
|
@ -131,6 +131,7 @@ rules:
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 5.0,
|
||||
extraction_depth: 1,
|
||||
|
|
|
|||
|
|
@ -118,6 +118,7 @@ fn test_github_remote_scan() -> Result<()> {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
no_extract_archives: false,
|
||||
|
|
|
|||
|
|
@ -116,6 +116,7 @@ fn test_gitlab_remote_scan() -> Result<()> {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
no_extract_archives: false,
|
||||
|
|
@ -251,6 +252,7 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
no_extract_archives: false,
|
||||
|
|
|
|||
|
|
@ -95,6 +95,7 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
extraction_depth: 2,
|
||||
|
|
|
|||
|
|
@ -102,6 +102,7 @@ impl TestContext {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
extraction_depth: 2,
|
||||
|
|
@ -225,6 +226,7 @@ async fn test_scan_slack_messages() -> Result<()> {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
extraction_depth: 2,
|
||||
|
|
|
|||
|
|
@ -174,6 +174,7 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
extraction_depth: 2,
|
||||
|
|
|
|||
|
|
@ -117,6 +117,7 @@ impl TestContext {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
extraction_depth: 2,
|
||||
|
|
@ -227,6 +228,7 @@ impl TestContext {
|
|||
since_commit: None,
|
||||
branch: None,
|
||||
},
|
||||
compat_ignore_comments: false,
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
extraction_depth: 2,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue