From dc02abac6368329e12792ea410f5527c7ea328a2 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Wed, 5 Nov 2025 17:19:11 -0800 Subject: [PATCH] Added an optional exclude_words list to PatternRequirements so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. --- CHANGELOG.md | 2 +- README.md | 17 ++++ data/rules/aiven.yml | 2 +- data/rules/bitbucket.yml | 2 +- data/rules/confluent.yml | 4 +- data/rules/coze.yml | 7 +- data/rules/easypost.yml | 2 +- data/rules/generic.yml | 2 +- data/rules/intercom.yml | 2 +- data/rules/mongodb.yml | 2 +- data/rules/odbc.yml | 4 + data/rules/postgres.yml | 4 + data/rules/recaptcha.yml | 2 +- data/rules/sentry.yml | 2 +- data/rules/square.yml | 2 +- data/rules/twitch.yml | 3 +- docs/RULES.md | 16 +-- src/cli/commands/scan.rs | 4 + src/main.rs | 1 + src/matcher.rs | 128 ++++++++++++++++++++---- src/reporter.rs | 1 + src/reporter/json_format.rs | 1 + src/rules/rule.rs | 180 ++++++++++++++++++++++------------ src/scanner/enumerate.rs | 1 + src/scanner/repos.rs | 2 + tests/int_allowlist.rs | 1 + tests/int_bitbucket.rs | 1 + tests/int_dedup.rs | 1 + tests/int_github.rs | 1 + tests/int_gitlab.rs | 2 + tests/int_redact.rs | 1 + tests/int_slack.rs | 2 + tests/int_validation_cache.rs | 1 + tests/int_vulnerable_files.rs | 2 + tests/smoke_baseline.rs | 2 +- 35 files changed, 299 insertions(+), 108 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4337324..ecf43b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. ## [Unreleased] - Added `pattern_requirements` for rules. Enables post-regex character-class checks (digits, uppercase, lowercase, specials) to reduce false positives without lookarounds. Provides lightweight, in-memory validation after matches, keeping patterns fast and readable. -- Added an optional `exclude_words` list to `PatternRequirements` so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. +- Added an optional `ignore_if_contains` list to `PatternRequirements` within the Rules structure, so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. - Updated many rules with `pattern_requirements` - Automatically set `--no-dedup` whenever `--manage-baseline` is supplied so baseline management retains every occurrence of a finding diff --git a/README.md b/README.md index 5924c55..656e73c 100644 --- a/README.md +++ b/README.md @@ -323,6 +323,21 @@ However, you may want to add your own custom rules, or modify a detection to bet First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules. +### Pattern requirements and placeholder filtering + +Every rule can declare optional `pattern_requirements` to enforce additional character checks after a regex matches. Each field +is independent: + +- `min_digits`, `min_uppercase`, `min_lowercase`, and `min_special_chars` enforce complexity thresholds. +- `special_chars` lets you override the set of characters counted as "special" when `min_special_chars` is used. +- `ignore_if_contains` lists case-insensitive substrings that should cause a match to be discarded (for example, to drop + `test`, `demo`, or `localhost` values). Kingfisher still accepts the legacy `exclude_words` key as an alias when loading + existing rule files. + +When a match is skipped because of `ignore_if_contains`, Kingfisher logs the event at the `DEBUG` level alongside the rule that +was evaluated. If you need to keep those matches for a particular scan, pass `--no-ignore-if-contains` to `kingfisher scan` to +disable the substring filter without editing any rule files. + Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! # 🎉 Usage @@ -1168,6 +1183,8 @@ leaves the default unchanged. - `--skip-aws-account-file `: Load AWS account numbers to skip from a file (one account per line; `#` comments allowed) - `--ignore-comment `: Honor additional inline directives from other scanners (repeatable; e.g. `--ignore-comment "gitleaks:allow"`) - `--no-ignore`: Disable inline directives entirely so every match is reported +- `--no-ignore-if-contains`: Ignore the `ignore_if_contains` filter in rules so placeholder words still produce findings + ## Understanding `--confidence` The `--confidence` flag sets a minimum confidence threshold, not an exact match. diff --git a/data/rules/aiven.yml b/data/rules/aiven.yml index de6fdae..0d1ac80 100644 --- a/data/rules/aiven.yml +++ b/data/rules/aiven.yml @@ -9,7 +9,7 @@ rules: ( [a-z0-9/+=]{372} ) - \b + (?:[^A-Za-z0-9/+=]|$) pattern_requirements: min_digits: 2 min_uppercase: 1 diff --git a/data/rules/bitbucket.yml b/data/rules/bitbucket.yml index 61370fa..37912de 100644 --- a/data/rules/bitbucket.yml +++ b/data/rules/bitbucket.yml @@ -51,7 +51,7 @@ rules: confidence: medium examples: - bitbucket_key=HedmnK9h6KD_eh9KK8FlI9ahUc8WfaNZ4gulbrtN2ouV - - bitbucket_secret=kd8j2h4jf9s8mf6l4k9j2h4jf9s8mf6l4k9j2h4jf9s8mf6l + - bitbucket_secret=kd8j2h4jf9s8mf6l4k9j2h4jf9s8mf6l4k9j2h4jf9s8 validation: type: Http content: diff --git a/data/rules/confluent.yml b/data/rules/confluent.yml index 96fefe2..86e7b13 100644 --- a/data/rules/confluent.yml +++ b/data/rules/confluent.yml @@ -32,12 +32,12 @@ rules: ( [A-Z0-9\+/]{64} ) - \b + (?:[^A-Za-z0-9/+=]|$) min_entropy: 3.3 confidence: medium examples: - confluent secret=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890ab - - kafka_token=ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyzABCD + - kafka_token=ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyzAB references: - https://docs.confluent.io/cloud/current/api.html#tag/API-Keys-(iamv2)/operation/getIamV2ApiKey validation: diff --git a/data/rules/coze.yml b/data/rules/coze.yml index 982578b..c9783f3 100644 --- a/data/rules/coze.yml +++ b/data/rules/coze.yml @@ -3,7 +3,6 @@ rules: id: kingfisher.coze.1 pattern: | (?xi) - \b coze (?:.|[\n\r]){0,32}? \b @@ -37,6 +36,6 @@ rules: - https://www.coze.com/docs/developer_guides/coze_api_overview - https://www.coze.com/docs/developer_guides/retrieve_files examples: - - "pat_DlOG7fNcVfmw8cYhPWNcdfwrjjzwDr9EkV8EBjzHdgRWU2DzqHC1pPe0x590NN5f" - - "pat_93QiTdIvZGuRCFcfGTQJJ1VIYZ9dNHanX88wKoMojwMk3tX5tKqfFtxUp0ux8CjI" - - "pat_WvUTLYq5yZyaqegkyLSxXJMjXAJotjYEuC1sqT8daFlfwM3BiaRVJIZsER42DnhV" + - "key_coze = pat_DlOG7fNcVfmw8cYhPWNcdfwrjjzwDr9EkV8EBjzHdgRWU2DzqHC1pPe0x590NN5f" + - "coze_token = pat_93QiTdIvZGuRCFcfGTQJJ1VIYZ9dNHanX88wKoMojwMk3tX5tKqfFtxUp0ux8CjI" + - "coze-key: pat_WvUTLYq5yZyaqegkyLSxXJMjXAJotjYEuC1sqT8daFlfwM3BiaRVJIZsER42DnhV" diff --git a/data/rules/easypost.yml b/data/rules/easypost.yml index d5f1a42..0c85fb6 100644 --- a/data/rules/easypost.yml +++ b/data/rules/easypost.yml @@ -6,7 +6,7 @@ rules: \b ( EZ[AT]K - [A-Z0-9]{54} + [A-Za-z0-9]{54} ) \b pattern_requirements: diff --git a/data/rules/generic.yml b/data/rules/generic.yml index 38308f4..75d8fc5 100644 --- a/data/rules/generic.yml +++ b/data/rules/generic.yml @@ -10,7 +10,7 @@ rules: ) \b pattern_requirements: - min_digits: 4 + min_digits: 2 min_entropy: 3.3 confidence: low examples: diff --git a/data/rules/intercom.yml b/data/rules/intercom.yml index d6f159e..0b408ae 100644 --- a/data/rules/intercom.yml +++ b/data/rules/intercom.yml @@ -18,7 +18,7 @@ rules: examples: - "intercom_access_token: dG9rOvI0NmJlMTA5XzQwM2NfNDVlM184MjQzXzkwMDnmOTE1NGIyONoxOjA=" - - ic_token = "g1ZsclJXTjNfc1pBSzJDemE0eFVDU0U5c25CeDN4Vm9hQ2Zac0hXemZHNGVDPQ==" + - ic_token = "g1ZsclJXTjNfc1pBSzJDemE0eFVDU0U5c25CeDN4Vm9hQ2Zac0hXemZHNPQ==" references: - https://developers.intercom.com/docs/build-an-integration/learn-more/rest-apis diff --git a/data/rules/mongodb.yml b/data/rules/mongodb.yml index f6696c3..e70adec 100644 --- a/data/rules/mongodb.yml +++ b/data/rules/mongodb.yml @@ -82,7 +82,7 @@ rules: ) \b pattern_requirements: - exclude_words: + ignore_if_contains: - "@localhost" - "@127.0.0.1" min_entropy: 3 diff --git a/data/rules/odbc.yml b/data/rules/odbc.yml index 6593f92..6f60c6a 100644 --- a/data/rules/odbc.yml +++ b/data/rules/odbc.yml @@ -6,6 +6,10 @@ rules: (?: User | User\ Id | UserId | Uid) \s*=\s* ([^\s;]{3,100}) \s* ; [\ \t]* .{0,10} [\ \t]* (?: Password | Pwd) \s*=\s* ([^\t\ ;]{3,100}) \s* (?: [;] | $) + pattern_requirements: + ignore_if_contains: + - "localhost" + - "127.0.0.1" min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/postgres.yml b/data/rules/postgres.yml index 5cbb4ba..27b54ea 100644 --- a/data/rules/postgres.yml +++ b/data/rules/postgres.yml @@ -24,6 +24,10 @@ rules: (?: \d+ ) + pattern_requirements: + ignore_if_contains: + - "@localhost" + - "@127.0.0.1" min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/recaptcha.yml b/data/rules/recaptcha.yml index f13db51..d513a8d 100644 --- a/data/rules/recaptcha.yml +++ b/data/rules/recaptcha.yml @@ -9,7 +9,7 @@ rules: ( 6l[c-f][a-z0-9_-].{36} ) - \b + (?:[^A-Za-z0-9/]|$) pattern_requirements: min_digits: 3 min_entropy: 3 diff --git a/data/rules/sentry.yml b/data/rules/sentry.yml index 8d424ba..6e0d15e 100644 --- a/data/rules/sentry.yml +++ b/data/rules/sentry.yml @@ -50,7 +50,7 @@ rules: confidence: medium examples: - sntrys_eyJpYXQiOjE2OTA4ODAwMDAsInJlZ2lvbl91cmwiOiJodHRwczovL3NlbnRyeS5pby9vcmdzL215LW9yZy8ifQ==_abcdefghijklmnopqrstuvwx1234567890abcdefabc - - sntrys_eyJpYXQiOiIxNjkwODgwMDAwIiwicmVnaW9uX3VybCI6Imh0dHBzOi8vc2VudHJ5LmlvLyJ9_abcdABCD1234567890abcdABCD1234567890abcdABCD + - sntrys_eyJpYXQiOiIxNjkwODgwMDAwIiwicmVnaW9uX3VybCI6Imh0dHBzOi8vc2VudHJ5LmlvLyJ9_abcdABCD1234567890abcdABCD1234567890abcdABC references: - https://docs.sentry.io/api/auth/ validation: diff --git a/data/rules/square.yml b/data/rules/square.yml index c4aa62f..24a635a 100644 --- a/data/rules/square.yml +++ b/data/rules/square.yml @@ -16,7 +16,7 @@ rules: min_entropy: 3.3 confidence: medium examples: - - square EAAA7h9fL9zQJR8P0eAioAf9239345rDA2349bQ8edUA9FgA5JojdsF3A9f6nKLmn + - square EAAA7h9fL9zQJR8P0eAioAf9239345rDA2349bQ8edUA9FgA5JojdsF3A9f6nKLm - square EAAAvlYh9H7dZwC9ash2hrHjtlL5D2srERGK5OM6F2nvle23he3NzA60PAeFXNHj validation: type: Http diff --git a/data/rules/twitch.yml b/data/rules/twitch.yml index 1ab6102..d9ecc55 100644 --- a/data/rules/twitch.yml +++ b/data/rules/twitch.yml @@ -3,7 +3,6 @@ rules: id: kingfisher.twitch.1 pattern: | (?xi) - \b twitch (?:.|[\n\r]){0,32}? \b @@ -19,7 +18,7 @@ rules: confidence: medium examples: - TWITCH_TOKEN=abcdefghijklmnopqrstuvwx123456 - - "twitch_api_token: '0123456789abcdefghijklmnopqrstuv'" + - "twitch_api_token: '0123456789abcdefghijklmnopqrst'" references: - https://dev.twitch.tv/docs/authentication/validate-tokens/ validation: diff --git a/docs/RULES.md b/docs/RULES.md index 816dbe7..8a0d964 100644 --- a/docs/RULES.md +++ b/docs/RULES.md @@ -44,7 +44,7 @@ rules: min_lowercase: 1 # require at least 1 lowercase letter min_special_chars: 1 # require at least 1 special character special_chars: "!@#$%^&*()" # optional: custom special character set - exclude_words: # optional: drop matches containing these words + ignore_if_contains: # optional: drop matches containing these words - test validation: # (optional) live validation @@ -266,14 +266,16 @@ pattern_requirements: min_lowercase: 1 # Require at least 1 lowercase letter (a-z) min_special_chars: 1 # Require at least 1 special character special_chars: "!@#$%^&*" # Optional: define which characters are "special" - exclude_words: # Optional: reject matches containing any of these (case-insensitive) + ignore_if_contains: # Optional: reject matches containing any of these (case-insensitive) - test - demo ``` All fields are optional. If `special_chars` is not specified, the default set includes: `!@#$%^&*()_+-=[]{}|;:'",.<>?/\`~` -`exclude_words` performs a case-insensitive substring check. If any entry (after trimming whitespace) appears within the match, the match is discarded. This is helpful for dropping known dummy tokens such as "test" or "demo" that otherwise satisfy the regex. +`ignore_if_contains` performs a case-insensitive substring check. If any entry (after trimming whitespace) appears within the match, the match is discarded. This is helpful for dropping known dummy tokens such as "test" or "demo" that otherwise satisfy the regex. + +When this filter removes a match it is logged at the `DEBUG` level so you can see exactly which substring caused the skip. If you need to keep every match even when one of these substrings appears, pass `--no-ignore-if-contains` to `kingfisher scan`. The flag disables this post-processing step without changing the rule definitions. ### Example: Secure API Key @@ -295,7 +297,7 @@ rules: min_uppercase: 1 # Must contain at least 1 uppercase letter min_lowercase: 1 # Must contain at least 1 lowercase letter min_special_chars: 1 # Must contain at least 1 special character - exclude_words: + ignore_if_contains: - test examples: - api_key = "MyS3cur3K3y!2024" @@ -307,7 +309,7 @@ In this example: - The `pattern_requirements` filters out matches that don't have at least one of each required type - A match like `"abcdefghijklmnopqrst"` would be rejected (no uppercase, no digit, no special) - A match like `"Abc123!SecureToken"` would be accepted (has all required types) -- A match like `"Test123!SecureToken"` would be rejected because it contains the excluded word `test` +- A match like `"Test123!SecureToken"` would be rejected because it contains the `ignore_if_contains` term `test` ### Example: Excluding Dummy Values @@ -318,13 +320,13 @@ rules: pattern: |- (?i)token[:=]\s*([A-Za-z0-9]{12,}) pattern_requirements: - exclude_words: + ignore_if_contains: - placeholder - sample examples: - token: "REALVALUE1234" negative_examples: - - token = "SAMPLETOKEN9999" # dropped by exclude_words + - token = "SAMPLETOKEN9999" # dropped by ignore_if_contains ``` ### Example: Custom Special Characters diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 1f0aaf5..f30d028 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -144,6 +144,10 @@ pub struct ScanArgs { /// Disable inline ignore directives entirely #[arg(long = "no-ignore", default_value_t = false)] pub no_inline_ignore: bool, + + /// Disable rule-level `ignore_if_contains` filtering for pattern requirements + #[arg(long = "no-ignore-if-contains", default_value_t = false)] + pub no_ignore_if_contains: bool, } /// Confidence levels for findings diff --git a/src/main.rs b/src/main.rs index a7deda6..b52fcd3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -446,6 +446,7 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_base64: false, no_inline_ignore: false, + no_ignore_if_contains: false, } } /// Run the rules check command diff --git a/src/matcher.rs b/src/matcher.rs index e572c8b..fb39ab4 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -29,7 +29,7 @@ use crate::{ parser, parser::{Checker, Language}, rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, - rules::rule::Rule, + rules::rule::{PatternValidationResult, Rule}, rules_database::RulesDatabase, safe_list::{is_safe_match, is_user_match}, scanner_pool::ScannerPool, @@ -203,6 +203,9 @@ pub struct Matcher<'a> { /// Configuration that controls inline ignore directives inline_ignore_config: InlineIgnoreConfig, + + /// Whether matches should honour `ignore_if_contains` requirements. + respect_ignore_if_contains: bool, } /// This `Drop` implementation updates the `global_stats` with the local stats impl<'a> Drop for Matcher<'a> { @@ -232,6 +235,7 @@ impl<'a> Matcher<'a> { shared_profiler: Option>, extra_ignore_directives: &[String], disable_inline_ignores: bool, + respect_ignore_if_contains: bool, ) -> Result { // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec let raw_matches_scratch = Vec::new(); @@ -258,6 +262,7 @@ impl<'a> Matcher<'a> { } else { InlineIgnoreConfig::new(extra_ignore_directives) }, + respect_ignore_if_contains, }) } @@ -414,6 +419,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + self.respect_ignore_if_contains, &self.inline_ignore_config, ); } @@ -439,6 +445,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + self.respect_ignore_if_contains, &self.inline_ignore_config, ); } @@ -470,6 +477,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + self.respect_ignore_if_contains, &self.inline_ignore_config, ); } @@ -574,6 +582,7 @@ fn filter_match<'b>( redact: bool, filename: &str, profiler: Option<&Arc>, + respect_ignore_if_contains: bool, inline_ignore_config: &InlineIgnoreConfig, ) { let mut timer = @@ -605,12 +614,22 @@ fn filter_match<'b>( // Check character requirements if specified if let Some(char_reqs) = rule.pattern_requirements() { - if !char_reqs.validate(mi_bytes) { - debug!( - "Skipping match that does not meet character requirements for rule {}", - rule.id() - ); - continue; + match char_reqs.validate(mi_bytes, respect_ignore_if_contains) { + PatternValidationResult::Passed => {} + PatternValidationResult::Failed => { + debug!( + "Skipping match that does not meet character requirements for rule {}", + rule.id() + ); + continue; + } + PatternValidationResult::IgnoredBySubstring { matched_term } => { + debug!( + "Skipping match for rule {} because it contains ignored term {matched_term}", + rule.id() + ); + continue; + } } } @@ -1056,6 +1075,7 @@ mod test { None, &[], false, + true, ) .unwrap(); @@ -1131,6 +1151,7 @@ mod test { None, // Pass the shared profiler &[], false, + true, )?; matcher.scan_bytes_raw(input.as_bytes(), "fname")?; assert_eq!( @@ -1141,7 +1162,7 @@ mod test { } #[test] - fn test_pattern_requirements_exclude_words_filters_matches() -> Result<()> { + fn test_pattern_requirements_ignore_if_contains_filters_matches() -> Result<()> { let rules = vec![Rule::new(RuleSyntax { id: "test.exclude".to_string(), name: "exclude words".to_string(), @@ -1160,7 +1181,7 @@ mod test { min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: Some(vec!["TEST".to_string()]), + ignore_if_contains: Some(vec!["TEST".to_string()]), }), })]; @@ -1168,8 +1189,17 @@ mod test { let input = b"prefixgood prefixtest"; let seen_blobs: BlobIdMap = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen_blobs, None, false, None, &[], false)?; + let mut matcher = Matcher::new( + &rules_db, + scanner_pool, + &seen_blobs, + None, + false, + None, + &[], + false, + true, + )?; let blob = Blob::from_bytes(input.to_vec()); let origin = OriginSet::from(Origin::from_file(PathBuf::from("exclude.txt"))); @@ -1184,16 +1214,75 @@ mod test { } }; - assert_eq!(matches.len(), 1, "exclude_words should drop filtered matches"); + assert_eq!(matches.len(), 1, "ignore_if_contains should drop filtered matches"); assert_eq!( - matches[0].matching_input, - b"prefixgood", + matches[0].matching_input, b"prefixgood", "remaining match should be the non-excluded token", ); Ok(()) } + #[test] + fn test_pattern_requirements_ignore_if_contains_can_be_disabled_in_matcher() -> Result<()> { + let rules = vec![Rule::new(RuleSyntax { + id: "test.exclude".to_string(), + name: "exclude words".to_string(), + pattern: "(?Pprefix[A-Za-z]+)".to_string(), + confidence: crate::rules::rule::Confidence::Medium, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None, + depends_on_rule: vec![], + pattern_requirements: Some(PatternRequirements { + min_digits: None, + min_uppercase: None, + min_lowercase: None, + min_special_chars: None, + special_chars: None, + ignore_if_contains: Some(vec!["TEST".to_string()]), + }), + })]; + + let rules_db = RulesDatabase::from_rules(rules)?; + let input = b"prefixgood prefixtest"; + let seen_blobs: BlobIdMap = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new( + &rules_db, + scanner_pool, + &seen_blobs, + None, + false, + None, + &[], + false, + false, + )?; + + let blob = Blob::from_bytes(input.to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("exclude-disabled.txt"))); + + let matches = match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => matches, + ScanResult::SeenWithMatches => { + panic!( + "unexpected scan result: blob should not be considered previously seen with matches" + ) + } + ScanResult::SeenSansMatches => { + panic!( + "unexpected scan result: blob should not be considered previously seen without matches" + ) + } + }; + + assert_eq!(matches.len(), 2, "disabling ignore_if_contains should keep all matches"); + Ok(()) + } // --------------------------------------------------------------------- // additional deterministic unit-tests @@ -1274,7 +1363,8 @@ mod test { let rules_db = RulesDatabase::from_rules(vec![rule])?; let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; + let mut m = + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; let buf = b"dup dup"; // two literal hits, same rule @@ -1312,7 +1402,7 @@ mod test { let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec()); let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt"))); @@ -1345,7 +1435,7 @@ mod test { let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; let blob = Blob::from_bytes( br#"let data = """ @@ -1390,7 +1480,7 @@ line2 let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; let matches_without_compat = match matcher.scan_blob(&blob, &origin, None, false, false, false)? { ScanResult::New(matches) => matches.len(), @@ -1402,7 +1492,7 @@ line2 let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); let extra = vec![String::from("gitleaks:allow")]; let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &extra, false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &extra, false, true)?; match matcher.scan_blob(&blob, &origin, None, false, false, false)? { ScanResult::New(matches) => assert!(matches.is_empty()), _ => panic!("unexpected scan result"), diff --git a/src/reporter.rs b/src/reporter.rs index bb96064..e8e29ba 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -807,6 +807,7 @@ mod tests { skip_aws_account: Vec::new(), skip_aws_account_file: None, no_inline_ignore: false, + no_ignore_if_contains: false, } } diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 11c5b30..f9fe2bf 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -180,6 +180,7 @@ mod tests { skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, + no_ignore_if_contains: false, } } diff --git a/src/rules/rule.rs b/src/rules/rule.rs index 051adc3..f1f9615 100644 --- a/src/rules/rule.rs +++ b/src/rules/rule.rs @@ -72,7 +72,7 @@ pub struct PatternRequirements { pub special_chars: Option, /// Words that should cause the match to be excluded when present (case-insensitive) #[serde(default)] - pub exclude_words: Option>, + pub ignore_if_contains: Option>, } impl PatternRequirements { @@ -80,8 +80,13 @@ impl PatternRequirements { const DEFAULT_SPECIAL_CHARS: &'static str = "!@#$%^&*()_+-=[]{}|;:'\",.<>?/\\`~"; /// Validates whether the given byte slice meets the character requirements. - /// Returns true if all requirements are met, false otherwise. - pub fn validate(&self, input: &[u8]) -> bool { + /// Returns the validation outcome, including whether the match should be ignored + /// due to `ignore_if_contains` entries when that behaviour is enabled. + pub fn validate( + &self, + input: &[u8], + respect_ignore_if_contains: bool, + ) -> PatternValidationResult { // Convert to string (lossy for non-UTF8) let s = String::from_utf8_lossy(input); @@ -89,7 +94,7 @@ impl PatternRequirements { if let Some(min_digits) = self.min_digits { let digit_count = s.chars().filter(|c| c.is_ascii_digit()).count(); if digit_count < min_digits { - return false; + return PatternValidationResult::Failed; } } @@ -97,7 +102,7 @@ impl PatternRequirements { if let Some(min_uppercase) = self.min_uppercase { let uppercase_count = s.chars().filter(|c| c.is_ascii_uppercase()).count(); if uppercase_count < min_uppercase { - return false; + return PatternValidationResult::Failed; } } @@ -105,7 +110,7 @@ impl PatternRequirements { if let Some(min_lowercase) = self.min_lowercase { let lowercase_count = s.chars().filter(|c| c.is_ascii_lowercase()).count(); if lowercase_count < min_lowercase { - return false; + return PatternValidationResult::Failed; } } @@ -115,33 +120,52 @@ impl PatternRequirements { self.special_chars.as_deref().unwrap_or(Self::DEFAULT_SPECIAL_CHARS); let special_count = s.chars().filter(|c| special_chars.contains(*c)).count(); if special_count < min_special { - return false; + return PatternValidationResult::Failed; } } - // Check exclude words requirement - if let Some(exclude_words) = self.exclude_words.as_ref() { - let lowercase_input = s.to_lowercase(); - if exclude_words - .iter() - .filter_map(|word| { - let trimmed = word.trim(); - if trimmed.is_empty() { - None - } else { - Some(trimmed.to_lowercase()) - } - }) - .any(|word| lowercase_input.contains(&word)) - { - return false; + // Check ignore-if-contains requirement + if respect_ignore_if_contains { + if let Some(ignore_terms) = self.ignore_if_contains.as_ref() { + let lowercase_input = s.to_lowercase(); + if let Some(matched_term) = ignore_terms + .iter() + .filter_map(|term| { + let trimmed = term.trim(); + if trimmed.is_empty() { + None + } else { + Some((trimmed, trimmed.to_lowercase())) + } + }) + .find_map(|(original, lowered)| { + if lowercase_input.contains(&lowered) { + Some(original.to_string()) + } else { + None + } + }) + { + return PatternValidationResult::IgnoredBySubstring { matched_term }; + } } } - true + PatternValidationResult::Passed } } +/// Result of validating [`PatternRequirements`] against a potential match. +#[derive(Debug, PartialEq, Eq)] +pub enum PatternValidationResult { + /// All requirements are satisfied and the match should be kept. + Passed, + /// Requirements were not satisfied. + Failed, + /// The match contains one of the `ignore_if_contains` substrings and should be skipped. + IgnoredBySubstring { matched_term: String }, +} + /// Configuration for HTTP validation. This contains a request configuration /// and an optional multipart configuration. #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] @@ -549,17 +573,17 @@ mod tests { min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has 3 digits - assert!(reqs.validate(b"abc123def")); + assert!(matches!(reqs.validate(b"abc123def", true), PatternValidationResult::Passed)); // Should fail: only 1 digit - assert!(!reqs.validate(b"abc1def")); + assert!(matches!(reqs.validate(b"abc1def", true), PatternValidationResult::Failed)); // Should fail: no digits - assert!(!reqs.validate(b"abcdef")); + assert!(matches!(reqs.validate(b"abcdef", true), PatternValidationResult::Failed)); } #[test] @@ -570,17 +594,17 @@ mod tests { min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has 3 uppercase - assert!(reqs.validate(b"ABCdef")); + assert!(matches!(reqs.validate(b"ABCdef", true), PatternValidationResult::Passed)); // Should fail: only 1 uppercase - assert!(!reqs.validate(b"Adef")); + assert!(matches!(reqs.validate(b"Adef", true), PatternValidationResult::Failed)); // Should fail: no uppercase - assert!(!reqs.validate(b"abcdef")); + assert!(matches!(reqs.validate(b"abcdef", true), PatternValidationResult::Failed)); } #[test] @@ -591,17 +615,17 @@ mod tests { min_lowercase: Some(2), min_special_chars: None, special_chars: None, - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has 3 lowercase - assert!(reqs.validate(b"ABCdef")); + assert!(matches!(reqs.validate(b"ABCdef", true), PatternValidationResult::Passed)); // Should fail: only 1 lowercase - assert!(!reqs.validate(b"ABCd")); + assert!(matches!(reqs.validate(b"ABCd", true), PatternValidationResult::Failed)); // Should fail: no lowercase - assert!(!reqs.validate(b"ABC123")); + assert!(matches!(reqs.validate(b"ABC123", true), PatternValidationResult::Failed)); } #[test] @@ -612,17 +636,17 @@ mod tests { min_lowercase: None, min_special_chars: Some(2), special_chars: None, // uses default - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has 2 special chars - assert!(reqs.validate(b"abc!@def")); + assert!(matches!(reqs.validate(b"abc!@def", true), PatternValidationResult::Passed)); // Should fail: only 1 special char - assert!(!reqs.validate(b"abc!def")); + assert!(matches!(reqs.validate(b"abc!def", true), PatternValidationResult::Failed)); // Should fail: no special chars - assert!(!reqs.validate(b"abcdef")); + assert!(matches!(reqs.validate(b"abcdef", true), PatternValidationResult::Failed)); } #[test] @@ -633,17 +657,17 @@ mod tests { min_lowercase: None, min_special_chars: Some(2), special_chars: Some("$%^".to_string()), - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has 2 custom special chars - assert!(reqs.validate(b"abc$%def")); + assert!(matches!(reqs.validate(b"abc$%def", true), PatternValidationResult::Passed)); // Should fail: has special chars but not the custom ones - assert!(!reqs.validate(b"abc!@def")); + assert!(matches!(reqs.validate(b"abc!@def", true), PatternValidationResult::Failed)); // Should fail: only 1 custom special char - assert!(!reqs.validate(b"abc$def")); + assert!(matches!(reqs.validate(b"abc$def", true), PatternValidationResult::Failed)); } #[test] @@ -654,60 +678,90 @@ mod tests { min_lowercase: Some(1), min_special_chars: Some(1), special_chars: None, - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has all requirements - assert!(reqs.validate(b"Abc1!")); + assert!(matches!(reqs.validate(b"Abc1!", true), PatternValidationResult::Passed)); // Should fail: missing digit - assert!(!reqs.validate(b"Abc!")); + assert!(matches!(reqs.validate(b"Abc!", true), PatternValidationResult::Failed)); // Should fail: missing uppercase - assert!(!reqs.validate(b"abc1!")); + assert!(matches!(reqs.validate(b"abc1!", true), PatternValidationResult::Failed)); // Should fail: missing lowercase - assert!(!reqs.validate(b"ABC1!")); + assert!(matches!(reqs.validate(b"ABC1!", true), PatternValidationResult::Failed)); // Should fail: missing special - assert!(!reqs.validate(b"Abc1")); + assert!(matches!(reqs.validate(b"Abc1", true), PatternValidationResult::Failed)); } #[test] - fn test_pattern_requirements_exclude_words() { + fn test_pattern_requirements_ignore_if_contains() { let reqs = PatternRequirements { min_digits: None, min_uppercase: None, min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: Some(vec!["test".to_string(), "Demo".to_string()]), + ignore_if_contains: Some(vec!["test".to_string(), "Demo".to_string()]), }; // Should fail: contains "test" (case-insensitive) - assert!(!reqs.validate(b"MyTestToken")); + assert!(matches!( + reqs.validate(b"MyTestToken", true), + PatternValidationResult::IgnoredBySubstring { .. } + )); // Should fail: contains "demo" (case-insensitive) - assert!(!reqs.validate(b"example-demo-value")); + assert!(matches!( + reqs.validate(b"example-demo-value", true), + PatternValidationResult::IgnoredBySubstring { .. } + )); // Should pass: does not contain excluded words - assert!(reqs.validate(b"example-value")); + assert!(matches!(reqs.validate(b"example-value", true), PatternValidationResult::Passed)); } #[test] - fn test_pattern_requirements_exclude_words_ignores_empty_entries() { + fn test_pattern_requirements_ignore_if_contains_ignores_empty_entries() { let reqs = PatternRequirements { min_digits: None, min_uppercase: None, min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: Some(vec![" ".to_string(), "".to_string(), "BLOCK".to_string()]), + ignore_if_contains: Some(vec![" ".to_string(), "".to_string(), "BLOCK".to_string()]), }; // Should fail only when non-empty exclusion matches - assert!(!reqs.validate(b"needs-blocking")); - assert!(reqs.validate(b"allowed")); + assert!(matches!( + reqs.validate(b"needs-blocking", true), + PatternValidationResult::IgnoredBySubstring { .. } + )); + assert!(matches!(reqs.validate(b"allowed", true), PatternValidationResult::Passed)); + } + + #[test] + fn test_pattern_requirements_ignore_if_contains_can_be_disabled() { + let reqs = PatternRequirements { + min_digits: None, + min_uppercase: None, + min_lowercase: None, + min_special_chars: None, + special_chars: None, + ignore_if_contains: Some(vec!["ignoreme".to_string()]), + }; + + // With ignoring enabled, the match is skipped + assert!(matches!( + reqs.validate(b"value-ignoreme", true), + PatternValidationResult::IgnoredBySubstring { .. } + )); + + // With ignoring disabled, the same input passes requirements + assert!(matches!(reqs.validate(b"value-ignoreme", false), PatternValidationResult::Passed)); } #[test] @@ -718,12 +772,12 @@ mod tests { min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: None, + ignore_if_contains: None, }; // Should pass: no requirements - assert!(reqs.validate(b"anything")); - assert!(reqs.validate(b"123")); - assert!(reqs.validate(b"!@#")); + assert!(matches!(reqs.validate(b"anything", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"123", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"!@#", true), PatternValidationResult::Passed)); } } diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index 06cc3c9..1a354ae 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -169,6 +169,7 @@ pub fn enumerate_filesystem_inputs( Some(shared_profiler), &args.extra_ignore_comments, args.no_inline_ignore, + !args.no_ignore_if_contains, )?; let blob_processor_init_time = Mutex::new(t1.elapsed()); let make_blob_processor = || -> BlobProcessor { diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index f041049..331f610 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -683,6 +683,7 @@ pub async fn fetch_s3_objects( Some(shared_profiler.clone()), &args.extra_ignore_comments, args.no_inline_ignore, + !args.no_ignore_if_contains, )?; let mut processor = BlobProcessor { matcher }; @@ -764,6 +765,7 @@ pub async fn fetch_gcs_objects( Some(shared_profiler.clone()), &args.extra_ignore_comments, args.no_inline_ignore, + !args.no_ignore_if_contains, )?; let mut processor = BlobProcessor { matcher }; diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs index d950c47..dcd9067 100644 --- a/tests/int_allowlist.rs +++ b/tests/int_allowlist.rs @@ -148,6 +148,7 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result Result<()> { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index b6eaae9..d059e73 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -168,6 +168,7 @@ rules: no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/int_github.rs b/tests/int_github.rs index f96dd16..91af8db 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -155,6 +155,7 @@ fn test_github_remote_scan() -> Result<()> { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; // Create global arguments let global_args = GlobalArgs { diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index 71421d1..6651fa7 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -153,6 +153,7 @@ fn test_gitlab_remote_scan() -> Result<()> { skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { @@ -304,6 +305,7 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/int_redact.rs b/tests/int_redact.rs index 86b9dd1..b9fb1f8 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -131,6 +131,7 @@ async fn test_redact_hashes_finding_values() -> Result<()> { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/int_slack.rs b/tests/int_slack.rs index d2d9048..27c5999 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -139,6 +139,7 @@ impl TestContext { skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, + no_ignore_if_contains: false, }; let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?; @@ -278,6 +279,7 @@ async fn test_scan_slack_messages() -> Result<()> { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index d868936..970ad0a 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -211,6 +211,7 @@ async fn test_validation_cache_and_depvars() -> Result<()> { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; /* --------------------------------------------------------- * diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 9a17797..3e8bc4a 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -154,6 +154,7 @@ impl TestContext { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules) @@ -281,6 +282,7 @@ impl TestContext { skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/smoke_baseline.rs b/tests/smoke_baseline.rs index 5c36797..8f9a8f2 100644 --- a/tests/smoke_baseline.rs +++ b/tests/smoke_baseline.rs @@ -1,9 +1,9 @@ use std::fs; use assert_cmd::Command; +use clap::Parser; use predicates::prelude::*; use tempfile::tempdir; -use clap::Parser; const GH_PAT: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa";