Added an optional `exclude_words` list to `PatternRequirements`: any match that contains one of the listed words (compared case-insensitively) is filtered out. Includes tests covering the new behavior.

2025-11-04 14:15:04 -05:00 · 2025-11-04 14:15:04 -05:00 · bd8bc09d0e
commit bd8bc09d0e
parent a3e426e6ee
4 changed files with 149 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,10 +2,11 @@

 All notable changes to this project will be documented in this file.

-## [Unrelease]
+## [Unreleased]
 - pattern_requirements for rules — Post-regex character-class gating to cut false positives without lookarounds. Authors can now require minimum counts of digits, uppercase, lowercase, and special characters, with an optional custom special-char set. Why? Hyperscan doesn’t support lookaheads/behinds, so many "must contain X and Y" checks had to be baked into the regex (hurting readability) or were impossible. `pattern_requirements` applies lightweight, in-memory checks after a match is found, keeping patterns fast and clean.
- updated rules with support for `pattern_requirements`
+- Updated many rules with `pattern_requirements`
 - Automatically set `--no-dedup` whenever `--manage-baseline` is supplied so baseline management retains every occurrence of a finding
+- Added an optional `exclude_words` list to `PatternRequirements`: any match that contains one of the listed words (compared case-insensitively) is filtered out. Includes tests covering the new behavior.

 ## [v1.61.0]
 - Fixed local filesystem scans to keep `open_path_as_is` enabled when opening Git repositories and only disable it for diff-based scans.
--- a/docs/RULES.md
+++ b/docs/RULES.md
@ -38,12 +38,14 @@ rules:
      - rule_id: kingfisher.aws.id
        variable: AKID              # referenced as {{ AKID }}

-    pattern_requirements:         # (optional) character type requirements
+    pattern_requirements:         # (optional) character/word requirements
      min_digits: 1                 # require at least 1 digit
      min_uppercase: 1              # require at least 1 uppercase letter
      min_lowercase: 1              # require at least 1 lowercase letter
      min_special_chars: 1          # require at least 1 special character
      special_chars: "!@#$%^&*()"   # optional: custom special character set
+      exclude_words:                # optional: drop matches containing any of these words (case-insensitive)
+        - test

    validation:                     # (optional) live validation
      type: Http
@ -78,7 +80,7 @@ rules:
 | examples                | Good matches; used for testing                                       |
 | visible                 | false to hide non‑secret captures (e.g. IDs)                         |
 | depends_on_rule         | Chain rules: use captures from one rule in another's validation      |
-| pattern_requirements  | Require specific character types (digits, uppercase, lowercase, special) |
+| pattern_requirements  | Require character types and/or exclude placeholder words from matches |
 | validation              | Configure HTTP, AWS, GCP, etc. checks to verify live validity        |


@ -264,10 +266,15 @@ pattern_requirements:
  min_lowercase: 1           # Require at least 1 lowercase letter (a-z)
  min_special_chars: 1       # Require at least 1 special character
  special_chars: "!@#$%^&*"  # Optional: define which characters are "special"
+  exclude_words:             # Optional: reject matches containing any of these (case-insensitive)
+    - test
+    - demo
 ```

 All fields are optional. If `special_chars` is not specified, the default set includes: `!@#$%^&*()_+-=[]{}|;:'",.<>?/\`~`

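+`exclude_words` performs a case-insensitive substring check. Each entry is trimmed of surrounding whitespace, and entries that are empty after trimming are ignored. If any remaining entry appears anywhere within the match, the match is discarded. This is helpful for dropping known dummy tokens such as "test" or "demo" that otherwise satisfy the regex. Because this is a substring check rather than a whole-word check, an entry like `test` also rejects matches containing `latest` or `contest`, so prefer specific words.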
+
 ### Example: Secure API Key

 ```yaml
@ -288,6 +295,8 @@ rules:
      min_uppercase: 1        # Must contain at least 1 uppercase letter
      min_lowercase: 1        # Must contain at least 1 lowercase letter
      min_special_chars: 1    # Must contain at least 1 special character
+      exclude_words:
+        - test
    examples:
      - api_key = "MyS3cur3K3y!2024"
      - api-key: "Abc123!@#Token"
@ -298,6 +307,25 @@ In this example:
 - The `pattern_requirements` filters out matches that don't have at least one of each required type
 - A match like `"abcdefghijklmnopqrst"` would be rejected (no uppercase, no digit, no special)
 - A match like `"Abc123!SecureToken"` would be accepted (has all required types)
+- A match like `"Test123!SecureToken"` would be rejected because it contains the excluded word `test` (the comparison is case-insensitive, so `Test` counts)
+
+### Example: Excluding Dummy Values
+
+```yaml
+rules:
+  - name: Token without placeholders
+    id: custom.token.2
+    pattern: |-
+      (?i)token\s*[:=]\s*"?([A-Za-z0-9]{12,})
+    pattern_requirements:
+      exclude_words:
+        - placeholder
+        - sample
+    examples:
+      - token: "REALVALUE1234"
+    negative_examples:
+      - token = "SAMPLETOKEN9999"  # dropped by exclude_words
+```

 ### Example: Custom Special Characters

--- a/src/matcher.rs
+++ b/src/matcher.rs
@ -1004,7 +1004,9 @@ mod test {
    use crate::{
        blob::{Blob, BlobIdMap},
        origin::{Origin, OriginSet},
-        rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation},
+        rules::rule::{
+            DependsOnRule, HttpRequest, HttpValidation, PatternRequirements, RuleSyntax, Validation,
+        },
    };

    proptest! {
@ -1138,6 +1140,51 @@ mod test {
        Ok(())
    }

+    #[test]
+    fn test_pattern_requirements_exclude_words_filters_matches() -> Result<()> {
+        let rules = vec![Rule::new(RuleSyntax {
+            id: "test.exclude".to_string(),
+            name: "exclude words".to_string(),
+            pattern: "(?P<token>prefix[A-Za-z]+)".to_string(),
+            confidence: crate::rules::rule::Confidence::Medium,
+            min_entropy: 0.0,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None,
+            depends_on_rule: vec![],
+            pattern_requirements: Some(PatternRequirements {
+                min_digits: None,
+                min_uppercase: None,
+                min_lowercase: None,
+                min_special_chars: None,
+                special_chars: None,
+                exclude_words: Some(vec!["TEST".to_string()]),
+            }),
+        })];
+
+        let rules_db = RulesDatabase::from_rules(rules)?;
+        let input = b"prefixgood prefixtest";
+        let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();
+        let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+        let mut matcher =
+            Matcher::new(&rules_db, scanner_pool, &seen_blobs, None, false, None, &[], false)?;
+
+        matcher.scan_bytes_raw(input, "fname")?;
+
+        let matches = &matcher.user_data.raw_matches_scratch;
+        assert_eq!(matches.len(), 1, "exclude_words should drop filtered matches");
+        let RawMatch { start_idx, end_idx, .. } = matches[0];
+        assert_eq!(
+            &input[start_idx as usize..end_idx as usize],
+            b"prefixgood",
+            "remaining match should be the non-excluded token",
+        );
+
+        Ok(())
+    }
+
    // ---------------------------------------------------------------------
    // additional deterministic unit-tests
    // ---------------------------------------------------------------------
--- a/src/rules/rule.rs
+++ b/src/rules/rule.rs
@ -70,6 +70,9 @@ pub struct PatternRequirements {
    /// Custom set of characters to consider as "special" (defaults to common punctuation)
    #[serde(default)]
    pub special_chars: Option<String>,
+    /// Words that should cause the match to be excluded when present (case-insensitive)
+    #[serde(default)]
+    pub exclude_words: Option<Vec<String>>,
 }

 impl PatternRequirements {
@ -108,13 +111,33 @@ impl PatternRequirements {

        // Check special character requirement
        if let Some(min_special) = self.min_special_chars {
-            let special_chars = self.special_chars.as_deref().unwrap_or(Self::DEFAULT_SPECIAL_CHARS);
+            let special_chars =
+                self.special_chars.as_deref().unwrap_or(Self::DEFAULT_SPECIAL_CHARS);
            let special_count = s.chars().filter(|c| special_chars.contains(*c)).count();
            if special_count < min_special {
                return false;
            }
        }

+        // Check exclude words requirement
+        if let Some(exclude_words) = self.exclude_words.as_ref() {
+            let lowercase_input = s.to_lowercase();
+            if exclude_words
+                .iter()
+                .filter_map(|word| {
+                    let trimmed = word.trim();
+                    if trimmed.is_empty() {
+                        None
+                    } else {
+                        Some(trimmed.to_lowercase())
+                    }
+                })
+                .any(|word| lowercase_input.contains(&word))
+            {
+                return false;
+            }
+        }
+
        true
    }
 }
@ -526,6 +549,7 @@ mod tests {
            min_lowercase: None,
            min_special_chars: None,
            special_chars: None,
+            exclude_words: None,
        };

        // Should pass: has 3 digits
@ -546,6 +570,7 @@ mod tests {
            min_lowercase: None,
            min_special_chars: None,
            special_chars: None,
+            exclude_words: None,
        };

        // Should pass: has 3 uppercase
@ -566,6 +591,7 @@ mod tests {
            min_lowercase: Some(2),
            min_special_chars: None,
            special_chars: None,
+            exclude_words: None,
        };

        // Should pass: has 3 lowercase
@ -586,6 +612,7 @@ mod tests {
            min_lowercase: None,
            min_special_chars: Some(2),
            special_chars: None, // uses default
+            exclude_words: None,
        };

        // Should pass: has 2 special chars
@ -606,6 +633,7 @@ mod tests {
            min_lowercase: None,
            min_special_chars: Some(2),
            special_chars: Some("$%^".to_string()),
+            exclude_words: None,
        };

        // Should pass: has 2 custom special chars
@ -626,6 +654,7 @@ mod tests {
            min_lowercase: Some(1),
            min_special_chars: Some(1),
            special_chars: None,
+            exclude_words: None,
        };

        // Should pass: has all requirements
@ -644,6 +673,43 @@ mod tests {
        assert!(!reqs.validate(b"Abc1"));
    }

+    #[test]
+    fn test_pattern_requirements_exclude_words() {
+        let reqs = PatternRequirements {
+            min_digits: None,
+            min_uppercase: None,
+            min_lowercase: None,
+            min_special_chars: None,
+            special_chars: None,
+            exclude_words: Some(vec!["test".to_string(), "Demo".to_string()]),
+        };
+
+        // Should fail: contains "test" (case-insensitive)
+        assert!(!reqs.validate(b"MyTestToken"));
+
+        // Should fail: contains "demo" (case-insensitive)
+        assert!(!reqs.validate(b"example-demo-value"));
+
+        // Should pass: does not contain excluded words
+        assert!(reqs.validate(b"example-value"));
+    }
+
+    #[test]
+    fn test_pattern_requirements_exclude_words_ignores_empty_entries() {
+        let reqs = PatternRequirements {
+            min_digits: None,
+            min_uppercase: None,
+            min_lowercase: None,
+            min_special_chars: None,
+            special_chars: None,
+            exclude_words: Some(vec![" ".to_string(), "".to_string(), "BLOCK".to_string()]),
+        };
+
+        // Should fail only when non-empty exclusion matches
+        assert!(!reqs.validate(b"needs-blocking"));
+        assert!(reqs.validate(b"allowed"));
+    }
+
    #[test]
    fn test_pattern_requirements_none() {
        let reqs = PatternRequirements {
@ -652,6 +718,7 @@ mod tests {
            min_lowercase: None,
            min_special_chars: None,
            special_chars: None,
+            exclude_words: None,
        };

        // Should pass: no requirements