Added an optional exclude_words list to PatternRequirements so that matches containing any of the listed placeholder words (compared case-insensitively) are filtered out, with accompanying tests covering the new behavior.

This commit is contained in:
Mick Grove 2025-11-04 14:15:04 -05:00
commit 3b3a4e5030
4 changed files with 149 additions and 6 deletions

View file

@ -1004,7 +1004,9 @@ mod test {
use crate::{
blob::{Blob, BlobIdMap},
origin::{Origin, OriginSet},
rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation},
rules::rule::{
DependsOnRule, HttpRequest, HttpValidation, PatternRequirements, RuleSyntax, Validation,
},
};
proptest! {
@ -1138,6 +1140,51 @@ mod test {
Ok(())
}
/// Scanner-level check that `exclude_words` is applied to raw matches.
///
/// The input holds two tokens that both satisfy the rule's regex. The rule
/// excludes the word "TEST" (upper-case) while the input contains "test"
/// (lower-case), so only `prefixgood` may remain in the raw-match scratch
/// buffer after scanning.
#[test]
fn test_pattern_requirements_exclude_words_filters_matches() -> Result<()> {
    // Requirements that impose nothing except the single exclusion word.
    let requirements = PatternRequirements {
        min_digits: None,
        min_uppercase: None,
        min_lowercase: None,
        min_special_chars: None,
        special_chars: None,
        exclude_words: Some(vec!["TEST".to_string()]),
    };

    let rule = Rule::new(RuleSyntax {
        id: "test.exclude".to_string(),
        name: "exclude words".to_string(),
        pattern: "(?P<token>prefix[A-Za-z]+)".to_string(),
        confidence: crate::rules::rule::Confidence::Medium,
        min_entropy: 0.0,
        visible: true,
        examples: vec![],
        negative_examples: vec![],
        references: vec![],
        validation: None,
        depends_on_rule: vec![],
        pattern_requirements: Some(requirements),
    });
    let rules_db = RulesDatabase::from_rules(vec![rule])?;

    let input = b"prefixgood prefixtest";
    let seen_blobs: BlobIdMap<bool> = BlobIdMap::new();

    // Build the matcher on top of a scanner pool backed by the rules database.
    let vsdb = Arc::new(rules_db.vsdb.clone());
    let scanner_pool = Arc::new(ScannerPool::new(vsdb));
    let mut matcher =
        Matcher::new(&rules_db, scanner_pool, &seen_blobs, None, false, None, &[], false)?;
    matcher.scan_bytes_raw(input, "fname")?;

    // Exactly one raw match must survive the exclusion filter.
    let raw_matches = &matcher.user_data.raw_matches_scratch;
    assert_eq!(raw_matches.len(), 1, "exclude_words should drop filtered matches");

    // The survivor must cover the token that does not contain the excluded word.
    let RawMatch { start_idx, end_idx, .. } = raw_matches[0];
    let surviving_token = &input[start_idx as usize..end_idx as usize];
    assert_eq!(
        surviving_token,
        b"prefixgood",
        "remaining match should be the non-excluded token",
    );
    Ok(())
}
// ---------------------------------------------------------------------
// additional deterministic unit-tests
// ---------------------------------------------------------------------

View file

@ -70,6 +70,9 @@ pub struct PatternRequirements {
/// Custom set of characters to consider as "special" (defaults to common punctuation)
#[serde(default)]
pub special_chars: Option<String>,
/// Words that should cause the match to be excluded when present (case-insensitive)
#[serde(default)]
pub exclude_words: Option<Vec<String>>,
}
impl PatternRequirements {
@ -108,13 +111,33 @@ impl PatternRequirements {
// Check special character requirement
if let Some(min_special) = self.min_special_chars {
let special_chars = self.special_chars.as_deref().unwrap_or(Self::DEFAULT_SPECIAL_CHARS);
let special_chars =
self.special_chars.as_deref().unwrap_or(Self::DEFAULT_SPECIAL_CHARS);
let special_count = s.chars().filter(|c| special_chars.contains(*c)).count();
if special_count < min_special {
return false;
}
}
// Check exclude words requirement
if let Some(exclude_words) = self.exclude_words.as_ref() {
let lowercase_input = s.to_lowercase();
if exclude_words
.iter()
.filter_map(|word| {
let trimmed = word.trim();
if trimmed.is_empty() {
None
} else {
Some(trimmed.to_lowercase())
}
})
.any(|word| lowercase_input.contains(&word))
{
return false;
}
}
true
}
}
@ -526,6 +549,7 @@ mod tests {
min_lowercase: None,
min_special_chars: None,
special_chars: None,
exclude_words: None,
};
// Should pass: has 3 digits
@ -546,6 +570,7 @@ mod tests {
min_lowercase: None,
min_special_chars: None,
special_chars: None,
exclude_words: None,
};
// Should pass: has 3 uppercase
@ -566,6 +591,7 @@ mod tests {
min_lowercase: Some(2),
min_special_chars: None,
special_chars: None,
exclude_words: None,
};
// Should pass: has 3 lowercase
@ -586,6 +612,7 @@ mod tests {
min_lowercase: None,
min_special_chars: Some(2),
special_chars: None, // uses default
exclude_words: None,
};
// Should pass: has 2 special chars
@ -606,6 +633,7 @@ mod tests {
min_lowercase: None,
min_special_chars: Some(2),
special_chars: Some("$%^".to_string()),
exclude_words: None,
};
// Should pass: has 2 custom special chars
@ -626,6 +654,7 @@ mod tests {
min_lowercase: Some(1),
min_special_chars: Some(1),
special_chars: None,
exclude_words: None,
};
// Should pass: has all requirements
@ -644,6 +673,43 @@ mod tests {
assert!(!reqs.validate(b"Abc1"));
}
/// `validate` must reject any input containing one of the excluded words,
/// regardless of the letter case used in either the word list or the input.
#[test]
fn test_pattern_requirements_exclude_words() {
    // Mixed-case entries on purpose: the comparison is expected to ignore case.
    let blocked_words = vec!["test".to_string(), "Demo".to_string()];
    let requirements = PatternRequirements {
        min_digits: None,
        min_uppercase: None,
        min_lowercase: None,
        min_special_chars: None,
        special_chars: None,
        exclude_words: Some(blocked_words),
    };

    // Rejected: "Test" inside the token matches the excluded word "test".
    let accepts_test_token = requirements.validate(b"MyTestToken");
    assert!(!accepts_test_token);

    // Rejected: "demo" inside the value matches the excluded word "Demo".
    let accepts_demo_value = requirements.validate(b"example-demo-value");
    assert!(!accepts_demo_value);

    // Accepted: none of the excluded words occur in the input.
    let accepts_clean_value = requirements.validate(b"example-value");
    assert!(accepts_clean_value);
}
/// Blank or whitespace-only entries in `exclude_words` must not exclude
/// anything; only the real entry ("BLOCK") is allowed to reject input.
#[test]
fn test_pattern_requirements_exclude_words_ignores_empty_entries() {
    // Two degenerate entries followed by one meaningful exclusion word.
    let word_list = vec![" ".to_string(), "".to_string(), "BLOCK".to_string()];
    let requirements = PatternRequirements {
        min_digits: None,
        min_uppercase: None,
        min_lowercase: None,
        min_special_chars: None,
        special_chars: None,
        exclude_words: Some(word_list),
    };

    // Rejected: "block" inside "needs-blocking" matches "BLOCK" case-insensitively.
    let accepts_blocking = requirements.validate(b"needs-blocking");
    assert!(!accepts_blocking);

    // Accepted: if the empty entry were honoured it would match every input,
    // so this passing shows that empty entries are skipped.
    let accepts_allowed = requirements.validate(b"allowed");
    assert!(accepts_allowed);
}
#[test]
fn test_pattern_requirements_none() {
let reqs = PatternRequirements {
@ -652,6 +718,7 @@ mod tests {
min_lowercase: None,
min_special_chars: None,
special_chars: None,
exclude_words: None,
};
// Should pass: no requirements