use std::{sync::Arc, time::Instant}; use anyhow::{anyhow, bail, Result}; use regex::bytes::Regex; use tracing::{debug, debug_span, error}; use vectorscan_rs::{BlockDatabase, Flag, Pattern}; use crate::rules::rule::{Rule, RULE_COMMENTS_PATTERN}; pub struct RulesDatabase { // pub(crate) rules: Vec, pub(crate) rules: Vec>, pub(crate) anchored_regexes: Vec, pub(crate) vsdb: BlockDatabase, } pub fn format_regex_pattern(pattern: &str) -> String { // Remove comments and whitespace while preserving the regex pattern let no_comment_pattern = RULE_COMMENTS_PATTERN.replace_all(pattern, ""); // flattens multi-line regex into a single line no_comment_pattern .lines() .map(|line| line.trim()) .filter(|line| !line.is_empty()) .collect::>() .join("") } impl RulesDatabase { pub fn get_regex_by_rule_id(&self, rule_id: &str) -> Option<&Regex> { self.rules .iter() .position(|r| r.syntax().id == rule_id) .and_then(|index| self.anchored_regexes.get(index)) } pub fn get_rule_by_finding_fingerprint(&self, finding_fingerprint: &str) -> Option> { self.rules.iter().find(|r| r.finding_sha1_fingerprint() == finding_fingerprint).cloned() } pub fn get_rule_by_text_id(&self, text_id: &str) -> Option> { self.rules.iter().find(|r| r.id() == text_id).cloned() } pub fn get_rule_by_name(&self, name: &str) -> Option> { self.rules.iter().find(|r| r.name() == name).cloned() } pub fn from_rules(rules: Vec) -> Result { let rules: Vec> = rules.into_iter().map(Arc::new).collect(); let _span = debug_span!("RulesDatabase::from_rules").entered(); if rules.is_empty() { bail!("No rules to compile"); } let patterns: Vec = rules .iter() .enumerate() .map(|(id, rule)| { Pattern::new( rule.syntax().pattern.clone().into_bytes(), Flag::default(), Some(id.try_into().unwrap()), ) }) .collect(); let t1 = Instant::now(); match BlockDatabase::new(patterns) { Ok(vsdb) => { let d1 = t1.elapsed().as_secs_f64(); let (anchored_regexes, d2) = Self::compile_regexes(&rules)?; debug!("Compiled {} rules: vectorscan {}s; regex {}s", rules.len(), d1, d2); Ok(RulesDatabase { rules, vsdb, anchored_regexes }) } Err(e) => { error!( "Failed to create BlockDatabase: {}. Attempting to compile rules individually.", e ); Self::compile_rules_individually(rules) .map_err(|err| anyhow!("Failed to compile rules: {}\n{}", e, err)) } } } fn compile_rules_individually(rules: Vec>) -> Result { // NOTE: This function only used when attempting to determine which rule failed // to compile let mut compiled_rules = Vec::new(); let mut compiled_patterns = Vec::new(); let mut compiled_regexes = Vec::new(); let mut error_messages = Vec::new(); for (id, rule) in rules.into_iter().enumerate() { let pattern = Pattern::new( rule.syntax().pattern.clone().into_bytes(), Flag::default(), Some(id.try_into().unwrap()), ); match BlockDatabase::new(vec![pattern]) { Ok(_) => { // Recreate the pattern for the final compilation let final_pattern = Pattern::new( rule.syntax().pattern.clone().into_bytes(), Flag::default(), Some(id.try_into().unwrap()), ); compiled_patterns.push(final_pattern); match rule.syntax().as_regex() { Ok(regex) => { compiled_regexes.push(regex); compiled_rules.push(rule); } Err(e) => { error_messages.push(format!( "Failed to compile Regex for rule '{}' (ID: {}): {}", rule.name(), rule.id(), e )); } } } Err(e) => { error_messages.push(format!( "Failed to compile vectorscan pattern for rule '{}' (ID: {}): {}", rule.name(), rule.id(), e )); } } } if !error_messages.is_empty() { error!( "Errors occurred while compiling rules individually:\n{}", error_messages.join("\n") ); bail!("Failed to compile the following rules:\n{}", error_messages.join("\n")); } let vsdb = BlockDatabase::new(compiled_patterns)?; Ok(RulesDatabase { rules: compiled_rules, vsdb, anchored_regexes: compiled_regexes }) } fn compile_regexes(rules: &[Arc]) -> Result<(Vec, f64)> { // fn compile_regexes(rules: &[Rule],) -> Result<(Vec, f64,),> { let t2 = Instant::now(); let mut anchored_regexes = Vec::with_capacity(rules.len()); for rule in rules { match rule.syntax().as_regex() { Ok(regex) => anchored_regexes.push(regex), Err(e) => { error!( "Failed to compile Regex for rule '{}' (ID: {}): {}", rule.name(), rule.id(), e ); return Err(anyhow!( "Failed to compile Regex for rule '{}' (ID: {}): {}", rule.name(), rule.id(), e )); } } } let d2 = t2.elapsed().as_secs_f64(); Ok((anchored_regexes, d2)) } #[inline] pub fn num_rules(&self) -> usize { self.rules.len() } #[inline] pub fn get_rule(&self, index: usize) -> Option> { self.rules.get(index).cloned() } pub fn rules(&self) -> &[Arc] { &self.rules } } #[cfg(test)] mod test_vectorscan { use pretty_assertions::assert_eq; use super::*; #[test] pub fn test_vectorscan_sanity() -> Result<()> { use vectorscan_rs::{BlockDatabase, BlockScanner, Pattern, Scan}; let input = b"some test data for vectorscan"; let pattern = Pattern::new(b"test".to_vec(), Flag::CASELESS | Flag::SOM_LEFTMOST, None); let db: BlockDatabase = BlockDatabase::new(vec![pattern])?; let mut scanner = BlockScanner::new(&db)?; let mut matches: Vec<(u64, u64)> = vec![]; scanner.scan(input, |id: u32, from: u64, to: u64, _flags: u32| { println!("found pattern #{} @ [{}, {})", id, from, to); matches.push((from, to)); Scan::Continue })?; assert_eq!(matches, vec![(5, 9)]); Ok(()) } } #[cfg(test)] #[cfg(test)] mod test_regex_cleaning { use super::*; #[test] fn test_format_regex_pattern() { let input = r#"(?x) (?i) (?: \\b (?:AWS|AMAZON|AMZN|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA) (?:\\.|[\\n\\r]){0,32}? (?# THIS IS A COMMENTCOMMENTCOMMENTCOMMENTCOMMENTCOMMENTCOMMENT) (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) # THIS IS A COMMENT THAT SHOULD NOT BE USED BUT MIGHT BE (?:\\.|[\\n\\r]){0,32}? \\b ( [A-Za-z0-9/+=]{40} ) \\b | \\b (?:SECRET|PRIVATE|ACCESS) (?:\\.|[\\n\\r]){0,16}? (?:KEY|TOKEN) (?:\\.|[\\n\\r]){0,32}? \\b ( [A-Za-z0-9/+=]{40} ) \\b )"#; let data = format_regex_pattern(input); println!("{}", data); } }