From c3513ea2067eec283293f7edb4eea8b49d8e9cd8 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Tue, 2 Sep 2025 19:54:44 -0700 Subject: [PATCH] Optimized memory usage via string interning and extensive data sharing --- CHANGELOG.md | 3 ++- src/content_type.rs | 3 +-- src/decompress.rs | 8 +++--- src/findings_store.rs | 17 ++++++------ src/matcher.rs | 40 ++++++++++------------------ src/reporter.rs | 8 +++--- src/reporter/json_format.rs | 43 ++++++++++++++++-------------- src/scanner/docker.rs | 10 ++++--- src/scanner/validation.rs | 53 +++++++++++-------------------------- src/validation.rs | 6 +++-- src/validation/utils.rs | 21 ++++++++------- tests/fingerprint_dedup.rs | 27 ++++++++++++++----- 12 files changed, 114 insertions(+), 125 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fda4998..2eae1b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,8 @@ All notable changes to this project will be documented in this file. ## [Unreleased] -- Improved error message when self-update cannot find the current binary. +- Improved error message when self-update cannot find the current binary +- Optimized memory usage via string interning and extensive data sharing ## [1.47.0] - MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them diff --git a/src/content_type.rs b/src/content_type.rs index 197258d..ee44da6 100644 --- a/src/content_type.rs +++ b/src/content_type.rs @@ -96,9 +96,8 @@ impl ContentInspector { #[inline] #[must_use] pub fn guess_charset(&self, bytes: &[u8]) -> Option { - String::from_utf8(bytes.to_vec()).ok().map(|_| "UTF-8".to_string()) + std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string()) } - /// Guess programming language with broad coverage using `tokei`. /// /// Strategy (no disk I/O): diff --git a/src/decompress.rs b/src/decompress.rs index 49dc7ef..09f327f 100644 --- a/src/decompress.rs +++ b/src/decompress.rs @@ -162,10 +162,10 @@ fn handle_zip_archive_streaming( } fn handle_asar_archive_in_memory( - buffer: Vec, + buffer: &[u8], archive_path: &Path, ) -> Result { - match AsarReader::new(&buffer, None) { + match AsarReader::new(buffer, None) { Ok(reader) => { let mut contents = Vec::new(); for (path_in_asar, file) in reader.files() { @@ -200,7 +200,7 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result { let mmap = unsafe { Mmap::map(&file)? }; - return handle_asar_archive_in_memory(mmap.to_vec(), path); + return handle_asar_archive_in_memory(&mmap, path); } "tar" => { if let Some(base) = base_dir { @@ -525,4 +525,4 @@ mod tests { Ok(()) } -} +} \ No newline at end of file diff --git a/src/findings_store.rs b/src/findings_store.rs index fc610f2..23d8c7b 100644 --- a/src/findings_store.rs +++ b/src/findings_store.rs @@ -18,6 +18,7 @@ use crate::{ matcher::Match, origin::{Origin, OriginSet}, rules::rule::Rule, + util::intern, }; // share with Arc so every blob/origin is materialised once @@ -151,7 +152,7 @@ impl FindingsStore { .captures .get(1) .or_else(|| m.groups.captures.get(0)) - .map_or("", |c| c.value.as_ref()); + .map_or("", |c| c.value); let origin_kind = match origin.first() { Origin::GitRepo(_) => "git", @@ -160,7 +161,7 @@ impl FindingsStore { }; let key = xxh3_64( - format!("{}|{}|{}", m.rule_text_id.to_uppercase(), origin_kind, snippet) + format!("{}|{}|{}", m.rule.id().to_uppercase(), origin_kind, snippet) .as_bytes(), ); @@ -280,7 +281,7 @@ impl FindingsStore { pub fn get_summary(&self) -> FxHashMap<&'static str, usize> { self.matches.iter().fold(FxHashMap::default(), |mut acc, msg| { let (_, _, m) = &**msg; - *acc.entry(m.rule_name).or_insert(0) += 1; // borrow, no alloc + *acc.entry(intern(m.rule.name())).or_insert(0) += 1; acc }) } @@ -342,13 +343,13 @@ impl FindingsStore { self.matches.iter().map(|msg| { let (_, _, match_item) = &**msg; finding_data::FindingMetadata { - rule_name: match_item.rule_name.to_string(), + rule_name: match_item.rule.name().to_string(), num_matches: 1, comment: None, visible: match_item.visible, finding_id: match_item.finding_id(), - rule_finding_fingerprint: match_item.rule_finding_fingerprint.to_string(), - rule_text_id: match_item.rule_text_id.to_string(), + rule_finding_fingerprint: match_item.rule.finding_sha1_fingerprint().to_string(), + rule_text_id: match_item.rule.id().to_string(), } }) } @@ -362,7 +363,7 @@ impl FindingsStore { .iter() .filter(|msg| { let (_, _, match_item) = &***msg; - match_item.rule_name == metadata.rule_name + match_item.rule.name() == metadata.rule_name }) .map(|msg| { let (origin, blob_metadata, match_item) = &**msg; @@ -373,7 +374,7 @@ impl FindingsStore { match_id: MatchIdInt::from_str(&match_item.finding_id())?, match_comment: None, visible: match_item.visible, - match_confidence: match_item.rule_confidence, + match_confidence: match_item.rule.confidence(), validation_response_body: match_item.validation_response_body.clone(), validation_response_status: match_item.validation_response_status, validation_success: match_item.validation_success, diff --git a/src/matcher.rs b/src/matcher.rs index 1cbb16f..0ecc7ec 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -682,20 +682,20 @@ impl JsonSchema for Groups { // pub end: usize, // End position of the match // pub value: String, // The actual captured value // } -#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Clone, Serialize, JsonSchema)] pub struct SerializableCapture { pub name: Option, pub match_number: i32, pub start: usize, pub end: usize, - // Instead of storing an owned String, store a borrowed or interned value. - // Here we use Cow to allow either borrowing or owning as needed. - pub value: std::borrow::Cow<'static, str>, + /// Interned value of the capture. + pub value: &'static str, } -#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Clone, Serialize, JsonSchema)] pub struct SerializableCaptures { - pub captures: Vec, // All captures (named and unnamed) + #[schemars(with = "Vec")] + pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed) } impl SerializableCaptures { pub fn from_captures( @@ -704,7 +704,7 @@ impl SerializableCaptures { re: &Regex, redact: bool, ) -> Self { - let mut serialized_captures = Vec::new(); + let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new(); // Process named captures for name in re.capture_names().flatten() { if let Some(capture) = captures.name(name) { @@ -718,7 +718,7 @@ impl SerializableCaptures { match_number: -1, start: capture.start(), end: capture.end(), - value: value.into(), + value: intern(&value), }); } } @@ -735,7 +735,7 @@ impl SerializableCaptures { match_number: i32::try_from(i).unwrap_or(0), start: capture.start(), end: capture.end(), - value: value.into(), + value: intern(&value), }); } } @@ -764,16 +764,9 @@ pub struct Match { pub finding_fingerprint: u64, /// The rule that produced this match - pub rule_finding_fingerprint: &'static str, - - /// The text identifier of the rule that produced this match - pub rule_text_id: &'static str, - - /// The name of the rule that produced this match - pub rule_name: &'static str, - - /// The confidence property of the rule that produced this match - pub rule_confidence: crate::rules::rule::Confidence, + #[serde(skip_serializing)] + #[schemars(skip)] + pub rule: Arc, /// Validation Body pub validation_response_body: String, @@ -813,8 +806,6 @@ impl Match { let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or(""); let source_span = loc_mapping.get_source_span(&offset_span); - let rule_finding_fingerprint = owned_blob_match.rule.finding_sha1_fingerprint().to_owned(); - let offset_start: u64 = owned_blob_match.matching_input_offset_span.start.try_into().unwrap(); let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap(); @@ -828,10 +819,7 @@ impl Match { // matching_snippet Match { - rule_finding_fingerprint: intern(&rule_finding_fingerprint), - rule_name: intern(owned_blob_match.rule.name()), - rule_confidence: owned_blob_match.rule.confidence(), - rule_text_id: intern(owned_blob_match.rule.id()), + rule: owned_blob_match.rule.clone(), visible: owned_blob_match.rule.visible().to_owned(), location: Location { offset_span, source_span: source_span.clone() }, groups: owned_blob_match.captures.clone(), @@ -852,7 +840,7 @@ impl Match { pub fn finding_id(&self) -> String { let mut h = Sha1::new(); - write!(&mut h, "{}\0", self.rule_finding_fingerprint) + write!(&mut h, "{}\0", self.rule.finding_sha1_fingerprint()) .expect("should be able to write to memory"); serde_json::to_writer(&mut h, &self.groups) .expect("should be able to serialize groups as JSON"); diff --git a/src/reporter.rs b/src/reporter.rs index c9c2a62..dacf75b 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -216,7 +216,7 @@ impl DetailsReporter { m: match_item.clone(), comment: None, visible: match_item.visible, - match_confidence: match_item.rule_confidence, + match_confidence: match_item.rule.confidence(), validation_response_body: match_item.validation_response_body.clone(), validation_response_status: match_item.validation_response_status, validation_success: match_item.validation_success, @@ -366,13 +366,13 @@ impl DetailsReporter { FindingReporterRecord { rule: RuleMetadata { - name: rm.m.rule_name.to_string(), - id: rm.m.rule_text_id.to_string(), + name: rm.m.rule.name().to_string(), + id: rm.m.rule.id().to_string(), }, finding: FindingRecordData { snippet, fingerprint: rm.m.finding_fingerprint.to_string(), - confidence: rm.match_confidence.to_string(), + confidence: rm.m.rule.confidence().to_string(), entropy: format!("{:.2}", rm.m.calculated_entropy), validation: ValidationInfo { status: validation_status, response: response_body }, language: rm diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 0b562d2..7cf90c8 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -35,6 +35,7 @@ mod tests { use crate::cli::commands::github::GitHistoryMode; use crate::cli::commands::rules::RuleSpecifierArgs; use crate::matcher::{SerializableCapture, SerializableCaptures}; + use crate::rules::rule::{Rule, RuleSyntax, Confidence}; use crate::util::intern; use crate::{ blob::BlobId, @@ -54,6 +55,7 @@ mod tests { path::PathBuf, sync::{Arc, Mutex}, }; + use smallvec::smallvec; use url::Url; fn create_default_args() -> cli::commands::scan::ScanArgs { use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope @@ -132,12 +134,21 @@ mod tests { } } - fn create_mock_match( - rule_name: &str, - rule_text_id: &str, - rule_finding_fingerprint: &str, - validation_success: bool, - ) -> Match { + fn create_mock_match(rule_name: &str, rule_text_id: &str, validation_success: bool) -> Match { + let syntax = RuleSyntax { + name: rule_name.to_string(), + id: rule_text_id.to_string(), + pattern: "dummy".to_string(), + min_entropy: 0.0, + confidence: Confidence::Medium, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None, + depends_on_rule: vec![], + }; + let rule = Arc::new(Rule::new(syntax)); Match { location: Location { offset_span: OffsetSpan { start: 10, end: 20 }, @@ -147,20 +158,17 @@ mod tests { }, }, groups: SerializableCaptures { - captures: vec![SerializableCapture { + captures: smallvec![SerializableCapture { name: Some("token".to_string()), match_number: 1, start: 10, end: 20, - value: "mock_token".into(), + value: intern("mock_token"), }], }, blob_id: BlobId::new(b"mock_blob"), finding_fingerprint: 0123, - rule_finding_fingerprint: intern(rule_finding_fingerprint), - rule_text_id: intern(rule_text_id), - rule_name: intern(rule_name), - rule_confidence: Confidence::Medium, + rule, validation_response_body: "validation response".to_string(), validation_response_status: 200, validation_success, @@ -204,8 +212,7 @@ mod tests { #[test] fn test_json_format() -> Result<()> { - let mock_match = - create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); + let mock_match = create_mock_match("MockRule", "mock_rule_1", true); let matches = vec![ReportMatch { origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]), blob_metadata: BlobMetadata { @@ -238,12 +245,8 @@ mod tests { fn test_validation_status_in_json() -> Result<()> { let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")]; for (validation_success, expected_status) in test_cases { - let mock_match = create_mock_match( - "MockRule", - "mock_rule_1", - "mock_finding_fingerprint", - validation_success, - ); + let mock_match = + create_mock_match("MockRule", "mock_rule_1", validation_success); let matches = vec![ReportMatch { origin: OriginSet::new( Origin::from_file(PathBuf::from("/mock/path/file.rs")), diff --git a/src/scanner/docker.rs b/src/scanner/docker.rs index b427a3b..405a553 100644 --- a/src/scanner/docker.rs +++ b/src/scanner/docker.rs @@ -175,9 +175,11 @@ impl Docker { pb.set_length(layer_paths.len() as u64); for p in layer_paths { - let mut data = Vec::new(); - File::open(&p)?.read_to_end(&mut data)?; - let digest = format!("{:x}", Sha256::digest(&data)); + let mut file = File::open(&p)?; + let mut hasher = Sha256::new(); + std::io::copy(&mut file, &mut hasher)?; + let digest = format!("{:x}", hasher.finalize()); + let new_path = out_dir.join(format!("layer_{digest}.tar")); std::fs::rename(&p, &new_path)?; // extract layer contents so inner filenames appear in scan results @@ -280,4 +282,4 @@ mod tests { fn docker_struct_new() { let _ = Docker::new(); } -} +} \ No newline at end of file diff --git a/src/scanner/validation.rs b/src/scanner/validation.rs index 05865b2..f01872a 100644 --- a/src/scanner/validation.rs +++ b/src/scanner/validation.rs @@ -13,7 +13,7 @@ use futures::{stream, StreamExt}; use indicatif::{ProgressBar, ProgressStyle}; use liquid::Parser; use reqwest::{Client, StatusCode}; -use rustc_hash::{FxHashMap, FxHashSet}; +use rustc_hash::FxHashMap; use tokio::{sync::Notify, time::timeout}; use crate::{ @@ -21,7 +21,6 @@ use crate::{ findings_store::{FindingsStore, FindingsStoreMessage}, location::OffsetSpan, matcher::{Match, OwnedBlobMatch}, - rules::rule, validation::{collect_variables_and_dependencies, validate_single_match, CachedResponse}, }; @@ -40,7 +39,7 @@ pub async fn run_secret_validation( let fail_count = Arc::new(AtomicUsize::new(0)); // ── 2. Fetch rules + matches ──────────────────────────────────────────── - let (all_rules, all_matches_by_blob) = { + let (_all_rules, all_matches_by_blob) = { let ds = datastore.lock().unwrap(); let rules = ds.get_rules()?; let mut map: FxHashMap>> = FxHashMap::default(); @@ -51,16 +50,13 @@ pub async fn run_secret_validation( }; // ── 3. Partition blobs ────────────────────────────────────────────────── - let rules_with_deps: FxHashSet<&str> = all_rules - .iter() - .filter(|r| !r.syntax().depends_on_rule.is_empty()) - .map(|r| r.id()) - .collect(); - let mut simple_matches = Vec::new(); let mut dependent_blobs = FxHashMap::default(); // blob_id -- Vec> for (blob_id, matches) in all_matches_by_blob { - if matches.iter().any(|m| rules_with_deps.contains(m.2.rule_text_id)) { + if matches + .iter() + .any(|m| !m.2.rule.syntax().depends_on_rule.is_empty()) + { dependent_blobs.insert(blob_id, matches); } else { simple_matches.extend(matches); @@ -80,9 +76,9 @@ pub async fn run_secret_validation( .captures .get(1) .or_else(|| arc_msg.2.groups.captures.get(0)) - .map_or("", |c| c.value.as_ref()); + .map_or("", |c| c.value); groups - .entry(format!("{}|{}", arc_msg.2.rule_text_id, secret)) + .entry(format!("{}|{}", arc_msg.2.rule.id(), secret)) .or_default() .push(arc_msg); } @@ -109,7 +105,6 @@ pub async fn run_secret_validation( let client = client.clone(); let cache_glob = cache.clone(); let val_res = &validation_results; - let rules = &all_rules; let success = success_count.clone(); let fail = fail_count.clone(); // *** FIX: Clone the progress bar for each concurrent task *** @@ -122,8 +117,8 @@ pub async fn run_secret_validation( .captures .get(1) .or_else(|| rep_arc.2.groups.captures.get(0)) - .map_or("", |c| c.value.as_ref()); - let key = format!("{}|{}", rep_arc.2.rule_text_id, secret); + .map_or("", |c| c.value); + let key = format!("{}|{}", rep_arc.2.rule.id(), secret); match val_res.entry(key.clone()) { dashmap::mapref::entry::Entry::Occupied(_) => return, @@ -138,8 +133,8 @@ pub async fn run_secret_validation( } } - let rule = find_rule_for_match(rules, rep_arc.2.rule_text_id).unwrap(); - let mut om = OwnedBlobMatch::convert_match_to_owned_blobmatch(&rep_arc.2, rule); + let mut om = + OwnedBlobMatch::convert_match_to_owned_blobmatch(&rep_arc.2, rep_arc.2.rule.clone()); validate_single( &mut om, @@ -211,7 +206,6 @@ pub async fn run_secret_validation( let val_cache = Arc::new(DashMap::::new()); let in_flight = Arc::new(DashMap::::new()); - let rules_ref = Arc::new(all_rules.clone()); for chunk in blob_ids.chunks(chunk_size) { let tasks: Vec<_> = chunk @@ -225,15 +219,15 @@ pub async fn run_secret_validation( let success = success_count.clone(); let fail = fail_count.clone(); let cache_glob = cache.clone(); - let rules = rules_ref.clone(); async move { let owned = matches_for_blob .iter() .map(|arc_msg| { - let rule = find_rule_for_match(&rules, arc_msg.2.rule_text_id) - .expect("rule"); - OwnedBlobMatch::convert_match_to_owned_blobmatch(&arc_msg.2, rule) + OwnedBlobMatch::convert_match_to_owned_blobmatch( + &arc_msg.2, + arc_msg.2.rule.clone(), + ) }) .collect::>(); @@ -338,21 +332,6 @@ pub async fn run_secret_validation( Ok(()) } -/// Returns `Some(Arc)` if a matching rule is found; otherwise returns `None`. -/// Callers can decide how to handle the `None` case (e.g., skip processing). -fn find_rule_for_match( - all_rules: &[Arc], - rule_text_id: &str, -) -> Option> { - match all_rules.iter().find(|r| r.syntax().id == rule_text_id).cloned() { - Some(rule) => Some(rule), - None => { - eprintln!("Warning: no rule found with id '{}'. Skipping.", rule_text_id); - None - } - } -} - // --------------------------------------------------- // The core validation logic, used in an async pipeline // --------------------------------------------------- diff --git a/src/validation.rs b/src/validation.rs index 3548d33..a645ce7 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -910,6 +910,7 @@ mod tests { use crossbeam_skiplist::SkipMap; use http::StatusCode; use rustc_hash::FxHashMap; + use smallvec::smallvec; use crate::{ blob::BlobId, @@ -920,6 +921,7 @@ mod tests { rule::{Confidence, Rule}, Rules, }, + util::intern, validation::{validate_single_match, Cache}, }; #[tokio::test] @@ -1016,12 +1018,12 @@ rules: // matching_input: token.as_bytes().to_vec(), matching_input_offset_span: OffsetSpan { start: 0, end: token.len() }, captures: SerializableCaptures { - captures: vec![SerializableCapture { + captures: smallvec![SerializableCapture { name: Some("TOKEN".to_string()), match_number: -1, start: 0, end: token.len(), - value: token.into(), + value: intern(token), }], }, validation_response_body: String::new(), diff --git a/src/validation/utils.rs b/src/validation/utils.rs index ee118c8..cb2068d 100644 --- a/src/validation/utils.rs +++ b/src/validation/utils.rs @@ -22,7 +22,7 @@ pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String, .map(|cap| { let name = cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string()); - (name, cap.value.clone().into_owned(), cap.start, cap.end) + (name, cap.value.to_string(), cap.start, cap.end) }) .collect() } @@ -75,17 +75,18 @@ pub async fn check_url_resolvable(url: &Url) -> Result<(), Box Match { + let syntax = RuleSyntax { + name: "Example Rule".to_string(), + id: "RULE.1".to_string(), + pattern: "dummy".to_string(), + min_entropy: 0.0, + confidence: Confidence::Medium, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None, + depends_on_rule: vec![], + }; + let rule = Arc::new(Rule::new(syntax)); Match { location: Location { offset_span: OffsetSpan { start: 0, end: 10 }, @@ -28,20 +44,17 @@ fn make_match(fp: u64) -> Match { }, }, groups: SerializableCaptures { - captures: vec![SerializableCapture { + captures: smallvec![SerializableCapture { name: None, match_number: 0, start: 0, end: 10, - value: "dummy".into(), + value: intern("dummy"), }], }, blob_id: BlobId::new(b"dummy"), finding_fingerprint: fp, - rule_finding_fingerprint: "structural.1".into(), - rule_text_id: "RULE.1".into(), - rule_name: "Example Rule".into(), - rule_confidence: Confidence::Medium, + rule, validation_response_body: String::new(), validation_response_status: 0, validation_success: false,