Optimized memory usage via string interning and extensive data sharing

This commit is contained in:
Mick Grove 2025-09-02 19:54:44 -07:00
commit c3513ea206
12 changed files with 114 additions and 125 deletions

View file

@ -3,7 +3,8 @@
All notable changes to this project will be documented in this file.
## [Unreleased]
- Improved error message when self-update cannot find the current binary.
- Improved error message when self-update cannot find the current binary
- Optimized memory usage via string interning and extensive data sharing
## [1.47.0]
- MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them

View file

@ -96,9 +96,8 @@ impl ContentInspector {
#[inline]
#[must_use]
pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
String::from_utf8(bytes.to_vec()).ok().map(|_| "UTF-8".to_string())
std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
}
/// Guess programming language with broad coverage using `tokei`.
///
/// Strategy (no disk I/O):

View file

@ -162,10 +162,10 @@ fn handle_zip_archive_streaming(
}
fn handle_asar_archive_in_memory(
buffer: Vec<u8>,
buffer: &[u8],
archive_path: &Path,
) -> Result<CompressedContent> {
match AsarReader::new(&buffer, None) {
match AsarReader::new(buffer, None) {
Ok(reader) => {
let mut contents = Vec::new();
for (path_in_asar, file) in reader.files() {
@ -200,7 +200,7 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result<CompressedCon
match ext {
"asar" => {
let mmap = unsafe { Mmap::map(&file)? };
return handle_asar_archive_in_memory(mmap.to_vec(), path);
return handle_asar_archive_in_memory(&mmap, path);
}
"tar" => {
if let Some(base) = base_dir {
@ -525,4 +525,4 @@ mod tests {
Ok(())
}
}
}

View file

@ -18,6 +18,7 @@ use crate::{
matcher::Match,
origin::{Origin, OriginSet},
rules::rule::Rule,
util::intern,
};
// share with Arc so every blob/origin is materialised once
@ -151,7 +152,7 @@ impl FindingsStore {
.captures
.get(1)
.or_else(|| m.groups.captures.get(0))
.map_or("", |c| c.value.as_ref());
.map_or("", |c| c.value);
let origin_kind = match origin.first() {
Origin::GitRepo(_) => "git",
@ -160,7 +161,7 @@ impl FindingsStore {
};
let key = xxh3_64(
format!("{}|{}|{}", m.rule_text_id.to_uppercase(), origin_kind, snippet)
format!("{}|{}|{}", m.rule.id().to_uppercase(), origin_kind, snippet)
.as_bytes(),
);
@ -280,7 +281,7 @@ impl FindingsStore {
pub fn get_summary(&self) -> FxHashMap<&'static str, usize> {
self.matches.iter().fold(FxHashMap::default(), |mut acc, msg| {
let (_, _, m) = &**msg;
*acc.entry(m.rule_name).or_insert(0) += 1; // borrow, no alloc
*acc.entry(intern(m.rule.name())).or_insert(0) += 1;
acc
})
}
@ -342,13 +343,13 @@ impl FindingsStore {
self.matches.iter().map(|msg| {
let (_, _, match_item) = &**msg;
finding_data::FindingMetadata {
rule_name: match_item.rule_name.to_string(),
rule_name: match_item.rule.name().to_string(),
num_matches: 1,
comment: None,
visible: match_item.visible,
finding_id: match_item.finding_id(),
rule_finding_fingerprint: match_item.rule_finding_fingerprint.to_string(),
rule_text_id: match_item.rule_text_id.to_string(),
rule_finding_fingerprint: match_item.rule.finding_sha1_fingerprint().to_string(),
rule_text_id: match_item.rule.id().to_string(),
}
})
}
@ -362,7 +363,7 @@ impl FindingsStore {
.iter()
.filter(|msg| {
let (_, _, match_item) = &***msg;
match_item.rule_name == metadata.rule_name
match_item.rule.name() == metadata.rule_name
})
.map(|msg| {
let (origin, blob_metadata, match_item) = &**msg;
@ -373,7 +374,7 @@ impl FindingsStore {
match_id: MatchIdInt::from_str(&match_item.finding_id())?,
match_comment: None,
visible: match_item.visible,
match_confidence: match_item.rule_confidence,
match_confidence: match_item.rule.confidence(),
validation_response_body: match_item.validation_response_body.clone(),
validation_response_status: match_item.validation_response_status,
validation_success: match_item.validation_success,

View file

@ -682,20 +682,20 @@ impl JsonSchema for Groups {
// pub end: usize, // End position of the match
// pub value: String, // The actual captured value
// }
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCapture {
pub name: Option<String>,
pub match_number: i32,
pub start: usize,
pub end: usize,
// Instead of storing an owned String, store a borrowed or interned value.
// Here we use Cow to allow either borrowing or owning as needed.
pub value: std::borrow::Cow<'static, str>,
/// Interned value of the capture.
pub value: &'static str,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCaptures {
pub captures: Vec<SerializableCapture>, // All captures (named and unnamed)
#[schemars(with = "Vec<SerializableCapture>")]
pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
}
impl SerializableCaptures {
pub fn from_captures(
@ -704,7 +704,7 @@ impl SerializableCaptures {
re: &Regex,
redact: bool,
) -> Self {
let mut serialized_captures = Vec::new();
let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
// Process named captures
for name in re.capture_names().flatten() {
if let Some(capture) = captures.name(name) {
@ -718,7 +718,7 @@ impl SerializableCaptures {
match_number: -1,
start: capture.start(),
end: capture.end(),
value: value.into(),
value: intern(&value),
});
}
}
@ -735,7 +735,7 @@ impl SerializableCaptures {
match_number: i32::try_from(i).unwrap_or(0),
start: capture.start(),
end: capture.end(),
value: value.into(),
value: intern(&value),
});
}
}
@ -764,16 +764,9 @@ pub struct Match {
pub finding_fingerprint: u64,
/// The rule that produced this match
pub rule_finding_fingerprint: &'static str,
/// The text identifier of the rule that produced this match
pub rule_text_id: &'static str,
/// The name of the rule that produced this match
pub rule_name: &'static str,
/// The confidence property of the rule that produced this match
pub rule_confidence: crate::rules::rule::Confidence,
#[serde(skip_serializing)]
#[schemars(skip)]
pub rule: Arc<Rule>,
/// Validation Body
pub validation_response_body: String,
@ -813,8 +806,6 @@ impl Match {
let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
let source_span = loc_mapping.get_source_span(&offset_span);
let rule_finding_fingerprint = owned_blob_match.rule.finding_sha1_fingerprint().to_owned();
let offset_start: u64 =
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
@ -828,10 +819,7 @@ impl Match {
// matching_snippet
Match {
rule_finding_fingerprint: intern(&rule_finding_fingerprint),
rule_name: intern(owned_blob_match.rule.name()),
rule_confidence: owned_blob_match.rule.confidence(),
rule_text_id: intern(owned_blob_match.rule.id()),
rule: owned_blob_match.rule.clone(),
visible: owned_blob_match.rule.visible().to_owned(),
location: Location { offset_span, source_span: source_span.clone() },
groups: owned_blob_match.captures.clone(),
@ -852,7 +840,7 @@ impl Match {
pub fn finding_id(&self) -> String {
let mut h = Sha1::new();
write!(&mut h, "{}\0", self.rule_finding_fingerprint)
write!(&mut h, "{}\0", self.rule.finding_sha1_fingerprint())
.expect("should be able to write to memory");
serde_json::to_writer(&mut h, &self.groups)
.expect("should be able to serialize groups as JSON");

View file

@ -216,7 +216,7 @@ impl DetailsReporter {
m: match_item.clone(),
comment: None,
visible: match_item.visible,
match_confidence: match_item.rule_confidence,
match_confidence: match_item.rule.confidence(),
validation_response_body: match_item.validation_response_body.clone(),
validation_response_status: match_item.validation_response_status,
validation_success: match_item.validation_success,
@ -366,13 +366,13 @@ impl DetailsReporter {
FindingReporterRecord {
rule: RuleMetadata {
name: rm.m.rule_name.to_string(),
id: rm.m.rule_text_id.to_string(),
name: rm.m.rule.name().to_string(),
id: rm.m.rule.id().to_string(),
},
finding: FindingRecordData {
snippet,
fingerprint: rm.m.finding_fingerprint.to_string(),
confidence: rm.match_confidence.to_string(),
confidence: rm.m.rule.confidence().to_string(),
entropy: format!("{:.2}", rm.m.calculated_entropy),
validation: ValidationInfo { status: validation_status, response: response_body },
language: rm

View file

@ -35,6 +35,7 @@ mod tests {
use crate::cli::commands::github::GitHistoryMode;
use crate::cli::commands::rules::RuleSpecifierArgs;
use crate::matcher::{SerializableCapture, SerializableCaptures};
use crate::rules::rule::{Rule, RuleSyntax, Confidence};
use crate::util::intern;
use crate::{
blob::BlobId,
@ -54,6 +55,7 @@ mod tests {
path::PathBuf,
sync::{Arc, Mutex},
};
use smallvec::smallvec;
use url::Url;
fn create_default_args() -> cli::commands::scan::ScanArgs {
use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope
@ -132,12 +134,21 @@ mod tests {
}
}
fn create_mock_match(
rule_name: &str,
rule_text_id: &str,
rule_finding_fingerprint: &str,
validation_success: bool,
) -> Match {
fn create_mock_match(rule_name: &str, rule_text_id: &str, validation_success: bool) -> Match {
let syntax = RuleSyntax {
name: rule_name.to_string(),
id: rule_text_id.to_string(),
pattern: "dummy".to_string(),
min_entropy: 0.0,
confidence: Confidence::Medium,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
depends_on_rule: vec![],
};
let rule = Arc::new(Rule::new(syntax));
Match {
location: Location {
offset_span: OffsetSpan { start: 10, end: 20 },
@ -147,20 +158,17 @@ mod tests {
},
},
groups: SerializableCaptures {
captures: vec![SerializableCapture {
captures: smallvec![SerializableCapture {
name: Some("token".to_string()),
match_number: 1,
start: 10,
end: 20,
value: "mock_token".into(),
value: intern("mock_token"),
}],
},
blob_id: BlobId::new(b"mock_blob"),
finding_fingerprint: 0123,
rule_finding_fingerprint: intern(rule_finding_fingerprint),
rule_text_id: intern(rule_text_id),
rule_name: intern(rule_name),
rule_confidence: Confidence::Medium,
rule,
validation_response_body: "validation response".to_string(),
validation_response_status: 200,
validation_success,
@ -204,8 +212,7 @@ mod tests {
#[test]
fn test_json_format() -> Result<()> {
let mock_match =
create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
let mock_match = create_mock_match("MockRule", "mock_rule_1", true);
let matches = vec![ReportMatch {
origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]),
blob_metadata: BlobMetadata {
@ -238,12 +245,8 @@ mod tests {
fn test_validation_status_in_json() -> Result<()> {
let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")];
for (validation_success, expected_status) in test_cases {
let mock_match = create_mock_match(
"MockRule",
"mock_rule_1",
"mock_finding_fingerprint",
validation_success,
);
let mock_match =
create_mock_match("MockRule", "mock_rule_1", validation_success);
let matches = vec![ReportMatch {
origin: OriginSet::new(
Origin::from_file(PathBuf::from("/mock/path/file.rs")),

View file

@ -175,9 +175,11 @@ impl Docker {
pb.set_length(layer_paths.len() as u64);
for p in layer_paths {
let mut data = Vec::new();
File::open(&p)?.read_to_end(&mut data)?;
let digest = format!("{:x}", Sha256::digest(&data));
let mut file = File::open(&p)?;
let mut hasher = Sha256::new();
std::io::copy(&mut file, &mut hasher)?;
let digest = format!("{:x}", hasher.finalize());
let new_path = out_dir.join(format!("layer_{digest}.tar"));
std::fs::rename(&p, &new_path)?;
// extract layer contents so inner filenames appear in scan results
@ -280,4 +282,4 @@ mod tests {
fn docker_struct_new() {
// Smoke test: only checks that `Docker::new()` can be called; the returned value is
// discarded immediately and nothing about it is asserted.
let _ = Docker::new();
}
}
}

View file

@ -13,7 +13,7 @@ use futures::{stream, StreamExt};
use indicatif::{ProgressBar, ProgressStyle};
use liquid::Parser;
use reqwest::{Client, StatusCode};
use rustc_hash::{FxHashMap, FxHashSet};
use rustc_hash::FxHashMap;
use tokio::{sync::Notify, time::timeout};
use crate::{
@ -21,7 +21,6 @@ use crate::{
findings_store::{FindingsStore, FindingsStoreMessage},
location::OffsetSpan,
matcher::{Match, OwnedBlobMatch},
rules::rule,
validation::{collect_variables_and_dependencies, validate_single_match, CachedResponse},
};
@ -40,7 +39,7 @@ pub async fn run_secret_validation(
let fail_count = Arc::new(AtomicUsize::new(0));
// ── 2. Fetch rules + matches ────────────────────────────────────────────
let (all_rules, all_matches_by_blob) = {
let (_all_rules, all_matches_by_blob) = {
let ds = datastore.lock().unwrap();
let rules = ds.get_rules()?;
let mut map: FxHashMap<BlobId, Vec<Arc<FindingsStoreMessage>>> = FxHashMap::default();
@ -51,16 +50,13 @@ pub async fn run_secret_validation(
};
// ── 3. Partition blobs ──────────────────────────────────────────────────
let rules_with_deps: FxHashSet<&str> = all_rules
.iter()
.filter(|r| !r.syntax().depends_on_rule.is_empty())
.map(|r| r.id())
.collect();
let mut simple_matches = Vec::new();
let mut dependent_blobs = FxHashMap::default(); // blob_id -- Vec<Arc<…>>
for (blob_id, matches) in all_matches_by_blob {
if matches.iter().any(|m| rules_with_deps.contains(m.2.rule_text_id)) {
if matches
.iter()
.any(|m| !m.2.rule.syntax().depends_on_rule.is_empty())
{
dependent_blobs.insert(blob_id, matches);
} else {
simple_matches.extend(matches);
@ -80,9 +76,9 @@ pub async fn run_secret_validation(
.captures
.get(1)
.or_else(|| arc_msg.2.groups.captures.get(0))
.map_or("", |c| c.value.as_ref());
.map_or("", |c| c.value);
groups
.entry(format!("{}|{}", arc_msg.2.rule_text_id, secret))
.entry(format!("{}|{}", arc_msg.2.rule.id(), secret))
.or_default()
.push(arc_msg);
}
@ -109,7 +105,6 @@ pub async fn run_secret_validation(
let client = client.clone();
let cache_glob = cache.clone();
let val_res = &validation_results;
let rules = &all_rules;
let success = success_count.clone();
let fail = fail_count.clone();
// *** FIX: Clone the progress bar for each concurrent task ***
@ -122,8 +117,8 @@ pub async fn run_secret_validation(
.captures
.get(1)
.or_else(|| rep_arc.2.groups.captures.get(0))
.map_or("", |c| c.value.as_ref());
let key = format!("{}|{}", rep_arc.2.rule_text_id, secret);
.map_or("", |c| c.value);
let key = format!("{}|{}", rep_arc.2.rule.id(), secret);
match val_res.entry(key.clone()) {
dashmap::mapref::entry::Entry::Occupied(_) => return,
@ -138,8 +133,8 @@ pub async fn run_secret_validation(
}
}
let rule = find_rule_for_match(rules, rep_arc.2.rule_text_id).unwrap();
let mut om = OwnedBlobMatch::convert_match_to_owned_blobmatch(&rep_arc.2, rule);
let mut om =
OwnedBlobMatch::convert_match_to_owned_blobmatch(&rep_arc.2, rep_arc.2.rule.clone());
validate_single(
&mut om,
@ -211,7 +206,6 @@ pub async fn run_secret_validation(
let val_cache = Arc::new(DashMap::<String, CachedResponse>::new());
let in_flight = Arc::new(DashMap::<String, ()>::new());
let rules_ref = Arc::new(all_rules.clone());
for chunk in blob_ids.chunks(chunk_size) {
let tasks: Vec<_> = chunk
@ -225,15 +219,15 @@ pub async fn run_secret_validation(
let success = success_count.clone();
let fail = fail_count.clone();
let cache_glob = cache.clone();
let rules = rules_ref.clone();
async move {
let owned = matches_for_blob
.iter()
.map(|arc_msg| {
let rule = find_rule_for_match(&rules, arc_msg.2.rule_text_id)
.expect("rule");
OwnedBlobMatch::convert_match_to_owned_blobmatch(&arc_msg.2, rule)
OwnedBlobMatch::convert_match_to_owned_blobmatch(
&arc_msg.2,
arc_msg.2.rule.clone(),
)
})
.collect::<Vec<_>>();
@ -338,21 +332,6 @@ pub async fn run_secret_validation(
Ok(())
}
/// Looks up the first rule in `all_rules` whose syntax `id` equals `rule_text_id`.
///
/// On a hit, a clone of the matching `Arc` is returned. On a miss, a warning is written to
/// stderr and `None` is returned, so the caller decides how to proceed (e.g., skip the match).
fn find_rule_for_match(
    all_rules: &[Arc<rule::Rule>],
    rule_text_id: &str,
) -> Option<Arc<rule::Rule>> {
    let found = all_rules.iter().find(|candidate| candidate.syntax().id == rule_text_id);
    if found.is_none() {
        eprintln!("Warning: no rule found with id '{}'. Skipping.", rule_text_id);
    }
    // Cloning only bumps the `Arc` reference count; `None` passes through unchanged.
    found.cloned()
}
// ---------------------------------------------------
// The core validation logic, used in an async pipeline
// ---------------------------------------------------

View file

@ -910,6 +910,7 @@ mod tests {
use crossbeam_skiplist::SkipMap;
use http::StatusCode;
use rustc_hash::FxHashMap;
use smallvec::smallvec;
use crate::{
blob::BlobId,
@ -920,6 +921,7 @@ mod tests {
rule::{Confidence, Rule},
Rules,
},
util::intern,
validation::{validate_single_match, Cache},
};
#[tokio::test]
@ -1016,12 +1018,12 @@ rules:
// matching_input: token.as_bytes().to_vec(),
matching_input_offset_span: OffsetSpan { start: 0, end: token.len() },
captures: SerializableCaptures {
captures: vec![SerializableCapture {
captures: smallvec![SerializableCapture {
name: Some("TOKEN".to_string()),
match_number: -1,
start: 0,
end: token.len(),
value: token.into(),
value: intern(token),
}],
},
validation_response_body: String::new(),

View file

@ -22,7 +22,7 @@ pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String,
.map(|cap| {
let name =
cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string());
(name, cap.value.clone().into_owned(), cap.start, cap.end)
(name, cap.value.to_string(), cap.start, cap.end)
})
.collect()
}
@ -75,17 +75,18 @@ pub async fn check_url_resolvable(url: &Url) -> Result<(), Box<dyn std::error::E
mod tests {
use super::*;
use crate::matcher::{SerializableCapture, SerializableCaptures};
use smallvec::smallvec;
use pretty_assertions::assert_eq;
#[test]
fn single_unnamed_capture_is_returned() {
let captures = SerializableCaptures {
captures: vec![SerializableCapture {
captures: smallvec![SerializableCapture {
name: None,
match_number: 0,
start: 1,
end: 4,
value: "abc".into(),
value: "abc",
}],
};
let result = process_captures(&captures);
@ -95,20 +96,20 @@ mod tests {
#[test]
fn skips_whole_match_when_multiple() {
let captures = SerializableCaptures {
captures: vec![
captures: smallvec![
SerializableCapture {
name: None,
match_number: 0,
start: 0,
end: 5,
value: "abcde".into(),
value: "abcde",
},
SerializableCapture {
name: Some("foo".to_string()),
match_number: -1,
start: 1,
end: 4,
value: "bcd".into(),
value: "bcd",
},
],
};
@ -119,27 +120,27 @@ mod tests {
#[test]
fn includes_unnamed_groups_but_skips_whole_match() {
let captures = SerializableCaptures {
captures: vec![
captures: smallvec![
SerializableCapture {
name: None,
match_number: 0,
start: 0,
end: 6,
value: "aabbcc".into(),
value: "aabbcc",
},
SerializableCapture {
name: Some("foo".to_string()),
match_number: -1,
start: 0,
end: 2,
value: "aa".into(),
value: "aa",
},
SerializableCapture {
name: None,
match_number: 1,
start: 4,
end: 6,
value: "cc".into(),
value: "cc",
},
],
};

View file

@ -6,6 +6,7 @@ use std::{
use anyhow::Result;
use gix::{date, ObjectId};
use smallvec::smallvec;
use kingfisher::{
blob::{BlobId, BlobMetadata},
findings_store::FindingsStore,
@ -14,11 +15,26 @@ use kingfisher::{
matcher::{Match, SerializableCapture, SerializableCaptures},
origin::{Origin, OriginSet},
reporter::{styles::Styles, DetailsReporter, ReportMatch},
rules::rule::Confidence,
rules::rule::{Confidence, Rule, RuleSyntax},
util::intern,
};
// ---- helpers -------------------------------------------------------------------------------
fn make_match(fp: u64) -> Match {
let syntax = RuleSyntax {
name: "Example Rule".to_string(),
id: "RULE.1".to_string(),
pattern: "dummy".to_string(),
min_entropy: 0.0,
confidence: Confidence::Medium,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
depends_on_rule: vec![],
};
let rule = Arc::new(Rule::new(syntax));
Match {
location: Location {
offset_span: OffsetSpan { start: 0, end: 10 },
@ -28,20 +44,17 @@ fn make_match(fp: u64) -> Match {
},
},
groups: SerializableCaptures {
captures: vec![SerializableCapture {
captures: smallvec![SerializableCapture {
name: None,
match_number: 0,
start: 0,
end: 10,
value: "dummy".into(),
value: intern("dummy"),
}],
},
blob_id: BlobId::new(b"dummy"),
finding_fingerprint: fp,
rule_finding_fingerprint: "structural.1".into(),
rule_text_id: "RULE.1".into(),
rule_name: "Example Rule".into(),
rule_confidence: Confidence::Medium,
rule,
validation_response_body: String::new(),
validation_response_status: 0,
validation_success: false,