Optimized memory usage via string interning and extensive data sharing

This commit is contained in:
Mick Grove 2025-09-03 09:52:49 -07:00
commit bf60c5584b
8 changed files with 59 additions and 58 deletions

View file

@ -65,7 +65,7 @@ smallvec = { version = "1", features = [
tracing = "0.1.41"
indicatif = { version = "0.17", features = ["improved_unicode"] }
rayon = "1.10"
sha1 = "0.10.6"
sha1 = { version = "0.10.6", features = ["asm"] }
hex = "0.4.3"
vectorscan-rs = "0.0.5"
regex = "1.11.1"
@ -151,6 +151,7 @@ tar = "0.4.44"
xz2 = "0.1.7"
asar = "0.3.0"
blake3 = "1.8.2"
memchr = "2.7"
memmap2 = "0.9.7"
futures = "0.3.31"
dashmap = "6.1.0"

View file

@ -10,16 +10,20 @@ use anyhow::Result;
use bstr::{BString, ByteSlice};
use gix::ObjectId;
use hex;
use once_cell::sync::OnceCell;
use parking_lot::Mutex;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use sha1::{Digest, Sha1};
use smallvec::SmallVec;
use std::sync::atomic::{AtomicU64, Ordering};
use crate::git_commit_metadata::CommitMetadata;
// const LARGE_FILE_THRESHOLD: u64 = 512 * 1024; // 512 KB
const LARGE_FILE_THRESHOLD: u64 = 0; // always mmap
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
/// The data of a blob, either owned (small files) or memory mapped (large files).
pub enum BlobData<'a> {
/// Small blobs remain as-is.
@ -75,47 +79,68 @@ pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
/// A Git blob, storing its SHA-1 id and its contents.
pub struct Blob<'a> {
pub id: BlobId,
pub data: BlobData<'a>,
id: OnceCell<BlobId>,
data: BlobData<'a>,
temp_id: u64,
}
impl Blob<'_> {
#[inline]
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
let mut file = File::open(&path)?;
let file_size = file.metadata()?.len();
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
if file_size > LARGE_FILE_THRESHOLD {
// Large files: one mmap, zero extra copies.
let mmap = unsafe { memmap2::Mmap::map(&file)? };
let id = BlobId::new(mmap.as_ref());
Ok(Blob { id, data: BlobData::Mapped(mmap) })
Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id })
} else {
// Small files: reuse the same handle and pre-allocate exact capacity
let mut bytes = Vec::with_capacity(file_size as usize);
file.read_to_end(&mut bytes)?;
let id = BlobId::new(&bytes);
Ok(Blob { id, data: BlobData::Owned(bytes) })
Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id })
}
}
/// Returns the blob's bytes as a slice.
///
/// The returned slice is obtained from the blob's `data` field via `as_ref()`
/// and borrows from `self`.
#[inline]
pub fn bytes(&self) -> &[u8] {
self.data.as_ref()
}
/// Lazily compute and return the blob's SHA-1 `BlobId` by value.
///
/// The id lives in a `OnceCell`: if the cell is still empty it is initialized
/// with `BlobId::new(self.bytes())`; otherwise the stored value is reused.
/// Use `id_ref` when a borrowed id is needed instead of a copy.
#[inline]
pub fn id(&self) -> BlobId {
*self.id.get_or_init(|| BlobId::new(self.bytes()))
}
/// Get a reference to the blob's SHA-1 `BlobId`, computing it if necessary.
///
/// Uses the same `OnceCell` as `id`: whichever accessor runs first fills the
/// cell with `BlobId::new(self.bytes())`, and later calls reuse that value.
/// The returned reference borrows from `self`.
#[inline]
pub fn id_ref(&self) -> &BlobId {
self.id.get_or_init(|| BlobId::new(self.bytes()))
}
/// Return the temporary identifier assigned on blob creation.
///
/// The constructors in this `impl` take the value from the global `NEXT_ID`
/// counter with `fetch_add(1, Ordering::Relaxed)`. This accessor only returns
/// the stored field; it does not read the blob's bytes or initialize `id`.
#[inline]
pub fn temp_id(&self) -> u64 {
self.temp_id
}
/// Create a new `Blob` from a vector of bytes.
#[inline]
pub fn from_bytes(bytes: Vec<u8>) -> Self {
let id = BlobId::compute_from_bytes(&bytes);
Blob { id, data: BlobData::Owned(bytes) }
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }
}
/// Create a new `Blob` with the given id and data.
#[inline]
pub fn new(id: BlobId, bytes: Vec<u8>) -> Self {
Blob { id, data: BlobData::Owned(bytes) }
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
let cell = OnceCell::new();
let _ = cell.set(id);
Blob { id: cell, data: BlobData::Owned(bytes), temp_id }
}
#[inline]
@ -360,9 +385,6 @@ pub struct BlobMetadata {
/// The guessed multimedia type of the blob
pub mime_essence: Option<String>,
/// The guessed charset of the blob
pub charset: Option<String>,
/// The guessed programming language of the blob
pub language: Option<String>,
}
@ -385,8 +407,4 @@ impl BlobMetadata {
self.mime_essence.as_deref()
}
#[inline]
pub fn charset(&self) -> Option<&str> {
self.charset.as_deref()
}
}

View file

@ -94,8 +94,7 @@ pub struct LocationMapping {
impl LocationMapping {
/// Scan once for all `\n` positions.
pub fn new(input: &[u8]) -> Self {
let newline_offsets =
input.iter().enumerate().filter_map(|(i, &b)| (b == b'\n').then_some(i)).collect();
let newline_offsets = memchr::memchr_iter(b'\n', input).collect();
LocationMapping { newline_offsets }
}

View file

@ -288,17 +288,6 @@ impl<'a> Matcher<'a> {
self.local_stats.blobs_scanned += 1;
self.local_stats.bytes_scanned += blob.bytes().len() as u64;
// Check if blob was already seen and respect no_dedup flag
if !no_dedup {
if let Some(had_matches) = self.seen_blobs.get(&blob.id) {
return Ok(if had_matches {
ScanResult::SeenWithMatches
} else {
ScanResult::SeenSansMatches
});
}
}
// Extract filename from origin
let filename = origin
.first()
@ -316,16 +305,7 @@ impl<'a> Matcher<'a> {
let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) };
if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() {
// Only record in seen_blobs if deduplication is enabled
if !no_dedup {
return Ok(match self.seen_blobs.insert(blob.id, false) {
None => ScanResult::New(Vec::new()),
Some(true) => ScanResult::SeenWithMatches,
Some(false) => ScanResult::SeenSansMatches,
});
} else {
return Ok(ScanResult::New(Vec::new()));
}
return Ok(ScanResult::New(Vec::new()));
}
let rules_db = self.rules_db;
@ -472,9 +452,15 @@ impl<'a> Matcher<'a> {
}
}
// Finalize
// Only record in seen_blobs if deduplication is enabled
if !no_dedup {
self.seen_blobs.insert(blob.id, !matches.is_empty());
if !no_dedup && !matches.is_empty() {
let blob_id = blob.id();
if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
return Ok(if had_matches {
ScanResult::SeenWithMatches
} else {
ScanResult::SeenSansMatches
});
}
}
// --- opportunistic capacity cap ---------------------------------
@ -571,7 +557,7 @@ fn filter_match<'b>(
SerializableCaptures::from_captures(&captures, byte_slice.as_ref(), re, redact);
matches.push(BlobMatch {
rule: Arc::clone(&rule),
blob_id: &blob.id,
blob_id: blob.id_ref(),
matching_input: only_matching_input,
matching_input_offset_span,
captures: groups,

View file

@ -185,7 +185,6 @@ mod tests {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
};
let dedup = true;
@ -219,7 +218,6 @@ mod tests {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
},
m: mock_match,
@ -256,7 +254,6 @@ mod tests {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
},
m: mock_match,

View file

@ -172,10 +172,15 @@ pub fn enumerate_filesystem_inputs(
};
if should_skip {
progress.suspend(|| {
let path = origin
.first()
.blob_path()
.map(|p| p.display().to_string())
.unwrap_or_else(|| blob.temp_id().to_string());
if is_archive {
debug!("Skipping archive: {}", blob.id);
debug!("Skipping archive: {path}");
} else {
debug!("Skipping binary blob: {}", blob.id);
debug!("Skipping binary blob: {path}");
}
});
return Ok(());

View file

@ -27,8 +27,7 @@ impl<'a> BlobProcessor<'a> {
redact: bool,
no_base64: bool,
) -> Result<Option<DatastoreMessage>> {
let blob_id = blob.id.hex();
let _span = debug_span!("matcher", blob_id).entered();
let _span = debug_span!("matcher", temp_id = blob.temp_id()).entered();
let t1 = Instant::now();
let res = self.matcher.scan_blob(&blob, &origin, None, redact, no_dedup, no_base64)?;
let scan_us = t1.elapsed().as_micros();
@ -43,10 +42,9 @@ impl<'a> BlobProcessor<'a> {
ScanResult::SeenWithMatches => {
trace!("({scan_us}us) blob already scanned with matches");
let metadata = BlobMetadata {
id: blob.id,
id: blob.id(),
num_bytes: blob.len(),
mime_essence: None,
charset: None,
language: None,
};
Ok(Some((origin, metadata, Vec::new())))
@ -63,10 +61,9 @@ impl<'a> BlobProcessor<'a> {
}
let md = MetadataResult::from_blob_and_origin(&blob, &origin);
let metadata = BlobMetadata {
id: blob.id,
id: blob.id(),
num_bytes: blob.len(),
mime_essence: md.mime_essence,
charset: md.charset,
language: md.language,
};

View file

@ -119,7 +119,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
id: BlobId::new(b"dummy"),
num_bytes: 10,
mime_essence: None,
charset: None,
language: None,
},
m: m1,
@ -136,7 +135,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
id: BlobId::new(b"dummy"),
num_bytes: 10,
mime_essence: None,
charset: None,
language: None,
},
m: m2,