forked from mirrors/kingfisher
Optimized memory usage via string interning and extensive data sharing
This commit is contained in:
parent
ac34f35f61
commit
bf60c5584b
8 changed files with 59 additions and 58 deletions
|
|
@ -65,7 +65,7 @@ smallvec = { version = "1", features = [
|
|||
tracing = "0.1.41"
|
||||
indicatif = { version = "0.17", features = ["improved_unicode"] }
|
||||
rayon = "1.10"
|
||||
sha1 = "0.10.6"
|
||||
sha1 = { version = "0.10.6", features = ["asm"] }
|
||||
hex = "0.4.3"
|
||||
vectorscan-rs = "0.0.5"
|
||||
regex = "1.11.1"
|
||||
|
|
@ -151,6 +151,7 @@ tar = "0.4.44"
|
|||
xz2 = "0.1.7"
|
||||
asar = "0.3.0"
|
||||
blake3 = "1.8.2"
|
||||
memchr = "2.7"
|
||||
memmap2 = "0.9.7"
|
||||
futures = "0.3.31"
|
||||
dashmap = "6.1.0"
|
||||
|
|
|
|||
52
src/blob.rs
52
src/blob.rs
|
|
@ -10,16 +10,20 @@ use anyhow::Result;
|
|||
use bstr::{BString, ByteSlice};
|
||||
use gix::ObjectId;
|
||||
use hex;
|
||||
use once_cell::sync::OnceCell;
|
||||
use parking_lot::Mutex;
|
||||
use rustc_hash::FxHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sha1::{Digest, Sha1};
|
||||
use smallvec::SmallVec;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use crate::git_commit_metadata::CommitMetadata;
|
||||
// const LARGE_FILE_THRESHOLD: u64 = 512 * 1024; // 512 KB
|
||||
const LARGE_FILE_THRESHOLD: u64 = 0; // always mmap
|
||||
|
||||
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
|
||||
|
||||
/// The data of a blob, either owned (small files) or memory mapped (large files).
|
||||
pub enum BlobData<'a> {
|
||||
/// Small blobs – remains as-is.
|
||||
|
|
@ -75,47 +79,68 @@ pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
|
|||
/// A Git blob, storing its SHA-1 id and its contents.
|
||||
|
||||
pub struct Blob<'a> {
|
||||
pub id: BlobId,
|
||||
pub data: BlobData<'a>,
|
||||
id: OnceCell<BlobId>,
|
||||
data: BlobData<'a>,
|
||||
temp_id: u64,
|
||||
}
|
||||
|
||||
impl Blob<'_> {
|
||||
#[inline]
|
||||
|
||||
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
let mut file = File::open(&path)?;
|
||||
let file_size = file.metadata()?.len();
|
||||
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
if file_size > LARGE_FILE_THRESHOLD {
|
||||
// Large files: one mmap, zero extra copies.
|
||||
let mmap = unsafe { memmap2::Mmap::map(&file)? };
|
||||
let id = BlobId::new(mmap.as_ref());
|
||||
Ok(Blob { id, data: BlobData::Mapped(mmap) })
|
||||
Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id })
|
||||
} else {
|
||||
// Small files: reuse the same handle and pre-allocate exact capacity
|
||||
let mut bytes = Vec::with_capacity(file_size as usize);
|
||||
file.read_to_end(&mut bytes)?;
|
||||
let id = BlobId::new(&bytes);
|
||||
Ok(Blob { id, data: BlobData::Owned(bytes) })
|
||||
Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id })
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the blob's bytes as a slice.
|
||||
#[inline]
|
||||
pub fn bytes(&self) -> &[u8] {
|
||||
self.data.as_ref()
|
||||
}
|
||||
|
||||
/// Lazily compute and return the blob's SHA-1 `BlobId`.
|
||||
#[inline]
|
||||
pub fn id(&self) -> BlobId {
|
||||
*self.id.get_or_init(|| BlobId::new(self.bytes()))
|
||||
}
|
||||
|
||||
/// Get a reference to the blob's SHA-1 `BlobId`, computing it if necessary.
|
||||
#[inline]
|
||||
pub fn id_ref(&self) -> &BlobId {
|
||||
self.id.get_or_init(|| BlobId::new(self.bytes()))
|
||||
}
|
||||
|
||||
/// Return the temporary identifier assigned on blob creation.
|
||||
#[inline]
|
||||
pub fn temp_id(&self) -> u64 {
|
||||
self.temp_id
|
||||
}
|
||||
|
||||
/// Create a new `Blob` from a vector of bytes.
|
||||
#[inline]
|
||||
pub fn from_bytes(bytes: Vec<u8>) -> Self {
|
||||
let id = BlobId::compute_from_bytes(&bytes);
|
||||
Blob { id, data: BlobData::Owned(bytes) }
|
||||
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
|
||||
Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }
|
||||
}
|
||||
|
||||
/// Create a new `Blob` with the given id and data.
|
||||
#[inline]
|
||||
pub fn new(id: BlobId, bytes: Vec<u8>) -> Self {
|
||||
Blob { id, data: BlobData::Owned(bytes) }
|
||||
let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
|
||||
let cell = OnceCell::new();
|
||||
let _ = cell.set(id);
|
||||
Blob { id: cell, data: BlobData::Owned(bytes), temp_id }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
|
@ -360,9 +385,6 @@ pub struct BlobMetadata {
|
|||
/// The guessed multimedia type of the blob
|
||||
pub mime_essence: Option<String>,
|
||||
|
||||
/// The guessed charset of the blob
|
||||
pub charset: Option<String>,
|
||||
|
||||
/// The guessed programming language of the blob
|
||||
pub language: Option<String>,
|
||||
}
|
||||
|
|
@ -385,8 +407,4 @@ impl BlobMetadata {
|
|||
self.mime_essence.as_deref()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn charset(&self) -> Option<&str> {
|
||||
self.charset.as_deref()
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -94,8 +94,7 @@ pub struct LocationMapping {
|
|||
impl LocationMapping {
|
||||
/// Scan once for all `\n` positions.
|
||||
pub fn new(input: &[u8]) -> Self {
|
||||
let newline_offsets =
|
||||
input.iter().enumerate().filter_map(|(i, &b)| (b == b'\n').then_some(i)).collect();
|
||||
let newline_offsets = memchr::memchr_iter(b'\n', input).collect();
|
||||
LocationMapping { newline_offsets }
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -288,17 +288,6 @@ impl<'a> Matcher<'a> {
|
|||
self.local_stats.blobs_scanned += 1;
|
||||
self.local_stats.bytes_scanned += blob.bytes().len() as u64;
|
||||
|
||||
// Check if blob was already seen and respect no_dedup flag
|
||||
if !no_dedup {
|
||||
if let Some(had_matches) = self.seen_blobs.get(&blob.id) {
|
||||
return Ok(if had_matches {
|
||||
ScanResult::SeenWithMatches
|
||||
} else {
|
||||
ScanResult::SeenSansMatches
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Extract filename from origin
|
||||
let filename = origin
|
||||
.first()
|
||||
|
|
@ -316,16 +305,7 @@ impl<'a> Matcher<'a> {
|
|||
let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) };
|
||||
|
||||
if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() {
|
||||
// Only record in seen_blobs if deduplication is enabled
|
||||
if !no_dedup {
|
||||
return Ok(match self.seen_blobs.insert(blob.id, false) {
|
||||
None => ScanResult::New(Vec::new()),
|
||||
Some(true) => ScanResult::SeenWithMatches,
|
||||
Some(false) => ScanResult::SeenSansMatches,
|
||||
});
|
||||
} else {
|
||||
return Ok(ScanResult::New(Vec::new()));
|
||||
}
|
||||
return Ok(ScanResult::New(Vec::new()));
|
||||
}
|
||||
|
||||
let rules_db = self.rules_db;
|
||||
|
|
@ -472,9 +452,15 @@ impl<'a> Matcher<'a> {
|
|||
}
|
||||
}
|
||||
// Finalize
|
||||
// Only record in seen_blobs if deduplication is enabled
|
||||
if !no_dedup {
|
||||
self.seen_blobs.insert(blob.id, !matches.is_empty());
|
||||
if !no_dedup && !matches.is_empty() {
|
||||
let blob_id = blob.id();
|
||||
if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
|
||||
return Ok(if had_matches {
|
||||
ScanResult::SeenWithMatches
|
||||
} else {
|
||||
ScanResult::SeenSansMatches
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// --- opportunistic capacity cap ---------------------------------
|
||||
|
|
@ -571,7 +557,7 @@ fn filter_match<'b>(
|
|||
SerializableCaptures::from_captures(&captures, byte_slice.as_ref(), re, redact);
|
||||
matches.push(BlobMatch {
|
||||
rule: Arc::clone(&rule),
|
||||
blob_id: &blob.id,
|
||||
blob_id: blob.id_ref(),
|
||||
matching_input: only_matching_input,
|
||||
matching_input_offset_span,
|
||||
captures: groups,
|
||||
|
|
|
|||
|
|
@ -185,7 +185,6 @@ mod tests {
|
|||
id: BlobId::new(b"mock_blob"),
|
||||
num_bytes: 1024,
|
||||
mime_essence: Some("text/plain".to_string()),
|
||||
charset: Some("UTF-8".to_string()),
|
||||
language: Some("Rust".to_string()),
|
||||
};
|
||||
let dedup = true;
|
||||
|
|
@ -219,7 +218,6 @@ mod tests {
|
|||
id: BlobId::new(b"mock_blob"),
|
||||
num_bytes: 1024,
|
||||
mime_essence: Some("text/plain".to_string()),
|
||||
charset: Some("UTF-8".to_string()),
|
||||
language: Some("Rust".to_string()),
|
||||
},
|
||||
m: mock_match,
|
||||
|
|
@ -256,7 +254,6 @@ mod tests {
|
|||
id: BlobId::new(b"mock_blob"),
|
||||
num_bytes: 1024,
|
||||
mime_essence: Some("text/plain".to_string()),
|
||||
charset: Some("UTF-8".to_string()),
|
||||
language: Some("Rust".to_string()),
|
||||
},
|
||||
m: mock_match,
|
||||
|
|
|
|||
|
|
@ -172,10 +172,15 @@ pub fn enumerate_filesystem_inputs(
|
|||
};
|
||||
if should_skip {
|
||||
progress.suspend(|| {
|
||||
let path = origin
|
||||
.first()
|
||||
.blob_path()
|
||||
.map(|p| p.display().to_string())
|
||||
.unwrap_or_else(|| blob.temp_id().to_string());
|
||||
if is_archive {
|
||||
debug!("Skipping archive: {}", blob.id);
|
||||
debug!("Skipping archive: {path}");
|
||||
} else {
|
||||
debug!("Skipping binary blob: {}", blob.id);
|
||||
debug!("Skipping binary blob: {path}");
|
||||
}
|
||||
});
|
||||
return Ok(());
|
||||
|
|
|
|||
|
|
@ -27,8 +27,7 @@ impl<'a> BlobProcessor<'a> {
|
|||
redact: bool,
|
||||
no_base64: bool,
|
||||
) -> Result<Option<DatastoreMessage>> {
|
||||
let blob_id = blob.id.hex();
|
||||
let _span = debug_span!("matcher", blob_id).entered();
|
||||
let _span = debug_span!("matcher", temp_id = blob.temp_id()).entered();
|
||||
let t1 = Instant::now();
|
||||
let res = self.matcher.scan_blob(&blob, &origin, None, redact, no_dedup, no_base64)?;
|
||||
let scan_us = t1.elapsed().as_micros();
|
||||
|
|
@ -43,10 +42,9 @@ impl<'a> BlobProcessor<'a> {
|
|||
ScanResult::SeenWithMatches => {
|
||||
trace!("({scan_us}us) blob already scanned with matches");
|
||||
let metadata = BlobMetadata {
|
||||
id: blob.id,
|
||||
id: blob.id(),
|
||||
num_bytes: blob.len(),
|
||||
mime_essence: None,
|
||||
charset: None,
|
||||
language: None,
|
||||
};
|
||||
Ok(Some((origin, metadata, Vec::new())))
|
||||
|
|
@ -63,10 +61,9 @@ impl<'a> BlobProcessor<'a> {
|
|||
}
|
||||
let md = MetadataResult::from_blob_and_origin(&blob, &origin);
|
||||
let metadata = BlobMetadata {
|
||||
id: blob.id,
|
||||
id: blob.id(),
|
||||
num_bytes: blob.len(),
|
||||
mime_essence: md.mime_essence,
|
||||
charset: md.charset,
|
||||
language: md.language,
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -119,7 +119,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
|
|||
id: BlobId::new(b"dummy"),
|
||||
num_bytes: 10,
|
||||
mime_essence: None,
|
||||
charset: None,
|
||||
language: None,
|
||||
},
|
||||
m: m1,
|
||||
|
|
@ -136,7 +135,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
|
|||
id: BlobId::new(b"dummy"),
|
||||
num_bytes: 10,
|
||||
mime_essence: None,
|
||||
charset: None,
|
||||
language: None,
|
||||
},
|
||||
m: m2,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue