diff --git a/Cargo.toml b/Cargo.toml index 4e774bf..ac3b6a6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,7 +65,7 @@ smallvec = { version = "1", features = [ tracing = "0.1.41" indicatif = { version = "0.17", features = ["improved_unicode"] } rayon = "1.10" -sha1 = "0.10.6" +sha1 = { version = "0.10.6", features = ["asm"] } hex = "0.4.3" vectorscan-rs = "0.0.5" regex = "1.11.1" @@ -151,6 +151,7 @@ tar = "0.4.44" xz2 = "0.1.7" asar = "0.3.0" blake3 = "1.8.2" +memchr = "2.7" memmap2 = "0.9.7" futures = "0.3.31" dashmap = "6.1.0" diff --git a/src/blob.rs b/src/blob.rs index a54a4a8..9530882 100644 --- a/src/blob.rs +++ b/src/blob.rs @@ -10,16 +10,20 @@ use anyhow::Result; use bstr::{BString, ByteSlice}; use gix::ObjectId; use hex; +use once_cell::sync::OnceCell; use parking_lot::Mutex; use rustc_hash::FxHashMap; use serde::{Deserialize, Serialize}; use sha1::{Digest, Sha1}; use smallvec::SmallVec; +use std::sync::atomic::{AtomicU64, Ordering}; use crate::git_commit_metadata::CommitMetadata; // const LARGE_FILE_THRESHOLD: u64 = 512 * 1024; // 512 KB const LARGE_FILE_THRESHOLD: u64 = 0; // always mmap +static NEXT_ID: AtomicU64 = AtomicU64::new(1); + /// The data of a blob, either owned (small files) or memory mapped (large files). pub enum BlobData<'a> { /// Small blobs – remains as-is. @@ -75,47 +79,68 @@ pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>; /// A Git blob, storing its SHA-1 id and its contents. pub struct Blob<'a> { - pub id: BlobId, - pub data: BlobData<'a>, + id: OnceCell, + data: BlobData<'a>, + temp_id: u64, } impl Blob<'_> { #[inline] - pub fn from_file>(path: P) -> Result { let mut file = File::open(&path)?; let file_size = file.metadata()?.len(); + let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed); if file_size > LARGE_FILE_THRESHOLD { // Large files: one mmap, zero extra copies. let mmap = unsafe { memmap2::Mmap::map(&file)? }; - let id = BlobId::new(mmap.as_ref()); - Ok(Blob { id, data: BlobData::Mapped(mmap) }) + Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id }) } else { // Small files: reuse the same handle and pre-allocate exact capacity let mut bytes = Vec::with_capacity(file_size as usize); file.read_to_end(&mut bytes)?; - let id = BlobId::new(&bytes); - Ok(Blob { id, data: BlobData::Owned(bytes) }) + Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }) } } + /// Returns the blob's bytes as a slice. #[inline] pub fn bytes(&self) -> &[u8] { self.data.as_ref() } + /// Lazily compute and return the blob's SHA-1 `BlobId`. + #[inline] + pub fn id(&self) -> BlobId { + *self.id.get_or_init(|| BlobId::new(self.bytes())) + } + + /// Get a reference to the blob's SHA-1 `BlobId`, computing it if necessary. + #[inline] + pub fn id_ref(&self) -> &BlobId { + self.id.get_or_init(|| BlobId::new(self.bytes())) + } + + /// Return the temporary identifier assigned on blob creation. + #[inline] + pub fn temp_id(&self) -> u64 { + self.temp_id + } + /// Create a new `Blob` from a vector of bytes. #[inline] pub fn from_bytes(bytes: Vec) -> Self { - let id = BlobId::compute_from_bytes(&bytes); - Blob { id, data: BlobData::Owned(bytes) } + let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id } } /// Create a new `Blob` with the given id and data. #[inline] pub fn new(id: BlobId, bytes: Vec) -> Self { - Blob { id, data: BlobData::Owned(bytes) } + let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + let cell = OnceCell::new(); + let _ = cell.set(id); + Blob { id: cell, data: BlobData::Owned(bytes), temp_id } } #[inline] @@ -360,9 +385,6 @@ pub struct BlobMetadata { /// The guessed multimedia type of the blob pub mime_essence: Option, - /// The guessed charset of the blob - pub charset: Option, - /// The guessed programming language of the blob pub language: Option, } @@ -385,8 +407,4 @@ impl BlobMetadata { self.mime_essence.as_deref() } - #[inline] - pub fn charset(&self) -> Option<&str> { - self.charset.as_deref() - } } diff --git a/src/location.rs b/src/location.rs index 4af7021..05a7586 100644 --- a/src/location.rs +++ b/src/location.rs @@ -94,8 +94,7 @@ pub struct LocationMapping { impl LocationMapping { /// Scan once for all `\n` positions. pub fn new(input: &[u8]) -> Self { - let newline_offsets = - input.iter().enumerate().filter_map(|(i, &b)| (b == b'\n').then_some(i)).collect(); + let newline_offsets = memchr::memchr_iter(b'\n', input).collect(); LocationMapping { newline_offsets } } diff --git a/src/matcher.rs b/src/matcher.rs index 0ecc7ec..07f53d3 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -288,17 +288,6 @@ impl<'a> Matcher<'a> { self.local_stats.blobs_scanned += 1; self.local_stats.bytes_scanned += blob.bytes().len() as u64; - // Check if blob was already seen and respect no_dedup flag - if !no_dedup { - if let Some(had_matches) = self.seen_blobs.get(&blob.id) { - return Ok(if had_matches { - ScanResult::SeenWithMatches - } else { - ScanResult::SeenSansMatches - }); - } - } - // Extract filename from origin let filename = origin .first() @@ -316,16 +305,7 @@ impl<'a> Matcher<'a> { let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) }; if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() { - // Only record in seen_blobs if deduplication is enabled - if !no_dedup { - return Ok(match self.seen_blobs.insert(blob.id, false) { - None => ScanResult::New(Vec::new()), - Some(true) => ScanResult::SeenWithMatches, - Some(false) => ScanResult::SeenSansMatches, - }); - } else { - return Ok(ScanResult::New(Vec::new())); - } + return Ok(ScanResult::New(Vec::new())); } let rules_db = self.rules_db; @@ -472,9 +452,15 @@ impl<'a> Matcher<'a> { } } // Finalize - // Only record in seen_blobs if deduplication is enabled - if !no_dedup { - self.seen_blobs.insert(blob.id, !matches.is_empty()); + if !no_dedup && !matches.is_empty() { + let blob_id = blob.id(); + if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) { + return Ok(if had_matches { + ScanResult::SeenWithMatches + } else { + ScanResult::SeenSansMatches + }); + } } // --- opportunistic capacity cap --------------------------------- @@ -571,7 +557,7 @@ fn filter_match<'b>( SerializableCaptures::from_captures(&captures, byte_slice.as_ref(), re, redact); matches.push(BlobMatch { rule: Arc::clone(&rule), - blob_id: &blob.id, + blob_id: blob.id_ref(), matching_input: only_matching_input, matching_input_offset_span, captures: groups, diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 7cf90c8..f2b065e 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -185,7 +185,6 @@ mod tests { id: BlobId::new(b"mock_blob"), num_bytes: 1024, mime_essence: Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), language: Some("Rust".to_string()), }; let dedup = true; @@ -219,7 +218,6 @@ mod tests { id: BlobId::new(b"mock_blob"), num_bytes: 1024, mime_essence: Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), language: Some("Rust".to_string()), }, m: mock_match, @@ -256,7 +254,6 @@ mod tests { id: BlobId::new(b"mock_blob"), num_bytes: 1024, mime_essence: Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), language: Some("Rust".to_string()), }, m: mock_match, diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index 14861ff..970ca5c 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -172,10 +172,15 @@ pub fn enumerate_filesystem_inputs( }; if should_skip { progress.suspend(|| { + let path = origin + .first() + .blob_path() + .map(|p| p.display().to_string()) + .unwrap_or_else(|| blob.temp_id().to_string()); if is_archive { - debug!("Skipping archive: {}", blob.id); + debug!("Skipping archive: {path}"); } else { - debug!("Skipping binary blob: {}", blob.id); + debug!("Skipping binary blob: {path}"); } }); return Ok(()); diff --git a/src/scanner/processing.rs b/src/scanner/processing.rs index 9447dbf..410233c 100644 --- a/src/scanner/processing.rs +++ b/src/scanner/processing.rs @@ -27,8 +27,7 @@ impl<'a> BlobProcessor<'a> { redact: bool, no_base64: bool, ) -> Result> { - let blob_id = blob.id.hex(); - let _span = debug_span!("matcher", blob_id).entered(); + let _span = debug_span!("matcher", temp_id = blob.temp_id()).entered(); let t1 = Instant::now(); let res = self.matcher.scan_blob(&blob, &origin, None, redact, no_dedup, no_base64)?; let scan_us = t1.elapsed().as_micros(); @@ -43,10 +42,9 @@ impl<'a> BlobProcessor<'a> { ScanResult::SeenWithMatches => { trace!("({scan_us}us) blob already scanned with matches"); let metadata = BlobMetadata { - id: blob.id, + id: blob.id(), num_bytes: blob.len(), mime_essence: None, - charset: None, language: None, }; Ok(Some((origin, metadata, Vec::new()))) @@ -63,10 +61,9 @@ impl<'a> BlobProcessor<'a> { } let md = MetadataResult::from_blob_and_origin(&blob, &origin); let metadata = BlobMetadata { - id: blob.id, + id: blob.id(), num_bytes: blob.len(), mime_essence: md.mime_essence, - charset: md.charset, language: md.language, }; diff --git a/tests/fingerprint_dedup.rs b/tests/fingerprint_dedup.rs index 9311ddf..5d5e89f 100644 --- a/tests/fingerprint_dedup.rs +++ b/tests/fingerprint_dedup.rs @@ -119,7 +119,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> { id: BlobId::new(b"dummy"), num_bytes: 10, mime_essence: None, - charset: None, language: None, }, m: m1, @@ -136,7 +135,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> { id: BlobId::new(b"dummy"), num_bytes: 10, mime_essence: None, - charset: None, language: None, }, m: m2,