Optimized memory usage via string interning and extensive data sharing

2025-09-03 09:52:49 -07:00 · 2025-09-03 09:52:49 -07:00 · bf60c5584b
commit bf60c5584b
parent ac34f35f61
8 changed files with 59 additions and 58 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -65,7 +65,7 @@ smallvec = { version = "1", features = [
 tracing = "0.1.41"
 indicatif = { version = "0.17", features = ["improved_unicode"] }
 rayon = "1.10"
-sha1 = "0.10.6"
+sha1 = { version = "0.10.6", features = ["asm"] }
 hex = "0.4.3"
 vectorscan-rs = "0.0.5"
 regex = "1.11.1"
@@ -151,6 +151,7 @@ tar = "0.4.44"
 xz2 = "0.1.7"
 asar = "0.3.0"
 blake3 = "1.8.2"
+memchr = "2.7"
 memmap2 = "0.9.7"
 futures = "0.3.31"
 dashmap = "6.1.0"
--- a/src/blob.rs
+++ b/src/blob.rs
@@ -10,16 +10,20 @@ use anyhow::Result;
 use bstr::{BString, ByteSlice};
 use gix::ObjectId;
 use hex;
+use once_cell::sync::OnceCell;
 use parking_lot::Mutex;
 use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};
 use sha1::{Digest, Sha1};
 use smallvec::SmallVec;
+use std::sync::atomic::{AtomicU64, Ordering};

 use crate::git_commit_metadata::CommitMetadata;
 // const LARGE_FILE_THRESHOLD: u64 = 512 * 1024; // 512 KB
 const LARGE_FILE_THRESHOLD: u64 = 0; // always mmap

+static NEXT_ID: AtomicU64 = AtomicU64::new(1);
+
 /// The data of a blob, either owned (small files) or memory mapped (large files).
 pub enum BlobData<'a> {
    /// Small blobs – remains as-is.
@@ -75,47 +79,68 @@ pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
 /// A Git blob, storing its SHA-1 id and its contents.

 pub struct Blob<'a> {
-    pub id: BlobId,
-    pub data: BlobData<'a>,
+    id: OnceCell<BlobId>,
+    data: BlobData<'a>,
+    temp_id: u64,
 }

 impl Blob<'_> {
    #[inline]
-
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let mut file = File::open(&path)?;
        let file_size = file.metadata()?.len();
+        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);

        if file_size > LARGE_FILE_THRESHOLD {
            // Large files: one mmap, zero extra copies.
            let mmap = unsafe { memmap2::Mmap::map(&file)? };
-            let id = BlobId::new(mmap.as_ref());
-            Ok(Blob { id, data: BlobData::Mapped(mmap) })
+            Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id })
        } else {
            // Small files: reuse the same handle and pre-allocate exact capacity
            let mut bytes = Vec::with_capacity(file_size as usize);
            file.read_to_end(&mut bytes)?;
-            let id = BlobId::new(&bytes);
-            Ok(Blob { id, data: BlobData::Owned(bytes) })
+            Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id })
        }
    }
+
    /// Returns the blob's bytes as a slice.
    #[inline]
    pub fn bytes(&self) -> &[u8] {
        self.data.as_ref()
    }

+    /// Lazily compute and return the blob's SHA-1 `BlobId`.
+    #[inline]
+    pub fn id(&self) -> BlobId {
+        *self.id.get_or_init(|| BlobId::new(self.bytes()))
+    }
+
+    /// Get a reference to the blob's SHA-1 `BlobId`, computing it if necessary.
+    #[inline]
+    pub fn id_ref(&self) -> &BlobId {
+        self.id.get_or_init(|| BlobId::new(self.bytes()))
+    }
+
+    /// Return the temporary identifier assigned on blob creation.
+    #[inline]
+    pub fn temp_id(&self) -> u64 {
+        self.temp_id
+    }
+
    /// Create a new `Blob` from a vector of bytes.
    #[inline]
    pub fn from_bytes(bytes: Vec<u8>) -> Self {
-        let id = BlobId::compute_from_bytes(&bytes);
-        Blob { id, data: BlobData::Owned(bytes) }
+        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+        Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }
    }

    /// Create a new `Blob` with the given id and data.
    #[inline]
    pub fn new(id: BlobId, bytes: Vec<u8>) -> Self {
-        Blob { id, data: BlobData::Owned(bytes) }
+        let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+        let cell = OnceCell::new();
+        let _ = cell.set(id);
+        Blob { id: cell, data: BlobData::Owned(bytes), temp_id }
    }

    #[inline]
@@ -360,9 +385,6 @@ pub struct BlobMetadata {
    /// The guessed multimedia type of the blob
    pub mime_essence: Option<String>,

-    /// The guessed charset of the blob
-    pub charset: Option<String>,
-
    /// The guessed programming language of the blob
    pub language: Option<String>,
 }
@@ -385,8 +407,4 @@ impl BlobMetadata {
        self.mime_essence.as_deref()
    }

-    #[inline]
-    pub fn charset(&self) -> Option<&str> {
-        self.charset.as_deref()
-    }
 }
--- a/src/location.rs
+++ b/src/location.rs
@@ -94,8 +94,7 @@ pub struct LocationMapping {
 impl LocationMapping {
    /// Scan once for all `\n` positions.
    pub fn new(input: &[u8]) -> Self {
-        let newline_offsets =
-            input.iter().enumerate().filter_map(|(i, &b)| (b == b'\n').then_some(i)).collect();
+        let newline_offsets = memchr::memchr_iter(b'\n', input).collect();
        LocationMapping { newline_offsets }
    }

--- a/src/matcher.rs
+++ b/src/matcher.rs
@@ -288,17 +288,6 @@ impl<'a> Matcher<'a> {
        self.local_stats.blobs_scanned += 1;
        self.local_stats.bytes_scanned += blob.bytes().len() as u64;

-        // Check if blob was already seen and respect no_dedup flag
-        if !no_dedup {
-            if let Some(had_matches) = self.seen_blobs.get(&blob.id) {
-                return Ok(if had_matches {
-                    ScanResult::SeenWithMatches
-                } else {
-                    ScanResult::SeenSansMatches
-                });
-            }
-        }
-
        // Extract filename from origin
        let filename = origin
            .first()
@@ -316,16 +305,7 @@ impl<'a> Matcher<'a> {
        let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) };

        if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() {
-            // Only record in seen_blobs if deduplication is enabled
-            if !no_dedup {
-                return Ok(match self.seen_blobs.insert(blob.id, false) {
-                    None => ScanResult::New(Vec::new()),
-                    Some(true) => ScanResult::SeenWithMatches,
-                    Some(false) => ScanResult::SeenSansMatches,
-                });
-            } else {
-                return Ok(ScanResult::New(Vec::new()));
-            }
+            return Ok(ScanResult::New(Vec::new()));
        }

        let rules_db = self.rules_db;
@@ -472,9 +452,15 @@ impl<'a> Matcher<'a> {
            }
        }
        // Finalize
-        // Only record in seen_blobs if deduplication is enabled
-        if !no_dedup {
-            self.seen_blobs.insert(blob.id, !matches.is_empty());
+        if !no_dedup && !matches.is_empty() {
+            let blob_id = blob.id();
+            if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
+                return Ok(if had_matches {
+                    ScanResult::SeenWithMatches
+                } else {
+                    ScanResult::SeenSansMatches
+                });
+            }
        }

        // --- opportunistic capacity cap ---------------------------------
@@ -571,7 +557,7 @@ fn filter_match<'b>(
            SerializableCaptures::from_captures(&captures, byte_slice.as_ref(), re, redact);
        matches.push(BlobMatch {
            rule: Arc::clone(&rule),
-            blob_id: &blob.id,
+            blob_id: blob.id_ref(),
            matching_input: only_matching_input,
            matching_input_offset_span,
            captures: groups,
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@@ -185,7 +185,6 @@ mod tests {
                id: BlobId::new(b"mock_blob"),
                num_bytes: 1024,
                mime_essence: Some("text/plain".to_string()),
-                charset: Some("UTF-8".to_string()),
                language: Some("Rust".to_string()),
            };
            let dedup = true;
@@ -219,7 +218,6 @@ mod tests {
                id: BlobId::new(b"mock_blob"),
                num_bytes: 1024,
                mime_essence: Some("text/plain".to_string()),
-                charset: Some("UTF-8".to_string()),
                language: Some("Rust".to_string()),
            },
            m: mock_match,
@@ -256,7 +254,6 @@ mod tests {
                    id: BlobId::new(b"mock_blob"),
                    num_bytes: 1024,
                    mime_essence: Some("text/plain".to_string()),
-                    charset: Some("UTF-8".to_string()),
                    language: Some("Rust".to_string()),
                },
                m: mock_match,
--- a/src/scanner/enumerate.rs
+++ b/src/scanner/enumerate.rs
@@ -172,10 +172,15 @@ pub fn enumerate_filesystem_inputs(
                };
                if should_skip {
                    progress.suspend(|| {
+                        let path = origin
+                            .first()
+                            .blob_path()
+                            .map(|p| p.display().to_string())
+                            .unwrap_or_else(|| blob.temp_id().to_string());
                        if is_archive {
-                            debug!("Skipping archive: {}", blob.id);
+                            debug!("Skipping archive: {path}");
                        } else {
-                            debug!("Skipping binary blob: {}", blob.id);
+                            debug!("Skipping binary blob: {path}");
                        }
                    });
                    return Ok(());
--- a/src/scanner/processing.rs
+++ b/src/scanner/processing.rs
@@ -27,8 +27,7 @@ impl<'a> BlobProcessor<'a> {
        redact: bool,
        no_base64: bool,
    ) -> Result<Option<DatastoreMessage>> {
-        let blob_id = blob.id.hex();
-        let _span = debug_span!("matcher", blob_id).entered();
+        let _span = debug_span!("matcher", temp_id = blob.temp_id()).entered();
        let t1 = Instant::now();
        let res = self.matcher.scan_blob(&blob, &origin, None, redact, no_dedup, no_base64)?;
        let scan_us = t1.elapsed().as_micros();
@@ -43,10 +42,9 @@ impl<'a> BlobProcessor<'a> {
            ScanResult::SeenWithMatches => {
                trace!("({scan_us}us) blob already scanned with matches");
                let metadata = BlobMetadata {
-                    id: blob.id,
+                    id: blob.id(),
                    num_bytes: blob.len(),
                    mime_essence: None,
-                    charset: None,
                    language: None,
                };
                Ok(Some((origin, metadata, Vec::new())))
@@ -63,10 +61,9 @@ impl<'a> BlobProcessor<'a> {
                }
                let md = MetadataResult::from_blob_and_origin(&blob, &origin);
                let metadata = BlobMetadata {
-                    id: blob.id,
+                    id: blob.id(),
                    num_bytes: blob.len(),
                    mime_essence: md.mime_essence,
-                    charset: md.charset,
                    language: md.language,
                };

--- a/tests/fingerprint_dedup.rs
+++ b/tests/fingerprint_dedup.rs
@@ -119,7 +119,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
                id: BlobId::new(b"dummy"),
                num_bytes: 10,
                mime_essence: None,
-                charset: None,
                language: None,
            },
            m: m1,
@@ -136,7 +135,6 @@ fn reporter_deduplicates_across_git_commits() -> Result<()> {
                id: BlobId::new(b"dummy"),
                num_bytes: 10,
                mime_essence: None,
-                charset: None,
                language: None,
            },
            m: m2,