From 4905ace028f98ca660be210534ad6c0c573802bf Mon Sep 17 00:00:00 2001
From: Mick Grove <mick.grove@mongodb.com>
Date: Mon, 23 Feb 2026 23:14:39 -0700
Subject: [PATCH] performance improvements

---
 CHANGELOG.md               |   3 +
 src/git_metadata_graph.rs  | 133 +++++++++++++++++++++++--------------
 src/git_repo_enumerator.rs |  63 ++++++++++--------
 src/pyc.rs                 |  40 +++--------
 src/scanner/enumerate.rs   |  22 +++---
 src/sqlite.rs              |  19 ++----
 6 files changed, 149 insertions(+), 131 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d6ebada..e9537da 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ All notable changes to this project will be documented in this file.
 - Performance: pipelined ODB enumeration — scanning now begins while blob OIDs are still being discovered, overlapping I/O with pattern matching.
 - Performance: skip blobs smaller than 20 bytes during enumeration (too small to contain any secret).
 - Performance: preserve pack-ascending blob order in the metadata path for better I/O locality when Rayon splits work.
+- Performance: defer Git committer metadata materialization until commits actually introduce scannable blobs, reducing unnecessary string/time parsing work.
+- Performance: push `--exclude` filtering into Git tree traversal so excluded paths/subtrees are pruned before blob-introduction bookkeeping.
+- Performance: make Git repository object indexing single-pass (removed the extra ODB scan in `RepositoryIndex::new`).
 
 ## [v1.84.0]
 - Added/updated `pipedrive` and `amplitude` rules
diff --git a/src/git_metadata_graph.rs b/src/git_metadata_graph.rs
index 1ecfb8f..345e4c3 100644
--- a/src/git_metadata_graph.rs
+++ b/src/git_metadata_graph.rs
@@ -1,7 +1,7 @@
 use std::{collections::BinaryHeap, time::Instant};
 
 use anyhow::{bail, Context, Result};
-use bstr::BString;
+use bstr::{BString, ByteSlice};
 use fixedbitset::FixedBitSet;
 use gix::{
     hashtable::{hash_map, HashMap},
@@ -10,6 +10,7 @@ use gix::{
     prelude::*,
     ObjectId, OdbHandle,
 };
+use globset::GlobSet;
 
 use crate::git_repo_enumerator::MIN_SCANNABLE_BLOB_SIZE;
 use petgraph::{
@@ -137,43 +138,11 @@ pub(crate) struct RepositoryIndex {
 impl RepositoryIndex {
     pub(crate) fn new(odb: &OdbHandle) -> Result<Self> {
         use gix::{odb::store::iter::Ordering, prelude::*};
-        let mut num_tags = 0;
-        let mut num_trees = 0;
-        let mut num_blobs = 0;
-        let mut num_commits = 0;
+        let mut tree_oids = Vec::new();
+        let mut commit_oids = Vec::new();
+        let mut blob_oids = Vec::new();
+        let mut tag_oids = Vec::new();
 
-        for oid_result in odb
-            .iter()
-            .context("Failed to iterate object database")?
-            .with_ordering(Ordering::PackLexicographicalThenLooseLexicographical)
-        {
-            let oid = match oid_result {
-                Ok(oid) => oid,
-                Err(e) => {
-                    debug!("Failed to read object id: {e}");
-                    continue;
-                }
-            };
-            let hdr = match odb.header(oid) {
-                Ok(hdr) => hdr,
-                Err(e) => {
-                    debug!("Failed to read object header for {oid}: {e}");
-                    continue;
-                }
-            };
-            match hdr.kind() {
-                Kind::Tree => num_trees += 1,
-                Kind::Blob if hdr.size() >= MIN_SCANNABLE_BLOB_SIZE => num_blobs += 1,
-                Kind::Blob => {}
-                Kind::Commit => num_commits += 1,
-                Kind::Tag => num_tags += 1,
-            }
-        }
-
-        let mut trees = ObjectIdBimap::with_capacity(num_trees);
-        let mut commits = ObjectIdBimap::with_capacity(num_commits);
-        let mut blobs = ObjectIdBimap::with_capacity(num_blobs);
-        let mut tags = ObjectIdBimap::with_capacity(num_tags);
         for oid_result in odb
             .iter()
             .context("Failed to iterate object database")?
@@ -194,13 +163,32 @@ impl RepositoryIndex {
                 }
             };
             match hdr.kind() {
-                Kind::Tree => trees.insert(oid),
-                Kind::Blob if hdr.size() >= MIN_SCANNABLE_BLOB_SIZE => blobs.insert(oid),
+                Kind::Tree => tree_oids.push(oid),
+                Kind::Blob if hdr.size() >= MIN_SCANNABLE_BLOB_SIZE => blob_oids.push(oid),
                 Kind::Blob => {}
-                Kind::Commit => commits.insert(oid),
-                Kind::Tag => tags.insert(oid),
+                Kind::Commit => commit_oids.push(oid),
+                Kind::Tag => tag_oids.push(oid),
             }
         }
+
+        let mut trees = ObjectIdBimap::with_capacity(tree_oids.len());
+        let mut commits = ObjectIdBimap::with_capacity(commit_oids.len());
+        let mut blobs = ObjectIdBimap::with_capacity(blob_oids.len());
+        let mut tags = ObjectIdBimap::with_capacity(tag_oids.len());
+
+        for oid in tree_oids {
+            trees.insert(oid);
+        }
+        for oid in commit_oids {
+            commits.insert(oid);
+        }
+        for oid in blob_oids {
+            blobs.insert(oid);
+        }
+        for oid in tag_oids {
+            tags.insert(oid);
+        }
+
         Ok(Self { trees, commits, blobs, tags })
     }
     pub(crate) fn num_commits(&self) -> usize {
@@ -293,6 +281,7 @@ impl GitMetadataGraph {
         self,
         repo_index: &RepositoryIndex,
         repo: &gix::Repository,
+        exclude_globset: Option<&GlobSet>,
     ) -> Result<Vec<CommitBlobMetadata>> {
         let _span =
             error_span!("get_repo_metadata", path = repo.path().display().to_string()).entered();
@@ -340,6 +329,7 @@ impl GitMetadataGraph {
                     visit_tree(
                         repo,
                         &mut symbols,
+                        exclude_globset,
                         repo_index,
                         &mut num_trees_introduced,
                         &mut num_blobs_introduced,
@@ -409,10 +399,51 @@ impl GitMetadataGraph {
     }
 }
 
+#[inline]
+fn path_is_excluded(path: &BString, exclude_globset: Option<&GlobSet>) -> bool {
+    let Some(gs) = exclude_globset else {
+        return false;
+    };
+    match path.to_path() {
+        Ok(p) => gs.is_match(p),
+        Err(_) => false,
+    }
+}
+
+#[inline]
+fn tree_path_is_excluded(path: &BString, exclude_globset: Option<&GlobSet>) -> bool {
+    if path_is_excluded(path, exclude_globset) {
+        return true;
+    }
+    let Some(gs) = exclude_globset else {
+        return false;
+    };
+    let mut dir_path = path.clone();
+    dir_path.push(b'/');
+    match dir_path.to_path() {
+        Ok(p) => gs.is_match(p),
+        Err(_) => false,
+    }
+}
+
+#[inline]
+fn render_symbol_path(symbols: &BStringTable, path: &[Symbol]) -> BString {
+    let mut buf = Vec::new();
+    if let Some(first) = path.first() {
+        buf.extend_from_slice(symbols.resolve(*first));
+        for s in &path[1..] {
+            buf.push(b'/');
+            buf.extend_from_slice(symbols.resolve(*s));
+        }
+    }
+    BString::from(buf)
+}
+
 #[allow(clippy::too_many_arguments)]
 fn visit_tree(
     repo: &gix::Repository,
     symbols: &mut BStringTable,
+    exclude_globset: Option<&GlobSet>,
     repo_index: &RepositoryIndex,
     num_trees_introduced: &mut usize,
     num_blobs_introduced: &mut usize,
@@ -450,6 +481,12 @@ fn visit_tree(
                     if seen.insert_tree(child_idx)? {
                         let mut new_path = name_path.clone();
                         new_path.push(symbols.get_or_intern(child.filename.into()));
+                        if exclude_globset.is_some() {
+                            let path = render_symbol_path(symbols, &new_path);
+                            if tree_path_is_excluded(&path, exclude_globset) {
+                                continue;
+                            }
+                        }
                         tree_worklist.push((new_path, child.oid.to_owned()));
                     }
                 }
@@ -460,18 +497,14 @@ fn visit_tree(
                     };
                     if !seen.contains_blob(child_idx)? {
                         blobs_encountered.push(child_idx);
-                        *num_blobs_introduced += 1;
                         let mut new_path = name_path.clone();
                         new_path.push(symbols.get_or_intern(child.filename.into()));
-                        let mut buf = Vec::new();
-                        if let Some(first) = new_path.first() {
-                            buf.extend_from_slice(symbols.resolve(*first));
-                            for s in &new_path[1..] {
-                                buf.push(b'/');
-                                buf.extend_from_slice(symbols.resolve(*s));
-                            }
+                        let path = render_symbol_path(symbols, &new_path);
+                        if path_is_excluded(&path, exclude_globset) {
+                            continue;
                         }
-                        introduced.push((child.oid.to_owned(), BString::from(buf)));
+                        *num_blobs_introduced += 1;
+                        introduced.push((child.oid.to_owned(), path));
                     }
                 }
             }
diff --git a/src/git_repo_enumerator.rs b/src/git_repo_enumerator.rs
index df974dd..d03f752 100644
--- a/src/git_repo_enumerator.rs
+++ b/src/git_repo_enumerator.rs
@@ -87,10 +87,9 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
 
         let mut metadata_graph = GitMetadataGraph::with_capacity(object_index.num_commits());
         let mut scratch = Vec::with_capacity(4 * 1024 * 1024);
-        let mut commit_metadata =
-            HashMap::with_capacity_and_hasher(object_index.num_commits(), Default::default());
 
-        // Collect commit metadata and build commit graph
+        // Build commit graph first; materialize committer metadata only for commits that
+        // actually introduce blobs.
         for commit_oid in object_index.commits() {
             let commit = match odb.find_commit(commit_oid, &mut scratch) {
                 Ok(commit) => commit,
@@ -113,25 +112,16 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
                 let parent_idx = metadata_graph.get_commit_idx(parent_oid, None);
                 metadata_graph.add_commit_edge(parent_idx, commit_idx);
             }
-
-            let committer = &commit.committer;
-            // let author = &commit.author;
-
-            commit_metadata.insert(
-                *commit_oid,
-                Arc::new(CommitMetadata {
-                    commit_id: *commit_oid,
-                    committer_name: String::from_utf8_lossy(&committer.name).into_owned(),
-                    committer_email: String::from_utf8_lossy(&committer.email).into_owned(),
-                    committer_timestamp: parse_sig_time(committer.time),
-                }),
-            );
         }
 
         debug!("Built metadata graph in {:.6}s", started.elapsed().as_secs_f64());
 
         // Compute metadata once, then get all blob IDs (in pack-ascending order)
-        let meta_result = metadata_graph.get_repo_metadata(&object_index, &self.repo);
+        let meta_result = metadata_graph.get_repo_metadata(
+            &object_index,
+            &self.repo,
+            self.exclude_globset.as_deref(),
+        );
         let all_blobs = object_index.into_blobs();
 
         // Assemble final blob list, preserving pack-ascending order for I/O locality
@@ -144,22 +134,44 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
                     .collect()
             }
             Ok(metadata) => {
+                let mut commit_metadata: HashMap<ObjectId, Arc<CommitMetadata>> =
+                    HashMap::with_capacity_and_hasher(0, Default::default());
                 let mut blob_appearances: HashMap<ObjectId, SmallVec<_>> =
                     HashMap::with_capacity_and_hasher(all_blobs.len(), Default::default());
 
                 for e in metadata {
-                    let cm = match commit_metadata.get(&e.commit_oid) {
-                        Some(cm) => cm,
-                        None => {
-                            debug!("Missing commit metadata for {}", e.commit_oid);
-                            continue;
-                        }
+                    if e.introduced_blobs.is_empty() {
+                        continue;
+                    }
+
+                    let cm = if let Some(cm) = commit_metadata.get(&e.commit_oid) {
+                        cm.clone()
+                    } else {
+                        let commit = match odb.find_commit(&e.commit_oid, &mut scratch) {
+                            Ok(commit) => commit,
+                            Err(err) => {
+                                debug!(
+                                    "Failed to load commit metadata for {}: {err}",
+                                    e.commit_oid
+                                );
+                                continue;
+                            }
+                        };
+                        let committer = &commit.committer;
+                        let parsed = Arc::new(CommitMetadata {
+                            commit_id: e.commit_oid,
+                            committer_name: String::from_utf8_lossy(&committer.name).into_owned(),
+                            committer_email: String::from_utf8_lossy(&committer.email).into_owned(),
+                            committer_timestamp: parse_sig_time(committer.time),
+                        });
+                        commit_metadata.insert(e.commit_oid, Arc::clone(&parsed));
+                        parsed
                     };
                     for (blob_oid, path) in e.introduced_blobs {
                         blob_appearances
                             .entry(blob_oid)
                             .or_default()
-                            .push(BlobAppearance { commit_metadata: cm.clone(), path });
+                            .push(BlobAppearance { commit_metadata: Arc::clone(&cm), path });
                     }
                 }
 
@@ -167,8 +179,7 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
                 all_blobs
                     .into_iter()
                     .filter_map(|blob_oid| {
-                        let appearances =
-                            blob_appearances.remove(&blob_oid).unwrap_or_default();
+                        let appearances = blob_appearances.remove(&blob_oid).unwrap_or_default();
                         if appearances.is_empty() {
                             return Some(GitBlobMetadata { blob_oid, first_seen: appearances });
                         }
diff --git a/src/pyc.rs b/src/pyc.rs
index 23aa4e0..d7ae6fd 100644
--- a/src/pyc.rs
+++ b/src/pyc.rs
@@ -364,19 +364,13 @@ pub fn extract_pyc_strings(path: &Path) -> Result<Vec<u8>> {
     let (header_size, code_format) = match pyc_version_info(magic) {
         Some(info) => info,
         None => {
-            debug!(
-                "unsupported .pyc magic number {magic} in {}, skipping",
-                path.display()
-            );
+            debug!("unsupported .pyc magic number {magic} in {}, skipping", path.display());
             return Ok(Vec::new());
         }
     };
 
     if data.len() < header_size {
-        bail!(
-            ".pyc header requires {header_size} bytes but file is only {} bytes",
-            data.len()
-        );
+        bail!(".pyc header requires {header_size} bytes but file is only {} bytes", data.len());
     }
 
     let marshal_data = &data[header_size..];
@@ -496,7 +490,7 @@ mod tests {
         buf.extend_from_slice(&marshal_small_tuple(&[])); // cellvars
         buf.extend_from_slice(&marshal_short_ascii("<test>")); // filename
         buf.extend_from_slice(&marshal_short_ascii("<module>")); // name
-        // firstlineno
+                                                                 // firstlineno
         buf.extend_from_slice(&1i32.to_le_bytes());
         // 1 trailing object: lnotab
         buf.extend_from_slice(&marshal_string(b""));
@@ -552,10 +546,8 @@ mod tests {
     #[test]
     fn extracts_strings_from_code_object() {
         let mut data = make_pyc_header(3413, 16);
-        let consts = marshal_small_tuple(&[
-            marshal_none(),
-            marshal_short_ascii("ghp_abc123def456"),
-        ]);
+        let consts =
+            marshal_small_tuple(&[marshal_none(), marshal_short_ascii("ghp_abc123def456")]);
         let names = marshal_small_tuple(&[marshal_short_ascii("api_key")]);
         let code = marshal_code_38(consts, names);
         data.extend_from_slice(&code);
@@ -666,13 +658,8 @@ mod tests {
     #[test]
     fn handles_nested_tuples() {
         let mut data = make_pyc_header(3413, 16);
-        let inner = marshal_small_tuple(&[
-            marshal_short_ascii("inner_secret"),
-        ]);
-        let outer = marshal_small_tuple(&[
-            marshal_short_ascii("outer"),
-            inner,
-        ]);
+        let inner = marshal_small_tuple(&[marshal_short_ascii("inner_secret")]);
+        let outer = marshal_small_tuple(&[marshal_short_ascii("outer"), inner]);
         data.extend_from_slice(&outer);
         let tmp = tempfile::NamedTempFile::new().unwrap();
         std::fs::write(tmp.path(), &data).unwrap();
@@ -711,7 +698,7 @@ mod tests {
         buf.extend_from_slice(&marshal_short_ascii("<test>")); // filename
         buf.extend_from_slice(&marshal_short_ascii("<module>")); // name
         buf.extend_from_slice(&marshal_short_ascii("<module>")); // qualname
-        // firstlineno
+                                                                 // firstlineno
         buf.extend_from_slice(&1i32.to_le_bytes());
         // 2 trailing objects: linetable, exceptiontable
         buf.extend_from_slice(&marshal_string(b""));
@@ -722,10 +709,8 @@ mod tests {
     #[test]
     fn extracts_strings_from_code_object_v313() {
         let mut data = make_pyc_header(3627, 16); // Python 3.14
-        let consts = marshal_small_tuple(&[
-            marshal_none(),
-            marshal_short_ascii("sk-proj-ABCDEF123456"),
-        ]);
+        let consts =
+            marshal_small_tuple(&[marshal_none(), marshal_short_ascii("sk-proj-ABCDEF123456")]);
         let names = marshal_small_tuple(&[marshal_short_ascii("openai_key")]);
         let code = marshal_code_313(consts, names);
         data.extend_from_slice(&code);
@@ -733,10 +718,7 @@ mod tests {
         std::fs::write(tmp.path(), &data).unwrap();
         let result = extract_pyc_strings(tmp.path()).unwrap();
         let result_str = String::from_utf8_lossy(&result);
-        assert!(
-            result_str.contains("sk-proj-ABCDEF123456"),
-            "missing secret from consts"
-        );
+        assert!(result_str.contains("sk-proj-ABCDEF123456"), "missing secret from consts");
         assert!(result_str.contains("openai_key"), "missing name");
         assert!(result_str.contains("<test>"), "missing filename");
     }
diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs
index e964483..01075db 100644
--- a/src/scanner/enumerate.rs
+++ b/src/scanner/enumerate.rs
@@ -34,6 +34,7 @@ use crate::{
     matcher::{Matcher, MatcherStats},
     open_git_repo_with_options,
     origin::{Origin, OriginSet},
+    pyc::extract_pyc_strings,
     rule_profiling::ConcurrentRuleProfiler,
     rules_database::RulesDatabase,
     scanner::{
@@ -42,7 +43,6 @@ use crate::{
         util::{is_compressed_file, is_pyc_file, is_sqlite_file},
     },
     scanner_pool::ScannerPool,
-    pyc::extract_pyc_strings,
     sqlite::extract_sqlite_contents,
     DirectoryResult, EnumeratorConfig, EnumeratorFileResult, FileResult, FilesystemEnumerator,
     FoundInput, GitDiffConfig, GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator,
@@ -584,9 +584,7 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
                     .name("odb_enumerator".to_string())
                     .spawn(move || {
                         use gix::{
-                            object::Kind,
-                            odb::store::iter::Ordering as OdbOrdering,
-                            prelude::*,
+                            object::Kind, odb::store::iter::Ordering as OdbOrdering, prelude::*,
                         };
                         let repo = enum_repo_sync.to_thread_local();
                         let odb = &repo.objects;
@@ -594,9 +592,9 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
                             Ok(i) => i,
                             Err(_) => return,
                         };
-                        for oid_result in iter.with_ordering(
-                            OdbOrdering::PackAscendingOffsetThenLooseLexicographical,
-                        ) {
+                        for oid_result in iter
+                            .with_ordering(OdbOrdering::PackAscendingOffsetThenLooseLexicographical)
+                        {
                             let oid = match oid_result {
                                 Ok(oid) => oid,
                                 Err(_) => continue,
@@ -605,9 +603,7 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
                                 Ok(hdr) => hdr,
                                 Err(_) => continue,
                             };
-                            if hdr.kind() == Kind::Blob
-                                && hdr.size() >= MIN_SCANNABLE_BLOB_SIZE
-                            {
+                            if hdr.kind() == Kind::Blob && hdr.size() >= MIN_SCANNABLE_BLOB_SIZE {
                                 let md = GitBlobMetadata {
                                     blob_oid: oid,
                                     first_seen: Default::default(),
@@ -1013,7 +1009,11 @@ fn enumerate_git_diff_repo(
         blobs
     };
 
-    Ok(GitRepoResult { repository, path: path.to_owned(), blobs: GitBlobSource::Precomputed(blobs) })
+    Ok(GitRepoResult {
+        repository,
+        path: path.to_owned(),
+        blobs: GitBlobSource::Precomputed(blobs),
+    })
 }
 
 fn synthesize_staged_commit(path: &Path, parent_ref: &str) -> Result<String> {
diff --git a/src/sqlite.rs b/src/sqlite.rs
index 3fc1b8e..c2b968f 100644
--- a/src/sqlite.rs
+++ b/src/sqlite.rs
@@ -46,11 +46,7 @@ pub fn extract_sqlite_contents(path: &Path) -> Result<Vec<(String, Vec<u8>)>> {
                 results.push((logical_name, sql_text.into_bytes()));
             }
             Err(e) => {
-                debug!(
-                    "Failed to dump table '{}' from {}: {e:#}",
-                    table_name,
-                    path.display()
-                );
+                debug!("Failed to dump table '{}' from {}: {e:#}", table_name, path.display());
             }
         }
     }
@@ -136,10 +132,7 @@ fn dump_table(
 }
 
 fn column_names(conn: &Connection, table_name: &str) -> Result<Vec<String>> {
-    let query = format!(
-        "PRAGMA table_info(\"{}\")",
-        table_name.replace('"', "\"\"")
-    );
+    let query = format!("PRAGMA table_info(\"{}\")", table_name.replace('"', "\"\""));
     let mut stmt = conn.prepare(&query)?;
     let names = stmt
         .query_map([], |row| {
@@ -207,10 +200,7 @@ mod tests {
         let (_tmp, path) = create_test_db();
         let results = extract_sqlite_contents(&path).unwrap();
 
-        let user_info = results
-            .iter()
-            .find(|(n, _)| n == "user_info.sql")
-            .unwrap();
+        let user_info = results.iter().find(|(n, _)| n == "user_info.sql").unwrap();
         let sql = String::from_utf8_lossy(&user_info.1);
 
         assert!(sql.contains("CREATE TABLE"));
@@ -224,8 +214,7 @@ mod tests {
         let tmp = NamedTempFile::new().unwrap();
         let path = tmp.path().to_path_buf();
         let conn = Connection::open(&path).unwrap();
-        conn.execute_batch("CREATE TABLE empty_table (id INTEGER);")
-            .unwrap();
+        conn.execute_batch("CREATE TABLE empty_table (id INTEGER);").unwrap();
 
         let results = extract_sqlite_contents(&path).unwrap();
         assert_eq!(results.len(), 1);