From 4905ace028f98ca660be210534ad6c0c573802bf Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 23 Feb 2026 23:14:39 -0700 Subject: [PATCH] performance improvements --- CHANGELOG.md | 3 + src/git_metadata_graph.rs | 133 +++++++++++++++++++++++-------------- src/git_repo_enumerator.rs | 63 ++++++++++-------- src/pyc.rs | 40 +++-------- src/scanner/enumerate.rs | 22 +++--- src/sqlite.rs | 19 ++---- 6 files changed, 149 insertions(+), 131 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6ebada..e9537da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ All notable changes to this project will be documented in this file. - Performance: pipelined ODB enumeration — scanning now begins while blob OIDs are still being discovered, overlapping I/O with pattern matching. - Performance: skip blobs smaller than 20 bytes during enumeration (too small to contain any secret). - Performance: preserve pack-ascending blob order in the metadata path for better I/O locality when Rayon splits work. +- Performance: defer Git committer metadata materialization until commits actually introduce scannable blobs, reducing unnecessary string/time parsing work. +- Performance: push `--exclude` filtering into Git tree traversal so excluded paths/subtrees are pruned before blob-introduction bookkeeping. +- Performance: make Git repository object indexing single-pass (removed the extra ODB scan in `RepositoryIndex::new`). ## [v1.84.0] - Added/updated `pipedrive` and `amplitude` rules diff --git a/src/git_metadata_graph.rs b/src/git_metadata_graph.rs index 1ecfb8f..345e4c3 100644 --- a/src/git_metadata_graph.rs +++ b/src/git_metadata_graph.rs @@ -1,7 +1,7 @@ use std::{collections::BinaryHeap, time::Instant}; use anyhow::{bail, Context, Result}; -use bstr::BString; +use bstr::{BString, ByteSlice}; use fixedbitset::FixedBitSet; use gix::{ hashtable::{hash_map, HashMap}, @@ -10,6 +10,7 @@ use gix::{ prelude::*, ObjectId, OdbHandle, }; +use globset::GlobSet; use crate::git_repo_enumerator::MIN_SCANNABLE_BLOB_SIZE; use petgraph::{ @@ -137,43 +138,11 @@ pub(crate) struct RepositoryIndex { impl RepositoryIndex { pub(crate) fn new(odb: &OdbHandle) -> Result { use gix::{odb::store::iter::Ordering, prelude::*}; - let mut num_tags = 0; - let mut num_trees = 0; - let mut num_blobs = 0; - let mut num_commits = 0; + let mut tree_oids = Vec::new(); + let mut commit_oids = Vec::new(); + let mut blob_oids = Vec::new(); + let mut tag_oids = Vec::new(); - for oid_result in odb - .iter() - .context("Failed to iterate object database")? - .with_ordering(Ordering::PackLexicographicalThenLooseLexicographical) - { - let oid = match oid_result { - Ok(oid) => oid, - Err(e) => { - debug!("Failed to read object id: {e}"); - continue; - } - }; - let hdr = match odb.header(oid) { - Ok(hdr) => hdr, - Err(e) => { - debug!("Failed to read object header for {oid}: {e}"); - continue; - } - }; - match hdr.kind() { - Kind::Tree => num_trees += 1, - Kind::Blob if hdr.size() >= MIN_SCANNABLE_BLOB_SIZE => num_blobs += 1, - Kind::Blob => {} - Kind::Commit => num_commits += 1, - Kind::Tag => num_tags += 1, - } - } - - let mut trees = ObjectIdBimap::with_capacity(num_trees); - let mut commits = ObjectIdBimap::with_capacity(num_commits); - let mut blobs = ObjectIdBimap::with_capacity(num_blobs); - let mut tags = ObjectIdBimap::with_capacity(num_tags); for oid_result in odb .iter() .context("Failed to iterate object database")? @@ -194,13 +163,32 @@ impl RepositoryIndex { } }; match hdr.kind() { - Kind::Tree => trees.insert(oid), - Kind::Blob if hdr.size() >= MIN_SCANNABLE_BLOB_SIZE => blobs.insert(oid), + Kind::Tree => tree_oids.push(oid), + Kind::Blob if hdr.size() >= MIN_SCANNABLE_BLOB_SIZE => blob_oids.push(oid), Kind::Blob => {} - Kind::Commit => commits.insert(oid), - Kind::Tag => tags.insert(oid), + Kind::Commit => commit_oids.push(oid), + Kind::Tag => tag_oids.push(oid), } } + + let mut trees = ObjectIdBimap::with_capacity(tree_oids.len()); + let mut commits = ObjectIdBimap::with_capacity(commit_oids.len()); + let mut blobs = ObjectIdBimap::with_capacity(blob_oids.len()); + let mut tags = ObjectIdBimap::with_capacity(tag_oids.len()); + + for oid in tree_oids { + trees.insert(oid); + } + for oid in commit_oids { + commits.insert(oid); + } + for oid in blob_oids { + blobs.insert(oid); + } + for oid in tag_oids { + tags.insert(oid); + } + Ok(Self { trees, commits, blobs, tags }) } pub(crate) fn num_commits(&self) -> usize { @@ -293,6 +281,7 @@ impl GitMetadataGraph { self, repo_index: &RepositoryIndex, repo: &gix::Repository, + exclude_globset: Option<&GlobSet>, ) -> Result> { let _span = error_span!("get_repo_metadata", path = repo.path().display().to_string()).entered(); @@ -340,6 +329,7 @@ impl GitMetadataGraph { visit_tree( repo, &mut symbols, + exclude_globset, repo_index, &mut num_trees_introduced, &mut num_blobs_introduced, @@ -409,10 +399,51 @@ impl GitMetadataGraph { } } +#[inline] +fn path_is_excluded(path: &BString, exclude_globset: Option<&GlobSet>) -> bool { + let Some(gs) = exclude_globset else { + return false; + }; + match path.to_path() { + Ok(p) => gs.is_match(p), + Err(_) => false, + } +} + +#[inline] +fn tree_path_is_excluded(path: &BString, exclude_globset: Option<&GlobSet>) -> bool { + if path_is_excluded(path, exclude_globset) { + return true; + } + let Some(gs) = exclude_globset else { + return false; + }; + let mut dir_path = path.clone(); + dir_path.push(b'/'); + match dir_path.to_path() { + Ok(p) => gs.is_match(p), + Err(_) => false, + } +} + +#[inline] +fn render_symbol_path(symbols: &BStringTable, path: &[Symbol]) -> BString { + let mut buf = Vec::new(); + if let Some(first) = path.first() { + buf.extend_from_slice(symbols.resolve(*first)); + for s in &path[1..] { + buf.push(b'/'); + buf.extend_from_slice(symbols.resolve(*s)); + } + } + BString::from(buf) +} + #[allow(clippy::too_many_arguments)] fn visit_tree( repo: &gix::Repository, symbols: &mut BStringTable, + exclude_globset: Option<&GlobSet>, repo_index: &RepositoryIndex, num_trees_introduced: &mut usize, num_blobs_introduced: &mut usize, @@ -450,6 +481,12 @@ fn visit_tree( if seen.insert_tree(child_idx)? { let mut new_path = name_path.clone(); new_path.push(symbols.get_or_intern(child.filename.into())); + if exclude_globset.is_some() { + let path = render_symbol_path(symbols, &new_path); + if tree_path_is_excluded(&path, exclude_globset) { + continue; + } + } tree_worklist.push((new_path, child.oid.to_owned())); } } @@ -460,18 +497,14 @@ fn visit_tree( }; if !seen.contains_blob(child_idx)? { blobs_encountered.push(child_idx); - *num_blobs_introduced += 1; let mut new_path = name_path.clone(); new_path.push(symbols.get_or_intern(child.filename.into())); - let mut buf = Vec::new(); - if let Some(first) = new_path.first() { - buf.extend_from_slice(symbols.resolve(*first)); - for s in &new_path[1..] { - buf.push(b'/'); - buf.extend_from_slice(symbols.resolve(*s)); - } + let path = render_symbol_path(symbols, &new_path); + if path_is_excluded(&path, exclude_globset) { + continue; } - introduced.push((child.oid.to_owned(), BString::from(buf))); + *num_blobs_introduced += 1; + introduced.push((child.oid.to_owned(), path)); } } } diff --git a/src/git_repo_enumerator.rs b/src/git_repo_enumerator.rs index df974dd..d03f752 100644 --- a/src/git_repo_enumerator.rs +++ b/src/git_repo_enumerator.rs @@ -87,10 +87,9 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { let mut metadata_graph = GitMetadataGraph::with_capacity(object_index.num_commits()); let mut scratch = Vec::with_capacity(4 * 1024 * 1024); - let mut commit_metadata = - HashMap::with_capacity_and_hasher(object_index.num_commits(), Default::default()); - // Collect commit metadata and build commit graph + // Build commit graph first; materialize committer metadata only for commits that + // actually introduce blobs. for commit_oid in object_index.commits() { let commit = match odb.find_commit(commit_oid, &mut scratch) { Ok(commit) => commit, @@ -113,25 +112,16 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { let parent_idx = metadata_graph.get_commit_idx(parent_oid, None); metadata_graph.add_commit_edge(parent_idx, commit_idx); } - - let committer = &commit.committer; - // let author = &commit.author; - - commit_metadata.insert( - *commit_oid, - Arc::new(CommitMetadata { - commit_id: *commit_oid, - committer_name: String::from_utf8_lossy(&committer.name).into_owned(), - committer_email: String::from_utf8_lossy(&committer.email).into_owned(), - committer_timestamp: parse_sig_time(committer.time), - }), - ); } debug!("Built metadata graph in {:.6}s", started.elapsed().as_secs_f64()); // Compute metadata once, then get all blob IDs (in pack-ascending order) - let meta_result = metadata_graph.get_repo_metadata(&object_index, &self.repo); + let meta_result = metadata_graph.get_repo_metadata( + &object_index, + &self.repo, + self.exclude_globset.as_deref(), + ); let all_blobs = object_index.into_blobs(); // Assemble final blob list, preserving pack-ascending order for I/O locality @@ -144,22 +134,44 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { .collect() } Ok(metadata) => { + let mut commit_metadata: HashMap> = + HashMap::with_capacity_and_hasher(0, Default::default()); let mut blob_appearances: HashMap> = HashMap::with_capacity_and_hasher(all_blobs.len(), Default::default()); for e in metadata { - let cm = match commit_metadata.get(&e.commit_oid) { - Some(cm) => cm, - None => { - debug!("Missing commit metadata for {}", e.commit_oid); - continue; - } + if e.introduced_blobs.is_empty() { + continue; + } + + let cm = if let Some(cm) = commit_metadata.get(&e.commit_oid) { + cm.clone() + } else { + let commit = match odb.find_commit(&e.commit_oid, &mut scratch) { + Ok(commit) => commit, + Err(err) => { + debug!( + "Failed to load commit metadata for {}: {err}", + e.commit_oid + ); + continue; + } + }; + let committer = &commit.committer; + let parsed = Arc::new(CommitMetadata { + commit_id: e.commit_oid, + committer_name: String::from_utf8_lossy(&committer.name).into_owned(), + committer_email: String::from_utf8_lossy(&committer.email).into_owned(), + committer_timestamp: parse_sig_time(committer.time), + }); + commit_metadata.insert(e.commit_oid, Arc::clone(&parsed)); + parsed }; for (blob_oid, path) in e.introduced_blobs { blob_appearances .entry(blob_oid) .or_default() - .push(BlobAppearance { commit_metadata: cm.clone(), path }); + .push(BlobAppearance { commit_metadata: Arc::clone(&cm), path }); } } @@ -167,8 +179,7 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { all_blobs .into_iter() .filter_map(|blob_oid| { - let appearances = - blob_appearances.remove(&blob_oid).unwrap_or_default(); + let appearances = blob_appearances.remove(&blob_oid).unwrap_or_default(); if appearances.is_empty() { return Some(GitBlobMetadata { blob_oid, first_seen: appearances }); } diff --git a/src/pyc.rs b/src/pyc.rs index 23aa4e0..d7ae6fd 100644 --- a/src/pyc.rs +++ b/src/pyc.rs @@ -364,19 +364,13 @@ pub fn extract_pyc_strings(path: &Path) -> Result> { let (header_size, code_format) = match pyc_version_info(magic) { Some(info) => info, None => { - debug!( - "unsupported .pyc magic number {magic} in {}, skipping", - path.display() - ); + debug!("unsupported .pyc magic number {magic} in {}, skipping", path.display()); return Ok(Vec::new()); } }; if data.len() < header_size { - bail!( - ".pyc header requires {header_size} bytes but file is only {} bytes", - data.len() - ); + bail!(".pyc header requires {header_size} bytes but file is only {} bytes", data.len()); } let marshal_data = &data[header_size..]; @@ -496,7 +490,7 @@ mod tests { buf.extend_from_slice(&marshal_small_tuple(&[])); // cellvars buf.extend_from_slice(&marshal_short_ascii("")); // filename buf.extend_from_slice(&marshal_short_ascii("")); // name - // firstlineno + // firstlineno buf.extend_from_slice(&1i32.to_le_bytes()); // 1 trailing object: lnotab buf.extend_from_slice(&marshal_string(b"")); @@ -552,10 +546,8 @@ mod tests { #[test] fn extracts_strings_from_code_object() { let mut data = make_pyc_header(3413, 16); - let consts = marshal_small_tuple(&[ - marshal_none(), - marshal_short_ascii("ghp_abc123def456"), - ]); + let consts = + marshal_small_tuple(&[marshal_none(), marshal_short_ascii("ghp_abc123def456")]); let names = marshal_small_tuple(&[marshal_short_ascii("api_key")]); let code = marshal_code_38(consts, names); data.extend_from_slice(&code); @@ -666,13 +658,8 @@ mod tests { #[test] fn handles_nested_tuples() { let mut data = make_pyc_header(3413, 16); - let inner = marshal_small_tuple(&[ - marshal_short_ascii("inner_secret"), - ]); - let outer = marshal_small_tuple(&[ - marshal_short_ascii("outer"), - inner, - ]); + let inner = marshal_small_tuple(&[marshal_short_ascii("inner_secret")]); + let outer = marshal_small_tuple(&[marshal_short_ascii("outer"), inner]); data.extend_from_slice(&outer); let tmp = tempfile::NamedTempFile::new().unwrap(); std::fs::write(tmp.path(), &data).unwrap(); @@ -711,7 +698,7 @@ mod tests { buf.extend_from_slice(&marshal_short_ascii("")); // filename buf.extend_from_slice(&marshal_short_ascii("")); // name buf.extend_from_slice(&marshal_short_ascii("")); // qualname - // firstlineno + // firstlineno buf.extend_from_slice(&1i32.to_le_bytes()); // 2 trailing objects: linetable, exceptiontable buf.extend_from_slice(&marshal_string(b"")); @@ -722,10 +709,8 @@ mod tests { #[test] fn extracts_strings_from_code_object_v313() { let mut data = make_pyc_header(3627, 16); // Python 3.14 - let consts = marshal_small_tuple(&[ - marshal_none(), - marshal_short_ascii("sk-proj-ABCDEF123456"), - ]); + let consts = + marshal_small_tuple(&[marshal_none(), marshal_short_ascii("sk-proj-ABCDEF123456")]); let names = marshal_small_tuple(&[marshal_short_ascii("openai_key")]); let code = marshal_code_313(consts, names); data.extend_from_slice(&code); @@ -733,10 +718,7 @@ mod tests { std::fs::write(tmp.path(), &data).unwrap(); let result = extract_pyc_strings(tmp.path()).unwrap(); let result_str = String::from_utf8_lossy(&result); - assert!( - result_str.contains("sk-proj-ABCDEF123456"), - "missing secret from consts" - ); + assert!(result_str.contains("sk-proj-ABCDEF123456"), "missing secret from consts"); assert!(result_str.contains("openai_key"), "missing name"); assert!(result_str.contains(""), "missing filename"); } diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index e964483..01075db 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -34,6 +34,7 @@ use crate::{ matcher::{Matcher, MatcherStats}, open_git_repo_with_options, origin::{Origin, OriginSet}, + pyc::extract_pyc_strings, rule_profiling::ConcurrentRuleProfiler, rules_database::RulesDatabase, scanner::{ @@ -42,7 +43,6 @@ use crate::{ util::{is_compressed_file, is_pyc_file, is_sqlite_file}, }, scanner_pool::ScannerPool, - pyc::extract_pyc_strings, sqlite::extract_sqlite_contents, DirectoryResult, EnumeratorConfig, EnumeratorFileResult, FileResult, FilesystemEnumerator, FoundInput, GitDiffConfig, GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator, @@ -584,9 +584,7 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> { .name("odb_enumerator".to_string()) .spawn(move || { use gix::{ - object::Kind, - odb::store::iter::Ordering as OdbOrdering, - prelude::*, + object::Kind, odb::store::iter::Ordering as OdbOrdering, prelude::*, }; let repo = enum_repo_sync.to_thread_local(); let odb = &repo.objects; @@ -594,9 +592,9 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> { Ok(i) => i, Err(_) => return, }; - for oid_result in iter.with_ordering( - OdbOrdering::PackAscendingOffsetThenLooseLexicographical, - ) { + for oid_result in iter + .with_ordering(OdbOrdering::PackAscendingOffsetThenLooseLexicographical) + { let oid = match oid_result { Ok(oid) => oid, Err(_) => continue, @@ -605,9 +603,7 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> { Ok(hdr) => hdr, Err(_) => continue, }; - if hdr.kind() == Kind::Blob - && hdr.size() >= MIN_SCANNABLE_BLOB_SIZE - { + if hdr.kind() == Kind::Blob && hdr.size() >= MIN_SCANNABLE_BLOB_SIZE { let md = GitBlobMetadata { blob_oid: oid, first_seen: Default::default(), @@ -1013,7 +1009,11 @@ fn enumerate_git_diff_repo( blobs }; - Ok(GitRepoResult { repository, path: path.to_owned(), blobs: GitBlobSource::Precomputed(blobs) }) + Ok(GitRepoResult { + repository, + path: path.to_owned(), + blobs: GitBlobSource::Precomputed(blobs), + }) } fn synthesize_staged_commit(path: &Path, parent_ref: &str) -> Result { diff --git a/src/sqlite.rs b/src/sqlite.rs index 3fc1b8e..c2b968f 100644 --- a/src/sqlite.rs +++ b/src/sqlite.rs @@ -46,11 +46,7 @@ pub fn extract_sqlite_contents(path: &Path) -> Result)>> { results.push((logical_name, sql_text.into_bytes())); } Err(e) => { - debug!( - "Failed to dump table '{}' from {}: {e:#}", - table_name, - path.display() - ); + debug!("Failed to dump table '{}' from {}: {e:#}", table_name, path.display()); } } } @@ -136,10 +132,7 @@ fn dump_table( } fn column_names(conn: &Connection, table_name: &str) -> Result> { - let query = format!( - "PRAGMA table_info(\"{}\")", - table_name.replace('"', "\"\"") - ); + let query = format!("PRAGMA table_info(\"{}\")", table_name.replace('"', "\"\"")); let mut stmt = conn.prepare(&query)?; let names = stmt .query_map([], |row| { @@ -207,10 +200,7 @@ mod tests { let (_tmp, path) = create_test_db(); let results = extract_sqlite_contents(&path).unwrap(); - let user_info = results - .iter() - .find(|(n, _)| n == "user_info.sql") - .unwrap(); + let user_info = results.iter().find(|(n, _)| n == "user_info.sql").unwrap(); let sql = String::from_utf8_lossy(&user_info.1); assert!(sql.contains("CREATE TABLE")); @@ -224,8 +214,7 @@ mod tests { let tmp = NamedTempFile::new().unwrap(); let path = tmp.path().to_path_buf(); let conn = Connection::open(&path).unwrap(); - conn.execute_batch("CREATE TABLE empty_table (id INTEGER);") - .unwrap(); + conn.execute_batch("CREATE TABLE empty_table (id INTEGER);").unwrap(); let results = extract_sqlite_contents(&path).unwrap(); assert_eq!(results.len(), 1);