performance improvements

This commit is contained in:
Mick Grove 2026-02-23 23:14:39 -07:00
commit 4905ace028
6 changed files with 148 additions and 130 deletions

View file

@ -9,6 +9,9 @@ All notable changes to this project will be documented in this file.
- Performance: pipelined ODB enumeration — scanning now begins while blob OIDs are still being discovered, overlapping I/O with pattern matching.
- Performance: skip blobs smaller than 20 bytes during enumeration (too small to contain any secret).
- Performance: preserve pack-ascending blob order in the metadata path for better I/O locality when Rayon splits work.
- Performance: defer Git committer metadata materialization until commits actually introduce scannable blobs, reducing unnecessary string/time parsing work.
- Performance: push `--exclude` filtering into Git tree traversal so excluded paths/subtrees are pruned before blob-introduction bookkeeping.
- Performance: make Git repository object indexing single-pass (removed the extra ODB scan in `RepositoryIndex::new`).
## [v1.84.0]
- Added/updated `pipedrive` and `amplitude` rules

View file

@ -1,7 +1,7 @@
use std::{collections::BinaryHeap, time::Instant};
use anyhow::{bail, Context, Result};
use bstr::BString;
use bstr::{BString, ByteSlice};
use fixedbitset::FixedBitSet;
use gix::{
hashtable::{hash_map, HashMap},
@ -10,6 +10,7 @@ use gix::{
prelude::*,
ObjectId, OdbHandle,
};
use globset::GlobSet;
use crate::git_repo_enumerator::MIN_SCANNABLE_BLOB_SIZE;
use petgraph::{
@ -137,43 +138,11 @@ pub(crate) struct RepositoryIndex {
impl RepositoryIndex {
pub(crate) fn new(odb: &OdbHandle) -> Result<Self> {
use gix::{odb::store::iter::Ordering, prelude::*};
let mut num_tags = 0;
let mut num_trees = 0;
let mut num_blobs = 0;
let mut num_commits = 0;
let mut tree_oids = Vec::new();
let mut commit_oids = Vec::new();
let mut blob_oids = Vec::new();
let mut tag_oids = Vec::new();
for oid_result in odb
.iter()
.context("Failed to iterate object database")?
.with_ordering(Ordering::PackLexicographicalThenLooseLexicographical)
{
let oid = match oid_result {
Ok(oid) => oid,
Err(e) => {
debug!("Failed to read object id: {e}");
continue;
}
};
let hdr = match odb.header(oid) {
Ok(hdr) => hdr,
Err(e) => {
debug!("Failed to read object header for {oid}: {e}");
continue;
}
};
match hdr.kind() {
Kind::Tree => num_trees += 1,
Kind::Blob if hdr.size() >= MIN_SCANNABLE_BLOB_SIZE => num_blobs += 1,
Kind::Blob => {}
Kind::Commit => num_commits += 1,
Kind::Tag => num_tags += 1,
}
}
let mut trees = ObjectIdBimap::with_capacity(num_trees);
let mut commits = ObjectIdBimap::with_capacity(num_commits);
let mut blobs = ObjectIdBimap::with_capacity(num_blobs);
let mut tags = ObjectIdBimap::with_capacity(num_tags);
for oid_result in odb
.iter()
.context("Failed to iterate object database")?
@ -194,13 +163,32 @@ impl RepositoryIndex {
}
};
match hdr.kind() {
Kind::Tree => trees.insert(oid),
Kind::Blob if hdr.size() >= MIN_SCANNABLE_BLOB_SIZE => blobs.insert(oid),
Kind::Tree => tree_oids.push(oid),
Kind::Blob if hdr.size() >= MIN_SCANNABLE_BLOB_SIZE => blob_oids.push(oid),
Kind::Blob => {}
Kind::Commit => commits.insert(oid),
Kind::Tag => tags.insert(oid),
Kind::Commit => commit_oids.push(oid),
Kind::Tag => tag_oids.push(oid),
}
}
let mut trees = ObjectIdBimap::with_capacity(tree_oids.len());
let mut commits = ObjectIdBimap::with_capacity(commit_oids.len());
let mut blobs = ObjectIdBimap::with_capacity(blob_oids.len());
let mut tags = ObjectIdBimap::with_capacity(tag_oids.len());
for oid in tree_oids {
trees.insert(oid);
}
for oid in commit_oids {
commits.insert(oid);
}
for oid in blob_oids {
blobs.insert(oid);
}
for oid in tag_oids {
tags.insert(oid);
}
Ok(Self { trees, commits, blobs, tags })
}
pub(crate) fn num_commits(&self) -> usize {
@ -293,6 +281,7 @@ impl GitMetadataGraph {
self,
repo_index: &RepositoryIndex,
repo: &gix::Repository,
exclude_globset: Option<&GlobSet>,
) -> Result<Vec<CommitBlobMetadata>> {
let _span =
error_span!("get_repo_metadata", path = repo.path().display().to_string()).entered();
@ -340,6 +329,7 @@ impl GitMetadataGraph {
visit_tree(
repo,
&mut symbols,
exclude_globset,
repo_index,
&mut num_trees_introduced,
&mut num_blobs_introduced,
@ -409,10 +399,51 @@ impl GitMetadataGraph {
}
}
#[inline]
fn path_is_excluded(path: &BString, exclude_globset: Option<&GlobSet>) -> bool {
let Some(gs) = exclude_globset else {
return false;
};
match path.to_path() {
Ok(p) => gs.is_match(p),
Err(_) => false,
}
}
#[inline]
fn tree_path_is_excluded(path: &BString, exclude_globset: Option<&GlobSet>) -> bool {
if path_is_excluded(path, exclude_globset) {
return true;
}
let Some(gs) = exclude_globset else {
return false;
};
let mut dir_path = path.clone();
dir_path.push(b'/');
match dir_path.to_path() {
Ok(p) => gs.is_match(p),
Err(_) => false,
}
}
#[inline]
fn render_symbol_path(symbols: &BStringTable, path: &[Symbol]) -> BString {
let mut buf = Vec::new();
if let Some(first) = path.first() {
buf.extend_from_slice(symbols.resolve(*first));
for s in &path[1..] {
buf.push(b'/');
buf.extend_from_slice(symbols.resolve(*s));
}
}
BString::from(buf)
}
#[allow(clippy::too_many_arguments)]
fn visit_tree(
repo: &gix::Repository,
symbols: &mut BStringTable,
exclude_globset: Option<&GlobSet>,
repo_index: &RepositoryIndex,
num_trees_introduced: &mut usize,
num_blobs_introduced: &mut usize,
@ -450,6 +481,12 @@ fn visit_tree(
if seen.insert_tree(child_idx)? {
let mut new_path = name_path.clone();
new_path.push(symbols.get_or_intern(child.filename.into()));
if exclude_globset.is_some() {
let path = render_symbol_path(symbols, &new_path);
if tree_path_is_excluded(&path, exclude_globset) {
continue;
}
}
tree_worklist.push((new_path, child.oid.to_owned()));
}
}
@ -460,18 +497,14 @@ fn visit_tree(
};
if !seen.contains_blob(child_idx)? {
blobs_encountered.push(child_idx);
*num_blobs_introduced += 1;
let mut new_path = name_path.clone();
new_path.push(symbols.get_or_intern(child.filename.into()));
let mut buf = Vec::new();
if let Some(first) = new_path.first() {
buf.extend_from_slice(symbols.resolve(*first));
for s in &new_path[1..] {
buf.push(b'/');
buf.extend_from_slice(symbols.resolve(*s));
}
let path = render_symbol_path(symbols, &new_path);
if path_is_excluded(&path, exclude_globset) {
continue;
}
introduced.push((child.oid.to_owned(), BString::from(buf)));
*num_blobs_introduced += 1;
introduced.push((child.oid.to_owned(), path));
}
}
}

View file

@ -87,10 +87,9 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
let mut metadata_graph = GitMetadataGraph::with_capacity(object_index.num_commits());
let mut scratch = Vec::with_capacity(4 * 1024 * 1024);
let mut commit_metadata =
HashMap::with_capacity_and_hasher(object_index.num_commits(), Default::default());
// Collect commit metadata and build commit graph
// Build commit graph first; materialize committer metadata only for commits that
// actually introduce blobs.
for commit_oid in object_index.commits() {
let commit = match odb.find_commit(commit_oid, &mut scratch) {
Ok(commit) => commit,
@ -113,25 +112,16 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
let parent_idx = metadata_graph.get_commit_idx(parent_oid, None);
metadata_graph.add_commit_edge(parent_idx, commit_idx);
}
let committer = &commit.committer;
// let author = &commit.author;
commit_metadata.insert(
*commit_oid,
Arc::new(CommitMetadata {
commit_id: *commit_oid,
committer_name: String::from_utf8_lossy(&committer.name).into_owned(),
committer_email: String::from_utf8_lossy(&committer.email).into_owned(),
committer_timestamp: parse_sig_time(committer.time),
}),
);
}
debug!("Built metadata graph in {:.6}s", started.elapsed().as_secs_f64());
// Compute metadata once, then get all blob IDs (in pack-ascending order)
let meta_result = metadata_graph.get_repo_metadata(&object_index, &self.repo);
let meta_result = metadata_graph.get_repo_metadata(
&object_index,
&self.repo,
self.exclude_globset.as_deref(),
);
let all_blobs = object_index.into_blobs();
// Assemble final blob list, preserving pack-ascending order for I/O locality
@ -144,22 +134,44 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
.collect()
}
Ok(metadata) => {
let mut commit_metadata: HashMap<ObjectId, Arc<CommitMetadata>> =
HashMap::with_capacity_and_hasher(0, Default::default());
let mut blob_appearances: HashMap<ObjectId, SmallVec<_>> =
HashMap::with_capacity_and_hasher(all_blobs.len(), Default::default());
for e in metadata {
let cm = match commit_metadata.get(&e.commit_oid) {
Some(cm) => cm,
None => {
debug!("Missing commit metadata for {}", e.commit_oid);
continue;
}
if e.introduced_blobs.is_empty() {
continue;
}
let cm = if let Some(cm) = commit_metadata.get(&e.commit_oid) {
cm.clone()
} else {
let commit = match odb.find_commit(&e.commit_oid, &mut scratch) {
Ok(commit) => commit,
Err(err) => {
debug!(
"Failed to load commit metadata for {}: {err}",
e.commit_oid
);
continue;
}
};
let committer = &commit.committer;
let parsed = Arc::new(CommitMetadata {
commit_id: e.commit_oid,
committer_name: String::from_utf8_lossy(&committer.name).into_owned(),
committer_email: String::from_utf8_lossy(&committer.email).into_owned(),
committer_timestamp: parse_sig_time(committer.time),
});
commit_metadata.insert(e.commit_oid, Arc::clone(&parsed));
parsed
};
for (blob_oid, path) in e.introduced_blobs {
blob_appearances
.entry(blob_oid)
.or_default()
.push(BlobAppearance { commit_metadata: cm.clone(), path });
.push(BlobAppearance { commit_metadata: Arc::clone(&cm), path });
}
}
@ -167,8 +179,7 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
all_blobs
.into_iter()
.filter_map(|blob_oid| {
let appearances =
blob_appearances.remove(&blob_oid).unwrap_or_default();
let appearances = blob_appearances.remove(&blob_oid).unwrap_or_default();
if appearances.is_empty() {
return Some(GitBlobMetadata { blob_oid, first_seen: appearances });
}

View file

@ -364,19 +364,13 @@ pub fn extract_pyc_strings(path: &Path) -> Result<Vec<u8>> {
let (header_size, code_format) = match pyc_version_info(magic) {
Some(info) => info,
None => {
debug!(
"unsupported .pyc magic number {magic} in {}, skipping",
path.display()
);
debug!("unsupported .pyc magic number {magic} in {}, skipping", path.display());
return Ok(Vec::new());
}
};
if data.len() < header_size {
bail!(
".pyc header requires {header_size} bytes but file is only {} bytes",
data.len()
);
bail!(".pyc header requires {header_size} bytes but file is only {} bytes", data.len());
}
let marshal_data = &data[header_size..];
@ -496,7 +490,7 @@ mod tests {
buf.extend_from_slice(&marshal_small_tuple(&[])); // cellvars
buf.extend_from_slice(&marshal_short_ascii("<test>")); // filename
buf.extend_from_slice(&marshal_short_ascii("<module>")); // name
// firstlineno
// firstlineno
buf.extend_from_slice(&1i32.to_le_bytes());
// 1 trailing object: lnotab
buf.extend_from_slice(&marshal_string(b""));
@ -552,10 +546,8 @@ mod tests {
#[test]
fn extracts_strings_from_code_object() {
let mut data = make_pyc_header(3413, 16);
let consts = marshal_small_tuple(&[
marshal_none(),
marshal_short_ascii("ghp_abc123def456"),
]);
let consts =
marshal_small_tuple(&[marshal_none(), marshal_short_ascii("ghp_abc123def456")]);
let names = marshal_small_tuple(&[marshal_short_ascii("api_key")]);
let code = marshal_code_38(consts, names);
data.extend_from_slice(&code);
@ -666,13 +658,8 @@ mod tests {
#[test]
fn handles_nested_tuples() {
let mut data = make_pyc_header(3413, 16);
let inner = marshal_small_tuple(&[
marshal_short_ascii("inner_secret"),
]);
let outer = marshal_small_tuple(&[
marshal_short_ascii("outer"),
inner,
]);
let inner = marshal_small_tuple(&[marshal_short_ascii("inner_secret")]);
let outer = marshal_small_tuple(&[marshal_short_ascii("outer"), inner]);
data.extend_from_slice(&outer);
let tmp = tempfile::NamedTempFile::new().unwrap();
std::fs::write(tmp.path(), &data).unwrap();
@ -711,7 +698,7 @@ mod tests {
buf.extend_from_slice(&marshal_short_ascii("<test>")); // filename
buf.extend_from_slice(&marshal_short_ascii("<module>")); // name
buf.extend_from_slice(&marshal_short_ascii("<module>")); // qualname
// firstlineno
// firstlineno
buf.extend_from_slice(&1i32.to_le_bytes());
// 2 trailing objects: linetable, exceptiontable
buf.extend_from_slice(&marshal_string(b""));
@ -722,10 +709,8 @@ mod tests {
#[test]
fn extracts_strings_from_code_object_v313() {
let mut data = make_pyc_header(3627, 16); // Python 3.14
let consts = marshal_small_tuple(&[
marshal_none(),
marshal_short_ascii("sk-proj-ABCDEF123456"),
]);
let consts =
marshal_small_tuple(&[marshal_none(), marshal_short_ascii("sk-proj-ABCDEF123456")]);
let names = marshal_small_tuple(&[marshal_short_ascii("openai_key")]);
let code = marshal_code_313(consts, names);
data.extend_from_slice(&code);
@ -733,10 +718,7 @@ mod tests {
std::fs::write(tmp.path(), &data).unwrap();
let result = extract_pyc_strings(tmp.path()).unwrap();
let result_str = String::from_utf8_lossy(&result);
assert!(
result_str.contains("sk-proj-ABCDEF123456"),
"missing secret from consts"
);
assert!(result_str.contains("sk-proj-ABCDEF123456"), "missing secret from consts");
assert!(result_str.contains("openai_key"), "missing name");
assert!(result_str.contains("<test>"), "missing filename");
}

View file

@ -34,6 +34,7 @@ use crate::{
matcher::{Matcher, MatcherStats},
open_git_repo_with_options,
origin::{Origin, OriginSet},
pyc::extract_pyc_strings,
rule_profiling::ConcurrentRuleProfiler,
rules_database::RulesDatabase,
scanner::{
@ -42,7 +43,6 @@ use crate::{
util::{is_compressed_file, is_pyc_file, is_sqlite_file},
},
scanner_pool::ScannerPool,
pyc::extract_pyc_strings,
sqlite::extract_sqlite_contents,
DirectoryResult, EnumeratorConfig, EnumeratorFileResult, FileResult, FilesystemEnumerator,
FoundInput, GitDiffConfig, GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator,
@ -584,9 +584,7 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
.name("odb_enumerator".to_string())
.spawn(move || {
use gix::{
object::Kind,
odb::store::iter::Ordering as OdbOrdering,
prelude::*,
object::Kind, odb::store::iter::Ordering as OdbOrdering, prelude::*,
};
let repo = enum_repo_sync.to_thread_local();
let odb = &repo.objects;
@ -594,9 +592,9 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
Ok(i) => i,
Err(_) => return,
};
for oid_result in iter.with_ordering(
OdbOrdering::PackAscendingOffsetThenLooseLexicographical,
) {
for oid_result in iter
.with_ordering(OdbOrdering::PackAscendingOffsetThenLooseLexicographical)
{
let oid = match oid_result {
Ok(oid) => oid,
Err(_) => continue,
@ -605,9 +603,7 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
Ok(hdr) => hdr,
Err(_) => continue,
};
if hdr.kind() == Kind::Blob
&& hdr.size() >= MIN_SCANNABLE_BLOB_SIZE
{
if hdr.kind() == Kind::Blob && hdr.size() >= MIN_SCANNABLE_BLOB_SIZE {
let md = GitBlobMetadata {
blob_oid: oid,
first_seen: Default::default(),
@ -1013,7 +1009,11 @@ fn enumerate_git_diff_repo(
blobs
};
Ok(GitRepoResult { repository, path: path.to_owned(), blobs: GitBlobSource::Precomputed(blobs) })
Ok(GitRepoResult {
repository,
path: path.to_owned(),
blobs: GitBlobSource::Precomputed(blobs),
})
}
fn synthesize_staged_commit(path: &Path, parent_ref: &str) -> Result<String> {

View file

@ -46,11 +46,7 @@ pub fn extract_sqlite_contents(path: &Path) -> Result<Vec<(String, Vec<u8>)>> {
results.push((logical_name, sql_text.into_bytes()));
}
Err(e) => {
debug!(
"Failed to dump table '{}' from {}: {e:#}",
table_name,
path.display()
);
debug!("Failed to dump table '{}' from {}: {e:#}", table_name, path.display());
}
}
}
@ -136,10 +132,7 @@ fn dump_table(
}
fn column_names(conn: &Connection, table_name: &str) -> Result<Vec<String>> {
let query = format!(
"PRAGMA table_info(\"{}\")",
table_name.replace('"', "\"\"")
);
let query = format!("PRAGMA table_info(\"{}\")", table_name.replace('"', "\"\""));
let mut stmt = conn.prepare(&query)?;
let names = stmt
.query_map([], |row| {
@ -207,10 +200,7 @@ mod tests {
let (_tmp, path) = create_test_db();
let results = extract_sqlite_contents(&path).unwrap();
let user_info = results
.iter()
.find(|(n, _)| n == "user_info.sql")
.unwrap();
let user_info = results.iter().find(|(n, _)| n == "user_info.sql").unwrap();
let sql = String::from_utf8_lossy(&user_info.1);
assert!(sql.contains("CREATE TABLE"));
@ -224,8 +214,7 @@ mod tests {
let tmp = NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
let conn = Connection::open(&path).unwrap();
conn.execute_batch("CREATE TABLE empty_table (id INTEGER);")
.unwrap();
conn.execute_batch("CREATE TABLE empty_table (id INTEGER);").unwrap();
let results = extract_sqlite_contents(&path).unwrap();
assert_eq!(results.len(), 1);