diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e99583..8aaf007 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ All notable changes to this project will be documented in this file. +## [1.51.0] +- Added diff-only Git scanning via `--since-commit` and `--branch`, including remote-aware ref resolution so CI jobs can pair `--git-url` clones with pull request branches + ## [1.50.0] - Added `--github-exclude` and `--gitlab-exclude` options to skip specific repositories when scanning or listing GitHub and GitLab sources, including support for gitignore-style glob patterns diff --git a/Cargo.toml b/Cargo.toml index fd82763..64665aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.50.0" +version = "1.51.0" description = "MongoDB's blazingly fast secret scanning and validation tool" edition.workspace = true rust-version.workspace = true diff --git a/README.md b/README.md index 2338597..e042806 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [Display rule performance statistics](#display-rule-performance-statistics) - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files) - [Exclude specific paths](#exclude-specific-paths) + - [Scan changes in CI pipelines](#scan-changes-in-ci-pipelines) - [Scan an S3 bucket](#scan-an-s3-bucket) - [Scanning Docker Images](#scanning-docker-images) - [Scanning GitHub](#scanning-github) @@ -331,6 +332,36 @@ kingfisher scan ./my-project \ --exclude '[Tt]ests' ``` +### Scan changes in CI pipelines + +Limit scanning to the delta between your default branch and a pull request branch by combining `--since-commit` with `--branch` (defaults to `HEAD`). This only scans files that differ between the two references, which keeps CI runs fast while still blocking new secrets. + +```bash +kingfisher scan . \ + --since-commit origin/main \ + --branch "$CI_BRANCH" +``` + +When the branch under test is already checked out, `--branch HEAD` or omitting `--branch` entirely is sufficient. Kingfisher exits with `200` when any findings are discovered and `205` when validated secrets are present, allowing CI jobs to fail automatically if new credentials slip in. + +The same diff-focused workflow works when cloning repositories on the fly with `--git-url`. Kingfisher automatically tries remote-tracking names like `origin/main` and `origin/feature-1`, so you can target the branches involved in a pull request without performing a local checkout first. + +```bash +kingfisher scan \ + --git-url https://github.com/org/repo.git \ + --since-commit main \ + --branch development +``` + +In CI systems that expose the base and head commits explicitly, you can pass those SHAs directly while still using `--git-url`: + +```bash +kingfisher scan \ + --git-url git@github.com:org/repo.git \ + --since-commit "$BASE_COMMIT" \ + --branch "$PR_HEAD_COMMIT" +``` + If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude): ```bash diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs index 77fc192..6b8cb4b 100644 --- a/src/cli/commands/inputs.rs +++ b/src/cli/commands/inputs.rs @@ -174,6 +174,14 @@ pub struct InputSpecifierArgs { /// Enable or disable scanning nested git repositories #[arg(long, default_value_t = true)] pub scan_nested_repos: bool, + + /// Limit Git scanning to changes made since this commit or ref + #[arg(long = "since-commit", value_name = "GIT-REF", help_heading = "Git Options")] + pub since_commit: Option, + + /// Branch or ref containing changes to scan (defaults to HEAD) + #[arg(long, value_name = "GIT-REF", requires = "since_commit", help_heading = "Git Options")] + pub branch: Option, } // ----------------------------------------------------------------------------- diff --git a/src/lib.rs b/src/lib.rs index 76f7f86..18335c6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -52,11 +52,18 @@ use ignore::{DirEntry, WalkBuilder, WalkState}; use tokio::time::Duration; use tracing::debug; +#[derive(Clone)] +pub struct GitDiffConfig { + pub since_ref: String, + pub branch_ref: Option, +} + struct EnumeratorConfig { enumerate_git_history: bool, collect_git_metadata: bool, repo_scan_timeout: Duration, exclude_globset: Option>, + git_diff: Option, } pub enum FoundInput { diff --git a/src/main.rs b/src/main.rs index cc15c6c..d44019f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -320,6 +320,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, + since_commit: None, + branch: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index e6074be..79a1bc5 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -112,6 +112,8 @@ mod tests { commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, + since_commit: None, + branch: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index 970ca5c..df56427 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -1,5 +1,6 @@ use std::{ marker::PhantomData, + path::Path, sync::{ atomic::{AtomicBool, Ordering}, Arc, Mutex, @@ -7,10 +8,10 @@ use std::{ time::{Duration, Instant as StdInstant, Instant}, }; -use anyhow::{bail, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use base64::{engine::general_purpose::STANDARD, Engine}; -use bstr::BString; -use gix::Repository as GixRepo; +use bstr::{BString, ByteSlice}; +use gix::{object::tree::diff::ChangeDetached, object::tree::EntryKind, Repository as GixRepo}; use indicatif::{ProgressBar, ProgressStyle}; use rayon::{ iter::plumbing::Folder, @@ -19,12 +20,16 @@ use rayon::{ use serde::{Deserialize, Deserializer}; use tracing::{debug, error}; +use smallvec::smallvec; + use crate::{ binary::is_binary, - blob::{Blob, BlobId, BlobIdMap}, + blob::{Blob, BlobAppearance, BlobId, BlobIdMap}, cli::commands::{github::GitHistoryMode, scan}, decompress::{decompress_file_to_temp, CompressedContent}, findings_store, + git_commit_metadata::CommitMetadata, + git_repo_enumerator::GitBlobMetadata, matcher::{Matcher, MatcherStats}, open_git_repo, origin::{Origin, OriginSet}, @@ -36,8 +41,9 @@ use crate::{ util::is_compressed_file, }, scanner_pool::ScannerPool, - EnumeratorConfig, EnumeratorFileResult, FileResult, FilesystemEnumerator, FoundInput, - GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator, PathBuf, + DirectoryResult, EnumeratorConfig, EnumeratorFileResult, FileResult, FilesystemEnumerator, + FoundInput, GitDiffConfig, GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator, + PathBuf, }; type OwnedBlob = Blob<'static>; @@ -54,6 +60,11 @@ pub fn enumerate_filesystem_inputs( ) -> Result<()> { let repo_scan_timeout = Duration::from_secs(args.git_repo_timeout); + let diff_config = args.input_specifier_args.since_commit.as_ref().map(|since| GitDiffConfig { + since_ref: since.clone(), + branch_ref: args.input_specifier_args.branch.clone(), + }); + let progress = if progress_enabled { let style = ProgressStyle::with_template("{spinner} {msg} {total_bytes} [{elapsed_precise}]") @@ -81,16 +92,24 @@ pub fn enumerate_filesystem_inputs( .context("Failed to initialize filesystem enumerator")?; let (enum_thread, input_recv, exclude_globset) = { - let fs_enumerator = make_fs_enumerator(args, input_roots.into()) + let fs_enumerator = make_fs_enumerator(args, input_roots.to_vec()) .context("Failed to initialize filesystem enumerator")?; let exclude_globset = fs_enumerator.as_ref().and_then(|ie| ie.exclude_globset()); let channel_size = std::cmp::max(args.num_jobs * 128, 1024); let (input_send, input_recv) = crossbeam_channel::bounded(channel_size); + let diff_config_for_thread = diff_config.clone(); + let roots_for_thread = input_roots.to_vec(); let input_enumerator_thread = std::thread::Builder::new() .name("input_enumerator".to_string()) .spawn(move || -> Result<_> { - if let Some(fs_enumerator) = fs_enumerator { + if diff_config_for_thread.is_some() { + for root in roots_for_thread { + input_send + .send(FoundInput::Directory(DirectoryResult { path: root })) + .context("Failed to queue repository for scanning")?; + } + } else if let Some(fs_enumerator) = fs_enumerator { fs_enumerator.run(input_send.clone())?; } Ok(()) @@ -106,7 +125,8 @@ pub fn enumerate_filesystem_inputs( }, collect_git_metadata: args.input_specifier_args.commit_metadata, repo_scan_timeout, - exclude_globset, + exclude_globset: exclude_globset.clone(), + git_diff: diff_config.clone(), }; let (send_ds, recv_ds) = create_datastore_channel(args.num_jobs); let datastore_writer_thread = @@ -578,7 +598,7 @@ impl<'cfg> ParallelBlobIterator for (&'cfg EnumeratorConfig, FoundInput) { FoundInput::Directory(i) => { let path = &i.path; - if !cfg.enumerate_git_history { + if cfg.git_diff.is_none() && !cfg.enumerate_git_history { return Ok(None); } @@ -597,8 +617,17 @@ impl<'cfg> ParallelBlobIterator for (&'cfg EnumeratorConfig, FoundInput) { let path_clone = path.to_path_buf(); let (tx, rx) = std::sync::mpsc::channel(); let exclude_globset = cfg.exclude_globset.clone(); + let diff_cfg = cfg.git_diff.clone(); let handle = std::thread::spawn(move || { - let res = if collect_git_metadata { + let res = if let Some(diff_cfg) = diff_cfg { + enumerate_git_diff_repo( + &path_clone, + repository, + diff_cfg, + exclude_globset.clone(), + collect_git_metadata, + ) + } else if collect_git_metadata { GitRepoWithMetadataEnumerator::new( &path_clone, repository, @@ -671,6 +700,247 @@ impl<'cfg> ParallelBlobIterator for (&'cfg EnumeratorConfig, FoundInput) { } } +fn enumerate_git_diff_repo( + path: &Path, + repository: gix::Repository, + diff_cfg: GitDiffConfig, + exclude_globset: Option>, + collect_commit_metadata: bool, +) -> Result { + let since_ref = diff_cfg.since_ref.clone(); + let branch_ref = diff_cfg.branch_ref.clone().unwrap_or_else(|| "HEAD".to_string()); + + let base_id = resolve_diff_ref(&repository, path, &since_ref).with_context(|| { + format!("Failed to resolve --since-commit '{}' in repository {}", since_ref, path.display()) + })?; + let head_id = resolve_diff_ref(&repository, path, &branch_ref).with_context(|| { + format!("Failed to resolve --branch '{}' in repository {}", branch_ref, path.display()) + })?; + + let base_commit = base_id + .object() + .with_context(|| format!("Failed to load commit {} for diffing", base_id.to_hex()))? + .try_into_commit() + .with_context(|| format!("Referenced object {} is not a commit", base_id.to_hex()))?; + let head_commit = head_id + .object() + .with_context(|| format!("Failed to load commit {} for diffing", head_id.to_hex()))? + .try_into_commit() + .with_context(|| format!("Referenced object {} is not a commit", head_id.to_hex()))?; + + let base_tree = base_commit + .tree() + .with_context(|| format!("Failed to read tree for commit {}", base_id.to_hex()))?; + let head_tree = head_commit + .tree() + .with_context(|| format!("Failed to read tree for commit {}", head_id.to_hex()))?; + + let changes = + repository.diff_tree_to_tree(Some(&base_tree), Some(&head_tree), None).with_context( + || format!("Failed to compute diff between '{}' and '{}'", since_ref, branch_ref), + )?; + + // Release tree handles before returning the repository to avoid borrow check conflicts. + drop(base_tree); + drop(head_tree); + + let commit_metadata = if collect_commit_metadata { + let committer = head_commit + .committer() + .with_context(|| format!("Failed to read committer for {}", branch_ref))? + .trim(); + let timestamp = committer.time().unwrap_or_else(|_| gix::date::Time::new(0, 0)); + Arc::new(CommitMetadata { + commit_id: head_commit.id, + committer_name: committer.name.to_str_lossy().into_owned(), + committer_email: committer.email.to_str_lossy().into_owned(), + committer_timestamp: timestamp, + }) + } else { + Arc::new(CommitMetadata { + commit_id: head_commit.id, + committer_name: String::new(), + committer_email: String::new(), + committer_timestamp: gix::date::Time::new(0, 0), + }) + }; + + let mut blobs = Vec::new(); + for change in changes { + let (entry_mode, id, location) = match change { + ChangeDetached::Addition { entry_mode, id, location, .. } => (entry_mode, id, location), + ChangeDetached::Modification { entry_mode, id, location, .. } => { + (entry_mode, id, location) + } + ChangeDetached::Rewrite { entry_mode, id, location, .. } => (entry_mode, id, location), + ChangeDetached::Deletion { .. } => continue, + }; + + match entry_mode.kind() { + EntryKind::Blob | EntryKind::BlobExecutable | EntryKind::Link => {} + _ => continue, + } + + let relative_path_str = String::from_utf8_lossy(location.as_ref()).into_owned(); + let relative_path = Path::new(&relative_path_str); + if let Some(gs) = &exclude_globset { + if gs.is_match(relative_path) || gs.is_match(&path.join(relative_path)) { + debug!( + "Skipping {} due to --exclude while diffing {}", + relative_path.display(), + path.display() + ); + continue; + } + } + + let appearance = + BlobAppearance { commit_metadata: Arc::clone(&commit_metadata), path: location }; + blobs.push(GitBlobMetadata { blob_oid: id, first_seen: smallvec![appearance] }); + } + + // Release commit handles before returning the repository to avoid borrow check conflicts. + drop(base_commit); + drop(head_commit); + + Ok(GitRepoResult { repository, path: path.to_owned(), blobs }) +} + +fn resolve_diff_ref<'repo>( + repository: &'repo gix::Repository, + path: &Path, + reference: &str, +) -> Result> { + let mut candidates = reference_candidates(reference); + if candidates.is_empty() { + candidates.push(reference.to_string()); + } + + let mut last_err: Option = None; + for candidate in &candidates { + match repository.rev_parse_single(candidate.as_bytes()) { + Ok(id) => return Ok(id), + Err(err) => last_err = Some(err.into()), + } + } + + let attempted = candidates.join(", "); + let err = last_err.unwrap_or_else(|| { + anyhow!("Reference resolution failed for '{}' without a more specific error", reference) + }); + Err(err).with_context(|| { + if attempted.is_empty() { + format!("Failed to resolve reference '{}' in repository {}", reference, path.display()) + } else { + format!( + "Failed to resolve reference '{}' in repository {} (tried: {})", + reference, + path.display(), + attempted + ) + } + }) +} + +fn reference_candidates(reference: &str) -> Vec { + fn push_unique(vec: &mut Vec, candidate: String) { + if !vec.iter().any(|existing| existing == &candidate) { + vec.push(candidate); + } + } + + let trimmed = reference.trim(); + if trimmed.is_empty() { + return Vec::new(); + } + + let mut candidates = Vec::new(); + push_unique(&mut candidates, trimmed.to_string()); + + if trimmed.eq_ignore_ascii_case("HEAD") { + return candidates; + } + + if trimmed.starts_with("refs/") { + return candidates; + } + + push_unique(&mut candidates, format!("refs/heads/{trimmed}")); + push_unique(&mut candidates, format!("refs/tags/{trimmed}")); + + if let Some((remote, rest)) = trimmed.split_once('/') { + if remote == "origin" { + if !rest.is_empty() { + push_unique(&mut candidates, format!("refs/remotes/{remote}/{rest}")); + } + } else if !rest.is_empty() { + push_unique(&mut candidates, format!("refs/remotes/origin/{trimmed}")); + push_unique(&mut candidates, format!("refs/remotes/{remote}/{rest}")); + } + } else { + push_unique(&mut candidates, format!("origin/{trimmed}")); + push_unique(&mut candidates, format!("refs/remotes/origin/{trimmed}")); + } + + candidates +} + +#[cfg(test)] +mod tests { + use super::reference_candidates; + + #[test] + fn reference_candidates_for_plain_branch() { + assert_eq!( + reference_candidates("main"), + vec![ + "main".to_string(), + "refs/heads/main".to_string(), + "refs/tags/main".to_string(), + "origin/main".to_string(), + "refs/remotes/origin/main".to_string(), + ] + ); + } + + #[test] + fn reference_candidates_for_remote_branch() { + assert_eq!( + reference_candidates("origin/feature"), + vec![ + "origin/feature".to_string(), + "refs/heads/origin/feature".to_string(), + "refs/tags/origin/feature".to_string(), + "refs/remotes/origin/feature".to_string(), + ] + ); + } + + #[test] + fn reference_candidates_for_branch_with_path() { + assert_eq!( + reference_candidates("feature/foo"), + vec![ + "feature/foo".to_string(), + "refs/heads/feature/foo".to_string(), + "refs/tags/feature/foo".to_string(), + "refs/remotes/origin/feature/foo".to_string(), + "refs/remotes/feature/foo".to_string(), + ] + ); + } + + #[test] + fn reference_candidates_for_explicit_ref() { + assert_eq!(reference_candidates("refs/heads/main"), vec!["refs/heads/main".to_string()]); + } + + #[test] + fn reference_candidates_for_head_symbol() { + assert_eq!(reference_candidates("HEAD"), vec!["HEAD".to_string()]); + } +} + /// A simple enum describing how we yield file content: /// - Single: one `(origin, blob)` /// - Archive: multiple `(origin, blob)` items from a decompressed archive diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs index ecc53ec..cf4e456 100644 --- a/tests/int_allowlist.rs +++ b/tests/int_allowlist.rs @@ -86,6 +86,8 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result Result<()> { commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, + since_commit: None, + branch: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index b048864..cc6c5e7 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -87,6 +87,8 @@ fn test_gitlab_remote_scan() -> Result<()> { commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, + since_commit: None, + branch: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, @@ -197,6 +199,8 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, + since_commit: None, + branch: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/tests/int_redact.rs b/tests/int_redact.rs index 2248d2d..2fa8bcb 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -69,6 +69,8 @@ async fn test_redact_hashes_finding_values() -> Result<()> { commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, + since_commit: None, + branch: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/tests/int_slack.rs b/tests/int_slack.rs index 265fe06..0b96150 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -75,6 +75,8 @@ impl TestContext { commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, + since_commit: None, + branch: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, @@ -175,6 +177,8 @@ async fn test_scan_slack_messages() -> Result<()> { commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, + since_commit: None, + branch: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index ba1f5a7..b468b0e 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -145,6 +145,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> { commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, + since_commit: None, + branch: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 7eaa147..db50f94 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -88,6 +88,8 @@ impl TestContext { commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, + since_commit: None, + branch: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, @@ -173,6 +175,8 @@ impl TestContext { commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, + since_commit: None, + branch: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0,