diff --git a/CHANGELOG.md b/CHANGELOG.md index 032379c..fb1ede4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file. - Added tests covering baseline and exclude workflow - Added validation for JWT tokens that checks `exp` and `nbf` claims - JWT validation performs OpenID Connect discovery using the `iss` claim and verifies signatures via JWKS +- Removed `--ignore-tests` argument, because the `--exclude` flag provides more granular functionality ## [1.20.0] diff --git a/README.md b/README.md index d7ae8b6..0797344 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,12 @@ kingfisher scan /path/to/repo --rule-stats ```bash # Scan source but skip likely unit / integration tests -kingfisher scan ./my-project --ignore-tests +kingfisher scan ./my-project \ + --exclude='test' \ + --exclude='spec' \ + --exclude='fixture' \ + --exclude='example' \ + --exclude='sample' ``` ### Exclude specific paths @@ -304,7 +309,6 @@ kingfisher github repos list --organization my-org - `--no-extract-archives`: Do not scan inside archives - `--extraction-depth `: Specifies how deep nested archives should be extracted and scanned (default: 2) - `--redact`: Replaces discovered secrets with a one-way hash for secure output -- `--ignore-tests`: Skip files or directories whose path component contains _test_, _spec_, _fixture_, _example_, or _sample_ (case-insensitive) - `--exclude `: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax) - `--baseline-file `: Ignore matches listed in a baseline YAML file - `--manage-baseline`: Create or update the baseline file with current findings diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 406c896..fe4d5d6 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -88,10 +88,6 @@ pub struct ScanArgs { #[arg(long, default_value_t = false)] pub no_dedup: bool, - /// Ignore matches that appear to come from test files - #[arg(long, default_value_t = false)] - pub ignore_tests: bool, - /// Redact findings values using a secure hash #[arg(long, short = 'r', default_value_t = false)] pub redact: bool, diff --git a/src/git_repo_enumerator.rs b/src/git_repo_enumerator.rs index 77e506e..9b3151e 100644 --- a/src/git_repo_enumerator.rs +++ b/src/git_repo_enumerator.rs @@ -73,11 +73,16 @@ pub struct GitBlobMetadata { pub struct GitRepoWithMetadataEnumerator<'a> { path: &'a Path, repo: Repository, + exclude_globset: Option>, } impl<'a> GitRepoWithMetadataEnumerator<'a> { - pub fn new(path: &'a Path, repo: Repository) -> Self { - Self { path, repo } + pub fn new( + path: &'a Path, + repo: Repository, + exclude_globset: Option>, + ) -> Self { + Self { path, repo, exclude_globset } } pub fn run(self) -> Result { @@ -173,12 +178,19 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { } let filtered = appearances .into_iter() - .filter(|entry| { - // Apply your actual ignore-logic here: - match entry.path.to_path() { - Ok(_path) => true, - Err(_) => true, + .filter(|entry| match entry.path.to_path() { + Ok(p) => { + if let Some(gs) = &self.exclude_globset { + let m = gs.is_match(p); + if m { + debug!("Skipping {} due to --exclude", p.display()); + } + !m + } else { + true + } } + Err(_) => true, }) .collect::>(); if filtered.is_empty() { diff --git a/src/lib.rs b/src/lib.rs index 9f04d46..26703fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -82,7 +82,7 @@ struct EnumeratorConfig { enumerate_git_history: bool, collect_git_metadata: bool, repo_scan_timeout: Duration, - // gitignore: Gitignore, + exclude_globset: Option>, } pub enum FoundInput { @@ -312,6 +312,10 @@ impl FilesystemEnumerator { Ok(self) } + pub fn exclude_globset(&self) -> Option> { + self.exclude_globset.clone() + } + pub fn gitignore(&self) -> Result { Ok(self.gitignore_builder.build()?) } diff --git a/src/main.rs b/src/main.rs index 2567187..44a3ec3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -300,7 +300,6 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { redact: false, git_repo_timeout: 1800, no_dedup: false, - ignore_tests: false, snippet_length: 256, baseline_file: None, manage_baseline: false, diff --git a/src/reporter/pretty_format.rs b/src/reporter/pretty_format.rs index 7a6212c..b9c868c 100644 --- a/src/reporter/pretty_format.rs +++ b/src/reporter/pretty_format.rs @@ -314,7 +314,6 @@ fn test_pretty_format_with_nan_entropy_panics() { // core execution / performance num_jobs: 1, no_dedup: false, - ignore_tests: false, // rule selection rules: RuleSpecifierArgs { diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index 62dbd56..8cfa9ff 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -82,9 +82,10 @@ pub fn enumerate_filesystem_inputs( }() .context("Failed to initialize filesystem enumerator")?; - let (enum_thread, input_recv) = { + let (enum_thread, input_recv, exclude_globset) = { let fs_enumerator = make_fs_enumerator(args, input_roots.into()) .context("Failed to initialize filesystem enumerator")?; + let exclude_globset = fs_enumerator.as_ref().and_then(|ie| ie.exclude_globset()); let channel_size = std::cmp::max(args.num_jobs * 128, 1024); let (input_send, input_recv) = crossbeam_channel::bounded(channel_size); @@ -97,7 +98,7 @@ pub fn enumerate_filesystem_inputs( Ok(()) }) .context("Failed to enumerate filesystem inputs")?; - (input_enumerator_thread, input_recv) + (input_enumerator_thread, input_recv, exclude_globset) }; let enum_cfg = EnumeratorConfig { @@ -107,6 +108,7 @@ pub fn enumerate_filesystem_inputs( }, collect_git_metadata: args.input_specifier_args.commit_metadata, repo_scan_timeout, + exclude_globset, }; let (send_ds, recv_ds) = create_datastore_channel(args.num_jobs); let datastore_writer_thread = @@ -189,23 +191,11 @@ pub fn enumerate_filesystem_inputs( Ok(Some((origin_set, blob_metadata, vec_of_matches))) => { for (_, single_match) in vec_of_matches { // Send each match - let is_test = if args.ignore_tests { - origin_set - .iter() - .filter_map(|o| o.full_path()) - .any(|p| is_test_like_path(&p)) - } else { - false - }; - - if !is_test { - // Send each match - send_ds.send(( - Arc::new(origin_set.clone()), - Arc::new(blob_metadata.clone()), - single_match, - ))?; - } + send_ds.send(( + Arc::new(origin_set.clone()), + Arc::new(blob_metadata.clone()), + single_match, + ))?; } } Err(e) => { @@ -604,9 +594,15 @@ impl<'cfg> ParallelBlobIterator for (&'cfg EnumeratorConfig, FoundInput) { // Spawn an enumerator thread so we can time-out cleanly let path_clone = path.to_path_buf(); let (tx, rx) = std::sync::mpsc::channel(); + let exclude_globset = cfg.exclude_globset.clone(); let handle = std::thread::spawn(move || { let res = if collect_git_metadata { - GitRepoWithMetadataEnumerator::new(&path_clone, repository).run() + GitRepoWithMetadataEnumerator::new( + &path_clone, + repository, + exclude_globset.clone(), + ) + .run() } else { GitRepoEnumerator::new(&path_clone, repository).run() }; diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index 3c41afd..64c2c92 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -101,7 +101,6 @@ rules: git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup, - ignore_tests: false, snippet_length: 64, baseline_file: None, manage_baseline: false, diff --git a/tests/int_github.rs b/tests/int_github.rs index c66fdb8..330299b 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -88,7 +88,6 @@ fn test_github_remote_scan() -> Result<()> { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup: true, - ignore_tests: false, snippet_length: 256, baseline_file: None, manage_baseline: false, diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index 308f6a0..66a7f37 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -85,7 +85,6 @@ fn test_gitlab_remote_scan() -> Result<()> { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup: true, - ignore_tests: false, snippet_length: 256, baseline_file: None, manage_baseline: false, diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index 06bc55a..7e422e1 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -144,7 +144,6 @@ async fn test_validation_cache_and_depvars() -> Result<()> { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup: true, // keep duplicates so the cache is stressed - ignore_tests: false, snippet_length: 128, baseline_file: None, manage_baseline: false, diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 032a51b..bb1d064 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -87,7 +87,6 @@ impl TestContext { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup: true, - ignore_tests: false, snippet_length: 256, baseline_file: None, manage_baseline: false, @@ -153,7 +152,6 @@ impl TestContext { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup: true, - ignore_tests: false, snippet_length: 256, baseline_file: None, manage_baseline: false,