From 0b8e8fcc754dbed3f309f91d48e562e84fac3ade Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Wed, 6 Aug 2025 19:15:50 -0700 Subject: [PATCH 1/4] Remote scans with --git-history=none now clone repositories with a working tree and scan the current files instead of erroring with 'No inputs to scan.' --- CHANGELOG.md | 3 ++ Cargo.toml | 2 +- README.md | 6 ++- src/git_binary.rs | 27 ++++++++---- src/scanner/repos.rs | 18 ++++---- tests/int_gitlab.rs | 103 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 140 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a982f5..0ff7097 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ All notable changes to this project will be documented in this file. +## [1.35.0] +- Remote scans with `--git-history=none` now clone repositories with a working tree and scan the current files instead of erroring with "No inputs to scan". + ## [1.34.0] - Use system TLS root certificates to support self-hosted GitLab instances with internal CAs - Added new rule: Coze personal access token diff --git a/Cargo.toml b/Cargo.toml index 4295167..930a196 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.34.0" +version = "1.35.0" description = "MongoDB's blazingly fast secret scanning and validation tool" edition.workspace = true rust-version.workspace = true diff --git a/README.md b/README.md index ef68c1f..1fba1fb 100644 --- a/README.md +++ b/README.md @@ -5,16 +5,17 @@ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -Kingfisher is a blazingly fast secret‑scanning and validation tool built in Rust. 
It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware parsing via Tree‑Sitter, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production +Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware parsing via Tree‑Sitter, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production.

Kingfisher originated as a fork of Praetorian's [Nosey Parker](https://github.com/praetorian-inc/noseyparker), and is built atop their incredible work and the work contributed by the Nosey Parker community. ## What Kingfisher Adds - **Live validation** via cloud-provider APIs -- **Language-aware detection** (source-code parsing) for ~20 languages - **Extra targets**: GitLab repos, S3 buckets, Docker images, Jira issues, and Slack messages +- **Compressed Files**: Supports extracting and scanning compressed files for secrets - **Baseline mode**: ignore known secrets, flag only new ones +- **Language-aware detection** (source-code parsing) for ~20 languages - **Native Windows** binary @@ -27,6 +28,7 @@ Kingfisher originated as a fork of Praetorian's [Nosey Parker](https://github.co - **Jira issues**: JQL‑driven scans with `--jira-url` and `--jql` - **Slack messages**: query‑based scans with `--slack-query` - **AWS S3**: bucket scans via `--s3-bucket`/`--s3-prefix` with credentials from `KF_AWS_KEY`/`KF_AWS_SECRET`, `--role-arn`, `--aws-local-profile`, or anonymous +- **Compressed Files**: Supports extracting and scanning compressed files for secrets - **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) **Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) diff --git a/src/git_binary.rs b/src/git_binary.rs index fc2baa1..6e9bd8d 100644 --- a/src/git_binary.rs +++ b/src/git_binary.rs @@ -105,8 +105,13 @@ impl Git { let _span = debug_span!("git_update", "{repo_url} {}", output_dir.display()).entered(); debug!("Attempting to update clone of {repo_url} at {}", output_dir.display()); let mut cmd = self.git(); - cmd.arg("--git-dir"); - cmd.arg(output_dir); + if output_dir.join(".git").is_dir() { + cmd.arg("-C"); + cmd.arg(output_dir); + } else { + 
cmd.arg("--git-dir"); + cmd.arg(output_dir); + } cmd.arg("remote"); cmd.arg("update"); cmd.arg("--prune"); @@ -129,7 +134,9 @@ impl Git { debug!("Attempting to create fresh clone of {} at {}", repo_url, output_dir.display()); let mut cmd = self.git(); cmd.arg("clone"); - cmd.arg(clone_mode.arg()); + if let Some(arg) = clone_mode.arg() { + cmd.arg(arg); + } cmd.arg(repo_url.as_str()); cmd.arg(output_dir); debug!("{cmd:#?}"); @@ -151,14 +158,17 @@ pub enum CloneMode { Bare, /// Equivalent to `git clone --mirror` Mirror, + /// Standard clone with a working tree + Checkout, } impl CloneMode { /// Return the CLI argument for this clone mode. - pub fn arg(&self) -> &str { + pub fn arg(&self) -> Option<&str> { match self { - Self::Bare => "--bare", - Self::Mirror => "--mirror", + Self::Bare => Some("--bare"), + Self::Mirror => Some("--mirror"), + Self::Checkout => None, } } } @@ -183,8 +193,9 @@ mod tests { #[test] fn test_clone_mode_arg() { - assert_eq!(CloneMode::Bare.arg(), "--bare"); - assert_eq!(CloneMode::Mirror.arg(), "--mirror"); + assert_eq!(CloneMode::Bare.arg(), Some("--bare")); + assert_eq!(CloneMode::Mirror.arg(), Some("--mirror")); + assert_eq!(CloneMode::Checkout.arg(), None); } #[test] diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index 7999988..19f6b4d 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -12,10 +12,7 @@ use crate::blob::BlobIdMap; use crate::{ blob::BlobMetadata, cli::{ - commands::{ - github::{GitCloneMode, GitHistoryMode}, - scan, - }, + commands::{github::GitCloneMode, github::GitHistoryMode, scan}, global, }, findings_store, @@ -42,16 +39,20 @@ pub fn clone_or_update_git_repos( datastore: &Arc>, ) -> Result> { let mut input_roots = args.input_specifier_args.path_inputs.clone(); - if repo_urls.is_empty() || args.input_specifier_args.git_history == GitHistoryMode::None { + if repo_urls.is_empty() { return Ok(input_roots); } info!("{} Git URLs to fetch", repo_urls.len()); for repo_url in repo_urls { debug!("Need to 
fetch {repo_url}") } - let clone_mode = match args.input_specifier_args.git_clone { - GitCloneMode::Mirror => CloneMode::Mirror, - GitCloneMode::Bare => CloneMode::Bare, + let clone_mode = if args.input_specifier_args.git_history == GitHistoryMode::None { + CloneMode::Checkout + } else { + match args.input_specifier_args.git_clone { + GitCloneMode::Mirror => CloneMode::Mirror, + GitCloneMode::Bare => CloneMode::Bare, + } }; let git = Git::new(global_args.ignore_certs); @@ -68,6 +69,7 @@ pub fn clone_or_update_git_repos( } else { ProgressBar::hidden() }; + for repo_url in repo_urls { let output_dir = { let datastore = datastore.lock().unwrap(); diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index 0b55799..3903ddb 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -139,3 +139,106 @@ fn test_gitlab_remote_scan() -> Result<()> { drop(rt); Ok(()) } + + +#[test] +fn test_gitlab_remote_scan_no_history() -> Result<()> { + let temp_dir = TempDir::new().context("tmp dir")?; + let clone_dir = temp_dir.path().to_path_buf(); + + let test_repo_url = "https://gitlab.com/micksmix/SecretsTest.git"; + let git_url = GitUrl::from_str(test_repo_url).expect("parse GitLab URL"); + + let scan_args = ScanArgs { + num_jobs: 2, + rules: RuleSpecifierArgs { + rules_path: Vec::new(), + rule: vec!["all".into()], + load_builtins: true, + }, + input_specifier_args: InputSpecifierArgs { + path_inputs: Vec::new(), + git_url: vec![git_url], + github_user: Vec::new(), + github_organization: Vec::new(), + all_github_organizations: false, + github_api_url: Url::parse("https://api.github.com/")?, + github_repo_type: GitHubRepoType::Source, + gitlab_user: Vec::new(), + gitlab_group: Vec::new(), + all_gitlab_groups: false, + gitlab_api_url: Url::parse("https://gitlab.com/")?, + gitlab_repo_type: GitLabRepoType::Owner, + + jira_url: None, + jql: None, + max_results: 100, + slack_query: None, + slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + s3_bucket: None, + s3_prefix: 
None, + role_arn: None, + aws_local_profile: None, + docker_image: Vec::new(), + git_clone: GitCloneMode::Bare, + git_history: GitHistoryMode::None, + scan_nested_repos: true, + commit_metadata: true, + }, + content_filtering_args: ContentFilteringArgs { + max_file_size_mb: 25.0, + no_extract_archives: false, + extraction_depth: 2, + no_binary: true, + exclude: Vec::new(), + }, + confidence: ConfidenceLevel::Medium, + no_validate: false, + rule_stats: false, + only_valid: false, + min_entropy: None, + redact: false, + git_repo_timeout: 1800, + output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, + no_dedup: true, + snippet_length: 256, + baseline_file: None, + manage_baseline: false, + }; + + let global_args = GlobalArgs { + verbose: 0, + quiet: false, + color: Mode::Auto, + progress: Mode::Auto, + no_update_check: false, + self_update: false, + ignore_certs: false, + advanced: AdvancedArgs { rlimit_nofile: 16_384 }, + }; + + let datastore = Arc::new(Mutex::new(FindingsStore::new(clone_dir))); + let rt = Runtime::new()?; + + let rules_db = Arc::new(load_and_record_rules(&scan_args, &datastore)?); + + rt.block_on(async { + run_scan(&global_args, &scan_args, &rules_db, Arc::clone(&datastore)).await + })?; + + let ds = datastore.lock().unwrap(); + let findings = ds.get_matches(); + let total = findings.len(); + let validated = findings.iter().filter(|m| m.as_ref().2.validation_success).count(); + + assert!(total >= 10, "expected at least 10 findings from GitLab repo, got {total}"); + + let exit_code = determine_exit_code(total, validated); + assert!( + exit_code >= 200, + "expected kingfisher to report findings (exit_code >= 200), got {exit_code}" + ); + + drop(rt); + Ok(()) +} \ No newline at end of file From e48b9617c8e072fdf52a4e4b98e8b58c1234f839 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Wed, 6 Aug 2025 19:16:22 -0700 Subject: [PATCH 2/4] Remote scans with --git-history=none now clone repositories with a working tree and scan the 
current files instead of erroring with 'No inputs to scan.' --- src/scanner/repos.rs | 2 +- tests/int_gitlab.rs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index 19f6b4d..c3bb8ba 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -69,7 +69,7 @@ pub fn clone_or_update_git_repos( } else { ProgressBar::hidden() }; - + for repo_url in repo_urls { let output_dir = { let datastore = datastore.lock().unwrap(); diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index 3903ddb..fa3e169 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -140,7 +140,6 @@ fn test_gitlab_remote_scan() -> Result<()> { Ok(()) } - #[test] fn test_gitlab_remote_scan_no_history() -> Result<()> { let temp_dir = TempDir::new().context("tmp dir")?; @@ -241,4 +240,4 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { drop(rt); Ok(()) -} \ No newline at end of file +} From 8d32662c1a528010b7eea6b7ebebb6133fe7964f Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Wed, 6 Aug 2025 21:23:27 -0700 Subject: [PATCH 3/4] fixed issue where --redact did not function properly --- CHANGELOG.md | 1 + src/scanner/enumerate.rs | 2 +- src/scanner/processing.rs | 3 +- src/scanner/repos.rs | 2 +- tests/int_redact.rs | 115 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 120 insertions(+), 3 deletions(-) create mode 100644 tests/int_redact.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ff7097..4201640 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file. ## [1.35.0] - Remote scans with `--git-history=none` now clone repositories with a working tree and scan the current files instead of erroring with "No inputs to scan". 
+- Fixed issue where `--redact` did not function properly ## [1.34.0] - Use system TLS root certificates to support self-hosted GitLab instances with internal CAs diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index 0e0c795..87a8011 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -183,7 +183,7 @@ pub fn enumerate_filesystem_inputs( return Ok(()); } progress.inc(blob.len().try_into().unwrap()); - match processor.run(origin, blob, args.no_dedup) { + match processor.run(origin, blob, args.no_dedup, args.redact) { Ok(None) => { // nothing to record } diff --git a/src/scanner/processing.rs b/src/scanner/processing.rs index 8c36514..e1551b0 100644 --- a/src/scanner/processing.rs +++ b/src/scanner/processing.rs @@ -25,11 +25,12 @@ impl<'a> BlobProcessor<'a> { origin: OriginSet, blob: Blob, no_dedup: bool, + redact: bool, ) -> Result> { let blob_id = blob.id.hex(); let _span = debug_span!("matcher", blob_id).entered(); let t1 = Instant::now(); - let res = self.matcher.scan_blob(&blob, &origin, None, false, no_dedup)?; + let res = self.matcher.scan_blob(&blob, &origin, None, redact, no_dedup)?; let scan_us = t1.elapsed().as_micros(); match res { // blob already seen, but with no matches; nothing to do! diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index c3bb8ba..7bd6b8d 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -349,7 +349,7 @@ pub async fn fetch_s3_objects( let blob = crate::blob::Blob::from_bytes(bytes); if let Some((origin, blob_md, scored_matches)) = - processor.run(origin, blob, args.no_dedup)? + processor.run(origin, blob, args.no_dedup, args.redact)? 
{ // Wrap origin & metadata once: let origin_arc = Arc::new(origin); diff --git a/tests/int_redact.rs b/tests/int_redact.rs new file mode 100644 index 0000000..5d72c61 --- /dev/null +++ b/tests/int_redact.rs @@ -0,0 +1,115 @@ +// Integration test to ensure --redact replaces secret values with hashes +use std::{ + path::PathBuf, + sync::{Arc, Mutex}, +}; + +use anyhow::Result; +use kingfisher::{ + cli::{ + commands::{ + github::{GitCloneMode, GitHistoryMode, GitHubRepoType}, + gitlab::GitLabRepoType, + inputs::{ContentFilteringArgs, InputSpecifierArgs}, + output::{OutputArgs, ReportOutputFormat}, + rules::RuleSpecifierArgs, + scan::{ConfidenceLevel, ScanArgs}, + }, + global::{AdvancedArgs, GlobalArgs, Mode}, + }, + findings_store::FindingsStore, + rule_loader::RuleLoader, + rules_database::RulesDatabase, + scanner::run_async_scan, +}; +use tempfile::TempDir; +use url::Url; + +#[tokio::test] +async fn test_redact_hashes_finding_values() -> Result<()> { + let temp_dir = TempDir::new()?; + + let scan_args = ScanArgs { + num_jobs: 2, + rules: RuleSpecifierArgs { + rules_path: Vec::new(), + rule: vec!["all".into()], + load_builtins: true, + }, + input_specifier_args: InputSpecifierArgs { + path_inputs: vec![PathBuf::from("testdata/generic_secrets.py")], + git_url: Vec::new(), + github_user: Vec::new(), + github_organization: Vec::new(), + all_github_organizations: false, + github_api_url: Url::parse("https://api.github.com/").unwrap(), + github_repo_type: GitHubRepoType::Source, + gitlab_user: Vec::new(), + gitlab_group: Vec::new(), + all_gitlab_groups: false, + gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(), + gitlab_repo_type: GitLabRepoType::Owner, + jira_url: None, + jql: None, + max_results: 100, + slack_query: None, + slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, + docker_image: Vec::new(), + git_clone: GitCloneMode::Bare, + git_history: 
GitHistoryMode::Full, + scan_nested_repos: true, + commit_metadata: true, + }, + content_filtering_args: ContentFilteringArgs { + max_file_size_mb: 25.0, + extraction_depth: 2, + no_binary: true, + no_extract_archives: false, + exclude: Vec::new(), + }, + confidence: ConfidenceLevel::Low, + no_validate: true, + rule_stats: false, + only_valid: false, + min_entropy: Some(0.0), + redact: true, + git_repo_timeout: 1800, + output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, + no_dedup: true, + snippet_length: 256, + baseline_file: None, + manage_baseline: false, + }; + + let global_args = GlobalArgs { + verbose: 0, + quiet: true, + color: Mode::Never, + no_update_check: false, + self_update: false, + progress: Mode::Never, + ignore_certs: false, + advanced: AdvancedArgs { rlimit_nofile: 16384 }, + }; + + let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?; + let resolved = loaded.resolve_enabled_rules()?; + let rules_db = RulesDatabase::from_rules(resolved.into_iter().cloned().collect())?; + + let datastore = Arc::new(Mutex::new(FindingsStore::new(temp_dir.path().to_path_buf()))); + run_async_scan(&global_args, &scan_args, Arc::clone(&datastore), &rules_db).await?; + + let ds = datastore.lock().unwrap(); + let matches = ds.get_matches(); + assert!(!matches.is_empty()); + for m_arc in matches { + let m = &m_arc.2; + assert!(m.groups.captures.iter().any(|cap| cap.value.starts_with("[REDACTED:"))); + } + + Ok(()) +} \ No newline at end of file From 57ba607a3391925a74fc4b85cadbab7ee5b6ca21 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Wed, 6 Aug 2025 21:31:02 -0700 Subject: [PATCH 4/4] Fixed validation logic for clarifai rule --- CHANGELOG.md | 1 + data/rules/clarifai.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4201640..482eca5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ All notable changes to this project will be documented in this 
file. ## [1.35.0] - Remote scans with `--git-history=none` now clone repositories with a working tree and scan the current files instead of erroring with "No inputs to scan". - Fixed issue where `--redact` did not function properly +- Fixed validation logic for clarifai rule ## [1.34.0] - Use system TLS root certificates to support self-hosted GitLab instances with internal CAs diff --git a/data/rules/clarifai.yml b/data/rules/clarifai.yml index e3f7167..d72c9f2 100644 --- a/data/rules/clarifai.yml +++ b/data/rules/clarifai.yml @@ -34,5 +34,5 @@ rules: - type: WordMatch match_all_words: true words: - - '"models"' - - '"status"' + - '"code":10000' + - '"description":"Ok"'