From 96ab0d4b59305116b2f4229cf4aaabcf96fe1067 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 2 Aug 2025 20:40:16 -0700 Subject: [PATCH 01/13] -Added support for scanning AWS S3 buckets via --s3-bucket and optional --s3-prefix - Added --role-arn and --aws-local-profile flags for S3 authentication alongside KF_AWS_KEY/KF_AWS_SECRET --- CHANGELOG.md | 4 ++ Cargo.toml | 5 +- README.md | 10 ++++ src/cli/commands/inputs.rs | 20 +++++++- src/findings_store.rs | 10 ++++ src/lib.rs | 1 + src/main.rs | 5 ++ src/reporter.rs | 11 +++++ src/reporter/json_format.rs | 22 ++++++--- src/reporter/pretty_format.rs | 25 ++++++++-- src/reporter/sarif_format.rs | 23 ++++++++-- src/s3.rs | 86 +++++++++++++++++++++++++++++++++++ src/scanner/repos.rs | 73 +++++++++++++++++++++++++++-- src/scanner/runner.rs | 43 ++++++++++++++---- tests/int_dedup.rs | 5 ++ tests/int_github.rs | 5 ++ tests/int_gitlab.rs | 5 ++ tests/int_slack.rs | 9 ++++ tests/int_validation_cache.rs | 5 ++ tests/int_vulnerable_files.rs | 10 ++++ 20 files changed, 347 insertions(+), 30 deletions(-) create mode 100644 src/s3.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 350a3fb..07607fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ All notable changes to this project will be documented in this file. +## [1.32.0] +- Added support for scanning AWS S3 buckets via `--s3-bucket` and optional `--s3-prefix` +- Added `--role-arn` and `--aws-local-profile` flags for S3 authentication alongside `KF_AWS_KEY`/`KF_AWS_SECRET` +- ## [1.31.0] - New rules: Telegram bot token, OpenWeatherMap, Apify, Groq - New OpenAI detectors added (@joshlarsen) diff --git a/Cargo.toml b/Cargo.toml index de43779..425db27 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.31.0" +version = "1.32.0" description = "MongoDB's blazingly fast secret scanning and validation tool" edition.workspace = true rust-version.workspace = true @@ -186,6 +186,7 @@ oci-client = { version = "0.15", default-features = false, features = ["rustls-t walkdir = "2.5.0" p256 = "0.13.2" ed25519-dalek = { version = "2.2", features = ["pkcs8"] } +aws-sdk-s3 = "1.100.0" [dependencies.tikv-jemallocator] version = "0.6" @@ -207,7 +208,7 @@ rand_chacha = "0.9.0" [profile.release] debug = false -strip = "debuginfo" +strip = true #"debuginfo" opt-level = 3 # Maximum optimization for performance lto = true # Enable Link Time Optimization codegen-units = 1 # Optimize for size but slower compilation diff --git a/README.md b/README.md index a837351..4af4295 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ Kingfisher originated as a fork of Praetorian's [Nosey Parker](https://github.co - **Docker images**: public or private via `--docker-image` - **Jira issues**: JQL‑driven scans with `--jira-url` and `--jql` - **Slack messages**: query‑based scans with `--slack-query` + - **AWS S3**: bucket scans via `--s3-bucket`/`--s3-prefix` with credentials from `KF_AWS_KEY`/`KF_AWS_SECRET`, `--role-arn`, or `--aws-local-profile` - **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) **Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) @@ -109,6 +110,15 @@ docker run --rm \ ghcr.io/mongodb/kingfisher:latest \ scan --git-url https://github.com/org/private_repo.git +# Scan an S3 bucket +# Credentials can come from KF_AWS_KEY/KF_AWS_SECRET, --role-arn, or --aws-local-profile +docker run --rm \ + -e KF_AWS_KEY=AKIA... \ + -e KF_AWS_SECRET=g5nYW... \ + ghcr.io/mongodb/kingfisher:latest \ + scan --s3-bucket bucket-name + + # Scan and write a JSON report locally # Here we: # 1. Mount $PWD → /proj diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs index 8a1c23d..ea38722 100644 --- a/src/cli/commands/inputs.rs +++ b/src/cli/commands/inputs.rs @@ -28,7 +28,8 @@ pub struct InputSpecifierArgs { "all_gitlab_groups", "jira_url", "docker_image", - "slack_query" + "slack_query", + "s3_bucket" ]), value_hint = ValueHint::AnyPath )] @@ -107,6 +108,23 @@ pub struct InputSpecifierArgs { #[arg(long, default_value_t = 100)] pub max_results: usize, + /// Scan the specified S3 bucket + #[arg(long)] + pub s3_bucket: Option, + + /// Optional prefix within the S3 bucket + #[arg(long, requires = "s3_bucket")] + pub s3_prefix: Option, + + /// AWS IAM role ARN to assume for S3 access + #[arg(long, requires = "s3_bucket")] + pub role_arn: Option, + + /// Use credentials from a local AWS profile in ~/.aws/config + #[arg(long, requires = "s3_bucket")] + pub aws_local_profile: Option, + + /// Docker/OCI images to scan (no local Docker required) #[arg(long = "docker-image")] pub docker_image: Vec, diff --git a/src/findings_store.rs b/src/findings_store.rs index 93e9f1c..a1c94d4 100644 --- a/src/findings_store.rs +++ b/src/findings_store.rs @@ -54,6 +54,7 @@ pub struct FindingsStore { origin_meta: FxHashMap>, docker_images: FxHashMap, slack_links: FxHashMap, + s3_buckets: FxHashMap, } impl FindingsStore { pub fn new(clone_dir: PathBuf) -> Self { @@ -73,6 +74,7 @@ impl FindingsStore { bloom_items: 0, docker_images: FxHashMap::default(), slack_links: FxHashMap::default(), + s3_buckets: FxHashMap::default(), } } @@ -306,6 +308,14 @@ impl FindingsStore { &self.slack_links } + pub fn register_s3_bucket(&mut self, dir: PathBuf, bucket: String) { + self.s3_buckets.insert(dir, bucket); + } + + pub fn s3_buckets(&self) -> &FxHashMap { + &self.s3_buckets + } + pub fn get_finding_data_iter( &self, ) -> impl Iterator + '_ { diff --git a/src/lib.rs b/src/lib.rs index 85bc57c..90d0451 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,6 +30,7 @@ pub mod rule_profiling; pub mod rules; pub mod rules_database; pub mod safe_list; +pub mod s3; pub mod scanner; pub mod scanner_pool; pub mod serde_utils; diff --git a/src/main.rs b/src/main.rs index 9c30b92..73c77a5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -286,6 +286,11 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { jira_url: None, jql: None, max_results: 100, + + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, // Slack query slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), diff --git a/src/reporter.rs b/src/reporter.rs index 210da31..ad0efe9 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -141,6 +141,17 @@ impl DetailsReporter { ds.slack_links().get(path).cloned() } + fn s3_display_path(&self, path: &std::path::Path) -> Option { + let ds = self.datastore.lock().ok()?; + for (dir, bucket) in ds.s3_buckets().iter() { + if path.starts_with(dir) { + let rel = path.strip_prefix(dir).ok()?; + return Some(format!("s3://{}/{}", bucket, rel.display())); + } + } + None + } + fn docker_display_path(&self, path: &std::path::Path) -> Option { let ds = self.datastore.lock().ok()?; for (dir, image) in ds.docker_images().iter() { diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 5533b55..9fcb1ec 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -99,20 +99,22 @@ impl DetailsReporter { let file_path = rm .origin .iter() - .find_map(|origin| { - if let Origin::File(e) = origin { + .find_map(|origin| match origin { + Origin::File(e) => { if let Some(url) = self.jira_issue_url(&e.path, args) { Some(url) } else if let Some(url) = self.slack_message_url(&e.path) { Some(url) + } else if let Some(mapped) = self.s3_display_path(&e.path) { + Some(mapped) } else if let Some(mapped) = self.docker_display_path(&e.path) { Some(mapped) } else { Some(e.path.display().to_string()) } - } else { - None - } + } + Origin::Extended(e) => e.path().map(|p| p.display().to_string()), + _ => None, }) .unwrap_or_default(); @@ -258,11 +260,15 @@ impl DetailsReporter { Some(url) } else if let Some(url) = self.slack_message_url(&e.path) { Some(url) + } else if let Some(mapped) = self.s3_display_path(&e.path) { + Some(mapped) } else if let Some(mapped) = self.docker_display_path(&e.path) { Some(mapped) } else { Some(e.path.display().to_string()) } + } else if let Origin::Extended(e) = origin { + e.path().map(|p| p.display().to_string()) } else { None } @@ -437,10 +443,14 @@ mod tests { jira_url: None, jql: None, max_results: 100, - // Docker image scanning // Slack options slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + // s3 + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, docker_image: Vec::new(), // clone / history options diff --git a/src/reporter/pretty_format.rs b/src/reporter/pretty_format.rs index 62dd354..942e7ad 100644 --- a/src/reporter/pretty_format.rs +++ b/src/reporter/pretty_format.rs @@ -218,6 +218,8 @@ impl<'a> Display for PrettyFinding<'a> { url } else if let Some(url) = reporter.slack_message_url(&e.path) { url + } else if let Some(mapped) = reporter.s3_display_path(&e.path) { + mapped } else if let Some(mapped) = reporter.docker_display_path(&e.path) { mapped } else { @@ -233,13 +235,23 @@ impl<'a> Display for PrettyFinding<'a> { } )?; } + Origin::Extended(e) => { + if let Some(p) = e.path() { + let display_path = p.display().to_string(); + writeln!( + f, + " |Path..........: {}", + if rm.validation_success { + reporter.style_active_creds(&display_path).to_string() + } else { + display_path + } + )?; + } + } Origin::GitRepo(e) => { reporter.write_git_metadata(f, e, args, source_span.start.line)?; } - Origin::Extended(e) => { - writeln!(f, " |Extended......: {}", reporter.style_metadata(e).to_string())?; - // Convert StyledObject to String - } } } Ok(()) @@ -353,6 +365,11 @@ fn test_pretty_format_with_nan_entropy_panics() { // Slack options slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + // s3 + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, // Docker image scanning docker_image: Vec::new(), // git clone / history options diff --git a/src/reporter/sarif_format.rs b/src/reporter/sarif_format.rs index 5829bba..033d37c 100644 --- a/src/reporter/sarif_format.rs +++ b/src/reporter/sarif_format.rs @@ -75,6 +75,8 @@ impl DetailsReporter { url } else if let Some(url) = self.slack_message_url(&e.path) { url + } else if let Some(mapped) = self.s3_display_path(&e.path) { + mapped } else { e.path.display().to_string() }; @@ -82,6 +84,16 @@ impl DetailsReporter { sarif::ArtifactLocationBuilder::default().uri(uri).build().ok()?, ); } + Origin::Extended(e) => { + if let Some(p) = e.path() { + artifact_locations.push( + sarif::ArtifactLocationBuilder::default() + .uri(p.display().to_string()) + .build() + .ok()?, + ); + } + } Origin::GitRepo(e) => { // Extract and store Git metadata if let Some(git_metadata) = self.extract_git_metadata(e, source_span) { @@ -111,7 +123,6 @@ impl DetailsReporter { ); } } - Origin::Extended(_) => (), } } @@ -212,11 +223,18 @@ impl DetailsReporter { url } else if let Some(url) = self.slack_message_url(&e.path) { url + } else if let Some(mapped) = self.s3_display_path(&e.path) { + mapped } else { e.path.display().to_string() }; msg.push_str(&format!("Location: {}\n", uri)); } + Origin::Extended(e) => { + if let Some(p) = e.path() { + msg.push_str(&format!("Location: {}\n", p.display())); + } + } Origin::GitRepo(e) => { if let Some(cs) = &e.first_commit { let repo_url = get_repo_url(&e.repo_path) @@ -235,9 +253,6 @@ impl DetailsReporter { msg.push_str(&format!("File: {}", cs.blob_path)); } } - Origin::Extended(e) => { - msg.push_str(&format!("Extended: {}\n", e)); - } } msg } else { diff --git a/src/s3.rs b/src/s3.rs new file mode 100644 index 0000000..5f35dde --- /dev/null +++ b/src/s3.rs @@ -0,0 +1,86 @@ +use anyhow::{Context, Result}; +use aws_config::{meta::region::RegionProviderChain, BehaviorVersion}; +use aws_credential_types::Credentials; +use aws_sdk_s3::Client; + +/// Visit all objects in the given S3 bucket (optionally under a prefix), +/// calling `visitor` with each object's key and bytes. +pub async fn visit_bucket_objects( + bucket: &str, + prefix: Option<&str>, + role_arn: Option<&str>, + profile: Option<&str>, + mut visitor: F, +) -> Result<()> +where + F: FnMut(String, Vec) -> Result<()>, +{ + let mut config_loader = aws_config::defaults(BehaviorVersion::latest()); + + if let Some(profile) = profile { + config_loader = config_loader.profile_name(profile); + } + + // If explicit credentials are provided via KF_AWS_KEY/KF_AWS_SECRET use them + if let (Ok(key), Ok(secret)) = (std::env::var("KF_AWS_KEY"), std::env::var("KF_AWS_SECRET")) { + let creds = Credentials::new(key, secret, None, None, "kf_env"); + config_loader = config_loader.credentials_provider(creds); + } + + // Resolve region using the default chain, falling back to us-east-1 + let region_provider = RegionProviderChain::default_provider().or_else("us-east-1"); + let base_config = config_loader.region(region_provider).load().await; + + let client = if let Some(role) = role_arn { + let assume_role = aws_config::sts::AssumeRoleProvider::builder(role.to_string()) + .session_name("kingfisher") + .configure(&base_config) + .build() + .await; + let conf = aws_sdk_s3::config::Builder::from(&base_config) + .credentials_provider(assume_role) + .build(); + Client::from_conf(conf) + } else { + Client::new(&base_config) + }; + + let mut continuation_token = None; + + loop { + let mut req = client.list_objects_v2().bucket(bucket.to_string()); + if let Some(p) = prefix { + req = req.prefix(p.to_string()); + } + if let Some(token) = continuation_token.clone() { + req = req.continuation_token(token); + } + + let resp = req.send().await.context("Failed to list objects in bucket")?; + + if let Some(objects) = resp.contents { + for obj in objects { + if let Some(key) = obj.key { + let get_resp = client + .get_object() + .bucket(bucket) + .key(&key) + .send() + .await + .with_context(|| format!("Failed to fetch object {key}"))?; + let data = + get_resp.body.collect().await.context("Failed to read S3 object body")?; + visitor(key, data.into_bytes().to_vec())?; + } + } + } + + if resp.is_truncated.unwrap_or(false) { + continuation_token = resp.next_continuation_token; + } else { + break; + } + } + + Ok(()) +} \ No newline at end of file diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index 9d944ea..735e381 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -8,6 +8,7 @@ use indicatif::{HumanCount, ProgressBar, ProgressStyle}; use tokio::time::Duration; use tracing::{debug, error, info}; +use crate::blob::BlobIdMap; use crate::{ blob::BlobMetadata, cli::{ @@ -21,10 +22,15 @@ use crate::{ git_binary::{CloneMode, Git}, git_url::GitUrl, github, gitlab, jira, - matcher::Match, - origin::OriginSet, - slack, PathBuf, + matcher::{Match, Matcher, MatcherStats}, + origin::{Origin, OriginSet}, + rules_database::RulesDatabase, + s3, + scanner::processing::BlobProcessor, + scanner_pool::ScannerPool, + slack, guesser::Guesser, PathBuf, }; + pub type DatastoreMessage = (OriginSet, BlobMetadata, Vec<(Option, Match)>); pub fn clone_or_update_git_repos( @@ -284,3 +290,64 @@ pub async fn fetch_slack_messages( } Ok(vec![output_dir]) } + + +pub async fn fetch_s3_objects( + args: &scan::ScanArgs, + datastore: &Arc>, + rules_db: &RulesDatabase, + matcher_stats: &Mutex, + enable_profiling: bool, + shared_profiler: Arc, +) -> Result<()> { + let Some(bucket) = args.input_specifier_args.s3_bucket.as_deref() else { + return Ok(()); + }; + let prefix = args.input_specifier_args.s3_prefix.as_deref(); + let role_arn = args.input_specifier_args.role_arn.as_deref(); + let profile = args.input_specifier_args.aws_local_profile.as_deref(); + + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let seen_blobs = BlobIdMap::new(); + let matcher = Matcher::new( + rules_db, + scanner_pool, + &seen_blobs, + Some(matcher_stats), + enable_profiling, + Some(shared_profiler.clone()), + )?; + let guesser = Guesser::new().expect("should be able to create filetype guesser"); + let mut processor = BlobProcessor { matcher, guesser }; + let bucket_name = bucket.to_string(); + + s3::visit_bucket_objects(bucket, prefix, role_arn, profile, |key, bytes| { + let origin = OriginSet::new( + Origin::from_extended(serde_json::json!({ + "path": format!("s3://{}/{}", bucket_name, key) + })), + Vec::new(), + ); + let blob = crate::blob::Blob::from_bytes(bytes); + + if let Some((origin, blob_md, scored_matches)) = processor.run(origin, blob, args.no_dedup)? { + // Wrap origin & metadata once: + let origin_arc = Arc::new(origin); + let blob_arc = Arc::new(blob_md); + + // Now build a batch of exactly one FindingsStoreMessage per Match + let mut batch = Vec::with_capacity(scored_matches.len()); + for (_score, m) in scored_matches { + batch.push((origin_arc.clone(), blob_arc.clone(), m)); + } + + // Call record with the right type + let added = datastore.lock().unwrap().record(batch, !args.no_dedup); + debug!("Added {} new S3 blobs", added); + } + Ok(()) + }) + .await?; + + Ok(()) +} \ No newline at end of file diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs index 0a880da..f8dae87 100644 --- a/src/scanner/runner.rs +++ b/src/scanner/runner.rs @@ -18,7 +18,9 @@ use crate::{ rules_database::RulesDatabase, scanner::{ clone_or_update_git_repos, enumerate_filesystem_inputs, enumerate_github_repos, - repos::{enumerate_gitlab_repos, fetch_jira_issues, fetch_slack_messages}, + repos::{ + enumerate_gitlab_repos, fetch_jira_issues, fetch_s3_objects, fetch_slack_messages, + }, run_secret_validation, save_docker_images, summary::print_scan_summary, }, @@ -72,6 +74,7 @@ pub async fn run_async_scan( let slack_dirs = fetch_slack_messages(args, global_args, &datastore).await?; input_roots.extend(slack_dirs); + // Save Docker images if specified if !args.input_specifier_args.docker_image.is_empty() { let clone_root = { @@ -93,22 +96,42 @@ pub async fn run_async_scan( } } - if input_roots.is_empty() { - bail!("No inputs to scan"); - } + // if input_roots.is_empty() { + // bail!("No inputs to scan"); + // } let shared_profiler = Arc::new(ConcurrentRuleProfiler::new()); let enable_profiling = args.rule_stats; let matcher_stats = Mutex::new(MatcherStats::default()); - let _inputs = enumerate_filesystem_inputs( + + // Fetch S3 objects if requested (scanned immediately) + fetch_s3_objects( args, - datastore.clone(), - &input_roots, - progress_enabled, + &datastore, rules_db, + &matcher_stats, enable_profiling, Arc::clone(&shared_profiler), - &matcher_stats, - )?; + ) + .await?; + + let has_s3 = args.input_specifier_args.s3_bucket.is_some(); + if input_roots.is_empty() && !has_s3 { + bail!("No inputs to scan"); + } + + if !input_roots.is_empty() { + let _inputs = enumerate_filesystem_inputs( + args, + datastore.clone(), + &input_roots, + progress_enabled, + rules_db, + enable_profiling, + Arc::clone(&shared_profiler), + &matcher_stats, + )?; + } + if !args.no_dedup { // Final deduplication step before validation (or before reporting) diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index c42967f..0c93023 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -84,6 +84,11 @@ rules: max_results: 100, slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + // s3 + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, // Docker image scanning docker_image: Vec::new(), // git clone / history options diff --git a/tests/int_github.rs b/tests/int_github.rs index 4bda269..2892b91 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -71,6 +71,11 @@ fn test_github_remote_scan() -> Result<()> { max_results: 100, slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + // s3 + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, // Docker image scanning docker_image: Vec::new(), // git clone / history options diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index e53bbd8..0b55799 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -70,6 +70,11 @@ fn test_gitlab_remote_scan() -> Result<()> { max_results: 100, slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + // s3 + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, // Docker image scanning docker_image: Vec::new(), git_clone: GitCloneMode::Bare, diff --git a/tests/int_slack.rs b/tests/int_slack.rs index 699dad9..d22b8f0 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -59,6 +59,10 @@ impl TestContext { jql: None, slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, max_results: 10, docker_image: Vec::new(), git_clone: GitCloneMode::Bare, @@ -147,6 +151,11 @@ async fn test_scan_slack_messages() -> Result<()> { slack_query: Some("test".into()), slack_api_url: Url::parse(&format!("{}/", server.uri()))?, max_results: 10, + // s3 + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, docker_image: Vec::new(), git_clone: GitCloneMode::Bare, git_history: GitHistoryMode::Full, diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index ae8dd50..a7ab9ea 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -127,6 +127,11 @@ async fn test_validation_cache_and_depvars() -> Result<()> { max_results: 100, slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + // s3 + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, // Docker image scanning docker_image: Vec::new(), // git clone / history options diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 187427e..abeb6f1 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -70,6 +70,11 @@ impl TestContext { max_results: 100, slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + // s3 + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, // Docker image scanning docker_image: Vec::new(), // git clone / history options @@ -142,6 +147,11 @@ impl TestContext { max_results: 100, slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + // s3 + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, // Docker image scanning docker_image: Vec::new(), // git clone / history options From ef6ba415f25f952c0f429089c99070c2b066cb43 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sun, 3 Aug 2025 08:13:08 -0700 Subject: [PATCH 02/13] improving s3 bucket scanning feature --- Cargo.toml | 1 + src/s3.rs | 144 ++++++++++++------ .../{ => baseline}/archive/kfArchiveTest.7z | Bin .../{ => baseline}/archive/kfArchiveTest.tar | Bin .../archive/kfArchiveTest.tar.bz2 | Bin .../archive/kfArchiveTest.tar.gz | Bin .../archive/kfArchiveTest.tar.lz4 | Bin .../archive/kfArchiveTest.tar.xz | Bin .../{ => baseline}/archive/kfArchiveTest.zip | Bin .../archive/kfArchiveTest_zip_inside.zip | Bin .../{ => baseline}/archive/makeArchives.sh | 0 testdata/{ => baseline}/archive/template.zip | Bin 12 files changed, 102 insertions(+), 43 deletions(-) rename testdata/{ => baseline}/archive/kfArchiveTest.7z (100%) rename testdata/{ => baseline}/archive/kfArchiveTest.tar (100%) rename testdata/{ => baseline}/archive/kfArchiveTest.tar.bz2 (100%) rename testdata/{ => baseline}/archive/kfArchiveTest.tar.gz (100%) rename testdata/{ => baseline}/archive/kfArchiveTest.tar.lz4 (100%) rename testdata/{ => baseline}/archive/kfArchiveTest.tar.xz (100%) rename testdata/{ => baseline}/archive/kfArchiveTest.zip (100%) rename testdata/{ => baseline}/archive/kfArchiveTest_zip_inside.zip (100%) rename testdata/{ => baseline}/archive/makeArchives.sh (100%) rename testdata/{ => baseline}/archive/template.zip (100%) diff --git a/Cargo.toml b/Cargo.toml index 425db27..686812d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -187,6 +187,7 @@ walkdir = "2.5.0" p256 = "0.13.2" ed25519-dalek = { version = "2.2", features = ["pkcs8"] } aws-sdk-s3 = "1.100.0" +aws-smithy-http = "0.62.2" [dependencies.tikv-jemallocator] version = "0.6" diff --git a/src/s3.rs b/src/s3.rs index 5f35dde..7a180f4 100644 --- a/src/s3.rs +++ b/src/s3.rs @@ -1,10 +1,14 @@ use anyhow::{Context, Result}; -use aws_config::{meta::region::RegionProviderChain, BehaviorVersion}; +use aws_config::{defaults, meta::region::RegionProviderChain, BehaviorVersion, ConfigLoader}; use aws_credential_types::Credentials; -use aws_sdk_s3::Client; +use aws_sdk_s3::{ + Client, + operation::list_objects_v2::ListObjectsV2Error, // modeled service error + error::ProvideErrorMetadata, // for .code() :contentReference[oaicite:8]{index=8} +}; +use aws_types::region::Region; +use reqwest; // HTTP client for HEAD fallback -/// Visit all objects in the given S3 bucket (optionally under a prefix), -/// calling `visitor` with each object's key and bytes. pub async fn visit_bucket_objects( bucket: &str, prefix: Option<&str>, @@ -15,66 +19,120 @@ pub async fn visit_bucket_objects( where F: FnMut(String, Vec) -> Result<()>, { - let mut config_loader = aws_config::defaults(BehaviorVersion::latest()); + // Helper to build ConfigLoader with profile/creds/no_credentials + let build_loader = || { + let mut loader = defaults(BehaviorVersion::latest()); + if let Some(p) = profile { + loader = loader.profile_name(p); + } + if let (Ok(k), Ok(s)) = (std::env::var("KF_AWS_KEY"), std::env::var("KF_AWS_SECRET")) { + loader = loader.credentials_provider(Credentials::new(k, s, None, None, "kf_env")); + } + if profile.is_none() && std::env::var("KF_AWS_KEY").is_err() && role_arn.is_none() { + loader = loader.no_credentials(); + } + loader + }; - if let Some(profile) = profile { - config_loader = config_loader.profile_name(profile); - } - - // If explicit credentials are provided via KF_AWS_KEY/KF_AWS_SECRET use them - if let (Ok(key), Ok(secret)) = (std::env::var("KF_AWS_KEY"), std::env::var("KF_AWS_SECRET")) { - let creds = Credentials::new(key, secret, None, None, "kf_env"); - config_loader = config_loader.credentials_provider(creds); - } - - // Resolve region using the default chain, falling back to us-east-1 - let region_provider = RegionProviderChain::default_provider().or_else("us-east-1"); - let base_config = config_loader.region(region_provider).load().await; - - let client = if let Some(role) = role_arn { - let assume_role = aws_config::sts::AssumeRoleProvider::builder(role.to_string()) + // Initial client in default→us-east-1 + let default_region = RegionProviderChain::default_provider().or_else("us-east-1"); + let mut config = build_loader().region(default_region).load().await; + let mut client = if let Some(role) = role_arn { + let assume = aws_config::sts::AssumeRoleProvider::builder(role.to_string()) .session_name("kingfisher") - .configure(&base_config) + .configure(&config) .build() .await; - let conf = aws_sdk_s3::config::Builder::from(&base_config) - .credentials_provider(assume_role) + let conf = aws_sdk_s3::config::Builder::from(&config) + .credentials_provider(assume) .build(); Client::from_conf(conf) } else { - Client::new(&base_config) + Client::new(&config) }; - let mut continuation_token = None; - + let mut continuation_token: Option = None; loop { - let mut req = client.list_objects_v2().bucket(bucket.to_string()); + let mut req = client.list_objects_v2().bucket(bucket); if let Some(p) = prefix { - req = req.prefix(p.to_string()); + req = req.prefix(p); } - if let Some(token) = continuation_token.clone() { + if let Some(ref token) = continuation_token { req = req.continuation_token(token); } - let resp = req.send().await.context("Failed to list objects in bucket")?; + let resp = match req.send().await { + Ok(r) => r, - if let Some(objects) = resp.contents { - for obj in objects { - if let Some(key) = obj.key { - let get_resp = client - .get_object() - .bucket(bucket) - .key(&key) + // On error, extract the modeled service error + Err(err) => { + let svc_err: ListObjectsV2Error = err.into_service_error(); // from SdkError :contentReference[oaicite:9]{index=9} + + // If the bucket must be addressed at another region... + if svc_err.code() == Some("PermanentRedirect") { + // HEAD request to get x-amz-bucket-region header + let url = format!("https://{bucket}.s3.amazonaws.com"); + let head = reqwest::Client::new() + .head(&url) .send() .await - .with_context(|| format!("Failed to fetch object {key}"))?; - let data = - get_resp.body.collect().await.context("Failed to read S3 object body")?; - visitor(key, data.into_bytes().to_vec())?; + .context("Failed to HEAD bucket for region")?; + let region_str = head + .headers() + .get("x-amz-bucket-region") + .and_then(|v| v.to_str().ok()) + .unwrap_or("us-east-1") + .to_string(); + + // Rebuild client in the correct region + let override_region = RegionProviderChain::first_try(Region::new(region_str)) + .or_else("us-east-1"); + config = build_loader().region(override_region).load().await; + client = if let Some(r) = role_arn { + let assume = aws_config::sts::AssumeRoleProvider::builder(r.to_string()) + .session_name("kingfisher") + .configure(&config) + .build() + .await; + let conf = aws_sdk_s3::config::Builder::from(&config) + .credentials_provider(assume) + .build(); + Client::from_conf(conf) + } else { + Client::new(&config) + }; + + // Reset pagination and retry list + continuation_token = None; + continue; } + + // Any other error is fatal + return Err(svc_err).context("Failed to list objects in bucket"); + } + }; + + // Process objects + for obj in resp.contents.unwrap_or_default() { + if let Some(key) = obj.key { + let data = client + .get_object() + .bucket(bucket) + .key(&key) + .send() + .await + .with_context(|| format!("Failed to fetch object {}", key))? + .body + .collect() + .await + .context("Failed to read S3 object body")? + .into_bytes() + .to_vec(); + visitor(key, data)?; } } + // Continue or finish pagination if resp.is_truncated.unwrap_or(false) { continuation_token = resp.next_continuation_token; } else { @@ -83,4 +141,4 @@ where } Ok(()) -} \ No newline at end of file +} diff --git a/testdata/archive/kfArchiveTest.7z b/testdata/baseline/archive/kfArchiveTest.7z similarity index 100% rename from testdata/archive/kfArchiveTest.7z rename to testdata/baseline/archive/kfArchiveTest.7z diff --git a/testdata/archive/kfArchiveTest.tar b/testdata/baseline/archive/kfArchiveTest.tar similarity index 100% rename from testdata/archive/kfArchiveTest.tar rename to testdata/baseline/archive/kfArchiveTest.tar diff --git a/testdata/archive/kfArchiveTest.tar.bz2 b/testdata/baseline/archive/kfArchiveTest.tar.bz2 similarity index 100% rename from testdata/archive/kfArchiveTest.tar.bz2 rename to testdata/baseline/archive/kfArchiveTest.tar.bz2 diff --git a/testdata/archive/kfArchiveTest.tar.gz b/testdata/baseline/archive/kfArchiveTest.tar.gz similarity index 100% rename from testdata/archive/kfArchiveTest.tar.gz rename to testdata/baseline/archive/kfArchiveTest.tar.gz diff --git a/testdata/archive/kfArchiveTest.tar.lz4 b/testdata/baseline/archive/kfArchiveTest.tar.lz4 similarity index 100% rename from testdata/archive/kfArchiveTest.tar.lz4 rename to testdata/baseline/archive/kfArchiveTest.tar.lz4 diff --git a/testdata/archive/kfArchiveTest.tar.xz b/testdata/baseline/archive/kfArchiveTest.tar.xz similarity index 100% rename from testdata/archive/kfArchiveTest.tar.xz rename to testdata/baseline/archive/kfArchiveTest.tar.xz diff --git a/testdata/archive/kfArchiveTest.zip b/testdata/baseline/archive/kfArchiveTest.zip similarity index 100% rename from testdata/archive/kfArchiveTest.zip rename to testdata/baseline/archive/kfArchiveTest.zip diff --git a/testdata/archive/kfArchiveTest_zip_inside.zip b/testdata/baseline/archive/kfArchiveTest_zip_inside.zip similarity index 100% rename from testdata/archive/kfArchiveTest_zip_inside.zip rename to testdata/baseline/archive/kfArchiveTest_zip_inside.zip diff --git a/testdata/archive/makeArchives.sh b/testdata/baseline/archive/makeArchives.sh similarity index 100% rename from testdata/archive/makeArchives.sh rename to testdata/baseline/archive/makeArchives.sh diff --git a/testdata/archive/template.zip b/testdata/baseline/archive/template.zip similarity index 100% rename from testdata/archive/template.zip rename to testdata/baseline/archive/template.zip From f8789607e7262ac650c2c38ca44e9e221d02b7c7 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sun, 3 Aug 2025 08:14:32 -0700 Subject: [PATCH 03/13] improving s3 bucket scanning feature --- .../{baseline => }/archive/kfArchiveTest.7z | Bin .../{baseline => }/archive/kfArchiveTest.tar | Bin .../archive/kfArchiveTest.tar.bz2 | Bin .../archive/kfArchiveTest.tar.gz | Bin .../archive/kfArchiveTest.tar.lz4 | Bin .../archive/kfArchiveTest.tar.xz | Bin .../{baseline => }/archive/kfArchiveTest.zip | Bin .../archive/kfArchiveTest_zip_inside.zip | Bin .../{baseline => }/archive/makeArchives.sh | 0 testdata/{baseline => }/archive/template.zip | Bin testdata/baseline/baseline_test.go | 95 ------------------ 11 files changed, 95 deletions(-) rename testdata/{baseline => }/archive/kfArchiveTest.7z (100%) rename testdata/{baseline => }/archive/kfArchiveTest.tar (100%) rename testdata/{baseline => }/archive/kfArchiveTest.tar.bz2 (100%) rename testdata/{baseline => }/archive/kfArchiveTest.tar.gz (100%) rename testdata/{baseline => }/archive/kfArchiveTest.tar.lz4 (100%) rename testdata/{baseline => }/archive/kfArchiveTest.tar.xz (100%) rename testdata/{baseline => }/archive/kfArchiveTest.zip (100%) rename testdata/{baseline => }/archive/kfArchiveTest_zip_inside.zip (100%) rename testdata/{baseline => }/archive/makeArchives.sh (100%) rename testdata/{baseline => }/archive/template.zip (100%) delete mode 100644 testdata/baseline/baseline_test.go diff --git a/testdata/baseline/archive/kfArchiveTest.7z b/testdata/archive/kfArchiveTest.7z similarity index 100% rename from testdata/baseline/archive/kfArchiveTest.7z rename to testdata/archive/kfArchiveTest.7z diff --git a/testdata/baseline/archive/kfArchiveTest.tar b/testdata/archive/kfArchiveTest.tar similarity index 100% rename from testdata/baseline/archive/kfArchiveTest.tar rename to testdata/archive/kfArchiveTest.tar diff --git a/testdata/baseline/archive/kfArchiveTest.tar.bz2 b/testdata/archive/kfArchiveTest.tar.bz2 similarity index 100% rename from testdata/baseline/archive/kfArchiveTest.tar.bz2 rename to testdata/archive/kfArchiveTest.tar.bz2 diff --git a/testdata/baseline/archive/kfArchiveTest.tar.gz b/testdata/archive/kfArchiveTest.tar.gz similarity index 100% rename from testdata/baseline/archive/kfArchiveTest.tar.gz rename to testdata/archive/kfArchiveTest.tar.gz diff --git a/testdata/baseline/archive/kfArchiveTest.tar.lz4 b/testdata/archive/kfArchiveTest.tar.lz4 similarity index 100% rename from testdata/baseline/archive/kfArchiveTest.tar.lz4 rename to testdata/archive/kfArchiveTest.tar.lz4 diff --git a/testdata/baseline/archive/kfArchiveTest.tar.xz b/testdata/archive/kfArchiveTest.tar.xz similarity index 100% rename from testdata/baseline/archive/kfArchiveTest.tar.xz rename to testdata/archive/kfArchiveTest.tar.xz diff --git a/testdata/baseline/archive/kfArchiveTest.zip b/testdata/archive/kfArchiveTest.zip similarity index 100% rename from testdata/baseline/archive/kfArchiveTest.zip rename to testdata/archive/kfArchiveTest.zip diff --git a/testdata/baseline/archive/kfArchiveTest_zip_inside.zip b/testdata/archive/kfArchiveTest_zip_inside.zip similarity index 100% rename from testdata/baseline/archive/kfArchiveTest_zip_inside.zip rename to testdata/archive/kfArchiveTest_zip_inside.zip diff --git a/testdata/baseline/archive/makeArchives.sh b/testdata/archive/makeArchives.sh similarity index 100% rename from testdata/baseline/archive/makeArchives.sh rename to testdata/archive/makeArchives.sh diff --git a/testdata/baseline/archive/template.zip b/testdata/archive/template.zip similarity index 100% rename from testdata/baseline/archive/template.zip rename to testdata/archive/template.zip diff --git a/testdata/baseline/baseline_test.go b/testdata/baseline/baseline_test.go deleted file mode 100644 index 00e7770..0000000 --- a/testdata/baseline/baseline_test.go +++ /dev/null @@ -1,95 +0,0 @@ -package core - -import ( - "io/ioutil" - "os" - "path" - "path/filepath" - "runtime" - "testing" - - "github.com/10gen/kingfisher/core" -) - -func rootDir() string { - _, b, _, _ := runtime.Caller(0) - return filepath.Dir(path.Dir(b)) -} - -func NewTestSession(baselineFilename string) (*core.Session, error) { - session := core.PrepareTestSession() - session.Testing = true - session.ReqScanMode = core.LocalFiles - session.Options.ValidateSecrets = true - session.Options.BaselineFilename = baselineFilename - session.Options.KingfisherTempDir = core.GetTempDir() - core.GlobalSessionRef = session - session.InitializeTargetModeClient() - return session, nil -} - -func beginTesting(t *testing.T, testfile string, expectedSkippedFindings, expectedFindingsSuppressKingfisher int) { - rootdir := rootDir() - testfilePath := filepath.Join(rootdir, testfile) - _, filename := filepath.Split(testfilePath) - - byteBaseLine := []byte(`FileContent: - matches: [] -FilePaths: - matches: [] -ExactFindings: - matches: - - filepath: testdata/ruby_vulnerable.rb - findinghash: 701c302855ecc97e8415c44f37123bc2ca0c3343bd87028682aaaeaa90568084 - linenum: 40 - lastupdated: Tue Apr 16 13:04:10 PDT 2024 - - filepath: testdata/ruby_vulnerable.rb - findinghash: 065d1e2faeae9328ca8b2f2754afa6c196d3ef2da2720dabca7e5161d67a6ca1 - linenum: 40 - lastupdated: Tue Apr 16 13:04:10 PDT 2024 -`) - - // Write byteBaseline to a file in a temp directory and give yaml extension - tempFile, err := ioutil.TempFile("", "baseline-*.yaml") - if err != nil { - t.Fatal(err) - } - defer os.Remove(tempFile.Name()) // Clean up the file after test - - if _, err := tempFile.Write(byteBaseLine); err != nil { - t.Fatal(err) - } - if err := tempFile.Close(); err != nil { - t.Fatal(err) - } - - sess, err := NewTestSession(tempFile.Name()) - if err != nil { - t.Fatal(err) - } - - matchFile := core.NewMatchFile(testfilePath, sess, nil) - core.BeginFileAnalysis(matchFile) - if sess.Stats.SkippedFindings != expectedSkippedFindings { - core.PrintSessionStats(sess) - t.Errorf("Expected %d findings, got %d -- file: <%s>", expectedSkippedFindings, sess.Stats.SkippedFindings, filename) - } -} - -func TestBaselineFeature(t *testing.T) { - - tests := []struct { - fileName string - expectedSkippedFindings int - expectedFindingsSuppressKingfisher int - }{ - {"ruby_vulnerable.rb", 3, 0}, - } - - for _, tt := range tests { - t.Run(tt.fileName, func(t *testing.T) { - beginTesting(t, tt.fileName, tt.expectedSkippedFindings, tt.expectedFindingsSuppressKingfisher) - }) - } - -} From 9a3d27f88136b5ac9903f5ab9f16f0f07a895b42 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sun, 3 Aug 2025 08:56:22 -0700 Subject: [PATCH 04/13] added integration test --- tests/int_s3.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tests/int_s3.rs diff --git a/tests/int_s3.rs b/tests/int_s3.rs new file mode 100644 index 0000000..948734c --- /dev/null +++ b/tests/int_s3.rs @@ -0,0 +1,21 @@ +use anyhow::Result; +use kingfisher::s3::visit_bucket_objects; + +#[tokio::test] +async fn test_visit_public_bucket() -> Result<()> { + let mut objects = Vec::new(); + visit_bucket_objects("wikisum", None, None, None, |key, data| { + objects.push((key, data)); + Ok(()) + }) + .await?; + + assert!(objects.iter().any(|(k, _)| k == "README.txt"), "README object not found"); + let creds = objects.iter().find(|(k, _)| k == "README.txt").expect("README object"); + let body = std::str::from_utf8(&creds.1)?; + assert!( + body.contains("This dataset provides how-to articles"), + "expected README file" + ); + Ok(()) +} \ No newline at end of file From 10d604418b2b9541cb33ff345f2b585e697c20d6 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sun, 3 Aug 2025 09:45:52 -0700 Subject: [PATCH 05/13] improved integration test and updated README --- README.md | 55 +++++++++++++++++++++++++++++++++++++++++++++++-- src/s3.rs | 6 +++--- tests/int_s3.rs | 17 ++++++++++----- 3 files changed, 68 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 4af4295..2297709 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Kingfisher originated as a fork of Praetorian's [Nosey Parker](https://github.co ## What Kingfisher Adds - **Live validation** via cloud-provider APIs - **Language-aware detection** (source-code parsing) for ~20 languages -- **Extra targets**: GitLab repos, Docker images, Jira issues, and Slack messages +- **Extra targets**: GitLab repos, S3 buckets, Docker images, Jira issues, and Slack messages - **Baseline mode**: ignore known secrets, flag only new ones - **Native Windows** binary @@ -26,7 +26,7 @@ Kingfisher originated as a fork of Praetorian's [Nosey Parker](https://github.co - **Docker images**: public or private via `--docker-image` - **Jira issues**: JQL‑driven scans with `--jira-url` and `--jql` - **Slack messages**: query‑based scans with `--slack-query` - - **AWS S3**: bucket scans via `--s3-bucket`/`--s3-prefix` with credentials from `KF_AWS_KEY`/`KF_AWS_SECRET`, `--role-arn`, or `--aws-local-profile` + - **AWS S3**: bucket scans via `--s3-bucket`/`--s3-prefix` with credentials from `KF_AWS_KEY`/`KF_AWS_SECRET`, `--role-arn`, `--aws-local-profile`, or anonymous - **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) **Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) @@ -274,6 +274,57 @@ kingfisher scan ./my-project \ --exclude tests \ -v ``` +## Scan an S3 bucket +You can scan S3 objects directly: + +```bash +kingfisher scan --s3-bucket bucket-name [--s3-prefix path/] +``` + +Credential resolution happens in this order: + +1. `KF_AWS_KEY` and `KF_AWS_SECRET` environment variables +2. `--aws-local-profile` pointing to a profile in `~/.aws/config` (works with AWS SSO) +3. anonymous access for public buckets + +If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role. + +Examples: + +```bash +# using explicit keys +export KF_AWS_KEY=AKIA... +export KF_AWS_SECRET=g5nYW... +kingfisher scan --s3-bucket some-example-bucket + +# Above can also be run as: +KF_AWS_KEY=AKIA... KF_AWS_SECRET=g5nYW... kingfisher scan --s3-bucket some-example-bucket + +# using a local profile (e.g., SSO) that exists in your AWS profile (~/.aws/config) +kingfisher scan --s3-bucket some-example-bucket --aws-local-profile myprofile + +# anonymous scan of a bucket, while providing an object prefix to only scan subset of the s3 bucket +kingfisher scan \ + --s3-bucket awsglue-datasets \ + --s3-prefix examples/us-legislators/all + +# assuming a role when scanning +kingfisher scan --s3-bucket some-example-bucket \ + --role-arn arn:aws:iam::123456789012:role/MyRole + +# anonymous scan of a public bucket +kingfisher scan --s3-bucket some-example-bucket +``` + +Docker example: + +```bash +docker run --rm \ + -e KF_AWS_KEY=AKIA... \ + -e KF_AWS_SECRET=g5nYW... \ + ghcr.io/mongodb/kingfisher:latest \ + scan --s3-bucket bucket-name +``` ## Scanning Docker Images Kingfisher will first try to use any locally available image, then fall back to pulling via OCI. diff --git a/src/s3.rs b/src/s3.rs index 7a180f4..ed18a52 100644 --- a/src/s3.rs +++ b/src/s3.rs @@ -1,10 +1,10 @@ use anyhow::{Context, Result}; -use aws_config::{defaults, meta::region::RegionProviderChain, BehaviorVersion, ConfigLoader}; +use aws_config::{defaults, meta::region::RegionProviderChain, BehaviorVersion}; use aws_credential_types::Credentials; use aws_sdk_s3::{ Client, operation::list_objects_v2::ListObjectsV2Error, // modeled service error - error::ProvideErrorMetadata, // for .code() :contentReference[oaicite:8]{index=8} + error::ProvideErrorMetadata, // for .code() }; use aws_types::region::Region; use reqwest; // HTTP client for HEAD fallback @@ -66,7 +66,7 @@ where // On error, extract the modeled service error Err(err) => { - let svc_err: ListObjectsV2Error = err.into_service_error(); // from SdkError :contentReference[oaicite:9]{index=9} + let svc_err: ListObjectsV2Error = err.into_service_error(); // from SdkError // If the bucket must be addressed at another region... if svc_err.code() == Some("PermanentRedirect") { diff --git a/tests/int_s3.rs b/tests/int_s3.rs index 948734c..c44afe8 100644 --- a/tests/int_s3.rs +++ b/tests/int_s3.rs @@ -4,18 +4,25 @@ use kingfisher::s3::visit_bucket_objects; #[tokio::test] async fn test_visit_public_bucket() -> Result<()> { let mut objects = Vec::new(); - visit_bucket_objects("wikisum", None, None, None, |key, data| { + visit_bucket_objects("awsglue-datasets", Some("examples/us-legislators/all/"), None, None, |key, data| { objects.push((key, data)); Ok(()) }) .await?; - assert!(objects.iter().any(|(k, _)| k == "README.txt"), "README object not found"); - let creds = objects.iter().find(|(k, _)| k == "README.txt").expect("README object"); + assert!( + objects.iter().any(|(k, _)| k.ends_with("events.json")), + "events.json object not found" + ); + let creds = objects + .iter() + .find(|(k, _)| k.ends_with("events.json")) + .expect("events.json object"); + let body = std::str::from_utf8(&creds.1)?; assert!( - body.contains("This dataset provides how-to articles"), - "expected README file" + body.contains("Q4450263"), + "expected events.json file" ); Ok(()) } \ No newline at end of file From 141640ef4a1732f8c4ac47375a42499bfe9a7d17 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sun, 3 Aug 2025 10:35:52 -0700 Subject: [PATCH 06/13] Update src/scanner/runner.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/scanner/runner.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs index f8dae87..63f7bee 100644 --- a/src/scanner/runner.rs +++ b/src/scanner/runner.rs @@ -96,9 +96,6 @@ pub async fn run_async_scan( } } - // if input_roots.is_empty() { - // bail!("No inputs to scan"); - // } let shared_profiler = Arc::new(ConcurrentRuleProfiler::new()); let enable_profiling = args.rule_stats; let matcher_stats = Mutex::new(MatcherStats::default()); From 459d4d0ef0560bb9b15841863ed0932167b39c97 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sun, 3 Aug 2025 10:37:02 -0700 Subject: [PATCH 07/13] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- CHANGELOG.md | 1 - Cargo.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 07607fe..2c92a92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,6 @@ All notable changes to this project will be documented in this file. ## [1.32.0] - Added support for scanning AWS S3 buckets via `--s3-bucket` and optional `--s3-prefix` - Added `--role-arn` and `--aws-local-profile` flags for S3 authentication alongside `KF_AWS_KEY`/`KF_AWS_SECRET` -- ## [1.31.0] - New rules: Telegram bot token, OpenWeatherMap, Apify, Groq - New OpenAI detectors added (@joshlarsen) diff --git a/Cargo.toml b/Cargo.toml index 686812d..425db27 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -187,7 +187,6 @@ walkdir = "2.5.0" p256 = "0.13.2" ed25519-dalek = { version = "2.2", features = ["pkcs8"] } aws-sdk-s3 = "1.100.0" -aws-smithy-http = "0.62.2" [dependencies.tikv-jemallocator] version = "0.6" From ef51e77e249c48c17d26920c6abca463abfd66da Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sun, 3 Aug 2025 20:59:58 -0700 Subject: [PATCH 08/13] updating s3 feature --- README.md | 2 +- data/rules/vmware.yml | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 data/rules/vmware.yml diff --git a/README.md b/README.md index 2297709..b60ae55 100644 --- a/README.md +++ b/README.md @@ -301,7 +301,7 @@ kingfisher scan --s3-bucket some-example-bucket KF_AWS_KEY=AKIA... KF_AWS_SECRET=g5nYW... kingfisher scan --s3-bucket some-example-bucket # using a local profile (e.g., SSO) that exists in your AWS profile (~/.aws/config) -kingfisher scan --s3-bucket some-example-bucket --aws-local-profile myprofile +kingfisher scan --s3-bucket some-example-bucket --aws-local-profile default # anonymous scan of a bucket, while providing an object prefix to only scan subset of the s3 bucket kingfisher scan \ diff --git a/data/rules/vmware.yml b/data/rules/vmware.yml new file mode 100644 index 0000000..9332a37 --- /dev/null +++ b/data/rules/vmware.yml @@ -0,0 +1,21 @@ +rules: + - name: Credentials in Connect-VIServer Invocation + id: kingfisher.vmware.1 + pattern: | + (?xi) + Connect-VIServer + .{0,50} + -User \s+ (\S{3,30}) \s+ (?# username ) + .{0,50} + -Password \s+ (\S{3,30}) (?# password ) + + examples: + - 'Connect-VIServer -Server 192.168.1.51 -User administrator@vSphere.local -Password VMware1!' + - | + #Set-PowerCLIConfiguration -InvalidCertificateAction:Ignore + Connect-VIServer "$endpoint" -User "$username" -Password "$password" | Out-Null + - 'Connect-VIServer $ESXiHost.EsxiHost -user $ESXiUser -password $ESXipass' + - '$null = connect-viserver vc.lab.local -user administrator@vsphere.local -password VMware1!' + + references: + - https://developer.broadcom.com/powercli/latest/vmware.vimautomation.core/commands/connect-viserver \ No newline at end of file From 27b37245e71b0d7b7472dfb8ede298037e0c363d Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 4 Aug 2025 08:58:06 -0700 Subject: [PATCH 09/13] refactored output reporting and formatting logic --- CHANGELOG.md | 2 + data/rules/credentials.yml | 2 +- data/rules/generic.yml | 1 - f1.patch | 719 ++++++++++++++++++++++++++++++++++ src/cli/commands/inputs.rs | 1 - src/lib.rs | 2 +- src/reporter.rs | 229 +++++++++-- src/reporter/bson_format.rs | 81 +--- src/reporter/json_format.rs | 500 ++--------------------- src/reporter/pretty_format.rs | 439 ++++----------------- src/reporter/sarif_format.rs | 350 +++-------------- src/s3.rs | 12 +- src/scanner/repos.rs | 15 +- src/scanner/runner.rs | 2 - tests/int_s3.rs | 27 +- 15 files changed, 1108 insertions(+), 1274 deletions(-) create mode 100644 f1.patch diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c92a92..6483f58 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file. ## [1.32.0] - Added support for scanning AWS S3 buckets via `--s3-bucket` and optional `--s3-prefix` - Added `--role-arn` and `--aws-local-profile` flags for S3 authentication alongside `KF_AWS_KEY`/`KF_AWS_SECRET` +- Refactored output reporting and formatting logic + ## [1.31.0] - New rules: Telegram bot token, OpenWeatherMap, Apify, Groq - New OpenAI detectors added (@joshlarsen) diff --git a/data/rules/credentials.yml b/data/rules/credentials.yml index 38c1057..cb7e866 100644 --- a/data/rules/credentials.yml +++ b/data/rules/credentials.yml @@ -20,6 +20,6 @@ rules: [a-z0-9\/._~-]* )? min_entropy: 3.0 - confidence: low + confidence: medium examples: - https://eaRIWNkE:qyOIhJiM@j2LYY414Q5cCYD \ No newline at end of file diff --git a/data/rules/generic.yml b/data/rules/generic.yml index 9338797..3f71d4a 100644 --- a/data/rules/generic.yml +++ b/data/rules/generic.yml @@ -96,7 +96,6 @@ rules: ["'] min_entropy: 3.3 confidence: low - categories: [fuzzy, generic, secret] examples: - | password = "super$ecret" diff --git a/f1.patch b/f1.patch new file mode 100644 index 0000000..a132a31 --- /dev/null +++ b/f1.patch @@ -0,0 +1,719 @@ +diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs +index 9fcb1ecdfe8decc60278848c4a7be43cc9ebee70..b600f9f65838e52ce5dc3d7bb3bb1a5d5ff2bcaf 100644 +--- a/src/reporter/json_format.rs ++++ b/src/reporter/json_format.rs +@@ -1,436 +1,80 @@ +-use http::StatusCode; +-use serde_json::json; +- + use super::*; +-use crate::bstring_escape::Escaped; + + impl DetailsReporter { +- pub fn deduplicate_matches( +- &self, +- matches: Vec, +- no_dedup: bool, +- ) -> Vec { +- if no_dedup { +- return matches; +- } +- +- use std::collections::HashMap; +- let mut by_fp: HashMap = HashMap::new(); +- +- for rm in matches { +- let fp = rm.m.finding_fingerprint; +- if let Some(existing) = by_fp.get_mut(&fp) { +- // merge origin sets (keep first origin, append the rest) +- for o in rm.origin.iter() { +- if !existing.origin.iter().any(|e| e == o) { +- existing.origin = OriginSet::new( +- existing.origin.first().clone(), +- existing +- .origin +- .iter() +- .skip(1) +- .cloned() +- .chain(std::iter::once(o.clone())) +- .collect(), +- ); +- } +- } +- continue; +- } +- by_fp.insert(fp, rm); +- } +- by_fp.into_values().collect() +- } +- +- pub fn gather_json_findings( +- &self, +- args: &cli::commands::scan::ScanArgs, +- ) -> Result> { +- let mut matches = self.get_filtered_matches()?; +- if !args.no_dedup { +- matches = self.deduplicate_matches(matches, args.no_dedup); +- } +- +- let mut json_findings = Vec::new(); +- for rm in matches { +- let source_span = &rm.m.location.source_span; +- let line_num = source_span.start.line; +- +- let snippet = Escaped( +- rm.m.groups +- .captures +- .get(1) +- .or_else(|| rm.m.groups.captures.get(0)) +- .map(|capture| capture.value.as_bytes()) +- .unwrap_or_default(), +- ) +- .to_string(); +- +- let validation_status = if rm.validation_success { +- "Active Credential" +- } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { +- "Not Attempted" +- } else { +- "Inactive Credential" +- }; +- +- const MAX_RESPONSE_LENGTH: usize = 512; +- let truncated_body: String = +- rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); +- let ellipsis = +- if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; +- let response_body = format!("{}{}", truncated_body, ellipsis); +- +- // Call extract_git_metadata on each GitRepo origin and take the first non-null result. +- let git_metadata_val = rm +- .origin +- .iter() +- .filter_map(|origin| { +- if let Origin::GitRepo(e) = origin { +- self.extract_git_metadata(e, source_span) +- } else { +- None +- } +- }) +- .next() +- .unwrap_or(serde_json::Value::Null); +- +- // Collect a file path from an Origin::File, if available. +- let file_path = rm +- .origin +- .iter() +- .find_map(|origin| match origin { +- Origin::File(e) => { +- if let Some(url) = self.jira_issue_url(&e.path, args) { +- Some(url) +- } else if let Some(url) = self.slack_message_url(&e.path) { +- Some(url) +- } else if let Some(mapped) = self.s3_display_path(&e.path) { +- Some(mapped) +- } else if let Some(mapped) = self.docker_display_path(&e.path) { +- Some(mapped) +- } else { +- Some(e.path.display().to_string()) +- } +- } +- Origin::Extended(e) => e.path().map(|p| p.display().to_string()), +- _ => None, +- }) +- .unwrap_or_default(); +- +- let match_json = json!({ +- "rule": { +- "name": rm.m.rule_name, +- "id": rm.m.rule_text_id, +- }, +- "finding": { +- "snippet": snippet, +- "fingerprint": rm.m.finding_fingerprint.to_string(), +- "confidence": rm.match_confidence.to_string(), +- "entropy": format!("{:.2}", rm.m.calculated_entropy), +- "validation": { +- "status": validation_status, +- "response": response_body, +- }, +- "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()), +- "line": line_num, +- "column_start": source_span.start.column, +- "column_end": source_span.end.column, +- "path": file_path, +- "git_metadata": git_metadata_val +- } +- }); +- +- let finding_json = json!({ +- "id": rm.m.rule_text_id, +- "matches": [ match_json ] +- }); +- json_findings.push(finding_json); +- } +- Ok(json_findings) +- } + pub fn json_format( + &self, + mut writer: W, + args: &cli::commands::scan::ScanArgs, + ) -> Result<()> { +- let mut findings = Vec::new(); +- +- // Get filtered matches +- let mut matches = self.get_filtered_matches()?; +- +- // Apply deduplication only if requested +- if !args.no_dedup { +- matches = self.deduplicate_matches(matches, args.no_dedup); +- } +- +- // For each match, handle it based on the no_dedup flag +- for rm in matches { +- if args.no_dedup && rm.origin.len() > 1 { +- // For no_dedup and multiple origins, create separate findings for each origin +- for origin in rm.origin.iter() { +- // Create a single-origin version of this match +- let single_origin_rm = ReportMatch { +- origin: OriginSet::new(origin.clone(), Vec::new()), +- blob_metadata: rm.blob_metadata.clone(), +- m: rm.m.clone(), +- comment: rm.comment.clone(), +- visible: rm.visible, +- match_confidence: rm.match_confidence, +- validation_response_body: rm.validation_response_body.clone(), +- validation_response_status: rm.validation_response_status, +- validation_success: rm.validation_success, +- }; +- +- // Process this single-origin match into a JSON finding +- let json_finding = self.process_match_to_json(&single_origin_rm, args)?; +- findings.push(json_finding); +- } +- } else { +- // Process normally for deduped matches or matches with only one origin +- let json_finding = self.process_match_to_json(&rm, args)?; +- findings.push(json_finding); +- } +- } +- +- // Write the JSON output +- if !findings.is_empty() { +- serde_json::to_writer_pretty(&mut writer, &findings)?; ++ let records = self.build_finding_records(args)?; ++ if !records.is_empty() { ++ serde_json::to_writer_pretty(&mut writer, &records)?; + writeln!(writer)?; + } + Ok(()) + } + +- // Add a helper method to convert a ReportMatch to a JSON finding +- pub fn process_match_to_json( +- &self, +- rm: &ReportMatch, +- args: &cli::commands::scan::ScanArgs, +- ) -> Result { +- // Extract the relevant data from the match as you already do in your current implementation +- let source_span = &rm.m.location.source_span; +- let line_num = source_span.start.line; +- +- let snippet = Escaped( +- rm.m.groups +- .captures +- .get(1) +- .or_else(|| rm.m.groups.captures.get(0)) +- .map(|capture| capture.value.as_bytes()) +- .unwrap_or_default(), +- ) +- .to_string(); +- +- let validation_status = if rm.validation_success { +- "Active Credential" +- } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { +- "Not Attempted" +- } else { +- "Inactive Credential" +- }; +- +- const MAX_RESPONSE_LENGTH: usize = 512; +- let truncated_body: String = +- rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); +- let ellipsis = +- if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; +- let response_body = format!("{}{}", truncated_body, ellipsis); +- +- // Call extract_git_metadata on each GitRepo origin and take the first non-null result. +- let git_metadata_val = rm +- .origin +- .iter() +- .filter_map(|origin| { +- if let Origin::GitRepo(e) = origin { +- self.extract_git_metadata(e, source_span) +- } else { +- None +- } +- }) +- .next() +- .unwrap_or(serde_json::Value::Null); +- +- // Collect a file path from an Origin::File, if available. +- let file_path = rm +- .origin +- .iter() +- .find_map(|origin| { +- if let Origin::File(e) = origin { +- if let Some(url) = self.jira_issue_url(&e.path, args) { +- Some(url) +- } else if let Some(url) = self.slack_message_url(&e.path) { +- Some(url) +- } else if let Some(mapped) = self.s3_display_path(&e.path) { +- Some(mapped) +- } else if let Some(mapped) = self.docker_display_path(&e.path) { +- Some(mapped) +- } else { +- Some(e.path.display().to_string()) +- } +- } else if let Origin::Extended(e) = origin { +- e.path().map(|p| p.display().to_string()) +- } else { +- None +- } +- }) +- .unwrap_or_default(); +- +- let match_json = json!({ +- "rule": { +- "name": rm.m.rule_name, +- "id": rm.m.rule_text_id, +- }, +- "finding": { +- "snippet": snippet, +- "fingerprint": rm.m.finding_fingerprint.to_string(), +- "confidence": rm.match_confidence.to_string(), +- "entropy": format!("{:.2}", rm.m.calculated_entropy), +- "validation": { +- "status": validation_status, +- "response": response_body, +- }, +- "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()), +- "line": line_num, +- "column_start": source_span.start.column, +- "column_end": source_span.end.column, +- "path": file_path, +- "git_metadata": git_metadata_val +- } +- }); +- +- let finding_json = json!({ +- "id": rm.m.rule_text_id, +- "matches": [ match_json ] +- }); +- +- Ok(finding_json) +- } +- // // Modified JSON format to pass args to gather_json_findings +- // pub fn json_format( +- // &self, +- // mut writer: W, +- // args: &cli::commands::scan::ScanArgs, +- // ) -> Result<()> { +- // let findings = self.gather_json_findings(args)?; +- // if !findings.is_empty() { +- // serde_json::to_writer_pretty(&mut writer, &findings)?; +- // writeln!(writer)?; +- // } +- // Ok(()) +- // } +- + pub fn jsonl_format( + &self, + mut writer: W, + args: &cli::commands::scan::ScanArgs, + ) -> Result<()> { +- // Get filtered matches +- let mut matches = self.get_filtered_matches()?; +- +- // Apply deduplication only if requested +- if !args.no_dedup { +- matches = self.deduplicate_matches(matches, args.no_dedup); +- } +- +- // For each match, handle it based on the no_dedup flag +- for rm in matches { +- if args.no_dedup && rm.origin.len() > 1 { +- // For no_dedup and multiple origins, create separate findings for each origin +- for origin in rm.origin.iter() { +- // Create a single-origin version of this match +- let single_origin_rm = ReportMatch { +- origin: OriginSet::new(origin.clone(), Vec::new()), +- blob_metadata: rm.blob_metadata.clone(), +- m: rm.m.clone(), +- comment: rm.comment.clone(), +- visible: rm.visible, +- match_confidence: rm.match_confidence, +- validation_response_body: rm.validation_response_body.clone(), +- validation_response_status: rm.validation_response_status, +- validation_success: rm.validation_success, +- }; +- +- // Process this single-origin match into a JSON finding and write it +- let json_finding = self.process_match_to_json(&single_origin_rm, args)?; +- serde_json::to_writer(&mut writer, &json_finding)?; +- writeln!(writer)?; +- } +- } else { +- // Process normally for deduped matches or matches with only one origin +- let json_finding = self.process_match_to_json(&rm, args)?; +- serde_json::to_writer(&mut writer, &json_finding)?; +- writeln!(writer)?; +- } ++ let records = self.build_finding_records(args)?; ++ for record in records { ++ serde_json::to_writer(&mut writer, &record)?; ++ writeln!(writer)?; + } + Ok(()) + } +- // // Modified JSONL format to pass args to gather_json_findings +- // pub fn jsonl_format( +- // &self, +- // mut writer: W, +- // args: &cli::commands::scan::ScanArgs, +- // ) -> Result<()> { +- // let findings = self.gather_json_findings(args)?; +- // for finding in findings { +- // serde_json::to_writer(&mut writer, &finding)?; +- // writeln!(writer)?; +- // } +- // Ok(()) +- // } + } + + #[cfg(test)] + mod tests { +- use std::{ +- io::Cursor, +- path::PathBuf, +- sync::{Arc, Mutex}, +- }; +- +- use anyhow::Result; +- use serde_json::Value; +- use url::Url; +- + use super::*; + use crate::{ + blob::BlobId, +- cli::commands::{ +- github::{GitCloneMode, GitHistoryMode, GitHubRepoType}, +- inputs::{ContentFilteringArgs, InputSpecifierArgs}, +- output::OutputArgs, +- rules::RuleSpecifierArgs, +- scan::ConfidenceLevel, ++ cli::commands::github::GitHubRepoType, ++ cli::commands::output::{OutputArgs, ReportOutputFormat}, ++ cli::commands::scan::{ ++ ConfidenceLevel, ContentFilteringArgs, GitCloneMode, GitHistoryMode, ++ InputSpecifierArgs, RuleSpecifierArgs, + }, + findings_store::FindingsStore, + location::{Location, OffsetSpan, SourcePoint, SourceSpan}, +- matcher::{Match, SerializableCapture, SerializableCaptures}, +- origin::{Origin, OriginSet}, +- reporter::{ReportMatch, Styles}, +- rules::rule::Confidence, +- util::intern, ++ matcher::serializable::{SerializableCapture, SerializableCaptures}, ++ matcher::Match, ++ origin::Origin, ++ reporter::styles::Styles, ++ scanner::test_utils::intern, + }; ++ use std::{ ++ io::Cursor, ++ path::PathBuf, ++ sync::{Arc, Mutex}, ++ }; ++ use url::Url; + + fn create_default_args() -> cli::commands::scan::ScanArgs { + use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope + + cli::commands::scan::ScanArgs { + num_jobs: 1, + no_dedup: false, + rules: RuleSpecifierArgs { + rules_path: Vec::new(), + rule: vec!["all".into()], + load_builtins: true, + }, + input_specifier_args: InputSpecifierArgs { + // local path / git URL inputs + path_inputs: Vec::new(), + git_url: Vec::new(), + + // GitHub + github_user: Vec::new(), + github_organization: Vec::new(), + all_github_organizations: false, + github_api_url: Url::parse("https://api.github.com/").unwrap(), + github_repo_type: GitHubRepoType::Source, + + // GitLab +diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs +index 9fcb1ecdfe8decc60278848c4a7be43cc9ebee70..b600f9f65838e52ce5dc3d7bb3bb1a5d5ff2bcaf 100644 +--- a/src/reporter/json_format.rs ++++ b/src/reporter/json_format.rs +@@ -458,240 +102,168 @@ mod tests { + git_history: GitHistoryMode::Full, + scan_nested_repos: true, + commit_metadata: true, + }, + content_filtering_args: ContentFilteringArgs { + max_file_size_mb: 25.0, + no_extract_archives: false, + extraction_depth: 2, + exclude: Vec::new(), // Exclude patterns + no_binary: true, + }, + confidence: ConfidenceLevel::Medium, + no_validate: false, + rule_stats: false, + only_valid: false, + min_entropy: None, + redact: false, + git_repo_timeout: 1800, // 30 minutes + output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, + snippet_length: 256, + baseline_file: None, + manage_baseline: false, + } + } + +- // Helper function to create a mock Match + fn create_mock_match( + rule_name: &str, + rule_text_id: &str, + rule_finding_fingerprint: &str, + validation_success: bool, + ) -> Match { + Match { + location: Location { + offset_span: OffsetSpan { start: 10, end: 20 }, + source_span: SourceSpan { + start: SourcePoint { line: 5, column: 10 }, + end: SourcePoint { line: 5, column: 20 }, + }, + }, + groups: SerializableCaptures { + captures: vec![SerializableCapture { + name: Some("token".to_string()), + match_number: 1, + start: 10, + end: 20, + value: "mock_token".into(), + }], + }, + blob_id: BlobId::new(b"mock_blob"), + finding_fingerprint: 0123, + rule_finding_fingerprint: intern(rule_finding_fingerprint), + rule_text_id: intern(rule_text_id), +- rule_name: intern(rule_name), //.to_string(), ++ rule_name: intern(rule_name), + rule_confidence: Confidence::Medium, + validation_response_body: "validation response".to_string(), + validation_response_status: 200, + validation_success, + calculated_entropy: 4.5, + visible: true, + } + } + +- // Helper function to create a mock DetailsReporter + fn setup_mock_reporter(matches: Vec) -> DetailsReporter { + let mut datastore = FindingsStore::new(PathBuf::from("/tmp")); +- // Create mock origin and blob metadata for the first test match + if !matches.is_empty() { + let blob_metadata = BlobMetadata { + id: BlobId::new(b"mock_blob"), + num_bytes: 1024, + mime_essence: Some("text/plain".to_string()), + charset: Some("UTF-8".to_string()), + language: Some("Rust".to_string()), + }; + let dedup = true; +- // Add matches to datastore + for m in matches.clone() { + datastore.record( + vec![( + Arc::new(OriginSet::new( +- // OriginSet -- Arc<…> + Origin::from_file(PathBuf::from("/mock/path/file.rs")), + vec![], + )), +- Arc::new(blob_metadata.clone()), // BlobMetadata -- Arc<…> ++ Arc::new(blob_metadata.clone()), + m.m.clone(), + )], + dedup, + ); + } + } + DetailsReporter { + datastore: Arc::new(Mutex::new(datastore)), + styles: Styles::new(false), + only_valid: false, + } + } ++ + #[test] + fn test_json_format() -> Result<()> { +- // Create a mock match with successful validation + let mock_match = + create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); + let matches = vec![ReportMatch { + origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]), + blob_metadata: BlobMetadata { + id: BlobId::new(b"mock_blob"), + num_bytes: 1024, + mime_essence: Some("text/plain".to_string()), + charset: Some("UTF-8".to_string()), + language: Some("Rust".to_string()), + }, + m: mock_match, + comment: None, + match_confidence: Confidence::Medium, + visible: true, + validation_response_body: "validation response".to_string(), + validation_response_status: 200, + validation_success: true, + }]; + let reporter = setup_mock_reporter(matches); + let mut output = Cursor::new(Vec::new()); +- // Call the json_format method + reporter.json_format(&mut output, &create_default_args())?; +- // Parse and validate JSON output +- let json_output: Vec = serde_json::from_slice(&output.into_inner())?; ++ let json_output: Vec = serde_json::from_slice(&output.into_inner())?; + assert!(!json_output.is_empty(), "JSON output should not be empty"); +- let first_finding = &json_output[0]; +- assert!(first_finding.get("id").is_some(), "Finding should have an 'id'"); +- assert!(first_finding.get("matches").is_some(), "Finding should have 'matches'"); +- // Validate the structure of the first match +- let matches = first_finding.get("matches").unwrap().as_array().unwrap(); +- let first_match = &matches[0]; +- assert_eq!(first_match.get("rule").unwrap().get("name").unwrap(), "MockRule"); +- assert_eq!(first_match.get("finding").unwrap().get("language").unwrap(), "Rust"); ++ let first = &json_output[0]; ++ assert_eq!(first["rule"]["name"], "MockRule"); ++ assert_eq!(first["finding"]["language"], "Rust"); + Ok(()) + } + +- // #[test] +- // fn test_jsonl_format() -> Result<()> { +- // // Create a mock match with successful validation +- // let mock_match = +- // create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); +- // let matches = vec![ReportMatch { +- // origin: OriginSet::new( +- // Origin::from_file(PathBuf::from("/mock/path/file.rs")), +- // vec![], +- // ), +- // blob_metadata: BlobMetadata { +- // id: BlobId::new(b"mock_blob"), +- // num_bytes: 1024, +- // mime_essence: Some("text/plain".to_string()), +- // charset: Some("UTF-8".to_string()), +- // language: Some("Rust".to_string()), +- // }, +- // m: mock_match, +- // comment: None, +- // match_confidence: Confidence::Medium, +- // visible: true, +- // validation_response_body: "validation response".to_string(), +- // validation_response_status: 200, +- // validation_success: true, +- // }]; +- // let reporter = setup_mock_reporter(matches); +- // let mut output = Cursor::new(Vec::new()); +- // // Call the jsonl_format method +- // reporter.jsonl_format(&mut output, &create_default_args())?; +- // // Split output into lines and validate +- // let jsonl_output = String::from_utf8(output.into_inner())?; +- // let lines: Vec<&str> = jsonl_output.lines().collect(); +- // assert!(!lines.is_empty(), "JSONL output should not be empty"); +- // for line in &lines { +- // let json_value: serde_json::Value = serde_json::from_str(line)?; +- // assert!( +- // json_value.get("rule_name").is_some(), +- // "Each line should have a 'rule_name'" +- // ); +- // assert!( +- // json_value.get("matches").is_some(), +- // "Each line should have 'matches'" +- // ); +- // } +- // Ok(()) +- // } +- + #[test] + fn test_validation_status_in_json() -> Result<()> { +- // Test validation status in JSON output + let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")]; + for (validation_success, expected_status) in test_cases { + let mock_match = create_mock_match( + "MockRule", + "mock_rule_1", + "mock_finding_fingerprint", + validation_success, + ); + let matches = vec![ReportMatch { + origin: OriginSet::new( + Origin::from_file(PathBuf::from("/mock/path/file.rs")), + vec![], + ), + blob_metadata: BlobMetadata { + id: BlobId::new(b"mock_blob"), + num_bytes: 1024, + mime_essence: Some("text/plain".to_string()), + charset: Some("UTF-8".to_string()), + language: Some("Rust".to_string()), + }, + m: mock_match, + comment: None, + match_confidence: Confidence::Medium, + visible: true, + validation_response_body: "validation response".to_string(), + validation_response_status: 200, + validation_success, + }]; + let reporter = setup_mock_reporter(matches); + let mut output = Cursor::new(Vec::new()); +- // Call the json_format method + reporter.json_format(&mut output, &create_default_args())?; +- // Parse and validate JSON output +- let json_output: Vec = serde_json::from_slice(&output.into_inner())?; ++ let json_output: Vec = serde_json::from_slice(&output.into_inner())?; + assert!(!json_output.is_empty(), "JSON output should not be empty"); +- let first_finding = &json_output[0]; +- let matches = first_finding.get("matches").unwrap().as_array().unwrap(); +- let first_match = &matches[0]; +- let validation_status = first_match +- .get("finding") +- .unwrap() +- .get("validation") +- .unwrap() +- .get("status") +- .unwrap() +- .as_str() +- .unwrap(); ++ let first = &json_output[0]; ++ let validation_status = first["finding"]["validation"]["status"].as_str().unwrap(); + assert_eq!(validation_status, expected_status); + } + Ok(()) + } + } diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs index ea38722..13bc78b 100644 --- a/src/cli/commands/inputs.rs +++ b/src/cli/commands/inputs.rs @@ -124,7 +124,6 @@ pub struct InputSpecifierArgs { #[arg(long, requires = "s3_bucket")] pub aws_local_profile: Option, - /// Docker/OCI images to scan (no local Docker required) #[arg(long = "docker-image")] pub docker_image: Vec, diff --git a/src/lib.rs b/src/lib.rs index 90d0451..04f7303 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,8 +29,8 @@ pub mod rule_loader; pub mod rule_profiling; pub mod rules; pub mod rules_database; -pub mod safe_list; pub mod s3; +pub mod safe_list; pub mod scanner; pub mod scanner_pool; pub mod serde_utils; diff --git a/src/reporter.rs b/src/reporter.rs index ad0efe9..5159651 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -5,12 +5,12 @@ use std::{ use anyhow::Result; use http::StatusCode; -use indenter::indented; use schemars::JsonSchema; use serde::Serialize; use crate::{ blob::BlobMetadata, + bstring_escape::Escaped, cli, cli::global::GlobalArgs, finding_data, findings_store, @@ -226,38 +226,6 @@ impl DetailsReporter { .collect()) } - // fn process_matches(&self, only_valid: bool) -> Result> { - // let datastore = self.datastore.lock().unwrap(); - // Ok(datastore - // .get_matches() - // .iter() - // .filter(|msg| { - // let (_origin, _blob_metadata, match_item) = &***msg; - // if only_valid { - // match_item.validation_success - // && match_item.validation_response_status != StatusCode::CONTINUE.as_u16() - // && match_item.visible - // } else { - // match_item.visible - // } - // }) - // .map(|msg| { - // let (origin, blob_metadata, match_item) = &**msg; - // ReportMatch { - // origin: origin.clone(), - // blob_metadata: blob_metadata.clone(), - // m: match_item.clone(), - // comment: None, - // visible: match_item.visible, - // match_confidence: match_item.rule_confidence, - // validation_response_body: match_item.validation_response_body.clone(), - // validation_response_status: match_item.validation_response_status, - // validation_success: match_item.validation_success, - // } - // }) - // .collect()) - // } - pub fn get_filtered_matches(&self) -> Result> { self.process_matches(self.only_valid, true) } @@ -266,6 +234,166 @@ impl DetailsReporter { self.process_matches(only_valid.unwrap_or(self.only_valid), false) } + pub fn deduplicate_matches( + &self, + matches: Vec, + no_dedup: bool, + ) -> Vec { + if no_dedup { + return matches; + } + + use std::collections::HashMap; + let mut by_fp: HashMap = HashMap::new(); + + for rm in matches { + let fp = rm.m.finding_fingerprint; + if let Some(existing) = by_fp.get_mut(&fp) { + // merge origin sets (keep first origin, append the rest) + for o in rm.origin.iter() { + if !existing.origin.iter().any(|e| e == o) { + existing.origin = OriginSet::new( + existing.origin.first().clone(), + existing + .origin + .iter() + .skip(1) + .cloned() + .chain(std::iter::once(o.clone())) + .collect(), + ); + } + } + continue; + } + by_fp.insert(fp, rm); + } + by_fp.into_values().collect() + } + + fn matches_for_output(&self, args: &cli::commands::scan::ScanArgs) -> Result> { + let mut matches = self.get_filtered_matches()?; + if !args.no_dedup { + matches = self.deduplicate_matches(matches, args.no_dedup); + } + if args.no_dedup { + let mut expanded = Vec::new(); + for rm in matches { + if rm.origin.len() > 1 { + for origin in rm.origin.iter() { + let mut single = rm.clone(); + single.origin = OriginSet::new(origin.clone(), Vec::new()); + expanded.push(single); + } + } else { + expanded.push(rm); + } + } + matches = expanded; + } + Ok(matches) + } + + pub fn build_finding_record( + &self, + rm: &ReportMatch, + args: &cli::commands::scan::ScanArgs, + ) -> FindingReporterRecord { + let source_span = &rm.m.location.source_span; + let line_num = source_span.start.line; + + let snippet = Escaped( + rm.m.groups + .captures + .get(1) + .or_else(|| rm.m.groups.captures.get(0)) + .map(|capture| capture.value.as_bytes()) + .unwrap_or_default(), + ) + .to_string(); + + let validation_status = if rm.validation_success { + "Active Credential".to_string() + } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { + "Not Attempted".to_string() + } else { + "Inactive Credential".to_string() + }; + + const MAX_RESPONSE_LENGTH: usize = 512; + let truncated_body: String = + rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); + let ellipsis = + if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; + let response_body = format!("{}{}", truncated_body, ellipsis); + + let git_metadata_val = rm + .origin + .iter() + .filter_map(|origin| { + if let Origin::GitRepo(e) = origin { + self.extract_git_metadata(e, source_span) + } else { + None + } + }) + .next(); + + let file_path = rm + .origin + .iter() + .find_map(|origin| match origin { + Origin::File(e) => { + if let Some(url) = self.jira_issue_url(&e.path, args) { + Some(url) + } else if let Some(url) = self.slack_message_url(&e.path) { + Some(url) + } else if let Some(mapped) = self.s3_display_path(&e.path) { + Some(mapped) + } else if let Some(mapped) = self.docker_display_path(&e.path) { + Some(mapped) + } else { + Some(e.path.display().to_string()) + } + } + Origin::Extended(e) => e.path().map(|p| p.display().to_string()), + _ => None, + }) + .unwrap_or_default(); + + FindingReporterRecord { + rule: RuleMetadata { + name: rm.m.rule_name.to_string().clone(), + id: rm.m.rule_text_id.to_string().clone(), + }, + finding: FindingRecordData { + snippet, + fingerprint: rm.m.finding_fingerprint.to_string(), + confidence: rm.match_confidence.to_string(), + entropy: format!("{:.2}", rm.m.calculated_entropy), + validation: ValidationInfo { status: validation_status, response: response_body }, + language: rm + .blob_metadata + .language + .clone() + .unwrap_or_else(|| "Unknown".to_string()), + line: line_num as u32, + column_start: source_span.start.column as u32, + column_end: source_span.end.column as u32, + path: file_path, + git_metadata: git_metadata_val, + }, + } + } + + pub fn build_finding_records( + &self, + args: &cli::commands::scan::ScanArgs, + ) -> Result> { + let matches = self.matches_for_output(args)?; + Ok(matches.iter().map(|rm| self.build_finding_record(rm, args)).collect()) + } + fn get_finding_data(&self) -> Result> { let datastore = self.datastore.lock().unwrap(); Ok(datastore @@ -388,6 +516,41 @@ pub struct ReportMatch { /// Validation Success pub validation_success: bool, } + +#[derive(Serialize, JsonSchema, Clone, Debug)] +pub struct FindingReporterRecord { + pub rule: RuleMetadata, + pub finding: FindingRecordData, +} + +#[derive(Serialize, JsonSchema, Clone, Debug)] +pub struct RuleMetadata { + pub name: String, + pub id: String, +} + +#[derive(Serialize, JsonSchema, Clone, Debug)] +pub struct ValidationInfo { + pub status: String, + pub response: String, +} + +#[derive(Serialize, JsonSchema, Clone, Debug)] +pub struct FindingRecordData { + pub snippet: String, + pub fingerprint: String, + pub confidence: String, + pub entropy: String, + pub validation: ValidationInfo, + pub language: String, + pub line: u32, + pub column_start: u32, + pub column_end: u32, + pub path: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub git_metadata: Option, +} + impl From for ReportMatch { fn from(e: finding_data::FindingDataEntry) -> Self { ReportMatch { diff --git a/src/reporter/bson_format.rs b/src/reporter/bson_format.rs index c1470d0..6691c3b 100644 --- a/src/reporter/bson_format.rs +++ b/src/reporter/bson_format.rs @@ -1,90 +1,17 @@ -use bson::Document; -use serde_json::Value; - use super::*; + impl DetailsReporter { /// Formats findings as BSON and writes them to the provided writer. - /// For testing purposes, prints the full JSON for each finding before converting. pub fn bson_format( &self, mut writer: W, args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - // Get filtered matches - let mut matches = self.get_filtered_matches()?; - - // Apply deduplication only if requested - if !args.no_dedup { - matches = self.deduplicate_matches(matches, args.no_dedup); - } - - let mut bson_findings = Vec::new(); - - // For each match, handle it based on the no_dedup flag - for rm in matches { - if args.no_dedup && rm.origin.len() > 1 { - // For no_dedup and multiple origins, create separate findings for each origin - for origin in rm.origin.iter() { - // Create a single-origin version of this match - let single_origin_rm = ReportMatch { - origin: OriginSet::new(origin.clone(), Vec::new()), - blob_metadata: rm.blob_metadata.clone(), - m: rm.m.clone(), - comment: rm.comment.clone(), - visible: rm.visible, - match_confidence: rm.match_confidence, - validation_response_body: rm.validation_response_body.clone(), - validation_response_status: rm.validation_response_status, - validation_success: rm.validation_success, - }; - - // Process to JSON first, then convert to BSON - let json_finding = self.process_match_to_json(&single_origin_rm, args)?; - if let Ok(bson_doc) = json_to_bson_document(&json_finding) { - bson_findings.push(bson_doc); - } - } - } else { - // Process normally for deduped matches or matches with only one origin - let json_finding = self.process_match_to_json(&rm, args)?; - if let Ok(bson_doc) = json_to_bson_document(&json_finding) { - bson_findings.push(bson_doc); - } - } - } - - // Write each BSON document - for doc in bson_findings { + let records = self.build_finding_records(args)?; + for record in records { + let doc = bson::to_document(&record)?; doc.to_writer(&mut writer)?; } Ok(()) } - // pub fn bson_format( - // &self, - // mut writer: W, - // args: &cli::commands::scan::ScanArgs, - // ) -> Result<()> { - // let findings = self.gather_json_findings(args)?; - - // // Print the full JSON for each finding - // for finding in &findings { - // println!("Full JSON:\n{}", serde_json::to_string_pretty(finding)?); - // } - - // let bson_findings: Vec = findings - // .into_iter() - // .filter_map(|finding| json_to_bson_document(&finding).ok()) - // .collect(); - // for doc in bson_findings { - // doc.to_writer(&mut writer)?; - // } - // Ok(()) - // } -} - -fn json_to_bson_document(json: &Value) -> Result { - match bson::to_bson(json)? { - bson::Bson::Document(doc) => Ok(doc), - _ => Err(anyhow::anyhow!("Failed to convert JSON to BSON document")), - } } diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 9fcb1ec..b64a777 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -1,415 +1,61 @@ -use http::StatusCode; -use serde_json::json; - use super::*; -use crate::bstring_escape::Escaped; +use serde_json::Value; impl DetailsReporter { - pub fn deduplicate_matches( - &self, - matches: Vec, - no_dedup: bool, - ) -> Vec { - if no_dedup { - return matches; - } - - use std::collections::HashMap; - let mut by_fp: HashMap = HashMap::new(); - - for rm in matches { - let fp = rm.m.finding_fingerprint; - if let Some(existing) = by_fp.get_mut(&fp) { - // merge origin sets (keep first origin, append the rest) - for o in rm.origin.iter() { - if !existing.origin.iter().any(|e| e == o) { - existing.origin = OriginSet::new( - existing.origin.first().clone(), - existing - .origin - .iter() - .skip(1) - .cloned() - .chain(std::iter::once(o.clone())) - .collect(), - ); - } - } - continue; - } - by_fp.insert(fp, rm); - } - by_fp.into_values().collect() - } - - pub fn gather_json_findings( - &self, - args: &cli::commands::scan::ScanArgs, - ) -> Result> { - let mut matches = self.get_filtered_matches()?; - if !args.no_dedup { - matches = self.deduplicate_matches(matches, args.no_dedup); - } - - let mut json_findings = Vec::new(); - for rm in matches { - let source_span = &rm.m.location.source_span; - let line_num = source_span.start.line; - - let snippet = Escaped( - rm.m.groups - .captures - .get(1) - .or_else(|| rm.m.groups.captures.get(0)) - .map(|capture| capture.value.as_bytes()) - .unwrap_or_default(), - ) - .to_string(); - - let validation_status = if rm.validation_success { - "Active Credential" - } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { - "Not Attempted" - } else { - "Inactive Credential" - }; - - const MAX_RESPONSE_LENGTH: usize = 512; - let truncated_body: String = - rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); - let ellipsis = - if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; - let response_body = format!("{}{}", truncated_body, ellipsis); - - // Call extract_git_metadata on each GitRepo origin and take the first non-null result. - let git_metadata_val = rm - .origin - .iter() - .filter_map(|origin| { - if let Origin::GitRepo(e) = origin { - self.extract_git_metadata(e, source_span) - } else { - None - } - }) - .next() - .unwrap_or(serde_json::Value::Null); - - // Collect a file path from an Origin::File, if available. - let file_path = rm - .origin - .iter() - .find_map(|origin| match origin { - Origin::File(e) => { - if let Some(url) = self.jira_issue_url(&e.path, args) { - Some(url) - } else if let Some(url) = self.slack_message_url(&e.path) { - Some(url) - } else if let Some(mapped) = self.s3_display_path(&e.path) { - Some(mapped) - } else if let Some(mapped) = self.docker_display_path(&e.path) { - Some(mapped) - } else { - Some(e.path.display().to_string()) - } - } - Origin::Extended(e) => e.path().map(|p| p.display().to_string()), - _ => None, - }) - .unwrap_or_default(); - - let match_json = json!({ - "rule": { - "name": rm.m.rule_name, - "id": rm.m.rule_text_id, - }, - "finding": { - "snippet": snippet, - "fingerprint": rm.m.finding_fingerprint.to_string(), - "confidence": rm.match_confidence.to_string(), - "entropy": format!("{:.2}", rm.m.calculated_entropy), - "validation": { - "status": validation_status, - "response": response_body, - }, - "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()), - "line": line_num, - "column_start": source_span.start.column, - "column_end": source_span.end.column, - "path": file_path, - "git_metadata": git_metadata_val - } - }); - - let finding_json = json!({ - "id": rm.m.rule_text_id, - "matches": [ match_json ] - }); - json_findings.push(finding_json); - } - Ok(json_findings) - } pub fn json_format( &self, mut writer: W, args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - let mut findings = Vec::new(); - - // Get filtered matches - let mut matches = self.get_filtered_matches()?; - - // Apply deduplication only if requested - if !args.no_dedup { - matches = self.deduplicate_matches(matches, args.no_dedup); - } - - // For each match, handle it based on the no_dedup flag - for rm in matches { - if args.no_dedup && rm.origin.len() > 1 { - // For no_dedup and multiple origins, create separate findings for each origin - for origin in rm.origin.iter() { - // Create a single-origin version of this match - let single_origin_rm = ReportMatch { - origin: OriginSet::new(origin.clone(), Vec::new()), - blob_metadata: rm.blob_metadata.clone(), - m: rm.m.clone(), - comment: rm.comment.clone(), - visible: rm.visible, - match_confidence: rm.match_confidence, - validation_response_body: rm.validation_response_body.clone(), - validation_response_status: rm.validation_response_status, - validation_success: rm.validation_success, - }; - - // Process this single-origin match into a JSON finding - let json_finding = self.process_match_to_json(&single_origin_rm, args)?; - findings.push(json_finding); - } - } else { - // Process normally for deduped matches or matches with only one origin - let json_finding = self.process_match_to_json(&rm, args)?; - findings.push(json_finding); - } - } - - // Write the JSON output - if !findings.is_empty() { - serde_json::to_writer_pretty(&mut writer, &findings)?; + let records = self.build_finding_records(args)?; + if !records.is_empty() { + serde_json::to_writer_pretty(&mut writer, &records)?; writeln!(writer)?; } Ok(()) } - // Add a helper method to convert a ReportMatch to a JSON finding - pub fn process_match_to_json( - &self, - rm: &ReportMatch, - args: &cli::commands::scan::ScanArgs, - ) -> Result { - // Extract the relevant data from the match as you already do in your current implementation - let source_span = &rm.m.location.source_span; - let line_num = source_span.start.line; - - let snippet = Escaped( - rm.m.groups - .captures - .get(1) - .or_else(|| rm.m.groups.captures.get(0)) - .map(|capture| capture.value.as_bytes()) - .unwrap_or_default(), - ) - .to_string(); - - let validation_status = if rm.validation_success { - "Active Credential" - } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { - "Not Attempted" - } else { - "Inactive Credential" - }; - - const MAX_RESPONSE_LENGTH: usize = 512; - let truncated_body: String = - rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); - let ellipsis = - if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; - let response_body = format!("{}{}", truncated_body, ellipsis); - - // Call extract_git_metadata on each GitRepo origin and take the first non-null result. - let git_metadata_val = rm - .origin - .iter() - .filter_map(|origin| { - if let Origin::GitRepo(e) = origin { - self.extract_git_metadata(e, source_span) - } else { - None - } - }) - .next() - .unwrap_or(serde_json::Value::Null); - - // Collect a file path from an Origin::File, if available. - let file_path = rm - .origin - .iter() - .find_map(|origin| { - if let Origin::File(e) = origin { - if let Some(url) = self.jira_issue_url(&e.path, args) { - Some(url) - } else if let Some(url) = self.slack_message_url(&e.path) { - Some(url) - } else if let Some(mapped) = self.s3_display_path(&e.path) { - Some(mapped) - } else if let Some(mapped) = self.docker_display_path(&e.path) { - Some(mapped) - } else { - Some(e.path.display().to_string()) - } - } else if let Origin::Extended(e) = origin { - e.path().map(|p| p.display().to_string()) - } else { - None - } - }) - .unwrap_or_default(); - - let match_json = json!({ - "rule": { - "name": rm.m.rule_name, - "id": rm.m.rule_text_id, - }, - "finding": { - "snippet": snippet, - "fingerprint": rm.m.finding_fingerprint.to_string(), - "confidence": rm.match_confidence.to_string(), - "entropy": format!("{:.2}", rm.m.calculated_entropy), - "validation": { - "status": validation_status, - "response": response_body, - }, - "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()), - "line": line_num, - "column_start": source_span.start.column, - "column_end": source_span.end.column, - "path": file_path, - "git_metadata": git_metadata_val - } - }); - - let finding_json = json!({ - "id": rm.m.rule_text_id, - "matches": [ match_json ] - }); - - Ok(finding_json) - } - // // Modified JSON format to pass args to gather_json_findings - // pub fn json_format( - // &self, - // mut writer: W, - // args: &cli::commands::scan::ScanArgs, - // ) -> Result<()> { - // let findings = self.gather_json_findings(args)?; - // if !findings.is_empty() { - // serde_json::to_writer_pretty(&mut writer, &findings)?; - // writeln!(writer)?; - // } - // Ok(()) - // } - pub fn jsonl_format( &self, mut writer: W, args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - // Get filtered matches - let mut matches = self.get_filtered_matches()?; - - // Apply deduplication only if requested - if !args.no_dedup { - matches = self.deduplicate_matches(matches, args.no_dedup); - } - - // For each match, handle it based on the no_dedup flag - for rm in matches { - if args.no_dedup && rm.origin.len() > 1 { - // For no_dedup and multiple origins, create separate findings for each origin - for origin in rm.origin.iter() { - // Create a single-origin version of this match - let single_origin_rm = ReportMatch { - origin: OriginSet::new(origin.clone(), Vec::new()), - blob_metadata: rm.blob_metadata.clone(), - m: rm.m.clone(), - comment: rm.comment.clone(), - visible: rm.visible, - match_confidence: rm.match_confidence, - validation_response_body: rm.validation_response_body.clone(), - validation_response_status: rm.validation_response_status, - validation_success: rm.validation_success, - }; - - // Process this single-origin match into a JSON finding and write it - let json_finding = self.process_match_to_json(&single_origin_rm, args)?; - serde_json::to_writer(&mut writer, &json_finding)?; - writeln!(writer)?; - } - } else { - // Process normally for deduped matches or matches with only one origin - let json_finding = self.process_match_to_json(&rm, args)?; - serde_json::to_writer(&mut writer, &json_finding)?; - writeln!(writer)?; - } + let records = self.build_finding_records(args)?; + for record in records { + serde_json::to_writer(&mut writer, &record)?; + writeln!(writer)?; } Ok(()) } - // // Modified JSONL format to pass args to gather_json_findings - // pub fn jsonl_format( - // &self, - // mut writer: W, - // args: &cli::commands::scan::ScanArgs, - // ) -> Result<()> { - // let findings = self.gather_json_findings(args)?; - // for finding in findings { - // serde_json::to_writer(&mut writer, &finding)?; - // writeln!(writer)?; - // } - // Ok(()) - // } } #[cfg(test)] mod tests { + use super::*; + use crate::cli::commands::github::GitCloneMode; + use crate::cli::commands::github::GitHistoryMode; + use crate::cli::commands::rules::RuleSpecifierArgs; + use crate::matcher::{SerializableCapture, SerializableCaptures}; + use crate::util::intern; + use crate::{ + blob::BlobId, + cli::commands::github::GitHubRepoType, + cli::commands::inputs::ContentFilteringArgs, + cli::commands::inputs::InputSpecifierArgs, + cli::commands::output::{OutputArgs, ReportOutputFormat}, + cli::commands::scan::ConfidenceLevel, + findings_store::FindingsStore, + location::{Location, OffsetSpan, SourcePoint, SourceSpan}, + matcher::Match, + origin::Origin, + reporter::styles::Styles, + }; use std::{ io::Cursor, path::PathBuf, sync::{Arc, Mutex}, }; - - use anyhow::Result; - use serde_json::Value; use url::Url; - - use super::*; - use crate::{ - blob::BlobId, - cli::commands::{ - github::{GitCloneMode, GitHistoryMode, GitHubRepoType}, - inputs::{ContentFilteringArgs, InputSpecifierArgs}, - output::OutputArgs, - rules::RuleSpecifierArgs, - scan::ConfidenceLevel, - }, - findings_store::FindingsStore, - location::{Location, OffsetSpan, SourcePoint, SourceSpan}, - matcher::{Match, SerializableCapture, SerializableCaptures}, - origin::{Origin, OriginSet}, - reporter::{ReportMatch, Styles}, - rules::rule::Confidence, - util::intern, - }; - fn create_default_args() -> cli::commands::scan::ScanArgs { use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope @@ -480,7 +126,6 @@ mod tests { } } - // Helper function to create a mock Match fn create_mock_match( rule_name: &str, rule_text_id: &str, @@ -508,7 +153,7 @@ mod tests { finding_fingerprint: 0123, rule_finding_fingerprint: intern(rule_finding_fingerprint), rule_text_id: intern(rule_text_id), - rule_name: intern(rule_name), //.to_string(), + rule_name: intern(rule_name), rule_confidence: Confidence::Medium, validation_response_body: "validation response".to_string(), validation_response_status: 200, @@ -518,10 +163,8 @@ mod tests { } } - // Helper function to create a mock DetailsReporter fn setup_mock_reporter(matches: Vec) -> DetailsReporter { let mut datastore = FindingsStore::new(PathBuf::from("/tmp")); - // Create mock origin and blob metadata for the first test match if !matches.is_empty() { let blob_metadata = BlobMetadata { id: BlobId::new(b"mock_blob"), @@ -531,16 +174,14 @@ mod tests { language: Some("Rust".to_string()), }; let dedup = true; - // Add matches to datastore for m in matches.clone() { datastore.record( vec![( Arc::new(OriginSet::new( - // OriginSet -- Arc<…> Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![], )), - Arc::new(blob_metadata.clone()), // BlobMetadata -- Arc<…> + Arc::new(blob_metadata.clone()), m.m.clone(), )], dedup, @@ -553,9 +194,9 @@ mod tests { only_valid: false, } } + #[test] fn test_json_format() -> Result<()> { - // Create a mock match with successful validation let mock_match = create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); let matches = vec![ReportMatch { @@ -577,72 +218,17 @@ mod tests { }]; let reporter = setup_mock_reporter(matches); let mut output = Cursor::new(Vec::new()); - // Call the json_format method reporter.json_format(&mut output, &create_default_args())?; - // Parse and validate JSON output - let json_output: Vec = serde_json::from_slice(&output.into_inner())?; + let json_output: Vec = serde_json::from_slice(&output.into_inner())?; assert!(!json_output.is_empty(), "JSON output should not be empty"); - let first_finding = &json_output[0]; - assert!(first_finding.get("id").is_some(), "Finding should have an 'id'"); - assert!(first_finding.get("matches").is_some(), "Finding should have 'matches'"); - // Validate the structure of the first match - let matches = first_finding.get("matches").unwrap().as_array().unwrap(); - let first_match = &matches[0]; - assert_eq!(first_match.get("rule").unwrap().get("name").unwrap(), "MockRule"); - assert_eq!(first_match.get("finding").unwrap().get("language").unwrap(), "Rust"); + let first = &json_output[0]; + assert_eq!(first["rule"]["name"], "MockRule"); + assert_eq!(first["finding"]["language"], "Rust"); Ok(()) } - // #[test] - // fn test_jsonl_format() -> Result<()> { - // // Create a mock match with successful validation - // let mock_match = - // create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); - // let matches = vec![ReportMatch { - // origin: OriginSet::new( - // Origin::from_file(PathBuf::from("/mock/path/file.rs")), - // vec![], - // ), - // blob_metadata: BlobMetadata { - // id: BlobId::new(b"mock_blob"), - // num_bytes: 1024, - // mime_essence: Some("text/plain".to_string()), - // charset: Some("UTF-8".to_string()), - // language: Some("Rust".to_string()), - // }, - // m: mock_match, - // comment: None, - // match_confidence: Confidence::Medium, - // visible: true, - // validation_response_body: "validation response".to_string(), - // validation_response_status: 200, - // validation_success: true, - // }]; - // let reporter = setup_mock_reporter(matches); - // let mut output = Cursor::new(Vec::new()); - // // Call the jsonl_format method - // reporter.jsonl_format(&mut output, &create_default_args())?; - // // Split output into lines and validate - // let jsonl_output = String::from_utf8(output.into_inner())?; - // let lines: Vec<&str> = jsonl_output.lines().collect(); - // assert!(!lines.is_empty(), "JSONL output should not be empty"); - // for line in &lines { - // let json_value: serde_json::Value = serde_json::from_str(line)?; - // assert!( - // json_value.get("rule_name").is_some(), - // "Each line should have a 'rule_name'" - // ); - // assert!( - // json_value.get("matches").is_some(), - // "Each line should have 'matches'" - // ); - // } - // Ok(()) - // } - #[test] fn test_validation_status_in_json() -> Result<()> { - // Test validation status in JSON output let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")]; for (validation_success, expected_status) in test_cases { let mock_match = create_mock_match( @@ -673,23 +259,11 @@ mod tests { }]; let reporter = setup_mock_reporter(matches); let mut output = Cursor::new(Vec::new()); - // Call the json_format method reporter.json_format(&mut output, &create_default_args())?; - // Parse and validate JSON output - let json_output: Vec = serde_json::from_slice(&output.into_inner())?; + let json_output: Vec = serde_json::from_slice(&output.into_inner())?; assert!(!json_output.is_empty(), "JSON output should not be empty"); - let first_finding = &json_output[0]; - let matches = first_finding.get("matches").unwrap().as_array().unwrap(); - let first_match = &matches[0]; - let validation_status = first_match - .get("finding") - .unwrap() - .get("validation") - .unwrap() - .get("status") - .unwrap() - .as_str() - .unwrap(); + let first = &json_output[0]; + let validation_status = first["finding"]["validation"]["status"].as_str().unwrap(); assert_eq!(validation_status, expected_status); } Ok(()) diff --git a/src/reporter/pretty_format.rs b/src/reporter/pretty_format.rs index 942e7ad..6790a44 100644 --- a/src/reporter/pretty_format.rs +++ b/src/reporter/pretty_format.rs @@ -1,413 +1,126 @@ use std::fmt::{Display, Formatter, Result as FmtResult}; -use http::StatusCode; +use indenter::indented; use super::*; -use crate::{ - bstring_escape::Escaped, - origin::{get_repo_url, GitRepoOrigin}, -}; + impl DetailsReporter { - // Modified pretty format to use deduplicate_matches helper pub fn pretty_format( &self, mut writer: W, args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - let mut matches = self.get_filtered_matches()?; - let num_findings = matches.len(); - - if !args.no_dedup { - matches = self.deduplicate_matches(matches, args.no_dedup); - } - - for (index, rm) in matches.into_iter().enumerate() { - // When no_dedup is true, we'll handle each origin separately - if args.no_dedup && rm.origin.len() > 1 { - // For each origin, create a separate "finding" - for origin in rm.origin.iter() { - // Create a new ReportMatch with just this single origin - let single_origin_rm = ReportMatch { - origin: OriginSet::new(origin.clone(), Vec::new()), - blob_metadata: rm.blob_metadata.clone(), - m: rm.m.clone(), - comment: rm.comment.clone(), - visible: rm.visible, - match_confidence: rm.match_confidence, - validation_response_body: rm.validation_response_body.clone(), - validation_response_status: rm.validation_response_status, - validation_success: rm.validation_success, - }; - - self.write_finding( - &mut writer, - &single_origin_rm, - index + 1, - num_findings, - args, - )?; - } - } else { - // Normal processing for deduped matches or matches with only one origin - self.write_finding(&mut writer, &rm, index + 1, num_findings, args)?; - } + let records = self.build_finding_records(args)?; + let num_findings = records.len(); + for (index, record) in records.iter().enumerate() { + self.write_finding_record(&mut writer, record, index + 1, num_findings)?; } Ok(()) } - fn write_finding( + fn write_finding_record( &self, writer: &mut W, - rm: &ReportMatch, + record: &FindingReporterRecord, _finding_num: usize, _num_findings: usize, - args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - let lock_icon = if rm.validation_success { "🔓 " } else { "" }; + let is_active = record.finding.validation.status == "Active Credential"; + let lock_icon = if is_active { "🔓 " } else { "" }; let formatted_heading = format!( "{}{} => [{}]", lock_icon, - rm.m.rule_name.to_uppercase(), - rm.m.rule_text_id.to_uppercase() + record.rule.name.to_uppercase(), + record.rule.id.to_uppercase() ); - if rm.validation_success { + if is_active { writeln!(writer, "{}", self.style_finding_active_heading(formatted_heading))?; } else { writeln!(writer, "{}", self.style_finding_heading(formatted_heading))?; } - writeln!(writer, "{}", PrettyFinding(self, rm, args))?; + writeln!(writer, "{}", PrettyFindingRecord(self, record))?; writeln!(writer)?; Ok(()) } - fn write_git_metadata( + fn write_git_metadata_value( &self, f: &mut Formatter<'_>, - e: &GitRepoOrigin, - _args: &cli::commands::scan::ScanArgs, - line_num: usize, + git: &serde_json::Value, ) -> FmtResult { - // Check if this is a remote git scan - // let mut is_remote_git_scan = !args.input_specifier_args.git_url.is_empty(); - // let mut git_url_string = String::new(); - let repo_url = get_repo_url(&e.repo_path) - .unwrap_or_else(|_| e.repo_path.to_string_lossy().to_string().into()); - let mut git_url_string = repo_url.clone(); - if git_url_string.ends_with(".git") { - git_url_string = git_url_string.strip_suffix(".git").unwrap().to_string().into(); + let repo_url = git["repository_url"].as_str().unwrap_or(""); + writeln!(f, " |Git Repo......: {}", self.style_metadata(repo_url))?; + if let Some(commit) = git.get("commit") { + if let Some(url) = commit.get("url").and_then(|v| v.as_str()) { + writeln!(f, " |__Commit......: {}", self.style_metadata(url))?; + } + if let Some(committer) = commit.get("committer") { + let name = committer.get("name").and_then(|v| v.as_str()).unwrap_or(""); + let email = committer.get("email").and_then(|v| v.as_str()).unwrap_or(""); + writeln!(indented(f).with_str(" |__"), "Committer...: {} <{}>", name, email)?; + } + if let Some(date) = commit.get("date").and_then(|v| v.as_str()) { + writeln!(indented(f).with_str(" |__"), "Date........: {}", date)?; + } } - writeln!(f, " |Git Repo......: {}", self.style_metadata(&git_url_string),)?; - if let Some(cs) = &e.first_commit { - let cmd = &cs.commit_metadata; - - let atime = - cmd.committer_timestamp.format(gix::date::time::format::SHORT.clone()).to_string(); - - let commit_id = &cmd.commit_id; - let commit_url = format!("{}/commit/{}", &git_url_string, commit_id); - // Write Commit Information - writeln!(f, " |__Commit......: {}", self.style_metadata(&commit_url))?; - writeln!( - indented(f).with_str(" |__"), - "Committer...: {} <{}>", - cmd.committer_name, - cmd.committer_email - )?; - writeln!(indented(f).with_str(" |__"), "Date........: {}", atime)?; - // writeln!(indented(f).with_str(" |__"), "Summary.....: {}", msg)?; - writeln!(indented(f).with_str(" |__"), "Path........: {}", cs.blob_path)?; - // Construct Git Command - let git_link = - format!("{}/blob/{}/{}#L{}", &git_url_string, commit_id, cs.blob_path, line_num); - let git_command = - format!("git -C {} show {}:{}", e.repo_path.display(), commit_id, cs.blob_path); - writeln!( - indented(f).with_str(" |__"), - "Git Link....: {}", - self.style_metadata(&git_link) - )?; - writeln!( - indented(f).with_str(" |__"), - "Git Command.: {}", - self.style_metadata(&git_command) - )?; + if let Some(file) = git.get("file") { + if let Some(path) = file.get("path").and_then(|v| v.as_str()) { + writeln!(indented(f).with_str(" |__"), "Path........: {}", path)?; + } + if let Some(url) = file.get("url").and_then(|v| v.as_str()) { + writeln!( + indented(f).with_str(" |__"), + "Git Link....: {}", + self.style_metadata(url) + )?; + } + if let Some(cmd) = file.get("git_command").and_then(|v| v.as_str()) { + writeln!( + indented(f).with_str(" |__"), + "Git Command.: {}", + self.style_metadata(cmd) + )?; + } } Ok(()) } } -// pub struct PrettyFinding<'a>(&'a DetailsReporter, &'a Finding); -pub struct PrettyFinding<'a>( - &'a DetailsReporter, - &'a ReportMatch, - &'a cli::commands::scan::ScanArgs, -); -impl<'a> Display for PrettyFinding<'a> { + +pub struct PrettyFindingRecord<'a>(&'a DetailsReporter, &'a FindingReporterRecord); + +impl<'a> Display for PrettyFindingRecord<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - let PrettyFinding(reporter, rm, args) = self; - // Use Box String> to store the closure - let style_fn: Box String> = if rm.validation_success { - Box::new(|s: &str| reporter.style_active_creds(s).to_string()) // Convert StyledObject - // to String + let reporter = self.0; + let record = self.1; + let is_active = record.finding.validation.status == "Active Credential"; + let style_fn: Box String> = if is_active { + Box::new(|s| reporter.style_active_creds(s).to_string()) } else { - Box::new(|s: &str| reporter.style_match(s).to_string()) // Convert StyledObject to - // String + Box::new(|s| reporter.style_match(s).to_string()) }; - let matching_finding = - rm.m.groups - .captures - .get(1) - .or_else(|| rm.m.groups.captures.get(0)) - .map(|capture| capture.value.as_bytes()) - .unwrap_or(&[]); - writeln!(f, " |Finding.......: {}", style_fn(&Escaped(matching_finding).to_string()))?; - writeln!(f, " |Fingerprint...: {}", rm.m.finding_fingerprint)?; - writeln!(f, " |Confidence....: {}", rm.match_confidence.to_string())?; - writeln!(f, " |Entropy.......: {:.2}", rm.m.calculated_entropy)?; - let validation_status = if rm.validation_response_status == StatusCode::CONTINUE.as_u16() - || rm.validation_response_status == StatusCode::PRECONDITION_REQUIRED.as_u16() - { - "Not Attempted".to_string() - } else if rm.validation_success { - "Active Credential".to_string() - } else { - "Inactive Credential".to_string() - }; - writeln!( - f, - " |Validation....: {}", - if rm.validation_success { - reporter.style_finding_active_heading(&validation_status).to_string() - // Convert StyledObject to String - } else { - (&validation_status).to_string() - } - )?; - const MAX_RESPONSE_LENGTH: usize = 512; - if rm.validation_response_status != StatusCode::CONTINUE.as_u16() { - let truncated_body: String = - rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); - let ellipsis = - if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; + let finding = &record.finding; + writeln!(f, " |Finding.......: {}", style_fn(&finding.snippet))?; + writeln!(f, " |Fingerprint...: {}", finding.fingerprint)?; + writeln!(f, " |Confidence....: {}", finding.confidence)?; + writeln!(f, " |Entropy.......: {}", finding.entropy)?; + if is_active { writeln!( f, - " |__Response....: {}{}", - if rm.validation_success { - reporter.style_active_creds(&truncated_body).to_string() // Convert StyledObject - // to String - } else { - reporter.style_metadata(&truncated_body).to_string() // Convert StyledObject to - // String - }, - ellipsis + " |Validation....: {}", + reporter.style_finding_active_heading(&finding.validation.status).to_string() )?; + } else { + writeln!(f, " |Validation....: {}", finding.validation.status)?; } - writeln!( - f, - " |Language......: {}", - rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()) - )?; - - let source_span = &rm.m.location.source_span; - writeln!(f, " |Line Num......: {}", source_span.start.line)?; - - //print all the other areas where this was seen - for p in rm.origin.iter() { - match p { - Origin::File(e) => { - let display_path = if let Some(url) = reporter.jira_issue_url(&e.path, args) { - url - } else if let Some(url) = reporter.slack_message_url(&e.path) { - url - } else if let Some(mapped) = reporter.s3_display_path(&e.path) { - mapped - } else if let Some(mapped) = reporter.docker_display_path(&e.path) { - mapped - } else { - e.path.display().to_string() - }; - writeln!( - f, - " |Path..........: {}", - if rm.validation_success { - reporter.style_active_creds(&display_path).to_string() - } else { - display_path - } - )?; - } - Origin::Extended(e) => { - if let Some(p) = e.path() { - let display_path = p.display().to_string(); - writeln!( - f, - " |Path..........: {}", - if rm.validation_success { - reporter.style_active_creds(&display_path).to_string() - } else { - display_path - } - )?; - } - } - Origin::GitRepo(e) => { - reporter.write_git_metadata(f, e, args, source_span.start.line)?; - } - } + if finding.validation.status != "Not Attempted" { + writeln!(f, " |__Response....: {}", style_fn(&finding.validation.response))?; + } + writeln!(f, " |Language......: {}", finding.language)?; + writeln!(f, " |Line Num......: {}", finding.line)?; + writeln!(f, " |Path..........: {}", style_fn(&finding.path))?; + if let Some(git) = &finding.git_metadata { + reporter.write_git_metadata_value(f, git)?; } Ok(()) } } - -#[test] -fn test_pretty_format_with_nan_entropy_panics() { - use std::{ - io::Cursor, - sync::{Arc, Mutex}, - }; - - use http::StatusCode; - use url::Url; - - use crate::{ - blob::BlobMetadata, - cli::commands::{ - github::{GitCloneMode, GitHistoryMode, GitHubRepoType}, - gitlab::GitLabRepoType, - inputs::{ContentFilteringArgs, InputSpecifierArgs}, - output::{OutputArgs, ReportOutputFormat}, - rules::RuleSpecifierArgs, - scan::{ConfidenceLevel, ScanArgs}, - }, - location::{Location, OffsetSpan, SourcePoint, SourceSpan}, - matcher::{Match, SerializableCaptures}, - origin::{Origin, OriginSet}, - reporter::{DetailsReporter, Styles}, - }; - - // Construct a fake match with NaN entropy - let m = Match { - rule_name: "dummy_rule".into(), - rule_text_id: "dummy.id".into(), - finding_fingerprint: 123456789, - rule_finding_fingerprint: "abc".into(), - location: Location { - offset_span: OffsetSpan { start: 0, end: 1 }, - source_span: SourceSpan { - start: SourcePoint { line: 1, column: 0 }, - end: SourcePoint { line: 1, column: 10 }, - }, - }, - blob_id: crate::blob::BlobId::default(), - groups: SerializableCaptures { captures: vec![] }, - rule_confidence: crate::rules::rule::Confidence::Medium, - validation_success: true, - validation_response_status: StatusCode::OK.as_u16(), - validation_response_body: "OK".into(), - calculated_entropy: f32::NAN, // Here's the trigger - visible: true, - }; - - let _rm = crate::reporter::ReportMatch { - origin: OriginSet::new(Origin::from_file("dummy.txt".into()), vec![]), - blob_metadata: BlobMetadata { - id: m.blob_id, - num_bytes: 1, - mime_essence: None, - charset: None, - language: Some("Rust".into()), - }, - m, - comment: None, - visible: true, - match_confidence: crate::rules::rule::Confidence::Medium, - validation_response_body: "OK".into(), - validation_response_status: StatusCode::OK.as_u16(), - validation_success: true, - }; - - let store = Arc::new(Mutex::new(crate::findings_store::FindingsStore::new(".".into()))); - let reporter = - DetailsReporter { datastore: store, styles: Styles::new(false), only_valid: false }; - - let mut buf = Cursor::new(Vec::new()); - let args = ScanArgs { - // core execution / performance - num_jobs: 1, - no_dedup: false, - - // rule selection - rules: RuleSpecifierArgs { - rules_path: Vec::new(), - rule: vec!["all".into()], - load_builtins: true, - }, - - // input discovery - input_specifier_args: InputSpecifierArgs { - path_inputs: Vec::new(), - git_url: Vec::new(), - github_user: Vec::new(), - github_organization: Vec::new(), - all_github_organizations: false, - github_api_url: url::Url::parse("https://api.github.com/").unwrap(), - github_repo_type: GitHubRepoType::Source, - // new GitLab defaults - gitlab_user: Vec::new(), - gitlab_group: Vec::new(), - all_gitlab_groups: false, - gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(), - gitlab_repo_type: GitLabRepoType::Owner, - // Jira options - jira_url: None, - jql: None, - max_results: 100, - - // Slack options - slack_query: None, - slack_api_url: Url::parse("https://slack.com/api/").unwrap(), - // s3 - s3_bucket: None, - s3_prefix: None, - role_arn: None, - aws_local_profile: None, - // Docker image scanning - docker_image: Vec::new(), - // git clone / history options - git_clone: GitCloneMode::Bare, - git_history: GitHistoryMode::Full, - scan_nested_repos: true, - commit_metadata: true, - }, - - // content filtering - content_filtering_args: ContentFilteringArgs { - max_file_size_mb: 25.0, - no_extract_archives: false, - extraction_depth: 2, - exclude: Vec::new(), // Exclude patterns - no_binary: true, - }, - - // scanning behaviour - confidence: ConfidenceLevel::Medium, - no_validate: false, - rule_stats: false, - only_valid: false, - min_entropy: None, - redact: false, - git_repo_timeout: 1800, // 30 minutes - - // output - output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, - - // display - snippet_length: 256, - baseline_file: None, - manage_baseline: false, - }; - - // This will panic if the entropy isn't checked for NaN - let _result = reporter.pretty_format(&mut buf, &args); - // assert!(result.is_err() || result.is_ok(), "Should not crash"); // remove this line if panic - // is expected pre-fix -} diff --git a/src/reporter/sarif_format.rs b/src/reporter/sarif_format.rs index 033d37c..ff771dd 100644 --- a/src/reporter/sarif_format.rs +++ b/src/reporter/sarif_format.rs @@ -1,274 +1,56 @@ -use std::collections::HashMap; +use std::collections::{BTreeMap, HashSet}; use rayon::prelude::*; use serde_sarif::sarif; use super::*; -use crate::{bstring_escape::Escaped, defaults::get_builtin_rules, origin::get_repo_url}; -#[derive(Hash, Eq, PartialEq)] -struct LocationKey { - file_path: String, - line: usize, - column_start: usize, - column_end: usize, - text: String, -} +use crate::defaults::get_builtin_rules; + impl DetailsReporter { - fn make_sarif_result( - &self, - finding: &Finding, - no_dedup: bool, - args: &cli::commands::scan::ScanArgs, - ) -> Result { - // Deduplicate exactly as in the JSON reporter - // let matches = self.deduplicate_matches(finding.matches.clone(), no_dedup); - // Deduplicate exactly as in the JSON reporter - but only if no_dedup is false - let matches = if no_dedup { - finding.matches.clone() - } else { - self.deduplicate_matches(finding.matches.clone(), no_dedup) - }; + fn record_to_sarif_result(&self, record: &FindingReporterRecord) -> Result { + let finding = &record.finding; + let artifact_location = + sarif::ArtifactLocationBuilder::default().uri(finding.path.clone()).build()?; + let region = sarif::RegionBuilder::default() + .start_line(finding.line as i64) + .start_column(finding.column_start as i64) + .end_line(finding.line as i64) + .end_column(finding.column_end as i64) + .snippet( + sarif::ArtifactContentBuilder::default().text(finding.snippet.clone()).build()?, + ) + .build()?; - let metadata = &finding.metadata; - - let mut location_map: HashMap> = HashMap::new(); - for rm in &matches { - let source_span = &rm.m.location.source_span; - let snippet = - rm.m.groups - .captures - .get(1) - .or_else(|| rm.m.groups.captures.get(0)) - .map(|capture| capture.value.as_bytes()) - .unwrap_or(&[]); - let key = LocationKey { - file_path: rm - .origin - .first() - .blob_path() - .map(|p| p.to_string_lossy().into_owned()) - .unwrap_or_default(), - line: source_span.start.line, - column_start: source_span.start.column, - column_end: source_span.end.column, - text: Escaped(snippet).to_string(), - }; - location_map.entry(key).or_default().push((&rm.origin, &rm.m)); + let mut props = BTreeMap::new(); + props.insert("validation_status".to_string(), serde_json::json!(finding.validation.status)); + props.insert("entropy".to_string(), serde_json::json!(finding.entropy)); + if let Some(git) = &finding.git_metadata { + props.insert("git_metadata".to_string(), git.clone()); } + let properties = + sarif::PropertyBagBuilder::default().additional_properties(props).build()?; - let mut fpu64: u64 = 0; + let location = sarif::LocationBuilder::default() + .physical_location( + sarif::PhysicalLocationBuilder::default() + .artifact_location(artifact_location) + .region(region) + .build()?, + ) + .properties(properties) + .build()?; - let locations: Vec = location_map - .into_iter() - .filter_map(|(key, matches)| { - let (prov, m) = matches[0]; - let source_span = &m.location.source_span; - let mut artifact_locations = Vec::new(); - let mut git_metadata_list = Vec::new(); - - fpu64 = m.finding_fingerprint; - - for p in prov.iter() { - match p { - Origin::File(e) => { - let uri = if let Some(url) = self.jira_issue_url(&e.path, args) { - url - } else if let Some(url) = self.slack_message_url(&e.path) { - url - } else if let Some(mapped) = self.s3_display_path(&e.path) { - mapped - } else { - e.path.display().to_string() - }; - artifact_locations.push( - sarif::ArtifactLocationBuilder::default().uri(uri).build().ok()?, - ); - } - Origin::Extended(e) => { - if let Some(p) = e.path() { - artifact_locations.push( - sarif::ArtifactLocationBuilder::default() - .uri(p.display().to_string()) - .build() - .ok()?, - ); - } - } - Origin::GitRepo(e) => { - // Extract and store Git metadata - if let Some(git_metadata) = self.extract_git_metadata(e, source_span) { - git_metadata_list.push(git_metadata); - } - - // Build Git artifact location - if let Some(cs) = &e.first_commit { - let repo_url = get_repo_url(&e.repo_path) - .unwrap_or_else(|_| { - e.repo_path.to_string_lossy().to_string().into() - }) - .trim_end_matches(".git") - .to_string(); - let git_url = format!( - "{}/blob/{}/{}#L{}", - repo_url, - cs.commit_metadata.commit_id, - cs.blob_path, - source_span.start.line - ); - artifact_locations.push( - sarif::ArtifactLocationBuilder::default() - .uri(git_url) - .build() - .ok()?, - ); - } - } - } - } - - if artifact_locations.is_empty() { - return None; - } - - let region = sarif::RegionBuilder::default() - .start_line(key.line as i64) - .start_column(key.column_start as i64) - .end_line(key.line as i64) - .end_column(key.column_end as i64) - .snippet(sarif::ArtifactContentBuilder::default().text(key.text).build().ok()?) - .build() - .ok()?; - - let logical_location = sarif::LogicalLocationBuilder::default() - .kind("blob") - .name(m.finding_fingerprint.to_string()) - .build() - .ok()?; - - let validation_status = - if m.validation_response_status == StatusCode::CONTINUE.as_u16() { - "Not Attempted" - } else if m.validation_success { - "Active Credential" - } else { - "Inactive Credential" - }; - - // Build combined properties including Git metadata and fingerprint - let mut props = std::collections::BTreeMap::new(); - props.insert("validation_status".to_string(), serde_json::json!(validation_status)); - - props.insert( - "entropy".to_string(), - serde_json::json!(format!("{:.2}", m.calculated_entropy)), - ); - - // Add the fingerprint property from the match - props.insert("fingerprint".to_string(), serde_json::json!(m.finding_fingerprint)); - - if !git_metadata_list.is_empty() { - props.insert("git_metadata".to_string(), serde_json::json!(git_metadata_list)); - } - - let properties = sarif::PropertyBagBuilder::default() - .additional_properties(props) - .build() - .ok()?; - - // Create locations for each artifact location - let locations = artifact_locations - .into_iter() - .map(|artifact_location| { - sarif::LocationBuilder::default() - .physical_location( - sarif::PhysicalLocationBuilder::default() - .artifact_location(artifact_location) - .region(region.clone()) - .build() - .ok()?, - ) - .logical_locations(vec![logical_location.clone()]) - .properties(properties.clone()) - .build() - .ok() - }) - .collect::>>()?; - Some(locations) - }) - .flatten() - .collect(); - // let message = sarif::MessageBuilder::default() - // .text(format!( - // "Rule {} found {} unique {}.\nFirst blob id matched: {}", - // metadata.rule_name, - // locations.len(), - // if locations.len() == 1 { "match" } else { "matches" }, - // first_match_blob_id - // )) - // .build()?; - // Create detailed message from first location's information - let detailed_msg = if let Some(first_match) = matches.first() { - let mut msg = format!( - "Rule {} found {} unique {}.\n", - metadata.rule_name, - locations.len(), - if locations.len() == 1 { "match" } else { "matches" } - ); - // Add file or Git information based on origin - // Get first origin of first match - we know this exists - let p = first_match.origin.first(); - match p { - Origin::File(e) => { - let uri = if let Some(url) = self.jira_issue_url(&e.path, args) { - url - } else if let Some(url) = self.slack_message_url(&e.path) { - url - } else if let Some(mapped) = self.s3_display_path(&e.path) { - mapped - } else { - e.path.display().to_string() - }; - msg.push_str(&format!("Location: {}\n", uri)); - } - Origin::Extended(e) => { - if let Some(p) = e.path() { - msg.push_str(&format!("Location: {}\n", p.display())); - } - } - Origin::GitRepo(e) => { - if let Some(cs) = &e.first_commit { - let repo_url = get_repo_url(&e.repo_path) - .unwrap_or_else(|_| e.repo_path.to_string_lossy().to_string().into()) - .trim_end_matches(".git") - .to_string(); - // Add commit and author information - let cmd = &cs.commit_metadata; - msg.push_str(&format!("Repository: {}\n", repo_url)); - msg.push_str(&format!("Commit: {}\n", cmd.commit_id)); - msg.push_str(&format!( - "Committer: {} <{}>\n", - String::from_utf8_lossy(&cmd.committer_name), - String::from_utf8_lossy(&cmd.committer_email) - )); - msg.push_str(&format!("File: {}", cs.blob_path)); - } - } - } - msg - } else { - format!("Rule {} found {} unique matches.", metadata.rule_name, locations.len(),) - }; - let message = sarif::MessageBuilder::default().text(detailed_msg).build()?; - let fingerprint_name = "fingerprint".to_string(); - let fingerprint = fpu64.to_string(); + let message = sarif::MessageBuilder::default() + .text(format!("Rule {} matched {}", record.rule.name, finding.path)) + .build()?; let result = sarif::ResultBuilder::default() - .rule_id(&metadata.rule_name) + .rule_id(&record.rule.name) .message(message) .kind(sarif::ResultKind::Review.to_string()) - .locations(locations) + .locations(vec![location]) .level(sarif::ResultLevel::Warning.to_string()) - .partial_fingerprints([(fingerprint_name, fingerprint)]) + .partial_fingerprints([("fingerprint".to_string(), finding.fingerprint.clone())]) .build()?; Ok(result) } @@ -276,54 +58,11 @@ impl DetailsReporter { pub fn sarif_format( &self, mut writer: W, - no_dedup: bool, + _no_dedup: bool, args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - // Gather findings first - let mut findings = self.gather_findings()?; - - // If no_dedup is true, expand findings with multiple origins into separate findings - if no_dedup { - let mut expanded_findings = Vec::new(); - for finding in findings { - // Check matches with multiple origins - let matches_with_multiple_origins: Vec<_> = - finding.matches.iter().filter(|rm| rm.origin.len() > 1).collect(); - - if !matches_with_multiple_origins.is_empty() { - // For each match with multiple origins, create separate findings - for rm in matches_with_multiple_origins { - for origin in rm.origin.iter() { - // Create a single-origin match - let single_origin_rm = ReportMatch { - origin: OriginSet::new(origin.clone(), Vec::new()), - blob_metadata: rm.blob_metadata.clone(), - m: rm.m.clone(), - comment: rm.comment.clone(), - visible: rm.visible, - match_confidence: rm.match_confidence, - validation_response_body: rm.validation_response_body.clone(), - validation_response_status: rm.validation_response_status, - validation_success: rm.validation_success, - }; - - // Create a new finding with just this single-origin match - let new_finding = - Finding::new(finding.metadata.clone(), vec![single_origin_rm]); - expanded_findings.push(new_finding); - } - } - } else { - // If the finding has no matches with multiple origins, keep it as is - expanded_findings.push(finding); - } - } - findings = expanded_findings; - } - - // Filter only rules relevant to the findings - let finding_rule_ids: std::collections::HashSet<_> = - findings.iter().map(|f| f.metadata.rule_name.clone()).collect(); + let records = self.build_finding_records(args)?; + let finding_rule_ids: HashSet<_> = records.iter().map(|r| r.rule.name.clone()).collect(); let rules: Vec = get_builtin_rules(None)? .iter_rules() .par_bridge() @@ -366,10 +105,9 @@ impl DetailsReporter { ) .build()?; - let sarif_results: Vec = findings - .par_iter() - .filter_map(|f| self.make_sarif_result(f, no_dedup, args).ok()) - .collect(); + let sarif_results: Vec = + records.iter().filter_map(|r| self.record_to_sarif_result(r).ok()).collect(); + let run = sarif::RunBuilder::default().tool(tool).results(sarif_results).build()?; let sarif = sarif::SarifBuilder::default() .version(sarif::Version::V2_1_0.to_string()) diff --git a/src/s3.rs b/src/s3.rs index ed18a52..0f1fcef 100644 --- a/src/s3.rs +++ b/src/s3.rs @@ -2,12 +2,12 @@ use anyhow::{Context, Result}; use aws_config::{defaults, meta::region::RegionProviderChain, BehaviorVersion}; use aws_credential_types::Credentials; use aws_sdk_s3::{ + error::ProvideErrorMetadata, // for .code() + operation::list_objects_v2::ListObjectsV2Error, // modeled service error Client, - operation::list_objects_v2::ListObjectsV2Error, // modeled service error - error::ProvideErrorMetadata, // for .code() }; use aws_types::region::Region; -use reqwest; // HTTP client for HEAD fallback +use reqwest; // HTTP client for HEAD fallback pub async fn visit_bucket_objects( bucket: &str, @@ -43,9 +43,7 @@ where .configure(&config) .build() .await; - let conf = aws_sdk_s3::config::Builder::from(&config) - .credentials_provider(assume) - .build(); + let conf = aws_sdk_s3::config::Builder::from(&config).credentials_provider(assume).build(); Client::from_conf(conf) } else { Client::new(&config) @@ -66,7 +64,7 @@ where // On error, extract the modeled service error Err(err) => { - let svc_err: ListObjectsV2Error = err.into_service_error(); // from SdkError + let svc_err: ListObjectsV2Error = err.into_service_error(); // from SdkError // If the bucket must be addressed at another region... if svc_err.code() == Some("PermanentRedirect") { diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index 735e381..59bc536 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -21,14 +21,16 @@ use crate::{ findings_store, git_binary::{CloneMode, Git}, git_url::GitUrl, - github, gitlab, jira, + github, gitlab, + guesser::Guesser, + jira, matcher::{Match, Matcher, MatcherStats}, origin::{Origin, OriginSet}, rules_database::RulesDatabase, s3, scanner::processing::BlobProcessor, scanner_pool::ScannerPool, - slack, guesser::Guesser, PathBuf, + slack, PathBuf, }; pub type DatastoreMessage = (OriginSet, BlobMetadata, Vec<(Option, Match)>); @@ -291,7 +293,6 @@ pub async fn fetch_slack_messages( Ok(vec![output_dir]) } - pub async fn fetch_s3_objects( args: &scan::ScanArgs, datastore: &Arc>, @@ -330,10 +331,12 @@ pub async fn fetch_s3_objects( ); let blob = crate::blob::Blob::from_bytes(bytes); - if let Some((origin, blob_md, scored_matches)) = processor.run(origin, blob, args.no_dedup)? { + if let Some((origin, blob_md, scored_matches)) = + processor.run(origin, blob, args.no_dedup)? + { // Wrap origin & metadata once: let origin_arc = Arc::new(origin); - let blob_arc = Arc::new(blob_md); + let blob_arc = Arc::new(blob_md); // Now build a batch of exactly one FindingsStoreMessage per Match let mut batch = Vec::with_capacity(scored_matches.len()); @@ -350,4 +353,4 @@ pub async fn fetch_s3_objects( .await?; Ok(()) -} \ No newline at end of file +} diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs index 63f7bee..33dc644 100644 --- a/src/scanner/runner.rs +++ b/src/scanner/runner.rs @@ -74,7 +74,6 @@ pub async fn run_async_scan( let slack_dirs = fetch_slack_messages(args, global_args, &datastore).await?; input_roots.extend(slack_dirs); - // Save Docker images if specified if !args.input_specifier_args.docker_image.is_empty() { let clone_root = { @@ -129,7 +128,6 @@ pub async fn run_async_scan( )?; } - if !args.no_dedup { // Final deduplication step before validation (or before reporting) let reporter = crate::reporter::DetailsReporter { diff --git a/tests/int_s3.rs b/tests/int_s3.rs index c44afe8..c8d6b9c 100644 --- a/tests/int_s3.rs +++ b/tests/int_s3.rs @@ -4,25 +4,26 @@ use kingfisher::s3::visit_bucket_objects; #[tokio::test] async fn test_visit_public_bucket() -> Result<()> { let mut objects = Vec::new(); - visit_bucket_objects("awsglue-datasets", Some("examples/us-legislators/all/"), None, None, |key, data| { - objects.push((key, data)); - Ok(()) - }) + visit_bucket_objects( + "awsglue-datasets", + Some("examples/us-legislators/all/"), + None, + None, + |key, data| { + objects.push((key, data)); + Ok(()) + }, + ) .await?; assert!( objects.iter().any(|(k, _)| k.ends_with("events.json")), "events.json object not found" ); - let creds = objects - .iter() - .find(|(k, _)| k.ends_with("events.json")) - .expect("events.json object"); + let creds = + objects.iter().find(|(k, _)| k.ends_with("events.json")).expect("events.json object"); let body = std::str::from_utf8(&creds.1)?; - assert!( - body.contains("Q4450263"), - "expected events.json file" - ); + assert!(body.contains("Q4450263"), "expected events.json file"); Ok(()) -} \ No newline at end of file +} From 73fd3d0dec2c93fb5b7e84d8c49afe42958dc6c8 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 4 Aug 2025 08:58:20 -0700 Subject: [PATCH 10/13] refactored output reporting and formatting logic --- f1.patch | 719 ------------------------------------------------------- 1 file changed, 719 deletions(-) delete mode 100644 f1.patch diff --git a/f1.patch b/f1.patch deleted file mode 100644 index a132a31..0000000 --- a/f1.patch +++ /dev/null @@ -1,719 +0,0 @@ -diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs -index 9fcb1ecdfe8decc60278848c4a7be43cc9ebee70..b600f9f65838e52ce5dc3d7bb3bb1a5d5ff2bcaf 100644 ---- a/src/reporter/json_format.rs -+++ b/src/reporter/json_format.rs -@@ -1,436 +1,80 @@ --use http::StatusCode; --use serde_json::json; -- - use super::*; --use crate::bstring_escape::Escaped; - - impl DetailsReporter { -- pub fn deduplicate_matches( -- &self, -- matches: Vec, -- no_dedup: bool, -- ) -> Vec { -- if no_dedup { -- return matches; -- } -- -- use std::collections::HashMap; -- let mut by_fp: HashMap = HashMap::new(); -- -- for rm in matches { -- let fp = rm.m.finding_fingerprint; -- if let Some(existing) = by_fp.get_mut(&fp) { -- // merge origin sets (keep first origin, append the rest) -- for o in rm.origin.iter() { -- if !existing.origin.iter().any(|e| e == o) { -- existing.origin = OriginSet::new( -- existing.origin.first().clone(), -- existing -- .origin -- .iter() -- .skip(1) -- .cloned() -- .chain(std::iter::once(o.clone())) -- .collect(), -- ); -- } -- } -- continue; -- } -- by_fp.insert(fp, rm); -- } -- by_fp.into_values().collect() -- } -- -- pub fn gather_json_findings( -- &self, -- args: &cli::commands::scan::ScanArgs, -- ) -> Result> { -- let mut matches = self.get_filtered_matches()?; -- if !args.no_dedup { -- matches = self.deduplicate_matches(matches, args.no_dedup); -- } -- -- let mut json_findings = Vec::new(); -- for rm in matches { -- let source_span = &rm.m.location.source_span; -- let line_num = source_span.start.line; -- -- let snippet = Escaped( -- rm.m.groups -- .captures -- .get(1) -- .or_else(|| rm.m.groups.captures.get(0)) -- .map(|capture| capture.value.as_bytes()) -- .unwrap_or_default(), -- ) -- .to_string(); -- -- let validation_status = if rm.validation_success { -- "Active Credential" -- } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { -- "Not Attempted" -- } else { -- "Inactive Credential" -- }; -- -- const MAX_RESPONSE_LENGTH: usize = 512; -- let truncated_body: String = -- rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); -- let ellipsis = -- if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; -- let response_body = format!("{}{}", truncated_body, ellipsis); -- -- // Call extract_git_metadata on each GitRepo origin and take the first non-null result. -- let git_metadata_val = rm -- .origin -- .iter() -- .filter_map(|origin| { -- if let Origin::GitRepo(e) = origin { -- self.extract_git_metadata(e, source_span) -- } else { -- None -- } -- }) -- .next() -- .unwrap_or(serde_json::Value::Null); -- -- // Collect a file path from an Origin::File, if available. -- let file_path = rm -- .origin -- .iter() -- .find_map(|origin| match origin { -- Origin::File(e) => { -- if let Some(url) = self.jira_issue_url(&e.path, args) { -- Some(url) -- } else if let Some(url) = self.slack_message_url(&e.path) { -- Some(url) -- } else if let Some(mapped) = self.s3_display_path(&e.path) { -- Some(mapped) -- } else if let Some(mapped) = self.docker_display_path(&e.path) { -- Some(mapped) -- } else { -- Some(e.path.display().to_string()) -- } -- } -- Origin::Extended(e) => e.path().map(|p| p.display().to_string()), -- _ => None, -- }) -- .unwrap_or_default(); -- -- let match_json = json!({ -- "rule": { -- "name": rm.m.rule_name, -- "id": rm.m.rule_text_id, -- }, -- "finding": { -- "snippet": snippet, -- "fingerprint": rm.m.finding_fingerprint.to_string(), -- "confidence": rm.match_confidence.to_string(), -- "entropy": format!("{:.2}", rm.m.calculated_entropy), -- "validation": { -- "status": validation_status, -- "response": response_body, -- }, -- "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()), -- "line": line_num, -- "column_start": source_span.start.column, -- "column_end": source_span.end.column, -- "path": file_path, -- "git_metadata": git_metadata_val -- } -- }); -- -- let finding_json = json!({ -- "id": rm.m.rule_text_id, -- "matches": [ match_json ] -- }); -- json_findings.push(finding_json); -- } -- Ok(json_findings) -- } - pub fn json_format( - &self, - mut writer: W, - args: &cli::commands::scan::ScanArgs, - ) -> Result<()> { -- let mut findings = Vec::new(); -- -- // Get filtered matches -- let mut matches = self.get_filtered_matches()?; -- -- // Apply deduplication only if requested -- if !args.no_dedup { -- matches = self.deduplicate_matches(matches, args.no_dedup); -- } -- -- // For each match, handle it based on the no_dedup flag -- for rm in matches { -- if args.no_dedup && rm.origin.len() > 1 { -- // For no_dedup and multiple origins, create separate findings for each origin -- for origin in rm.origin.iter() { -- // Create a single-origin version of this match -- let single_origin_rm = ReportMatch { -- origin: OriginSet::new(origin.clone(), Vec::new()), -- blob_metadata: rm.blob_metadata.clone(), -- m: rm.m.clone(), -- comment: rm.comment.clone(), -- visible: rm.visible, -- match_confidence: rm.match_confidence, -- validation_response_body: rm.validation_response_body.clone(), -- validation_response_status: rm.validation_response_status, -- validation_success: rm.validation_success, -- }; -- -- // Process this single-origin match into a JSON finding -- let json_finding = self.process_match_to_json(&single_origin_rm, args)?; -- findings.push(json_finding); -- } -- } else { -- // Process normally for deduped matches or matches with only one origin -- let json_finding = self.process_match_to_json(&rm, args)?; -- findings.push(json_finding); -- } -- } -- -- // Write the JSON output -- if !findings.is_empty() { -- serde_json::to_writer_pretty(&mut writer, &findings)?; -+ let records = self.build_finding_records(args)?; -+ if !records.is_empty() { -+ serde_json::to_writer_pretty(&mut writer, &records)?; - writeln!(writer)?; - } - Ok(()) - } - -- // Add a helper method to convert a ReportMatch to a JSON finding -- pub fn process_match_to_json( -- &self, -- rm: &ReportMatch, -- args: &cli::commands::scan::ScanArgs, -- ) -> Result { -- // Extract the relevant data from the match as you already do in your current implementation -- let source_span = &rm.m.location.source_span; -- let line_num = source_span.start.line; -- -- let snippet = Escaped( -- rm.m.groups -- .captures -- .get(1) -- .or_else(|| rm.m.groups.captures.get(0)) -- .map(|capture| capture.value.as_bytes()) -- .unwrap_or_default(), -- ) -- .to_string(); -- -- let validation_status = if rm.validation_success { -- "Active Credential" -- } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { -- "Not Attempted" -- } else { -- "Inactive Credential" -- }; -- -- const MAX_RESPONSE_LENGTH: usize = 512; -- let truncated_body: String = -- rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); -- let ellipsis = -- if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; -- let response_body = format!("{}{}", truncated_body, ellipsis); -- -- // Call extract_git_metadata on each GitRepo origin and take the first non-null result. -- let git_metadata_val = rm -- .origin -- .iter() -- .filter_map(|origin| { -- if let Origin::GitRepo(e) = origin { -- self.extract_git_metadata(e, source_span) -- } else { -- None -- } -- }) -- .next() -- .unwrap_or(serde_json::Value::Null); -- -- // Collect a file path from an Origin::File, if available. -- let file_path = rm -- .origin -- .iter() -- .find_map(|origin| { -- if let Origin::File(e) = origin { -- if let Some(url) = self.jira_issue_url(&e.path, args) { -- Some(url) -- } else if let Some(url) = self.slack_message_url(&e.path) { -- Some(url) -- } else if let Some(mapped) = self.s3_display_path(&e.path) { -- Some(mapped) -- } else if let Some(mapped) = self.docker_display_path(&e.path) { -- Some(mapped) -- } else { -- Some(e.path.display().to_string()) -- } -- } else if let Origin::Extended(e) = origin { -- e.path().map(|p| p.display().to_string()) -- } else { -- None -- } -- }) -- .unwrap_or_default(); -- -- let match_json = json!({ -- "rule": { -- "name": rm.m.rule_name, -- "id": rm.m.rule_text_id, -- }, -- "finding": { -- "snippet": snippet, -- "fingerprint": rm.m.finding_fingerprint.to_string(), -- "confidence": rm.match_confidence.to_string(), -- "entropy": format!("{:.2}", rm.m.calculated_entropy), -- "validation": { -- "status": validation_status, -- "response": response_body, -- }, -- "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()), -- "line": line_num, -- "column_start": source_span.start.column, -- "column_end": source_span.end.column, -- "path": file_path, -- "git_metadata": git_metadata_val -- } -- }); -- -- let finding_json = json!({ -- "id": rm.m.rule_text_id, -- "matches": [ match_json ] -- }); -- -- Ok(finding_json) -- } -- // // Modified JSON format to pass args to gather_json_findings -- // pub fn json_format( -- // &self, -- // mut writer: W, -- // args: &cli::commands::scan::ScanArgs, -- // ) -> Result<()> { -- // let findings = self.gather_json_findings(args)?; -- // if !findings.is_empty() { -- // serde_json::to_writer_pretty(&mut writer, &findings)?; -- // writeln!(writer)?; -- // } -- // Ok(()) -- // } -- - pub fn jsonl_format( - &self, - mut writer: W, - args: &cli::commands::scan::ScanArgs, - ) -> Result<()> { -- // Get filtered matches -- let mut matches = self.get_filtered_matches()?; -- -- // Apply deduplication only if requested -- if !args.no_dedup { -- matches = self.deduplicate_matches(matches, args.no_dedup); -- } -- -- // For each match, handle it based on the no_dedup flag -- for rm in matches { -- if args.no_dedup && rm.origin.len() > 1 { -- // For no_dedup and multiple origins, create separate findings for each origin -- for origin in rm.origin.iter() { -- // Create a single-origin version of this match -- let single_origin_rm = ReportMatch { -- origin: OriginSet::new(origin.clone(), Vec::new()), -- blob_metadata: rm.blob_metadata.clone(), -- m: rm.m.clone(), -- comment: rm.comment.clone(), -- visible: rm.visible, -- match_confidence: rm.match_confidence, -- validation_response_body: rm.validation_response_body.clone(), -- validation_response_status: rm.validation_response_status, -- validation_success: rm.validation_success, -- }; -- -- // Process this single-origin match into a JSON finding and write it -- let json_finding = self.process_match_to_json(&single_origin_rm, args)?; -- serde_json::to_writer(&mut writer, &json_finding)?; -- writeln!(writer)?; -- } -- } else { -- // Process normally for deduped matches or matches with only one origin -- let json_finding = self.process_match_to_json(&rm, args)?; -- serde_json::to_writer(&mut writer, &json_finding)?; -- writeln!(writer)?; -- } -+ let records = self.build_finding_records(args)?; -+ for record in records { -+ serde_json::to_writer(&mut writer, &record)?; -+ writeln!(writer)?; - } - Ok(()) - } -- // // Modified JSONL format to pass args to gather_json_findings -- // pub fn jsonl_format( -- // &self, -- // mut writer: W, -- // args: &cli::commands::scan::ScanArgs, -- // ) -> Result<()> { -- // let findings = self.gather_json_findings(args)?; -- // for finding in findings { -- // serde_json::to_writer(&mut writer, &finding)?; -- // writeln!(writer)?; -- // } -- // Ok(()) -- // } - } - - #[cfg(test)] - mod tests { -- use std::{ -- io::Cursor, -- path::PathBuf, -- sync::{Arc, Mutex}, -- }; -- -- use anyhow::Result; -- use serde_json::Value; -- use url::Url; -- - use super::*; - use crate::{ - blob::BlobId, -- cli::commands::{ -- github::{GitCloneMode, GitHistoryMode, GitHubRepoType}, -- inputs::{ContentFilteringArgs, InputSpecifierArgs}, -- output::OutputArgs, -- rules::RuleSpecifierArgs, -- scan::ConfidenceLevel, -+ cli::commands::github::GitHubRepoType, -+ cli::commands::output::{OutputArgs, ReportOutputFormat}, -+ cli::commands::scan::{ -+ ConfidenceLevel, ContentFilteringArgs, GitCloneMode, GitHistoryMode, -+ InputSpecifierArgs, RuleSpecifierArgs, - }, - findings_store::FindingsStore, - location::{Location, OffsetSpan, SourcePoint, SourceSpan}, -- matcher::{Match, SerializableCapture, SerializableCaptures}, -- origin::{Origin, OriginSet}, -- reporter::{ReportMatch, Styles}, -- rules::rule::Confidence, -- util::intern, -+ matcher::serializable::{SerializableCapture, SerializableCaptures}, -+ matcher::Match, -+ origin::Origin, -+ reporter::styles::Styles, -+ scanner::test_utils::intern, - }; -+ use std::{ -+ io::Cursor, -+ path::PathBuf, -+ sync::{Arc, Mutex}, -+ }; -+ use url::Url; - - fn create_default_args() -> cli::commands::scan::ScanArgs { - use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope - - cli::commands::scan::ScanArgs { - num_jobs: 1, - no_dedup: false, - rules: RuleSpecifierArgs { - rules_path: Vec::new(), - rule: vec!["all".into()], - load_builtins: true, - }, - input_specifier_args: InputSpecifierArgs { - // local path / git URL inputs - path_inputs: Vec::new(), - git_url: Vec::new(), - - // GitHub - github_user: Vec::new(), - github_organization: Vec::new(), - all_github_organizations: false, - github_api_url: Url::parse("https://api.github.com/").unwrap(), - github_repo_type: GitHubRepoType::Source, - - // GitLab -diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs -index 9fcb1ecdfe8decc60278848c4a7be43cc9ebee70..b600f9f65838e52ce5dc3d7bb3bb1a5d5ff2bcaf 100644 ---- a/src/reporter/json_format.rs -+++ b/src/reporter/json_format.rs -@@ -458,240 +102,168 @@ mod tests { - git_history: GitHistoryMode::Full, - scan_nested_repos: true, - commit_metadata: true, - }, - content_filtering_args: ContentFilteringArgs { - max_file_size_mb: 25.0, - no_extract_archives: false, - extraction_depth: 2, - exclude: Vec::new(), // Exclude patterns - no_binary: true, - }, - confidence: ConfidenceLevel::Medium, - no_validate: false, - rule_stats: false, - only_valid: false, - min_entropy: None, - redact: false, - git_repo_timeout: 1800, // 30 minutes - output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, - snippet_length: 256, - baseline_file: None, - manage_baseline: false, - } - } - -- // Helper function to create a mock Match - fn create_mock_match( - rule_name: &str, - rule_text_id: &str, - rule_finding_fingerprint: &str, - validation_success: bool, - ) -> Match { - Match { - location: Location { - offset_span: OffsetSpan { start: 10, end: 20 }, - source_span: SourceSpan { - start: SourcePoint { line: 5, column: 10 }, - end: SourcePoint { line: 5, column: 20 }, - }, - }, - groups: SerializableCaptures { - captures: vec![SerializableCapture { - name: Some("token".to_string()), - match_number: 1, - start: 10, - end: 20, - value: "mock_token".into(), - }], - }, - blob_id: BlobId::new(b"mock_blob"), - finding_fingerprint: 0123, - rule_finding_fingerprint: intern(rule_finding_fingerprint), - rule_text_id: intern(rule_text_id), -- rule_name: intern(rule_name), //.to_string(), -+ rule_name: intern(rule_name), - rule_confidence: Confidence::Medium, - validation_response_body: "validation response".to_string(), - validation_response_status: 200, - validation_success, - calculated_entropy: 4.5, - visible: true, - } - } - -- // Helper function to create a mock DetailsReporter - fn setup_mock_reporter(matches: Vec) -> DetailsReporter { - let mut datastore = FindingsStore::new(PathBuf::from("/tmp")); -- // Create mock origin and blob metadata for the first test match - if !matches.is_empty() { - let blob_metadata = BlobMetadata { - id: BlobId::new(b"mock_blob"), - num_bytes: 1024, - mime_essence: Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), - language: Some("Rust".to_string()), - }; - let dedup = true; -- // Add matches to datastore - for m in matches.clone() { - datastore.record( - vec![( - Arc::new(OriginSet::new( -- // OriginSet -- Arc<…> - Origin::from_file(PathBuf::from("/mock/path/file.rs")), - vec![], - )), -- Arc::new(blob_metadata.clone()), // BlobMetadata -- Arc<…> -+ Arc::new(blob_metadata.clone()), - m.m.clone(), - )], - dedup, - ); - } - } - DetailsReporter { - datastore: Arc::new(Mutex::new(datastore)), - styles: Styles::new(false), - only_valid: false, - } - } -+ - #[test] - fn test_json_format() -> Result<()> { -- // Create a mock match with successful validation - let mock_match = - create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); - let matches = vec![ReportMatch { - origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]), - blob_metadata: BlobMetadata { - id: BlobId::new(b"mock_blob"), - num_bytes: 1024, - mime_essence: Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), - language: Some("Rust".to_string()), - }, - m: mock_match, - comment: None, - match_confidence: Confidence::Medium, - visible: true, - validation_response_body: "validation response".to_string(), - validation_response_status: 200, - validation_success: true, - }]; - let reporter = setup_mock_reporter(matches); - let mut output = Cursor::new(Vec::new()); -- // Call the json_format method - reporter.json_format(&mut output, &create_default_args())?; -- // Parse and validate JSON output -- let json_output: Vec = serde_json::from_slice(&output.into_inner())?; -+ let json_output: Vec = serde_json::from_slice(&output.into_inner())?; - assert!(!json_output.is_empty(), "JSON output should not be empty"); -- let first_finding = &json_output[0]; -- assert!(first_finding.get("id").is_some(), "Finding should have an 'id'"); -- assert!(first_finding.get("matches").is_some(), "Finding should have 'matches'"); -- // Validate the structure of the first match -- let matches = first_finding.get("matches").unwrap().as_array().unwrap(); -- let first_match = &matches[0]; -- assert_eq!(first_match.get("rule").unwrap().get("name").unwrap(), "MockRule"); -- assert_eq!(first_match.get("finding").unwrap().get("language").unwrap(), "Rust"); -+ let first = &json_output[0]; -+ assert_eq!(first["rule"]["name"], "MockRule"); -+ assert_eq!(first["finding"]["language"], "Rust"); - Ok(()) - } - -- // #[test] -- // fn test_jsonl_format() -> Result<()> { -- // // Create a mock match with successful validation -- // let mock_match = -- // create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); -- // let matches = vec![ReportMatch { -- // origin: OriginSet::new( -- // Origin::from_file(PathBuf::from("/mock/path/file.rs")), -- // vec![], -- // ), -- // blob_metadata: BlobMetadata { -- // id: BlobId::new(b"mock_blob"), -- // num_bytes: 1024, -- // mime_essence: Some("text/plain".to_string()), -- // charset: Some("UTF-8".to_string()), -- // language: Some("Rust".to_string()), -- // }, -- // m: mock_match, -- // comment: None, -- // match_confidence: Confidence::Medium, -- // visible: true, -- // validation_response_body: "validation response".to_string(), -- // validation_response_status: 200, -- // validation_success: true, -- // }]; -- // let reporter = setup_mock_reporter(matches); -- // let mut output = Cursor::new(Vec::new()); -- // // Call the jsonl_format method -- // reporter.jsonl_format(&mut output, &create_default_args())?; -- // // Split output into lines and validate -- // let jsonl_output = String::from_utf8(output.into_inner())?; -- // let lines: Vec<&str> = jsonl_output.lines().collect(); -- // assert!(!lines.is_empty(), "JSONL output should not be empty"); -- // for line in &lines { -- // let json_value: serde_json::Value = serde_json::from_str(line)?; -- // assert!( -- // json_value.get("rule_name").is_some(), -- // "Each line should have a 'rule_name'" -- // ); -- // assert!( -- // json_value.get("matches").is_some(), -- // "Each line should have 'matches'" -- // ); -- // } -- // Ok(()) -- // } -- - #[test] - fn test_validation_status_in_json() -> Result<()> { -- // Test validation status in JSON output - let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")]; - for (validation_success, expected_status) in test_cases { - let mock_match = create_mock_match( - "MockRule", - "mock_rule_1", - "mock_finding_fingerprint", - validation_success, - ); - let matches = vec![ReportMatch { - origin: OriginSet::new( - Origin::from_file(PathBuf::from("/mock/path/file.rs")), - vec![], - ), - blob_metadata: BlobMetadata { - id: BlobId::new(b"mock_blob"), - num_bytes: 1024, - mime_essence: Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), - language: Some("Rust".to_string()), - }, - m: mock_match, - comment: None, - match_confidence: Confidence::Medium, - visible: true, - validation_response_body: "validation response".to_string(), - validation_response_status: 200, - validation_success, - }]; - let reporter = setup_mock_reporter(matches); - let mut output = Cursor::new(Vec::new()); -- // Call the json_format method - reporter.json_format(&mut output, &create_default_args())?; -- // Parse and validate JSON output -- let json_output: Vec = serde_json::from_slice(&output.into_inner())?; -+ let json_output: Vec = serde_json::from_slice(&output.into_inner())?; - assert!(!json_output.is_empty(), "JSON output should not be empty"); -- let first_finding = &json_output[0]; -- let matches = first_finding.get("matches").unwrap().as_array().unwrap(); -- let first_match = &matches[0]; -- let validation_status = first_match -- .get("finding") -- .unwrap() -- .get("validation") -- .unwrap() -- .get("status") -- .unwrap() -- .as_str() -- .unwrap(); -+ let first = &json_output[0]; -+ let validation_status = first["finding"]["validation"]["status"].as_str().unwrap(); - assert_eq!(validation_status, expected_status); - } - Ok(()) - } - } From 3ac5e04ea06c15b7491a222189c0f1fbba6d0a9f Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 4 Aug 2025 09:09:51 -0700 Subject: [PATCH 11/13] Update src/reporter.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Mick Grove --- src/reporter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/reporter.rs b/src/reporter.rs index 5159651..f639c6f 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -363,8 +363,8 @@ impl DetailsReporter { FindingReporterRecord { rule: RuleMetadata { - name: rm.m.rule_name.to_string().clone(), - id: rm.m.rule_text_id.to_string().clone(), + name: rm.m.rule_name.to_string(), + id: rm.m.rule_text_id.to_string(), }, finding: FindingRecordData { snippet, From f14c2b6e61d3a4924fb2d4d98844354c544bb5e8 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 4 Aug 2025 09:21:49 -0700 Subject: [PATCH 12/13] fixed issues found by pr review --- src/reporter.rs | 57 ++++--------------------------------- src/reporter/json_format.rs | 1 - 2 files changed, 5 insertions(+), 53 deletions(-) diff --git a/src/reporter.rs b/src/reporter.rs index f639c6f..4669484 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -23,7 +23,7 @@ mod json_format; mod pretty_format; mod sarif_format; pub mod styles; -use std::{hash::Hash, io::IsTerminal}; +use std::io::IsTerminal; use styles::{StyledObject, Styles}; @@ -167,19 +167,6 @@ impl DetailsReporter { None } - fn gather_findings(&self) -> Result> { - let metadata_list = self.get_finding_data()?; - let all_matches = self.get_filtered_matches()?; - let mut findings = Vec::new(); - for md in metadata_list { - // Filter matches that belong to this metadata if needed - let matches_for_md = - all_matches.iter().filter(|m| m.m.rule_name == md.rule_name).cloned().collect(); - findings.push(Finding::new(md.clone(), matches_for_md)); - } - Ok(findings) - } - fn process_matches(&self, only_valid: bool, filter_visible: bool) -> Result> { let datastore = self.datastore.lock().unwrap(); Ok(datastore @@ -394,26 +381,6 @@ impl DetailsReporter { Ok(matches.iter().map(|rm| self.build_finding_record(rm, args)).collect()) } - fn get_finding_data(&self) -> Result> { - let datastore = self.datastore.lock().unwrap(); - Ok(datastore - .get_finding_data_iter() - .filter(|metadata| { - if self.only_valid { - datastore.get_matches().iter().any(|msg| { - let (_, _, match_item) = &**msg; - match_item.rule_name == metadata.rule_name - && match_item.validation_success - && match_item.validation_response_status - != StatusCode::CONTINUE.as_u16() - }) - } else { - true - } - }) - .collect()) - } - fn style_finding_heading(&self, val: D) -> StyledObject { self.styles.style_finding_heading.apply_to(val) } @@ -475,13 +442,7 @@ impl Reportable for DetailsReporter { } } } -/// A group of matches that all have the same rule and capture group content -#[derive(Serialize, JsonSchema)] -pub(crate) struct Finding { - #[serde(flatten)] - metadata: finding_data::FindingMetadata, - matches: Vec, -} + /// A match produced by one of kingfisher's rules. /// This corresponds to a single location. #[derive(Serialize, JsonSchema, Clone)] @@ -494,18 +455,14 @@ pub struct ReportMatch { #[serde(flatten)] pub m: Match, - /// An optional score assigned to the match - // #[validate(range(min = 0.0, max = 1.0))] - // score: Option, - /// An optional comment assigned to the match pub comment: Option, + /// The confidence level of the match pub match_confidence: Confidence, + /// Whether the match is visible in the output pub visible: bool, - /// An optional status assigned to the match - // status: Option, /// Validation Body pub validation_response_body: String, @@ -566,8 +523,4 @@ impl From for ReportMatch { } } } -impl Finding { - fn new(metadata: finding_data::FindingMetadata, matches: Vec) -> Self { - Self { metadata, matches } - } -} + diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index b64a777..154bb58 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -1,5 +1,4 @@ use super::*; -use serde_json::Value; impl DetailsReporter { pub fn json_format( From ce2126608afd91663dc402691188ab37994e5f7c Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 4 Aug 2025 12:32:19 -0700 Subject: [PATCH 13/13] added progress bar to s3 downloads, and attempting to fix linux-arm64 test failure due to code 143 --- .github/workflows/ci.yml | 2 ++ CHANGELOG.md | 1 + src/scanner/repos.rs | 23 ++++++++++++++++++++++- src/scanner/runner.rs | 1 + 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 389b9c3..27e1f98 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,6 +24,8 @@ jobs: run: make ubuntu-arm64 - name: Run tests run: make tests + env: + CARGO_BUILD_JOBS: 1 macos-arm64: name: macOS arm64 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6483f58..59f655a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file. ## [1.32.0] - Added support for scanning AWS S3 buckets via `--s3-bucket` and optional `--s3-prefix` - Added `--role-arn` and `--aws-local-profile` flags for S3 authentication alongside `KF_AWS_KEY`/`KF_AWS_SECRET` +- Added progress bar for scanning s3 buckets - Refactored output reporting and formatting logic ## [1.31.0] diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index 59bc536..ffa29ea 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -300,6 +300,7 @@ pub async fn fetch_s3_objects( matcher_stats: &Mutex, enable_profiling: bool, shared_profiler: Arc, + progress_enabled: bool, ) -> Result<()> { let Some(bucket) = args.input_specifier_args.s3_bucket.as_deref() else { return Ok(()); @@ -320,9 +321,25 @@ pub async fn fetch_s3_objects( )?; let guesser = Guesser::new().expect("should be able to create filetype guesser"); let mut processor = BlobProcessor { matcher, guesser }; + + let progress = if progress_enabled { + let style = + ProgressStyle::with_template("{spinner} {msg} ({pos} objects) [{elapsed_precise}]") + .expect("progress bar style template should compile"); + let pb = ProgressBar::new_spinner().with_style(style).with_message("Fetching S3 objects"); + pb.enable_steady_tick(Duration::from_millis(500)); + pb + } else { + ProgressBar::hidden() + }; + + let bucket_name = bucket.to_string(); + let pb = progress.clone(); + + let bucket_name = bucket.to_string(); - s3::visit_bucket_objects(bucket, prefix, role_arn, profile, |key, bytes| { + s3::visit_bucket_objects(bucket, prefix, role_arn, profile, move |key, bytes| { let origin = OriginSet::new( Origin::from_extended(serde_json::json!({ "path": format!("s3://{}/{}", bucket_name, key) @@ -348,9 +365,13 @@ pub async fn fetch_s3_objects( let added = datastore.lock().unwrap().record(batch, !args.no_dedup); debug!("Added {} new S3 blobs", added); } + pb.inc(1); Ok(()) }) .await?; + let total = progress.position(); + progress.finish_with_message(format!("Fetched {} S3 objects", total)); + Ok(()) } diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs index 33dc644..10b6e51 100644 --- a/src/scanner/runner.rs +++ b/src/scanner/runner.rs @@ -107,6 +107,7 @@ pub async fn run_async_scan( &matcher_stats, enable_profiling, Arc::clone(&shared_profiler), + progress_enabled, ) .await?;