- Added support for scanning AWS S3 buckets via --s3-bucket and optional --s3-prefix

- Added --role-arn and --aws-local-profile flags for S3 authentication alongside KF_AWS_KEY/KF_AWS_SECRET
This commit is contained in:
Mick Grove 2025-08-02 20:40:16 -07:00
commit 40e760ea2c
20 changed files with 347 additions and 30 deletions

View file

@ -2,6 +2,10 @@
All notable changes to this project will be documented in this file.
## [1.32.0]
- Added support for scanning AWS S3 buckets via `--s3-bucket` and optional `--s3-prefix`
- Added `--role-arn` and `--aws-local-profile` flags for S3 authentication alongside `KF_AWS_KEY`/`KF_AWS_SECRET`
-
## [1.31.0]
- New rules: Telegram bot token, OpenWeatherMap, Apify, Groq
- New OpenAI detectors added (@joshlarsen)

View file

@ -10,7 +10,7 @@ publish = false
[package]
name = "kingfisher"
version = "1.31.0"
version = "1.32.0"
description = "MongoDB's blazingly fast secret scanning and validation tool"
edition.workspace = true
rust-version.workspace = true
@ -186,6 +186,7 @@ oci-client = { version = "0.15", default-features = false, features = ["rustls-t
walkdir = "2.5.0"
p256 = "0.13.2"
ed25519-dalek = { version = "2.2", features = ["pkcs8"] }
aws-sdk-s3 = "1.100.0"
[dependencies.tikv-jemallocator]
version = "0.6"
@ -207,7 +208,7 @@ rand_chacha = "0.9.0"
[profile.release]
debug = false
strip = "debuginfo"
strip = true #"debuginfo"
opt-level = 3 # Maximum optimization for performance
lto = true # Enable Link Time Optimization
codegen-units = 1 # Single codegen unit: better optimization at the cost of slower compilation

View file

@ -26,6 +26,7 @@ Kingfisher originated as a fork of Praetorian's [Nosey Parker](https://github.co
- **Docker images**: public or private via `--docker-image`
- **Jira issues**: JQL-driven scans with `--jira-url` and `--jql`
- **Slack messages**: query-based scans with `--slack-query`
- **AWS S3**: bucket scans via `--s3-bucket`/`--s3-prefix` with credentials from `KF_AWS_KEY`/`KF_AWS_SECRET`, `--role-arn`, or `--aws-local-profile`
- **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md))
**Learn more:** [Introducing Kingfisher: Real-Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation)
@ -109,6 +110,15 @@ docker run --rm \
ghcr.io/mongodb/kingfisher:latest \
scan --git-url https://github.com/org/private_repo.git
# Scan an S3 bucket
# Credentials can come from KF_AWS_KEY/KF_AWS_SECRET, --role-arn, or --aws-local-profile
docker run --rm \
-e KF_AWS_KEY=AKIA... \
-e KF_AWS_SECRET=g5nYW... \
ghcr.io/mongodb/kingfisher:latest \
scan --s3-bucket bucket-name
# Scan and write a JSON report locally
# Here we:
# 1. Mount $PWD → /proj

View file

@ -28,7 +28,8 @@ pub struct InputSpecifierArgs {
"all_gitlab_groups",
"jira_url",
"docker_image",
"slack_query"
"slack_query",
"s3_bucket"
]),
value_hint = ValueHint::AnyPath
)]
@ -107,6 +108,23 @@ pub struct InputSpecifierArgs {
#[arg(long, default_value_t = 100)]
pub max_results: usize,
/// Scan the specified S3 bucket
#[arg(long)]
pub s3_bucket: Option<String>,
/// Optional prefix within the S3 bucket
#[arg(long, requires = "s3_bucket")]
pub s3_prefix: Option<String>,
/// AWS IAM role ARN to assume for S3 access
#[arg(long, requires = "s3_bucket")]
pub role_arn: Option<String>,
/// Use credentials from a local AWS profile in ~/.aws/config
#[arg(long, requires = "s3_bucket")]
pub aws_local_profile: Option<String>,
/// Docker/OCI images to scan (no local Docker required)
#[arg(long = "docker-image")]
pub docker_image: Vec<String>,

View file

@ -54,6 +54,7 @@ pub struct FindingsStore {
origin_meta: FxHashMap<u64, Arc<OriginSet>>,
docker_images: FxHashMap<PathBuf, String>,
slack_links: FxHashMap<PathBuf, String>,
s3_buckets: FxHashMap<PathBuf, String>,
}
impl FindingsStore {
pub fn new(clone_dir: PathBuf) -> Self {
@ -73,6 +74,7 @@ impl FindingsStore {
bloom_items: 0,
docker_images: FxHashMap::default(),
slack_links: FxHashMap::default(),
s3_buckets: FxHashMap::default(),
}
}
@ -306,6 +308,14 @@ impl FindingsStore {
&self.slack_links
}
/// Associates the local directory `dir` with the S3 bucket name `bucket`.
///
/// The pair is stored in the `s3_buckets` map; registering the same `dir`
/// again replaces the bucket name previously stored for it.
pub fn register_s3_bucket(&mut self, dir: PathBuf, bucket: String) {
self.s3_buckets.insert(dir, bucket);
}
/// Returns the map of registered local directories to S3 bucket names.
pub fn s3_buckets(&self) -> &FxHashMap<PathBuf, String> {
&self.s3_buckets
}
pub fn get_finding_data_iter(
&self,
) -> impl Iterator<Item = finding_data::FindingMetadata> + '_ {

View file

@ -30,6 +30,7 @@ pub mod rule_profiling;
pub mod rules;
pub mod rules_database;
pub mod safe_list;
pub mod s3;
pub mod scanner;
pub mod scanner_pool;
pub mod serde_utils;

View file

@ -286,6 +286,11 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
jira_url: None,
jql: None,
max_results: 100,
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Slack query
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),

View file

@ -141,6 +141,17 @@ impl DetailsReporter {
ds.slack_links().get(path).cloned()
}
/// Renders a local path as an `s3://<bucket>/<relative path>` display string.
///
/// Walks the directory-to-bucket registrations held by the datastore and uses
/// the first registered directory that is a prefix of `path`. Returns `None`
/// when the datastore lock cannot be acquired or when no registered directory
/// contains `path`.
fn s3_display_path(&self, path: &std::path::Path) -> Option<String> {
    let store = self.datastore.lock().ok()?;
    // `strip_prefix` succeeds exactly when `path` starts with `root`, so a
    // single call both tests the prefix and yields the remainder.
    let mapped = store.s3_buckets().iter().find_map(|(root, bucket_name)| {
        path.strip_prefix(root)
            .ok()
            .map(|relative| format!("s3://{}/{}", bucket_name, relative.display()))
    });
    mapped
}
fn docker_display_path(&self, path: &std::path::Path) -> Option<String> {
let ds = self.datastore.lock().ok()?;
for (dir, image) in ds.docker_images().iter() {

View file

@ -99,20 +99,22 @@ impl DetailsReporter {
let file_path = rm
.origin
.iter()
.find_map(|origin| {
if let Origin::File(e) = origin {
.find_map(|origin| match origin {
Origin::File(e) => {
if let Some(url) = self.jira_issue_url(&e.path, args) {
Some(url)
} else if let Some(url) = self.slack_message_url(&e.path) {
Some(url)
} else if let Some(mapped) = self.s3_display_path(&e.path) {
Some(mapped)
} else if let Some(mapped) = self.docker_display_path(&e.path) {
Some(mapped)
} else {
Some(e.path.display().to_string())
}
} else {
None
}
}
Origin::Extended(e) => e.path().map(|p| p.display().to_string()),
_ => None,
})
.unwrap_or_default();
@ -258,11 +260,15 @@ impl DetailsReporter {
Some(url)
} else if let Some(url) = self.slack_message_url(&e.path) {
Some(url)
} else if let Some(mapped) = self.s3_display_path(&e.path) {
Some(mapped)
} else if let Some(mapped) = self.docker_display_path(&e.path) {
Some(mapped)
} else {
Some(e.path.display().to_string())
}
} else if let Origin::Extended(e) = origin {
e.path().map(|p| p.display().to_string())
} else {
None
}
@ -437,10 +443,14 @@ mod tests {
jira_url: None,
jql: None,
max_results: 100,
// Docker image scanning
// Slack options
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
docker_image: Vec::new(),
// clone / history options

View file

@ -218,6 +218,8 @@ impl<'a> Display for PrettyFinding<'a> {
url
} else if let Some(url) = reporter.slack_message_url(&e.path) {
url
} else if let Some(mapped) = reporter.s3_display_path(&e.path) {
mapped
} else if let Some(mapped) = reporter.docker_display_path(&e.path) {
mapped
} else {
@ -233,13 +235,23 @@ impl<'a> Display for PrettyFinding<'a> {
}
)?;
}
Origin::Extended(e) => {
if let Some(p) = e.path() {
let display_path = p.display().to_string();
writeln!(
f,
" |Path..........: {}",
if rm.validation_success {
reporter.style_active_creds(&display_path).to_string()
} else {
display_path
}
)?;
}
}
Origin::GitRepo(e) => {
reporter.write_git_metadata(f, e, args, source_span.start.line)?;
}
Origin::Extended(e) => {
writeln!(f, " |Extended......: {}", reporter.style_metadata(e).to_string())?;
// Convert StyledObject to String
}
}
}
Ok(())
@ -353,6 +365,11 @@ fn test_pretty_format_with_nan_entropy_panics() {
// Slack options
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options

View file

@ -75,6 +75,8 @@ impl DetailsReporter {
url
} else if let Some(url) = self.slack_message_url(&e.path) {
url
} else if let Some(mapped) = self.s3_display_path(&e.path) {
mapped
} else {
e.path.display().to_string()
};
@ -82,6 +84,16 @@ impl DetailsReporter {
sarif::ArtifactLocationBuilder::default().uri(uri).build().ok()?,
);
}
Origin::Extended(e) => {
if let Some(p) = e.path() {
artifact_locations.push(
sarif::ArtifactLocationBuilder::default()
.uri(p.display().to_string())
.build()
.ok()?,
);
}
}
Origin::GitRepo(e) => {
// Extract and store Git metadata
if let Some(git_metadata) = self.extract_git_metadata(e, source_span) {
@ -111,7 +123,6 @@ impl DetailsReporter {
);
}
}
Origin::Extended(_) => (),
}
}
@ -212,11 +223,18 @@ impl DetailsReporter {
url
} else if let Some(url) = self.slack_message_url(&e.path) {
url
} else if let Some(mapped) = self.s3_display_path(&e.path) {
mapped
} else {
e.path.display().to_string()
};
msg.push_str(&format!("Location: {}\n", uri));
}
Origin::Extended(e) => {
if let Some(p) = e.path() {
msg.push_str(&format!("Location: {}\n", p.display()));
}
}
Origin::GitRepo(e) => {
if let Some(cs) = &e.first_commit {
let repo_url = get_repo_url(&e.repo_path)
@ -235,9 +253,6 @@ impl DetailsReporter {
msg.push_str(&format!("File: {}", cs.blob_path));
}
}
Origin::Extended(e) => {
msg.push_str(&format!("Extended: {}\n", e));
}
}
msg
} else {

86
src/s3.rs Normal file
View file

@ -0,0 +1,86 @@
use anyhow::{Context, Result};
use aws_config::{meta::region::RegionProviderChain, BehaviorVersion};
use aws_credential_types::Credentials;
use aws_sdk_s3::Client;
/// Visit all objects in the given S3 bucket (optionally under a prefix),
/// calling `visitor` with each object's key and bytes.
///
/// # Credentials and region
/// * If `profile` is given, it is used as the AWS profile name for the config loader.
/// * If both `KF_AWS_KEY` and `KF_AWS_SECRET` are set in the environment, they are
///   installed as the credentials provider.
/// * If `role_arn` is given, the S3 client uses credentials obtained by assuming
///   that role (session name `kingfisher`) on top of the base configuration.
/// * The region comes from the default provider chain, falling back to `us-east-1`.
///
/// # Errors
/// Returns an error, and stops visiting, as soon as listing the bucket,
/// fetching an object, reading an object body, or `visitor` itself fails.
pub async fn visit_bucket_objects<F>(
    bucket: &str,
    prefix: Option<&str>,
    role_arn: Option<&str>,
    profile: Option<&str>,
    mut visitor: F,
) -> Result<()>
where
    F: FnMut(String, Vec<u8>) -> Result<()>,
{
    let mut config_loader = aws_config::defaults(BehaviorVersion::latest());
    if let Some(profile) = profile {
        config_loader = config_loader.profile_name(profile);
    }

    // If explicit credentials are provided via KF_AWS_KEY/KF_AWS_SECRET use them
    if let (Ok(key), Ok(secret)) = (std::env::var("KF_AWS_KEY"), std::env::var("KF_AWS_SECRET")) {
        let creds = Credentials::new(key, secret, None, None, "kf_env");
        config_loader = config_loader.credentials_provider(creds);
    }

    // Resolve region using the default chain, falling back to us-east-1
    let region_provider = RegionProviderChain::default_provider().or_else("us-east-1");
    let base_config = config_loader.region(region_provider).load().await;

    let client = match role_arn {
        Some(role) => {
            let assume_role = aws_config::sts::AssumeRoleProvider::builder(role.to_string())
                .session_name("kingfisher")
                .configure(&base_config)
                .build()
                .await;
            let conf = aws_sdk_s3::config::Builder::from(&base_config)
                .credentials_provider(assume_role)
                .build();
            Client::from_conf(conf)
        }
        None => Client::new(&base_config),
    };

    let mut continuation_token: Option<String> = None;
    loop {
        let mut req = client.list_objects_v2().bucket(bucket);
        if let Some(p) = prefix {
            req = req.prefix(p);
        }
        // The token is only valid for the next request, so move it out instead of cloning.
        if let Some(token) = continuation_token.take() {
            req = req.continuation_token(token);
        }

        let resp = req.send().await.context("Failed to list objects in bucket")?;

        for obj in resp.contents.unwrap_or_default() {
            // Entries without a key cannot be fetched; skip them.
            let Some(key) = obj.key else { continue };
            let get_resp = client
                .get_object()
                .bucket(bucket)
                .key(&key)
                .send()
                .await
                .with_context(|| format!("Failed to fetch object {key}"))?;
            let data = get_resp.body.collect().await.context("Failed to read S3 object body")?;
            visitor(key, data.into_bytes().to_vec())?;
        }

        // Only continue when the listing is truncated AND a token was returned.
        // A truncated response without a token would otherwise restart the
        // listing from the first page and loop forever.
        match (resp.is_truncated.unwrap_or(false), resp.next_continuation_token) {
            (true, Some(token)) => continuation_token = Some(token),
            _ => break,
        }
    }

    Ok(())
}

View file

@ -8,6 +8,7 @@ use indicatif::{HumanCount, ProgressBar, ProgressStyle};
use tokio::time::Duration;
use tracing::{debug, error, info};
use crate::blob::BlobIdMap;
use crate::{
blob::BlobMetadata,
cli::{
@ -21,10 +22,15 @@ use crate::{
git_binary::{CloneMode, Git},
git_url::GitUrl,
github, gitlab, jira,
matcher::Match,
origin::OriginSet,
slack, PathBuf,
matcher::{Match, Matcher, MatcherStats},
origin::{Origin, OriginSet},
rules_database::RulesDatabase,
s3,
scanner::processing::BlobProcessor,
scanner_pool::ScannerPool,
slack, guesser::Guesser, PathBuf,
};
pub type DatastoreMessage = (OriginSet, BlobMetadata, Vec<(Option<f64>, Match)>);
pub fn clone_or_update_git_repos(
@ -284,3 +290,64 @@ pub async fn fetch_slack_messages(
}
Ok(vec![output_dir])
}
/// Scans the objects of the S3 bucket selected on the command line and records
/// every match in `datastore`.
///
/// Does nothing and returns `Ok(())` when `args` carries no `s3_bucket`.
/// Otherwise each object delivered by `s3::visit_bucket_objects` is wrapped in
/// an extended origin whose `path` is `s3://<bucket>/<key>`, run through a
/// `BlobProcessor`, and its matches are recorded in the findings store.
///
/// # Errors
/// Propagates failures from building the matcher, from visiting the bucket,
/// and from processing an object.
///
/// # Panics
/// Panics if the filetype guesser cannot be created or if the datastore mutex
/// is poisoned.
pub async fn fetch_s3_objects(
    args: &scan::ScanArgs,
    datastore: &Arc<Mutex<findings_store::FindingsStore>>,
    rules_db: &RulesDatabase,
    matcher_stats: &Mutex<MatcherStats>,
    enable_profiling: bool,
    shared_profiler: Arc<crate::rule_profiling::ConcurrentRuleProfiler>,
) -> Result<()> {
    let inputs = &args.input_specifier_args;
    let bucket = match inputs.s3_bucket.as_deref() {
        Some(name) => name,
        None => return Ok(()),
    };
    let prefix = inputs.s3_prefix.as_deref();
    let role_arn = inputs.role_arn.as_deref();
    let profile = inputs.aws_local_profile.as_deref();

    // Build a dedicated matcher pipeline for objects streamed from the bucket.
    let seen_blobs = BlobIdMap::new();
    let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
    let matcher = Matcher::new(
        rules_db,
        scanner_pool,
        &seen_blobs,
        Some(matcher_stats),
        enable_profiling,
        Some(shared_profiler.clone()),
    )?;
    let guesser = Guesser::new().expect("should be able to create filetype guesser");
    let mut processor = BlobProcessor { matcher, guesser };

    // Invoked once per object with its key and full contents.
    let scan_object = |key: String, bytes: Vec<u8>| -> Result<()> {
        let origin = OriginSet::new(
            Origin::from_extended(serde_json::json!({
                "path": format!("s3://{}/{}", bucket, key)
            })),
            Vec::new(),
        );
        let blob = crate::blob::Blob::from_bytes(bytes);

        let Some((origin, blob_md, scored_matches)) =
            processor.run(origin, blob, args.no_dedup)?
        else {
            return Ok(());
        };

        // Origin and metadata are shared by every match found in this object.
        let shared_origin = Arc::new(origin);
        let shared_blob = Arc::new(blob_md);
        let batch: Vec<_> = scored_matches
            .into_iter()
            .map(|(_score, m)| (Arc::clone(&shared_origin), Arc::clone(&shared_blob), m))
            .collect();

        let added = datastore.lock().unwrap().record(batch, !args.no_dedup);
        debug!("Added {} new S3 blobs", added);
        Ok(())
    };

    s3::visit_bucket_objects(bucket, prefix, role_arn, profile, scan_object).await?;
    Ok(())
}

View file

@ -18,7 +18,9 @@ use crate::{
rules_database::RulesDatabase,
scanner::{
clone_or_update_git_repos, enumerate_filesystem_inputs, enumerate_github_repos,
repos::{enumerate_gitlab_repos, fetch_jira_issues, fetch_slack_messages},
repos::{
enumerate_gitlab_repos, fetch_jira_issues, fetch_s3_objects, fetch_slack_messages,
},
run_secret_validation, save_docker_images,
summary::print_scan_summary,
},
@ -72,6 +74,7 @@ pub async fn run_async_scan(
let slack_dirs = fetch_slack_messages(args, global_args, &datastore).await?;
input_roots.extend(slack_dirs);
// Save Docker images if specified
if !args.input_specifier_args.docker_image.is_empty() {
let clone_root = {
@ -93,22 +96,42 @@ pub async fn run_async_scan(
}
}
if input_roots.is_empty() {
bail!("No inputs to scan");
}
// if input_roots.is_empty() {
// bail!("No inputs to scan");
// }
let shared_profiler = Arc::new(ConcurrentRuleProfiler::new());
let enable_profiling = args.rule_stats;
let matcher_stats = Mutex::new(MatcherStats::default());
let _inputs = enumerate_filesystem_inputs(
// Fetch S3 objects if requested (scanned immediately)
fetch_s3_objects(
args,
datastore.clone(),
&input_roots,
progress_enabled,
&datastore,
rules_db,
&matcher_stats,
enable_profiling,
Arc::clone(&shared_profiler),
&matcher_stats,
)?;
)
.await?;
let has_s3 = args.input_specifier_args.s3_bucket.is_some();
if input_roots.is_empty() && !has_s3 {
bail!("No inputs to scan");
}
if !input_roots.is_empty() {
let _inputs = enumerate_filesystem_inputs(
args,
datastore.clone(),
&input_roots,
progress_enabled,
rules_db,
enable_profiling,
Arc::clone(&shared_profiler),
&matcher_stats,
)?;
}
if !args.no_dedup {
// Final deduplication step before validation (or before reporting)

View file

@ -84,6 +84,11 @@ rules:
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options

View file

@ -71,6 +71,11 @@ fn test_github_remote_scan() -> Result<()> {
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options

View file

@ -70,6 +70,11 @@ fn test_gitlab_remote_scan() -> Result<()> {
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,

View file

@ -59,6 +59,10 @@ impl TestContext {
jql: None,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
max_results: 10,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
@ -147,6 +151,11 @@ async fn test_scan_slack_messages() -> Result<()> {
slack_query: Some("test".into()),
slack_api_url: Url::parse(&format!("{}/", server.uri()))?,
max_results: 10,
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,

View file

@ -127,6 +127,11 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options

View file

@ -70,6 +70,11 @@ impl TestContext {
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@ -142,6 +147,11 @@ impl TestContext {
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options