Merge pull request #71 from mongodb/development

v1.32.0
This commit is contained in:
Mick Grove 2025-08-04 13:32:29 -07:00 committed by GitHub
commit 861c3d1ff7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
27 changed files with 835 additions and 1367 deletions

View file

@ -24,6 +24,8 @@ jobs:
run: make ubuntu-arm64
- name: Run tests
run: make tests
env:
CARGO_BUILD_JOBS: 1
macos-arm64:
name: macOS arm64

View file

@ -2,6 +2,12 @@
All notable changes to this project will be documented in this file.
## [1.32.0]
- Added support for scanning AWS S3 buckets via `--s3-bucket` and optional `--s3-prefix`
- Added `--role-arn` and `--aws-local-profile` flags for S3 authentication alongside `KF_AWS_KEY`/`KF_AWS_SECRET`
- Added progress bar for scanning S3 buckets
- Refactored output reporting and formatting logic
## [1.31.0]
- New rules: Telegram bot token, OpenWeatherMap, Apify, Groq
- New OpenAI detectors added (@joshlarsen)

View file

@ -10,7 +10,7 @@ publish = false
[package]
name = "kingfisher"
version = "1.31.0"
version = "1.32.0"
description = "MongoDB's blazingly fast secret scanning and validation tool"
edition.workspace = true
rust-version.workspace = true
@ -186,6 +186,7 @@ oci-client = { version = "0.15", default-features = false, features = ["rustls-t
walkdir = "2.5.0"
p256 = "0.13.2"
ed25519-dalek = { version = "2.2", features = ["pkcs8"] }
aws-sdk-s3 = "1.100.0"
[dependencies.tikv-jemallocator]
version = "0.6"
@ -207,7 +208,7 @@ rand_chacha = "0.9.0"
[profile.release]
debug = false
strip = "debuginfo"
strip = true #"debuginfo"
opt-level = 3 # Maximum optimization for performance
lto = true # Enable Link Time Optimization
codegen-units = 1 # Single codegen unit: better-optimized code, but slower compilation

View file

@ -13,7 +13,7 @@ Kingfisher originated as a fork of Praetorian's [Nosey Parker](https://github.co
## What Kingfisher Adds
- **Live validation** via cloud-provider APIs
- **Language-aware detection** (source-code parsing) for ~20 languages
- **Extra targets**: GitLab repos, Docker images, Jira issues, and Slack messages
- **Extra targets**: GitLab repos, S3 buckets, Docker images, Jira issues, and Slack messages
- **Baseline mode**: ignore known secrets, flag only new ones
- **Native Windows** binary
@ -26,6 +26,7 @@ Kingfisher originated as a fork of Praetorian's [Nosey Parker](https://github.co
- **Docker images**: public or private via `--docker-image`
- **Jira issues**: JQL-driven scans with `--jira-url` and `--jql`
- **Slack messages**: query-based scans with `--slack-query`
- **AWS S3**: bucket scans via `--s3-bucket`/`--s3-prefix` with credentials from `KF_AWS_KEY`/`KF_AWS_SECRET`, `--role-arn`, `--aws-local-profile`, or anonymous
- **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md))
**Learn more:** [Introducing Kingfisher: Real-Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation)
@ -109,6 +110,15 @@ docker run --rm \
ghcr.io/mongodb/kingfisher:latest \
scan --git-url https://github.com/org/private_repo.git
# Scan an S3 bucket
# Credentials can come from KF_AWS_KEY/KF_AWS_SECRET, --role-arn, or --aws-local-profile
docker run --rm \
-e KF_AWS_KEY=AKIA... \
-e KF_AWS_SECRET=g5nYW... \
ghcr.io/mongodb/kingfisher:latest \
scan --s3-bucket bucket-name
# Scan and write a JSON report locally
# Here we:
# 1. Mount $PWD → /proj
@ -264,6 +274,57 @@ kingfisher scan ./my-project \
--exclude tests \
-v
```
## Scan an S3 bucket
You can scan S3 objects directly:
```bash
kingfisher scan --s3-bucket bucket-name [--s3-prefix path/]
```
Credential resolution happens in this order:
1. `KF_AWS_KEY` and `KF_AWS_SECRET` environment variables
2. `--aws-local-profile` pointing to a profile in `~/.aws/config` (works with AWS SSO)
3. anonymous access for public buckets
If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role.
Examples:
```bash
# using explicit keys
export KF_AWS_KEY=AKIA...
export KF_AWS_SECRET=g5nYW...
kingfisher scan --s3-bucket some-example-bucket
# Above can also be run as:
KF_AWS_KEY=AKIA... KF_AWS_SECRET=g5nYW... kingfisher scan --s3-bucket some-example-bucket
# using a local profile (e.g., SSO) that exists in your AWS profile (~/.aws/config)
kingfisher scan --s3-bucket some-example-bucket --aws-local-profile default
# anonymous scan of a bucket, providing an object prefix to scan only a subset of the S3 bucket
kingfisher scan \
--s3-bucket awsglue-datasets \
--s3-prefix examples/us-legislators/all
# assuming a role when scanning
kingfisher scan --s3-bucket some-example-bucket \
--role-arn arn:aws:iam::123456789012:role/MyRole
# anonymous scan of a public bucket
kingfisher scan --s3-bucket some-example-bucket
```
Docker example:
```bash
docker run --rm \
-e KF_AWS_KEY=AKIA... \
-e KF_AWS_SECRET=g5nYW... \
ghcr.io/mongodb/kingfisher:latest \
scan --s3-bucket bucket-name
```
## Scanning Docker Images
Kingfisher will first try to use any locally available image, then fall back to pulling via OCI.

View file

@ -20,6 +20,6 @@ rules:
[a-z0-9\/._~-]*
)?
min_entropy: 3.0
confidence: low
confidence: medium
examples:
- https://eaRIWNkE:qyOIhJiM@j2LYY414Q5cCYD

View file

@ -96,7 +96,6 @@ rules:
["']
min_entropy: 3.3
confidence: low
categories: [fuzzy, generic, secret]
examples:
- |
password = "super$ecret"

21
data/rules/vmware.yml Normal file
View file

@ -0,0 +1,21 @@
rules:
- name: Credentials in Connect-VIServer Invocation
id: kingfisher.vmware.1
pattern: |
(?xi)
Connect-VIServer
.{0,50}
-User \s+ (\S{3,30}) \s+ (?# username )
.{0,50}
-Password \s+ (\S{3,30}) (?# password )
examples:
- 'Connect-VIServer -Server 192.168.1.51 -User administrator@vSphere.local -Password VMware1!'
- |
#Set-PowerCLIConfiguration -InvalidCertificateAction:Ignore
Connect-VIServer "$endpoint" -User "$username" -Password "$password" | Out-Null
- 'Connect-VIServer $ESXiHost.EsxiHost -user $ESXiUser -password $ESXipass'
- '$null = connect-viserver vc.lab.local -user administrator@vsphere.local -password VMware1!'
references:
- https://developer.broadcom.com/powercli/latest/vmware.vimautomation.core/commands/connect-viserver

View file

@ -28,7 +28,8 @@ pub struct InputSpecifierArgs {
"all_gitlab_groups",
"jira_url",
"docker_image",
"slack_query"
"slack_query",
"s3_bucket"
]),
value_hint = ValueHint::AnyPath
)]
@ -107,6 +108,22 @@ pub struct InputSpecifierArgs {
#[arg(long, default_value_t = 100)]
pub max_results: usize,
/// Scan the specified S3 bucket
#[arg(long)]
pub s3_bucket: Option<String>,
/// Optional prefix within the S3 bucket
#[arg(long, requires = "s3_bucket")]
pub s3_prefix: Option<String>,
/// AWS IAM role ARN to assume for S3 access
#[arg(long, requires = "s3_bucket")]
pub role_arn: Option<String>,
/// Use credentials from a local AWS profile in ~/.aws/config
#[arg(long, requires = "s3_bucket")]
pub aws_local_profile: Option<String>,
/// Docker/OCI images to scan (no local Docker required)
#[arg(long = "docker-image")]
pub docker_image: Vec<String>,

View file

@ -54,6 +54,7 @@ pub struct FindingsStore {
origin_meta: FxHashMap<u64, Arc<OriginSet>>,
docker_images: FxHashMap<PathBuf, String>,
slack_links: FxHashMap<PathBuf, String>,
s3_buckets: FxHashMap<PathBuf, String>,
}
impl FindingsStore {
pub fn new(clone_dir: PathBuf) -> Self {
@ -73,6 +74,7 @@ impl FindingsStore {
bloom_items: 0,
docker_images: FxHashMap::default(),
slack_links: FxHashMap::default(),
s3_buckets: FxHashMap::default(),
}
}
@ -306,6 +308,14 @@ impl FindingsStore {
&self.slack_links
}
/// Records that the contents of `dir` belong to the S3 bucket named `bucket`.
///
/// If `dir` was already registered, its previous bucket name is replaced.
pub fn register_s3_bucket(&mut self, dir: PathBuf, bucket: String) {
    // The previously registered bucket (if any) is intentionally discarded.
    let _previous = self.s3_buckets.insert(dir, bucket);
}
/// Returns the directory path → bucket name map populated by
/// `register_s3_bucket`.
pub fn s3_buckets(&self) -> &FxHashMap<PathBuf, String> {
&self.s3_buckets
}
pub fn get_finding_data_iter(
&self,
) -> impl Iterator<Item = finding_data::FindingMetadata> + '_ {

View file

@ -29,6 +29,7 @@ pub mod rule_loader;
pub mod rule_profiling;
pub mod rules;
pub mod rules_database;
pub mod s3;
pub mod safe_list;
pub mod scanner;
pub mod scanner_pool;

View file

@ -286,6 +286,11 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
jira_url: None,
jql: None,
max_results: 100,
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Slack query
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),

View file

@ -5,12 +5,12 @@ use std::{
use anyhow::Result;
use http::StatusCode;
use indenter::indented;
use schemars::JsonSchema;
use serde::Serialize;
use crate::{
blob::BlobMetadata,
bstring_escape::Escaped,
cli,
cli::global::GlobalArgs,
finding_data, findings_store,
@ -23,7 +23,7 @@ mod json_format;
mod pretty_format;
mod sarif_format;
pub mod styles;
use std::{hash::Hash, io::IsTerminal};
use std::io::IsTerminal;
use styles::{StyledObject, Styles};
@ -141,6 +141,17 @@ impl DetailsReporter {
ds.slack_links().get(path).cloned()
}
/// Maps a local file path back to an `s3://<bucket>/<key>` display string.
///
/// Searches the datastore's registered S3 directories for one that is a
/// prefix of `path`; the remainder of `path` below that directory becomes
/// the object key.
///
/// Returns `None` when the datastore lock cannot be acquired or when no
/// registered directory contains `path`.
fn s3_display_path(&self, path: &std::path::Path) -> Option<String> {
    let ds = self.datastore.lock().ok()?;
    ds.s3_buckets().iter().find_map(|(dir, bucket)| {
        // `strip_prefix` succeeds exactly when `path` starts with `dir`, so a
        // separate `starts_with` check is unnecessary.
        let rel = path.strip_prefix(dir).ok()?;
        // Join the components with '/' instead of using `Path::display`, which
        // would render platform separators (backslashes on Windows) and yield
        // a malformed S3 URI.
        let key = rel
            .components()
            .map(|part| part.as_os_str().to_string_lossy())
            .collect::<Vec<_>>()
            .join("/");
        Some(format!("s3://{}/{}", bucket, key))
    })
}
fn docker_display_path(&self, path: &std::path::Path) -> Option<String> {
let ds = self.datastore.lock().ok()?;
for (dir, image) in ds.docker_images().iter() {
@ -156,19 +167,6 @@ impl DetailsReporter {
None
}
fn gather_findings(&self) -> Result<Vec<Finding>> {
let metadata_list = self.get_finding_data()?;
let all_matches = self.get_filtered_matches()?;
let mut findings = Vec::new();
for md in metadata_list {
// Filter matches that belong to this metadata if needed
let matches_for_md =
all_matches.iter().filter(|m| m.m.rule_name == md.rule_name).cloned().collect();
findings.push(Finding::new(md.clone(), matches_for_md));
}
Ok(findings)
}
fn process_matches(&self, only_valid: bool, filter_visible: bool) -> Result<Vec<ReportMatch>> {
let datastore = self.datastore.lock().unwrap();
Ok(datastore
@ -215,38 +213,6 @@ impl DetailsReporter {
.collect())
}
// fn process_matches(&self, only_valid: bool) -> Result<Vec<ReportMatch>> {
// let datastore = self.datastore.lock().unwrap();
// Ok(datastore
// .get_matches()
// .iter()
// .filter(|msg| {
// let (_origin, _blob_metadata, match_item) = &***msg;
// if only_valid {
// match_item.validation_success
// && match_item.validation_response_status != StatusCode::CONTINUE.as_u16()
// && match_item.visible
// } else {
// match_item.visible
// }
// })
// .map(|msg| {
// let (origin, blob_metadata, match_item) = &**msg;
// ReportMatch {
// origin: origin.clone(),
// blob_metadata: blob_metadata.clone(),
// m: match_item.clone(),
// comment: None,
// visible: match_item.visible,
// match_confidence: match_item.rule_confidence,
// validation_response_body: match_item.validation_response_body.clone(),
// validation_response_status: match_item.validation_response_status,
// validation_success: match_item.validation_success,
// }
// })
// .collect())
// }
/// Returns the matches selected for reporting.
///
/// Delegates to `process_matches`, passing the reporter's own `only_valid`
/// setting and enabling the `filter_visible` flag.
pub fn get_filtered_matches(&self) -> Result<Vec<ReportMatch>> {
    let only_valid = self.only_valid;
    let filter_visible = true;
    self.process_matches(only_valid, filter_visible)
}
@ -255,24 +221,164 @@ impl DetailsReporter {
self.process_matches(only_valid.unwrap_or(self.only_valid), false)
}
fn get_finding_data(&self) -> Result<Vec<finding_data::FindingMetadata>> {
let datastore = self.datastore.lock().unwrap();
Ok(datastore
.get_finding_data_iter()
.filter(|metadata| {
if self.only_valid {
datastore.get_matches().iter().any(|msg| {
let (_, _, match_item) = &**msg;
match_item.rule_name == metadata.rule_name
&& match_item.validation_success
&& match_item.validation_response_status
!= StatusCode::CONTINUE.as_u16()
})
/// Collapses matches that share a finding fingerprint into a single match.
///
/// The first match seen for a fingerprint is kept; the origins of later
/// matches with the same fingerprint are appended to its origin set, skipping
/// origins that are already present.
///
/// The returned matches preserve the order in which each fingerprint was
/// first encountered, so repeated runs over the same input produce reports
/// in the same order.
///
/// When `no_dedup` is `true`, `matches` is returned unchanged.
pub fn deduplicate_matches(
    &self,
    matches: Vec<ReportMatch>,
    no_dedup: bool,
) -> Vec<ReportMatch> {
    use std::collections::{hash_map::Entry, HashMap};

    if no_dedup {
        return matches;
    }

    // `unique` holds the surviving matches in first-seen order; `slot_by_fp`
    // maps a fingerprint to the index of its match inside `unique`. Iterating a
    // hash map directly would return the matches in arbitrary order.
    let mut unique: Vec<ReportMatch> = Vec::new();
    let mut slot_by_fp: HashMap<u64, usize> = HashMap::new();

    for rm in matches {
        match slot_by_fp.entry(rm.m.finding_fingerprint) {
            Entry::Vacant(slot) => {
                slot.insert(unique.len());
                unique.push(rm);
            }
            Entry::Occupied(slot) => {
                let existing = &mut unique[*slot.get()];
                // merge origin sets (keep first origin, append the rest)
                for o in rm.origin.iter() {
                    if !existing.origin.iter().any(|e| e == o) {
                        existing.origin = OriginSet::new(
                            existing.origin.first().clone(),
                            existing
                                .origin
                                .iter()
                                .skip(1)
                                .cloned()
                                .chain(std::iter::once(o.clone()))
                                .collect(),
                        );
                    }
                }
            }
        }
    }

    unique
}
fn matches_for_output(&self, args: &cli::commands::scan::ScanArgs) -> Result<Vec<ReportMatch>> {
let mut matches = self.get_filtered_matches()?;
if !args.no_dedup {
matches = self.deduplicate_matches(matches, args.no_dedup);
}
if args.no_dedup {
let mut expanded = Vec::new();
for rm in matches {
if rm.origin.len() > 1 {
for origin in rm.origin.iter() {
let mut single = rm.clone();
single.origin = OriginSet::new(origin.clone(), Vec::new());
expanded.push(single);
}
} else {
true
expanded.push(rm);
}
}
matches = expanded;
}
Ok(matches)
}
pub fn build_finding_record(
&self,
rm: &ReportMatch,
args: &cli::commands::scan::ScanArgs,
) -> FindingReporterRecord {
let source_span = &rm.m.location.source_span;
let line_num = source_span.start.line;
let snippet = Escaped(
rm.m.groups
.captures
.get(1)
.or_else(|| rm.m.groups.captures.get(0))
.map(|capture| capture.value.as_bytes())
.unwrap_or_default(),
)
.to_string();
let validation_status = if rm.validation_success {
"Active Credential".to_string()
} else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() {
"Not Attempted".to_string()
} else {
"Inactive Credential".to_string()
};
const MAX_RESPONSE_LENGTH: usize = 512;
let truncated_body: String =
rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect();
let ellipsis =
if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" };
let response_body = format!("{}{}", truncated_body, ellipsis);
let git_metadata_val = rm
.origin
.iter()
.filter_map(|origin| {
if let Origin::GitRepo(e) = origin {
self.extract_git_metadata(e, source_span)
} else {
None
}
})
.collect())
.next();
let file_path = rm
.origin
.iter()
.find_map(|origin| match origin {
Origin::File(e) => {
if let Some(url) = self.jira_issue_url(&e.path, args) {
Some(url)
} else if let Some(url) = self.slack_message_url(&e.path) {
Some(url)
} else if let Some(mapped) = self.s3_display_path(&e.path) {
Some(mapped)
} else if let Some(mapped) = self.docker_display_path(&e.path) {
Some(mapped)
} else {
Some(e.path.display().to_string())
}
}
Origin::Extended(e) => e.path().map(|p| p.display().to_string()),
_ => None,
})
.unwrap_or_default();
FindingReporterRecord {
rule: RuleMetadata {
name: rm.m.rule_name.to_string(),
id: rm.m.rule_text_id.to_string(),
},
finding: FindingRecordData {
snippet,
fingerprint: rm.m.finding_fingerprint.to_string(),
confidence: rm.match_confidence.to_string(),
entropy: format!("{:.2}", rm.m.calculated_entropy),
validation: ValidationInfo { status: validation_status, response: response_body },
language: rm
.blob_metadata
.language
.clone()
.unwrap_or_else(|| "Unknown".to_string()),
line: line_num as u32,
column_start: source_span.start.column as u32,
column_end: source_span.end.column as u32,
path: file_path,
git_metadata: git_metadata_val,
},
}
}
/// Builds one `FindingReporterRecord` per match selected for output.
///
/// The matches come from `matches_for_output(args)`; each one is converted
/// with `build_finding_record`, preserving their order. Any error from
/// `matches_for_output` is propagated.
pub fn build_finding_records(
    &self,
    args: &cli::commands::scan::ScanArgs,
) -> Result<Vec<FindingReporterRecord>> {
    let selected = self.matches_for_output(args)?;
    let mut records = Vec::with_capacity(selected.len());
    for report_match in &selected {
        records.push(self.build_finding_record(report_match, args));
    }
    Ok(records)
}
fn style_finding_heading<D>(&self, val: D) -> StyledObject<D> {
@ -336,13 +442,7 @@ impl Reportable for DetailsReporter {
}
}
}
/// A group of matches that all have the same rule and capture group content
#[derive(Serialize, JsonSchema)]
pub(crate) struct Finding {
#[serde(flatten)]
metadata: finding_data::FindingMetadata,
matches: Vec<ReportMatch>,
}
/// A match produced by one of kingfisher's rules.
/// This corresponds to a single location.
#[derive(Serialize, JsonSchema, Clone)]
@ -355,18 +455,14 @@ pub struct ReportMatch {
#[serde(flatten)]
pub m: Match,
/// An optional score assigned to the match
// #[validate(range(min = 0.0, max = 1.0))]
// score: Option<f64>,
/// An optional comment assigned to the match
pub comment: Option<String>,
/// The confidence level of the match
pub match_confidence: Confidence,
/// Whether the match is visible in the output
pub visible: bool,
/// An optional status assigned to the match
// status: Option<finding_data::Status>,
/// Validation Body
pub validation_response_body: String,
@ -377,6 +473,41 @@ pub struct ReportMatch {
/// Validation Success
pub validation_success: bool,
}
/// One reported finding, as produced by `DetailsReporter::build_finding_record`.
#[derive(Serialize, JsonSchema, Clone, Debug)]
pub struct FindingReporterRecord {
/// The rule that produced the match.
pub rule: RuleMetadata,
/// Details of the match itself.
pub finding: FindingRecordData,
}
/// Identifies the rule that produced a finding.
#[derive(Serialize, JsonSchema, Clone, Debug)]
pub struct RuleMetadata {
/// The rule's name (the match's `rule_name`).
pub name: String,
/// The rule's textual identifier (the match's `rule_text_id`).
pub id: String,
}
/// Outcome of validating a finding.
#[derive(Serialize, JsonSchema, Clone, Debug)]
pub struct ValidationInfo {
/// One of `"Active Credential"`, `"Not Attempted"` or `"Inactive Credential"`.
pub status: String,
/// Leading portion of the validation response body (capped at 512
/// characters), with `...` appended when the body is longer than the cap.
pub response: String,
}
/// The per-match details of a reported finding.
#[derive(Serialize, JsonSchema, Clone, Debug)]
pub struct FindingRecordData {
/// Escaped text of the match's capture at index 1, falling back to index 0;
/// empty when neither capture exists.
pub snippet: String,
/// The match's finding fingerprint, rendered as a string.
pub fingerprint: String,
/// The match confidence, rendered as a string.
pub confidence: String,
/// The match's calculated entropy, formatted with two decimal places.
pub entropy: String,
/// Validation status and (truncated) validation response.
pub validation: ValidationInfo,
/// Language recorded in the blob metadata, or `"Unknown"` when absent.
pub language: String,
/// Start line of the match's source span.
pub line: u32,
/// Start column of the match's source span.
pub column_start: u32,
/// End column of the match's source span.
pub column_end: u32,
/// Display path or URL for the finding's origin; empty when no origin
/// provides one.
pub path: String,
/// Git metadata from the first Git repository origin that yields any.
/// Left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub git_metadata: Option<serde_json::Value>,
}
impl From<finding_data::FindingDataEntry> for ReportMatch {
fn from(e: finding_data::FindingDataEntry) -> Self {
ReportMatch {
@ -392,8 +523,4 @@ impl From<finding_data::FindingDataEntry> for ReportMatch {
}
}
}
impl Finding {
fn new(metadata: finding_data::FindingMetadata, matches: Vec<ReportMatch>) -> Self {
Self { metadata, matches }
}
}

View file

@ -1,90 +1,17 @@
use bson::Document;
use serde_json::Value;
use super::*;
impl DetailsReporter {
/// Formats findings as BSON and writes them to the provided writer.
/// Each finding is serialized as its own BSON document and written in sequence.
pub fn bson_format<W: std::io::Write>(
&self,
mut writer: W,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
// Get filtered matches
let mut matches = self.get_filtered_matches()?;
// Apply deduplication only if requested
if !args.no_dedup {
matches = self.deduplicate_matches(matches, args.no_dedup);
}
let mut bson_findings = Vec::new();
// For each match, handle it based on the no_dedup flag
for rm in matches {
if args.no_dedup && rm.origin.len() > 1 {
// For no_dedup and multiple origins, create separate findings for each origin
for origin in rm.origin.iter() {
// Create a single-origin version of this match
let single_origin_rm = ReportMatch {
origin: OriginSet::new(origin.clone(), Vec::new()),
blob_metadata: rm.blob_metadata.clone(),
m: rm.m.clone(),
comment: rm.comment.clone(),
visible: rm.visible,
match_confidence: rm.match_confidence,
validation_response_body: rm.validation_response_body.clone(),
validation_response_status: rm.validation_response_status,
validation_success: rm.validation_success,
};
// Process to JSON first, then convert to BSON
let json_finding = self.process_match_to_json(&single_origin_rm, args)?;
if let Ok(bson_doc) = json_to_bson_document(&json_finding) {
bson_findings.push(bson_doc);
}
}
} else {
// Process normally for deduped matches or matches with only one origin
let json_finding = self.process_match_to_json(&rm, args)?;
if let Ok(bson_doc) = json_to_bson_document(&json_finding) {
bson_findings.push(bson_doc);
}
}
}
// Write each BSON document
for doc in bson_findings {
let records = self.build_finding_records(args)?;
for record in records {
let doc = bson::to_document(&record)?;
doc.to_writer(&mut writer)?;
}
Ok(())
}
// pub fn bson_format<W: std::io::Write>(
// &self,
// mut writer: W,
// args: &cli::commands::scan::ScanArgs,
// ) -> Result<()> {
// let findings = self.gather_json_findings(args)?;
// // Print the full JSON for each finding
// for finding in &findings {
// println!("Full JSON:\n{}", serde_json::to_string_pretty(finding)?);
// }
// let bson_findings: Vec<Document> = findings
// .into_iter()
// .filter_map(|finding| json_to_bson_document(&finding).ok())
// .collect();
// for doc in bson_findings {
// doc.to_writer(&mut writer)?;
// }
// Ok(())
// }
}
fn json_to_bson_document(json: &Value) -> Result<Document> {
match bson::to_bson(json)? {
bson::Bson::Document(doc) => Ok(doc),
_ => Err(anyhow::anyhow!("Failed to convert JSON to BSON document")),
}
}

View file

@ -1,409 +1,60 @@
use http::StatusCode;
use serde_json::json;
use super::*;
use crate::bstring_escape::Escaped;
impl DetailsReporter {
pub fn deduplicate_matches(
&self,
matches: Vec<ReportMatch>,
no_dedup: bool,
) -> Vec<ReportMatch> {
if no_dedup {
return matches;
}
use std::collections::HashMap;
let mut by_fp: HashMap<u64, ReportMatch> = HashMap::new();
for rm in matches {
let fp = rm.m.finding_fingerprint;
if let Some(existing) = by_fp.get_mut(&fp) {
// merge origin sets (keep first origin, append the rest)
for o in rm.origin.iter() {
if !existing.origin.iter().any(|e| e == o) {
existing.origin = OriginSet::new(
existing.origin.first().clone(),
existing
.origin
.iter()
.skip(1)
.cloned()
.chain(std::iter::once(o.clone()))
.collect(),
);
}
}
continue;
}
by_fp.insert(fp, rm);
}
by_fp.into_values().collect()
}
pub fn gather_json_findings(
&self,
args: &cli::commands::scan::ScanArgs,
) -> Result<Vec<serde_json::Value>> {
let mut matches = self.get_filtered_matches()?;
if !args.no_dedup {
matches = self.deduplicate_matches(matches, args.no_dedup);
}
let mut json_findings = Vec::new();
for rm in matches {
let source_span = &rm.m.location.source_span;
let line_num = source_span.start.line;
let snippet = Escaped(
rm.m.groups
.captures
.get(1)
.or_else(|| rm.m.groups.captures.get(0))
.map(|capture| capture.value.as_bytes())
.unwrap_or_default(),
)
.to_string();
let validation_status = if rm.validation_success {
"Active Credential"
} else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() {
"Not Attempted"
} else {
"Inactive Credential"
};
const MAX_RESPONSE_LENGTH: usize = 512;
let truncated_body: String =
rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect();
let ellipsis =
if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" };
let response_body = format!("{}{}", truncated_body, ellipsis);
// Call extract_git_metadata on each GitRepo origin and take the first non-null result.
let git_metadata_val = rm
.origin
.iter()
.filter_map(|origin| {
if let Origin::GitRepo(e) = origin {
self.extract_git_metadata(e, source_span)
} else {
None
}
})
.next()
.unwrap_or(serde_json::Value::Null);
// Collect a file path from an Origin::File, if available.
let file_path = rm
.origin
.iter()
.find_map(|origin| {
if let Origin::File(e) = origin {
if let Some(url) = self.jira_issue_url(&e.path, args) {
Some(url)
} else if let Some(url) = self.slack_message_url(&e.path) {
Some(url)
} else if let Some(mapped) = self.docker_display_path(&e.path) {
Some(mapped)
} else {
Some(e.path.display().to_string())
}
} else {
None
}
})
.unwrap_or_default();
let match_json = json!({
"rule": {
"name": rm.m.rule_name,
"id": rm.m.rule_text_id,
},
"finding": {
"snippet": snippet,
"fingerprint": rm.m.finding_fingerprint.to_string(),
"confidence": rm.match_confidence.to_string(),
"entropy": format!("{:.2}", rm.m.calculated_entropy),
"validation": {
"status": validation_status,
"response": response_body,
},
"language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()),
"line": line_num,
"column_start": source_span.start.column,
"column_end": source_span.end.column,
"path": file_path,
"git_metadata": git_metadata_val
}
});
let finding_json = json!({
"id": rm.m.rule_text_id,
"matches": [ match_json ]
});
json_findings.push(finding_json);
}
Ok(json_findings)
}
pub fn json_format<W: std::io::Write>(
&self,
mut writer: W,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
let mut findings = Vec::new();
// Get filtered matches
let mut matches = self.get_filtered_matches()?;
// Apply deduplication only if requested
if !args.no_dedup {
matches = self.deduplicate_matches(matches, args.no_dedup);
}
// For each match, handle it based on the no_dedup flag
for rm in matches {
if args.no_dedup && rm.origin.len() > 1 {
// For no_dedup and multiple origins, create separate findings for each origin
for origin in rm.origin.iter() {
// Create a single-origin version of this match
let single_origin_rm = ReportMatch {
origin: OriginSet::new(origin.clone(), Vec::new()),
blob_metadata: rm.blob_metadata.clone(),
m: rm.m.clone(),
comment: rm.comment.clone(),
visible: rm.visible,
match_confidence: rm.match_confidence,
validation_response_body: rm.validation_response_body.clone(),
validation_response_status: rm.validation_response_status,
validation_success: rm.validation_success,
};
// Process this single-origin match into a JSON finding
let json_finding = self.process_match_to_json(&single_origin_rm, args)?;
findings.push(json_finding);
}
} else {
// Process normally for deduped matches or matches with only one origin
let json_finding = self.process_match_to_json(&rm, args)?;
findings.push(json_finding);
}
}
// Write the JSON output
if !findings.is_empty() {
serde_json::to_writer_pretty(&mut writer, &findings)?;
let records = self.build_finding_records(args)?;
if !records.is_empty() {
serde_json::to_writer_pretty(&mut writer, &records)?;
writeln!(writer)?;
}
Ok(())
}
// Add a helper method to convert a ReportMatch to a JSON finding
pub fn process_match_to_json(
&self,
rm: &ReportMatch,
args: &cli::commands::scan::ScanArgs,
) -> Result<serde_json::Value> {
// Extract the relevant data from the match as you already do in your current implementation
let source_span = &rm.m.location.source_span;
let line_num = source_span.start.line;
let snippet = Escaped(
rm.m.groups
.captures
.get(1)
.or_else(|| rm.m.groups.captures.get(0))
.map(|capture| capture.value.as_bytes())
.unwrap_or_default(),
)
.to_string();
let validation_status = if rm.validation_success {
"Active Credential"
} else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() {
"Not Attempted"
} else {
"Inactive Credential"
};
const MAX_RESPONSE_LENGTH: usize = 512;
let truncated_body: String =
rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect();
let ellipsis =
if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" };
let response_body = format!("{}{}", truncated_body, ellipsis);
// Call extract_git_metadata on each GitRepo origin and take the first non-null result.
let git_metadata_val = rm
.origin
.iter()
.filter_map(|origin| {
if let Origin::GitRepo(e) = origin {
self.extract_git_metadata(e, source_span)
} else {
None
}
})
.next()
.unwrap_or(serde_json::Value::Null);
// Collect a file path from an Origin::File, if available.
let file_path = rm
.origin
.iter()
.find_map(|origin| {
if let Origin::File(e) = origin {
if let Some(url) = self.jira_issue_url(&e.path, args) {
Some(url)
} else if let Some(url) = self.slack_message_url(&e.path) {
Some(url)
} else if let Some(mapped) = self.docker_display_path(&e.path) {
Some(mapped)
} else {
Some(e.path.display().to_string())
}
} else {
None
}
})
.unwrap_or_default();
let match_json = json!({
"rule": {
"name": rm.m.rule_name,
"id": rm.m.rule_text_id,
},
"finding": {
"snippet": snippet,
"fingerprint": rm.m.finding_fingerprint.to_string(),
"confidence": rm.match_confidence.to_string(),
"entropy": format!("{:.2}", rm.m.calculated_entropy),
"validation": {
"status": validation_status,
"response": response_body,
},
"language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()),
"line": line_num,
"column_start": source_span.start.column,
"column_end": source_span.end.column,
"path": file_path,
"git_metadata": git_metadata_val
}
});
let finding_json = json!({
"id": rm.m.rule_text_id,
"matches": [ match_json ]
});
Ok(finding_json)
}
// // Modified JSON format to pass args to gather_json_findings
// pub fn json_format<W: std::io::Write>(
// &self,
// mut writer: W,
// args: &cli::commands::scan::ScanArgs,
// ) -> Result<()> {
// let findings = self.gather_json_findings(args)?;
// if !findings.is_empty() {
// serde_json::to_writer_pretty(&mut writer, &findings)?;
// writeln!(writer)?;
// }
// Ok(())
// }
pub fn jsonl_format<W: std::io::Write>(
&self,
mut writer: W,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
// Get filtered matches
let mut matches = self.get_filtered_matches()?;
// Apply deduplication only if requested
if !args.no_dedup {
matches = self.deduplicate_matches(matches, args.no_dedup);
}
// For each match, handle it based on the no_dedup flag
for rm in matches {
if args.no_dedup && rm.origin.len() > 1 {
// For no_dedup and multiple origins, create separate findings for each origin
for origin in rm.origin.iter() {
// Create a single-origin version of this match
let single_origin_rm = ReportMatch {
origin: OriginSet::new(origin.clone(), Vec::new()),
blob_metadata: rm.blob_metadata.clone(),
m: rm.m.clone(),
comment: rm.comment.clone(),
visible: rm.visible,
match_confidence: rm.match_confidence,
validation_response_body: rm.validation_response_body.clone(),
validation_response_status: rm.validation_response_status,
validation_success: rm.validation_success,
};
// Process this single-origin match into a JSON finding and write it
let json_finding = self.process_match_to_json(&single_origin_rm, args)?;
serde_json::to_writer(&mut writer, &json_finding)?;
writeln!(writer)?;
}
} else {
// Process normally for deduped matches or matches with only one origin
let json_finding = self.process_match_to_json(&rm, args)?;
serde_json::to_writer(&mut writer, &json_finding)?;
writeln!(writer)?;
}
let records = self.build_finding_records(args)?;
for record in records {
serde_json::to_writer(&mut writer, &record)?;
writeln!(writer)?;
}
Ok(())
}
// // Modified JSONL format to pass args to gather_json_findings
// pub fn jsonl_format<W: std::io::Write>(
// &self,
// mut writer: W,
// args: &cli::commands::scan::ScanArgs,
// ) -> Result<()> {
// let findings = self.gather_json_findings(args)?;
// for finding in findings {
// serde_json::to_writer(&mut writer, &finding)?;
// writeln!(writer)?;
// }
// Ok(())
// }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::cli::commands::github::GitCloneMode;
use crate::cli::commands::github::GitHistoryMode;
use crate::cli::commands::rules::RuleSpecifierArgs;
use crate::matcher::{SerializableCapture, SerializableCaptures};
use crate::util::intern;
use crate::{
blob::BlobId,
cli::commands::github::GitHubRepoType,
cli::commands::inputs::ContentFilteringArgs,
cli::commands::inputs::InputSpecifierArgs,
cli::commands::output::{OutputArgs, ReportOutputFormat},
cli::commands::scan::ConfidenceLevel,
findings_store::FindingsStore,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::Match,
origin::Origin,
reporter::styles::Styles,
};
use std::{
io::Cursor,
path::PathBuf,
sync::{Arc, Mutex},
};
use anyhow::Result;
use serde_json::Value;
use url::Url;
use super::*;
use crate::{
blob::BlobId,
cli::commands::{
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::OutputArgs,
rules::RuleSpecifierArgs,
scan::ConfidenceLevel,
},
findings_store::FindingsStore,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{Match, SerializableCapture, SerializableCaptures},
origin::{Origin, OriginSet},
reporter::{ReportMatch, Styles},
rules::rule::Confidence,
util::intern,
};
fn create_default_args() -> cli::commands::scan::ScanArgs {
use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope
@ -437,10 +88,14 @@ mod tests {
jira_url: None,
jql: None,
max_results: 100,
// Docker image scanning
// Slack options
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
docker_image: Vec::new(),
// clone / history options
@ -470,7 +125,6 @@ mod tests {
}
}
// Helper function to create a mock Match
fn create_mock_match(
rule_name: &str,
rule_text_id: &str,
@ -498,7 +152,7 @@ mod tests {
finding_fingerprint: 0123,
rule_finding_fingerprint: intern(rule_finding_fingerprint),
rule_text_id: intern(rule_text_id),
rule_name: intern(rule_name), //.to_string(),
rule_name: intern(rule_name),
rule_confidence: Confidence::Medium,
validation_response_body: "validation response".to_string(),
validation_response_status: 200,
@ -508,10 +162,8 @@ mod tests {
}
}
// Helper function to create a mock DetailsReporter
fn setup_mock_reporter(matches: Vec<ReportMatch>) -> DetailsReporter {
let mut datastore = FindingsStore::new(PathBuf::from("/tmp"));
// Create mock origin and blob metadata for the first test match
if !matches.is_empty() {
let blob_metadata = BlobMetadata {
id: BlobId::new(b"mock_blob"),
@ -521,16 +173,14 @@ mod tests {
language: Some("Rust".to_string()),
};
let dedup = true;
// Add matches to datastore
for m in matches.clone() {
datastore.record(
vec![(
Arc::new(OriginSet::new(
// OriginSet -- Arc<…>
Origin::from_file(PathBuf::from("/mock/path/file.rs")),
vec![],
)),
Arc::new(blob_metadata.clone()), // BlobMetadata -- Arc<…>
Arc::new(blob_metadata.clone()),
m.m.clone(),
)],
dedup,
@ -543,9 +193,9 @@ mod tests {
only_valid: false,
}
}
#[test]
fn test_json_format() -> Result<()> {
// Create a mock match with successful validation
let mock_match =
create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
let matches = vec![ReportMatch {
@ -567,72 +217,17 @@ mod tests {
}];
let reporter = setup_mock_reporter(matches);
let mut output = Cursor::new(Vec::new());
// Call the json_format method
reporter.json_format(&mut output, &create_default_args())?;
// Parse and validate JSON output
let json_output: Vec<Value> = serde_json::from_slice(&output.into_inner())?;
let json_output: Vec<serde_json::Value> = serde_json::from_slice(&output.into_inner())?;
assert!(!json_output.is_empty(), "JSON output should not be empty");
let first_finding = &json_output[0];
assert!(first_finding.get("id").is_some(), "Finding should have an 'id'");
assert!(first_finding.get("matches").is_some(), "Finding should have 'matches'");
// Validate the structure of the first match
let matches = first_finding.get("matches").unwrap().as_array().unwrap();
let first_match = &matches[0];
assert_eq!(first_match.get("rule").unwrap().get("name").unwrap(), "MockRule");
assert_eq!(first_match.get("finding").unwrap().get("language").unwrap(), "Rust");
let first = &json_output[0];
assert_eq!(first["rule"]["name"], "MockRule");
assert_eq!(first["finding"]["language"], "Rust");
Ok(())
}
// #[test]
// fn test_jsonl_format() -> Result<()> {
// // Create a mock match with successful validation
// let mock_match =
// create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
// let matches = vec![ReportMatch {
// origin: OriginSet::new(
// Origin::from_file(PathBuf::from("/mock/path/file.rs")),
// vec![],
// ),
// blob_metadata: BlobMetadata {
// id: BlobId::new(b"mock_blob"),
// num_bytes: 1024,
// mime_essence: Some("text/plain".to_string()),
// charset: Some("UTF-8".to_string()),
// language: Some("Rust".to_string()),
// },
// m: mock_match,
// comment: None,
// match_confidence: Confidence::Medium,
// visible: true,
// validation_response_body: "validation response".to_string(),
// validation_response_status: 200,
// validation_success: true,
// }];
// let reporter = setup_mock_reporter(matches);
// let mut output = Cursor::new(Vec::new());
// // Call the jsonl_format method
// reporter.jsonl_format(&mut output, &create_default_args())?;
// // Split output into lines and validate
// let jsonl_output = String::from_utf8(output.into_inner())?;
// let lines: Vec<&str> = jsonl_output.lines().collect();
// assert!(!lines.is_empty(), "JSONL output should not be empty");
// for line in &lines {
// let json_value: serde_json::Value = serde_json::from_str(line)?;
// assert!(
// json_value.get("rule_name").is_some(),
// "Each line should have a 'rule_name'"
// );
// assert!(
// json_value.get("matches").is_some(),
// "Each line should have 'matches'"
// );
// }
// Ok(())
// }
#[test]
fn test_validation_status_in_json() -> Result<()> {
// Test validation status in JSON output
let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")];
for (validation_success, expected_status) in test_cases {
let mock_match = create_mock_match(
@ -663,23 +258,11 @@ mod tests {
}];
let reporter = setup_mock_reporter(matches);
let mut output = Cursor::new(Vec::new());
// Call the json_format method
reporter.json_format(&mut output, &create_default_args())?;
// Parse and validate JSON output
let json_output: Vec<Value> = serde_json::from_slice(&output.into_inner())?;
let json_output: Vec<serde_json::Value> = serde_json::from_slice(&output.into_inner())?;
assert!(!json_output.is_empty(), "JSON output should not be empty");
let first_finding = &json_output[0];
let matches = first_finding.get("matches").unwrap().as_array().unwrap();
let first_match = &matches[0];
let validation_status = first_match
.get("finding")
.unwrap()
.get("validation")
.unwrap()
.get("status")
.unwrap()
.as_str()
.unwrap();
let first = &json_output[0];
let validation_status = first["finding"]["validation"]["status"].as_str().unwrap();
assert_eq!(validation_status, expected_status);
}
Ok(())

View file

@ -1,396 +1,126 @@
use std::fmt::{Display, Formatter, Result as FmtResult};
use http::StatusCode;
use indenter::indented;
use super::*;
use crate::{
bstring_escape::Escaped,
origin::{get_repo_url, GitRepoOrigin},
};
impl DetailsReporter {
// Modified pretty format to use deduplicate_matches helper
pub fn pretty_format<W: std::io::Write>(
&self,
mut writer: W,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
let mut matches = self.get_filtered_matches()?;
let num_findings = matches.len();
if !args.no_dedup {
matches = self.deduplicate_matches(matches, args.no_dedup);
}
for (index, rm) in matches.into_iter().enumerate() {
// When no_dedup is true, we'll handle each origin separately
if args.no_dedup && rm.origin.len() > 1 {
// For each origin, create a separate "finding"
for origin in rm.origin.iter() {
// Create a new ReportMatch with just this single origin
let single_origin_rm = ReportMatch {
origin: OriginSet::new(origin.clone(), Vec::new()),
blob_metadata: rm.blob_metadata.clone(),
m: rm.m.clone(),
comment: rm.comment.clone(),
visible: rm.visible,
match_confidence: rm.match_confidence,
validation_response_body: rm.validation_response_body.clone(),
validation_response_status: rm.validation_response_status,
validation_success: rm.validation_success,
};
self.write_finding(
&mut writer,
&single_origin_rm,
index + 1,
num_findings,
args,
)?;
}
} else {
// Normal processing for deduped matches or matches with only one origin
self.write_finding(&mut writer, &rm, index + 1, num_findings, args)?;
}
let records = self.build_finding_records(args)?;
let num_findings = records.len();
for (index, record) in records.iter().enumerate() {
self.write_finding_record(&mut writer, record, index + 1, num_findings)?;
}
Ok(())
}
fn write_finding<W: std::io::Write>(
fn write_finding_record<W: std::io::Write>(
&self,
writer: &mut W,
rm: &ReportMatch,
record: &FindingReporterRecord,
_finding_num: usize,
_num_findings: usize,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
let lock_icon = if rm.validation_success { "🔓 " } else { "" };
let is_active = record.finding.validation.status == "Active Credential";
let lock_icon = if is_active { "🔓 " } else { "" };
let formatted_heading = format!(
"{}{} => [{}]",
lock_icon,
rm.m.rule_name.to_uppercase(),
rm.m.rule_text_id.to_uppercase()
record.rule.name.to_uppercase(),
record.rule.id.to_uppercase()
);
if rm.validation_success {
if is_active {
writeln!(writer, "{}", self.style_finding_active_heading(formatted_heading))?;
} else {
writeln!(writer, "{}", self.style_finding_heading(formatted_heading))?;
}
writeln!(writer, "{}", PrettyFinding(self, rm, args))?;
writeln!(writer, "{}", PrettyFindingRecord(self, record))?;
writeln!(writer)?;
Ok(())
}
fn write_git_metadata(
fn write_git_metadata_value(
&self,
f: &mut Formatter<'_>,
e: &GitRepoOrigin,
_args: &cli::commands::scan::ScanArgs,
line_num: usize,
git: &serde_json::Value,
) -> FmtResult {
// Check if this is a remote git scan
// let mut is_remote_git_scan = !args.input_specifier_args.git_url.is_empty();
// let mut git_url_string = String::new();
let repo_url = get_repo_url(&e.repo_path)
.unwrap_or_else(|_| e.repo_path.to_string_lossy().to_string().into());
let mut git_url_string = repo_url.clone();
if git_url_string.ends_with(".git") {
git_url_string = git_url_string.strip_suffix(".git").unwrap().to_string().into();
let repo_url = git["repository_url"].as_str().unwrap_or("");
writeln!(f, " |Git Repo......: {}", self.style_metadata(repo_url))?;
if let Some(commit) = git.get("commit") {
if let Some(url) = commit.get("url").and_then(|v| v.as_str()) {
writeln!(f, " |__Commit......: {}", self.style_metadata(url))?;
}
if let Some(committer) = commit.get("committer") {
let name = committer.get("name").and_then(|v| v.as_str()).unwrap_or("");
let email = committer.get("email").and_then(|v| v.as_str()).unwrap_or("");
writeln!(indented(f).with_str(" |__"), "Committer...: {} <{}>", name, email)?;
}
if let Some(date) = commit.get("date").and_then(|v| v.as_str()) {
writeln!(indented(f).with_str(" |__"), "Date........: {}", date)?;
}
}
writeln!(f, " |Git Repo......: {}", self.style_metadata(&git_url_string),)?;
if let Some(cs) = &e.first_commit {
let cmd = &cs.commit_metadata;
let atime =
cmd.committer_timestamp.format(gix::date::time::format::SHORT.clone()).to_string();
let commit_id = &cmd.commit_id;
let commit_url = format!("{}/commit/{}", &git_url_string, commit_id);
// Write Commit Information
writeln!(f, " |__Commit......: {}", self.style_metadata(&commit_url))?;
writeln!(
indented(f).with_str(" |__"),
"Committer...: {} <{}>",
cmd.committer_name,
cmd.committer_email
)?;
writeln!(indented(f).with_str(" |__"), "Date........: {}", atime)?;
// writeln!(indented(f).with_str(" |__"), "Summary.....: {}", msg)?;
writeln!(indented(f).with_str(" |__"), "Path........: {}", cs.blob_path)?;
// Construct Git Command
let git_link =
format!("{}/blob/{}/{}#L{}", &git_url_string, commit_id, cs.blob_path, line_num);
let git_command =
format!("git -C {} show {}:{}", e.repo_path.display(), commit_id, cs.blob_path);
writeln!(
indented(f).with_str(" |__"),
"Git Link....: {}",
self.style_metadata(&git_link)
)?;
writeln!(
indented(f).with_str(" |__"),
"Git Command.: {}",
self.style_metadata(&git_command)
)?;
if let Some(file) = git.get("file") {
if let Some(path) = file.get("path").and_then(|v| v.as_str()) {
writeln!(indented(f).with_str(" |__"), "Path........: {}", path)?;
}
if let Some(url) = file.get("url").and_then(|v| v.as_str()) {
writeln!(
indented(f).with_str(" |__"),
"Git Link....: {}",
self.style_metadata(url)
)?;
}
if let Some(cmd) = file.get("git_command").and_then(|v| v.as_str()) {
writeln!(
indented(f).with_str(" |__"),
"Git Command.: {}",
self.style_metadata(cmd)
)?;
}
}
Ok(())
}
}
// pub struct PrettyFinding<'a>(&'a DetailsReporter, &'a Finding);
pub struct PrettyFinding<'a>(
&'a DetailsReporter,
&'a ReportMatch,
&'a cli::commands::scan::ScanArgs,
);
impl<'a> Display for PrettyFinding<'a> {
pub struct PrettyFindingRecord<'a>(&'a DetailsReporter, &'a FindingReporterRecord);
impl<'a> Display for PrettyFindingRecord<'a> {
fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
let PrettyFinding(reporter, rm, args) = self;
// Use Box<dyn Fn(&str) -> String> to store the closure
let style_fn: Box<dyn Fn(&str) -> String> = if rm.validation_success {
Box::new(|s: &str| reporter.style_active_creds(s).to_string()) // Convert StyledObject
// to String
let reporter = self.0;
let record = self.1;
let is_active = record.finding.validation.status == "Active Credential";
let style_fn: Box<dyn Fn(&str) -> String> = if is_active {
Box::new(|s| reporter.style_active_creds(s).to_string())
} else {
Box::new(|s: &str| reporter.style_match(s).to_string()) // Convert StyledObject to
// String
Box::new(|s| reporter.style_match(s).to_string())
};
let matching_finding =
rm.m.groups
.captures
.get(1)
.or_else(|| rm.m.groups.captures.get(0))
.map(|capture| capture.value.as_bytes())
.unwrap_or(&[]);
writeln!(f, " |Finding.......: {}", style_fn(&Escaped(matching_finding).to_string()))?;
writeln!(f, " |Fingerprint...: {}", rm.m.finding_fingerprint)?;
writeln!(f, " |Confidence....: {}", rm.match_confidence.to_string())?;
writeln!(f, " |Entropy.......: {:.2}", rm.m.calculated_entropy)?;
let validation_status = if rm.validation_response_status == StatusCode::CONTINUE.as_u16()
|| rm.validation_response_status == StatusCode::PRECONDITION_REQUIRED.as_u16()
{
"Not Attempted".to_string()
} else if rm.validation_success {
"Active Credential".to_string()
} else {
"Inactive Credential".to_string()
};
writeln!(
f,
" |Validation....: {}",
if rm.validation_success {
reporter.style_finding_active_heading(&validation_status).to_string()
// Convert StyledObject to String
} else {
(&validation_status).to_string()
}
)?;
const MAX_RESPONSE_LENGTH: usize = 512;
if rm.validation_response_status != StatusCode::CONTINUE.as_u16() {
let truncated_body: String =
rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect();
let ellipsis =
if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" };
let finding = &record.finding;
writeln!(f, " |Finding.......: {}", style_fn(&finding.snippet))?;
writeln!(f, " |Fingerprint...: {}", finding.fingerprint)?;
writeln!(f, " |Confidence....: {}", finding.confidence)?;
writeln!(f, " |Entropy.......: {}", finding.entropy)?;
if is_active {
writeln!(
f,
" |__Response....: {}{}",
if rm.validation_success {
reporter.style_active_creds(&truncated_body).to_string() // Convert StyledObject
// to String
} else {
reporter.style_metadata(&truncated_body).to_string() // Convert StyledObject to
// String
},
ellipsis
" |Validation....: {}",
reporter.style_finding_active_heading(&finding.validation.status).to_string()
)?;
} else {
writeln!(f, " |Validation....: {}", finding.validation.status)?;
}
writeln!(
f,
" |Language......: {}",
rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string())
)?;
let source_span = &rm.m.location.source_span;
writeln!(f, " |Line Num......: {}", source_span.start.line)?;
//print all the other areas where this was seen
for p in rm.origin.iter() {
match p {
Origin::File(e) => {
let display_path = if let Some(url) = reporter.jira_issue_url(&e.path, args) {
url
} else if let Some(url) = reporter.slack_message_url(&e.path) {
url
} else if let Some(mapped) = reporter.docker_display_path(&e.path) {
mapped
} else {
e.path.display().to_string()
};
writeln!(
f,
" |Path..........: {}",
if rm.validation_success {
reporter.style_active_creds(&display_path).to_string()
} else {
display_path
}
)?;
}
Origin::GitRepo(e) => {
reporter.write_git_metadata(f, e, args, source_span.start.line)?;
}
Origin::Extended(e) => {
writeln!(f, " |Extended......: {}", reporter.style_metadata(e).to_string())?;
// Convert StyledObject to String
}
}
if finding.validation.status != "Not Attempted" {
writeln!(f, " |__Response....: {}", style_fn(&finding.validation.response))?;
}
writeln!(f, " |Language......: {}", finding.language)?;
writeln!(f, " |Line Num......: {}", finding.line)?;
writeln!(f, " |Path..........: {}", style_fn(&finding.path))?;
if let Some(git) = &finding.git_metadata {
reporter.write_git_metadata_value(f, git)?;
}
Ok(())
}
}
/// Smoke test around `DetailsReporter::pretty_format` and a match whose entropy is `f32::NAN`.
///
/// The test builds a `Match` with `calculated_entropy: f32::NAN`, wraps it in a `ReportMatch`,
/// creates a reporter over a fresh `FindingsStore`, and calls `pretty_format` with a fully
/// spelled-out `ScanArgs` using the `Pretty` output format.
///
/// NOTE(review): despite the `_panics` suffix there is no `#[should_panic]` attribute and no
/// assertion; the call result is bound to `_result` and dropped, so the test only fails if
/// `pretty_format` itself panics.
/// NOTE(review): `_rm` is never recorded into `store`, so the reporter is backed by a store that
/// this test has not populated. Presumably `pretty_format` then has nothing to render and the NaN
/// value is never formatted -- confirm against `FindingsStore` / `pretty_format`.
#[test]
fn test_pretty_format_with_nan_entropy_panics() {
use std::{
io::Cursor,
sync::{Arc, Mutex},
};
use http::StatusCode;
use url::Url;
use crate::{
blob::BlobMetadata,
cli::commands::{
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
gitlab::GitLabRepoType,
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::RuleSpecifierArgs,
scan::{ConfidenceLevel, ScanArgs},
},
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{Match, SerializableCaptures},
origin::{Origin, OriginSet},
reporter::{DetailsReporter, Styles},
};
// Construct a fake match with NaN entropy
let m = Match {
rule_name: "dummy_rule".into(),
rule_text_id: "dummy.id".into(),
finding_fingerprint: 123456789,
rule_finding_fingerprint: "abc".into(),
location: Location {
offset_span: OffsetSpan { start: 0, end: 1 },
source_span: SourceSpan {
start: SourcePoint { line: 1, column: 0 },
end: SourcePoint { line: 1, column: 10 },
},
},
blob_id: crate::blob::BlobId::default(),
// No capture groups: the match carries no captured text.
groups: SerializableCaptures { captures: vec![] },
rule_confidence: crate::rules::rule::Confidence::Medium,
validation_success: true,
validation_response_status: StatusCode::OK.as_u16(),
validation_response_body: "OK".into(),
calculated_entropy: f32::NAN, // Here's the trigger
visible: true,
};
// Report-level wrapper around the match. The leading underscore silences the unused-variable
// warning: this value is built but not used again in the test.
let _rm = crate::reporter::ReportMatch {
origin: OriginSet::new(Origin::from_file("dummy.txt".into()), vec![]),
blob_metadata: BlobMetadata {
id: m.blob_id,
num_bytes: 1,
mime_essence: None,
charset: None,
language: Some("Rust".into()),
},
m,
comment: None,
visible: true,
match_confidence: crate::rules::rule::Confidence::Medium,
validation_response_body: "OK".into(),
validation_response_status: StatusCode::OK.as_u16(),
validation_success: true,
};
// Fresh findings store rooted at "."; nothing is recorded into it before the reporter runs.
let store = Arc::new(Mutex::new(crate::findings_store::FindingsStore::new(".".into())));
let reporter =
DetailsReporter { datastore: store, styles: Styles::new(false), only_valid: false };
// In-memory sink for the formatted output.
let mut buf = Cursor::new(Vec::new());
let args = ScanArgs {
// core execution / performance
num_jobs: 1,
no_dedup: false,
// rule selection
rules: RuleSpecifierArgs {
rules_path: Vec::new(),
rule: vec!["all".into()],
load_builtins: true,
},
// input discovery
input_specifier_args: InputSpecifierArgs {
path_inputs: Vec::new(),
git_url: Vec::new(),
github_user: Vec::new(),
github_organization: Vec::new(),
all_github_organizations: false,
github_api_url: url::Url::parse("https://api.github.com/").unwrap(),
github_repo_type: GitHubRepoType::Source,
// new GitLab defaults
gitlab_user: Vec::new(),
gitlab_group: Vec::new(),
all_gitlab_groups: false,
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
// Jira options
jira_url: None,
jql: None,
max_results: 100,
// Slack options
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
scan_nested_repos: true,
commit_metadata: true,
},
// content filtering
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
no_extract_archives: false,
extraction_depth: 2,
exclude: Vec::new(), // Exclude patterns
no_binary: true,
},
// scanning behaviour
confidence: ConfidenceLevel::Medium,
no_validate: false,
rule_stats: false,
only_valid: false,
min_entropy: None,
redact: false,
git_repo_timeout: 1800, // 30 minutes
// output
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
// display
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
};
// This will panic if the entropy isn't checked for NaN
// The returned `Result` is intentionally ignored; only a panic would fail the test.
let _result = reporter.pretty_format(&mut buf, &args);
// assert!(result.is_err() || result.is_ok(), "Should not crash"); // remove this line if panic
// is expected pre-fix
}

View file

@ -1,259 +1,56 @@
use std::collections::HashMap;
use std::collections::{BTreeMap, HashSet};
use rayon::prelude::*;
use serde_sarif::sarif;
use super::*;
use crate::{bstring_escape::Escaped, defaults::get_builtin_rules, origin::get_repo_url};
#[derive(Hash, Eq, PartialEq)]
struct LocationKey {
file_path: String,
line: usize,
column_start: usize,
column_end: usize,
text: String,
}
use crate::defaults::get_builtin_rules;
impl DetailsReporter {
fn make_sarif_result(
&self,
finding: &Finding,
no_dedup: bool,
args: &cli::commands::scan::ScanArgs,
) -> Result<sarif::Result> {
// Deduplicate exactly as in the JSON reporter
// let matches = self.deduplicate_matches(finding.matches.clone(), no_dedup);
// Deduplicate exactly as in the JSON reporter - but only if no_dedup is false
let matches = if no_dedup {
finding.matches.clone()
} else {
self.deduplicate_matches(finding.matches.clone(), no_dedup)
};
fn record_to_sarif_result(&self, record: &FindingReporterRecord) -> Result<sarif::Result> {
let finding = &record.finding;
let artifact_location =
sarif::ArtifactLocationBuilder::default().uri(finding.path.clone()).build()?;
let region = sarif::RegionBuilder::default()
.start_line(finding.line as i64)
.start_column(finding.column_start as i64)
.end_line(finding.line as i64)
.end_column(finding.column_end as i64)
.snippet(
sarif::ArtifactContentBuilder::default().text(finding.snippet.clone()).build()?,
)
.build()?;
let metadata = &finding.metadata;
let mut location_map: HashMap<LocationKey, Vec<(&OriginSet, &Match)>> = HashMap::new();
for rm in &matches {
let source_span = &rm.m.location.source_span;
let snippet =
rm.m.groups
.captures
.get(1)
.or_else(|| rm.m.groups.captures.get(0))
.map(|capture| capture.value.as_bytes())
.unwrap_or(&[]);
let key = LocationKey {
file_path: rm
.origin
.first()
.blob_path()
.map(|p| p.to_string_lossy().into_owned())
.unwrap_or_default(),
line: source_span.start.line,
column_start: source_span.start.column,
column_end: source_span.end.column,
text: Escaped(snippet).to_string(),
};
location_map.entry(key).or_default().push((&rm.origin, &rm.m));
let mut props = BTreeMap::new();
props.insert("validation_status".to_string(), serde_json::json!(finding.validation.status));
props.insert("entropy".to_string(), serde_json::json!(finding.entropy));
if let Some(git) = &finding.git_metadata {
props.insert("git_metadata".to_string(), git.clone());
}
let properties =
sarif::PropertyBagBuilder::default().additional_properties(props).build()?;
let mut fpu64: u64 = 0;
let location = sarif::LocationBuilder::default()
.physical_location(
sarif::PhysicalLocationBuilder::default()
.artifact_location(artifact_location)
.region(region)
.build()?,
)
.properties(properties)
.build()?;
let locations: Vec<sarif::Location> = location_map
.into_iter()
.filter_map(|(key, matches)| {
let (prov, m) = matches[0];
let source_span = &m.location.source_span;
let mut artifact_locations = Vec::new();
let mut git_metadata_list = Vec::new();
fpu64 = m.finding_fingerprint;
for p in prov.iter() {
match p {
Origin::File(e) => {
let uri = if let Some(url) = self.jira_issue_url(&e.path, args) {
url
} else if let Some(url) = self.slack_message_url(&e.path) {
url
} else {
e.path.display().to_string()
};
artifact_locations.push(
sarif::ArtifactLocationBuilder::default().uri(uri).build().ok()?,
);
}
Origin::GitRepo(e) => {
// Extract and store Git metadata
if let Some(git_metadata) = self.extract_git_metadata(e, source_span) {
git_metadata_list.push(git_metadata);
}
// Build Git artifact location
if let Some(cs) = &e.first_commit {
let repo_url = get_repo_url(&e.repo_path)
.unwrap_or_else(|_| {
e.repo_path.to_string_lossy().to_string().into()
})
.trim_end_matches(".git")
.to_string();
let git_url = format!(
"{}/blob/{}/{}#L{}",
repo_url,
cs.commit_metadata.commit_id,
cs.blob_path,
source_span.start.line
);
artifact_locations.push(
sarif::ArtifactLocationBuilder::default()
.uri(git_url)
.build()
.ok()?,
);
}
}
Origin::Extended(_) => (),
}
}
if artifact_locations.is_empty() {
return None;
}
let region = sarif::RegionBuilder::default()
.start_line(key.line as i64)
.start_column(key.column_start as i64)
.end_line(key.line as i64)
.end_column(key.column_end as i64)
.snippet(sarif::ArtifactContentBuilder::default().text(key.text).build().ok()?)
.build()
.ok()?;
let logical_location = sarif::LogicalLocationBuilder::default()
.kind("blob")
.name(m.finding_fingerprint.to_string())
.build()
.ok()?;
let validation_status =
if m.validation_response_status == StatusCode::CONTINUE.as_u16() {
"Not Attempted"
} else if m.validation_success {
"Active Credential"
} else {
"Inactive Credential"
};
// Build combined properties including Git metadata and fingerprint
let mut props = std::collections::BTreeMap::new();
props.insert("validation_status".to_string(), serde_json::json!(validation_status));
props.insert(
"entropy".to_string(),
serde_json::json!(format!("{:.2}", m.calculated_entropy)),
);
// Add the fingerprint property from the match
props.insert("fingerprint".to_string(), serde_json::json!(m.finding_fingerprint));
if !git_metadata_list.is_empty() {
props.insert("git_metadata".to_string(), serde_json::json!(git_metadata_list));
}
let properties = sarif::PropertyBagBuilder::default()
.additional_properties(props)
.build()
.ok()?;
// Create locations for each artifact location
let locations = artifact_locations
.into_iter()
.map(|artifact_location| {
sarif::LocationBuilder::default()
.physical_location(
sarif::PhysicalLocationBuilder::default()
.artifact_location(artifact_location)
.region(region.clone())
.build()
.ok()?,
)
.logical_locations(vec![logical_location.clone()])
.properties(properties.clone())
.build()
.ok()
})
.collect::<Option<Vec<_>>>()?;
Some(locations)
})
.flatten()
.collect();
// let message = sarif::MessageBuilder::default()
// .text(format!(
// "Rule {} found {} unique {}.\nFirst blob id matched: {}",
// metadata.rule_name,
// locations.len(),
// if locations.len() == 1 { "match" } else { "matches" },
// first_match_blob_id
// ))
// .build()?;
// Create detailed message from first location's information
let detailed_msg = if let Some(first_match) = matches.first() {
let mut msg = format!(
"Rule {} found {} unique {}.\n",
metadata.rule_name,
locations.len(),
if locations.len() == 1 { "match" } else { "matches" }
);
// Add file or Git information based on origin
// Get first origin of first match - we know this exists
let p = first_match.origin.first();
match p {
Origin::File(e) => {
let uri = if let Some(url) = self.jira_issue_url(&e.path, args) {
url
} else if let Some(url) = self.slack_message_url(&e.path) {
url
} else {
e.path.display().to_string()
};
msg.push_str(&format!("Location: {}\n", uri));
}
Origin::GitRepo(e) => {
if let Some(cs) = &e.first_commit {
let repo_url = get_repo_url(&e.repo_path)
.unwrap_or_else(|_| e.repo_path.to_string_lossy().to_string().into())
.trim_end_matches(".git")
.to_string();
// Add commit and author information
let cmd = &cs.commit_metadata;
msg.push_str(&format!("Repository: {}\n", repo_url));
msg.push_str(&format!("Commit: {}\n", cmd.commit_id));
msg.push_str(&format!(
"Committer: {} <{}>\n",
String::from_utf8_lossy(&cmd.committer_name),
String::from_utf8_lossy(&cmd.committer_email)
));
msg.push_str(&format!("File: {}", cs.blob_path));
}
}
Origin::Extended(e) => {
msg.push_str(&format!("Extended: {}\n", e));
}
}
msg
} else {
format!("Rule {} found {} unique matches.", metadata.rule_name, locations.len(),)
};
let message = sarif::MessageBuilder::default().text(detailed_msg).build()?;
let fingerprint_name = "fingerprint".to_string();
let fingerprint = fpu64.to_string();
let message = sarif::MessageBuilder::default()
.text(format!("Rule {} matched {}", record.rule.name, finding.path))
.build()?;
let result = sarif::ResultBuilder::default()
.rule_id(&metadata.rule_name)
.rule_id(&record.rule.name)
.message(message)
.kind(sarif::ResultKind::Review.to_string())
.locations(locations)
.locations(vec![location])
.level(sarif::ResultLevel::Warning.to_string())
.partial_fingerprints([(fingerprint_name, fingerprint)])
.partial_fingerprints([("fingerprint".to_string(), finding.fingerprint.clone())])
.build()?;
Ok(result)
}
@ -261,54 +58,11 @@ impl DetailsReporter {
pub fn sarif_format<W: std::io::Write>(
&self,
mut writer: W,
no_dedup: bool,
_no_dedup: bool,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
// Gather findings first
let mut findings = self.gather_findings()?;
// If no_dedup is true, expand findings with multiple origins into separate findings
if no_dedup {
let mut expanded_findings = Vec::new();
for finding in findings {
// Check matches with multiple origins
let matches_with_multiple_origins: Vec<_> =
finding.matches.iter().filter(|rm| rm.origin.len() > 1).collect();
if !matches_with_multiple_origins.is_empty() {
// For each match with multiple origins, create separate findings
for rm in matches_with_multiple_origins {
for origin in rm.origin.iter() {
// Create a single-origin match
let single_origin_rm = ReportMatch {
origin: OriginSet::new(origin.clone(), Vec::new()),
blob_metadata: rm.blob_metadata.clone(),
m: rm.m.clone(),
comment: rm.comment.clone(),
visible: rm.visible,
match_confidence: rm.match_confidence,
validation_response_body: rm.validation_response_body.clone(),
validation_response_status: rm.validation_response_status,
validation_success: rm.validation_success,
};
// Create a new finding with just this single-origin match
let new_finding =
Finding::new(finding.metadata.clone(), vec![single_origin_rm]);
expanded_findings.push(new_finding);
}
}
} else {
// If the finding has no matches with multiple origins, keep it as is
expanded_findings.push(finding);
}
}
findings = expanded_findings;
}
// Filter only rules relevant to the findings
let finding_rule_ids: std::collections::HashSet<_> =
findings.iter().map(|f| f.metadata.rule_name.clone()).collect();
let records = self.build_finding_records(args)?;
let finding_rule_ids: HashSet<_> = records.iter().map(|r| r.rule.name.clone()).collect();
let rules: Vec<sarif::ReportingDescriptor> = get_builtin_rules(None)?
.iter_rules()
.par_bridge()
@ -351,10 +105,9 @@ impl DetailsReporter {
)
.build()?;
let sarif_results: Vec<sarif::Result> = findings
.par_iter()
.filter_map(|f| self.make_sarif_result(f, no_dedup, args).ok())
.collect();
let sarif_results: Vec<sarif::Result> =
records.iter().filter_map(|r| self.record_to_sarif_result(r).ok()).collect();
let run = sarif::RunBuilder::default().tool(tool).results(sarif_results).build()?;
let sarif = sarif::SarifBuilder::default()
.version(sarif::Version::V2_1_0.to_string())

142
src/s3.rs Normal file
View file

@ -0,0 +1,142 @@
use anyhow::{Context, Result};
use aws_config::{defaults, meta::region::RegionProviderChain, BehaviorVersion};
use aws_credential_types::Credentials;
use aws_sdk_s3::{
error::ProvideErrorMetadata, // for .code()
operation::list_objects_v2::ListObjectsV2Error, // modeled service error
Client,
};
use aws_types::region::Region;
use reqwest; // HTTP client for HEAD fallback
pub async fn visit_bucket_objects<F>(
bucket: &str,
prefix: Option<&str>,
role_arn: Option<&str>,
profile: Option<&str>,
mut visitor: F,
) -> Result<()>
where
F: FnMut(String, Vec<u8>) -> Result<()>,
{
// Helper to build ConfigLoader with profile/creds/no_credentials
let build_loader = || {
let mut loader = defaults(BehaviorVersion::latest());
if let Some(p) = profile {
loader = loader.profile_name(p);
}
if let (Ok(k), Ok(s)) = (std::env::var("KF_AWS_KEY"), std::env::var("KF_AWS_SECRET")) {
loader = loader.credentials_provider(Credentials::new(k, s, None, None, "kf_env"));
}
if profile.is_none() && std::env::var("KF_AWS_KEY").is_err() && role_arn.is_none() {
loader = loader.no_credentials();
}
loader
};
// Initial client in default→us-east-1
let default_region = RegionProviderChain::default_provider().or_else("us-east-1");
let mut config = build_loader().region(default_region).load().await;
let mut client = if let Some(role) = role_arn {
let assume = aws_config::sts::AssumeRoleProvider::builder(role.to_string())
.session_name("kingfisher")
.configure(&config)
.build()
.await;
let conf = aws_sdk_s3::config::Builder::from(&config).credentials_provider(assume).build();
Client::from_conf(conf)
} else {
Client::new(&config)
};
let mut continuation_token: Option<String> = None;
loop {
let mut req = client.list_objects_v2().bucket(bucket);
if let Some(p) = prefix {
req = req.prefix(p);
}
if let Some(ref token) = continuation_token {
req = req.continuation_token(token);
}
let resp = match req.send().await {
Ok(r) => r,
// On error, extract the modeled service error
Err(err) => {
let svc_err: ListObjectsV2Error = err.into_service_error(); // from SdkError
// If the bucket must be addressed at another region...
if svc_err.code() == Some("PermanentRedirect") {
// HEAD request to get x-amz-bucket-region header
let url = format!("https://{bucket}.s3.amazonaws.com");
let head = reqwest::Client::new()
.head(&url)
.send()
.await
.context("Failed to HEAD bucket for region")?;
let region_str = head
.headers()
.get("x-amz-bucket-region")
.and_then(|v| v.to_str().ok())
.unwrap_or("us-east-1")
.to_string();
// Rebuild client in the correct region
let override_region = RegionProviderChain::first_try(Region::new(region_str))
.or_else("us-east-1");
config = build_loader().region(override_region).load().await;
client = if let Some(r) = role_arn {
let assume = aws_config::sts::AssumeRoleProvider::builder(r.to_string())
.session_name("kingfisher")
.configure(&config)
.build()
.await;
let conf = aws_sdk_s3::config::Builder::from(&config)
.credentials_provider(assume)
.build();
Client::from_conf(conf)
} else {
Client::new(&config)
};
// Reset pagination and retry list
continuation_token = None;
continue;
}
// Any other error is fatal
return Err(svc_err).context("Failed to list objects in bucket");
}
};
// Process objects
for obj in resp.contents.unwrap_or_default() {
if let Some(key) = obj.key {
let data = client
.get_object()
.bucket(bucket)
.key(&key)
.send()
.await
.with_context(|| format!("Failed to fetch object {}", key))?
.body
.collect()
.await
.context("Failed to read S3 object body")?
.into_bytes()
.to_vec();
visitor(key, data)?;
}
}
// Continue or finish pagination
if resp.is_truncated.unwrap_or(false) {
continuation_token = resp.next_continuation_token;
} else {
break;
}
}
Ok(())
}

View file

@ -8,6 +8,7 @@ use indicatif::{HumanCount, ProgressBar, ProgressStyle};
use tokio::time::Duration;
use tracing::{debug, error, info};
use crate::blob::BlobIdMap;
use crate::{
blob::BlobMetadata,
cli::{
@ -20,11 +21,18 @@ use crate::{
findings_store,
git_binary::{CloneMode, Git},
git_url::GitUrl,
github, gitlab, jira,
matcher::Match,
origin::OriginSet,
github, gitlab,
guesser::Guesser,
jira,
matcher::{Match, Matcher, MatcherStats},
origin::{Origin, OriginSet},
rules_database::RulesDatabase,
s3,
scanner::processing::BlobProcessor,
scanner_pool::ScannerPool,
slack, PathBuf,
};
/// A blob's `OriginSet`, its `BlobMetadata`, and its matches, each paired with an optional
/// `f64` (presumably a score; confirm against the code that produces these tuples).
pub type DatastoreMessage = (OriginSet, BlobMetadata, Vec<(Option<f64>, Match)>);
pub fn clone_or_update_git_repos(
@ -284,3 +292,86 @@ pub async fn fetch_slack_messages(
}
Ok(vec![output_dir])
}
/// Scans the objects of the S3 bucket named in `args` and records the matches in `datastore`.
///
/// Returns `Ok(())` immediately when `args.input_specifier_args.s3_bucket` is not set.
/// Otherwise every object delivered by [`s3::visit_bucket_objects`] is wrapped in a blob,
/// run through a [`BlobProcessor`], and any resulting matches are recorded with the origin
/// path `s3://<bucket>/<key>`.
///
/// # Arguments
/// * `args` - scan arguments; supplies the bucket, prefix, role ARN, AWS profile and `no_dedup`.
/// * `datastore` - findings store that receives the matches.
/// * `rules_db` - rules used to build the matcher.
/// * `matcher_stats` - statistics sink handed to the matcher.
/// * `enable_profiling` / `shared_profiler` - rule profiling switch and collector handed to
///   the matcher.
/// * `progress_enabled` - shows a spinner counting processed objects when `true`.
///
/// # Errors
/// Returns an error when the matcher cannot be built, when walking the bucket fails, or
/// when processing an object fails.
///
/// # Panics
/// Panics if the file-type guesser cannot be created, if the progress template is invalid,
/// or if the `datastore` mutex is poisoned.
pub async fn fetch_s3_objects(
    args: &scan::ScanArgs,
    datastore: &Arc<Mutex<findings_store::FindingsStore>>,
    rules_db: &RulesDatabase,
    matcher_stats: &Mutex<MatcherStats>,
    enable_profiling: bool,
    shared_profiler: Arc<crate::rule_profiling::ConcurrentRuleProfiler>,
    progress_enabled: bool,
) -> Result<()> {
    let Some(bucket) = args.input_specifier_args.s3_bucket.as_deref() else {
        return Ok(());
    };
    let prefix = args.input_specifier_args.s3_prefix.as_deref();
    let role_arn = args.input_specifier_args.role_arn.as_deref();
    let profile = args.input_specifier_args.aws_local_profile.as_deref();

    let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
    let seen_blobs = BlobIdMap::new();
    let matcher = Matcher::new(
        rules_db,
        scanner_pool,
        &seen_blobs,
        Some(matcher_stats),
        enable_profiling,
        Some(shared_profiler),
    )?;
    let guesser = Guesser::new().expect("should be able to create filetype guesser");
    let mut processor = BlobProcessor { matcher, guesser };

    let progress = if progress_enabled {
        let style =
            ProgressStyle::with_template("{spinner} {msg} ({pos} objects) [{elapsed_precise}]")
                .expect("progress bar style template should compile");
        let pb = ProgressBar::new_spinner().with_style(style).with_message("Fetching S3 objects");
        pb.enable_steady_tick(Duration::from_millis(500));
        pb
    } else {
        ProgressBar::hidden()
    };

    // The visitor closure takes ownership of its own progress handle and bucket name.
    let pb = progress.clone();
    let bucket_name = bucket.to_string();
    s3::visit_bucket_objects(bucket, prefix, role_arn, profile, move |key, bytes| {
        let origin = OriginSet::new(
            Origin::from_extended(serde_json::json!({
                "path": format!("s3://{}/{}", bucket_name, key)
            })),
            Vec::new(),
        );
        let blob = crate::blob::Blob::from_bytes(bytes);
        if let Some((origin, blob_md, scored_matches)) =
            processor.run(origin, blob, args.no_dedup)?
        {
            // Wrap origin and metadata once so every match can share them.
            let origin_arc = Arc::new(origin);
            let blob_arc = Arc::new(blob_md);
            // Build a batch with exactly one entry per match.
            let mut batch = Vec::with_capacity(scored_matches.len());
            for (_score, m) in scored_matches {
                batch.push((origin_arc.clone(), blob_arc.clone(), m));
            }
            let added = datastore.lock().unwrap().record(batch, !args.no_dedup);
            debug!("Added {} new S3 blobs", added);
        }
        pb.inc(1);
        Ok(())
    })
    .await?;

    let total = progress.position();
    progress.finish_with_message(format!("Fetched {} S3 objects", total));
    Ok(())
}

View file

@ -18,7 +18,9 @@ use crate::{
rules_database::RulesDatabase,
scanner::{
clone_or_update_git_repos, enumerate_filesystem_inputs, enumerate_github_repos,
repos::{enumerate_gitlab_repos, fetch_jira_issues, fetch_slack_messages},
repos::{
enumerate_gitlab_repos, fetch_jira_issues, fetch_s3_objects, fetch_slack_messages,
},
run_secret_validation, save_docker_images,
summary::print_scan_summary,
},
@ -93,22 +95,39 @@ pub async fn run_async_scan(
}
}
if input_roots.is_empty() {
bail!("No inputs to scan");
}
let shared_profiler = Arc::new(ConcurrentRuleProfiler::new());
let enable_profiling = args.rule_stats;
let matcher_stats = Mutex::new(MatcherStats::default());
let _inputs = enumerate_filesystem_inputs(
// Fetch S3 objects if requested (scanned immediately)
fetch_s3_objects(
args,
datastore.clone(),
&input_roots,
progress_enabled,
&datastore,
rules_db,
&matcher_stats,
enable_profiling,
Arc::clone(&shared_profiler),
&matcher_stats,
)?;
progress_enabled,
)
.await?;
let has_s3 = args.input_specifier_args.s3_bucket.is_some();
if input_roots.is_empty() && !has_s3 {
bail!("No inputs to scan");
}
if !input_roots.is_empty() {
let _inputs = enumerate_filesystem_inputs(
args,
datastore.clone(),
&input_roots,
progress_enabled,
rules_db,
enable_profiling,
Arc::clone(&shared_profiler),
&matcher_stats,
)?;
}
if !args.no_dedup {
// Final deduplication step before validation (or before reporting)

View file

@ -1,95 +0,0 @@
package core
import (
"io/ioutil"
"os"
"path"
"path/filepath"
"runtime"
"testing"
"github.com/10gen/kingfisher/core"
)
// rootDir returns the parent of the directory that contains this source file,
// as reported by runtime.Caller.
func rootDir() string {
	_, thisFile, _, _ := runtime.Caller(0)
	containingDir := path.Dir(thisFile)
	return filepath.Dir(containingDir)
}
// NewTestSession prepares a core test session configured for local-file
// scanning with secret validation enabled and the given baseline file,
// installs it as core.GlobalSessionRef and initializes its target-mode client.
// The returned error is always nil.
func NewTestSession(baselineFilename string) (*core.Session, error) {
	sess := core.PrepareTestSession()

	// Session-level switches.
	sess.Testing = true
	sess.ReqScanMode = core.LocalFiles

	// Scan options.
	sess.Options.ValidateSecrets = true
	sess.Options.BaselineFilename = baselineFilename
	sess.Options.KingfisherTempDir = core.GetTempDir()

	// Publish the session globally before the client is initialized.
	core.GlobalSessionRef = sess
	sess.InitializeTargetModeClient()

	return sess, nil
}
// beginTesting analyzes testfile (resolved relative to rootDir) using a
// session whose baseline is a temporary YAML file written by this helper, and
// reports a test error when the session's skipped-findings count differs from
// expectedSkippedFindings.
//
// expectedFindingsSuppressKingfisher is accepted but not checked here.
func beginTesting(t *testing.T, testfile string, expectedSkippedFindings, expectedFindingsSuppressKingfisher int) {
	testfilePath := filepath.Join(rootDir(), testfile)
	_, filename := filepath.Split(testfilePath)

	baselineYAML := []byte(`FileContent:
matches: []
FilePaths:
matches: []
ExactFindings:
matches:
- filepath: testdata/ruby_vulnerable.rb
findinghash: 701c302855ecc97e8415c44f37123bc2ca0c3343bd87028682aaaeaa90568084
linenum: 40
lastupdated: Tue Apr 16 13:04:10 PDT 2024
- filepath: testdata/ruby_vulnerable.rb
findinghash: 065d1e2faeae9328ca8b2f2754afa6c196d3ef2da2720dabca7e5161d67a6ca1
linenum: 40
lastupdated: Tue Apr 16 13:04:10 PDT 2024
`)

	// Persist the baseline to a temp file with a .yaml extension.
	baselineFile, err := ioutil.TempFile("", "baseline-*.yaml")
	if err != nil {
		t.Fatal(err)
	}
	// Remove the temp file once the test helper returns.
	defer os.Remove(baselineFile.Name())

	if _, err := baselineFile.Write(baselineYAML); err != nil {
		t.Fatal(err)
	}
	if err := baselineFile.Close(); err != nil {
		t.Fatal(err)
	}

	sess, err := NewTestSession(baselineFile.Name())
	if err != nil {
		t.Fatal(err)
	}

	core.BeginFileAnalysis(core.NewMatchFile(testfilePath, sess, nil))

	if sess.Stats.SkippedFindings == expectedSkippedFindings {
		return
	}
	core.PrintSessionStats(sess)
	t.Errorf("Expected %d findings, got %d -- file: <%s>", expectedSkippedFindings, sess.Stats.SkippedFindings, filename)
}
// TestBaselineFeature runs beginTesting as a named subtest for each table
// entry, passing the file name and the expected counts.
func TestBaselineFeature(t *testing.T) {
	type baselineCase struct {
		fileName                           string
		expectedSkippedFindings            int
		expectedFindingsSuppressKingfisher int
	}

	cases := []baselineCase{
		{
			fileName:                           "ruby_vulnerable.rb",
			expectedSkippedFindings:            3,
			expectedFindingsSuppressKingfisher: 0,
		},
	}

	for _, tc := range cases {
		t.Run(tc.fileName, func(t *testing.T) {
			beginTesting(t, tc.fileName, tc.expectedSkippedFindings, tc.expectedFindingsSuppressKingfisher)
		})
	}
}

View file

@ -84,6 +84,11 @@ rules:
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options

View file

@ -71,6 +71,11 @@ fn test_github_remote_scan() -> Result<()> {
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options

View file

@ -70,6 +70,11 @@ fn test_gitlab_remote_scan() -> Result<()> {
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,

29
tests/int_s3.rs Normal file
View file

@ -0,0 +1,29 @@
use anyhow::Result;
use kingfisher::s3::visit_bucket_objects;
/// Walks the `examples/us-legislators/all/` prefix of the `awsglue-datasets` bucket with no
/// role and no profile, and checks that an `events.json` object is visited and that its
/// body is UTF-8 text containing `Q4450263`.
#[tokio::test]
async fn test_visit_public_bucket() -> Result<()> {
    let mut fetched = Vec::new();
    visit_bucket_objects(
        "awsglue-datasets",
        Some("examples/us-legislators/all/"),
        None,
        None,
        |key, data| {
            fetched.push((key, data));
            Ok(())
        },
    )
    .await?;

    // The listing must include the events.json object.
    let events = fetched.iter().find(|(key, _)| key.ends_with("events.json"));
    assert!(events.is_some(), "events.json object not found");

    // Its contents must be the expected text.
    let (_, contents) = events.expect("events.json object");
    let body = std::str::from_utf8(contents)?;
    assert!(body.contains("Q4450263"), "expected events.json file");
    Ok(())
}

View file

@ -59,6 +59,10 @@ impl TestContext {
jql: None,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
max_results: 10,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
@ -147,6 +151,11 @@ async fn test_scan_slack_messages() -> Result<()> {
slack_query: Some("test".into()),
slack_api_url: Url::parse(&format!("{}/", server.uri()))?,
max_results: 10,
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,

View file

@ -127,6 +127,11 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options

View file

@ -70,6 +70,11 @@ impl TestContext {
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@ -142,6 +147,11 @@ impl TestContext {
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options