From fc0be774b4d7733f869f3ff4d780cf800de7c1af Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 30 Aug 2025 19:40:11 -0700 Subject: [PATCH] Decode Base64 blobs and scan their contents for secrets while skipping short strings for performance. This has a small performance impact and can be disabled with --no-base64 --- CHANGELOG.md | 2 +- src/cli/commands/scan.rs | 4 ++ src/main.rs | 1 + src/matcher.rs | 78 ++++++++++++++++++----------------- src/reporter/json_format.rs | 1 + src/scanner/enumerate.rs | 2 +- src/scanner/processing.rs | 3 +- src/scanner/repos.rs | 2 +- tests/int_allowlist.rs | 1 + tests/int_base64.rs | 29 ++++++++++++- tests/int_dedup.rs | 1 + tests/int_github.rs | 1 + tests/int_gitlab.rs | 2 + tests/int_redact.rs | 1 + tests/int_slack.rs | 2 + tests/int_validation_cache.rs | 1 + tests/int_vulnerable_files.rs | 2 + 17 files changed, 91 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bfaed2..6e57853 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ All notable changes to this project will be documented in this file. - Improved rules: github oauth2, diffbot, mailchimp, aws - Added validation to SauceLabs rule - Added rules: shodan, bitly, flickr -- Decode Base64 blobs and scan their contents for secrets while skipping short strings for performance +- Decode Base64 blobs and scan their contents for secrets while skipping short strings for performance. This has a small performance impact and can be disabled with `--no-base64` ## [1.46.0] - Improved rules: AWS, pem diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 5a4d22b..255417f 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -92,6 +92,10 @@ pub struct ScanArgs { #[arg(long, short = 'r', default_value_t = false)] pub redact: bool, + /// Skip decoding Base64 blobs before scanning + #[arg(long, default_value_t = false)] + pub no_base64: bool, + /// Timeout for Git repository scanning in seconds #[arg(long, default_value_t = 1800, value_name = "SECONDS")] pub git_repo_timeout: u64, diff --git a/src/main.rs b/src/main.rs index 639c2de..edb0972 100644 --- a/src/main.rs +++ b/src/main.rs @@ -335,6 +335,7 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { skip_regex: Vec::new(), skip_word: Vec::new(), output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, + no_base64: false, } } /// Run the rules check command diff --git a/src/matcher.rs b/src/matcher.rs index d817bcc..6f1531e 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -277,6 +277,7 @@ impl<'a> Matcher<'a> { lang: Option, redact: bool, no_dedup: bool, + no_base64: bool, ) -> Result> where 'a: 'b, @@ -312,7 +313,7 @@ impl<'a> Matcher<'a> { // Opportunistically look for standalone Base64 blobs. If neither // the raw scan nor this check yields anything, we can return early // before doing any heavier work. - let mut b64_items = get_base64_strings(blob.bytes()); + let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) }; if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() { // Only record in seen_blobs if deduplication is enabled @@ -428,42 +429,45 @@ impl<'a> Matcher<'a> { } } } - // If the blob contains standalone Base64 blobs, decode and scan them as well - const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep - let mut b64_stack: Vec<(DecodedData, usize)> = - b64_items.drain(..).map(|d| (d, 0)).collect(); - while let Some((item, depth)) = b64_stack.pop() { - for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { - let re = &rules_db.anchored_regexes[rule_id_usize]; - filter_match( - blob, - rule.clone(), - re, - item.pos_start, - item.pos_end, - &mut matches, - &mut previous_matches, - rule_id_usize, - &mut seen_matches, - origin, - Some(item.decoded.clone()), - true, - redact, - &filename, - self.profiler.as_ref(), - ); - } - if depth + 1 < MAX_B64_DEPTH { - for nested in get_base64_strings(item.decoded.as_bytes()) { - b64_stack.push(( - DecodedData { - original: nested.original, - decoded: nested.decoded, - pos_start: item.pos_start, - pos_end: item.pos_end, - }, - depth + 1, - )); + + if !no_base64 { + // If the blob contains standalone Base64 blobs, decode and scan them as well + const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep + let mut b64_stack: Vec<(DecodedData, usize)> = + b64_items.drain(..).map(|d| (d, 0)).collect(); + while let Some((item, depth)) = b64_stack.pop() { + for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() { + let re = &rules_db.anchored_regexes[rule_id_usize]; + filter_match( + blob, + rule.clone(), + re, + item.pos_start, + item.pos_end, + &mut matches, + &mut previous_matches, + rule_id_usize, + &mut seen_matches, + origin, + Some(item.decoded.clone()), + true, + redact, + &filename, + self.profiler.as_ref(), + ); + } + if depth + 1 < MAX_B64_DEPTH { + for nested in get_base64_strings(item.decoded.as_bytes()) { + b64_stack.push(( + DecodedData { + original: nested.original, + decoded: nested.decoded, + pos_start: item.pos_start, + pos_end: item.pos_end, + }, + depth + 1, + )); + } } } } diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index da5f782..0b562d2 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -128,6 +128,7 @@ mod tests { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + no_base64: false, } } diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index a999f31..14861ff 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -181,7 +181,7 @@ pub fn enumerate_filesystem_inputs( return Ok(()); } progress.inc(blob.len().try_into().unwrap()); - match processor.run(origin, blob, args.no_dedup, args.redact) { + match processor.run(origin, blob, args.no_dedup, args.redact, args.no_base64) { Ok(None) => { // nothing to record } diff --git a/src/scanner/processing.rs b/src/scanner/processing.rs index e88393f..9447dbf 100644 --- a/src/scanner/processing.rs +++ b/src/scanner/processing.rs @@ -25,11 +25,12 @@ impl<'a> BlobProcessor<'a> { blob: Blob, no_dedup: bool, redact: bool, + no_base64: bool, ) -> Result> { let blob_id = blob.id.hex(); let _span = debug_span!("matcher", blob_id).entered(); let t1 = Instant::now(); - let res = self.matcher.scan_blob(&blob, &origin, None, redact, no_dedup)?; + let res = self.matcher.scan_blob(&blob, &origin, None, redact, no_dedup, no_base64)?; let scan_us = t1.elapsed().as_micros(); match res { // blob already seen, but with no matches; nothing to do! diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index 68f6f67..6770434 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -427,7 +427,7 @@ pub async fn fetch_s3_objects( let blob = crate::blob::Blob::from_bytes(bytes); if let Some((origin, blob_md, scored_matches)) = - processor.run(origin, blob, args.no_dedup, args.redact)? + processor.run(origin, blob, args.no_dedup, args.redact, args.no_base64)? { // Wrap origin & metadata once: let origin_arc = Arc::new(origin); diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs index d287616..0370755 100644 --- a/tests/int_allowlist.rs +++ b/tests/int_allowlist.rs @@ -105,6 +105,7 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result anyhow::Result<()> { "--no-update-check", ]) .assert() - .code(200) + .code(0) .stdout( predicate::str::contains("ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa") .and(predicate::str::contains("\"encoding\": \"base64\"")), @@ -32,3 +32,30 @@ fn detects_base64_encoded_secret() -> anyhow::Result<()> { dir.close()?; Ok(()) } + +// Ensure disabling Base64 decoding suppresses encoded secrets +#[test] +fn skips_base64_when_disabled() -> anyhow::Result<()> { + let dir = tempdir()?; + let file_path = dir.path().join("secret.txt"); + let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDJxTHFQYQ=="; + fs::write(&file_path, encoded)?; + + Command::cargo_bin("kingfisher")? + .args([ + "scan", + dir.path().to_str().unwrap(), + "--no-binary", + "--no-base64", + "--confidence=low", + "--format", + "json", + "--no-update-check", + ]) + .assert() + .code(0) + .stdout(predicate::str::contains("\"findings\":0")); + + dir.close()?; + Ok(()) +} \ No newline at end of file diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index fe354a7..8ed1341 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -121,6 +121,7 @@ rules: manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + no_base64: false, }; let global_args = GlobalArgs { diff --git a/tests/int_github.rs b/tests/int_github.rs index f632bf2..82c64ae 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -108,6 +108,7 @@ fn test_github_remote_scan() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + no_base64: false, }; // Create global arguments let global_args = GlobalArgs { diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index a7e75d3..9cfde7c 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -106,6 +106,7 @@ fn test_gitlab_remote_scan() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + no_base64: false, }; let global_args = GlobalArgs { @@ -213,6 +214,7 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + no_base64: false, }; let global_args = GlobalArgs { diff --git a/tests/int_redact.rs b/tests/int_redact.rs index 03e8afb..9be8c4a 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -88,6 +88,7 @@ async fn test_redact_hashes_finding_values() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + no_base64: false, }; let global_args = GlobalArgs { diff --git a/tests/int_slack.rs b/tests/int_slack.rs index abbf3ba..0bcae59 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -94,6 +94,7 @@ impl TestContext { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + no_base64: false, }; let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?; @@ -191,6 +192,7 @@ async fn test_scan_slack_messages() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + no_base64: false, }; let global_args = GlobalArgs { diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index 24148ec..0cf2a08 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -164,6 +164,7 @@ async fn test_validation_cache_and_depvars() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + no_base64: false, }; /* --------------------------------------------------------- * diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 34fe709..c53b8fc 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -107,6 +107,7 @@ impl TestContext { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + no_base64: false, }; let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules) @@ -189,6 +190,7 @@ impl TestContext { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + no_base64: false, }; let global_args = GlobalArgs {