Decode Base64 blobs and scan their contents for secrets while skipping short strings for performance

2025-08-30 16:44:55 -07:00 · 2025-08-30 16:44:55 -07:00 · 9de355a5c8
commit 9de355a5c8
parent e54dbe90d0
14 changed files with 1266 additions and 52 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,11 +2,12 @@

 All notable changes to this project will be documented in this file.

-## [Unreleased]
+## [1.47.0]
 - MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them
 - Improved rules: github oauth2, diffbot, mailchimp, aws
 - Added validation to SauceLabs rule
 - Added rules: shodan, bitly, flickr
+- Decode Base64 blobs and scan their contents for secrets while skipping short strings for performance

 ## [1.46.0]
 - Improved rules: AWS, pem
--- a/Cargo.toml
+++ b/Cargo.toml
@ -10,7 +10,7 @@ publish = false

 [package]
 name = "kingfisher"
-version = "1.46.0"
+version = "1.47.0"
 description = "MongoDB's blazingly fast secret scanning and validation tool"
 edition.workspace = true
 rust-version.workspace = true
--- a/data/rules/docker.yml
+++ b/data/rules/docker.yml
@ -0,0 +1,48 @@
+rules:
+  # Matches an `"auths"` JSON object (as found in Docker client config files)
+  # and captures the registry key as REG and the Base64 `auth` value
+  # (16 or more Base64 characters) as B64.
+  - name: Docker Registry Credentials (auths JSON)
+    id: kingfisher.docker.auths.1
+    pattern: |
+      (?xis)
+      "auths"\s*:\s*\{
+        [^}]*?
+        " (?P<REG> (?:https?:\/\/)? [a-z0-9.\-:+/]+ ) "\s*:\s*\{
+          [^}]*?
+          "auth"\s*:\s*"(?P<B64> [A-Za-z0-9+/=]{16,} )"
+          [^}]*?
+        \}
+        [^}]*?
+      \}
+    min_entropy: 2.0
+    confidence: medium
+    examples:
+      - |
+        {
+          "auths": {
+            "quay.io": {
+              "auth": "cmhkaCtyaHRhcDowM1BERk1RTTJQTDlaQUE5T1gzSU9IQjFYTUlXOVNGNU1XRzNSRVRHNThKVVpKMzEwV0ZZRVNOQTdGMExNNTYx"
+            }
+          }
+        }
+      - |
+        {"auths":{"index.docker.io/v1/":{"auth":"dXNlcjp0b2tlbg=="}}}
+    references:
+      - https://distribution.github.io/distribution/spec/api/
+    # Validation sends B64 unchanged in an `Authorization: Basic` header to
+    # `<registry>/v2/auth` and accepts only a 200 status.
+    validation:
+      type: Http
+      content:
+        request:
+          method: GET
+          # The URL is built from REG: used as-is when it already contains a
+          # scheme, otherwise prefixed with `https://`.
+          # NOTE(review): Liquid's stock `replace` filter substitutes literal
+          # text, not a regex, so `"/$"` probably never strips a trailing slash
+          # from REG - confirm. The `else` branch does no stripping at all.
+          url: >
+            {%- assign r = REG -%}
+            {%- if r contains "://" -%}
+              {{ r | replace: "/$", "" }}/v2/auth
+            {%- else -%}
+              https://{{ r }}/v2/auth
+            {%- endif -%}
+          headers:
+            Authorization: "Basic {{ B64 }}"
+            Accept: application/json
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status: [200]
--- a/data/rules/generic.yml
+++ b/data/rules/generic.yml
@ -192,4 +192,22 @@ rules:
        password = 'abuser123456'  # some other comment
      - |
        user = 'Aladdin'
-        password = 'open sesame'
+        password = 'open sesame'
+  # Low-confidence rule for a plaintext `user+robot:TOKEN` pair: USER is two
+  # name segments (letters, digits, `.`, `_`, `-`) joined by `+`, and PASS is
+  # a run of 32 to 80 alphanumeric characters. No validation is configured.
+  # NOTE(review): the pattern is case-insensitive (`(?xi)`), so `[A-Z0-9]`
+  # also accepts lowercase letters - confirm that is intended.
+  - name: Docker Robot Credentials (plaintext pair)
+    id: kingfisher.generic.9
+    pattern: |
+      (?xi)
+      \b
+      (                                  
+        (?P<USER> [a-z0-9._-]+ \+ [a-z0-9._-]+ )  
+        :
+        (?P<PASS> [A-Z0-9]{32,80} )
+      )
+      \b
+    min_entropy: 2.0
+    confidence: low
+    examples:
+      - some+thing:02PDFMQN2PL2ZAB9OX3IOHC1XMIW1SE5NWG3RETG58JUZJ310WFYESRA7F0LM461
+      - org+builder:1C2F9D0BB1E67E9F6B3B5B9A2A3D4E5F6A7B8C9D0E1F2A3B4C5D6E7F8A9B0C1
+    references:
+      - https://docs.quay.io/use_quay.html#robot-accounts
--- a/src/main.rs
+++ b/src/main.rs
@ -5,27 +5,27 @@
 //   * Fallback - system allocator     (`system-alloc` feature)
 // ────────────────────────────────────────────────────────────

-// --- jemalloc (opt-in) ---
-#[cfg(feature = "use-jemalloc")]
-#[global_allocator]
-static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+// // --- jemalloc (opt-in) ---
+// #[cfg(feature = "use-jemalloc")]
+// #[global_allocator]
+// static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

-// --- mimalloc (default) ---
-#[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))]
-#[global_allocator]
-static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
-
-// --- system allocator (explicit opt-out) ---
-#[cfg(feature = "system-alloc")]
-use std::alloc::System;
-#[cfg(feature = "system-alloc")]
-#[global_allocator]
-static GLOBAL: System = System;
+// // --- mimalloc (default) ---
+// #[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))]
+// #[global_allocator]
+// static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;

+// // --- system allocator (explicit opt-out) ---
+// #[cfg(feature = "system-alloc")]
 // use std::alloc::System;
+// #[cfg(feature = "system-alloc")]
 // #[global_allocator]
 // static GLOBAL: System = System;

+// NOTE(review): the system allocator is now installed unconditionally. The
+// feature-gated jemalloc / mimalloc / system-alloc declarations above are all
+// commented out, so the `use-jemalloc` and `system-alloc` features no longer
+// select an allocator here. Confirm this is intentional and not a debugging
+// leftover.
+use std::alloc::System;
+#[global_allocator]
+static GLOBAL: System = System;
+
 use std::{
    io::Read,
    sync::{Arc, Mutex},
--- a/src/matcher.rs
+++ b/src/matcher.rs
@ -65,6 +65,7 @@ pub struct OwnedBlobMatch {
    pub validation_response_status: StatusCode,
    pub validation_success: bool,
    pub calculated_entropy: f32,
+    pub is_base64: bool,
 }
 impl<'a> Matcher<'a> {
    pub fn get_profiling_report(&self) -> Option<Vec<RuleStats>> {
@ -85,6 +86,7 @@ impl OwnedBlobMatch {
                .unwrap_or(StatusCode::CONTINUE),
            validation_success: m.validation_success,
            calculated_entropy: m.calculated_entropy,
+            is_base64: m.is_base64,
        }
    }

@ -108,6 +110,7 @@ impl OwnedBlobMatch {
            validation_success: blob_match.validation_success,
            calculated_entropy: blob_match.calculated_entropy,
            finding_fingerprint: 0, //default
+            is_base64: blob_match.is_base64,
        };

        // Convert matching_finding to a &str (using lossy conversion if needed)
@ -154,6 +157,7 @@ pub struct BlobMatch<'a> {

    pub validation_success: bool,
    pub calculated_entropy: f32,
+    pub is_base64: bool,
 }
 #[derive(Clone)]
 struct UserData {
@ -305,8 +309,12 @@ impl<'a> Matcher<'a> {
        // Perform the scan
        self.scan_bytes_raw(&blob.bytes(), &filename)?;

-        // Early exit if no matches found
-        if self.user_data.raw_matches_scratch.is_empty() {
+        // Opportunistically look for standalone Base64 blobs. If neither
+        // the raw scan nor this check yields anything, we can return early
+        // before doing any heavier work.
+        let mut b64_items = get_base64_strings(blob.bytes());
+
+        if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() {
            // Only record in seen_blobs if deduplication is enabled
            if !no_dedup {
                return Ok(match self.seen_blobs.insert(blob.id, false) {
@ -322,18 +330,22 @@ impl<'a> Matcher<'a> {
        let rules_db = self.rules_db;
        let mut seen_matches = FxHashSet::default();
        let mut previous_matches = Vec::new();
-        let tree_sitter_result = lang.and_then(|lang_str| {
-            get_language_and_queries(&lang_str).and_then(|(language, queries)| {
-                let checker = Checker { language, rules: queries };
-                match checker.check(&blob.bytes()) {
-                    Ok(results) => Some(results),
-                    Err(e) => {
-                        println!("Error in checker.check: {}", e);
-                        None
+        let tree_sitter_result = if self.user_data.raw_matches_scratch.is_empty() {
+            None
+        } else {
+            lang.and_then(|lang_str| {
+                get_language_and_queries(&lang_str).and_then(|(language, queries)| {
+                    let checker = Checker { language, rules: queries };
+                    match checker.check(&blob.bytes()) {
+                        Ok(results) => Some(results),
+                        Err(e) => {
+                            println!("Error in checker.check: {}", e);
+                            None
+                        }
                    }
-                }
+                })
            })
-        });
+        };
        // Process matches
        let mut matches = Vec::new();
        let owned_ts_results = tree_sitter_result.map(|ts_results| {
@ -383,6 +395,7 @@ impl<'a> Matcher<'a> {
                &mut seen_matches,
                origin,
                None,
+                false,
                redact,
                &filename,
                self.profiler.as_ref(),
@ -406,6 +419,7 @@ impl<'a> Matcher<'a> {
                            &mut seen_matches,
                            origin,
                            Some(ts_match.clone()),
+                            *is_base64_decoded,
                            redact,
                            &filename,
                            self.profiler.as_ref(),
@ -414,6 +428,45 @@ impl<'a> Matcher<'a> {
                }
            }
        }
+        // If the blob contains standalone Base64 blobs, decode and scan them as well
+        const MAX_B64_DEPTH: usize = 2; // decode at most two levels deep
+        let mut b64_stack: Vec<(DecodedData, usize)> =
+            b64_items.drain(..).map(|d| (d, 0)).collect();
+        while let Some((item, depth)) = b64_stack.pop() {
+            for (rule_id_usize, rule) in rules_db.rules.iter().enumerate() {
+                let re = &rules_db.anchored_regexes[rule_id_usize];
+                filter_match(
+                    blob,
+                    rule.clone(),
+                    re,
+                    item.pos_start,
+                    item.pos_end,
+                    &mut matches,
+                    &mut previous_matches,
+                    rule_id_usize,
+                    &mut seen_matches,
+                    origin,
+                    Some(item.decoded.clone()),
+                    true,
+                    redact,
+                    &filename,
+                    self.profiler.as_ref(),
+                );
+            }
+            if depth + 1 < MAX_B64_DEPTH {
+                for nested in get_base64_strings(item.decoded.as_bytes()) {
+                    b64_stack.push((
+                        DecodedData {
+                            original: nested.original,
+                            decoded: nested.decoded,
+                            pos_start: item.pos_start,
+                            pos_end: item.pos_end,
+                        },
+                        depth + 1,
+                    ));
+                }
+            }
+        }
        // Finalize
        // Only record in seen_blobs if deduplication is enabled
        if !no_dedup {
@ -457,6 +510,7 @@ fn filter_match<'b>(
    seen_matches: &mut FxHashSet<u64>,
    _origin: &OriginSet,
    ts_match: Option<String>,
+    is_base64: bool,
    redact: bool,
    filename: &str,
    profiler: Option<&Arc<ConcurrentRuleProfiler>>,
@ -521,6 +575,7 @@ fn filter_match<'b>(
            validation_response_status: StatusCode::from_u16(0).unwrap_or(StatusCode::CONTINUE),
            validation_success: false,
            calculated_entropy,
+            is_base64,
        });
        previous_matches.push((rule_id, matching_input_offset_span));
    }
@ -729,6 +784,8 @@ pub struct Match {
    pub calculated_entropy: f32,

    pub visible: bool,
+    #[serde(default)]
+    pub is_base64: bool,
 }
 impl Match {
    #[inline]
@ -780,6 +837,7 @@ impl Match {
            validation_response_status: owned_blob_match.validation_response_status.as_u16(),
            validation_success: owned_blob_match.validation_success,
            calculated_entropy: owned_blob_match.calculated_entropy,
+            is_base64: owned_blob_match.is_base64,
        }
    }

@ -832,33 +890,26 @@ pub struct DecodedData {
 }
 pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
    lazy_static! {
-        static ref RE_BASE64: Regex =
-            Regex::new(r"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?").unwrap();
+        // Require a reasonably long run of valid Base64 characters to reduce
+        // noise. 32 Base64 characters decode to 24 bytes.
+        static ref RE_BASE64: Regex = Regex::new(r"[A-Za-z0-9+/]{32,}={0,2}").unwrap();
    }
    let mut results = Vec::new();
-    for capture in RE_BASE64.captures_iter(input) {
-        let base64_match = capture.get(0).unwrap();
-
-        if base64_match.is_empty() {
-            continue;
-        }
-
-        let start = base64_match.start();
-        let end = base64_match.end();
-        let base64_string = &input[start..end];
-        // Check if the length is a multiple of 4
+    for m in RE_BASE64.find_iter(input) {
+        let base64_string = m.as_bytes();
+        // Skip candidates whose length isn't a multiple of four – they cannot
+        // be valid Base64.
        if base64_string.len() % 4 != 0 {
            continue;
        }
        if let Ok(decoded) = general_purpose::STANDARD.decode(base64_string) {
-            // Check if the decoded string is valid UTF-8
            if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
                if decoded_str.is_ascii() {
                    results.push(DecodedData {
                        original: String::from_utf8_lossy(base64_string).into_owned(),
                        decoded: decoded_str.to_string(),
-                        pos_start: start,
-                        pos_end: end,
+                        pos_start: m.start(),
+                        pos_end: m.end(),
                    });
                }
            }
@ -1026,12 +1077,13 @@ mod test {
    /// and report correct byte-offsets.
    #[test]
    fn test_get_base64_strings_basic() {
-        let raw = b"foo SGVsbG8gV29ybGQ= bar"; // "Hello World"
+        let raw = b"foo MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY= bar";
+        // decodes to "0123456789abcdef0123456789abcdef"
        let hits = get_base64_strings(raw);
        assert_eq!(hits.len(), 1);
        let item = &hits[0];
-        assert_eq!(item.decoded, "Hello World");
-        assert_eq!(item.original, "SGVsbG8gV29ybGQ=");
+        assert_eq!(item.decoded, "0123456789abcdef0123456789abcdef");
+        assert_eq!(item.original, "MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY=");
        // "foo␠" is 4 bytes, so the start offset is 4
        assert_eq!((item.pos_start, item.pos_end), (4, 4 + item.original.len()));
    }
--- a/src/reporter.rs
+++ b/src/reporter.rs
@ -384,6 +384,7 @@ impl DetailsReporter {
                column_start: source_span.start.column as u32,
                column_end: source_span.end.column as u32,
                path: file_path,
+                encoding: if rm.m.is_base64 { Some("base64".to_string()) } else { None },
                git_metadata: git_metadata_val,
            },
        }
@ -521,6 +522,8 @@ pub struct FindingRecordData {
    pub column_end: u32,
    pub path: String,
    #[serde(skip_serializing_if = "Option::is_none")]
+    pub encoding: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
    pub git_metadata: Option<serde_json::Value>,
 }

--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@ -165,6 +165,7 @@ mod tests {
            validation_success,
            calculated_entropy: 4.5,
            visible: true,
+            is_base64: false,
        }
    }

--- a/src/reporter/pretty_format.rs
+++ b/src/reporter/pretty_format.rs
@ -100,6 +100,9 @@ impl<'a> Display for PrettyFindingRecord<'a> {
        };
        let finding = &record.finding;
        writeln!(f, " |Finding.......: {}", style_fn(&finding.snippet))?;
+        // Printed only when the record carries an encoding (e.g. "base64").
+        // The label is dot-padded to the same width as the neighbouring
+        // fields so the value column stays aligned.
+        if let Some(enc) = &finding.encoding {
+            writeln!(f, " |Encoding......: {}", enc)?;
+        }
        writeln!(f, " |Fingerprint...: {}", finding.fingerprint)?;
        writeln!(f, " |Confidence....: {}", finding.confidence)?;
        writeln!(f, " |Entropy.......: {}", finding.entropy)?;
--- a/src/validation.rs
+++ b/src/validation.rs
@ -1028,6 +1028,7 @@ rules:
            validation_response_status: StatusCode::OK,
            validation_success: false,
            calculated_entropy: 0.0, // or compute your own
+            is_base64: false,
        };
        let parser = register_all(liquid::ParserBuilder::with_stdlib()).build()?;
        let client = reqwest::Client::new();
--- a/src/validation.rs.orig
+++ b/src/validation.rs.orig
--- a/src/validation/mongodb.rs
+++ b/src/validation/mongodb.rs
@ -93,9 +93,9 @@ fn is_local_host(h: &str) -> bool {

 const FAST_CONNECT_MS: u64 = 700; // direct single-host URIs
 const FAST_SELECT_MS: u64 = 300;
-const SRV_PARSE_MS: u64 = 1_000; // limit DNS resolution time
-const SRV_CONNECT_MS: u64 = 1500;//700;
-const SRV_SELECT_MS: u64 = 1500;//300;
+const SRV_PARSE_MS: u64 = 2_000; // limit DNS resolution time
+const SRV_CONNECT_MS: u64 = 2500; //700;
+const SRV_SELECT_MS: u64 = 2500; //300;

 /// Validates a MongoDB URI using the short timeouts defined by the `*_MS`
 /// constants above. Returns `(bool, String)` where the boolean indicates
 /// success and the string provides a status message.
--- a/tests/fingerprint_dedup.rs
+++ b/tests/fingerprint_dedup.rs
@ -47,6 +47,7 @@ fn make_match(fp: u64) -> Match {
        validation_success: false,
        calculated_entropy: 0.0,
        visible: true,
+        is_base64: false,
    }
 }

--- a/tests/int_base64.rs
+++ b/tests/int_base64.rs
@ -0,0 +1,34 @@
+use assert_cmd::prelude::*;
+use predicates::prelude::*;
+use std::{fs, process::Command};
+use tempfile::tempdir;
+
+// Ensure base64 encoded secrets are decoded and detected.
+//
+// Writes a file containing only the Base64 form of a `ghp_`-prefixed token,
+// runs `kingfisher scan` on the directory with JSON output, and checks that
+// both the decoded token and an `"encoding": "base64"` field appear on stdout.
+#[test]
+fn detects_base64_encoded_secret() -> anyhow::Result<()> {
+    let dir = tempdir()?;
+    let file_path = dir.path().join("secret.txt");
+    // Base64 for ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa
+    let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDJxTHFQYQ==";
+    fs::write(&file_path, encoded)?;
+
+    Command::cargo_bin("kingfisher")?
+        .args([
+            "scan",
+            dir.path().to_str().unwrap(),
+            "--no-binary",
+            "--confidence=low",
+            "--format",
+            "json",
+            "--no-update-check",
+        ])
+        .assert()
+        // NOTE(review): presumably 200 is the exit status the CLI uses when
+        // findings are reported - confirm against the CLI's exit-code table.
+        .code(200)
+        .stdout(
+            // The plaintext secret only exists after decoding, so its presence
+            // shows the Base64 path ran; the second check pins the JSON field.
+            predicate::str::contains("ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa")
+                .and(predicate::str::contains("\"encoding\": \"base64\"")),
+        );
+
+    // Close the temp dir explicitly so a cleanup failure propagates via `?`.
+    dir.close()?;
+    Ok(())
+}