preparing for v1.100.0

This commit is contained in:
Mick Grove 2026-05-18 09:42:04 -07:00
commit 1636b07810
10 changed files with 296 additions and 88 deletions

1
.gitignore vendored
View file

@ -23,6 +23,7 @@ logs/*
!testdata/html_embedded_vulnerable.html
!docs/viewer/index.html
!docs-site/overrides/*.html
private-notes/
*.dot
fuzz/*
!fuzz/Cargo.toml

View file

@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file.
## [v1.100.0]
- Archive scanning now reaches inside Android/iOS app packages: added `apk`, `aab`, and `ipa` to the recognized ZIP-based archive formats so secrets embedded in APK/AAB/IPA contents (e.g. `classes*.dex`, `res/values/strings.xml`) are extracted and matched.
- Git repository scans now extract archive blobs encountered in the object database, not just on the filesystem. Previously a `.zip`/`.jar`/`.apk`/`.tar.gz` committed to a repo was scanned as raw compressed bytes, so secrets inside it were invisible. The git enumerator fans each archive entry out as a synthetic `<archive>!<entry>` blob with the original commit metadata. Honors `--no-extract-archives` for opt-out.
- Fixed tar-wrapped archive extraction for `.tgz` and `.tar.*` files, and made dependent credential validation deduplication preserve per-occurrence context so repeated secrets validate with the correct nearby companion value.
- Performance: ZIP-based git blobs ≤ 64 MB extract entirely in memory (no temp-file round trip), beating the v1.99.0 baseline by ~15% on an 80 GiB monorepo despite scanning ~300K additional archive-content blobs. Larger archives automatically fall back to a disk-streaming extractor.
- Memory safety: hard caps on archive extraction — 64 MB compressed pre-flight, 256 MB aggregate decompressed per archive (in-memory and disk paths), 512 MB per entry, plus a `PK\x03\x04` magic-byte gate. Worst-case footprint is bounded at ~`num_jobs * 320 MB`.
- Release binary trimmed from 34 MB to 26 MB (~24% smaller). Switched `jsonwebtoken` to its `rust_crypto` backend (eliminates our scanner's pull on `aws-lc-rs`), bumped workspace `hmac` 0.12→0.13, `sha1` 0.10→0.11, `sha2` 0.10→0.11 to deduplicate our internal crypto code with the AWS sigv4 side, and migrated affected call sites in `kingfisher-core`, `kingfisher-rules`, and `kingfisher-scanner` to the digest-0.11 API (`hex::encode` for hex digests, explicit `KeyInit` import for HMAC).

View file

@ -5,16 +5,15 @@ rules:
(?x)
\b
(
(?:A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)
(?:A3T[A-Z0-9]|AKIA|ASIA)
[A-Z0-9]{16}
)
\b
pattern_requirements:
min_digits: 1
ignore_if_contains:
- "EXAMPLE"
- "TEST"
min_entropy: 3.2
min_entropy: 3.0
visible: false
confidence: medium
examples:
@ -25,14 +24,14 @@ rules:
pattern: |
(?xi)
(?:
\b
(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)
(?:\b|_)
(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|ASIA)
(?:.|[\n\r]){0,64}?
[^A-Za-z0-9_+!@\#$%^&*()\]./]
([A-Za-z0-9/+]{40})
[^A-Za-z0-9_+!@\#$%^&*()\]./]
|
\b(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)
(?:\b|_)(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|ASIA)
(?:.|[\n\r]){0,96}?
(?:SECRET|PRIVATE|ACCESS)
(?:.|[\n\r]){0,16}?

View file

@ -5,12 +5,13 @@ rules:
# Matches keys starting with 'pa-' followed by 43 URL-safe base64 characters
pattern: |
(?x)
\b
(
pa-[a-zA-Z0-9\-_]{43}
)
\b
min_entropy: 4.0
confidence: high
confidence: medium
examples:
- pa-r4yuCYCuPhNO-10Lu9aO7dR4jxUWlLmlUjm_NOVVdSs
validation:
@ -22,7 +23,56 @@ rules:
headers:
Authorization: "Bearer {{ TOKEN }}"
response_matcher:
# 200 = key has /v1/files permission, 403 = valid key without that permission
# (e.g. an inference-only key). 401 with "Provided API key is invalid." is the
# only response Voyage AI returns for a bad key, so any non-401 status is live.
- type: StatusMatch
status: [200]
status: [401]
negative: true
- type: WordMatch
words:
- "Provided API key is invalid"
negative: true
references:
- https://docs.voyageai.com/reference
- https://docs.voyageai.com/reference
- https://docs.voyageai.com/docs/api-key-and-installation
- name: Voyage AI API Key
id: kingfisher.voyageai.api_key.2
description: Detects Voyage AI API keys (al- prefix variant) used for embedding and retrieval models.
# Matches keys starting with 'al-' followed by 43 URL-safe base64 characters
pattern: |
(?x)
\b
(
al-[a-zA-Z0-9\-_]{43}
)
\b
min_entropy: 4.0
confidence: medium
examples:
- al-Qf7M2bZ8xnLpvE4hRcDsJtAo1KyU93WgIBmXrNVoYTu
validation:
type: Http
content:
request:
method: GET
url: https://api.voyageai.com/v1/files
headers:
Authorization: "Bearer {{ TOKEN }}"
response_matcher:
# 200 = key has /v1/files permission, 403 = valid key without that permission.
# 401 with "Provided API key is invalid." is the only invalid-key response.
- type: StatusMatch
status: [401]
negative: true
- type: WordMatch
words:
- "Provided API key is invalid"
negative: true
references:
- https://docs.voyageai.com/reference
- https://docs.voyageai.com/docs/api-key-and-installation
# NOTE: Revocation is not implemented because Voyage AI does not document a public REST
# endpoint for programmatic API key revocation. All probed admin/key-management paths
# under api.voyageai.com return 404. Keys must be revoked via the Voyage AI dashboard.

View file

@ -200,9 +200,7 @@ pub fn validate_aws_credentials_input(access_key_id: &str, secret_key: &str) ->
return Err("AWS access key ID contains invalid characters".to_string());
}
let prefix = &access_key_id[..4];
let valid_prefix =
matches!(prefix, "AKIA" | "AGPA" | "AIDA" | "AROA" | "AIPA" | "ANPA" | "ANVA" | "ASIA")
|| prefix.starts_with("A3T");
let valid_prefix = matches!(prefix, "AKIA" | "ASIA") || prefix.starts_with("A3T");
if !valid_prefix {
return Err("Invalid AWS access key ID format".to_string());
}

View file

@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file.
## [v1.100.0]
- Archive scanning now reaches inside Android/iOS app packages: added `apk`, `aab`, and `ipa` to the recognized ZIP-based archive formats so secrets embedded in APK/AAB/IPA contents (e.g. `classes*.dex`, `res/values/strings.xml`) are extracted and matched.
- Git repository scans now extract archive blobs encountered in the object database, not just on the filesystem. Previously a `.zip`/`.jar`/`.apk`/`.tar.gz` committed to a repo was scanned as raw compressed bytes, so secrets inside it were invisible. The git enumerator fans each archive entry out as a synthetic `<archive>!<entry>` blob with the original commit metadata. Honors `--no-extract-archives` for opt-out.
- Fixed tar-wrapped archive extraction for `.tgz` and `.tar.*` files, and made dependent credential validation deduplication preserve per-occurrence context so repeated secrets validate with the correct nearby companion value.
- Performance: ZIP-based git blobs ≤ 64 MB extract entirely in memory (no temp-file round trip), beating the v1.99.0 baseline by ~15% on an 80 GiB monorepo despite scanning ~300K additional archive-content blobs. Larger archives automatically fall back to a disk-streaming extractor.
- Memory safety: hard caps on archive extraction — 64 MB compressed pre-flight, 256 MB aggregate decompressed per archive (in-memory and disk paths), 512 MB per entry, plus a `PK\x03\x04` magic-byte gate. Worst-case footprint is bounded at ~`num_jobs * 320 MB`.
- Release binary trimmed from 34 MB to 26 MB (~24% smaller). Switched `jsonwebtoken` to its `rust_crypto` backend (eliminates our scanner's pull on `aws-lc-rs`), bumped workspace `hmac` 0.12→0.13, `sha1` 0.10→0.11, `sha2` 0.10→0.11 to deduplicate our internal crypto code with the AWS sigv4 side, and migrated affected call sites in `kingfisher-core`, `kingfisher-rules`, and `kingfisher-scanner` to the digest-0.11 API (`hex::encode` for hex digests, explicit `KeyInit` import for HMAC).

View file

@ -23,18 +23,18 @@ pub const ZIP_BASED_FORMATS: &[&str] = &[
"kmz", "widget", "xpi", "sketch", "pages", "key", "numbers", "hwpx",
];
/// Break `<name>.<outer>.<inner>` into `(Some(outer), Some(inner))`.
/// For `foo.tar.gz` this returns `("tar", "gz")`.
fn split_extensions(path: &Path) -> (Option<String>, Option<String>) {
let ext_inner = path.extension().and_then(|e| e.to_str()).map(|s| s.to_ascii_lowercase());
fn is_tar_wrapped_compression(path: &Path) -> bool {
let filename = match path.file_name().and_then(|s| s.to_str()) {
Some(name) => name.to_ascii_lowercase(),
None => return false,
};
let ext_outer = path
.file_stem()
.and_then(|s| Path::new(s).extension())
.and_then(|e| e.to_str())
.map(|s| s.to_ascii_lowercase());
(ext_outer, ext_inner)
filename.ends_with(".tgz")
|| filename.ends_with(".tar.gz")
|| filename.ends_with(".tar.gzip")
|| filename.ends_with(".tar.bz2")
|| filename.ends_with(".tar.bzip2")
|| filename.ends_with(".tar.xz")
}
#[derive(Debug)]
@ -450,7 +450,7 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result<CompressedCon
return handle_zip_archive_streaming(&mut file, path, temp.path());
}
}
"gz" | "gzip" => {
"gz" | "gzip" | "tgz" => {
let out_path = make_output_path(path, base_dir, "decomp.tar");
let decoder = GzDecoder::new(BufReader::new(safe_open_for_read(path)?));
return stream_to_file(decoder, &out_path);
@ -487,12 +487,13 @@ pub fn decompress_file(path: &Path, base_dir: Option<&Path>) -> Result<Compresse
let mut owned_buf: Option<PathBuf>;
loop {
let should_extract_tar = is_tar_wrapped_compression(current_path);
let content = decompress_once(current_path, base_dir)?;
// If the step produced a single on-disk file that is itself a .tar,
// recurse on that file.
if let CompressedContent::RawFile(ref p) = content {
if split_extensions(p).0.as_deref() == Some("tar") {
if should_extract_tar {
owned_buf = Some(p.clone()); // own the path
current_path = owned_buf.as_ref().unwrap();
continue;
@ -570,7 +571,7 @@ mod tests {
use tempfile::tempdir;
use zip::{CompressionMethod, ZipWriter, write::SimpleFileOptions};
use super::{CompressedContent, decompress_once};
use super::{CompressedContent, decompress_file_to_temp, decompress_once};
/// 1) Fully unpack:
/// - 1st decompress `.gz` -- get a `.tar` file
@ -627,6 +628,45 @@ mod tests {
Ok(())
}
/// Smoke test for `.tgz` input: builds a gzip-compressed, single-entry tar named
/// `payload.tgz`, runs it through `decompress_file_to_temp`, and expects an
/// `ArchiveFiles` result containing an entry whose logical name ends with
/// `payload.tgz!secret.txt` and whose contents include the planted token.
#[test]
fn smoke_decompress_tgz_archive() -> anyhow::Result<()> {
    let workdir = tempdir()?;
    let archive_path = workdir.path().join("payload.tgz");
    let github_pat = "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"; // this is not a real secret

    // Write a one-entry tar stream through a gzip encoder into `payload.tgz`.
    {
        let payload = format!("token={github_pat}\n");
        let mut entry_header = tar::Header::new_gnu();
        entry_header.set_size(payload.len() as u64);
        entry_header.set_mode(0o644);
        entry_header.set_cksum();

        let encoder = GzEncoder::new(File::create(&archive_path)?, Compression::default());
        let mut builder = Builder::new(encoder);
        builder.append_data(&mut entry_header, "secret.txt", payload.as_bytes())?;
        // Finish the tar stream first, then the gzip stream it was written into.
        builder.into_inner()?.finish()?;
    }

    // `_tmp` is kept bound until the end of the test so the extracted files stay readable.
    let (content, _tmp) = decompress_file_to_temp(&archive_path)?;
    match content {
        CompressedContent::ArchiveFiles(files) => {
            let mut hits = 0usize;
            for (logical, path) in files {
                if !logical.ends_with("payload.tgz!secret.txt") {
                    continue;
                }
                let text = std::fs::read_to_string(&path)?;
                assert!(text.contains(github_pat));
                hits += 1;
            }
            assert!(hits > 0, "did not find secret.txt in tgz ArchiveFiles");
        }
        other => panic!("expected ArchiveFiles for tgz archive, got {:?}", other),
    }

    Ok(())
}
/// 2) No-extract flag: just peel the `.gz` layer (no base_dir -- use NamedTempFile), and verify
/// you get back a RawFile, whose contents are the tar archive itself.
#[test]

View file

@ -983,6 +983,18 @@ fn build_cache_key(
// For demonstration, we'll take a simplistic approach
// You can adapt from your existing logic
let capture0 = om.captures.captures.get(0).map_or(String::new(), |c| c.raw_value().to_string());
if !om.rule.syntax().depends_on_rule.is_empty() {
return format!(
"{}|{}|{}|{}|{}",
om.rule.name(),
capture0,
om.blob_id,
om.matching_input_offset_span.start,
om.matching_input_offset_span.end
);
}
format!("{}|{}|{}", om.rule.name(), capture0, dep_vars_str)
}

View file

@ -15,7 +15,7 @@ use http::StatusCode;
use liquid::Object;
use liquid_core::{Value, ValueView};
use reqwest::{Client, Url, header, header::HeaderValue, multipart};
use rustc_hash::FxHashMap;
use rustc_hash::{FxHashMap, FxHashSet};
use tokio::{sync::Notify, time};
use tracing::{debug, trace};
@ -260,7 +260,9 @@ type Cache = kingfisher_scanner::validation::Cache;
/// Returns an opaque 64-bit key for internal validation deduplication.
///
/// This is an INTERNAL key used only for validation deduplication within a single scan.
/// It uses `captures.get(0)` to get the primary secret value.
/// It uses `captures.get(0)` to get the primary secret value. Rules with dependent
/// variables also include blob location because validation can depend on nearby context
/// such as an AWS access-key ID paired with a secret access key.
///
/// **Important**: This is distinct from the EXTERNAL `finding_fingerprint` used for:
/// - Baseline comparisons across scans
@ -279,6 +281,13 @@ fn validation_dedup_key(m: &OwnedBlobMatch) -> u64 {
if let Some(val) = capture_value {
val.hash(&mut hasher);
}
if !m.rule.syntax().depends_on_rule.is_empty() {
m.blob_id.hash(&mut hasher);
m.matching_input_offset_span.start.hash(&mut hasher);
m.matching_input_offset_span.end.hash(&mut hasher);
}
let key = hasher.finish();
trace!(
@ -693,7 +702,7 @@ async fn timed_validate_single_match<'a>(
validate_jwt_rule(m, &captured_values, use_lax_tls, clients.allow_internal_ips).await;
}
Some(Validation::AWS) => {
validate_aws_rule(m, &captured_values, cache).await;
validate_aws_rule(m, &captured_values, dependent_variables, cache).await;
}
Some(Validation::GCP) => {
validate_gcp_rule(m, &globals, cache).await;
@ -1391,6 +1400,7 @@ async fn validate_jwt_rule(
async fn validate_aws_rule(
m: &mut OwnedBlobMatch,
captured_values: &[(String, String, usize, usize)],
dependent_variables: &FxHashMap<String, Vec<(String, OffsetSpan)>>,
cache: &Cache,
) {
let secret = captured_values
@ -1398,10 +1408,8 @@ async fn validate_aws_rule(
.find(|(n, ..)| n == "TOKEN")
.map(|(_, v, ..)| v.clone())
.unwrap_or_default();
let akid =
utils::find_closest_variable(captured_values, &secret, "TOKEN", "AKID").unwrap_or_default();
if akid.is_empty() || secret.is_empty() {
if secret.is_empty() {
m.validation_success = false;
m.validation_response_body =
validation_body::from_string("Missing AWS access-key ID or secret.".to_string());
@ -1409,77 +1417,171 @@ async fn validate_aws_rule(
return;
}
let cache_key = aws::generate_aws_cache_key(&akid, &secret);
if let Some(cached) = cache.get(&cache_key) {
let c = cached.value();
if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
m.validation_success = c.is_valid;
m.validation_response_body = c.body.clone();
m.validation_response_status = c.status;
return;
}
}
let akid_candidates = aws_akid_candidates(
captured_values,
dependent_variables.get("AKID"),
m.matching_input_offset_span,
&secret,
);
if let Some(account_id) = aws::should_skip_aws_validation(&akid) {
m.validation_success = false;
m.validation_response_body = validation_body::from_string(format!(
"(skip list entry) AWS validation not attempted for account {}.",
account_id
));
m.validation_response_status = StatusCode::PRECONDITION_REQUIRED;
cache.insert(
cache_key,
CachedResponse {
body: m.validation_response_body.clone(),
status: m.validation_response_status,
is_valid: m.validation_success,
timestamp: Instant::now(),
},
);
return;
}
if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
if akid_candidates.is_empty() {
m.validation_success = false;
m.validation_response_body =
validation_body::from_string(format!("Invalid AWS credentials ({}): {}", akid, e));
validation_body::from_string("Missing AWS access-key ID or secret.".to_string());
m.validation_response_status = StatusCode::BAD_REQUEST;
return;
}
match aws::validate_aws_credentials(&akid, &secret).await {
Ok((ok, msg)) => {
m.validation_success = ok;
if ok {
let mut body = format!("{} --- ARN: {}", akid, msg);
if let Ok(acct) = aws::aws_key_to_account_number(&akid) {
body.push_str(&format!(" --- AWS Account Number: {:012}", acct));
let mut last_body = None;
let mut last_status = StatusCode::UNAUTHORIZED;
for akid in akid_candidates {
let cache_key = aws::generate_aws_cache_key(&akid, &secret);
if let Some(cached) = cache.get(&cache_key) {
let c = cached.value();
if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
if c.is_valid {
m.validation_success = c.is_valid;
m.validation_response_body = c.body.clone();
m.validation_response_status = c.status;
return;
}
m.validation_response_body = validation_body::from_string(body);
m.validation_response_status = StatusCode::OK;
} else {
m.validation_response_body = validation_body::from_string(format!(
"AWS validation error ({}): {}",
akid, msg
));
m.validation_response_status = StatusCode::UNAUTHORIZED;
last_body = Some(c.body.clone());
last_status = c.status;
continue;
}
}
if let Some(account_id) = aws::should_skip_aws_validation(&akid) {
let body = validation_body::from_string(format!(
"(skip list entry) AWS validation not attempted for account {}.",
account_id
));
cache.insert(
cache_key,
CachedResponse {
body: m.validation_response_body.clone(),
status: m.validation_response_status,
is_valid: m.validation_success,
body: body.clone(),
status: StatusCode::PRECONDITION_REQUIRED,
is_valid: false,
timestamp: Instant::now(),
},
);
last_body = Some(body);
last_status = StatusCode::PRECONDITION_REQUIRED;
continue;
}
Err(e) => {
m.validation_success = false;
m.validation_response_body =
validation_body::from_string(format!("AWS validation error ({}): {}", akid, e));
m.validation_response_status = StatusCode::BAD_GATEWAY;
if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
let body =
validation_body::from_string(format!("Invalid AWS credentials ({}): {}", akid, e));
cache.insert(
cache_key,
CachedResponse {
body: body.clone(),
status: StatusCode::BAD_REQUEST,
is_valid: false,
timestamp: Instant::now(),
},
);
last_body = Some(body);
last_status = StatusCode::BAD_REQUEST;
continue;
}
match aws::validate_aws_credentials(&akid, &secret).await {
Ok((ok, msg)) => {
if ok {
m.validation_success = true;
let mut body = format!("{} --- ARN: {}", akid, msg);
if let Ok(acct) = aws::aws_key_to_account_number(&akid) {
body.push_str(&format!(" --- AWS Account Number: {:012}", acct));
}
m.validation_response_body = validation_body::from_string(body);
m.validation_response_status = StatusCode::OK;
cache.insert(
cache_key,
CachedResponse {
body: m.validation_response_body.clone(),
status: m.validation_response_status,
is_valid: true,
timestamp: Instant::now(),
},
);
return;
}
let body = validation_body::from_string(format!(
"AWS validation error ({}): {}",
akid, msg
));
cache.insert(
cache_key,
CachedResponse {
body: body.clone(),
status: StatusCode::UNAUTHORIZED,
is_valid: false,
timestamp: Instant::now(),
},
);
last_body = Some(body);
last_status = StatusCode::UNAUTHORIZED;
}
Err(e) => {
last_body = Some(validation_body::from_string(format!(
"AWS validation error ({}): {}",
akid, e
)));
last_status = StatusCode::BAD_GATEWAY;
}
}
}
m.validation_success = false;
m.validation_response_body = last_body.unwrap_or_else(|| {
validation_body::from_string("AWS validation failed for all nearby access-key IDs.")
});
m.validation_response_status = last_status;
}
/// Builds the ordered list of access-key-ID candidates to try against `secret`.
///
/// * The value returned by `utils::find_closest_variable` (looking up `AKID`
///   relative to `TOKEN` in `captured_values`), if any, is ranked at distance 0.
/// * Every entry of `dependent_akids` is ranked by `dependency_distance` between
///   its span and `target_span`.
///
/// Candidates are sorted by ascending distance (the sort is stable, so ties keep
/// insertion order), repeated values are dropped after their first occurrence,
/// and at most 64 values are returned.
fn aws_akid_candidates(
    captured_values: &[(String, String, usize, usize)],
    dependent_akids: Option<&Vec<(String, OffsetSpan)>>,
    target_span: OffsetSpan,
    secret: &str,
) -> Vec<String> {
    let nearest =
        utils::find_closest_variable(captured_values, &secret.to_string(), "TOKEN", "AKID")
            .map(|akid| (0usize, akid));

    // `Option<&Vec<_>>` flattens to zero or more `&(value, span)` items.
    let dependents = dependent_akids
        .into_iter()
        .flatten()
        .map(|(akid, span)| (dependency_distance(*span, target_span), akid.clone()));

    let mut ranked: Vec<(usize, String)> = nearest.into_iter().chain(dependents).collect();
    ranked.sort_by_key(|(distance, _)| *distance);

    let mut emitted = FxHashSet::default();
    let mut ordered = Vec::new();
    for (_, akid) in ranked {
        // Cap the number of candidates handed back to the caller.
        if ordered.len() == 64 {
            break;
        }
        if emitted.insert(akid.clone()) {
            ordered.push(akid);
        }
    }
    ordered
}
/// Returns the gap, in offset units, between `span` and `target_span`.
///
/// * `span` ends at or before the start of `target_span`: the gap is
///   `target_span.start - span.end`.
/// * otherwise, `span` starts at or after the end of `target_span`: the gap is
///   `span.start - target_span.end`.
/// * otherwise (the spans overlap): the gap is 0.
fn dependency_distance(span: OffsetSpan, target_span: OffsetSpan) -> usize {
    // `checked_sub` yields `Some` exactly when the minuend is >= the subtrahend,
    // which mirrors the two "entirely before" / "entirely after" comparisons.
    let gap_when_before = target_span.start.checked_sub(span.end);
    let gap_when_after = span.start.checked_sub(target_span.end);
    gap_when_before.or(gap_when_after).unwrap_or(0)
}

View file

@ -21,7 +21,11 @@ fn smoke_scan_tar_gz_archive() -> anyhow::Result<()> {
let mut t = Builder::new(gz);
let data = format!("token={github_pat}\n");
t.append_data(&mut tar::Header::new_gnu(), "secret.txt", data.as_bytes())?;
let mut header = tar::Header::new_gnu();
header.set_size(data.len() as u64);
header.set_mode(0o644);
header.set_cksum();
t.append_data(&mut header, "secret.txt", data.as_bytes())?;
t.into_inner()?.finish()?;
}