Merge pull request #382 from mongodb/development

v1.100.0
This commit is contained in:
Mick Grove 2026-05-19 04:16:57 -07:00 committed by GitHub
commit e332d4eebb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
33 changed files with 1846 additions and 950 deletions

View file

@ -132,7 +132,7 @@ jobs:
toolchain: ${{ env.RUST_TOOLCHAIN }}
- name: Set up MSYS2
uses: msys2/setup-msys2@cafece8e6baf9247cf9b1bf95097b0b983cc558d # v2.31.0
uses: msys2/setup-msys2@e9898307ac31d1a803454791be09ab9973336e1c # v2.31.1
with:
msystem: ${{ matrix.msystem }}
update: true

View file

@ -274,7 +274,7 @@ jobs:
toolchain: ${{ env.RUST_TOOLCHAIN }}
- name: Set up MSYS2
uses: msys2/setup-msys2@cafece8e6baf9247cf9b1bf95097b0b983cc558d # v2.31.0
uses: msys2/setup-msys2@e9898307ac31d1a803454791be09ab9973336e1c # v2.31.1
with:
msystem: ${{ matrix.msystem }}
update: true

1
.gitignore vendored
View file

@ -23,6 +23,7 @@ logs/*
!testdata/html_embedded_vulnerable.html
!docs/viewer/index.html
!docs-site/overrides/*.html
private-notes/
*.dot
fuzz/*
!fuzz/Cargo.toml

View file

@ -2,6 +2,14 @@
All notable changes to this project will be documented in this file.
## [v1.100.0]
- Archive scanning now reaches inside Android/iOS app packages: added `apk`, `aab`, and `ipa` to the recognized ZIP-based archive formats so secrets embedded in APK/AAB/IPA contents (e.g. `classes*.dex`, `res/values/strings.xml`) are extracted and matched.
- Git repository scans now extract archive blobs encountered in the object database, not just on the filesystem. Previously a `.zip`/`.jar`/`.apk`/`.tar.gz` committed to a repo was scanned as raw compressed bytes, so secrets inside it were invisible. The git enumerator fans each archive entry out as a synthetic `<archive>!<entry>` blob with the original commit metadata. Honors `--no-extract-archives` for opt-out.
- Fixed tar-wrapped archive extraction for `.tgz` and `.tar.*` files, and made dependent credential validation deduplication preserve per-occurrence context so repeated secrets validate with the correct nearby companion value.
- Performance: ZIP-based git blobs ≤ 64 MB extract entirely in memory (no temp-file round trip), beating the v1.99.0 baseline by ~15% on an 80 GiB monorepo despite scanning ~300K additional archive-content blobs. Larger archives automatically fall back to a disk-streaming extractor.
- Memory safety: hard caps on archive extraction — 64 MB compressed pre-flight, 256 MB aggregate decompressed per archive (in-memory and disk paths), 512 MB per entry, plus a `PK\x03\x04` magic-byte gate. Worst-case footprint is bounded at ~`num_jobs * 320 MB`.
- Release binary trimmed from 34 MB to 26 MB (~24% smaller). Switched `jsonwebtoken` to its `rust_crypto` backend (eliminates our scanner's pull on `aws-lc-rs`), bumped workspace `hmac` 0.12→0.13, `sha1` 0.10→0.11, `sha2` 0.10→0.11 to deduplicate our internal crypto code with the AWS sigv4 side, and migrated affected call sites in `kingfisher-core`, `kingfisher-rules`, and `kingfisher-scanner` to the digest-0.11 API (`hex::encode` for hex digests, explicit `KeyInit` import for HMAC).
## [v1.99.0]
- Fixed [#371](https://github.com/mongodb/kingfisher/issues/371): `pip install kingfisher-bin` on glibc Linux distros (Ubuntu, Debian, RHEL, Fedora, …) installed a macOS Mach-O binary and failed with `OSError: [Errno 8] Exec format error`. Linux wheels are now tagged `manylinux_2_17_<arch>.musllinux_1_2_<arch>` (instead of `musllinux_1_2_<arch>` only), so pip accepts them on both glibc-2.17+ and musl distros. The `pypi/hatch_build.py` hook now hard-fails when `KINGFISHER_PYPI_WHEEL_TAG` is unset, and the publish workflow refuses to upload any `py3-none-any.whl`, so the v1.92.0-era pure-Python wheel cannot recur.
- `--self-update` (alias `--update`) on a scan or other command now **re-execs into the freshly installed binary** so the current invocation completes with the new code and the latest detection rules. Previously the on-disk binary was replaced but the running process kept using the old in-memory version, requiring a second invocation to pick up the changes. On Unix this is a true `exec()` (same PID); on Windows the new binary is spawned and the parent exits with its status code. The explicit `kingfisher self-update` subcommand still updates and exits without re-execing. Self-update now also covers Windows arm64 (the asset was already published; the runtime cfg map gained the missing arm). See `docs/ADVANCED.md`*Update Checks*.

857
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -32,9 +32,9 @@ xxhash-rust = { version = "0.8", features = ["xxh3"] }
ignore = "0.4"
walkdir = "2.5"
include_dir = "0.7"
sha1 = "0.10"
sha2 = "0.10"
hmac = "0.12"
sha1 = "0.11"
sha2 = "0.11"
hmac = "0.13"
base32 = "0.5.1"
base64 = "0.22"
percent-encoding = "2.3"
@ -48,7 +48,7 @@ http = "1.4"
[package]
name = "kingfisher"
version = "1.99.0"
version = "1.100.0"
description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
edition.workspace = true
rust-version.workspace = true
@ -148,15 +148,16 @@ flate2 = "1.1"
thousands = "0.2.0"
crossbeam-skiplist = "0.1.3"
tokio-postgres = { version = "0.7", default-features = false, features = ["runtime"] }
mongodb = { version = "3.4", default-features = false, features = ["rustls-tls", "aws-auth", "compat-3-0-0", "dns-resolver"] }
# Temporary Git pin: keeps MongoDB SRV validation enabled while using the upstream
# Hickory 0.26 DNS-resolver fix before it is available in a crates.io release.
# TODO: switch back to a crates.io mongodb release once it includes that fix.
mongodb = { git = "https://github.com/mongodb/mongo-rust-driver", rev = "bdddefc50c4794d51d10b944320d42c6eb216b04", default-features = false, features = ["rustls-tls", "aws-auth", "compat-3-0-0", "dns-resolver"] }
mysql_async = { version = "0.36.2", default-features = false, features = ["default-rustls"] }
aws-config = { version = "1.8.14", default-features = false, features = ["default-https-client", "rt-tokio", "credentials-process", "sso"] }
aws-credential-types = "1.2.12"
aws-sdk-sts = { version = "1.98.0", default-features = false, features = ["default-https-client", "rt-tokio"] }
aws-types = "1.3.12"
parking_lot = "0.12.5"
octorust = "0.10.0"
reqwest-middleware-octorust = { package = "reqwest-middleware", version = "0.4.2" }
tracing-subscriber = {version = "0.3.22", features = ["env-filter"] }
tracing-core = "0.1.35"
aws-smithy-http-client = "1.1.10"
@ -181,9 +182,9 @@ futures = "0.3.31"
dashmap = "6.1.0"
xxhash-rust = { version = "0.8.15", features = ["xxh3", "const_xxh3"] }
serde_yaml = "0.9.34"
hmac = "0.13.0"
hmac = { workspace = true }
sha1 = { workspace = true }
sha2 = "0.11.0"
sha2 = { workspace = true }
humantime = "2.3.0"
path-dedot = "3.1.1"
quick-xml = { version = "0.39.2", features = ["serde", "serialize"] }
@ -251,11 +252,11 @@ proptest = "1.9.0"
[profile.release]
debug = false
strip = true #"debuginfo"
opt-level = 3 # Maximum optimization for performance
strip = true
opt-level = "s" # Optimize for smaller binary size over speed
lto = true # Enable Link Time Optimization
codegen-units = 1 # Optimize for size but slower compilation
panic = "abort" # Remove unwind tables for panics
panic = "abort" # Remove unwind tables for panics
rpath = false # Don't embed path dependencies
incremental = false

View file

@ -11,7 +11,7 @@
use std::{
convert::TryInto,
fs::File,
io::{Read, Write},
io::Read,
path::Path,
sync::{
Arc, OnceLock,
@ -235,7 +235,7 @@ impl BlobId {
pub fn new(input: &[u8]) -> Self {
const CHUNK: usize = 64 * 1024; // 64KB from start and end
let mut hasher = Sha1::new();
write!(&mut hasher, "blob {}\0", input.len()).unwrap();
update_git_blob_header(&mut hasher, input.len());
if input.len() <= CHUNK * 2 {
hasher.update(input);
} else {
@ -249,7 +249,7 @@ impl BlobId {
/// Computes a `BlobId` from the complete bytes (no truncation).
pub fn compute_from_bytes(bytes: &[u8]) -> Self {
let mut hasher = Sha1::new();
write!(&mut hasher, "blob {}\0", bytes.len()).unwrap();
update_git_blob_header(&mut hasher, bytes.len());
hasher.update(bytes);
let digest: [u8; 20] = hasher.finalize().into();
BlobId(digest)
@ -277,6 +277,27 @@ impl BlobId {
}
}
/// Feeds the header `blob <len>\0` into `hasher`, where `<len>` is `len`
/// written in decimal ASCII.
///
/// The digits are rendered into a fixed stack buffer, filled from the back,
/// so no formatting machinery or heap allocation is involved.
fn update_git_blob_header(hasher: &mut Sha1, len: usize) {
    // 20 bytes is enough for every decimal digit of a 64-bit value.
    let mut buf = [0u8; 20];
    let mut pos = buf.len();
    let mut remaining = len;

    // Emit the least-significant digit first, walking towards the front of
    // the buffer. Running the body at least once makes `len == 0` yield "0".
    loop {
        pos -= 1;
        buf[pos] = b'0' + (remaining % 10) as u8;
        remaining /= 10;
        if remaining == 0 {
            break;
        }
    }

    hasher.update(b"blob ");
    hasher.update(&buf[pos..]);
    hasher.update(b"\0");
}
impl<'de> Deserialize<'de> for BlobId {
fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
struct Vis;

View file

@ -41,6 +41,7 @@ crc32fast = "1.5"
hmac.workspace = true
sha1.workspace = true
sha2.workspace = true
hex.workspace = true
percent-encoding.workspace = true
time.workspace = true
uuid = { workspace = true, features = ["v4"] }

View file

@ -5,16 +5,15 @@ rules:
(?x)
\b
(
(?:A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)
(?:A3T[A-Z0-9]|AKIA|ASIA)
[A-Z0-9]{16}
)
\b
pattern_requirements:
min_digits: 1
ignore_if_contains:
- "EXAMPLE"
- "TEST"
min_entropy: 3.2
min_entropy: 3.0
visible: false
confidence: medium
examples:
@ -25,14 +24,14 @@ rules:
pattern: |
(?xi)
(?:
\b
(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)
(?:\b|_)
(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|ASIA)
(?:.|[\n\r]){0,64}?
[^A-Za-z0-9_+!@\#$%^&*()\]./]
([A-Za-z0-9/+]{40})
[^A-Za-z0-9_+!@\#$%^&*()\]./]
|
\b(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)
(?:\b|_)(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|ASIA)
(?:.|[\n\r]){0,96}?
(?:SECRET|PRIVATE|ACCESS)
(?:.|[\n\r]){0,16}?

View file

@ -5,12 +5,13 @@ rules:
# Matches keys starting with 'pa-' followed by 43 URL-safe base64 characters
pattern: |
(?x)
\b
(
pa-[a-zA-Z0-9\-_]{43}
)
\b
min_entropy: 4.0
confidence: high
confidence: medium
examples:
- pa-r4yuCYCuPhNO-10Lu9aO7dR4jxUWlLmlUjm_NOVVdSs
validation:
@ -22,7 +23,56 @@ rules:
headers:
Authorization: "Bearer {{ TOKEN }}"
response_matcher:
# 200 = key has /v1/files permission, 403 = valid key without that permission
# (e.g. an inference-only key). 401 with "Provided API key is invalid." is the
# only response Voyage AI returns for a bad key, so any non-401 status is live.
- type: StatusMatch
status: [200]
status: [401]
negative: true
- type: WordMatch
words:
- "Provided API key is invalid"
negative: true
references:
- https://docs.voyageai.com/reference
- https://docs.voyageai.com/reference
- https://docs.voyageai.com/docs/api-key-and-installation
- name: Voyage AI API Key
id: kingfisher.voyageai.api_key.2
description: Detects Voyage AI API keys (al- prefix variant) used for embedding and retrieval models.
# Matches keys starting with 'al-' followed by 43 URL-safe base64 characters
pattern: |
(?x)
\b
(
al-[a-zA-Z0-9\-_]{43}
)
\b
min_entropy: 4.0
confidence: medium
examples:
- al-Qf7M2bZ8xnLpvE4hRcDsJtAo1KyU93WgIBmXrNVoYTu
validation:
type: Http
content:
request:
method: GET
url: https://api.voyageai.com/v1/files
headers:
Authorization: "Bearer {{ TOKEN }}"
response_matcher:
# 200 = key has /v1/files permission, 403 = valid key without that permission.
# 401 with "Provided API key is invalid." is the only invalid-key response.
- type: StatusMatch
status: [401]
negative: true
- type: WordMatch
words:
- "Provided API key is invalid"
negative: true
references:
- https://docs.voyageai.com/reference
- https://docs.voyageai.com/docs/api-key-and-installation
# NOTE: Revocation is not implemented because Voyage AI does not document a public REST
# endpoint for programmatic API key revocation. All probed admin/key-management paths
# under api.voyageai.com return 404. Keys must be revoked via the Voyage AI dashboard.

View file

@ -2,7 +2,7 @@
use base64::{Engine, engine::general_purpose};
use crc32fast::Hasher;
use hmac::{Hmac, Mac};
use hmac::{Hmac, KeyInit, Mac};
use liquid_core::{
Display_filter, Error as LiquidError, Expression, Filter, FilterParameters, FilterReflection,
FromFilterParameters, ParseFilter, Result, Runtime, Value, ValueView,
@ -536,7 +536,7 @@ static_filter!(
|input: &dyn ValueView| -> String {
let mut h = Sha256::new();
h.update(input.to_kstr().as_bytes());
format!("{:x}", h.finalize())
hex::encode(h.finalize())
}
);
@ -1128,7 +1128,7 @@ pub fn register_all(builder: liquid::ParserBuilder) -> liquid::ParserBuilder {
#[cfg(test)]
mod tests {
use base64::{Engine as _, engine::general_purpose};
use hmac::{Hmac, Mac};
use hmac::{Hmac, KeyInit, Mac};
use liquid::{ParserBuilder, object};
use percent_encoding::{NON_ALPHANUMERIC, utf8_percent_encode};
use regex::Regex;
@ -1162,7 +1162,7 @@ mod tests {
#[test]
fn sha256_filter() {
let expect = format!("{:x}", Sha256::digest(b"hello"));
let expect = hex::encode(Sha256::digest(b"hello"));
assert_eq!(render(r#"{{ "hello" | sha256 }}"#), expect);
}

View file

@ -24,6 +24,7 @@ validation-http = [
"dep:liquid-core",
"dep:quick-xml",
"dep:sha1",
"dep:hex",
"dep:time",
]
@ -182,12 +183,15 @@ pem = { version = "3.0.6", optional = true }
percent-encoding = { workspace = true, optional = true }
ring = { version = "0.17", optional = true }
jsonwebtoken = { version = "10.3.0", features = ["aws-lc-rs"], optional = true }
jsonwebtoken = { version = "10.3.0", default-features = false, features = ["rust_crypto"], optional = true }
p256 = { version = "0.13.2", optional = true }
ed25519-dalek = { version = "2.2", features = ["pkcs8"], optional = true }
hex = { workspace = true, optional = true }
url = { version = "2.5.7", optional = true }
mongodb = { version = "3.4", default-features = false, features = ["rustls-tls", "aws-auth", "compat-3-0-0", "dns-resolver"], optional = true }
# Temporary Git pin: keeps MongoDB SRV validation enabled while using the upstream
# Hickory 0.26 DNS-resolver fix before it is available in a crates.io release.
# TODO: switch back to a crates.io mongodb release once it includes that fix.
mongodb = { git = "https://github.com/mongodb/mongo-rust-driver", rev = "bdddefc50c4794d51d10b944320d42c6eb216b04", default-features = false, features = ["rustls-tls", "aws-auth", "compat-3-0-0", "dns-resolver"], optional = true }
mysql_async = { version = "0.36.2", default-features = false, features = ["default-rustls"], optional = true }
tokio-postgres = { version = "0.7", default-features = false, features = ["runtime"], optional = true }
tokio-postgres-rustls = { version = "0.13.0", optional = true }

View file

@ -187,12 +187,12 @@ pub fn generate_aws_cache_key(aws_access_key_id: &str, aws_secret_access_key: &s
hasher.update(aws_access_key_id.as_bytes());
hasher.update(b"\0");
hasher.update(aws_secret_access_key.as_bytes());
format!("AWS:{:x}", hasher.finalize())
format!("AWS:{}", hex::encode(hasher.finalize()))
}
/// Validate AWS credentials format before attempting validation.
pub fn validate_aws_credentials_input(access_key_id: &str, secret_key: &str) -> Result<(), String> {
// Validate access key ID format (20 chars, known AWS prefixes including STS)
// Validate access key ID format (20 chars, usable AWS access-key prefixes including STS)
if access_key_id.len() != 20 {
return Err("Invalid AWS access key ID format".to_string());
}
@ -200,9 +200,9 @@ pub fn validate_aws_credentials_input(access_key_id: &str, secret_key: &str) ->
return Err("AWS access key ID contains invalid characters".to_string());
}
let prefix = &access_key_id[..4];
let valid_prefix =
matches!(prefix, "AKIA" | "AGPA" | "AIDA" | "AROA" | "AIPA" | "ANPA" | "ANVA" | "ASIA")
|| prefix.starts_with("A3T");
// IAM principal IDs (for example AIDA/AROA) are deliberately rejected here:
// they are not usable access-key IDs for STS credential validation.
let valid_prefix = matches!(prefix, "AKIA" | "ASIA") || prefix.starts_with("A3T");
if !valid_prefix {
return Err("Invalid AWS access key ID format".to_string());
}

View file

@ -3,7 +3,7 @@ use std::time::Duration;
use anyhow::{Result, anyhow};
use base64::{Engine as _, engine::general_purpose::STANDARD as b64};
use chrono::Utc;
use hmac::{Hmac, Mac};
use hmac::{Hmac, KeyInit, Mac};
use http::StatusCode;
use quick_xml::{Reader, events::Event};
use reqwest::{Client, header::HeaderValue};
@ -18,7 +18,7 @@ pub fn generate_azure_cache_key(azure_json: &str) -> String {
use sha1::{Digest, Sha1};
let mut h = Sha1::new();
h.update(azure_json.as_bytes());
format!("AZURE:{:x}", h.finalize())
format!("AZURE:{}", hex::encode(h.finalize()))
}
/// Validate Azure Storage credentials without Azure SDK crates.

View file

@ -23,7 +23,7 @@ pub fn generate_coinbase_cache_key(cred_name: &str, private_key: &str) -> String
h.update(cred_name.as_bytes());
h.update(b"\0");
h.update(private_key.as_bytes());
format!("COINBASE:{:x}", h.finalize())
format!("COINBASE:{}", hex::encode(h.finalize()))
}
pub async fn validate_cdp_api_key(

View file

@ -142,7 +142,7 @@ pub fn generate_gcp_cache_key(gcp_json: &str) -> String {
use sha1::{Digest, Sha1};
let mut hasher = Sha1::new();
hasher.update(gcp_json.as_bytes());
format!("GCP:{:x}", hasher.finalize())
format!("GCP:{}", hex::encode(hasher.finalize()))
}
impl GcpValidator {

View file

@ -60,7 +60,7 @@ pub fn generate_http_cache_key_parts(
hasher.update(b"\0");
}
format!("HTTP:{:x}", hasher.finalize())
format!("HTTP:{}", hex::encode(hasher.finalize()))
}
/// Parse an HTTP method from a string.

View file

@ -147,5 +147,5 @@ pub fn generate_mongodb_cache_key(mongodb_uri: &str) -> String {
use sha1::{Digest, Sha1};
let mut hasher = Sha1::new();
hasher.update(mongodb_uri.as_bytes());
format!("MongoDB:{:x}", hasher.finalize())
format!("MongoDB:{}", hex::encode(hasher.finalize()))
}

View file

@ -56,7 +56,7 @@ pub fn generate_mysql_cache_key(mysql_url: &str) -> String {
let mut hasher = Sha1::new();
hasher.update(mysql_url.as_bytes());
format!("MySQL:{:x}", hasher.finalize())
format!("MySQL:{}", hex::encode(hasher.finalize()))
}
fn is_local_host(host: &str) -> bool {

View file

@ -70,7 +70,7 @@ impl ServerCertVerifier for LaxCertVerifier {
pub fn generate_postgres_cache_key(postgres_url: &str) -> String {
let mut hasher = Sha1::new();
hasher.update(postgres_url.as_bytes());
format!("Postgres:{:x}", hasher.finalize())
format!("Postgres:{}", hex::encode(hasher.finalize()))
}
pub fn parse_postgres_url(postgres_url: &str) -> Result<Config> {

View file

@ -32,7 +32,7 @@ pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String,
/// that should be paired with a secret key.
pub fn find_closest_variable(
captures: &[(String, String, usize, usize)],
target_value: &String,
target_value: &str,
target_variable_name: &str,
search_variable_name: &str,
) -> Option<String> {
@ -40,7 +40,7 @@ pub fn find_closest_variable(
// compare relative offsets with candidate variables.
let mut target_positions = Vec::new();
for (name, value, start, end) in captures {
if name == target_variable_name && value == target_value {
if name == target_variable_name && value.as_str() == target_value {
target_positions.push((*start, *end));
}
}
@ -138,8 +138,7 @@ mod tests {
("AKID".to_string(), "following".to_string(), 180usize, 200usize),
];
let result =
find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap();
let result = find_closest_variable(&captures, "secret", "TOKEN", "AKID").unwrap();
assert_eq!(result, "preceding".to_string());
}
@ -151,8 +150,7 @@ mod tests {
("AKID".to_string(), "after".to_string(), 60usize, 80usize),
];
let result =
find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap();
let result = find_closest_variable(&captures, "secret", "TOKEN", "AKID").unwrap();
assert_eq!(result, "after".to_string());
}

View file

@ -7,6 +7,14 @@ description: "Kingfisher release history: new features, rules, bug fixes, and im
All notable changes to this project will be documented in this file.
## [v1.100.0]
- Archive scanning now reaches inside Android/iOS app packages: added `apk`, `aab`, and `ipa` to the recognized ZIP-based archive formats so secrets embedded in APK/AAB/IPA contents (e.g. `classes*.dex`, `res/values/strings.xml`) are extracted and matched.
- Git repository scans now extract archive blobs encountered in the object database, not just on the filesystem. Previously a `.zip`/`.jar`/`.apk`/`.tar.gz` committed to a repo was scanned as raw compressed bytes, so secrets inside it were invisible. The git enumerator fans each archive entry out as a synthetic `<archive>!<entry>` blob with the original commit metadata. Honors `--no-extract-archives` for opt-out.
- Fixed tar-wrapped archive extraction for `.tgz` and `.tar.*` files, and made dependent credential validation deduplication preserve per-occurrence context so repeated secrets validate with the correct nearby companion value.
- Performance: ZIP-based git blobs ≤ 64 MB extract entirely in memory (no temp-file round trip), beating the v1.99.0 baseline by ~15% on an 80 GiB monorepo despite scanning ~300K additional archive-content blobs. Larger archives automatically fall back to a disk-streaming extractor.
- Memory safety: hard caps on archive extraction — 64 MB compressed pre-flight, 256 MB aggregate decompressed per archive (in-memory and disk paths), 512 MB per entry, plus a `PK\x03\x04` magic-byte gate. Worst-case footprint is bounded at ~`num_jobs * 320 MB`.
- Release binary trimmed from 34 MB to 26 MB (~24% smaller). Switched `jsonwebtoken` to its `rust_crypto` backend (eliminates our scanner's pull on `aws-lc-rs`), bumped workspace `hmac` 0.12→0.13, `sha1` 0.10→0.11, `sha2` 0.10→0.11 to deduplicate our internal crypto code with the AWS sigv4 side, and migrated affected call sites in `kingfisher-core`, `kingfisher-rules`, and `kingfisher-scanner` to the digest-0.11 API (`hex::encode` for hex digests, explicit `KeyInit` import for HMAC).
## [v1.99.0]
- Fixed [#371](https://github.com/mongodb/kingfisher/issues/371): `pip install kingfisher-bin` on glibc Linux distros (Ubuntu, Debian, RHEL, Fedora, …) installed a macOS Mach-O binary and failed with `OSError: [Errno 8] Exec format error`. Linux wheels are now tagged `manylinux_2_17_<arch>.musllinux_1_2_<arch>` (instead of `musllinux_1_2_<arch>` only), so pip accepts them on both glibc-2.17+ and musl distros. The `pypi/hatch_build.py` hook now hard-fails when `KINGFISHER_PYPI_WHEEL_TAG` is unset, and the publish workflow refuses to upload any `py3-none-any.whl`, so the v1.92.0-era pure-Python wheel cannot recur.
- `--self-update` (alias `--update`) on a scan or other command now **re-execs into the freshly installed binary** so the current invocation completes with the new code and the latest detection rules. Previously the on-disk binary was replaced but the running process kept using the old in-memory version, requiring a second invocation to pick up the changes. On Unix this is a true `exec()` (same PID); on Windows the new binary is spawned and the parent exits with its status code. The explicit `kingfisher self-update` subcommand still updates and exits without re-execing. Self-update now also covers Windows arm64 (the asset was already published; the runtime cfg map gained the missing arm). See `docs/ADVANCED.md`*Update Checks*.

392
fuzz/Cargo.lock generated
View file

@ -123,9 +123,9 @@ dependencies = [
[[package]]
name = "arc-swap"
version = "1.8.2"
version = "1.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9f3647c145568cec02c42054e07bdf9a5a698e15b466fb2341bfc393cd24aa5"
checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207"
dependencies = [
"rustversion",
]
@ -172,6 +172,15 @@ dependencies = [
"generic-array",
]
[[package]]
name = "block-buffer"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be"
dependencies = [
"hybrid-array",
]
[[package]]
name = "bstr"
version = "1.12.1"
@ -332,6 +341,12 @@ dependencies = [
"cc",
]
[[package]]
name = "cmov"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f88a43d011fc4a6876cb7344703e297c71dda42494fee094d5f7c76bf13f746"
[[package]]
name = "colorchoice"
version = "1.0.5"
@ -360,6 +375,12 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "const-oid"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c"
[[package]]
name = "core-foundation-sys"
version = "0.8.7"
@ -447,6 +468,24 @@ dependencies = [
"typenum",
]
[[package]]
name = "crypto-common"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710"
dependencies = [
"hybrid-array",
]
[[package]]
name = "ctutils"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d5515a3834141de9eafb9717ad39eea8247b5674e6066c404e8c4b365d2a29e"
dependencies = [
"cmov",
]
[[package]]
name = "dashmap"
version = "6.1.0"
@ -494,9 +533,20 @@ version = "0.10.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"crypto-common",
"subtle",
"block-buffer 0.10.4",
"crypto-common 0.1.7",
]
[[package]]
name = "digest"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2"
dependencies = [
"block-buffer 0.12.0",
"const-oid",
"crypto-common 0.2.1",
"ctutils",
]
[[package]]
@ -718,9 +768,9 @@ dependencies = [
[[package]]
name = "gix"
version = "0.81.0"
version = "0.83.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0473c64d9ccbcfb9953a133b47c8b9a335b87ac6c52b983ee4b03d49000b0f3f"
checksum = "6ce52001b946a6249d5d0d3011df0a042ac3f8a4d013460db6476577b0b9c567"
dependencies = [
"gix-actor",
"gix-archive",
@ -777,22 +827,21 @@ dependencies = [
[[package]]
name = "gix-actor"
version = "0.40.0"
version = "0.41.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e5e5b518339d5e6718af108fd064d4e9ba33caf728cf487352873d76411df35"
checksum = "272916673b83714734b15d4ef3c8b5f1ccddb15fea8ff548430b97c1ab7b7ed8"
dependencies = [
"bstr",
"gix-date",
"gix-error",
"serde",
"winnow",
]
[[package]]
name = "gix-archive"
version = "0.30.0"
version = "0.32.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "651c99be11aac9b303483193ae50b45eb6e094da4f5ed797019b03948f51aad6"
checksum = "9a20ec244b733338d4cb60e5e05eac700dab7fcc689647b1d1daa9396b119342"
dependencies = [
"bstr",
"gix-date",
@ -803,9 +852,9 @@ dependencies = [
[[package]]
name = "gix-attributes"
version = "0.31.0"
version = "0.33.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c233d6eaa098c0ca5ce03236fd7a96e27f1abe72fad74b46003fbd11fe49563c"
checksum = "fe17c5a1c0b6f2ef1476aa1d3222ea50cdff67608016613a58bfc3e078046000"
dependencies = [
"bstr",
"gix-glob",
@ -821,18 +870,18 @@ dependencies = [
[[package]]
name = "gix-bitmap"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7add20f40d060db8c9b1314d499bac6ed7480f33eb113ce3e1cf5d6ff85d989"
checksum = "1ecbfc77ec6852294e341ecc305a490b59f2813e6ca42d79efda5099dcab1894"
dependencies = [
"gix-error",
]
[[package]]
name = "gix-blame"
version = "0.11.0"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c77aaf9f7348f4da3ebfbfbbc35fa0d07155d98377856198dde6f695fd648705"
checksum = "14dab9a942ab54a9661ded7397c3bf927274e7afa94494db0d75cfcbde02ca0a"
dependencies = [
"gix-commitgraph",
"gix-date",
@ -850,18 +899,18 @@ dependencies = [
[[package]]
name = "gix-chunk"
version = "0.7.0"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1096b6608fbe5d27fb4984e20f992b4e76fb8c613f6acb87d07c5831b53a6959"
checksum = "edf288be9b60fe7231de03771faa292be1493d84786f68727e33ad1f91764320"
dependencies = [
"gix-error",
]
[[package]]
name = "gix-command"
version = "0.8.0"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b849c65a609f50d02f8a2774fe371650b3384a743c79c2a070ce0da49b7fb7da"
checksum = "86335306511abe43d75c866d4b1f3d90932fe202edcd43e1314036333e7384d8"
dependencies = [
"bstr",
"gix-path",
@ -872,9 +921,9 @@ dependencies = [
[[package]]
name = "gix-commitgraph"
version = "0.35.0"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3196655fd1443f3c58a48c114aa480be3e4e87b393d7292daaa0d543862eb445"
checksum = "fe3b5aa0f24e19028c261d229aeeedafcaaa52ebd71021cc15184620fc9d32eb"
dependencies = [
"bstr",
"gix-chunk",
@ -887,9 +936,9 @@ dependencies = [
[[package]]
name = "gix-config"
version = "0.54.0"
version = "0.56.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08939b4c4ed7a663d0e64be9e1e9bdf23a1fb4fcee1febdf449f12229542e50d"
checksum = "8c01848aebd21c67f6ba41f1de8efd46ae96df21f001954a3c9e1517e514d410"
dependencies = [
"bstr",
"gix-config-value",
@ -898,18 +947,16 @@ dependencies = [
"gix-path",
"gix-ref",
"gix-sec",
"memchr",
"smallvec",
"thiserror 2.0.18",
"unicode-bom",
"winnow",
]
[[package]]
name = "gix-config-value"
version = "0.17.1"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "441a300bc3645a1f45cba495b9175f90f47256ce43f2ee161da0031e3ac77c92"
checksum = "13b39ed39ee4c10a3b157f9fb94bac8098d9f8e56201f0cf7dee6c187416c4b2"
dependencies = [
"bitflags",
"bstr",
@ -920,9 +967,9 @@ dependencies = [
[[package]]
name = "gix-credentials"
version = "0.37.1"
version = "0.38.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b2a34b8715e3bbd514f3d1705f5d51c4b250e5bfe506b9fb60b133c85c93d9"
checksum = "65ca11598b70811d7b16ff90945a6e57dfe521e85b744e51636965fe39cc8f60"
dependencies = [
"bstr",
"gix-command",
@ -939,9 +986,9 @@ dependencies = [
[[package]]
name = "gix-date"
version = "0.15.1"
version = "0.15.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39acf819aa9fee65e4838a2eec5cb2506e47ebb89e02a5ab9918196e491571ea"
checksum = "b94cdae4eb4b0f4136e3d9b3aa2d2cd03cfb5bb9b636b31263aea2df86d41543"
dependencies = [
"bstr",
"gix-error",
@ -953,31 +1000,30 @@ dependencies = [
[[package]]
name = "gix-diff"
version = "0.61.0"
version = "0.63.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88f3b3475e5d3877d7c30c40827cc2441936ce890efc226e5ba4afe3a7ae33f0"
checksum = "dc08e0fa1a91ff5f24affeab052f198056645e1de004910bde7b82b50ea5982a"
dependencies = [
"bstr",
"gix-command",
"gix-filter",
"gix-fs",
"gix-hash",
"gix-imara-diff",
"gix-object",
"gix-path",
"gix-tempfile",
"gix-trace",
"gix-traverse",
"gix-worktree",
"imara-diff 0.1.8",
"imara-diff 0.2.0",
"thiserror 2.0.18",
]
[[package]]
name = "gix-dir"
version = "0.23.0"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5da4604a360988f0ba8efe6f90093ca5a844f4a7f8e1a3dcda501ec44e600ea9"
checksum = "32a0fc06e9e1e430cbf0a313666976d90f822f461a6525320427aa9b8af5236c"
dependencies = [
"bstr",
"gix-discover",
@ -995,9 +1041,9 @@ dependencies = [
[[package]]
name = "gix-discover"
version = "0.49.0"
version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c65bd3330fe0cb9d40d875bf862fd5e8ad6fa4164ddbc4842fbeb889c3f0b2c6"
checksum = "17852e6a501e688a1702b24ebe5b3761d4719455bc869fd29f38b0b859bcad34"
dependencies = [
"bstr",
"dunce",
@ -1010,18 +1056,18 @@ dependencies = [
[[package]]
name = "gix-error"
version = "0.2.1"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e86d01da904d4a9265def43bd42a18c5e6dc7000a73af512946ba14579c9fbd"
checksum = "e207b971746ab724fccdfced2e4e19e854744611904a0195d3aa8fda8a110613"
dependencies = [
"bstr",
]
[[package]]
name = "gix-features"
version = "0.46.2"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "752493cd4b1d5eaaa0138a7493f65c96863fefa990fc021e0e519579e389ab20"
checksum = "af375693ad5333d0a2c66b4c5b2cbe9ccc38e34f8e8bf24e4ae42c12307fdc4f"
dependencies = [
"bytes",
"crc32fast",
@ -1038,9 +1084,9 @@ dependencies = [
[[package]]
name = "gix-filter"
version = "0.28.0"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d37598282a6566da6fb52667570c7fe0aedcb122ac886724a9e62a2180523e35"
checksum = "dac917dbe9653c9b615d248db91907a365bd779750c9e1b457a9d9fdeece3a08"
dependencies = [
"bstr",
"encoding_rs",
@ -1059,9 +1105,9 @@ dependencies = [
[[package]]
name = "gix-fs"
version = "0.19.2"
version = "0.21.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a964b4aec683eb0bacb87533defa80805bb4768056371a47ab38b00a2d377b72"
checksum = "1e1967daac9848757c47c2aef0c57bcadc1a897347f559778249bf286a536c86"
dependencies = [
"bstr",
"fastrand",
@ -1073,9 +1119,9 @@ dependencies = [
[[package]]
name = "gix-glob"
version = "0.24.0"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b03e6cd88cc0dc1eafa1fddac0fb719e4e74b6ea58dd016e71125fde4a326bee"
checksum = "08bf29249a069bf2507f5964f80997f37b134d320ea348d66527726b9be2c38c"
dependencies = [
"bitflags",
"bstr",
@ -1086,9 +1132,9 @@ dependencies = [
[[package]]
name = "gix-hash"
version = "0.23.0"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fb896a02d9ab96fa518475a5f30ad3952010f801a8de5840f633f4a6b985dfb"
checksum = "bcf70d1e252337eed16360f8b8ebb71865ece58eab7954b39ce38b420de703d2"
dependencies = [
"faster-hex",
"gix-features",
@ -1099,9 +1145,9 @@ dependencies = [
[[package]]
name = "gix-hashtable"
version = "0.13.0"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2664216fc5e89b51e756a4a3ac676315602ce2dac07acf1da959a22038d69b33"
checksum = "d33b455e07b3c16d3b2eeebc7b38d2dafcbf8a653de1138ef55d4c2a1fd0b08b"
dependencies = [
"gix-hash",
"hashbrown 0.16.1",
@ -1110,9 +1156,9 @@ dependencies = [
[[package]]
name = "gix-ignore"
version = "0.19.1"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09f915dcf6911e3027537166d34e13f0fe101ed12225178d2ae29cd1272cff26"
checksum = "6bb13fbbeeafee943e52b61fcc88dfddf6a452fcaf0c4d0cdc8f218fa25bbec5"
dependencies = [
"bstr",
"gix-glob",
@ -1123,10 +1169,20 @@ dependencies = [
]
[[package]]
name = "gix-index"
version = "0.49.0"
name = "gix-imara-diff"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bae54ab14e4e74d5dda60b82ea7afad7c8eb3be68283d6d5f29bd2e6d47fff7"
checksum = "39eb0623e15e4cb83c02ce6a959e48fadd1ae3b715b36b5acc01816e01388c82"
dependencies = [
"bstr",
"hashbrown 0.16.1",
]
[[package]]
name = "gix-index"
version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54c3ef97ad08121e4327a6226bd63fed6b9e3c6b976d48bddd4356d9d41191db"
dependencies = [
"bitflags",
"bstr",
@ -1153,9 +1209,9 @@ dependencies = [
[[package]]
name = "gix-lock"
version = "21.0.1"
version = "23.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbe09cf05ba7c679bba189acc29eeea137f643e7fff1b5dff879dfd45248be31"
checksum = "09b3bc074e5723027b482dcd9ab99d95804a53742f6de812d0172fbba4a186c1"
dependencies = [
"gix-tempfile",
"gix-utils",
@ -1164,9 +1220,9 @@ dependencies = [
[[package]]
name = "gix-mailmap"
version = "0.32.0"
version = "0.33.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7b4818da522786ec7e32a00884ee8fc40fa4c215c3997c0b15f7b62684d1199"
checksum = "023d3a6561cbebe45b89e0764d48928ad970667076f16fa5889e6f86d8432086"
dependencies = [
"bstr",
"gix-actor",
@ -1177,9 +1233,9 @@ dependencies = [
[[package]]
name = "gix-merge"
version = "0.14.0"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4606747466512d22c2dffc019142e1941238f543987ea51353c938cca80c500"
checksum = "74bbcdcc52b70a32f0a151b024dff9d0fcf56ee48f00d9503e735af9d99ea881"
dependencies = [
"bstr",
"gix-command",
@ -1187,6 +1243,7 @@ dependencies = [
"gix-filter",
"gix-fs",
"gix-hash",
"gix-imara-diff",
"gix-index",
"gix-object",
"gix-path",
@ -1196,16 +1253,15 @@ dependencies = [
"gix-tempfile",
"gix-trace",
"gix-worktree",
"imara-diff 0.1.8",
"nonempty",
"thiserror 2.0.18",
]
[[package]]
name = "gix-negotiate"
version = "0.29.0"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ea064c7595eea08fdd01c70748af747d9acc40f727b61f4c8a2145a5c5fc28c"
checksum = "103d42bfade1b8a96ca5005933127bdad461ce588d92422b2c2daa3ff20d780c"
dependencies = [
"bitflags",
"gix-commitgraph",
@ -1217,9 +1273,9 @@ dependencies = [
[[package]]
name = "gix-object"
version = "0.58.0"
version = "0.60.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cafb802bb688a7c1e69ef965612ff5ff859f046bfb616377e4a0ba4c01e43d47"
checksum = "a38075a95d7cc5df8afd38e72c617026c1456952207a4120a7f55a3fbf93b4d7"
dependencies = [
"bstr",
"gix-actor",
@ -1227,21 +1283,19 @@ dependencies = [
"gix-features",
"gix-hash",
"gix-hashtable",
"gix-path",
"gix-utils",
"gix-validate",
"itoa",
"serde",
"smallvec",
"thiserror 2.0.18",
"winnow",
]
[[package]]
name = "gix-odb"
version = "0.78.0"
version = "0.80.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24833ae9323b4f7079575fb9f961cf9c414b0afbec428a536ab8e7dd93bc002b"
checksum = "aeeda12a9663120418735ecdc1250d06eeab0be75700e47b3402a981331716ba"
dependencies = [
"arc-swap",
"gix-features",
@ -1252,6 +1306,7 @@ dependencies = [
"gix-pack",
"gix-path",
"gix-quote",
"memmap2",
"parking_lot",
"serde",
"tempfile",
@ -1260,9 +1315,9 @@ dependencies = [
[[package]]
name = "gix-pack"
version = "0.68.0"
version = "0.70.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3484119cd19859d7d7639413c27e192478fa354d3f4ff5f7e3c041e8040f0f4"
checksum = "daf02e6f5c8f07a069c9ea5245f40d9b14856ada4086091dc99941b49002b4fa"
dependencies = [
"clru",
"gix-chunk",
@ -1280,9 +1335,9 @@ dependencies = [
[[package]]
name = "gix-packetline"
version = "0.21.2"
version = "0.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be19313dcdb7dff75a3ce2f99be00878458295bcc3b6c7f0005591597573345c"
checksum = "362246df440ee691699f0664cbf7006a6ece477db6734222be95e4198e5656e6"
dependencies = [
"bstr",
"faster-hex",
@ -1292,9 +1347,9 @@ dependencies = [
[[package]]
name = "gix-path"
version = "0.11.2"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09c31d4373bda7fab9eb01822927b55185a378d6e1bf737e0a54c743ad806658"
checksum = "671a6059e8a4c1b7f406e24716499cefa3926e060876fb1959ef225efeee346e"
dependencies = [
"bstr",
"gix-trace",
@ -1304,9 +1359,9 @@ dependencies = [
[[package]]
name = "gix-pathspec"
version = "0.16.1"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f89611f13544ca5ebeb68a502673814ef57200df60c24a61c2ce7b96f612f08b"
checksum = "2a84a4f083dd70fb49f4377e13afa6d90df2daaa1c705c49d6ff1331fc7e8855"
dependencies = [
"bitflags",
"bstr",
@ -1319,9 +1374,9 @@ dependencies = [
[[package]]
name = "gix-prompt"
version = "0.14.1"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f61f6264e1f6c5a951531fe127722c7522bc02ebda80c4528286bda4642055f"
checksum = "e041a626c64cb69e4117fcdf80da8d0e454fba3b1f420412792d191f52251aee"
dependencies = [
"gix-command",
"gix-config-value",
@ -1332,9 +1387,9 @@ dependencies = [
[[package]]
name = "gix-protocol"
version = "0.59.0"
version = "0.61.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f38666350736b5877c79f57ddae02bde07a4ce186d889adc391e831cddcbe76"
checksum = "aa4bee82db63ec635996b96efae71cf467c155fa3f34a556184373224a26c4fd"
dependencies = [
"bstr",
"gix-date",
@ -1348,14 +1403,13 @@ dependencies = [
"nonempty",
"serde",
"thiserror 2.0.18",
"winnow",
]
[[package]]
name = "gix-quote"
version = "0.7.0"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68533db71259c8776dd4e770d2b7b98696213ecdc1f5c9e3507119e274e0c578"
checksum = "6e97b73791a64bc0fa7dd2c5b3e551136115f97750b876ed1c952c7a7dbaf8be"
dependencies = [
"bstr",
"gix-error",
@ -1364,9 +1418,9 @@ dependencies = [
[[package]]
name = "gix-ref"
version = "0.61.0"
version = "0.63.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2159978abb99b7027c8579d15211e262ef0ef2594d5cecb3334fbcbdfe2997c"
checksum = "d8ba9cc15f558b274c99349b83130f5ec83459660828fde9718bbbb43a726167"
dependencies = [
"gix-actor",
"gix-features",
@ -1381,14 +1435,13 @@ dependencies = [
"memmap2",
"serde",
"thiserror 2.0.18",
"winnow",
]
[[package]]
name = "gix-refspec"
version = "0.39.0"
version = "0.41.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc806ee13f437428f8a1ba4c72ecfaa3f20e14f5f0d4c2bc17d0b33e794aa6ac"
checksum = "61755b27d57edc8940a1b1593c8c61548ca8e4c02da1ed8d5bfeda9eb2a6b761"
dependencies = [
"bstr",
"gix-error",
@ -1402,9 +1455,9 @@ dependencies = [
[[package]]
name = "gix-revision"
version = "0.43.0"
version = "0.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c08f1ec5d1e6a524f8ba291c41f0ccaef64e48ed0e8cf790b3461cae45f6d3d"
checksum = "1fb5288fac706d3ea3e4e2ba9ec38b78743b8c02f422e18cb342299cfd6ab7e8"
dependencies = [
"bitflags",
"bstr",
@ -1421,9 +1474,9 @@ dependencies = [
[[package]]
name = "gix-revwalk"
version = "0.29.0"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e4b2b87772b21ca449249e86d32febadba5cba32b0fcce804ab9cefc6f2111c"
checksum = "313813706b073a12ff7f9b2896bf3e6504cdac7cfbc97b1920114724705069f0"
dependencies = [
"gix-commitgraph",
"gix-date",
@ -1437,9 +1490,9 @@ dependencies = [
[[package]]
name = "gix-sec"
version = "0.13.2"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf82ae037de9c62850ce67beaa92ec8e3e17785ea307cdde7618edc215603b4f"
checksum = "f5a3a2d3e504a238136751e646a6c028252286a0ea64ea9974bf0498633407c6"
dependencies = [
"bitflags",
"gix-path",
@ -1450,9 +1503,9 @@ dependencies = [
[[package]]
name = "gix-shallow"
version = "0.10.0"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbf60711c9083b2364b3fac8a352444af76b17201f3682fdebe74fa66d89a772"
checksum = "29187305521bfacf4aefd284ab28dbfa9fb74abd39a5e63dd313b1baa5808c27"
dependencies = [
"bstr",
"gix-hash",
@ -1464,9 +1517,9 @@ dependencies = [
[[package]]
name = "gix-status"
version = "0.28.0"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23d6c598e3fdbc352fba1c5ba7e709e69402fafbc44d9295edad2e3c4738996b"
checksum = "68c6d2a8c521ffa205fe7e268c82e6d1378ba37cd826ca10ab6129fdc29a4b65"
dependencies = [
"bstr",
"filetime",
@ -1487,9 +1540,9 @@ dependencies = [
[[package]]
name = "gix-submodule"
version = "0.28.0"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ce5c3929c5e6821f651d35e8420f72fea3cfafe9fc1e928a61e718b462c72a5"
checksum = "9fd5fc8692890bd71a596e540fd4c364f8460eaa82c4eaaedebde6e1e3eb4d91"
dependencies = [
"bstr",
"gix-config",
@ -1502,9 +1555,9 @@ dependencies = [
[[package]]
name = "gix-tempfile"
version = "21.0.1"
version = "23.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d9ab2c89fe4bfd4f1d8700aa4516534c170d8a21ae2c554167374607c2eaf16"
checksum = "691ea1e31435c7e7d4d04705ec9d1c0d9482c46b2acf512bc723939d8f0af7fb"
dependencies = [
"dashmap",
"gix-fs",
@ -1515,15 +1568,15 @@ dependencies = [
[[package]]
name = "gix-trace"
version = "0.1.18"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f69a13643b8437d4ca6845e08143e847a36ca82903eed13303475d0ae8b162e0"
checksum = "6f23569e55f2ffaf958617353b9734a7d52a7c19c439eeaa5e3efc217fd2270e"
[[package]]
name = "gix-transport"
version = "0.55.1"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a521e39c6235ce63ed6c001e2dd79818c830b82c3b7b59247ee7b229c39ec9bb"
checksum = "ffd6a5c676b92d4ead5f5a2b2935024415dec69edc997b6090ca9cac010a3018"
dependencies = [
"bstr",
"gix-command",
@ -1538,9 +1591,9 @@ dependencies = [
[[package]]
name = "gix-traverse"
version = "0.55.0"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "963dc2afcdb611092aa587c3f9365e749ac0a0892ff27662dbc75f26c953fbec"
checksum = "a14b7052c0786676c03e71fcfde7d7f0f8e8316e642b5cec6bb3998719b2ce5c"
dependencies = [
"bitflags",
"gix-commitgraph",
@ -1555,9 +1608,9 @@ dependencies = [
[[package]]
name = "gix-url"
version = "0.35.2"
version = "0.36.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d28e8af3d42581190da884f013caf254d2fd4d6ab102408f08d21bfa11de6c8d"
checksum = "35842d099e813f6f6bba529e88d4670572149c3df79b7a412952259887721ece"
dependencies = [
"bstr",
"gix-path",
@ -1568,9 +1621,9 @@ dependencies = [
[[package]]
name = "gix-utils"
version = "0.3.1"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "befcdbdfb1238d2854591f760a48711bed85e72d80a10e8f2f93f656746ef7c5"
checksum = "4e477b4f07a6e8da4ba791c53c858102959703c60d70f199932010d5b94adb2c"
dependencies = [
"bstr",
"fastrand",
@ -1579,18 +1632,18 @@ dependencies = [
[[package]]
name = "gix-validate"
version = "0.11.0"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ec1eff98d91941f47766367cba1be746bab662bad761d9891ae6f7882f7840b"
checksum = "e26ac2602b43eadfdca0560b81d3341944162a3c9f64ccdeef8fc501ad80dad5"
dependencies = [
"bstr",
]
[[package]]
name = "gix-worktree"
version = "0.50.0"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6bd5830cbc43c9c00918b826467d2afad685b195cb82329cde2b2d116d2c578"
checksum = "d69955eb5e2910832f88d041964b809eee01dadd579237e0b55efec58fd406fd"
dependencies = [
"bstr",
"gix-attributes",
@ -1607,9 +1660,9 @@ dependencies = [
[[package]]
name = "gix-worktree-state"
version = "0.28.0"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "644a1681f96e1be43c2a8384337d9d220e7624f50db54beda70997052aebf707"
checksum = "8a96dccbcf9e8fe0291c55f06e08da93ebb2e691c1311276f541eefcc6d70800"
dependencies = [
"bstr",
"gix-features",
@ -1625,9 +1678,9 @@ dependencies = [
[[package]]
name = "gix-worktree-stream"
version = "0.30.0"
version = "0.32.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24e3fb70a1f650a5cec7d5b8d10d6d6fe86daf3cf15bde08ba0c70988a2932c3"
checksum = "9a8444b8ed4662e1a0c97f3eceda29630001a1bbb2632201e50312623e594213"
dependencies = [
"gix-attributes",
"gix-error",
@ -1748,11 +1801,11 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "hmac"
version = "0.12.1"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
checksum = "6303bc9732ae41b04cb554b844a762b4115a61bfaa81e3e83050991eeb56863f"
dependencies = [
"digest",
"digest 0.11.3",
]
[[package]]
@ -1783,6 +1836,15 @@ dependencies = [
"libm",
]
[[package]]
name = "hybrid-array"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da"
dependencies = [
"typenum",
]
[[package]]
name = "iana-time-zone"
version = "0.1.65"
@ -1829,25 +1891,6 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "imara-diff"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17d34b7d42178945f775e84bc4c36dde7c1c6cdfea656d3354d009056f2bb3d2"
dependencies = [
"hashbrown 0.15.5",
]
[[package]]
name = "imara-diff"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f01d462f766df78ab820dd06f5eb700233c51f0f4c2e846520eaf4ba6aa5c5c"
dependencies = [
"hashbrown 0.15.5",
"memchr",
]
[[package]]
name = "include_dir"
version = "0.7.4"
@ -2008,7 +2051,7 @@ dependencies = [
"schemars",
"serde",
"serde_json",
"sha1",
"sha1 0.11.0",
"smallvec",
"thiserror 2.0.18",
"tokei",
@ -2032,6 +2075,7 @@ dependencies = [
"base32",
"base64",
"crc32fast",
"hex",
"hmac",
"ignore",
"include_dir",
@ -2045,8 +2089,8 @@ dependencies = [
"serde",
"serde_json",
"serde_yaml",
"sha1",
"sha2",
"sha1 0.11.0",
"sha2 0.11.0",
"thiserror 2.0.18",
"time",
"tracing",
@ -2362,7 +2406,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89815c69d36021a140146f26659a81d6c2afa33d216d736dd4be5381a7362220"
dependencies = [
"pest",
"sha2",
"sha2 0.10.9",
]
[[package]]
@ -2768,7 +2812,18 @@ checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
dependencies = [
"cfg-if",
"cpufeatures 0.2.17",
"digest",
"digest 0.10.7",
]
[[package]]
name = "sha1"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aacc4cc499359472b4abe1bf11d0b12e688af9a805fa5e3016f9a386dc2d0214"
dependencies = [
"cfg-if",
"cpufeatures 0.3.0",
"digest 0.11.3",
]
[[package]]
@ -2777,8 +2832,8 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89f599ac0c323ebb1c6082821a54962b839832b03984598375bff3975b804423"
dependencies = [
"digest",
"sha1",
"digest 0.10.7",
"sha1 0.10.6",
]
[[package]]
@ -2789,7 +2844,18 @@ checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
dependencies = [
"cfg-if",
"cpufeatures 0.2.17",
"digest",
"digest 0.10.7",
]
[[package]]
name = "sha2"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4"
dependencies = [
"cfg-if",
"cpufeatures 0.3.0",
"digest 0.11.3",
]
[[package]]
@ -2847,12 +2913,6 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "subtle"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "syn"
version = "2.0.117"
@ -3132,9 +3192,9 @@ dependencies = [
[[package]]
name = "typenum"
version = "1.19.0"
version = "1.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
[[package]]
name = "ucd-trie"

View file

@ -18,23 +18,23 @@ use zip::ZipArchive;
/// Formats that are basically a ZIP container.
pub const ZIP_BASED_FORMATS: &[&str] = &[
"zip", "zipx", "jar", "war", "ear", "aar", "jmod", "jhm", "jnlp", "nupkg", "vsix", "xap",
"docx", "xlsx", "pptx", "odt", "ods", "odp", "odg", "odf", "epub", "gadget", "kmz", "widget",
"xpi", "sketch", "pages", "key", "numbers", "hwpx",
"zip", "zipx", "jar", "war", "ear", "aar", "apk", "aab", "ipa", "jmod", "jhm", "jnlp", "nupkg",
"vsix", "xap", "docx", "xlsx", "pptx", "odt", "ods", "odp", "odg", "odf", "epub", "gadget",
"kmz", "widget", "xpi", "sketch", "pages", "key", "numbers", "hwpx",
];
/// Break `<name>.<outer>.<inner>` into `(Some(outer), Some(inner))`.
/// For `foo.tar.gz` this returns `("tar", "gz")`.
fn split_extensions(path: &Path) -> (Option<String>, Option<String>) {
let ext_inner = path.extension().and_then(|e| e.to_str()).map(|s| s.to_ascii_lowercase());
fn is_tar_wrapped_compression(path: &Path) -> bool {
let filename = match path.file_name().and_then(|s| s.to_str()) {
Some(name) => name.to_ascii_lowercase(),
None => return false,
};
let ext_outer = path
.file_stem()
.and_then(|s| Path::new(s).extension())
.and_then(|e| e.to_str())
.map(|s| s.to_ascii_lowercase());
(ext_outer, ext_inner)
filename.ends_with(".tgz")
|| filename.ends_with(".tar.gz")
|| filename.ends_with(".tar.gzip")
|| filename.ends_with(".tar.bz2")
|| filename.ends_with(".tar.bzip2")
|| filename.ends_with(".tar.xz")
}
#[derive(Debug)]
@ -50,17 +50,17 @@ pub enum CompressedContent {
}
pub fn is_safe_extract_path(path: &Path) -> bool {
for (idx, comp) in path.components().enumerate() {
if path.is_absolute() {
return false;
}
for comp in path.components() {
match comp {
// Never allow parent-directory escapes
Component::ParentDir => return false,
// Leading "C:\" (Windows) or "/" (Unix) is fine;
// a prefix later in the path would be suspicious.
Component::Prefix(_) | Component::RootDir if idx == 0 => continue,
// A prefix *inside* the path (e.g. "foo/C:\evil") is unsafe
Component::Prefix(_) => return false,
// Archive entry names must always be relative to the extraction root.
Component::Prefix(_) | Component::RootDir => return false,
_ => {}
}
@ -68,6 +68,17 @@ pub fn is_safe_extract_path(path: &Path) -> bool {
true
}
/// Report whether `path` contains a `..` component anywhere, or a
/// `Component::Prefix` at any position other than the very first one.
///
/// A leading prefix and a root directory are both tolerated here, so
/// absolute paths pass this check.
fn has_parent_or_embedded_prefix(path: &Path) -> bool {
    path.components().enumerate().any(|(position, part)| {
        matches!(part, Component::ParentDir)
            || (position > 0 && matches!(part, Component::Prefix(_)))
    })
}
/// Return true when `ext` is one of the extensions listed in
/// `ZIP_BASED_FORMATS`.
///
/// The comparison is exact and case-sensitive.
// NOTE(review): callers presumably pass an already-lowercased extension — confirm.
fn is_zip_format(ext: &str) -> bool {
    ZIP_BASED_FORMATS.contains(&ext)
}
@ -87,6 +98,10 @@ fn handle_tar_archive_streaming(
let mut entry = entry?;
if entry.header().entry_type().is_file() {
let path_in_tar = entry.path()?.to_string_lossy().to_string();
if !is_safe_extract_path(Path::new(&path_in_tar)) {
tracing::warn!("unsafe tar path: {path_in_tar}");
continue;
}
let logical_path = format!("{}!{}", archive_path.display(), path_in_tar);
let out_path = base_dir.join(&path_in_tar);
@ -96,10 +111,6 @@ fn handle_tar_archive_streaming(
continue;
}
}
if !is_safe_extract_path(&out_path) {
tracing::warn!("unsafe tar path: {}", out_path.display());
continue;
}
match fs::File::create(&out_path) {
Ok(mut out_file) => {
if let Err(e) = std::io::copy(&mut entry, &mut out_file) {
@ -118,6 +129,119 @@ fn handle_tar_archive_streaming(
Ok(CompressedContent::ArchiveFiles(entries_on_disk))
}
/// Maximum compressed archive size that the in-memory ZIP extractor will
/// accept. `extract_zip_archive_in_memory` rejects anything larger with an
/// error (callers are expected to fall back to the disk-streaming path) so
/// that we never hold both the archive bytes AND every decompressed entry in
/// RAM simultaneously. The threshold is intentionally generous — most
/// committed `.jar`/`.zip`/`.apk` artifacts in real repos are well under 64 MB.
pub const MAX_INMEM_ZIP_ARCHIVE_BYTES: usize = 64 * 1024 * 1024;

/// Aggregate cap on total decompressed bytes the in-memory ZIP extractor
/// will accumulate per archive. Bounds the worst-case footprint of one
/// rayon worker processing one archive: with `num_jobs` workers running
/// in parallel, peak resident memory is bounded by `num_jobs * this`.
///
/// Every entry read is limited to whatever is left of this budget, so
/// neither a single bomb-style entry nor N medium-sized entries can push one
/// archive past it. Note that a single large entry CAN consume the whole
/// budget; later entries are then dropped.
pub const MAX_INMEM_ZIP_DECOMPRESSED_BYTES: u64 = 256 * 1024 * 1024;

/// Extract every file entry in a ZIP-based archive directly from a byte
/// slice, without touching the filesystem. Intended for the git-blob
/// scan path where blobs already sit in memory and writing them out to a
/// temp file just to read them back imposes substantial overhead in
/// monorepos with many committed `.jar`/`.zip`/`.apk` artifacts.
///
/// `archive_label` is used to construct logical entry paths of the form
/// `<archive_label>!<entry_name>`, matching the convention used by the
/// streaming-to-disk path.
///
/// Returns one `(logical_path, decompressed_bytes)` pair per extracted file
/// entry, in archive index order. Directory entries, entries with unsafe
/// names, and entries that fail to open or decompress are skipped (and
/// logged) rather than aborting the whole archive.
///
/// Two decompression limits guard against ZIP bombs: a per-entry cap and the
/// aggregate [`MAX_INMEM_ZIP_DECOMPRESSED_BYTES`] budget. Each read is
/// limited to the smaller of the two, and extraction stops once the
/// aggregate budget is used up.
///
/// # Errors
///
/// Fails when `data` is larger than [`MAX_INMEM_ZIP_ARCHIVE_BYTES`] or when
/// it cannot be opened as a ZIP archive.
pub fn extract_zip_archive_in_memory(
    data: &[u8],
    archive_label: &str,
) -> Result<Vec<(String, Vec<u8>)>> {
    if data.len() > MAX_INMEM_ZIP_ARCHIVE_BYTES {
        anyhow::bail!(
            "zip archive {archive_label} is {} bytes, exceeding {} byte in-memory cap",
            data.len(),
            MAX_INMEM_ZIP_ARCHIVE_BYTES
        );
    }

    // Per-entry cap on decompressed bytes: bounds memory cost of zip bombs.
    // Mirrors the disk-streaming variant's cap.
    // nosemgrep: this is the defensive cap — do not flag for missing-limit rules.
    const MAX_ZIP_ENTRY_DECOMPRESSED_BYTES: u64 = 512 * 1024 * 1024;

    let cursor = std::io::Cursor::new(data);
    let mut zip = ZipArchive::new(cursor)?;
    let mut entries = Vec::with_capacity(zip.len());
    let mut total_decompressed: u64 = 0;

    for i in 0..zip.len() {
        if total_decompressed >= MAX_INMEM_ZIP_DECOMPRESSED_BYTES {
            tracing::warn!(
                "in-memory zip {archive_label} exceeded {MAX_INMEM_ZIP_DECOMPRESSED_BYTES} byte aggregate cap at entry {i}/{}; truncating",
                zip.len()
            );
            break;
        }

        let mut zipped_file = match zip.by_index(i) {
            Ok(f) => f,
            Err(e) => {
                tracing::debug!("zip entry {i} read failed: {e}");
                continue;
            }
        };
        if !zipped_file.is_file() {
            continue;
        }

        let name_in_zip = zipped_file.name().to_string();
        // Defense in depth: refuse traversal-style names. The in-memory
        // path never writes to disk, but downstream code may construct
        // file URLs from these strings.
        if !is_safe_extract_path(Path::new(&name_in_zip)) {
            tracing::warn!("unsafe zip entry name in {archive_label}: {name_in_zip}");
            continue;
        }

        // The remaining-budget cap on this read serves two purposes:
        // (1) honor the aggregate budget exactly even if one entry would
        // individually push us over it, and (2) keep the existing
        // per-entry zip-bomb cap of 512 MB as a hard upper bound.
        // `remaining` is non-zero here because of the check at the top of
        // the loop.
        let remaining = MAX_INMEM_ZIP_DECOMPRESSED_BYTES.saturating_sub(total_decompressed);
        let entry_cap = remaining.min(MAX_ZIP_ENTRY_DECOMPRESSED_BYTES);

        let mut buf = Vec::new();
        let mut limited = (&mut zipped_file).take(entry_cap);
        if let Err(e) = limited.read_to_end(&mut buf) {
            tracing::debug!(
                "failed to decompress zip entry {name_in_zip} from {archive_label}: {e}"
            );
            continue;
        }

        let read_len = buf.len() as u64;
        if read_len == entry_cap {
            // The read stopped at its limit, so the entry may have been cut
            // short. Report it whichever limit applied: with the aggregate
            // budget smaller than the per-entry cap, the aggregate branch is
            // the one that actually fires, and without it a truncated final
            // entry would go unreported.
            if entry_cap == MAX_ZIP_ENTRY_DECOMPRESSED_BYTES {
                tracing::warn!(
                    "zip entry {name_in_zip} in {archive_label} exceeded {MAX_ZIP_ENTRY_DECOMPRESSED_BYTES} byte cap; truncating"
                );
            } else {
                tracing::warn!(
                    "zip entry {name_in_zip} in {archive_label} filled the remaining {entry_cap} bytes of the {MAX_INMEM_ZIP_DECOMPRESSED_BYTES} byte aggregate budget; contents may be truncated"
                );
            }
        }

        total_decompressed += read_len;
        entries.push((format!("{archive_label}!{name_in_zip}"), buf));
    }

    Ok(entries)
}
/// Cheap magic-number probe: true when the first four bytes of `data` are
/// `PK\x03\x04`, `PK\x05\x06`, or `PK\x07\x08`.
///
/// Lets callers skip extraction for blobs whose extension claims a
/// ZIP-based format but whose contents do not start like a ZIP. Inputs
/// shorter than four bytes never match.
pub fn looks_like_zip(data: &[u8]) -> bool {
    matches!(
        data,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
fn handle_zip_archive_streaming(
file: &mut fs::File,
archive_path: &Path,
@ -130,11 +254,26 @@ fn handle_zip_archive_streaming(
let mut zip = ZipArchive::new(file)?;
let mut entries_on_disk = Vec::new();
let mut total_decompressed: u64 = 0;
for i in 0..zip.len() {
if total_decompressed >= MAX_INMEM_ZIP_DECOMPRESSED_BYTES {
tracing::warn!(
"zip archive {} exceeded {} byte aggregate cap at entry {i}/{}; truncating",
archive_path.display(),
MAX_INMEM_ZIP_DECOMPRESSED_BYTES,
zip.len()
);
break;
}
let mut zipped_file = zip.by_index(i)?;
if zipped_file.is_file() {
let name_in_zip = zipped_file.name().to_string();
if !is_safe_extract_path(Path::new(&name_in_zip)) {
tracing::warn!("unsafe zip path: {name_in_zip}");
continue;
}
let logical_path = format!("{}!{}", archive_path.display(), name_in_zip);
let out_path = base_dir.join(&name_in_zip);
@ -144,13 +283,12 @@ fn handle_zip_archive_streaming(
continue;
}
}
if !is_safe_extract_path(&out_path) {
tracing::warn!("unsafe zip path: {}", out_path.display());
continue;
}
match fs::File::create(&out_path) {
Ok(mut out_file) => {
let mut limited = (&mut zipped_file).take(MAX_ZIP_ENTRY_DECOMPRESSED_BYTES);
let remaining =
MAX_INMEM_ZIP_DECOMPRESSED_BYTES.saturating_sub(total_decompressed);
let entry_cap = remaining.min(MAX_ZIP_ENTRY_DECOMPRESSED_BYTES);
let mut limited = (&mut zipped_file).take(entry_cap);
let copied = match std::io::copy(&mut limited, &mut out_file) {
Ok(n) => n,
Err(e) => {
@ -158,7 +296,8 @@ fn handle_zip_archive_streaming(
continue;
}
};
if copied == MAX_ZIP_ENTRY_DECOMPRESSED_BYTES {
total_decompressed += copied;
if copied == entry_cap && entry_cap == MAX_ZIP_ENTRY_DECOMPRESSED_BYTES {
tracing::warn!(
"zip entry {} exceeded {} byte cap; truncating",
out_path.display(),
@ -166,6 +305,14 @@ fn handle_zip_archive_streaming(
);
}
entries_on_disk.push((logical_path, out_path));
if total_decompressed >= MAX_INMEM_ZIP_DECOMPRESSED_BYTES {
tracing::warn!(
"zip archive {} reached {} byte aggregate cap; truncating remaining entries",
archive_path.display(),
MAX_INMEM_ZIP_DECOMPRESSED_BYTES
);
break;
}
}
Err(e) => {
tracing::debug!("failed to create file {}: {}", out_path.display(), e);
@ -277,7 +424,7 @@ fn handle_asar_archive_in_memory(buffer: &[u8], archive_path: &Path) -> Result<C
/// Validate and open a file for reading, checking for path traversal attacks.
fn safe_open_for_read(path: &Path) -> Result<fs::File> {
if !is_safe_extract_path(path) {
if has_parent_or_embedded_prefix(path) {
anyhow::bail!("unsafe input path during decompression: {}", path.display());
}
Ok(fs::File::open(path)?)
@ -285,7 +432,7 @@ fn safe_open_for_read(path: &Path) -> Result<fs::File> {
/// Validate and create a file for writing, checking for path traversal attacks.
fn safe_create_for_write(path: &Path) -> Result<fs::File> {
if !is_safe_extract_path(path) {
if has_parent_or_embedded_prefix(path) {
anyhow::bail!("unsafe output path during decompression: {}", path.display());
}
Ok(fs::File::create(path)?)
@ -346,7 +493,7 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result<CompressedCon
return handle_zip_archive_streaming(&mut file, path, temp.path());
}
}
"gz" | "gzip" => {
"gz" | "gzip" | "tgz" => {
let out_path = make_output_path(path, base_dir, "decomp.tar");
let decoder = GzDecoder::new(BufReader::new(safe_open_for_read(path)?));
return stream_to_file(decoder, &out_path);
@ -383,12 +530,13 @@ pub fn decompress_file(path: &Path, base_dir: Option<&Path>) -> Result<Compresse
let mut owned_buf: Option<PathBuf>;
loop {
let should_extract_tar = is_tar_wrapped_compression(current_path);
let content = decompress_once(current_path, base_dir)?;
// If the step produced a single on-disk file that is itself a .tar,
// recurse on that file.
if let CompressedContent::RawFile(ref p) = content {
if split_extensions(p).0.as_deref() == Some("tar") {
if should_extract_tar {
owned_buf = Some(p.clone()); // own the path
current_path = owned_buf.as_ref().unwrap();
continue;
@ -466,7 +614,7 @@ mod tests {
use tempfile::tempdir;
use zip::{CompressionMethod, ZipWriter, write::SimpleFileOptions};
use super::{CompressedContent, decompress_once};
use super::{CompressedContent, decompress_file_to_temp, decompress_once};
/// 1) Fully unpack:
/// - 1st decompress `.gz` -- get a `.tar` file
@ -523,6 +671,45 @@ mod tests {
Ok(())
}
/// The `.tgz` shorthand must unpack all the way down: decompressing
/// `payload.tgz` has to yield `ArchiveFiles` containing the inner
/// `secret.txt` under the logical path `payload.tgz!secret.txt`.
#[test]
fn smoke_decompress_tgz_archive() -> anyhow::Result<()> {
    let github_pat = "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"; // this is not a real secret
    let workdir = tempdir()?;
    let tgz = workdir.path().join("payload.tgz");

    // Build a gzip-compressed tar holding a single `secret.txt` entry.
    {
        let payload = format!("token={github_pat}\n");
        let mut header = tar::Header::new_gnu();
        header.set_size(payload.len() as u64);
        header.set_mode(0o644);
        header.set_cksum();

        let encoder = GzEncoder::new(File::create(&tgz)?, Compression::default());
        let mut builder = Builder::new(encoder);
        builder.append_data(&mut header, "secret.txt", payload.as_bytes())?;
        builder.into_inner()?.finish()?;
    }

    // Keep the temp-dir guard bound so the extracted files outlive the checks.
    let (content, _extraction_dir) = decompress_file_to_temp(&tgz)?;
    let files = match content {
        CompressedContent::ArchiveFiles(files) => files,
        other => panic!("expected ArchiveFiles for tgz archive, got {:?}", other),
    };

    let mut found = false;
    for (logical, path) in files {
        if !logical.ends_with("payload.tgz!secret.txt") {
            continue;
        }
        let txt = std::fs::read_to_string(&path)?;
        assert!(txt.contains(github_pat));
        found = true;
    }
    assert!(found, "did not find secret.txt in tgz ArchiveFiles");

    Ok(())
}
/// 2) No-extract flag: just peel the `.gz` layer (no base_dir -- use NamedTempFile), and verify
/// you get back a RawFile, whose contents are the tar archive itself.
#[test]
@ -696,6 +883,50 @@ mod tests {
Ok(())
}
/// APKs are ZIP containers: a single `decompress_once` pass over an `.apk`
/// must yield `ArchiveFiles` with the embedded `res/values/strings.xml`
/// readable and still containing the planted key.
#[test]
fn smoke_decompress_apk_archive() -> anyhow::Result<()> {
    let aws_key = "AKIAIOSFODNN7EXAMPLE"; // canonical AWS sample, not real
    let src_dir = tempdir()?;
    let apk_path = src_dir.path().join("aws_leak.apk");

    // Write a one-entry deflated ZIP under the `.apk` name.
    {
        let xml = format!(
            "<?xml version=\"1.0\"?><resources><string name=\"aws\">{aws_key}</string></resources>"
        );
        let entry_options = SimpleFileOptions::default()
            .compression_method(CompressionMethod::Deflated)
            .unix_permissions(0o644);

        let mut writer = ZipWriter::new(File::create(&apk_path)?);
        writer.start_file("res/values/strings.xml", entry_options)?;
        writer.write_all(xml.as_bytes())?;
        writer.finish()?;
    }

    let out_dir = tempdir()?;
    let files = match decompress_once(&apk_path, Some(out_dir.path()))? {
        CompressedContent::ArchiveFiles(files) => files,
        other => panic!("expected ArchiveFiles for apk archive, got {:?}", other),
    };

    let mut hits = 0usize;
    for (logical, extracted) in files {
        if !logical.ends_with("!res/values/strings.xml") {
            continue;
        }
        let txt = std::fs::read_to_string(&extracted)?;
        assert!(txt.contains(aws_key));
        hits += 1;
    }
    assert!(hits > 0, "did not find res/values/strings.xml in apk ArchiveFiles");

    Ok(())
}
#[test]
fn smoke_decompress_hwpx_archive() -> anyhow::Result<()> {
let dir = tempdir()?;

View file

@ -8,12 +8,8 @@ use std::{
use anyhow::{Context, Result};
use indicatif::{ProgressBar, ProgressStyle};
use octorust::{
Client,
auth::Credentials,
types::{Order, ReposListOrgSort, ReposListOrgType, ReposListUserType},
};
use reqwest::StatusCode;
use reqwest::header::HeaderMap;
use serde::Deserialize;
use serde_json::Value;
use tracing::{info, warn};
@ -33,6 +29,11 @@ struct GitHubRepo {
fork: bool,
}
/// Minimal deserialization target for one element of a GitHub organization
/// listing; only the `login` field is read from each JSON object.
#[derive(Deserialize)]
struct GitHubOrg {
// Organization login name; `fetch_github_orgs` collects these into its result.
login: String,
}
#[derive(Debug)]
pub struct RepoSpecifiers {
pub user: Vec<String>,
@ -52,21 +53,20 @@ pub enum RepoType {
Source,
Fork,
}
impl From<RepoType> for ReposListUserType {
fn from(repo_type: RepoType) -> Self {
match repo_type {
RepoType::All => ReposListUserType::All,
RepoType::Source => ReposListUserType::Owner,
RepoType::Fork => ReposListUserType::Member,
impl RepoType {
fn user_query_value(&self) -> &'static str {
match self {
RepoType::All => "all",
RepoType::Source => "owner",
RepoType::Fork => "member",
}
}
}
impl From<RepoType> for ReposListOrgType {
fn from(repo_type: RepoType) -> Self {
match repo_type {
RepoType::All => ReposListOrgType::All,
RepoType::Source => ReposListOrgType::Sources,
RepoType::Fork => ReposListOrgType::Forks,
fn org_query_value(&self) -> &'static str {
match self {
RepoType::All => "all",
RepoType::Source => "sources",
RepoType::Fork => "forks",
}
}
}
@ -128,34 +128,13 @@ fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher {
fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool {
git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url)
}
fn create_github_client(github_url: &url::Url, ignore_certs: bool) -> Result<Arc<Client>> {
// Try personal access token
let credentials = if let Ok(token) = env::var("KF_GITHUB_TOKEN") {
Credentials::Token(token)
} else {
Credentials::Token("".to_string()) // Anonymous access
};
fn create_github_client(ignore_certs: bool) -> Result<Arc<reqwest::Client>> {
let mut client_builder = reqwest::Client::builder();
if ignore_certs {
client_builder = client_builder.danger_accept_invalid_certs(ignore_certs);
}
let reqwest_client = client_builder.build().context("Failed to build HTTP client")?;
let http_client = reqwest_middleware_octorust::ClientBuilder::new(reqwest_client).build();
let mut client = Client::custom(
concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION")),
credentials,
http_client,
);
// Override host if not using api.github.com
if github_url.host_str() != Some("api.github.com") {
client.with_host_override(github_url.as_str());
}
Ok(Arc::new(client))
Ok(Arc::new(client_builder.build().context("Failed to build HTTP client")?))
}
fn normalize_api_base(api_url: &Url) -> Url {
@ -167,6 +146,108 @@ fn normalize_api_base(api_url: &Url) -> Url {
base
}
/// Read the GitHub token from the `KF_GITHUB_TOKEN` environment variable.
///
/// Returns `None` when the variable is unset, cannot be read as a string, or
/// is set to the empty string.
fn github_token() -> Option<String> {
    match env::var("KF_GITHUB_TOKEN") {
        Ok(token) if !token.is_empty() => Some(token),
        _ => None,
    }
}
/// Start a GET request to `url` carrying the global `User-Agent` header,
/// adding bearer authentication only when a `token` is supplied.
fn github_get(client: &reqwest::Client, url: Url, token: Option<&str>) -> reqwest::RequestBuilder {
    let builder = client.get(url).header("User-Agent", GLOBAL_USER_AGENT.as_str());
    match token {
        Some(secret) => builder.bearer_auth(secret),
        None => builder,
    }
}
/// Pass a successful GitHub API response through, or convert a failed one
/// into an error.
///
/// Returns `resp` untouched when `resp.status().is_success()`. Otherwise the
/// status is reported to `warn_on_rate_limit`, the response body is read
/// (an unreadable body is treated as empty), and an error is returned that
/// names `action`, the HTTP status, the request URL, and at most 512 bytes
/// of the body.
///
/// # Errors
/// Always returns `Err` for a non-success status.
async fn ensure_github_success(resp: reqwest::Response, action: &str) -> Result<reqwest::Response> {
    /// Upper bound, in bytes, on how much of the error body is echoed back.
    const MAX_ERROR_BODY_BYTES: usize = 512;

    if resp.status().is_success() {
        return Ok(resp);
    }

    let status = resp.status();
    let url = resp.url().clone();
    warn_on_rate_limit("GitHub", status, action);

    let mut body = resp.text().await.unwrap_or_default();
    if body.len() > MAX_ERROR_BODY_BYTES {
        // `String::truncate` panics when the cut is not on a UTF-8 character
        // boundary, so back up to the nearest boundary at or below the limit.
        // Index 0 is always a boundary, so the loop terminates.
        let mut cut = MAX_ERROR_BODY_BYTES;
        while !body.is_char_boundary(cut) {
            cut -= 1;
        }
        body.truncate(cut);
    }

    anyhow::bail!("GitHub API request failed while {action}: HTTP {status} ({url}): {body}");
}
/// True for the two statuses the contributor-enumeration loops treat as a
/// soft limit (warn and stop paging rather than fail): 403 Forbidden and
/// 429 Too Many Requests.
fn is_github_soft_limit_status(status: StatusCode) -> bool {
    status == StatusCode::FORBIDDEN || status == StatusCode::TOO_MANY_REQUESTS
}
/// Extract the `rel="next"` target from a `Link` response header.
///
/// The header value is split on `,` into individual links; the first link
/// that has a parameter exactly equal to `rel="next"`, is wrapped in
/// `<...>`, and parses as a URL wins. Returns `None` when the header is
/// missing, is not valid header text, or contains no such link.
fn github_next_link(headers: &HeaderMap) -> Option<Url> {
    let header_text = headers.get(reqwest::header::LINK)?.to_str().ok()?;

    for link in header_text.split(',') {
        // Each link looks like `<url>; param; param...`.
        let (target, attrs) = match link.trim().split_once(';') {
            Some(pieces) => pieces,
            None => continue,
        };

        let is_next = attrs.split(';').any(|attr| attr.trim() == "rel=\"next\"");
        if !is_next {
            continue;
        }

        let bare = target.trim().strip_prefix('<').and_then(|rest| rest.strip_suffix('>'));
        if let Some(Ok(parsed)) = bare.map(Url::parse) {
            return Some(parsed);
        }
    }

    None
}
/// List organization logins from the `organizations` endpoint under `api_base`.
///
/// Requests 100 entries per page and follows the `Link: rel="next"` header
/// until there is no next link or a page comes back empty.
///
/// # Errors
/// Fails if the URL cannot be built, a request cannot be sent, the API
/// answers with a non-success status, or a page cannot be decoded as JSON.
async fn fetch_github_orgs(
    client: &reqwest::Client,
    api_base: &Url,
    token: Option<&str>,
) -> Result<Vec<String>> {
    let mut first_page =
        api_base.join("organizations").context("Failed to build GitHub orgs URL")?;
    first_page.query_pairs_mut().append_pair("per_page", "100");

    let mut logins = Vec::new();
    let mut pending = Some(first_page);

    while let Some(page_url) = pending.take() {
        let raw = github_get(client, page_url, token).send().await?;
        let resp = ensure_github_success(raw, "listing organizations").await?;

        // Read the pagination header before `json()` consumes the response.
        let following = github_next_link(resp.headers());

        let page: Vec<GitHubOrg> = resp.json().await?;
        if page.is_empty() {
            break;
        }
        logins.extend(page.into_iter().map(|org| org.login));
        pending = following;
    }

    Ok(logins)
}
/// Fetch every repository from the listing endpoint `path` under `api_base`.
///
/// Pages are requested by number (100 per page, sorted by creation time,
/// newest first, filtered by `repo_type`) until a page comes back empty.
/// `action` is the human-readable label used in error reporting.
///
/// # Errors
/// Fails if the URL cannot be built, a request cannot be sent, the API
/// answers with a non-success status, or a page cannot be decoded as JSON.
async fn fetch_github_repos(
    client: &reqwest::Client,
    api_base: &Url,
    path: &str,
    repo_type: &str,
    token: Option<&str>,
    action: &str,
) -> Result<Vec<GitHubRepo>> {
    let mut collected = Vec::new();
    let mut page_no = 1;

    loop {
        let mut page_url =
            api_base.join(path).context("Failed to build GitHub repositories URL")?;
        let page_label = page_no.to_string();
        {
            let mut query = page_url.query_pairs_mut();
            for (key, value) in [
                ("per_page", "100"),
                ("page", page_label.as_str()),
                ("type", repo_type),
                ("sort", "created"),
                ("direction", "desc"),
            ] {
                query.append_pair(key, value);
            }
        }

        let raw = github_get(client, page_url, token).send().await?;
        let resp = ensure_github_success(raw, action).await?;
        let batch: Vec<GitHubRepo> = resp.json().await?;
        if batch.is_empty() {
            break;
        }

        collected.extend(batch);
        page_no += 1;
    }

    Ok(collected)
}
pub async fn enumerate_contributor_repo_urls(
repo_url: &GitUrl,
github_api_url: &Url,
@ -179,7 +260,7 @@ pub async fn enumerate_contributor_repo_urls(
let (_, owner, repo) = parse_repo(repo_url).context("invalid GitHub repo URL")?;
let exclude_set = build_exclude_matcher(exclude_repos);
let client = reqwest::Client::builder().danger_accept_invalid_certs(ignore_certs).build()?;
let token = env::var("KF_GITHUB_TOKEN").ok().filter(|t| !t.is_empty());
let token = github_token();
let api_base = normalize_api_base(github_api_url);
let mut contributor_logins = Vec::new();
@ -190,15 +271,12 @@ pub async fn enumerate_contributor_repo_urls(
.join(&format!("repos/{owner}/{repo}/contributors"))
.context("Failed to build GitHub contributors URL")?;
url.query_pairs_mut().append_pair("per_page", "100").append_pair("page", &page.to_string());
let mut req = client.get(url).header("User-Agent", GLOBAL_USER_AGENT.as_str());
if let Some(token) = token.as_ref() {
req = req.bearer_auth(token);
}
let resp = req.send().await?;
if !resp.status().is_success() {
let resp = github_get(&client, url, token.as_deref()).send().await?;
if is_github_soft_limit_status(resp.status()) {
warn_on_rate_limit("GitHub", resp.status(), "listing contributors");
break;
}
let resp = ensure_github_success(resp, "listing contributors").await?;
let contributors: Vec<GitHubContributor> = resp.json().await?;
if contributors.is_empty() {
break;
@ -251,15 +329,12 @@ pub async fn enumerate_contributor_repo_urls(
.append_pair("type", "all")
.append_pair("sort", "updated")
.append_pair("direction", "desc");
let mut req = client.get(url).header("User-Agent", GLOBAL_USER_AGENT.as_str());
if let Some(token) = token.as_ref() {
req = req.bearer_auth(token);
}
let resp = req.send().await?;
if !resp.status().is_success() {
let resp = github_get(&client, url, token.as_deref()).send().await?;
if is_github_soft_limit_status(resp.status()) {
warn_on_rate_limit("GitHub", resp.status(), "listing user repositories");
break;
}
let resp = ensure_github_success(resp, "listing user repositories").await?;
let repos: Vec<GitHubRepo> = resp.json().await?;
if repos.is_empty() {
break;
@ -351,22 +426,22 @@ pub async fn enumerate_repo_urls(
ignore_certs: bool,
mut progress: Option<&mut ProgressBar>,
) -> Result<Vec<String>> {
let client = create_github_client(&github_url, ignore_certs)?;
let client = create_github_client(ignore_certs)?;
let mut repo_urls = Vec::new();
let exclude_set = build_exclude_matcher(&repo_specifiers.exclude_repos);
let user_repo_type: ReposListUserType = repo_specifiers.repo_filter.clone().into();
let org_repo_type: ReposListOrgType = repo_specifiers.repo_filter.clone().into();
let api_base = normalize_api_base(&github_url);
let token = github_token();
for username in &repo_specifiers.user {
let repos = client
.repos()
.list_all_for_user(
username,
user_repo_type.clone(),
ReposListOrgSort::Created,
Order::Desc,
)
.await?;
repo_urls.extend(repos.body.into_iter().filter_map(|repo| {
let repos = fetch_github_repos(
&client,
&api_base,
&format!("users/{username}/repos"),
repo_specifiers.repo_filter.user_query_value(),
token.as_deref(),
"listing user repositories",
)
.await?;
repo_urls.extend(repos.into_iter().filter_map(|repo| {
let clone_url = repo.clone_url;
if should_exclude_repo(&clone_url, &exclude_set) { None } else { Some(clone_url) }
}));
@ -375,24 +450,21 @@ pub async fn enumerate_repo_urls(
}
}
let orgs = if repo_specifiers.all_organizations {
let mut all_orgs = Vec::new();
let org_list = client.orgs().list_all(100).await?;
all_orgs.extend(org_list.body.into_iter().map(|org| org.login));
all_orgs
fetch_github_orgs(&client, &api_base, token.as_deref()).await?
} else {
repo_specifiers.organization.clone()
};
for org_name in orgs {
let repos = client
.repos()
.list_all_for_org(
&org_name,
org_repo_type.clone(),
ReposListOrgSort::Created,
Order::Desc,
)
.await?;
repo_urls.extend(repos.body.into_iter().filter_map(|repo| {
let repos = fetch_github_repos(
&client,
&api_base,
&format!("orgs/{org_name}/repos"),
repo_specifiers.repo_filter.org_query_value(),
token.as_deref(),
"listing organization repositories",
)
.await?;
repo_urls.extend(repos.into_iter().filter_map(|repo| {
let clone_url = repo.clone_url;
if should_exclude_repo(&clone_url, &exclude_set) { None } else { Some(clone_url) }
}));
@ -675,4 +747,29 @@ mod tests {
assert!(should_exclude_repo("https://github.com/owner/project-archive.git", &excludes));
assert!(!should_exclude_repo("https://github.com/owner/project.git", &excludes));
}
/// With both `next` and `first` relations present, only the `next` URL is returned.
#[test]
fn github_next_link_parses_next_relation() {
    let link_value = r#"<https://api.github.com/organizations?since=42>; rel="next", <https://api.github.com/organizations?since=1>; rel="first""#;

    let mut headers = HeaderMap::new();
    headers.insert(reqwest::header::LINK, link_value.parse().unwrap());

    let next = github_next_link(&headers).unwrap();
    assert_eq!(next.as_str(), "https://api.github.com/organizations?since=42");
}
/// A `Link` header that carries only a `first` relation yields no next URL.
#[test]
fn github_next_link_returns_none_without_next_relation() {
    let link_value = r#"<https://api.github.com/organizations?since=1>; rel="first""#;

    let mut headers = HeaderMap::new();
    headers.insert(reqwest::header::LINK, link_value.parse().unwrap());

    assert!(github_next_link(&headers).is_none());
}
}

View file

@ -94,6 +94,9 @@ struct EnumeratorConfig {
repo_scan_timeout: Duration,
exclude_globset: Option<std::sync::Arc<GlobSet>>,
git_diff: Option<GitDiffConfig>,
/// Whether archive blobs encountered during git scanning should be
/// transparently extracted before pattern matching.
extract_archives: bool,
}
pub enum FoundInput {

View file

@ -979,11 +979,7 @@ fn build_config_yaml(
// round-trip. Pull the raw CLI/env string from `ArgMatches` instead so
// the emitted YAML matches what the user actually passed.
fn raw_arg_string(matches: &clap::ArgMatches, id: &str) -> Option<String> {
matches
.get_raw(id)
.and_then(|mut v| v.next())
.and_then(|s| s.to_str())
.map(str::to_owned)
matches.get_raw(id).and_then(|mut v| v.next()).and_then(|s| s.to_str()).map(str::to_owned)
}
if user_set(sub_matches, "github_api_url") {
git.github_api_url = raw_arg_string(sub_matches, "github_api_url");
@ -2326,6 +2322,7 @@ alerts:
assert_eq!(
cfg.git.github_api_url.as_deref(),
Some("https://ghe.corp.example.com/api/v3/"),
"github_api_url must preserve a user-supplied trailing slash",
);
}

View file

@ -1,4 +1,5 @@
use std::{
io::Read,
marker::PhantomData,
path::Path,
process::Command,
@ -30,7 +31,10 @@ use crate::{
binary::is_binary,
blob::{Blob, BlobAppearance, BlobId, BlobIdMap},
cli::commands::{github::GitHistoryMode, scan},
decompress::{CompressedContent, decompress_file_to_temp},
decompress::{
CompressedContent, MAX_INMEM_ZIP_ARCHIVE_BYTES, ZIP_BASED_FORMATS, decompress_file_to_temp,
extract_zip_archive_in_memory, looks_like_zip,
},
findings_store,
git_commit_metadata::CommitMetadata,
git_repo_enumerator::{GitBlobMetadata, GitBlobSource, MIN_SCANNABLE_BLOB_SIZE},
@ -156,6 +160,7 @@ pub fn enumerate_filesystem_inputs(
repo_scan_timeout,
exclude_globset: exclude_globset.clone(),
git_diff: diff_config.clone(),
extract_archives: !args.content_filtering_args.no_extract_archives,
};
let (send_ds, recv_ds) = create_datastore_channel(args.num_jobs);
let datastore_writer_thread =
@ -208,12 +213,10 @@ pub fn enumerate_filesystem_inputs(
}
Ok(entry) => entry,
};
// Check if this is an archive file
let is_archive = if let Origin::File(file_origin) = &origin.first() {
is_compressed_file(&file_origin.path)
} else {
false
};
// Check if this is an archive file. `blob_path()` covers both filesystem and git
// origins, so archive/binary filtering stays consistent across input modes.
let is_archive =
origin.first().blob_path().map(is_compressed_file).unwrap_or(false);
let is_binary = is_binary(&blob.bytes());
let should_skip = if is_archive {
// For archives: skip only if --no_extract_archives is true
@ -517,10 +520,248 @@ impl FileResult {
}
}
/// Extract an archive blob loaded from a git ODB.
///
/// `blob_path` is the in-tree path the blob was first seen at (used both to
/// pick an extension and to label the resulting per-entry origins so reports
/// look like `aws_leak.apk!classes4.dex`). `data` is the raw blob bytes.
///
/// Returns `Ok(None)` whenever the caller should fall back to scanning the
/// blob's raw bytes: the path is not a recognized archive format, a
/// ZIP-named blob does not pass the `looks_like_zip` check, decompression
/// fails, the decompressed payload cannot be read back, or extraction
/// yields no entries.
/// Returns `Ok(Some(entries))` with one `(logical_path, bytes)` element per
/// extracted entry on success; single-stream payloads are returned as one
/// entry labelled `<blob_path>!content`.
/// Returns `Err` only on infrastructure failures (failed to create the
/// staging tempdir or to write the staged blob).
fn try_extract_git_blob_archive(
blob_path: &str,
data: &[u8],
) -> Result<Option<Vec<(String, Vec<u8>)>>> {
let pb = PathBuf::from(blob_path);
if !is_compressed_file(&pb) {
return Ok(None);
}
// Use the repo-relative path in reports while staging the blob under its basename so the
// decompressor still dispatches on the original extension.
let archive_label = blob_path.to_string();
// Falls back to the literal name "blob" when the path has no UTF-8 file name.
let staged_name = pb.file_name().and_then(|s| s.to_str()).unwrap_or("blob").to_string();
// ── fast path: ZIP-based archives extract entirely in memory ──
//
// For monorepos with many committed `.jar`/`.zip`/`.apk`/`.aar`
// artifacts, the disk-staging path below imposes substantial
// overhead per blob (mkdir + stage write + per-entry tempfile +
// re-read into memory). Since the blob bytes are already in memory
// here, we skip the round-trip entirely for ZIP-based formats —
// this is the dominant archive type committed to git in practice.
//
// Memory bound: archives larger than `MAX_INMEM_ZIP_ARCHIVE_BYTES`
// (64 MB) fall through to the disk-streaming path so a single
// worker never holds the archive bytes AND every decompressed
// entry resident at once. The fast path additionally caps total
// decompressed bytes per archive (see
// `MAX_INMEM_ZIP_DECOMPRESSED_BYTES` in `decompress.rs`).
// Lower-cased final extension, kept only when it is listed in `ZIP_BASED_FORMATS`.
let zip_based_ext = pb
.extension()
.and_then(|s| s.to_str())
.map(|s| s.to_ascii_lowercase())
.filter(|ext| ZIP_BASED_FORMATS.iter().any(|z| z == ext));
if let Some(_ext) = zip_based_ext.as_ref() {
// Cheap magic-byte check first: if a `.zip`-named blob is not
// actually a ZIP (truncated download, stub file, accidental
// rename), skip extraction so the caller scans the raw bytes.
if !looks_like_zip(data) {
return Ok(None);
}
if data.len() <= MAX_INMEM_ZIP_ARCHIVE_BYTES {
return match extract_zip_archive_in_memory(data, &archive_label) {
Ok(entries) => Ok(Some(entries)),
Err(e) => {
debug!(
"in-memory zip extract failed for {archive_label}: {e:#}; falling back to raw scan"
);
Ok(None)
}
};
}
debug!(
"{archive_label} is {} bytes (> {} MB cap); falling back to disk streaming extractor",
data.len(),
MAX_INMEM_ZIP_ARCHIVE_BYTES / (1024 * 1024)
);
// fall through to the disk-streaming path below
}
// ── slow path: tar/gz/bz2/xz/zlib/asar/hwp/egg etc. via tempfile,
// and large ZIP-based archives that exceeded the
// in-memory cap above. ──
let staging = tempfile::tempdir().context("Failed to create staging tempdir for git blob")?;
let staged_path = staging.path().join(&staged_name);
std::fs::write(&staged_path, data)
.with_context(|| format!("Failed to stage blob to {}", staged_path.display()))?;
// NOTE(review): `_td` is presumably the temp-dir guard owning the extracted
// files; it stays bound until this function returns — confirm against
// `decompress_file_to_temp`.
let (content, _td) = match decompress_file_to_temp(&staged_path) {
Ok(c) => c,
Err(e) => {
debug!("decompress_file_to_temp({}) failed: {e:#}", staged_path.display());
return Ok(None);
}
};
use crate::decompress::CompressedContent;
let strip_logical_prefix = |logical: String| -> String {
// decompress_file_to_temp builds logicals as
// `<staged_path>!<entry>`. Replace the staged-path prefix with the
// real repo-relative archive path so report paths look like
// `dir/aws_leak.apk!res/values/strings.xml`.
// NOTE(review): the split happens at the FIRST `!`, so a staged file name
// that itself contains `!` would leave part of that name in the entry
// label — confirm whether such names need handling.
match logical.split_once('!') {
Some((_, entry)) => format!("{}!{}", archive_label, entry),
None => format!("{}!{}", archive_label, logical),
}
};
// Aggregate cap on bytes accumulated by this wrapper. The on-disk
// entries themselves were already bounded during decompression by
// per-entry caps; this cap bounds the size of the final
// `Vec<(String, Vec<u8>)>` we hand back. Without it, a JAR with N
// medium-sized entries could push num_jobs * N * entry_size bytes
// resident across the rayon pool.
const MAX_DISK_PATH_AGGREGATE_BYTES: u64 = 256 * 1024 * 1024;
let entries = match content {
// Entries already held in memory: relabel them, stopping at the aggregate cap.
CompressedContent::Archive(files) => {
let mut out = Vec::with_capacity(files.len());
let mut total: u64 = 0;
for (logical, bytes) in files {
if total >= MAX_DISK_PATH_AGGREGATE_BYTES {
debug!(
"{archive_label} disk-archive aggregate cap of {MAX_DISK_PATH_AGGREGATE_BYTES} bytes reached; truncating remaining entries"
);
break;
}
let remaining = MAX_DISK_PATH_AGGREGATE_BYTES - total;
if bytes.len() as u64 > remaining {
debug!(
"{archive_label} disk-archive aggregate cap reached while reading {}; truncating entry",
logical
);
// Keep only the prefix of this entry that still fits, then stop.
let take = remaining as usize;
out.push((strip_logical_prefix(logical), bytes[..take].to_vec()));
break;
}
total += bytes.len() as u64;
out.push((strip_logical_prefix(logical), bytes));
}
out
}
// Entries extracted to disk: read each back into memory, stopping at the aggregate cap.
CompressedContent::ArchiveFiles(disk_entries) => {
let mut out = Vec::with_capacity(disk_entries.len());
let mut total: u64 = 0;
for (logical, disk_path) in disk_entries {
if total >= MAX_DISK_PATH_AGGREGATE_BYTES {
debug!(
"{archive_label} disk-archive aggregate cap of {MAX_DISK_PATH_AGGREGATE_BYTES} bytes reached; truncating remaining entries"
);
break;
}
let remaining = MAX_DISK_PATH_AGGREGATE_BYTES - total;
// An entry that cannot be stat'ed or opened is logged and skipped.
let entry_len = match std::fs::metadata(&disk_path) {
Ok(md) => md.len(),
Err(e) => {
debug!("Failed to stat extracted entry {}: {e}", disk_path.display());
continue;
}
};
let file = match std::fs::File::open(&disk_path) {
Ok(file) => file,
Err(e) => {
debug!("Failed to open extracted entry {}: {e}", disk_path.display());
continue;
}
};
// Read no more than what still fits under the aggregate cap.
let to_read = entry_len.min(remaining);
let mut bytes = Vec::with_capacity(to_read as usize);
match file.take(to_read).read_to_end(&mut bytes) {
Ok(_) => {
total += bytes.len() as u64;
out.push((strip_logical_prefix(logical), bytes));
if entry_len > remaining {
debug!(
"{archive_label} disk-archive aggregate cap reached while reading {}; truncating entry",
disk_path.display()
);
break;
}
}
Err(e) => {
debug!("Failed to read extracted entry {}: {e}", disk_path.display());
}
}
}
out
}
// Single-stream decompression (gz/bz2/xz/zlib) gives one logical
// payload; cap it just like aggregate archive-entry reads.
CompressedContent::Raw(mut bytes) => {
if bytes.len() as u64 > MAX_DISK_PATH_AGGREGATE_BYTES {
debug!(
"{archive_label} single-stream payload exceeded {MAX_DISK_PATH_AGGREGATE_BYTES} byte cap; truncating"
);
bytes.truncate(MAX_DISK_PATH_AGGREGATE_BYTES as usize);
}
vec![(format!("{}!content", archive_label), bytes)]
}
// Single payload left on disk: read back at most the cap; any stat/open/read
// failure returns `Ok(None)` so the caller scans the raw blob instead.
CompressedContent::RawFile(path) => {
let payload_len = match std::fs::metadata(&path) {
Ok(md) => md.len(),
Err(e) => {
debug!("Failed to stat decompressed payload {}: {e}", path.display());
return Ok(None);
}
};
let file = match std::fs::File::open(&path) {
Ok(file) => file,
Err(e) => {
debug!("Failed to open decompressed payload {}: {e}", path.display());
return Ok(None);
}
};
let to_read = payload_len.min(MAX_DISK_PATH_AGGREGATE_BYTES);
let mut bytes = Vec::with_capacity(to_read as usize);
if let Err(e) = file.take(to_read).read_to_end(&mut bytes) {
debug!("Failed to read decompressed payload {}: {e}", path.display());
return Ok(None);
}
if payload_len > MAX_DISK_PATH_AGGREGATE_BYTES {
debug!(
"{archive_label} single-stream payload exceeded {MAX_DISK_PATH_AGGREGATE_BYTES} byte cap; truncating"
);
}
vec![(format!("{}!content", archive_label), bytes)]
}
};
// An archive that produced no entries is reported as "nothing extracted".
if entries.is_empty() { Ok(None) } else { Ok(Some(entries)) }
}
/// Return the `!<entry>` tail of an archive-entry logical path.
///
/// When `entry_logical` starts with `archive_path` immediately followed by
/// `!`, the remainder (including the `!`) is returned. Otherwise the tail
/// starting at the first `!` anywhere in `entry_logical` is returned.
/// Yields `None` when neither form applies.
fn archive_entry_suffix<'a>(entry_logical: &'a str, archive_path: &str) -> Option<&'a str> {
    if let Some(rest) = entry_logical.strip_prefix(archive_path) {
        if rest.starts_with('!') {
            return Some(rest);
        }
    }
    // Fallback: the archive part differs from `archive_path`; cut at the first `!`.
    entry_logical.find('!').map(|bang| &entry_logical[bang..])
}
// Blob iterator over one enumerated git repository. No field borrows data;
// `_marker` exists only so the struct itself carries the `'a` lifetime.
struct GitRepoResultIter<'a> {
// The enumerated repository; its `path` and `blobs` are consumed during iteration.
inner: GitRepoResult,
// Point in time after which blob loads stop with a timeout error.
deadline: std::time::Instant,
/// When true, blobs whose in-tree path matches a known archive format
/// (zip/jar/apk/tar/gz/...) are extracted before scanning, so secrets
/// inside the archive can be matched. When false, archive blobs are
/// scanned as raw compressed bytes (legacy behavior).
extract_archives: bool,
// Zero-sized lifetime carrier; holds no data.
_marker: std::marker::PhantomData<&'a ()>,
}
@ -534,6 +775,8 @@ impl ParallelBlobIterator for GitRepoResult {
Ok(Some(GitRepoResultIter {
inner: self,
deadline: Instant::now() + PLACEHOLDER,
// Default to enabled; the dispatch site overrides from CLI args.
extract_archives: true,
_marker: std::marker::PhantomData,
}))
}
@ -551,12 +794,18 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
let repo_path = Arc::new(self.inner.path.clone());
let deadline = self.deadline;
let flag = Arc::new(AtomicBool::new(false)); // first-timeout gate
let extract_archives = self.extract_archives;
// Loads one git blob and returns one *or more* `(OriginSet, Blob)`
// tuples: a single tuple for normal blobs, multiple tuples for
// archive blobs (zip/jar/apk/...) whose entries get unpacked into
// synthetic per-entry blobs so pattern matchers can see the
// contents. See `try_extract_git_blob_archive` above.
let load_blob = {
let repo_path = Arc::clone(&repo_path);
let flag = Arc::clone(&flag);
move |repo: &mut GixRepo, md: GitBlobMetadata| -> Result<(OriginSet, Blob)> {
move |repo: &mut GixRepo, md: GitBlobMetadata| -> Result<Vec<(OriginSet, Blob<'a>)>> {
if StdInstant::now() > deadline {
if flag.swap(true, Ordering::Relaxed) {
bail!("__timeout_silenced__");
@ -566,7 +815,61 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
let blob_id = md.blob_oid;
let mut raw = repo.find_object(blob_id)?.try_into_blob()?;
let blob = Blob::new(BlobId::from(&blob_id), std::mem::take(&mut raw.data));
let data = std::mem::take(&mut raw.data);
// Try archive extraction if any first-seen path looks like
// a known archive format. We don't need to keep the raw
// archive bytes around — its compressed contents won't
// produce useful matches anyway.
if extract_archives {
let archive_path: Option<String> = md
.first_seen
.iter()
.map(|e| String::from_utf8_lossy(&e.path).to_string())
.find(|p| is_compressed_file(Path::new(p)));
if let Some(archive_path) = archive_path {
match try_extract_git_blob_archive(&archive_path, &data) {
Ok(Some(entries)) => {
let mut out = Vec::with_capacity(entries.len());
for (entry_logical, entry_bytes) in entries {
let entry_suffix =
archive_entry_suffix(&entry_logical, &archive_path);
let origin =
OriginSet::try_from_iter(md.first_seen.iter().map(|e| {
let repo_relative_path =
String::from_utf8_lossy(&e.path).to_string();
let per_appearance_logical = entry_suffix
.map(|suffix| {
format!("{repo_relative_path}{suffix}")
})
.unwrap_or_else(|| entry_logical.clone());
Origin::from_git_repo_with_first_commit(
Arc::clone(&repo_path),
Arc::clone(&e.commit_metadata),
per_appearance_logical,
)
}))
.unwrap_or_else(
|| Origin::from_git_repo(Arc::clone(&repo_path)).into(),
);
out.push((origin, Blob::from_bytes(entry_bytes)));
}
return Ok(out);
}
Ok(None) => { /* not an archive we can crack — fall through */ }
Err(e) => {
debug!(
"Failed to extract git archive blob {} ({}): {e:#}",
blob_id, archive_path
);
// fall through and scan raw bytes
}
}
}
}
let blob = Blob::new(BlobId::from(&blob_id), data);
let origin = OriginSet::try_from_iter(md.first_seen.iter().map(|e| {
Origin::from_git_repo_with_first_commit(
@ -577,14 +880,32 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
}))
.unwrap_or_else(|| Origin::from_git_repo(Arc::clone(&repo_path)).into());
Ok((origin, blob))
Ok(vec![(origin, blob)])
}
};
let timeout_filter = |res: &Result<(OriginSet, Blob)>| -> bool {
// After flat-mapping, errors and successes both flow as
// `Result<(OriginSet, Blob<'a>)>`. Filter out the silenced timeout
// marker before handing items to the scan consumer.
let timeout_filter = |res: &Result<(OriginSet, Blob<'a>)>| -> bool {
!matches!(res, Err(e) if e.to_string() == "__timeout_silenced__")
};
// Convert `Result<Vec<T>>` into a sequential iterator of `Result<T>`,
// suitable for rayon's `flat_map_iter`. A failed load yields a single
// `Err`; a successful load fans out into one item per extracted blob.
// A closure is used (rather than a free function) so the produced
// `Blob<'static>` items can coerce into the iterator's
// `Blob<'a>` Item type — Blob is covariant in its lifetime, but a
// free fn would lose that link.
let fan_out = |res: Result<Vec<(OriginSet, Blob<'a>)>>|
-> Box<dyn Iterator<Item = Result<(OriginSet, Blob<'a>)>> + Send + 'a> {
match res {
Ok(v) => Box::new(v.into_iter().map(Ok)),
Err(e) => Box::new(std::iter::once(Err(e))),
}
};
match self.inner.blobs {
GitBlobSource::Precomputed(blobs) => {
let rs = Arc::clone(&repo_sync);
@ -592,6 +913,7 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
.into_par_iter()
.with_min_len(1024)
.map_init(move || rs.to_thread_local(), load_blob)
.flat_map_iter(fan_out)
.filter(timeout_filter)
.drive_unindexed(consumer)
}
@ -640,6 +962,7 @@ impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
.into_iter()
.par_bridge()
.map_init(move || rs.to_thread_local(), load_blob)
.flat_map_iter(fan_out)
.filter(timeout_filter)
.drive_unindexed(consumer)
}
@ -829,12 +1152,14 @@ impl<'cfg> ParallelBlobIterator for (&'cfg EnumeratorConfig, FoundInput) {
t_start.elapsed().as_secs_f64()
);
// Convert to a blob iterator, then patch the deadline
// Convert to a blob iterator, then patch deadline + extraction.
let extract_archives = cfg.extract_archives;
repo_result
.into_blob_iter() // Option<GitRepoResultIter>
.map(|iter| {
iter.map(|mut gri| {
gri.deadline = Instant::now() + timeout;
gri.extract_archives = extract_archives;
FoundInputIter::GitRepo(gri)
})
})
@ -1165,12 +1490,12 @@ fn reference_candidates(reference: &str) -> Vec<String> {
#[cfg(test)]
mod tests {
use std::fs;
use std::path::Path;
use std::{fs, io::Write};
use super::{
FileResult, GitBlobSource, GitDiffConfig, ParallelBlobIterator, enumerate_git_diff_repo,
reference_candidates,
reference_candidates, try_extract_git_blob_archive,
};
use anyhow::Result;
use bstr::ByteSlice;
@ -1179,6 +1504,7 @@ mod tests {
use rayon::iter::ParallelIterator;
use rusqlite::Connection;
use tempfile::tempdir;
use zip::{CompressionMethod, ZipWriter, write::SimpleFileOptions};
#[test]
fn reference_candidates_for_plain_branch() {
@ -1278,6 +1604,40 @@ mod tests {
Ok(())
}
/// The `!<entry>` tail is returned both when the archive prefix matches
/// exactly and when only the first `!` can be used as the split point.
#[test]
fn archive_entry_suffix_preserves_entry_component() {
    let cases = [
        ("dir/archive.zip!nested/secret.txt", "dir/archive.zip"),
        ("archive.zip!nested/secret.txt", "other/archive.zip"),
    ];
    for (entry_logical, archive_path) in cases {
        assert_eq!(
            super::archive_entry_suffix(entry_logical, archive_path),
            Some("!nested/secret.txt")
        );
    }
}
/// An in-memory ZIP blob labelled `dir/payload.zip` must extract to a single
/// entry whose logical path keeps the full repo-relative archive path.
#[test]
fn git_blob_archive_extraction_preserves_repo_relative_paths() -> Result<()> {
    let mut zip_buf = std::io::Cursor::new(Vec::new());
    {
        let entry_options = SimpleFileOptions::default()
            .compression_method(CompressionMethod::Deflated)
            .unix_permissions(0o644);
        let mut writer = ZipWriter::new(&mut zip_buf);
        writer.start_file("nested/secret.txt", entry_options)?;
        writer.write_all(b"token=not-a-real-secret")?;
        writer.finish()?;
    }
    let archive_bytes = zip_buf.into_inner();

    let extracted = try_extract_git_blob_archive("dir/payload.zip", &archive_bytes)?;
    let entries = extracted.expect("zip blob should extract");

    assert_eq!(entries.len(), 1);
    assert_eq!(entries[0].0, "dir/payload.zip!nested/secret.txt");
    assert_eq!(entries[0].1, b"token=not-a-real-secret");
    Ok(())
}
fn collect_file_bytes(file: FileResult) -> Result<Vec<(std::path::PathBuf, Vec<u8>)>> {
let iter = file.into_blob_iter()?.expect("file result should yield a blob");
iter.collect::<Vec<_>>()

View file

@ -10,7 +10,9 @@ pub fn is_compressed_file(path: &Path) -> bool {
};
// Check for compound extensions first
if filename.ends_with(".tar.gz")
|| filename.ends_with(".tar.gzip")
|| filename.ends_with(".tar.bz2")
|| filename.ends_with(".tar.bzip2")
|| filename.ends_with(".tar.xz")
{
return true;
@ -19,8 +21,10 @@ pub fn is_compressed_file(path: &Path) -> bool {
if let Some(ext) = path.extension().and_then(|s| s.to_str()) {
let ext_lower = ext.to_lowercase();
ext_lower == "gz"
|| ext_lower == "gzip"
|| ext_lower == "tgz"
|| ext_lower == "bz2"
|| ext_lower == "bzip2"
|| ext_lower == "xz"
|| ext_lower == "tar"
|| ext_lower == "zlib"
@ -63,3 +67,22 @@ pub fn is_sqlite_file(path: &Path) -> bool {
pub fn has_sqlite_magic(data: &[u8]) -> bool {
data.len() >= SQLITE_MAGIC.len() && data[..SQLITE_MAGIC.len()] == *SQLITE_MAGIC
}
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::is_compressed_file;

    /// Tar archives wrapped with the spelled-out `.gzip` / `.bzip2` suffixes
    /// must be classified as compressed files.
    #[test]
    fn recognizes_tar_wrapped_long_compression_extensions() {
        for name in ["archive.tar.gzip", "archive.tar.bzip2"] {
            assert!(is_compressed_file(Path::new(name)));
        }
    }

    /// Stand-alone `.gzip` / `.bzip2` files must be classified as compressed files.
    #[test]
    fn recognizes_long_single_compression_extensions() {
        for name in ["payload.gzip", "payload.bzip2"] {
            assert!(is_compressed_file(Path::new(name)));
        }
    }
}

View file

@ -725,7 +725,7 @@ pub async fn run_secret_validation(
let mut by_key: FxHashMap<String, Vec<OwnedBlobMatch>> =
FxHashMap::default();
for om in owned {
by_key.entry(build_cache_key(&om, &dep_vars)).or_default().push(om);
by_key.entry(build_cache_key(&om)).or_default().push(om);
}
let reps: Vec<_> =
by_key.into_iter().map(|(_k, mut v)| (v.remove(0), v)).collect();
@ -859,17 +859,7 @@ async fn validate_single(
validation_retries: u32,
max_body_len: usize,
) {
// Build key
let dep_vars_str = dep_vars
.get(om.rule.id())
.map(|hm| {
let mut sorted: Vec<_> = hm.iter().collect();
sorted.sort_by(|(k, _), (k2, _)| k.cmp(k2));
sorted.into_iter().map(|(k, v)| format!("{}={}", k, v)).collect::<Vec<_>>().join("|")
})
.unwrap_or_default();
let capture0 = om.captures.captures.get(0).map_or(String::new(), |c| c.raw_value().to_string());
let cache_key = format!("{}|{}|{}", om.rule.name(), capture0, dep_vars_str);
let cache_key = build_cache_key(om);
// Check cache first
if let Some(cached) = cache.get(&cache_key) {
om.validation_success = cached.is_valid;
@ -966,24 +956,29 @@ fn is_counted_validation_status(status: StatusCode) -> bool {
!matches!(status, StatusCode::CONTINUE | StatusCode::PRECONDITION_REQUIRED)
}
// Helper to compute the cache key for an OwnedBlobMatch
fn build_cache_key(
om: &OwnedBlobMatch,
dep_vars: &FxHashMap<String, Vec<(String, OffsetSpan)>>,
) -> String {
// Build key
let dep_vars_str = dep_vars
.get(om.rule.id())
.map(|hm| {
let mut sorted: Vec<_> = hm.iter().collect();
sorted.sort_by(|(k, _), (k2, _)| k.cmp(k2));
sorted.into_iter().map(|(k, v)| format!("{}={}", k, v)).collect::<Vec<_>>().join("|")
})
.unwrap_or_default();
// For demonstration, we'll do a simplistic approach
// You can adapt from your existing logic
// Helper to compute the cache key for an OwnedBlobMatch.
fn build_cache_key(om: &OwnedBlobMatch) -> String {
let capture0 = om.captures.captures.get(0).map_or(String::new(), |c| c.raw_value().to_string());
format!("{}|{}|{}", om.rule.name(), capture0, dep_vars_str)
let has_context_dependency = om
.rule
.syntax()
.depends_on_rule
.iter()
.flatten()
.any(|dep| !dep.variable.eq_ignore_ascii_case("TOKEN"));
if has_context_dependency {
return format!(
"{}|{}|{}|{}|{}",
om.rule.name(),
capture0,
om.blob_id,
om.matching_input_offset_span.start,
om.matching_input_offset_span.end
);
}
format!("{}|{}", om.rule.name(), capture0)
}
fn maybe_record_access_map(om: &OwnedBlobMatch, collector: Option<&AccessMapCollector>) {
@ -1006,8 +1001,9 @@ fn maybe_record_access_map(om: &OwnedBlobMatch, collector: Option<&AccessMapColl
.map(|(_, value, ..)| value.clone())
.unwrap_or_default();
let mut akid = utils::find_closest_variable(&captures, &secret, "TOKEN", "AKID")
.unwrap_or_default();
let mut akid =
utils::find_closest_variable(&captures, secret.as_str(), "TOKEN", "AKID")
.unwrap_or_default();
if akid.is_empty() {
akid = extract_akid_from_body(&om.validation_response_body).unwrap_or_default();
@ -1031,7 +1027,7 @@ fn maybe_record_access_map(om: &OwnedBlobMatch, collector: Option<&AccessMapColl
.map(|(_, value, ..)| value.clone())
.unwrap_or_default();
let storage_account =
utils::find_closest_variable(&captures, &storage_key, "TOKEN", "AZURENAME")
utils::find_closest_variable(&captures, storage_key.as_str(), "TOKEN", "AZURENAME")
.unwrap_or_default();
let mut storage_account = storage_account;
@ -1086,9 +1082,13 @@ fn maybe_record_access_map(om: &OwnedBlobMatch, collector: Option<&AccessMapColl
.find(|(name, ..)| name == "TOKEN")
.map(|(_, value, ..)| value.clone())
.unwrap_or_default();
let mut organization =
utils::find_closest_variable(&captures, &token, "TOKEN", "AZURE_DEVOPS_ORG")
.unwrap_or_default();
let mut organization = utils::find_closest_variable(
&captures,
token.as_str(),
"TOKEN",
"AZURE_DEVOPS_ORG",
)
.unwrap_or_default();
if organization.is_empty() {
organization = extract_azure_devops_org_from_body(&om.validation_response_body)
.unwrap_or_default();
@ -1105,7 +1105,7 @@ fn maybe_record_access_map(om: &OwnedBlobMatch, collector: Option<&AccessMapColl
.map(|(_, value, ..)| value.clone())
.unwrap_or_default();
let access_key =
utils::find_closest_variable(&captures, &secret_key, "TOKEN", "AKID")
utils::find_closest_variable(&captures, secret_key.as_str(), "TOKEN", "AKID")
.or_else(|| om.dependent_captures.get("AKID").cloned())
.unwrap_or_default();
@ -1119,14 +1119,22 @@ fn maybe_record_access_map(om: &OwnedBlobMatch, collector: Option<&AccessMapColl
.find(|(name, ..)| name == "TOKEN")
.map(|(_, value, ..)| value.clone())
.unwrap_or_default();
let access_key =
utils::find_closest_variable(&captures, &secret_key, "TOKEN", "STS_AKID")
.or_else(|| om.dependent_captures.get("STS_AKID").cloned())
.unwrap_or_default();
let session_token =
utils::find_closest_variable(&captures, &secret_key, "TOKEN", "SECURITY_TOKEN")
.or_else(|| om.dependent_captures.get("SECURITY_TOKEN").cloned())
.unwrap_or_default();
let access_key = utils::find_closest_variable(
&captures,
secret_key.as_str(),
"TOKEN",
"STS_AKID",
)
.or_else(|| om.dependent_captures.get("STS_AKID").cloned())
.unwrap_or_default();
let session_token = utils::find_closest_variable(
&captures,
secret_key.as_str(),
"TOKEN",
"SECURITY_TOKEN",
)
.or_else(|| om.dependent_captures.get("SECURITY_TOKEN").cloned())
.unwrap_or_default();
if !access_key.is_empty() && !secret_key.is_empty() && !session_token.is_empty() {
collector.record_alibaba(

View file

@ -15,7 +15,7 @@ use http::StatusCode;
use liquid::Object;
use liquid_core::{Value, ValueView};
use reqwest::{Client, Url, header, header::HeaderValue, multipart};
use rustc_hash::FxHashMap;
use rustc_hash::{FxHashMap, FxHashSet};
use tokio::{sync::Notify, time};
use tracing::{debug, trace};
@ -260,7 +260,9 @@ type Cache = kingfisher_scanner::validation::Cache;
/// Returns an opaque 64-bit key for internal validation deduplication.
///
/// This is an INTERNAL key used only for validation deduplication within a single scan.
/// It uses `captures.get(0)` to get the primary secret value.
/// It uses `captures.get(0)` to get the primary secret value. Rules with dependent
/// variables also include blob location because validation can depend on nearby context
/// such as an AWS access-key ID paired with a secret access key.
///
/// **Important**: This is distinct from the EXTERNAL `finding_fingerprint` used for:
/// - Baseline comparisons across scans
@ -279,6 +281,13 @@ fn validation_dedup_key(m: &OwnedBlobMatch) -> u64 {
if let Some(val) = capture_value {
val.hash(&mut hasher);
}
if !m.rule.syntax().depends_on_rule.is_empty() {
m.blob_id.hash(&mut hasher);
m.matching_input_offset_span.start.hash(&mut hasher);
m.matching_input_offset_span.end.hash(&mut hasher);
}
let key = hasher.finish();
trace!(
@ -693,7 +702,7 @@ async fn timed_validate_single_match<'a>(
validate_jwt_rule(m, &captured_values, use_lax_tls, clients.allow_internal_ips).await;
}
Some(Validation::AWS) => {
validate_aws_rule(m, &captured_values, cache).await;
validate_aws_rule(m, &captured_values, dependent_variables, cache).await;
}
Some(Validation::GCP) => {
validate_gcp_rule(m, &globals, cache).await;
@ -1186,7 +1195,7 @@ async fn validate_azure_storage(
.map(|(_, v, ..)| v.clone())
.unwrap_or_default();
let storage_account =
utils::find_closest_variable(captured_values, &storage_key, "TOKEN", "AZURENAME")
utils::find_closest_variable(captured_values, storage_key.as_str(), "TOKEN", "AZURENAME")
.unwrap_or_default();
if storage_account.is_empty() || storage_key.is_empty() {
@ -1391,6 +1400,7 @@ async fn validate_jwt_rule(
async fn validate_aws_rule(
m: &mut OwnedBlobMatch,
captured_values: &[(String, String, usize, usize)],
dependent_variables: &FxHashMap<String, Vec<(String, OffsetSpan)>>,
cache: &Cache,
) {
let secret = captured_values
@ -1398,10 +1408,8 @@ async fn validate_aws_rule(
.find(|(n, ..)| n == "TOKEN")
.map(|(_, v, ..)| v.clone())
.unwrap_or_default();
let akid =
utils::find_closest_variable(captured_values, &secret, "TOKEN", "AKID").unwrap_or_default();
if akid.is_empty() || secret.is_empty() {
if secret.is_empty() {
m.validation_success = false;
m.validation_response_body =
validation_body::from_string("Missing AWS access-key ID or secret.".to_string());
@ -1409,77 +1417,169 @@ async fn validate_aws_rule(
return;
}
let cache_key = aws::generate_aws_cache_key(&akid, &secret);
if let Some(cached) = cache.get(&cache_key) {
let c = cached.value();
if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
m.validation_success = c.is_valid;
m.validation_response_body = c.body.clone();
m.validation_response_status = c.status;
return;
}
}
let akid_candidates = aws_akid_candidates(
captured_values,
dependent_variables.get("AKID"),
m.matching_input_offset_span,
&secret,
);
if let Some(account_id) = aws::should_skip_aws_validation(&akid) {
m.validation_success = false;
m.validation_response_body = validation_body::from_string(format!(
"(skip list entry) AWS validation not attempted for account {}.",
account_id
));
m.validation_response_status = StatusCode::PRECONDITION_REQUIRED;
cache.insert(
cache_key,
CachedResponse {
body: m.validation_response_body.clone(),
status: m.validation_response_status,
is_valid: m.validation_success,
timestamp: Instant::now(),
},
);
return;
}
if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
if akid_candidates.is_empty() {
m.validation_success = false;
m.validation_response_body =
validation_body::from_string(format!("Invalid AWS credentials ({}): {}", akid, e));
validation_body::from_string("Missing AWS access-key ID or secret.".to_string());
m.validation_response_status = StatusCode::BAD_REQUEST;
return;
}
match aws::validate_aws_credentials(&akid, &secret).await {
Ok((ok, msg)) => {
m.validation_success = ok;
if ok {
let mut body = format!("{} --- ARN: {}", akid, msg);
if let Ok(acct) = aws::aws_key_to_account_number(&akid) {
body.push_str(&format!(" --- AWS Account Number: {:012}", acct));
let mut last_body = None;
let mut last_status = StatusCode::UNAUTHORIZED;
for akid in akid_candidates {
let cache_key = aws::generate_aws_cache_key(&akid, &secret);
if let Some(cached) = cache.get(&cache_key) {
let c = cached.value();
if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
if c.is_valid {
m.validation_success = c.is_valid;
m.validation_response_body = c.body.clone();
m.validation_response_status = c.status;
return;
}
m.validation_response_body = validation_body::from_string(body);
m.validation_response_status = StatusCode::OK;
} else {
m.validation_response_body = validation_body::from_string(format!(
"AWS validation error ({}): {}",
akid, msg
));
m.validation_response_status = StatusCode::UNAUTHORIZED;
last_body = Some(c.body.clone());
last_status = c.status;
continue;
}
}
if let Some(account_id) = aws::should_skip_aws_validation(&akid) {
let body = validation_body::from_string(format!(
"(skip list entry) AWS validation not attempted for account {}.",
account_id
));
cache.insert(
cache_key,
CachedResponse {
body: m.validation_response_body.clone(),
status: m.validation_response_status,
is_valid: m.validation_success,
body: body.clone(),
status: StatusCode::PRECONDITION_REQUIRED,
is_valid: false,
timestamp: Instant::now(),
},
);
last_body = Some(body);
last_status = StatusCode::PRECONDITION_REQUIRED;
continue;
}
Err(e) => {
m.validation_success = false;
m.validation_response_body =
validation_body::from_string(format!("AWS validation error ({}): {}", akid, e));
m.validation_response_status = StatusCode::BAD_GATEWAY;
if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
let body =
validation_body::from_string(format!("Invalid AWS credentials ({}): {}", akid, e));
cache.insert(
cache_key,
CachedResponse {
body: body.clone(),
status: StatusCode::BAD_REQUEST,
is_valid: false,
timestamp: Instant::now(),
},
);
last_body = Some(body);
last_status = StatusCode::BAD_REQUEST;
continue;
}
match aws::validate_aws_credentials(&akid, &secret).await {
Ok((ok, msg)) => {
if ok {
m.validation_success = true;
let mut body = format!("{} --- ARN: {}", akid, msg);
if let Ok(acct) = aws::aws_key_to_account_number(&akid) {
body.push_str(&format!(" --- AWS Account Number: {:012}", acct));
}
m.validation_response_body = validation_body::from_string(body);
m.validation_response_status = StatusCode::OK;
cache.insert(
cache_key,
CachedResponse {
body: m.validation_response_body.clone(),
status: m.validation_response_status,
is_valid: true,
timestamp: Instant::now(),
},
);
return;
}
let body = validation_body::from_string(format!(
"AWS validation error ({}): {}",
akid, msg
));
cache.insert(
cache_key,
CachedResponse {
body: body.clone(),
status: StatusCode::UNAUTHORIZED,
is_valid: false,
timestamp: Instant::now(),
},
);
last_body = Some(body);
last_status = StatusCode::UNAUTHORIZED;
}
Err(e) => {
last_body = Some(validation_body::from_string(format!(
"AWS validation error ({}): {}",
akid, e
)));
last_status = StatusCode::BAD_GATEWAY;
}
}
}
m.validation_success = false;
m.validation_response_body = last_body.unwrap_or_else(|| {
validation_body::from_string("AWS validation failed for all nearby access-key IDs.")
});
m.validation_response_status = last_status;
}
/// Builds the ordered list of access-key IDs to try together with `secret`.
///
/// * The value `utils::find_closest_variable` reports for the match's own
///   captures (if any) is ranked at distance 0.
/// * Every entry of `dependent_akids` is ranked by its `dependency_distance`
///   from `target_span`.
///
/// Candidates are sorted by ascending distance with a stable sort, so entries
/// with equal distance keep their insertion order (the capture-derived value
/// first). Duplicate values keep only their first, i.e. closest, occurrence,
/// and at most 64 distinct values are returned.
fn aws_akid_candidates(
    captured_values: &[(String, String, usize, usize)],
    dependent_akids: Option<&Vec<(String, OffsetSpan)>>,
    target_span: OffsetSpan,
    secret: &str,
) -> Vec<String> {
    // Upper bound on the number of distinct access-key IDs handed back.
    const MAX_CANDIDATES: usize = 64;

    let mut ranked: Vec<(usize, String)> = Vec::new();
    if let Some(closest) = utils::find_closest_variable(captured_values, secret, "TOKEN", "AKID") {
        ranked.push((0, closest));
    }
    for (value, span) in dependent_akids.into_iter().flatten() {
        ranked.push((dependency_distance(*span, target_span), value.clone()));
    }
    // Stable sort: ties stay in insertion order.
    ranked.sort_by_key(|entry| entry.0);

    let mut seen = FxHashSet::default();
    let mut unique = Vec::new();
    for (_, value) in ranked {
        if unique.len() == MAX_CANDIDATES {
            break;
        }
        if seen.insert(value.clone()) {
            unique.push(value);
        }
    }
    unique
}
/// Returns the gap between `span` and `target_span` in offset units.
///
/// * `span` ends at or before the target's start: `target_span.start - span.end`.
/// * `span` starts at or after the target's end: `span.start - target_span.end`.
/// * Otherwise the spans overlap and the distance is 0.
fn dependency_distance(span: OffsetSpan, target_span: OffsetSpan) -> usize {
    // `checked_sub` yields `Some` exactly when the subtrahend does not exceed the
    // minuend, which is the same condition as the corresponding `<=` / `>=` test.
    target_span
        .start
        .checked_sub(span.end)
        .or_else(|| span.start.checked_sub(target_span.end))
        .unwrap_or(0)
}
@ -1746,6 +1846,47 @@ mod tests {
assert_eq!(selected.1, OffsetSpan::from_range(70..80));
}
/// The AKID paired with the match's own captures must come first, the remaining
/// candidates must follow in order of increasing distance from the target span,
/// and a value that appears in both sources must be reported only once.
#[test]
fn aws_akid_candidates_orders_by_proximity_and_deduplicates() {
    let target = OffsetSpan::from_range(100..140);

    // Dependent AKIDs, deliberately listed out of proximity order; the last
    // one repeats the value also present in the match's own captures.
    let dependent_akids = vec![
        ("far_before".to_string(), OffsetSpan::from_range(10..20)),
        ("near_after".to_string(), OffsetSpan::from_range(150..160)),
        ("overlap".to_string(), OffsetSpan::from_range(110..120)),
        ("closest_capture".to_string(), OffsetSpan::from_range(80..90)),
    ];
    let captured_values = vec![
        ("TOKEN".to_string(), "secret".to_string(), 100usize, 140usize),
        ("AKID".to_string(), "closest_capture".to_string(), 80usize, 90usize),
    ];

    let ordered =
        aws_akid_candidates(&captured_values, Some(&dependent_akids), target, "secret");

    assert_eq!(ordered, vec!["closest_capture", "overlap", "near_after", "far_before"]);
}
/// With 70 distinct dependent AKIDs and no captures, only the 64 closest to the
/// target span may be returned, nearest first.
#[test]
fn aws_akid_candidates_caps_unique_candidates() {
    // Every span lies before the target (1_000..1_010), so a higher index
    // means a smaller distance: `akid69` is nearest, `akid0` farthest.
    let mut dependent_akids = Vec::with_capacity(70);
    for i in 0..70 {
        dependent_akids.push((format!("akid{i}"), OffsetSpan::from_range((i * 2)..(i * 2 + 1))));
    }

    let capped = aws_akid_candidates(
        &[],
        Some(&dependent_akids),
        OffsetSpan::from_range(1_000..1_010),
        "secret",
    );

    assert_eq!(capped.len(), 64);
    assert_eq!(capped[0], "akid69");
    assert_eq!(capped[63], "akid6");
}
#[test]
fn truncate_to_char_boundary_handles_multibyte_characters() {
let max_len = 2048;

View file

@ -153,7 +153,7 @@ pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String,
pub fn find_closest_variable(
captures: &[(String, String, usize, usize)],
target_value: &String,
target_value: &str,
target_variable_name: &str,
search_variable_name: &str,
) -> Option<String> {
@ -161,7 +161,7 @@ pub fn find_closest_variable(
// compare relative offsets with candidate variables.
let mut target_positions = Vec::new();
for (name, value, start, end) in captures {
if name == target_variable_name && value == target_value {
if name == target_variable_name && value.as_str() == target_value {
target_positions.push((*start, *end));
}
}
@ -346,8 +346,7 @@ mod tests {
("AKID".to_string(), "following".to_string(), 180usize, 200usize),
];
let result =
find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap();
let result = find_closest_variable(&captures, "secret", "TOKEN", "AKID").unwrap();
assert_eq!(result, "preceding".to_string());
}
@ -359,8 +358,7 @@ mod tests {
("AKID".to_string(), "after".to_string(), 60usize, 80usize),
];
let result =
find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap();
let result = find_closest_variable(&captures, "secret", "TOKEN", "AKID").unwrap();
assert_eq!(result, "after".to_string());
}

View file

@ -21,7 +21,11 @@ fn smoke_scan_tar_gz_archive() -> anyhow::Result<()> {
let mut t = Builder::new(gz);
let data = format!("token={github_pat}\n");
t.append_data(&mut tar::Header::new_gnu(), "secret.txt", data.as_bytes())?;
let mut header = tar::Header::new_gnu();
header.set_size(data.len() as u64);
header.set_mode(0o644);
header.set_cksum();
t.append_data(&mut header, "secret.txt", data.as_bytes())?;
t.into_inner()?.finish()?;
}