kingfisher/src/decompress.rs
2026-05-28 21:01:44 -07:00

1264 lines
49 KiB
Rust
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use std::{
fs,
io::{BufReader, Read, Write},
path::{Component, Path, PathBuf},
};
use anyhow::Result;
use asar::AsarReader;
use bzip2_rs::DecoderReader;
use cfb::CompoundFile;
use flate2::read::{DeflateDecoder, GzDecoder, ZlibDecoder};
use lzma_rs::xz_decompress;
use memmap2::Mmap;
use tar::Archive;
use tempfile::{TempDir, tempdir};
use uuid::Uuid;
use zip::ZipArchive;
/// Formats that are basically a ZIP container.
///
/// Each entry is a file extension without the leading dot, written in
/// lowercase. `is_zip_format` compares against these entries exactly.
pub const ZIP_BASED_FORMATS: &[&str] = &[
"zip", "zipx", "jar", "war", "ear", "aar", "apk", "aab", "ipa", "jmod", "jhm", "jnlp", "nupkg",
"vsix", "xap", "docx", "xlsx", "pptx", "odt", "ods", "odp", "odg", "odf", "epub", "gadget",
"kmz", "widget", "xpi", "sketch", "pages", "key", "numbers", "hwpx",
];
/// Reports whether `path` names a tar archive wrapped in a single-stream
/// compressor, judged purely by its file-name suffix (ASCII case-insensitive).
///
/// Returns `false` when the path has no file name or the name is not valid
/// UTF-8.
fn is_tar_wrapped_compression(path: &Path) -> bool {
    const TAR_WRAPPED_SUFFIXES: [&str; 6] =
        [".tgz", ".tar.gz", ".tar.gzip", ".tar.bz2", ".tar.bzip2", ".tar.xz"];

    let Some(name) = path.file_name().and_then(|s| s.to_str()) else {
        return false;
    };
    let lowered = name.to_ascii_lowercase();
    TAR_WRAPPED_SUFFIXES.iter().any(|suffix| lowered.ends_with(suffix))
}
/// Result of one decompression step, held either in memory or on disk.
///
/// The archive variants pair a logical entry name with the entry's content
/// (bytes or an on-disk path). The extractors in this module build the logical
/// name as `<archive path>!<entry path>`.
#[derive(Debug)]
pub enum CompressedContent {
/// Decompressed content fully in memory.
Raw(Vec<u8>),
/// Decompressed content streamed to a file on disk.
RawFile(PathBuf),
/// Archive entries fully in memory (original approach).
Archive(Vec<(String, Vec<u8>)>),
/// Archive entries each extracted to a file on disk (streaming approach).
ArchiveFiles(Vec<(String, PathBuf)>),
}
/// Decides whether an archive entry path may be joined onto an extraction root.
///
/// A path is accepted only when it is relative and none of its components is a
/// parent-directory (`..`), root-directory, or prefix component.
pub fn is_safe_extract_path(path: &Path) -> bool {
    // Entry names must stay relative to the extraction root and must never
    // climb out of it through a parent-directory component.
    !path.is_absolute()
        && !path.components().any(|part| {
            matches!(part, Component::ParentDir | Component::Prefix(_) | Component::RootDir)
        })
}
/// Reports whether `path` contains a `..` component anywhere, or a prefix
/// component in any position other than the first.
///
/// Unlike [`is_safe_extract_path`], absolute paths are accepted here.
fn has_parent_or_embedded_prefix(path: &Path) -> bool {
    path.components().enumerate().any(|(position, part)| match part {
        Component::ParentDir => true,
        // A leading prefix is a normal start of a path; one in the middle is not.
        Component::Prefix(_) => position > 0,
        _ => false,
    })
}
/// Tells whether `ext` (an extension without the dot) is listed in
/// [`ZIP_BASED_FORMATS`]. The comparison is exact, so a differently cased
/// extension does not match.
fn is_zip_format(ext: &str) -> bool {
    ZIP_BASED_FORMATS.contains(&ext)
}
/* ───────────────────────────────────────────────────────────────
helpers for streaming archives
───────────────────────────────────────────────────────────── */
/// Streams every regular-file entry of a tar archive into `base_dir`.
///
/// Each extracted entry is reported as `(logical_path, on_disk_path)`, where
/// the logical path is `<archive_path>!<path inside the tar>`. Entries whose
/// path fails [`is_safe_extract_path`] are skipped with a warning. Entries
/// whose directory or file cannot be created, or whose copy fails, are skipped
/// with a debug log.
///
/// # Errors
/// Fails if the entry iterator cannot be created, if advancing to an entry
/// fails, or if an entry's path cannot be read.
fn handle_tar_archive_streaming(
    file: &mut fs::File,
    archive_path: &Path,
    base_dir: &Path,
) -> Result<CompressedContent> {
    let mut archive = Archive::new(file);
    let mut extracted = Vec::new();
    for item in archive.entries()? {
        let mut item = item?;
        // Only regular files are extracted; every other entry type is ignored.
        if !item.header().entry_type().is_file() {
            continue;
        }
        let path_in_tar = item.path()?.to_string_lossy().to_string();
        if !is_safe_extract_path(Path::new(&path_in_tar)) {
            tracing::warn!("unsafe tar path: {path_in_tar}");
            continue;
        }
        let logical_path = format!("{}!{}", archive_path.display(), path_in_tar);
        let target = base_dir.join(&path_in_tar);
        if let Some(dir) = target.parent() {
            if let Err(e) = fs::create_dir_all(dir) {
                tracing::debug!("failed to create directory {}: {}", dir.display(), e);
                continue;
            }
        }
        let mut sink = match fs::File::create(&target) {
            Ok(created) => created,
            Err(e) => {
                tracing::debug!("failed to create file {}: {}", target.display(), e);
                continue;
            }
        };
        if let Err(e) = std::io::copy(&mut item, &mut sink) {
            tracing::debug!("failed to extract {}: {}", target.display(), e);
            continue;
        }
        extracted.push((logical_path, target));
    }
    Ok(CompressedContent::ArchiveFiles(extracted))
}
/// Maximum compressed archive size that the in-memory ZIP extractor will
/// accept. [`extract_zip_archive_in_memory`] rejects larger archives with an
/// error, so callers can fall back to the disk-streaming path and we
/// never hold both the archive bytes AND every decompressed entry in RAM
/// simultaneously. The threshold is intentionally generous — most committed
/// `.jar`/`.zip`/`.apk` artifacts in real repos are well under 64 MB.
pub const MAX_INMEM_ZIP_ARCHIVE_BYTES: usize = 64 * 1024 * 1024;
/// Aggregate cap on total decompressed bytes the in-memory ZIP extractor
/// will accumulate per archive. Bounds the worst-case footprint of one
/// rayon worker processing one archive: with `num_jobs` workers running
/// in parallel, peak resident memory is bounded by `num_jobs * this`.
/// Independent of the per-entry cap, so a single bomb-style entry can't
/// drain it all but neither can N medium-sized entries.
pub const MAX_INMEM_ZIP_DECOMPRESSED_BYTES: u64 = 256 * 1024 * 1024;
/// Extract every file entry in a ZIP-based archive directly from a byte
/// slice, without touching the filesystem. Intended for the git-blob
/// scan path where blobs already sit in memory and writing them out to a
/// temp file just to read them back imposes substantial overhead in
/// monorepos with many committed `.jar`/`.zip`/`.apk` artifacts.
///
/// `archive_label` is used to construct logical entry paths of the form
/// `<archive_label>!<entry_name>`, matching the convention used by the
/// streaming-to-disk path.
///
/// Each entry read is limited to the smaller of the per-entry cap and what is
/// left of [`MAX_INMEM_ZIP_DECOMPRESSED_BYTES`], so ZIP bombs cannot allocate
/// unbounded memory. An entry that fills its limit is kept (possibly
/// truncated) and a warning naming it is logged. Entries that cannot be
/// opened or decompressed, non-file entries, and entries whose name fails
/// [`is_safe_extract_path`] are skipped.
///
/// # Errors
/// Fails when `data` is larger than [`MAX_INMEM_ZIP_ARCHIVE_BYTES`] or when
/// the archive cannot be opened.
pub fn extract_zip_archive_in_memory(
    data: &[u8],
    archive_label: &str,
) -> Result<Vec<(String, Vec<u8>)>> {
    if data.len() > MAX_INMEM_ZIP_ARCHIVE_BYTES {
        anyhow::bail!(
            "zip archive {archive_label} is {} bytes, exceeding {} byte in-memory cap",
            data.len(),
            MAX_INMEM_ZIP_ARCHIVE_BYTES
        );
    }
    // Per-entry cap on decompressed bytes: bounds memory cost of zip bombs.
    // Mirrors the disk-streaming variant's cap.
    // nosemgrep: this is the defensive cap — do not flag for missing-limit rules.
    const MAX_ZIP_ENTRY_DECOMPRESSED_BYTES: u64 = 512 * 1024 * 1024;
    let cursor = std::io::Cursor::new(data);
    let mut zip = ZipArchive::new(cursor)?;
    let mut entries = Vec::with_capacity(zip.len());
    let mut total_decompressed: u64 = 0;
    for i in 0..zip.len() {
        if total_decompressed >= MAX_INMEM_ZIP_DECOMPRESSED_BYTES {
            tracing::warn!(
                "in-memory zip {archive_label} exceeded {MAX_INMEM_ZIP_DECOMPRESSED_BYTES} byte aggregate cap at entry {i}/{}; truncating",
                zip.len()
            );
            break;
        }
        let mut zipped_file = match zip.by_index(i) {
            Ok(f) => f,
            Err(e) => {
                tracing::debug!("zip entry {i} read failed: {e}");
                continue;
            }
        };
        if !zipped_file.is_file() {
            continue;
        }
        let name_in_zip = zipped_file.name().to_string();
        // Defense in depth: refuse traversal-style names. The in-memory
        // path never writes to disk, but downstream code may construct
        // file URLs from these strings.
        if !is_safe_extract_path(Path::new(&name_in_zip)) {
            tracing::warn!("unsafe zip entry name in {archive_label}: {name_in_zip}");
            continue;
        }
        // The remaining-budget cap on this read serves two purposes:
        // (1) honor the aggregate budget exactly even if one entry would
        // individually push us over it, and (2) keep the existing
        // per-entry zip-bomb cap of 512 MB as a hard upper bound.
        let remaining = MAX_INMEM_ZIP_DECOMPRESSED_BYTES.saturating_sub(total_decompressed);
        let entry_cap = remaining.min(MAX_ZIP_ENTRY_DECOMPRESSED_BYTES);
        let mut buf = Vec::new();
        let mut limited = (&mut zipped_file).take(entry_cap);
        if let Err(e) = limited.read_to_end(&mut buf) {
            tracing::debug!(
                "failed to decompress zip entry {name_in_zip} from {archive_label}: {e}"
            );
            continue;
        }
        // Compare against the limit that was actually applied. The aggregate
        // budget is smaller than the per-entry constant, so a check against
        // the constant alone could never fire and an entry cut off by the
        // budget would be truncated silently. An entry that fits its limit
        // exactly is also reported, hence the hedged wording.
        if buf.len() as u64 == entry_cap {
            tracing::warn!(
                "zip entry {name_in_zip} in {archive_label} filled its {entry_cap} byte read limit (per-entry cap or remaining aggregate budget); content may be truncated"
            );
        }
        total_decompressed += buf.len() as u64;
        entries.push((format!("{archive_label}!{name_in_zip}"), buf));
    }
    Ok(entries)
}
/// Checks whether `data` starts with one of the standard ZIP signatures.
///
/// Lets callers skip extraction for blobs whose extension claims a ZIP-based
/// format but whose leading bytes say otherwise.
pub fn looks_like_zip(data: &[u8]) -> bool {
    // Local file header, end of central directory (an empty archive), and
    // the spanned-archive / data-descriptor marker.
    const SIGNATURES: [&[u8]; 3] = [b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"];
    SIGNATURES.iter().any(|magic| data.starts_with(magic))
}
/// Streams every file entry of a ZIP-based archive into `base_dir`.
///
/// Each extracted entry is reported as `(logical_path, on_disk_path)`, where
/// the logical path is `<archive_path>!<entry name>`. Extraction is
/// best-effort per entry: an entry that cannot be opened, has an unsafe name,
/// or fails while its directory/file is created or its data copied is logged
/// and skipped, and the remaining entries are still processed.
///
/// Every entry read is limited to the smaller of the per-entry cap and what is
/// left of [`MAX_INMEM_ZIP_DECOMPRESSED_BYTES`]. Once that aggregate budget is
/// used up, the remaining entries are not extracted.
///
/// # Errors
/// Fails only when the archive itself cannot be opened.
fn handle_zip_archive_streaming(
    file: &mut fs::File,
    archive_path: &Path,
    base_dir: &Path,
) -> Result<CompressedContent> {
    // Per-entry cap on decompressed bytes: bounds CPU/disk cost of zip bombs
    // by refusing to read more than this much from any single entry.
    // nosemgrep: this is the defensive cap — do not flag for missing-limit rules.
    const MAX_ZIP_ENTRY_DECOMPRESSED_BYTES: u64 = 512 * 1024 * 1024;
    let mut zip = ZipArchive::new(file)?;
    let mut entries_on_disk = Vec::new();
    let mut total_decompressed: u64 = 0;
    for i in 0..zip.len() {
        if total_decompressed >= MAX_INMEM_ZIP_DECOMPRESSED_BYTES {
            tracing::warn!(
                "zip archive {} exceeded {} byte aggregate cap at entry {i}/{}; truncating",
                archive_path.display(),
                MAX_INMEM_ZIP_DECOMPRESSED_BYTES,
                zip.len()
            );
            break;
        }
        // One unreadable entry must not abort the archive and throw away
        // everything extracted so far; skip it like the in-memory extractor
        // does.
        let mut zipped_file = match zip.by_index(i) {
            Ok(f) => f,
            Err(e) => {
                tracing::debug!(
                    "zip entry {i} in {} read failed: {e}",
                    archive_path.display()
                );
                continue;
            }
        };
        if !zipped_file.is_file() {
            continue;
        }
        let name_in_zip = zipped_file.name().to_string();
        if !is_safe_extract_path(Path::new(&name_in_zip)) {
            tracing::warn!("unsafe zip path: {name_in_zip}");
            continue;
        }
        let logical_path = format!("{}!{}", archive_path.display(), name_in_zip);
        let out_path = base_dir.join(&name_in_zip);
        if let Some(parent) = out_path.parent() {
            if let Err(e) = fs::create_dir_all(parent) {
                tracing::debug!("failed to create directory {}: {}", parent.display(), e);
                continue;
            }
        }
        let mut out_file = match fs::File::create(&out_path) {
            Ok(created) => created,
            Err(e) => {
                tracing::debug!("failed to create file {}: {}", out_path.display(), e);
                continue;
            }
        };
        let remaining = MAX_INMEM_ZIP_DECOMPRESSED_BYTES.saturating_sub(total_decompressed);
        let entry_cap = remaining.min(MAX_ZIP_ENTRY_DECOMPRESSED_BYTES);
        let mut limited = (&mut zipped_file).take(entry_cap);
        let copied = match std::io::copy(&mut limited, &mut out_file) {
            Ok(n) => n,
            Err(e) => {
                tracing::debug!("failed to extract {}: {}", out_path.display(), e);
                continue;
            }
        };
        total_decompressed += copied;
        // Only reachable when the per-entry cap is the binding limit; when the
        // aggregate budget binds instead, the warning below reports it.
        if copied == entry_cap && entry_cap == MAX_ZIP_ENTRY_DECOMPRESSED_BYTES {
            tracing::warn!(
                "zip entry {} exceeded {} byte cap; truncating",
                out_path.display(),
                MAX_ZIP_ENTRY_DECOMPRESSED_BYTES
            );
        }
        entries_on_disk.push((logical_path, out_path));
        if total_decompressed >= MAX_INMEM_ZIP_DECOMPRESSED_BYTES {
            tracing::warn!(
                "zip archive {} reached {} byte aggregate cap; truncating remaining entries",
                archive_path.display(),
                MAX_INMEM_ZIP_DECOMPRESSED_BYTES
            );
            break;
        }
    }
    Ok(CompressedContent::ArchiveFiles(entries_on_disk))
}
/// Reads every stream of an HWP (Hancom Word Processor) file into memory.
///
/// HWP 5.x uses the Microsoft Compound File Binary (OLE2/CFBF) container.
/// Body streams (e.g. `BodyText/Section*`) are typically raw DEFLATE
/// without a zlib header, others may be zlib-framed, and metadata
/// streams are plaintext UTF-16/ASCII. Each stream is therefore tried as
/// DEFLATE, then as zlib, and kept as raw bytes when neither decoder yields
/// output, so the scanner always sees content.
///
/// `path` is the file that is opened; `archive_path` is only used to build the
/// logical names `<archive_path>!<stream path>`. Streams that cannot be opened
/// or read are skipped with a debug log.
///
/// # Errors
/// Fails when the file cannot be opened or is not a readable compound file.
fn handle_hwp_archive_in_memory(path: &Path, archive_path: &Path) -> Result<CompressedContent> {
    // Per-stream caps to defend against malformed or hostile HWP containers
    // (huge CFB streams or deflate bombs). Raw bytes and decoded output are
    // capped independently so a small compressed payload can't fan out to
    // gigabytes.
    // nosemgrep: this is the defensive cap we want — do not flag for
    // "magic number" or missing-limit rules, it *is* the limit.
    const MAX_HWP_RAW_BYTES: u64 = 64 * 1024 * 1024;
    const MAX_HWP_DECODED_BYTES: u64 = 512 * 1024 * 1024;

    /// Drains `decoder`; yields its output only when decoding finished
    /// without error and produced at least one byte.
    fn decode_all(mut decoder: impl Read) -> Option<Vec<u8>> {
        let mut buf = Vec::new();
        match decoder.read_to_end(&mut buf) {
            Ok(_) if !buf.is_empty() => Some(buf),
            _ => None,
        }
    }

    let file = safe_open_for_read(path)?;
    let mut cf = CompoundFile::open(file)?;
    // Collect the stream paths first so the walk is finished before streams
    // are opened one by one.
    let stream_paths: Vec<PathBuf> =
        cf.walk().filter(|e| e.is_stream()).map(|e| e.path().to_path_buf()).collect();
    let mut out = Vec::with_capacity(stream_paths.len());
    for sp in stream_paths {
        let stream = match cf.open_stream(&sp) {
            Ok(opened) => opened,
            Err(e) => {
                tracing::debug!("failed to open hwp stream {}: {}", sp.display(), e);
                continue;
            }
        };
        let mut raw = Vec::new();
        if let Err(e) = stream.take(MAX_HWP_RAW_BYTES).read_to_end(&mut raw) {
            tracing::debug!("failed to read hwp stream {}: {}", sp.display(), e);
            continue;
        }
        let decoded = if raw.is_empty() {
            raw
        } else {
            let inflated =
                decode_all(DeflateDecoder::new(&raw[..]).take(MAX_HWP_DECODED_BYTES)).or_else(
                    || decode_all(ZlibDecoder::new(&raw[..]).take(MAX_HWP_DECODED_BYTES)),
                );
            inflated.unwrap_or(raw)
        };
        let logical = format!("{}!{}", archive_path.display(), sp.display());
        out.push((logical, decoded));
    }
    Ok(CompressedContent::Archive(out))
}
/// Lists the files of an ASAR archive held in `buffer`, copying each into
/// memory.
///
/// Entries are named `<archive_path>!<inner path>` with backslashes in the
/// inner path normalized to `/`. Entries whose inner path fails
/// [`is_safe_extract_path`] are skipped with a warning, and each entry is
/// truncated to `MAX_ASAR_ENTRY_BYTES`.
///
/// A buffer that cannot be parsed as ASAR is not an error: the failure is
/// logged at debug level and an empty archive is returned.
fn handle_asar_archive_in_memory(buffer: &[u8], archive_path: &Path) -> Result<CompressedContent> {
    // Per-entry cap: ASAR files have an index listing arbitrary sizes, and
    // a malformed or hostile archive could claim a single multi-GB entry.
    // We cap each entry independently even though the outer buffer is
    // already size-limited, to avoid ever copying a giant slice.
    // nosemgrep: this is the defensive cap — do not flag for missing-limit rules.
    const MAX_ASAR_ENTRY_BYTES: usize = 512 * 1024 * 1024;
    let reader = match AsarReader::new(buffer, None) {
        Ok(reader) => reader,
        Err(e) => {
            // Stay best-effort, but leave a trace of why nothing was extracted.
            tracing::debug!("failed to parse asar archive {}: {}", archive_path.display(), e);
            return Ok(CompressedContent::Archive(Vec::new()));
        }
    };
    let mut contents = Vec::new();
    for (path_in_asar, file) in reader.files() {
        let inner_path = path_in_asar.to_string_lossy().replace('\\', "/");
        if !is_safe_extract_path(Path::new(&inner_path)) {
            tracing::warn!("unsafe asar path: {inner_path}");
            continue;
        }
        let logical_path = format!("{}!{}", archive_path.display(), inner_path);
        let data = file.data();
        let take = data.len().min(MAX_ASAR_ENTRY_BYTES);
        if take < data.len() {
            tracing::warn!(
                "asar entry {} exceeded {} byte cap; truncating",
                inner_path,
                MAX_ASAR_ENTRY_BYTES
            );
        }
        contents.push((logical_path, data[..take].to_vec()));
    }
    Ok(CompressedContent::Archive(contents))
}
/// Writes in-memory archive entries to files under `base_dir`.
///
/// The on-disk location of an entry is the part of its name after the first
/// `!` (the whole name when there is no `!`), with backslashes turned into
/// `/`. Entries whose resulting path fails [`is_safe_extract_path`] are
/// skipped with a warning.
///
/// # Errors
/// Fails on the first directory-creation or file-write error.
fn materialize_in_memory_archive_entries(
    files: &[(String, Vec<u8>)],
    base_dir: &Path,
) -> Result<()> {
    for (name, data) in files {
        let inner = match name.split_once('!') {
            Some((_, sub)) => sub,
            None => name.as_str(),
        };
        let normalized_rel = inner.replace('\\', "/");
        if !is_safe_extract_path(Path::new(&normalized_rel)) {
            tracing::warn!("unsafe archive path: {normalized_rel}");
            continue;
        }
        let target = base_dir.join(&normalized_rel);
        if let Some(dir) = target.parent() {
            fs::create_dir_all(dir)?;
        }
        fs::write(&target, data)?;
    }
    Ok(())
}
/// Validate and open a file for reading, checking for path traversal attacks.
fn safe_open_for_read(path: &Path) -> Result<fs::File> {
if has_parent_or_embedded_prefix(path) {
anyhow::bail!("unsafe input path during decompression: {}", path.display());
}
Ok(fs::File::open(path)?)
}
/// Validate and create a file for writing, checking for path traversal attacks.
fn safe_create_for_write(path: &Path) -> Result<fs::File> {
if has_parent_or_embedded_prefix(path) {
anyhow::bail!("unsafe output path during decompression: {}", path.display());
}
Ok(fs::File::create(path)?)
}
/// Hard cap (512 MiB) on the number of bytes a single-stream decompressor
/// (gzip/bzip2/xz/zlib) will write to disk for one input. Mirrors the per-entry
/// decompressed cap enforced by the ZIP extractors so a small "compression
/// bomb" cannot expand without limit and exhaust the scanner's temporary
/// filesystem. Output past the cap is dropped and a truncation warning logged.
///
/// This is the cap `decompress_file` passes along; callers needing a different
/// limit use `decompress_file_with_single_stream_cap`.
// nosemgrep: this is the defensive cap — do not flag for missing-limit rules.
pub const MAX_SINGLE_STREAM_DECOMPRESSED_BYTES: u64 = 512 * 1024 * 1024;
/// `Write` adaptor that forwards at most a fixed number of bytes to the
/// wrapped writer and silently discards the rest.
///
/// Used to bound single-stream decompressors so a high-ratio compression bomb
/// can't fill the disk. Bytes past the cap are still reported as written, so
/// the decoder driving this writer runs to completion instead of erroring or
/// looping. Whether anything was discarded is exposed through
/// [`CappedWriter::truncated`], matching the truncate-and-warn behavior of the
/// ZIP entry extractor.
struct CappedWriter<W: Write> {
    /// Destination for the bytes that fit under the cap.
    inner: W,
    /// Bytes that may still be forwarded to `inner`.
    remaining: u64,
    /// Set once any byte has been discarded.
    truncated: bool,
}

impl<W: Write> CappedWriter<W> {
    /// Wraps `inner`, allowing at most `cap` bytes through.
    fn new(inner: W, cap: u64) -> Self {
        Self { inner, remaining: cap, truncated: false }
    }

    /// Returns `true` if at least one byte was dropped because of the cap.
    fn truncated(&self) -> bool {
        self.truncated
    }
}

impl<W: Write> Write for CappedWriter<W> {
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        // Split the input into the part that still fits and the overflow.
        let budget = usize::try_from(self.remaining).unwrap_or(usize::MAX);
        let (kept, dropped) = buf.split_at(buf.len().min(budget));
        if !kept.is_empty() {
            self.inner.write_all(kept)?;
            self.remaining -= kept.len() as u64;
        }
        self.truncated |= !dropped.is_empty();
        // Claim the whole buffer even when its tail was discarded so that
        // `io::copy`/`xz_decompress` don't treat the cap as a write error.
        Ok(buf.len())
    }

    fn flush(&mut self) -> std::io::Result<()> {
        self.inner.flush()
    }
}
/// Drains `decoder` into a newly created file at `out_path`, keeping at most
/// `cap` bytes.
///
/// Output beyond the cap is discarded and reported with a warning. The result
/// is always `CompressedContent::RawFile(out_path)`.
///
/// # Errors
/// Fails when the output file cannot be created or when reading from the
/// decoder or writing to the file fails.
fn stream_to_file_capped<R: Read>(
    mut decoder: R,
    out_path: &Path,
    cap: u64,
) -> Result<CompressedContent> {
    let mut sink = CappedWriter::new(safe_create_for_write(out_path)?, cap);
    std::io::copy(&mut decoder, &mut sink)?;
    if sink.truncated() {
        tracing::warn!(
            "decompressed stream written to {} exceeded {cap} byte cap; truncating",
            out_path.display()
        );
    }
    Ok(CompressedContent::RawFile(out_path.to_path_buf()))
}
/// Decompresses the xz file at `path` into a newly created file at `out_path`,
/// keeping at most `cap` bytes.
///
/// Output beyond the cap is discarded and reported with a warning. The result
/// is always `CompressedContent::RawFile(out_path)`.
///
/// # Errors
/// Fails when the input cannot be opened, the output cannot be created, or
/// the xz decoder reports an error.
fn stream_xz_to_file_capped(path: &Path, out_path: &Path, cap: u64) -> Result<CompressedContent> {
    // Open the input before creating the output, so an unreadable input does
    // not leave an empty output file behind.
    let mut source = BufReader::new(safe_open_for_read(path)?);
    let mut sink = CappedWriter::new(safe_create_for_write(out_path)?, cap);
    xz_decompress(&mut source, &mut sink)?;
    if sink.truncated() {
        tracing::warn!(
            "decompressed xz stream written to {} exceeded {cap} byte cap; truncating",
            out_path.display()
        );
    }
    Ok(CompressedContent::RawFile(out_path.to_path_buf()))
}
/* ───────────────────────────────────────────────────────────────
one *step* of decompression
───────────────────────────────────────────────────────────── */
fn decompress_once_with_single_stream_cap(
path: &Path,
base_dir: Option<&Path>,
single_stream_cap: u64,
) -> Result<CompressedContent> {
let extension = path.extension().and_then(|ext| ext.to_str()).map(|s| s.to_ascii_lowercase());
let mut file = safe_open_for_read(path)?;
if let Some(ext) = extension.as_deref() {
match ext {
"asar" => {
let mmap = unsafe { Mmap::map(&file)? };
return handle_asar_archive_in_memory(&mmap, path);
}
"hwp" => {
return handle_hwp_archive_in_memory(path, path);
}
"egg" => {
// No open-source EGG (ALZip) extractor exists. Return the
// raw bytes so plaintext content inside the container is
// still scanned.
let mut buffer = Vec::new();
file.read_to_end(&mut buffer)?;
return Ok(CompressedContent::Raw(buffer));
}
"tar" => {
if let Some(base) = base_dir {
return handle_tar_archive_streaming(&mut file, path, base);
} else {
let temp = tempdir()?;
return handle_tar_archive_streaming(&mut file, path, temp.path());
}
}
_ if is_zip_format(ext) => {
if let Some(base) = base_dir {
return handle_zip_archive_streaming(&mut file, path, base);
} else {
let temp = tempdir()?;
return handle_zip_archive_streaming(&mut file, path, temp.path());
}
}
"gz" | "gzip" | "tgz" => {
let out_path = make_output_path(path, base_dir, "decomp.tar");
let decoder = GzDecoder::new(BufReader::new(safe_open_for_read(path)?));
return stream_to_file_capped(decoder, &out_path, single_stream_cap);
}
"bz2" | "bzip2" => {
let out_path = make_output_path(path, base_dir, "decomp.tar");
let decoder = DecoderReader::new(BufReader::new(safe_open_for_read(path)?));
return stream_to_file_capped(decoder, &out_path, single_stream_cap);
}
"xz" => {
let out_path = make_output_path(path, base_dir, "decomp.tar");
return stream_xz_to_file_capped(path, &out_path, single_stream_cap);
}
"zlib" => {
let out_path = make_output_path(path, base_dir, "decomp.tar");
let decoder = ZlibDecoder::new(BufReader::new(safe_open_for_read(path)?));
return stream_to_file_capped(decoder, &out_path, single_stream_cap);
}
_ => {}
}
}
// Unknown extension -- just read the bytes
let mut buffer = Vec::new();
file.read_to_end(&mut buffer)?;
Ok(CompressedContent::Raw(buffer))
}
/* ───────────────────────────────────────────────────────────────
public entry point keeps peeling layers
───────────────────────────────────────────────────────────── */
/// Decompresses `path` using the default single-stream cap
/// (`MAX_SINGLE_STREAM_DECOMPRESSED_BYTES`).
///
/// Thin wrapper around `decompress_file_with_single_stream_cap`; `base_dir`
/// is passed through unchanged.
pub fn decompress_file(path: &Path, base_dir: Option<&Path>) -> Result<CompressedContent> {
decompress_file_with_single_stream_cap(path, base_dir, MAX_SINGLE_STREAM_DECOMPRESSED_BYTES)
}
/// Decompresses `path`, peeling compression layers until the content is no
/// longer a tar-wrapped stream.
///
/// Each round runs one decompression step. When the file being processed has a
/// tar-wrapping suffix (see `is_tar_wrapped_compression`) and the step produced
/// a single on-disk file, that file becomes the input of the next round.
/// Anything else is returned as the final result.
///
/// # Errors
/// Propagates the first error from any decompression step.
pub fn decompress_file_with_single_stream_cap(
    path: &Path,
    base_dir: Option<&Path>,
    single_stream_cap: u64,
) -> Result<CompressedContent> {
    let mut current: PathBuf = path.to_path_buf();
    loop {
        let wraps_tar = is_tar_wrapped_compression(&current);
        let step = decompress_once_with_single_stream_cap(&current, base_dir, single_stream_cap)?;
        match step {
            // The step left a tar file on disk: go around again on it.
            CompressedContent::RawFile(inner_tar) if wraps_tar => current = inner_tar,
            finished => return Ok(finished),
        }
    }
}
/// Builds the path a decompressed single stream is written to.
///
/// With a base directory, the result is `<base>/<file stem of path>` with its
/// extension replaced by `extension`. Without one, it is a uniquely named file
/// (process id + random UUID) in the system temporary directory.
fn make_output_path(path: &Path, base: Option<&Path>, extension: &str) -> PathBuf {
    match base {
        Some(dir) => dir.join(path.file_stem().unwrap_or_default()).with_extension(extension),
        None => {
            let unique_name = format!(
                "kingfisher-{}-{}-{}",
                std::process::id(),
                Uuid::new_v4(),
                extension
            );
            std::env::temp_dir().join(unique_name)
        }
    }
}
/// Decompresses `path` into a fresh temporary directory and returns the
/// content together with the directory guard.
///
/// Logical entry names that start with the intermediate
/// `<temp dir>/<stem>.decomp.tar` file are rewritten to start with the original
/// `path` instead. In-memory archive entries are additionally written to files
/// under the temporary directory.
///
/// # Errors
/// Fails when the temporary directory cannot be created, when decompression
/// fails, or when in-memory entries cannot be written out.
pub fn decompress_file_to_temp(path: &Path) -> Result<(CompressedContent, TempDir)> {
    let temp_dir = tempdir()?;
    let mut content = decompress_file(path, Some(temp_dir.path()))?;

    // Display form of the intermediate tar file that a peeled compression
    // layer would have been written to inside the temp dir.
    let intermediate_prefix = path
        .file_stem()
        .map(|stem| temp_dir.path().join(stem).with_extension("decomp.tar").display().to_string());

    // Points a logical name back at the original archive when it refers to the
    // intermediate tar file.
    let relabel = |name: &mut String| {
        let Some(prefix) = intermediate_prefix.as_deref() else {
            return;
        };
        let suffix = name
            .strip_prefix(prefix)
            .and_then(|rest| rest.split_once('!'))
            .map(|(_, inner)| inner);
        if let Some(suffix) = suffix {
            *name = format!("{}!{}", path.display(), suffix);
        }
    };

    match &mut content {
        CompressedContent::Archive(files) => {
            files.iter_mut().for_each(|(name, _)| relabel(name));
            materialize_in_memory_archive_entries(files, temp_dir.path())?;
        }
        CompressedContent::ArchiveFiles(entries) => {
            entries.iter_mut().for_each(|(name, _)| relabel(name));
        }
        CompressedContent::Raw(_) | CompressedContent::RawFile(_) => {}
    }
    Ok((content, temp_dir))
}
#[cfg(test)]
mod tests {
use std::{fs::File, io::Write, path::Path};
use flate2::{Compression, write::GzEncoder};
use tar::Builder;
use tempfile::tempdir;
use zip::{CompressionMethod, ZipWriter, write::SimpleFileOptions};
use super::{
CompressedContent, decompress_file_to_temp, materialize_in_memory_archive_entries,
};
/// Test helper: runs a single decompression step with the default
/// single-stream cap (`MAX_SINGLE_STREAM_DECOMPRESSED_BYTES`).
fn decompress_once(path: &Path, base_dir: Option<&Path>) -> anyhow::Result<CompressedContent> {
super::decompress_once_with_single_stream_cap(
path,
base_dir,
super::MAX_SINGLE_STREAM_DECOMPRESSED_BYTES,
)
}
/// 1) Fully unpack a `.tar.gz` in two explicit steps:
/// - 1st decompress `.gz` -- get a `.tar` file
///
/// - 2nd decompress that `.tar` -- get ArchiveFiles
///
/// The extracted `secret.txt` must contain the token written into the archive.
#[test]
fn smoke_decompress_tar_gz_archive() -> anyhow::Result<()> {
let dir = tempdir()?;
let tar_gz = dir.path().join("payload.tar.gz");
let github_pat = "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"; // this is not a real secret
// build payload.tar.gz containing secret.txt
{
let f = File::create(&tar_gz)?;
let gz = GzEncoder::new(f, Compression::default());
let mut tar = Builder::new(gz);
let data = format!("token={github_pat}\n");
let mut hdr = tar::Header::new_gnu();
hdr.set_size(data.len() as u64);
hdr.set_mode(0o644);
hdr.set_cksum();
tar.append_data(&mut hdr, "secret.txt", data.as_bytes())?;
// finish archive + gzip stream
tar.into_inner()?.finish()?;
}
// 1) peel off .gz -- RawFile(tar_path)
let tmp = tempdir()?;
let layer1 = decompress_once(&tar_gz, Some(tmp.path()))?;
let tar_path = match layer1 {
CompressedContent::RawFile(p) => p,
other => panic!("expected RawFile on first pass, got {:?}", other),
};
// 2) unpack the .tar -- ArchiveFiles
let content = decompress_once(&tar_path, Some(tmp.path()))?;
if let CompressedContent::ArchiveFiles(files) = content {
// find secret.txt
let mut found = false;
for (logical, path) in files {
if logical.ends_with("!secret.txt") {
let txt = std::fs::read_to_string(&path)?;
assert!(txt.contains(github_pat));
found = true;
}
}
assert!(found, "did not find secret.txt in ArchiveFiles");
} else {
panic!("expected ArchiveFiles on second pass, got {:?}", content);
}
Ok(())
}
/// A `.tgz` goes through `decompress_file_to_temp` in one call: both layers
/// are peeled, and the logical entry name refers to the original
/// `payload.tgz` rather than the intermediate tar file.
#[test]
fn smoke_decompress_tgz_archive() -> anyhow::Result<()> {
let dir = tempdir()?;
let tgz = dir.path().join("payload.tgz");
let github_pat = "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"; // this is not a real secret
// build payload.tgz containing secret.txt
{
let f = File::create(&tgz)?;
let gz = GzEncoder::new(f, Compression::default());
let mut tar = Builder::new(gz);
let data = format!("token={github_pat}\n");
let mut hdr = tar::Header::new_gnu();
hdr.set_size(data.len() as u64);
hdr.set_mode(0o644);
hdr.set_cksum();
tar.append_data(&mut hdr, "secret.txt", data.as_bytes())?;
tar.into_inner()?.finish()?;
}
// `_tmp` keeps the extraction directory alive while the files are read.
let (content, _tmp) = decompress_file_to_temp(&tgz)?;
if let CompressedContent::ArchiveFiles(files) = content {
let mut found = false;
for (logical, path) in files {
if logical.ends_with("payload.tgz!secret.txt") {
let txt = std::fs::read_to_string(&path)?;
assert!(txt.contains(github_pat));
found = true;
}
}
assert!(found, "did not find secret.txt in tgz ArchiveFiles");
} else {
panic!("expected ArchiveFiles for tgz archive, got {:?}", content);
}
Ok(())
}
/// 2) No-extract flag: just peel the `.gz` layer (no base_dir, so the output
/// lands in a uniquely named file under the system temp directory), and verify
/// you get back a RawFile, whose contents are the tar archive itself.
#[test]
fn smoke_decompress_without_extract_archives() -> anyhow::Result<()> {
let dir = tempdir()?;
let tar_gz = dir.path().join("payload.tar.gz");
let github_pat = "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"; // this is not a real secret
// ── build payload.tar.gz containing secret.txt ──────────────────────────────
{
let f = File::create(&tar_gz)?;
let gz = GzEncoder::new(f, Compression::default());
let mut tar = Builder::new(gz);
let data = format!("token={github_pat}\n");
let mut hdr = tar::Header::new_gnu();
hdr.set_size(data.len() as u64);
hdr.set_mode(0o644);
hdr.set_cksum();
tar.append_data(&mut hdr, "secret.txt", data.as_bytes())?;
// finish archive + gzip stream
tar.into_inner()?.finish()?;
}
// peel only the .gz -- get a RawFile, but do NOT unpack tar
let content = decompress_once(&tar_gz, None)?;
match content {
CompressedContent::RawFile(path) => {
// ensure the file exists and contains the tar header or our secret name
let data = std::fs::read(&path)?;
let as_str = String::from_utf8_lossy(&data);
assert!(
as_str.contains("secret.txt") || data.windows(5).any(|w| w == b"ustar"),
"raw file isnt a tar archive"
);
}
other => panic!("expected RawFile, got {:?}", other),
}
Ok(())
}
/// A `.zip` with one deflated entry in a sub-directory is extracted in a
/// single step to ArchiveFiles, and the entry keeps its nested path in the
/// logical name.
#[test]
fn smoke_decompress_zip_archive() -> anyhow::Result<()> {
let dir = tempdir()?;
let zip_path = dir.path().join("payload.zip");
let github_pat = "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"; // this is not a real secret
// build payload.zip containing nested/secret.txt
{
let file = File::create(&zip_path)?;
let mut zip = ZipWriter::new(file);
let options = SimpleFileOptions::default()
.compression_method(CompressionMethod::Deflated)
.unix_permissions(0o644);
zip.start_file("nested/secret.txt", options)?;
zip.write_all(format!("token={github_pat}\n").as_bytes())?;
zip.finish()?;
}
let tmp = tempdir()?;
let content = decompress_once(&zip_path, Some(tmp.path()))?;
if let CompressedContent::ArchiveFiles(files) = content {
let mut found = false;
for (logical, path) in files {
if logical.ends_with("!nested/secret.txt") {
let txt = std::fs::read_to_string(&path)?;
assert!(txt.contains(github_pat));
found = true;
}
}
assert!(found, "did not find nested/secret.txt in ArchiveFiles");
} else {
panic!("expected ArchiveFiles for zip archive, got {:?}", content);
}
Ok(())
}
/// Materializing in-memory entries writes the safe entry under the extraction
/// root and writes nothing for entries whose inner path is `..`-relative or
/// absolute.
#[test]
fn materialize_in_memory_archive_entries_skips_unsafe_paths() -> anyhow::Result<()> {
let sandbox = tempdir()?;
let extract_root = sandbox.path().join("extract");
std::fs::create_dir(&extract_root)?;
// Locations outside the extraction root that the hostile entries aim at.
let outside_parent = sandbox.path().join("outside-parent.txt");
let outside_absolute = sandbox.path().join("outside-absolute.txt");
let entries = vec![
("archive.asar!nested/safe.txt".to_string(), b"safe".to_vec()),
("archive.asar!../outside-parent.txt".to_string(), b"bad".to_vec()),
(format!("archive.asar!{}", outside_absolute.display()), b"bad".to_vec()),
];
materialize_in_memory_archive_entries(&entries, &extract_root)?;
assert_eq!(std::fs::read(extract_root.join("nested/safe.txt"))?, b"safe");
assert!(!outside_parent.exists());
assert!(!outside_absolute.exists());
Ok(())
}
/// An ASAR whose header was patched so that one entry lives under `../..`
/// must yield only the safe entry.
#[test]
fn decompress_asar_skips_parent_dir_entries() -> anyhow::Result<()> {
use asar::AsarWriter;
let mut writer = AsarWriter::new();
writer.write_file("safe.txt", b"safe", false)?;
writer.write_file("aa/bb/escape.txt", b"bad", false)?;
let mut archive = Vec::new();
writer.finalize(&mut archive)?;
// Rewrite the directory names "aa"/"bb" in the header to ".." in place.
// Both names have the same length as "..", so the header size and all
// offsets stay valid.
let json_size = u32::from_le_bytes(archive[12..16].try_into().unwrap()) as usize;
let header = &mut archive[16..16 + json_size];
let header_str = std::str::from_utf8(header)?;
assert!(header_str.contains("\"aa\""));
assert!(header_str.contains("\"bb\""));
let patched = header_str.replace("\"aa\"", "\"..\"").replace("\"bb\"", "\"..\"");
assert_eq!(patched.len(), header_str.len());
header.copy_from_slice(patched.as_bytes());
let dir = tempdir()?;
let asar_path = dir.path().join("malicious.asar");
std::fs::write(&asar_path, archive)?;
let (content, _tmp) = decompress_file_to_temp(&asar_path)?;
let CompressedContent::Archive(entries) = content else {
panic!("expected Archive for asar");
};
// Only safe.txt survives; the traversal entry is dropped entirely.
assert_eq!(entries.len(), 1);
assert!(
entries
.iter()
.any(|(name, data)| name.ends_with("!safe.txt") && data.as_slice() == b"safe")
);
assert!(!entries.iter().any(|(name, _)| name.contains("..")));
Ok(())
}
/// 3) Nested archive, peeled one layer per `decompress_once` call:
/// outer.tar.gz ──▶ outer.tar (contains inner.tar.gz) ──▶ inner.tar.gz
/// ──▶ inner.tar (contains secret.txt)
#[test]
fn smoke_decompress_nested_tar_gz_archives() -> anyhow::Result<()> {
use std::{fs::File, io::Read, path::PathBuf};
use flate2::{Compression, write::GzEncoder};
use tar::Builder;
use tempfile::tempdir;
use super::CompressedContent;
let tmp = tempdir()?;
/* ── build INNER tar.gz ──────────────────────────────────────────────── */
let inner_tgz = tmp.path().join("inner.tar.gz");
{
let f = File::create(&inner_tgz)?;
let gz = GzEncoder::new(f, Compression::default());
let mut tar = Builder::new(gz);
let data = b"nested_secret=shh\n";
let mut hdr = tar::Header::new_gnu();
hdr.set_size(data.len() as u64);
hdr.set_mode(0o644);
hdr.set_cksum();
tar.append_data(&mut hdr, "secret.txt", &data[..])?;
tar.into_inner()?.finish()?;
}
/* ── read inner archive into memory so we can embed it ──────────────── */
let mut inner_bytes = Vec::new();
File::open(&inner_tgz)?.read_to_end(&mut inner_bytes)?;
/* ── build OUTER tar.gz that contains the inner .tar.gz ─────────────── */
let outer_tgz = tmp.path().join("outer.tar.gz");
{
let f = File::create(&outer_tgz)?;
let gz = GzEncoder::new(f, Compression::default());
let mut tar = Builder::new(gz);
let mut hdr = tar::Header::new_gnu();
hdr.set_size(inner_bytes.len() as u64);
hdr.set_mode(0o644);
hdr.set_cksum();
tar.append_data(&mut hdr, "inner.tar.gz", inner_bytes.as_slice())?;
tar.into_inner()?.finish()?;
}
/* ── Layer 1: gunzip outer.tar.gz ───────────────────────────────────── */
let scratch = tempdir()?; // where intermediate layers land
let tar_path = match decompress_once(&outer_tgz, Some(scratch.path()))? {
CompressedContent::RawFile(p) => p,
other => panic!("expected RawFile after gunzip, got {:?}", other),
};
/* ── Layer 2: untar outer.tar -> find inner.tar.gz on disk ─────────── */
let inner_on_disk: PathBuf = match decompress_once(&tar_path, Some(scratch.path()))? {
CompressedContent::ArchiveFiles(files) => files
.into_iter()
.find(|(logical, _)| logical.ends_with("!inner.tar.gz"))
.map(|(_, p)| p)
.expect("inner.tar.gz not found in outer archive"),
other => panic!("expected ArchiveFiles after untar, got {:?}", other),
};
/* ── Layer 3: gunzip inner.tar.gz ───────────────────────────────────── */
let inner_tar = match decompress_once(&inner_on_disk, Some(scratch.path()))? {
CompressedContent::RawFile(p) => p,
other => panic!("expected RawFile after gunzip inner, got {:?}", other),
};
/* ── Layer 4: untar inner.tar -> secret.txt should be present ──────── */
match decompress_once(&inner_tar, Some(scratch.path()))? {
CompressedContent::ArchiveFiles(files) => {
let mut found = false;
for (logical, path) in files {
if logical.ends_with("!secret.txt") {
let txt = std::fs::read_to_string(&path)?;
assert!(txt.contains("nested_secret=shh"), "secret.txt content corrupted");
found = true;
}
}
assert!(found, "secret.txt not extracted from nested archive");
}
other => panic!("expected ArchiveFiles after untar inner, got {:?}", other),
}
Ok(())
}
#[test]
fn smoke_decompress_apk_archive() -> anyhow::Result<()> {
    // An APK is a ZIP container: the `.apk` extension is expected to be
    // recognized and its entries extracted so embedded secrets get scanned.
    let aws_key = "AKIAIOSFODNN7EXAMPLE"; // canonical AWS sample, not real
    let strings_xml_body = format!(
        "<?xml version=\"1.0\"?><resources><string name=\"aws\">{aws_key}</string></resources>"
    );

    // Fixture: a one-entry APK with the sample key in a resource file.
    let dir = tempdir()?;
    let apk_path = dir.path().join("aws_leak.apk");
    {
        let mut zip = ZipWriter::new(File::create(&apk_path)?);
        let options = SimpleFileOptions::default()
            .compression_method(CompressionMethod::Deflated)
            .unix_permissions(0o644);
        zip.start_file("res/values/strings.xml", options)?;
        zip.write_all(strings_xml_body.as_bytes())?;
        zip.finish()?;
    }

    // Extract into a separate scratch directory; on-disk entries are expected.
    let tmp = tempdir()?;
    let files = match decompress_once(&apk_path, Some(tmp.path()))? {
        CompressedContent::ArchiveFiles(files) => files,
        other => panic!("expected ArchiveFiles for apk archive, got {:?}", other),
    };

    // Every entry whose logical name matches must carry the sample key.
    let strings_xml = files
        .iter()
        .filter(|(logical, _)| logical.ends_with("!res/values/strings.xml"));
    let mut hits = 0usize;
    for (_, on_disk) in strings_xml {
        let txt = std::fs::read_to_string(on_disk)?;
        assert!(txt.contains(aws_key));
        hits += 1;
    }
    assert!(hits > 0, "did not find res/values/strings.xml in apk ArchiveFiles");
    Ok(())
}
#[test]
fn smoke_decompress_hwpx_archive() -> anyhow::Result<()> {
    // `.hwpx` is listed among the ZIP-based formats, so the fixture is built
    // as a plain ZIP with a single XML entry carrying a fake token.
    let github_pat = "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"; // this is not a real secret
    let section_xml = format!("<?xml version=\"1.0\"?><doc>token={github_pat}</doc>");

    let dir = tempdir()?;
    let hwpx_path = dir.path().join("document.hwpx");
    {
        let mut zip = ZipWriter::new(File::create(&hwpx_path)?);
        let options = SimpleFileOptions::default()
            .compression_method(CompressionMethod::Deflated)
            .unix_permissions(0o644);
        zip.start_file("Contents/section0.xml", options)?;
        zip.write_all(section_xml.as_bytes())?;
        zip.finish()?;
    }

    // Extract into a scratch directory; entries are expected on disk.
    let tmp = tempdir()?;
    let files = match decompress_once(&hwpx_path, Some(tmp.path()))? {
        CompressedContent::ArchiveFiles(files) => files,
        other => panic!("expected ArchiveFiles for hwpx archive, got {:?}", other),
    };

    let mut found = false;
    for (logical, on_disk) in &files {
        if !logical.ends_with("!Contents/section0.xml") {
            continue;
        }
        let txt = std::fs::read_to_string(on_disk)?;
        assert!(txt.contains(github_pat));
        found = true;
    }
    assert!(found, "did not find Contents/section0.xml in hwpx ArchiveFiles");
    Ok(())
}
#[test]
fn smoke_decompress_hwp_archive() -> anyhow::Result<()> {
    use cfb::CompoundFile;
    use flate2::{Compression, write::ZlibEncoder};

    let github_pat = "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"; // this is not a real secret
    let dir = tempdir()?;
    let hwp_path = dir.path().join("document.hwp");

    // Payload for the body stream: the token text wrapped in zlib framing.
    let zbytes = {
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(format!("body token={github_pat}").as_bytes())?;
        encoder.finish()?
    };

    // Build a minimal CFB with two streams: one plaintext, one zlib-framed.
    // Each stream lives in its own scope so it is dropped before the next
    // one is created and before the compound file is flushed.
    {
        let mut cf = CompoundFile::create(File::create(&hwp_path)?)?;
        cf.create_storage("/BodyText")?;
        {
            let mut doc_info = cf.create_stream("/DocInfo")?;
            doc_info.write_all(format!("metadata token={github_pat}").as_bytes())?;
        }
        {
            let mut section = cf.create_stream("/BodyText/Section0")?;
            section.write_all(&zbytes)?;
        }
        cf.flush()?;
    }

    // No output directory is supplied; in-memory entries are expected.
    match decompress_once(&hwp_path, None)? {
        CompressedContent::Archive(entries) => {
            // True when some entry whose logical name contains `stream_name`
            // also contains the token once decoded as (lossy) UTF-8.
            let stream_has_pat = |stream_name: &str| {
                entries.iter().any(|(logical, bytes)| {
                    logical.contains(stream_name)
                        && String::from_utf8_lossy(bytes).contains(github_pat)
                })
            };
            assert!(
                stream_has_pat("DocInfo"),
                "plaintext DocInfo stream missing or not decoded"
            );
            assert!(
                stream_has_pat("Section0"),
                "zlib-framed BodyText/Section0 stream missing or not decoded"
            );
        }
        other => panic!("expected Archive for hwp, got {:?}", other),
    }
    Ok(())
}
#[test]
fn smoke_decompress_egg_raw() -> anyhow::Result<()> {
    let github_pat = "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"; // this is not a real secret

    // Fixture: a `.egg` file that is just text with a fake token in it.
    let dir = tempdir()?;
    let egg_path = dir.path().join("archive.egg");
    std::fs::write(&egg_path, format!("EGG-pretend-header\ntoken={github_pat}\n"))?;

    // The test expects the bytes back as in-memory `Raw` content.
    let content = decompress_once(&egg_path, None)?;
    if let CompressedContent::Raw(bytes) = content {
        let text = String::from_utf8_lossy(&bytes);
        assert!(text.contains(github_pat), "raw egg bytes did not contain the embedded pat");
    } else {
        panic!("expected Raw for egg, got {:?}", content);
    }
    Ok(())
}
#[test]
fn capped_writer_drops_bytes_past_cap() {
    use std::io::Write;
    use super::CappedWriter;

    // Oversized write: the writer must report full consumption even though
    // the tail is dropped, flag truncation, and pass only 40 bytes through.
    {
        let mut sink = Vec::new();
        let mut writer = CappedWriter::new(&mut sink, 40);
        let accepted = writer.write(&[0u8; 100]).unwrap();
        assert_eq!(accepted, 100);
        assert!(writer.truncated());
        writer.flush().unwrap();
        assert_eq!(sink.len(), 40);
    }

    // Write within the cap: everything passes through, nothing is flagged.
    {
        let mut sink = Vec::new();
        let mut writer = CappedWriter::new(&mut sink, 40);
        let accepted = writer.write(&[0u8; 10]).unwrap();
        assert_eq!(accepted, 10);
        assert!(!writer.truncated());
        assert_eq!(sink.len(), 10);
    }
}
#[test]
fn stream_to_file_capped_truncates_oversized_stream() -> anyhow::Result<()> {
    use std::io::Cursor;
    use super::{CompressedContent, stream_to_file_capped};

    let dir = tempdir()?;
    let out_path = dir.path().join("out.bin");

    // Feed a "decompressed" stream far larger than the cap, mirroring a
    // small compression bomb: only `cap` bytes should ever reach disk.
    let oversized = Cursor::new(vec![b'A'; 8192]);
    let capped_file = match stream_to_file_capped(oversized, &out_path, 128)? {
        CompressedContent::RawFile(p) => p,
        other => panic!("expected RawFile, got {other:?}"),
    };

    let written = std::fs::metadata(&capped_file)?.len();
    assert_eq!(written, 128, "output must be capped at the byte budget");
    Ok(())
}
}