diff --git a/CHANGELOG.md b/CHANGELOG.md index 21a7424..0804eb7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to this project will be documented in this file. +## [v1.76.0] +- Fixed validation deduplication for rules with nested unnamed captures (e.g. `(?...(ABC|DEF)...)`) to use the primary capture for grouping, ensuring each unique match triggers a separate validation request. +- Added trace-level (`-vv`) logging for internal validation dedup keys and grouping to aid debugging. +- Switched compression dependencies to pure-Rust bzip2/lzma implementations and pared zip features to avoid C-based codecs for bz2/xz handling. + ## [v1.75.0] - Enhanced Access Map View: added fingerprint display, enabled searching by fingerprint, and implemented bidirectional navigation between Findings and Access Map nodes. - Added Slack Access Map support with granular permissions in the tree view. diff --git a/Cargo.toml b/Cargo.toml index d47352e..896bfe5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.75.0" +version = "1.76.0" description = "MongoDB's blazingly fast and accurate secret scanning and validation tool" edition.workspace = true rust-version.workspace = true @@ -150,10 +150,10 @@ tree_magic_mini = "3.2" content_inspector = "0.2.4" rustc-hash = "2.1.1" term_size = "0.3.2" -bzip2 = "0.5.2" -zip = "2.4.2" +bzip2-rs = "0.1.2" +zip = { version = "2.4.2", default-features = false, features = ["deflate", "deflate64", "time"] } tar = "0.4.44" -xz2 = "0.1.7" +lzma-rs = "0.3.0" asar = "0.3.0" blake3 = "1.8.2" memchr = "2.7" diff --git a/data/rules/netrc.yml b/data/rules/netrc.yml index 6b6be12..950e1da 100644 --- a/data/rules/netrc.yml +++ b/data/rules/netrc.yml @@ -10,6 +10,10 @@ rules: password \s+ ([^\s]+) min_entropy: 3.3 confidence: medium + pattern_requirements: + ignore_if_contains: + - '${' + - '$(' examples: - 'machine api.github.com login ziggy^stardust password 012345abcdef' - | diff --git a/src/decompress.rs b/src/decompress.rs index de04d27..c3bc804 100644 --- a/src/decompress.rs +++ b/src/decompress.rs @@ -1,18 +1,18 @@ use std::{ fs, - io::Read, + io::{BufReader, Read}, path::{Component, Path, PathBuf}, }; use anyhow::Result; use asar::AsarReader; -use bzip2::read::BzDecoder; +use bzip2_rs::DecoderReader; use flate2::read::{GzDecoder, ZlibDecoder}; +use lzma_rs::xz_decompress; use memmap2::Mmap; use tar::Archive; use tempfile::{tempdir, TempDir}; use uuid::Uuid; -use xz2::read::XzDecoder; use zip::ZipArchive; /// Formats that are basically a ZIP container. @@ -175,22 +175,43 @@ fn handle_asar_archive_in_memory(buffer: &[u8], archive_path: &Path) -> Result(mut decoder: R, out_path: &Path) -> Result { - if !is_safe_extract_path(out_path) { - anyhow::bail!("unsafe path during decompression: {}", out_path.display()); +/// Validate and open a file for reading, checking for path traversal attacks. +fn safe_open_for_read(path: &Path) -> Result { + if !is_safe_extract_path(path) { + anyhow::bail!("unsafe input path during decompression: {}", path.display()); } - let mut out_file = fs::File::create(out_path)?; + Ok(fs::File::open(path)?) +} + +/// Validate and create a file for writing, checking for path traversal attacks. +fn safe_create_for_write(path: &Path) -> Result { + if !is_safe_extract_path(path) { + anyhow::bail!("unsafe output path during decompression: {}", path.display()); + } + Ok(fs::File::create(path)?) +} + +fn stream_to_file(mut decoder: R, out_path: &Path) -> Result { + let mut out_file = safe_create_for_write(out_path)?; std::io::copy(&mut decoder, &mut out_file)?; Ok(CompressedContent::RawFile(out_path.to_owned())) } +fn stream_xz_to_file(path: &Path, out_path: &Path) -> Result { + let input = safe_open_for_read(path)?; + let mut reader = BufReader::new(input); + let mut out_file = safe_create_for_write(out_path)?; + xz_decompress(&mut reader, &mut out_file)?; + Ok(CompressedContent::RawFile(out_path.to_owned())) +} + /* ─────────────────────────────────────────────────────────────── one *step* of decompression ───────────────────────────────────────────────────────────── */ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result { let extension = path.extension().and_then(|ext| ext.to_str()).map(|s| s.to_ascii_lowercase()); - let mut file = fs::File::open(path)?; + let mut file = safe_open_for_read(path)?; if let Some(ext) = extension.as_deref() { match ext { @@ -216,22 +237,21 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result { let out_path = make_output_path(path, base_dir, "decomp.tar"); - let decoder = GzDecoder::new(fs::File::open(path)?); + let decoder = GzDecoder::new(BufReader::new(safe_open_for_read(path)?)); return stream_to_file(decoder, &out_path); } "bz2" | "bzip2" => { let out_path = make_output_path(path, base_dir, "decomp.tar"); - let decoder = BzDecoder::new(fs::File::open(path)?); + let decoder = DecoderReader::new(BufReader::new(safe_open_for_read(path)?)); return stream_to_file(decoder, &out_path); } "xz" => { let out_path = make_output_path(path, base_dir, "decomp.tar"); - let decoder = XzDecoder::new(fs::File::open(path)?); - return stream_to_file(decoder, &out_path); + return stream_xz_to_file(path, &out_path); } "zlib" => { let out_path = make_output_path(path, base_dir, "decomp.tar"); - let decoder = ZlibDecoder::new(fs::File::open(path)?); + let decoder = ZlibDecoder::new(BufReader::new(safe_open_for_read(path)?)); return stream_to_file(decoder, &out_path); } _ => {} diff --git a/src/matcher.rs b/src/matcher.rs index 0317214..40444cf 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -96,7 +96,15 @@ impl OwnedBlobMatch { } pub fn from_blob_match(blob_match: BlobMatch) -> Self { - // Get the matching value from capture group 1 (or 0 if not available) + // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility. + // + // This indexing is intentionally different from the internal `validation_dedup_key()` + // (which uses get(0)) to maintain stable external fingerprints. Changing this would break: + // - Historical baselines that rely on fingerprint matching + // - Dedup entries stored in external systems + // + // For rules with nested captures like (?...(ABC)...), this may pick up + // the inner group, but that behavior is now established and must be preserved. let matching_finding = blob_match .captures .captures @@ -1021,8 +1029,8 @@ impl Match { origin_type: &'a str, ) -> Self { let offset_span = owned_blob_match.matching_input_offset_span; - // Extract the matched secret content. Use capture group 1 if it exists, otherwise fall back - // to group 0. + // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility. + // See comment in from_blob_match() for why this differs from validation_dedup_key(). let matching_finding_bytes = owned_blob_match .captures .captures diff --git a/src/reporter.rs b/src/reporter.rs index e7e7e4a..aabd8df 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -228,6 +228,12 @@ impl DetailsReporter { } fn normalized_finding_fingerprint(m: &Match, origin: &OriginSet) -> u64 { + // EXTERNAL FINGERPRINT: Use get(1).or_else(get(0)) for backward compatibility. + // + // This indexing is intentionally different from the internal `validation_dedup_key()` + // (which uses get(0)) to maintain stable external fingerprints and consistent + // reporting output. Changing this would break historical baselines and alter + // finding appearance. let finding_value = m .groups .captures diff --git a/src/scanner/validation.rs b/src/scanner/validation.rs index 1529c9e..24c77e9 100644 --- a/src/scanner/validation.rs +++ b/src/scanner/validation.rs @@ -15,6 +15,7 @@ use liquid::Parser; use reqwest::{Client, StatusCode}; use rustc_hash::FxHashMap; use tokio::{sync::Notify, time::timeout}; +use tracing::trace; use crate::{ access_map::AccessMapRequest, @@ -156,16 +157,35 @@ pub async fn run_secret_validation( if !simple_matches.is_empty() { let mut groups: FxHashMap>> = FxHashMap::default(); for arc_msg in simple_matches { - let secret = arc_msg - .2 - .groups - .captures - .get(1) - .or_else(|| arc_msg.2.groups.captures.get(0)) - .map_or("", |c| c.raw_value()); - groups.entry(format!("{}|{}", arc_msg.2.rule.id(), secret)).or_default().push(arc_msg); + // VALIDATION DEDUP: Use get(0) to get the first/primary capture for grouping. + // + // This differs from fingerprint/reporting code (which uses get(1).or_else(get(0))) + // for backward compatibility reasons - changing fingerprint calculation would break + // historical baselines and dedup entries. + // + // For validation deduplication, we need the PRIMARY secret value to ensure each + // unique secret triggers a separate validation request. Using get(1) first would + // incorrectly pick up inner unnamed groups when patterns have nested captures + // like (?...(ABC|DEF)...), causing all matches to share the same + // validation result. + let secret = arc_msg.2.groups.captures.get(0).map_or("", |c| c.raw_value()); + let group_key = format!("{}|{}", arc_msg.2.rule.id(), secret); + trace!( + rule_id = %arc_msg.2.rule.id(), + secret_value = %secret, + external_fingerprint = arc_msg.2.finding_fingerprint, + validation_group_key = %group_key, + "Grouping finding for validation" + ); + groups.entry(group_key).or_default().push(arc_msg); } + trace!( + total_findings = groups.values().map(|v| v.len()).sum::(), + unique_validation_groups = groups.len(), + "Validation grouping complete (internal dedup)" + ); + let validation_results = DashMap::::new(); let pb = ProgressBar::new(groups.len() as u64).with_message("Validating secrets…"); @@ -195,13 +215,9 @@ pub async fn run_secret_validation( let access_map = access_map.clone(); async move { - let secret = rep_arc - .2 - .groups - .captures - .get(1) - .or_else(|| rep_arc.2.groups.captures.get(0)) - .map_or("", |c| c.raw_value()); + // VALIDATION DEDUP: Use get(0) for the primary secret value. + // See comment above for why this differs from fingerprint/reporting code. + let secret = rep_arc.2.groups.captures.get(0).map_or("", |c| c.raw_value()); let key = format!("{}|{}", rep_arc.2.rule.id(), secret); match val_res.entry(key.clone()) { diff --git a/src/validation.rs b/src/validation.rs index 2076947..ec34fc5 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -16,7 +16,7 @@ use once_cell::sync::{Lazy, OnceCell}; use reqwest::{header, header::HeaderValue, multipart, Client, Url}; use rustc_hash::FxHashMap; use tokio::{sync::Notify, time}; -use tracing::debug; +use tracing::{debug, trace}; use crate::{ location::OffsetSpan, @@ -91,16 +91,38 @@ pub fn set_user_agent_suffix>(suffix: Option) { // Use SkipMap-based cache instead of a mutex-wrapped FxHashMap. type Cache = Arc>; -/// Returns an opaque 64-bit fingerprint for “same secret under the same rule”. -fn secret_fingerprint(m: &OwnedBlobMatch) -> u64 { +/// Returns an opaque 64-bit key for internal validation deduplication. +/// +/// This is an INTERNAL key used only for validation deduplication within a single scan. +/// It uses `captures.get(0)` to get the primary secret value. +/// +/// **Important**: This is distinct from the EXTERNAL `finding_fingerprint` used for: +/// - Baseline comparisons across scans +/// - Deduplication entries in external systems +/// - Reporting output +/// +/// The external fingerprint uses `get(1).or_else(get(0))` for backward compatibility +/// and must remain stable. This internal key can evolve independently. +fn validation_dedup_key(m: &OwnedBlobMatch) -> u64 { let mut hasher = xxhash_rust::xxh3::Xxh3::new(); m.rule.syntax().id.hash(&mut hasher); - // first capture = the secret text itself - if let Some(c0) = m.captures.captures.get(0) { - c0.raw_value().hash(&mut hasher); + // Use the first capture (primary secret) for deduplication. + // Note: capture_value is stored in a variable because it's also used in trace! below. + let capture_value = m.captures.captures.get(0).map(|c| c.raw_value()); + if let Some(val) = capture_value { + val.hash(&mut hasher); } - hasher.finish() + let key = hasher.finish(); + + trace!( + rule_id = %m.rule.syntax().id, + capture_value = ?capture_value, + validation_dedup_key = key, + "Computed internal validation dedup key" + ); + + key } static VALIDATION_CACHE: OnceCell> = OnceCell::new(); @@ -171,11 +193,16 @@ pub fn collect_variables_and_dependencies( if !matching_dependencies.is_empty() { for other_match in matching_dependencies { + // VALIDATION: Use get(0) for the primary capture value when collecting + // dependent variables. This ensures we get the main captured value rather + // than inner unnamed groups from nested captures like (?...(ABC)...). + // + // Note: This differs from fingerprint/reporting code which uses + // get(1).or_else(get(0)) for backward compatibility. let matching_input = other_match .captures .captures - .get(1) - .or_else(|| other_match.captures.captures.get(0)) + .get(0) .expect("Expected at least one capture"); variable_map .entry(dependency.variable.to_uppercase()) @@ -297,7 +324,7 @@ async fn timed_validate_single_match<'a>( // ────────────────────────────────────────────────────────── // 1. process-wide fingerprint de-dup // ────────────────────────────────────────────────────────── - let fp = secret_fingerprint(m); + let fp = validation_dedup_key(m); if let Some(entry) = VALIDATION_CACHE.get_or_init(DashMap::new).get(&fp) { if entry.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) { @@ -453,10 +480,21 @@ async fn timed_validate_single_match<'a>( header_map.insert(name.as_str().to_string(), v.to_string()); } } + + // Render the body template to include in cache key + let rendered_body = + http_validation.request.body.as_ref().and_then(|body_template| { + parser + .parse(body_template) + .ok() + .and_then(|template| template.render(&globals).ok()) + }); + cache_key = httpvalidation::generate_http_cache_key_parts( http_validation.request.method.as_str(), &url, &header_map, + rendered_body.as_deref(), ); if let Some(cached) = cache.get(&cache_key) { let c = cached.value(); diff --git a/src/validation/httpvalidation.rs b/src/validation/httpvalidation.rs index 21aade7..c08c890 100644 --- a/src/validation/httpvalidation.rs +++ b/src/validation/httpvalidation.rs @@ -30,6 +30,7 @@ pub fn generate_http_cache_key_parts( method: &str, url: &Url, headers: &BTreeMap, + body: Option<&str>, ) -> String { let method = method.to_uppercase(); // ensure "get" == "GET" let url = url.as_str(); // canonical form from `reqwest::Url` @@ -49,6 +50,13 @@ pub fn generate_http_cache_key_parts( hasher.update(b"\0"); } + // Include the request body in the cache key if present + if let Some(b) = body { + hasher.update(b"BODY\0"); + hasher.update(b.as_bytes()); + hasher.update(b"\0"); + } + // Hex-encode and prefix so callers can tell this key came from HTTP logic format!("HTTP:{:x}", hasher.finalize()) } @@ -605,4 +613,34 @@ mod tests { "Should correctly identify HTML even with multi-byte characters at boundary" ); } + + #[test] + fn test_cache_key_includes_body() { + let url = Url::from_str("https://example.com/api").unwrap(); + let headers = + BTreeMap::from([("Content-Type".to_string(), "application/json".to_string())]); + + // Same method, url, headers but different bodies should produce different cache keys + let key_no_body = generate_http_cache_key_parts("POST", &url, &headers, None); + let key_body_a = + generate_http_cache_key_parts("POST", &url, &headers, Some(r#"{"value": "abc"}"#)); + let key_body_b = + generate_http_cache_key_parts("POST", &url, &headers, Some(r#"{"value": "xyz"}"#)); + + // All three should be different + assert_ne!( + key_no_body, key_body_a, + "Cache key with body should differ from key without body" + ); + assert_ne!( + key_no_body, key_body_b, + "Cache key with body should differ from key without body" + ); + assert_ne!(key_body_a, key_body_b, "Cache keys with different bodies should be different"); + + // Same body should produce same key + let key_body_a_dup = + generate_http_cache_key_parts("POST", &url, &headers, Some(r#"{"value": "abc"}"#)); + assert_eq!(key_body_a, key_body_a_dup, "Same inputs should produce same cache key"); + } }