From dca955a95c8241c7f1d072010d8c82b8af78b91c Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 10 Nov 2025 18:47:51 -0800 Subject: [PATCH] v1.63.0 --- .github/workflows/ci.yml | 5 +- .github/workflows/release.yml | 5 +- CHANGELOG.md | 5 ++ Cargo.toml | 2 +- data/rules/datadog.yml | 128 +++++++++++++++++++++++----------- src/decompress.rs | 4 +- src/main.rs | 30 ++++---- src/matcher.rs | 121 +++++++++++++++++++++++++++----- src/reporter.rs | 8 ++- src/rules/rule.rs | 2 +- src/validation.rs | 4 ++ src/validation/aws.rs | 4 +- src/validation/utils.rs | 100 ++++++++++++++++++-------- tests/int_base64.rs | 14 ++-- tests/int_slack.rs | 2 +- tests/smoke_archive.rs | 2 +- tests/smoke_baseline.rs | 2 +- tests/smoke_exclude.rs | 2 +- tests/smoke_fs.rs | 2 +- tests/smoke_git.rs | 6 +- 20 files changed, 318 insertions(+), 130 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 81c6bb0..1080ff4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -77,9 +77,10 @@ jobs: C:\vcpkg\downloads C:\vcpkg\archives C:\Users\runneradmin\AppData\Local\vcpkg\archives - key: ${{ runner.os }}-vcpkg-hyperscan-542 + key: vcpkg-${{ runner.os }}-hs-542 restore-keys: | - ${{ runner.os }}-vcpkg- + vcpkg-${{ runner.os }}- + vcpkg- # Ensure downloads dir exists and seed PCRE 8.45 zip from a working mirror - name: Pre-seed PCRE 8.45 for vcpkg (bypass SourceForge redirect) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e8f6222..ed7f8f5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -212,9 +212,10 @@ jobs: C:\vcpkg\downloads C:\vcpkg\archives C:\Users\runneradmin\AppData\Local\vcpkg\archives - key: ${{ runner.os }}-vcpkg-hyperscan-542 + key: vcpkg-${{ runner.os }}-hs-542 restore-keys: | - ${{ runner.os }}-vcpkg- + vcpkg-${{ runner.os }}- + vcpkg- # Ensure downloads dir exists and seed PCRE 8.45 zip from a working mirror - name: Pre-seed PCRE 8.45 for vcpkg (bypass SourceForge 
redirect) diff --git a/CHANGELOG.md b/CHANGELOG.md index 044bede..2ee37c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to this project will be documented in this file. +## [v1.63.0] +- Fixed bug when retrieving some finding values and injecting them as TOKENS in the rule templates +- Improved Datadog rule +- Improved AWS rule + ## [v1.62.0] - Added `pattern_requirements` checks to rules, providing lightweight post-regex character-class validation without lookarounds. See docs/RULES.md for detail - Added an `ignore_if_contains` option to `pattern_requirements` to drop matches containing case-insensitive placeholder words, with tests covering the new behavior. diff --git a/Cargo.toml b/Cargo.toml index 44afc1d..71d9065 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.62.0" +version = "1.63.0" description = "MongoDB's blazingly fast and accurate secret scanning and validation tool" edition.workspace = true rust-version.workspace = true diff --git a/data/rules/datadog.yml b/data/rules/datadog.yml index 6958bf8..67c8a5f 100644 --- a/data/rules/datadog.yml +++ b/data/rules/datadog.yml @@ -1,16 +1,16 @@ rules: - name: Datadog API Key - id: kingfisher.datadog.1 + id: kingfisher.datadog.3 pattern: | - (?xi) + (?xi) \b - datadog + (?:datadog|dd) (?:.|[\n\r]){0,64}? - (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) + (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)? (?:.|[\n\r]){0,32}? 
- \b + \b ( - [a-z0-9]{32} + [A-Za-z0-9]{32} ) \b pattern_requirements: @@ -18,48 +18,96 @@ rules: min_entropy: 3.3 confidence: medium examples: - - datadog-secrettoken-0024a29224affe29d173c0bf99e5a89d + - DD_API_KEY=0024a29224affe29d173c0bf99e5a89d references: - https://docs.datadoghq.com/account_management/api-app-keys/ validation: type: Http content: request: + method: GET + url: https://api.datadoghq.com/api/v1/validate headers: Accept: application/json - DD-API-KEY: '{{ TOKEN }}' - DD-APPLICATION-KEY: '{{ APPKEY }}' - method: GET + DD-API-KEY: "{{ TOKEN }}" response_matcher: - report_response: true - - status: - - 200 - type: StatusMatch - url: https://api.datadoghq.com/api/v2/current_user - depends_on_rule: - - rule_id: kingfisher.datadog.2 - variable: APPKEY + - type: StatusMatch + status: [200] - - name: Datadog Application Secret - id: kingfisher.datadog.2 - pattern: | - (?xi) - \b - datadog - (?:.|[\n\r]){0,64}? - (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) - (?:.|[\n\r]){0,16}? - ( - [a-z0-9]{40} - ) - pattern_requirements: - min_digits: 2 - min_uppercase: 1 - min_lowercase: 1 - min_entropy: 3.3 - confidence: medium - examples: - - datadog_secret_key-3c0c3965368a6b10f7640dbda46abfdca981c2d3 - - datadog_token = BzHpkcs7LujMb3Q1vLRRjbpBNxxYV0ousumYoKJS - references: - - https://docs.datadoghq.com/account_management/api-app-keys/ \ No newline at end of file + # - name: Datadog API Key + # id: kingfisher.datadog.1 + # pattern: | + # (?xi) + # \b + # datadog + # (?:.|[\n\r]){0,64}? + # (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) + # (?:.|[\n\r]){0,32}? 
+ # \b + # ( + # [a-z0-9]{32} + # ) + # \b + # pattern_requirements: + # min_digits: 2 + # min_entropy: 3.3 + # confidence: medium + # examples: + # - datadog-secrettoken-0024a29224affe29d173c0bf99e5a89d + # references: + # - https://docs.datadoghq.com/account_management/api-app-keys/ + # validation: + # type: Http + # content: + # request: + # headers: + # Accept: application/json + # DD-API-KEY: '{{ TOKEN }}' + # DD-APPLICATION-KEY: '{{ APPKEY }}' + # method: GET + # response_matcher: + # - report_response: true + # - status: + # - 200 + # type: StatusMatch + # url: https://api.datadoghq.com/api/v2/current_user + # depends_on_rule: + # - rule_id: kingfisher.datadog.2 + # variable: APPKEY + + # - name: Datadog API Key (API-only validation) + # id: kingfisher.datadog.3 + # pattern: | + # (?xi) + # \b + # (?:datadog|dd) + # (?:.|[\n\r]){0,64}? + # (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)? + # (?:.|[\n\r]){0,32}? + # \b + # ( + # [A-Za-z0-9]{32} + # ) + # \b + # pattern_requirements: + # min_digits: 2 + # min_entropy: 3.3 + # confidence: medium + # examples: + # - DD_API_KEY=0024a29224affe29d173c0bf99e5a89d + # references: + # - https://docs.datadoghq.com/account_management/api-app-keys/ + # validation: + # type: Http + # content: + # request: + # method: GET + # url: https://api.datadoghq.com/api/v1/validate + # headers: + # Accept: application/json + # DD-API-KEY: "{{ TOKEN }}" + # response_matcher: + # - report_response: true + # - type: StatusMatch + # status: [200] diff --git a/src/decompress.rs b/src/decompress.rs index bef07c6..595d11a 100644 --- a/src/decompress.rs +++ b/src/decompress.rs @@ -340,7 +340,7 @@ mod tests { fn smoke_decompress_tar_gz_archive() -> anyhow::Result<()> { let dir = tempdir()?; let tar_gz = dir.path().join("payload.tar.gz"); - let github_pat = "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"; // this is not a real secret + let github_pat = "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"; // this is not a real secret // build payload.tar.gz containing 
secret.txt { @@ -393,7 +393,7 @@ mod tests { fn smoke_decompress_without_extract_archives() -> anyhow::Result<()> { let dir = tempdir()?; let tar_gz = dir.path().join("payload.tar.gz"); - let github_pat = "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"; + let github_pat = "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"; // ── build payload.tar.gz containing secret.txt ────────────────────────────── { diff --git a/src/main.rs b/src/main.rs index b52fcd3..99f0718 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,27 +5,27 @@ // * Fallback - system allocator (`system-alloc` feature) // ──────────────────────────────────────────────────────────── -// --- jemalloc (opt-in) --- -#[cfg(feature = "use-jemalloc")] -#[global_allocator] -static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +// // --- jemalloc (opt-in) --- +// #[cfg(feature = "use-jemalloc")] +// #[global_allocator] +// static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -// --- mimalloc (default) --- -#[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))] -#[global_allocator] -static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; - -// --- system allocator (explicit opt-out) --- -#[cfg(feature = "system-alloc")] -use std::alloc::System; -#[cfg(feature = "system-alloc")] -#[global_allocator] -static GLOBAL: System = System; +// // --- mimalloc (default) --- +// #[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))] +// #[global_allocator] +// static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; +// // --- system allocator (explicit opt-out) --- +// #[cfg(feature = "system-alloc")] // use std::alloc::System; +// #[cfg(feature = "system-alloc")] // #[global_allocator] // static GLOBAL: System = System; +use std::alloc::System; +#[global_allocator] +static GLOBAL: System = System; + use std::{ io::{IsTerminal, Read}, sync::{Arc, Mutex}, diff --git a/src/matcher.rs b/src/matcher.rs index b820233..6dd5048 100644 --- a/src/matcher.rs +++ 
b/src/matcher.rs @@ -374,7 +374,9 @@ impl<'a> Matcher<'a> { } else { None }; + ///////////////////////////// // Process matches + ///////////////////////////// let mut matches = Vec::new(); let owned_ts_results = tree_sitter_result.map(|ts_results| { ts_results @@ -565,9 +567,11 @@ fn record_match( ) -> bool { insert_span(map.entry(rule_id).or_default(), span) } +// in src/matcher.rs + +#[allow(clippy::too_many_arguments)] fn filter_match<'b>( blob: &'b Blob, - // rule: &'b Rule, rule: Arc, re: &Regex, start: usize, @@ -596,14 +600,44 @@ fn filter_match<'b>( for captures in re.captures_iter(haystack) { let full_capture = captures.get(0).unwrap(); - let matching_input = captures.get(1).unwrap_or(full_capture); + + // --- LOGIC TO FIND THE "SECRET" FOR ENTROPY/SAFE-LISTING --- + let matching_input_for_entropy = 'block: { + // 1. Prefer a named capture called TOKEN (case-insensitive). + if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| { + name_opt + .filter(|name| name.eq_ignore_ascii_case("TOKEN")) + .and_then(|_| captures.get(i)) + }) { + break 'block token_cap; + } + + // 2. Otherwise, prefer the first *matched* named capture. + if let Some(named_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| { + name_opt.and_then(|_| captures.get(i)) // find(i > 0 && name_opt.is_some()) + }) { + break 'block named_cap; + } + + // 3. Otherwise, fall back to the first positional capture (group 1). + if let Some(pos_cap) = captures.get(1) { + break 'block pos_cap; + } + + // 4. Finally, fall back to the full match (group 0). 
+ break 'block full_capture; + }; + // --- END LOGIC --- + let min_entropy = rule.min_entropy(); - let mi_bytes = matching_input.as_bytes(); + let entropy_bytes = matching_input_for_entropy.as_bytes(); let full_bytes = full_capture.as_bytes(); - let calculated_entropy = calculate_shannon_entropy(mi_bytes); + let calculated_entropy = calculate_shannon_entropy(entropy_bytes); + + // Check entropy and safe-listing against the *selected* secret bytes if calculated_entropy <= min_entropy - || is_safe_match(mi_bytes) - || is_user_match(mi_bytes, full_bytes) + || is_safe_match(entropy_bytes) + || is_user_match(entropy_bytes, full_bytes) { debug!( "Skipping match with entropy {} <= {} or safe match", @@ -619,7 +653,15 @@ fn filter_match<'b>( captures: &captures, full_match: full_bytes, }; - match char_reqs.validate(mi_bytes, Some(context), respect_ignore_if_contains) { + + // --- FIX IS HERE --- + // + // The `validate` function (and thus `{{ MATCH }}`) should *always* + // operate on the *full match* (group 0), not just the entropy bytes. + // This aligns the scan logic with the unit test's logic. + match char_reqs.validate(full_bytes, Some(context), respect_ignore_if_contains) { + // + // --- END FIX --- PatternValidationResult::Passed => {} PatternValidationResult::Failed => { debug!( @@ -647,6 +689,9 @@ fn filter_match<'b>( } } + // Use the `matching_input_for_entropy` as the span/key for the finding. 
+ let matching_input = matching_input_for_entropy; + let matching_input_offset_span = OffsetSpan::from_range( (start + matching_input.start())..(start + matching_input.end()), ); @@ -668,7 +713,10 @@ fn filter_match<'b>( } let only_matching_input = &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end]; + + // Pass the *full* capture object to from_captures let groups = SerializableCaptures::from_captures(&captures, haystack, re, redact); + matches.push(BlobMatch { rule: Arc::clone(&rule), blob_id: blob.id_ref(), @@ -687,6 +735,7 @@ fn filter_match<'b>( t.end(new_count > 0, new_count, 0); } } + fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap)> { match lang.to_lowercase().as_str() { "bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())), @@ -796,6 +845,7 @@ pub struct SerializableCaptures { #[schemars(with = "Vec")] pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed) } + impl SerializableCaptures { pub fn from_captures( captures: &regex::bytes::Captures, @@ -808,26 +858,51 @@ impl SerializableCaptures { let capture_names: SmallVec<[Option<String>; 4]> = re.capture_names().map(|name| name.map(str::to_string)).collect(); - for i in 0..captures.len() { - if let Some(cap) = captures.get(i) { + // If there are explicit capture groups (e.g., group 1, 2, ...), + // only serialize those. 
+ if captures.len() > 1 { + for i in 1..captures.len() { + // Start from 1 + if let Some(cap) = captures.get(i) { + let value = if redact { + redact_value(&String::from_utf8_lossy(cap.as_bytes())) + } else { + String::from_utf8_lossy(cap.as_bytes()).to_string() + }; + let interned = intern(&value); + let name = capture_names.get(i).and_then(|opt| opt.as_ref()).cloned(); + + serialized_captures.push(SerializableCapture { + name, + match_number: i32::try_from(i).unwrap_or(0), + start: cap.start(), + end: cap.end(), + value: interned, + }); + } + } + } else if captures.len() == 1 { + // ELSE, if there is ONLY the full match (len == 1), + // serialize just that full match (group 0) as the fallback. + if let Some(cap) = captures.get(0) { let value = if redact { redact_value(&String::from_utf8_lossy(cap.as_bytes())) } else { String::from_utf8_lossy(cap.as_bytes()).to_string() }; let interned = intern(&value); - - let name = capture_names.get(i).and_then(|opt| opt.as_ref()).cloned(); + let name = capture_names.get(0).and_then(|opt| opt.as_ref()).cloned(); serialized_captures.push(SerializableCapture { name, - match_number: i32::try_from(i).unwrap_or(0), + match_number: 0, start: cap.start(), end: cap.end(), value: interned, }); } } + // If len == 0 (no match), loop is skipped, empty vec is returned. 
SerializableCaptures { captures: serialized_captures } } @@ -950,7 +1025,8 @@ pub struct DecodedData { } #[inline] fn is_base64_byte(b: u8) -> bool { - matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/') + // Include URL-safe characters '-' and '_' + matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_') } pub fn get_base64_strings(input: &[u8]) -> Vec { @@ -975,7 +1051,14 @@ pub fn get_base64_strings(input: &[u8]) -> Vec { let len = end - start; if len >= 32 && len % 4 == 0 { let base64_slice = &input[start..end]; - if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) { + + // Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD + let decode_result = general_purpose::STANDARD + .decode(base64_slice) + .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice)) + .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice)); + + if let Ok(decoded) = decode_result { if let Ok(decoded_str) = std::str::from_utf8(&decoded) { if decoded_str.is_ascii() { results.push(DecodedData { @@ -1521,10 +1604,10 @@ line2 .map(|cap| (cap.name.as_deref(), cap.match_number, cap.value)) .collect(); - assert_eq!(entries.len(), 4); - assert_eq!(entries[0], (None, 0, "ghp_ABC12")); - assert_eq!(entries[1], (None, 1, "ghp_ABC12")); - assert_eq!(entries[2], (Some("body"), 2, "ABC")); - assert_eq!(entries[3], (Some("checksum"), 3, "12")); + assert_eq!(entries.len(), 3); + + assert_eq!(entries[0], (None, 1, "ghp_ABC12")); + assert_eq!(entries[1], (Some("body"), 2, "ABC")); + assert_eq!(entries[2], (Some("checksum"), 3, "12")); } } diff --git a/src/reporter.rs b/src/reporter.rs index e8e29ba..f1a1679 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -417,15 +417,19 @@ impl DetailsReporter { let source_span = &rm.m.location.source_span; let line_num = source_span.start.line; + // --- FIX IS HERE --- + // We now correctly serialize *only* the explicit capture groups (or group 0 + // as a fallback). 
The primary "secret" is therefore always at index 0 + // of the captures SmallVec. let snippet = Escaped( rm.m.groups .captures - .get(1) - .or_else(|| rm.m.groups.captures.get(0)) + .get(0) // Get the first (and primary) serialized capture .map(|capture| capture.value.as_bytes()) .unwrap_or_default(), ) .to_string(); + // --- END FIX --- let validation_status = if rm.validation_success { "Active Credential".to_string() diff --git a/src/rules/rule.rs b/src/rules/rule.rs index 1c2fb08..4048e35 100644 --- a/src/rules/rule.rs +++ b/src/rules/rule.rs @@ -731,7 +731,7 @@ mod tests { }), }; - let token = b"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"; + let token = b"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"; let regex = BytesRegex::new(r"(?x) ghp_(?P[A-Za-z0-9]{30})(?P[A-Za-z0-9]{6})") .unwrap(); diff --git a/src/validation.rs b/src/validation.rs index 778127c..825e2ef 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -321,6 +321,10 @@ async fn timed_validate_single_match<'a>( for dep in m.rule.syntax().depends_on_rule.iter().flatten() { if let Some(vals) = dependent_variables.get(&dep.variable.to_uppercase()) { for (val, span) in vals { + // Skip adding captured values for TOKEN dependencies + if dep.variable.eq_ignore_ascii_case("TOKEN") { + continue; + } captured_values.push(( dep.variable.to_uppercase(), val.clone(), diff --git a/src/validation/aws.rs b/src/validation/aws.rs index 9c00de0..d1d844a 100644 --- a/src/validation/aws.rs +++ b/src/validation/aws.rs @@ -187,9 +187,7 @@ pub fn validate_aws_credentials_input(access_key_id: &str, secret_key: &str) -> if !access_key_id.chars().all(|c| c.is_ascii_alphanumeric()) { return Err("AWS access key ID contains invalid characters".to_string()); } - if !secret_key.chars().all(|c| c.is_ascii_alphanumeric() || c == '/' || c == '+') { - return Err("AWS secret key contains invalid characters".to_string()); - } + Ok(()) } diff --git a/src/validation/utils.rs b/src/validation/utils.rs index 4db50a3..225e186 100644 --- 
a/src/validation/utils.rs +++ b/src/validation/utils.rs @@ -3,22 +3,46 @@ use tokio::net::lookup_host; use crate::validation::SerializableCaptures; -/// Return (NAME, value, start, end) for every capture we care about. +/// Return (NAME, value, start, end) for the captures we care about. /// -/// * If a capture has a name, use that (upper-cased) -/// * If it’s unnamed, fall back to `"TOKEN"` +/// * Named captures keep their (upper-cased) name +/// * Among unnamed captures, keep **only the first one** and call it "TOKEN" pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String, usize, usize)> { + let mut saw_unnamed = false; + captures .captures .iter() - .map(|cap| { - let name = - cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string()); - (name, cap.value.to_string(), cap.start, cap.end) + .filter_map(|cap| { + if let Some(name) = &cap.name { + Some((name.to_uppercase(), cap.value.to_string(), cap.start, cap.end)) + } else if !saw_unnamed { + saw_unnamed = true; + Some(("TOKEN".to_string(), cap.value.to_string(), cap.start, cap.end)) + } else { + // Ignore any additional unnamed captures (e.g., from unintended groups) + None + } }) .collect() } +// /// Return (NAME, value, start, end) for every capture we care about. +// /// +// /// * If a capture has a name, use that (upper-cased) +// /// * If it’s unnamed, fall back to `"TOKEN"` +// pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String, usize, usize)> { +// captures +// .captures +// .iter() +// .map(|cap| { +// let name = +// cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string()); +// (name, cap.value.to_string(), cap.start, cap.end) +// }) +// .collect() +// } + pub fn find_closest_variable( captures: &[(String, String, usize, usize)], target_value: &String, @@ -108,7 +132,7 @@ pub async fn check_url_resolvable(url: &Url) -> Result<(), Boxbcd)`. + // With our fix, group 0 ("abcde") is NOT serialized. 
+ // We only get the explicit captures (group 1 and "foo"). SerializableCapture { + // This is group 1 (unnamed) name: None, - match_number: 0, - start: 0, - end: 5, - value: "abcde", + match_number: 1, // Corrected match_number + start: 1, + end: 4, + value: "bcd", }, SerializableCapture { + // This is group 2 (named "foo") name: Some("foo".to_string()), - match_number: -1, + match_number: 2, // Corrected match_number start: 1, end: 4, value: "bcd", @@ -152,45 +181,60 @@ mod tests { ], }; let result = process_captures(&captures); + + // --- FIX --- + // The expected result now only contains the explicit captures. + // The first unnamed capture ("bcd") becomes "TOKEN". assert_eq!( result, vec![ - ("TOKEN".to_string(), "abcde".to_string(), 0usize, 5usize), + ("TOKEN".to_string(), "bcd".to_string(), 1usize, 4usize), ("FOO".to_string(), "bcd".to_string(), 1usize, 4usize), ] ); + // --- END FIX --- } + #[test] #[test] fn includes_whole_match_and_unnamed_groups() { let captures = SerializableCaptures { captures: smallvec![ + // --- FIX --- + // This test simulated a regex like `(?P<foo>aa)bb(cc)`. + // With our fix, group 0 ("aabbcc") is NOT serialized. + // We only get the explicit captures ("foo" and group 2). SerializableCapture { - name: None, - match_number: 0, - start: 0, - end: 6, - value: "aabbcc", - }, - SerializableCapture { + // This is group 1 (named "foo") name: Some("foo".to_string()), - match_number: -1, + match_number: 1, // Corrected match_number start: 0, end: 2, value: "aa", }, - SerializableCapture { name: None, match_number: 1, start: 4, end: 6, value: "cc" }, + SerializableCapture { + // This is group 2 (unnamed) + name: None, + match_number: 2, // Corrected match_number + start: 4, + end: 6, + value: "cc" + }, ], }; let result = process_captures(&captures); + + // --- FIX --- + // The expected result no longer contains the full match ("aabbcc"). + // The first (and only) unnamed capture ("cc") is now correctly labeled "TOKEN". 
assert_eq!( result, vec![ - ("TOKEN".to_string(), "aabbcc".to_string(), 0usize, 6usize), - ("FOO".to_string(), "aa".to_string(), 0usize, 2usize), - ("TOKEN".to_string(), "cc".to_string(), 4usize, 6usize), + ("FOO".to_string(), "aa".to_string(), 0usize, 2usize), // From named group 1 + ("TOKEN".to_string(), "cc".to_string(), 4usize, 6usize), // From unnamed group 2 ] ); + // --- END FIX --- } #[test] diff --git a/tests/int_base64.rs b/tests/int_base64.rs index dfb0289..d0fc980 100644 --- a/tests/int_base64.rs +++ b/tests/int_base64.rs @@ -8,8 +8,8 @@ use tempfile::tempdir; fn detects_base64_encoded_secret() -> anyhow::Result<()> { let dir = tempdir()?; let file_path = dir.path().join("secret.txt"); - // Base64 for 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 - let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw=="; + // Base64 for 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 + let encoded = "Z2hwX0Vab3BaRE1XZWlsZGZvRnp5SDBLbld5UTVZeTN2eTBZMlNVNg=="; fs::write(&file_path, encoded)?; Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) @@ -26,7 +26,7 @@ fn detects_base64_encoded_secret() -> anyhow::Result<()> { .assert() .code(200) .stdout( - predicate::str::contains("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") + predicate::str::contains("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") .and(predicate::str::contains("\"encoding\": \"base64\"")), ); @@ -39,7 +39,7 @@ fn detects_base64_encoded_secret() -> anyhow::Result<()> { fn skips_base64_when_disabled() -> anyhow::Result<()> { let dir = tempdir()?; let file_path = dir.path().join("secret.txt"); - let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw=="; + let encoded = "Z2hwX0Vab3BaRE1XZWlsZGZvRnp5SDBLbld5UTVZeTN2eTBZMlNVNg=="; fs::write(&file_path, encoded)?; Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) @@ -92,8 +92,8 @@ fn no_base64_skips_empty_files() -> anyhow::Result<()> { fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> { let dir = tempdir()?; let 
file_path = dir.path().join("secret.py"); - // Base64 for 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 - let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw=="; + // Base64 for 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 + let encoded = "Z2hwX0Vab3BaRE1XZWlsZGZvRnp5SDBLbld5UTVZeTN2eTBZMlNVNg=="; fs::write(&file_path, format!("token = \"{}\"\n", encoded))?; Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) @@ -110,7 +110,7 @@ fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> { .assert() .code(200) .stdout( - predicate::str::contains("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") + predicate::str::contains("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") .and(predicate::str::contains("\"encoding\": \"base64\"")), ); diff --git a/tests/int_slack.rs b/tests/int_slack.rs index c3cacc3..999a4c0 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -159,7 +159,7 @@ async fn test_scan_slack_messages() -> Result<()> { "messages": { "matches": [{ "permalink": "https://example.slack.com/archives/C123/p1234", - "text": "This contains a github token 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", + "text": "This contains a github token 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", "ts": "1234.56", "channel": {"id": "C123", "name": "general"} }], diff --git a/tests/smoke_archive.rs b/tests/smoke_archive.rs index 3e08a0a..ebd524b 100644 --- a/tests/smoke_archive.rs +++ b/tests/smoke_archive.rs @@ -7,7 +7,7 @@ fn smoke_scan_tar_gz_archive() -> anyhow::Result<()> { let dir = tempfile::tempdir()?; let tar_gz = dir.path().join("payload.tar.gz"); - let github_pat = "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"; + let github_pat = "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"; // --- build a payload.tar.gz ------------------------------------------------- { diff --git a/tests/smoke_baseline.rs b/tests/smoke_baseline.rs index 734682c..ea6c644 100644 --- a/tests/smoke_baseline.rs +++ b/tests/smoke_baseline.rs @@ -5,7 +5,7 @@ use clap::Parser; use 
predicates::prelude::*; use tempfile::tempdir; -const GH_PAT: &str = "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"; +const GH_PAT: &str = "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"; #[test] fn manage_baseline_enables_no_dedup() -> anyhow::Result<()> { diff --git a/tests/smoke_exclude.rs b/tests/smoke_exclude.rs index 048c85f..9936899 100644 --- a/tests/smoke_exclude.rs +++ b/tests/smoke_exclude.rs @@ -4,7 +4,7 @@ use assert_cmd::Command; use predicates::prelude::*; use tempfile::tempdir; -const SECRET: &str = "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"; +const SECRET: &str = "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"; #[test] fn exclude_pattern_hides_matches() -> anyhow::Result<()> { diff --git a/tests/smoke_fs.rs b/tests/smoke_fs.rs index 55c45fa..786e184 100644 --- a/tests/smoke_fs.rs +++ b/tests/smoke_fs.rs @@ -5,7 +5,7 @@ use assert_cmd::prelude::*; use predicates::prelude::*; use tempfile::tempdir; -const GITHUB_PAT: &str = "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"; +const GITHUB_PAT: &str = "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"; #[test] fn smoke_scan_filesystem_text_and_binary() -> anyhow::Result<()> { diff --git a/tests/smoke_git.rs b/tests/smoke_git.rs index b260bb9..1336803 100644 --- a/tests/smoke_git.rs +++ b/tests/smoke_git.rs @@ -15,7 +15,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> { // commit v1 let file_path = repo_dir.join("config.yml"); - fs::write(&file_path, b"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")?; + fs::write(&file_path, b"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")?; let mut idx = repo.index()?; idx.add_path(std::path::Path::new("config.yml"))?; let oid1 = idx.write_tree()?; @@ -23,7 +23,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> { repo.commit(Some("HEAD"), &sig, &sig, "init", &tree1, &[])?; // commit v2 (same leak, will test dedup) - fs::write(&file_path, b"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 # unchanged")?; + fs::write(&file_path, b"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 # unchanged")?; 
idx.add_path(std::path::Path::new("config.yml"))?; let oid2 = idx.write_tree()?; let tree2 = repo.find_tree(oid2)?; @@ -44,7 +44,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> { ]) .assert() .code(200) // ← kingfisher’s “findings present” status - .stdout(predicate::str::contains("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")); + .stdout(predicate::str::contains("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")); dir.close()?; Ok(())