v1.63.0

2025-11-10 18:47:51 -08:00 · 2025-11-10 18:47:51 -08:00 · dca955a95c
commit dca955a95c
parent 4433b6e27b
20 changed files with 318 additions and 130 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -77,9 +77,10 @@ jobs:
            C:\vcpkg\downloads
            C:\vcpkg\archives
            C:\Users\runneradmin\AppData\Local\vcpkg\archives
-          key: ${{ runner.os }}-vcpkg-hyperscan-542
+          key: vcpkg-${{ runner.os }}-hs-542
          restore-keys: |
-            ${{ runner.os }}-vcpkg-
+            vcpkg-${{ runner.os }}-
+            vcpkg-

      # Ensure downloads dir exists and seed PCRE 8.45 zip from a working mirror
      - name: Pre-seed PCRE 8.45 for vcpkg (bypass SourceForge redirect)
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -212,9 +212,10 @@ jobs:
            C:\vcpkg\downloads
            C:\vcpkg\archives
            C:\Users\runneradmin\AppData\Local\vcpkg\archives
-          key: ${{ runner.os }}-vcpkg-hyperscan-542
+          key: vcpkg-${{ runner.os }}-hs-542
          restore-keys: |
-            ${{ runner.os }}-vcpkg-
+            vcpkg-${{ runner.os }}-
+            vcpkg-

      # Ensure downloads dir exists and seed PCRE 8.45 zip from a working mirror
      - name: Pre-seed PCRE 8.45 for vcpkg (bypass SourceForge redirect)
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,11 @@

 All notable changes to this project will be documented in this file.

+## [v1.63.0]
+- Fixed a bug in retrieving some finding values and injecting them as `TOKEN` variables into rule templates
+- Improved Datadog rule
+- Improved AWS rule
+
 ## [v1.62.0]
 - Added `pattern_requirements` checks to rules, providing lightweight post-regex character-class validation without lookarounds. See docs/RULES.md for detail
 - Added an `ignore_if_contains` option to `pattern_requirements` to drop matches containing case-insensitive placeholder words, with tests covering the new behavior.
--- a/Cargo.toml
+++ b/Cargo.toml
@ -10,7 +10,7 @@ publish = false

 [package]
 name = "kingfisher"
-version = "1.62.0"
+version = "1.63.0"
 description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
 edition.workspace = true
 rust-version.workspace = true
--- a/data/rules/datadog.yml
+++ b/data/rules/datadog.yml
@ -1,16 +1,16 @@
 rules:
  - name: Datadog API Key
-    id: kingfisher.datadog.1
+    id: kingfisher.datadog.3
    pattern: |
-      (?xi) 
+      (?xi)
      \b
-      datadog
+      (?:datadog|dd)
      (?:.|[\n\r]){0,64}?
-      (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)
+      (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)?
      (?:.|[\n\r]){0,32}?
-      \b                     
+      \b
      (
-        [a-z0-9]{32}
+        [A-Za-z0-9]{32}
      )
      \b
    pattern_requirements:
@ -18,48 +18,96 @@ rules:
    min_entropy: 3.3
    confidence: medium
    examples:
-      - datadog-secrettoken-0024a29224affe29d173c0bf99e5a89d
+      - DD_API_KEY=0024a29224affe29d173c0bf99e5a89d
    references:
      - https://docs.datadoghq.com/account_management/api-app-keys/
    validation:
      type: Http
      content:
        request:
+          method: GET
+          url: https://api.datadoghq.com/api/v1/validate
          headers:
            Accept: application/json
-            DD-API-KEY: '{{ TOKEN }}'
-            DD-APPLICATION-KEY: '{{ APPKEY }}'
-          method: GET
+            DD-API-KEY: "{{ TOKEN }}"
          response_matcher:
            - report_response: true
-            - status:
-                - 200
-              type: StatusMatch
-          url: https://api.datadoghq.com/api/v2/current_user
-    depends_on_rule:
-      - rule_id: kingfisher.datadog.2
-        variable: APPKEY
+            - type: StatusMatch
+              status: [200]

-  - name: Datadog Application Secret
-    id: kingfisher.datadog.2
-    pattern: |
-      (?xi)
-      \b
-      datadog
-      (?:.|[\n\r]){0,64}?
-      (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)
-      (?:.|[\n\r]){0,16}?
-      (
-        [a-z0-9]{40}
-      )
-    pattern_requirements:
-      min_digits: 2
-      min_uppercase: 1
-      min_lowercase: 1
-    min_entropy: 3.3
-    confidence: medium
-    examples:
-      - datadog_secret_key-3c0c3965368a6b10f7640dbda46abfdca981c2d3
-      - datadog_token = BzHpkcs7LujMb3Q1vLRRjbpBNxxYV0ousumYoKJS
-    references:
-      - https://docs.datadoghq.com/account_management/api-app-keys/
+  # - name: Datadog API Key
+  #   id: kingfisher.datadog.1
+  #   pattern: |
+  #     (?xi) 
+  #     \b
+  #     datadog
+  #     (?:.|[\n\r]){0,64}?
+  #     (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)
+  #     (?:.|[\n\r]){0,32}?
+  #     \b                     
+  #     (
+  #       [a-z0-9]{32}
+  #     )
+  #     \b
+  #   pattern_requirements:
+  #     min_digits: 2
+  #   min_entropy: 3.3
+  #   confidence: medium
+  #   examples:
+  #     - datadog-secrettoken-0024a29224affe29d173c0bf99e5a89d
+  #   references:
+  #     - https://docs.datadoghq.com/account_management/api-app-keys/
+  #   validation:
+  #     type: Http
+  #     content:
+  #       request:
+  #         headers:
+  #           Accept: application/json
+  #           DD-API-KEY: '{{ TOKEN }}'
+  #           DD-APPLICATION-KEY: '{{ APPKEY }}'
+  #         method: GET
+  #         response_matcher:
+  #           - report_response: true
+  #           - status:
+  #               - 200
+  #             type: StatusMatch
+  #         url: https://api.datadoghq.com/api/v2/current_user
+  #   depends_on_rule:
+  #     - rule_id: kingfisher.datadog.2
+  #       variable: APPKEY
+
+  # - name: Datadog API Key (API-only validation)
+  #   id: kingfisher.datadog.3
+  #   pattern: |
+  #     (?xi)
+  #     \b
+  #     (?:datadog|dd)
+  #     (?:.|[\n\r]){0,64}?
+  #     (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)?
+  #     (?:.|[\n\r]){0,32}?
+  #     \b
+  #     (
+  #       [A-Za-z0-9]{32}
+  #     )
+  #     \b
+  #   pattern_requirements:
+  #     min_digits: 2
+  #   min_entropy: 3.3
+  #   confidence: medium
+  #   examples:
+  #     - DD_API_KEY=0024a29224affe29d173c0bf99e5a89d
+  #   references:
+  #     - https://docs.datadoghq.com/account_management/api-app-keys/
+  #   validation:
+  #     type: Http
+  #     content:
+  #       request:
+  #         method: GET
+  #         url: https://api.datadoghq.com/api/v1/validate
+  #         headers:
+  #           Accept: application/json
+  #           DD-API-KEY: "{{ TOKEN }}"
+  #         response_matcher:
+  #           - report_response: true
+  #           - type: StatusMatch
+  #             status: [200]
--- a/src/decompress.rs
+++ b/src/decompress.rs
@ -340,7 +340,7 @@ mod tests {
    fn smoke_decompress_tar_gz_archive() -> anyhow::Result<()> {
        let dir = tempdir()?;
        let tar_gz = dir.path().join("payload.tar.gz");
-        let github_pat = "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"; // this is not a real secret
+        let github_pat = "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"; // this is not a real secret

        // build payload.tar.gz containing secret.txt
        {
@ -393,7 +393,7 @@ mod tests {
    fn smoke_decompress_without_extract_archives() -> anyhow::Result<()> {
        let dir = tempdir()?;
        let tar_gz = dir.path().join("payload.tar.gz");
-        let github_pat = "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";
+        let github_pat = "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";

        // ── build payload.tar.gz containing secret.txt ──────────────────────────────
        {
--- a/src/main.rs
+++ b/src/main.rs
@ -5,27 +5,27 @@
 //   * Fallback - system allocator     (`system-alloc` feature)
 // ────────────────────────────────────────────────────────────

-// --- jemalloc (opt-in) ---
-#[cfg(feature = "use-jemalloc")]
-#[global_allocator]
-static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+// // --- jemalloc (opt-in) ---
+// #[cfg(feature = "use-jemalloc")]
+// #[global_allocator]
+// static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

-// --- mimalloc (default) ---
-#[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))]
-#[global_allocator]
-static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
-
-// --- system allocator (explicit opt-out) ---
-#[cfg(feature = "system-alloc")]
-use std::alloc::System;
-#[cfg(feature = "system-alloc")]
-#[global_allocator]
-static GLOBAL: System = System;
+// // --- mimalloc (default) ---
+// #[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))]
+// #[global_allocator]
+// static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;

+// // --- system allocator (explicit opt-out) ---
+// #[cfg(feature = "system-alloc")]
 // use std::alloc::System;
+// #[cfg(feature = "system-alloc")]
 // #[global_allocator]
 // static GLOBAL: System = System;

+use std::alloc::System;
+#[global_allocator]
+static GLOBAL: System = System;
+
 use std::{
    io::{IsTerminal, Read},
    sync::{Arc, Mutex},
--- a/src/matcher.rs
+++ b/src/matcher.rs
@ -374,7 +374,9 @@ impl<'a> Matcher<'a> {
        } else {
            None
        };
+        /////////////////////////////
        // Process matches
+        /////////////////////////////
        let mut matches = Vec::new();
        let owned_ts_results = tree_sitter_result.map(|ts_results| {
            ts_results
@ -565,9 +567,11 @@ fn record_match(
 ) -> bool {
    insert_span(map.entry(rule_id).or_default(), span)
 }
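+/// Applies the entropy, safe-list and pattern-requirement checks to each set of
+/// regex captures found in the haystack and pushes the surviving hits as `BlobMatch`es.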
+#[allow(clippy::too_many_arguments)]
 fn filter_match<'b>(
    blob: &'b Blob,
-    // rule: &'b Rule,
    rule: Arc<Rule>,
    re: &Regex,
    start: usize,
@ -596,14 +600,44 @@ fn filter_match<'b>(

    for captures in re.captures_iter(haystack) {
        let full_capture = captures.get(0).unwrap();
-        let matching_input = captures.get(1).unwrap_or(full_capture);
+
+        // --- LOGIC TO FIND THE "SECRET" FOR ENTROPY/SAFE-LISTING ---
+        let matching_input_for_entropy = 'block: {
+            // 1. Prefer a named capture called TOKEN (case-insensitive).
+            if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
+                name_opt
+                    .filter(|name| name.eq_ignore_ascii_case("TOKEN"))
+                    .and_then(|_| captures.get(i))
+            }) {
+                break 'block token_cap;
+            }
+
+            // 2. Otherwise, prefer the first *matched* named capture.
+            if let Some(named_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
+                name_opt.and_then(|_| captures.get(i)) // find(i > 0 && name_opt.is_some())
+            }) {
+                break 'block named_cap;
+            }
+
+            // 3. Otherwise, fall back to the first positional capture (group 1).
+            if let Some(pos_cap) = captures.get(1) {
+                break 'block pos_cap;
+            }
+
+            // 4. Finally, fall back to the full match (group 0).
+            break 'block full_capture;
+        };
+        // --- END LOGIC ---
+
        let min_entropy = rule.min_entropy();
-        let mi_bytes = matching_input.as_bytes();
+        let entropy_bytes = matching_input_for_entropy.as_bytes();
        let full_bytes = full_capture.as_bytes();
-        let calculated_entropy = calculate_shannon_entropy(mi_bytes);
+        let calculated_entropy = calculate_shannon_entropy(entropy_bytes);
+
+        // Check entropy and safe-listing against the *selected* secret bytes
        if calculated_entropy <= min_entropy
-            || is_safe_match(mi_bytes)
-            || is_user_match(mi_bytes, full_bytes)
+            || is_safe_match(entropy_bytes)
+            || is_user_match(entropy_bytes, full_bytes)
        {
            debug!(
                "Skipping match with entropy {} <= {} or safe match",
@ -619,7 +653,15 @@ fn filter_match<'b>(
                captures: &captures,
                full_match: full_bytes,
            };
-            match char_reqs.validate(mi_bytes, Some(context), respect_ignore_if_contains) {
+
+            // --- FIX IS HERE ---
+            //
+            // The `validate` function (and thus `{{ MATCH }}`) should *always*
+            // operate on the *full match* (group 0), not just the entropy bytes.
+            // This aligns the scan logic with the unit test's logic.
+            match char_reqs.validate(full_bytes, Some(context), respect_ignore_if_contains) {
+                //
+                // --- END FIX ---
                PatternValidationResult::Passed => {}
                PatternValidationResult::Failed => {
                    debug!(
@ -647,6 +689,9 @@ fn filter_match<'b>(
            }
        }

+        // Use the `matching_input_for_entropy` as the span/key for the finding.
+        let matching_input = matching_input_for_entropy;
+
        let matching_input_offset_span = OffsetSpan::from_range(
            (start + matching_input.start())..(start + matching_input.end()),
        );
@ -668,7 +713,10 @@ fn filter_match<'b>(
        }
        let only_matching_input =
            &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
+
+        // Pass the *full* capture object to from_captures
        let groups = SerializableCaptures::from_captures(&captures, haystack, re, redact);
+
        matches.push(BlobMatch {
            rule: Arc::clone(&rule),
            blob_id: blob.id_ref(),
@ -687,6 +735,7 @@ fn filter_match<'b>(
        t.end(new_count > 0, new_count, 0);
    }
 }
+
 fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, String>)> {
    match lang.to_lowercase().as_str() {
        "bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())),
@ -796,6 +845,7 @@ pub struct SerializableCaptures {
    #[schemars(with = "Vec<SerializableCapture>")]
    pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
 }
+
 impl SerializableCaptures {
    pub fn from_captures(
        captures: &regex::bytes::Captures,
@ -808,26 +858,51 @@ impl SerializableCaptures {
        let capture_names: SmallVec<[Option<String>; 4]> =
            re.capture_names().map(|name| name.map(str::to_string)).collect();

-        for i in 0..captures.len() {
-            if let Some(cap) = captures.get(i) {
+        // If there are explicit capture groups (e.g., group 1, 2, ...),
+        // only serialize those.
+        if captures.len() > 1 {
+            for i in 1..captures.len() {
+                // Start from 1
+                if let Some(cap) = captures.get(i) {
+                    let value = if redact {
+                        redact_value(&String::from_utf8_lossy(cap.as_bytes()))
+                    } else {
+                        String::from_utf8_lossy(cap.as_bytes()).to_string()
+                    };
+                    let interned = intern(&value);
+                    let name = capture_names.get(i).and_then(|opt| opt.as_ref()).cloned();
+
+                    serialized_captures.push(SerializableCapture {
+                        name,
+                        match_number: i32::try_from(i).unwrap_or(0),
+                        start: cap.start(),
+                        end: cap.end(),
+                        value: interned,
+                    });
+                }
+            }
+        } else if captures.len() == 1 {
+            // ELSE, if there is ONLY the full match (len == 1),
+            // serialize just that full match (group 0) as the fallback.
+            if let Some(cap) = captures.get(0) {
                let value = if redact {
                    redact_value(&String::from_utf8_lossy(cap.as_bytes()))
                } else {
                    String::from_utf8_lossy(cap.as_bytes()).to_string()
                };
                let interned = intern(&value);
-
-                let name = capture_names.get(i).and_then(|opt| opt.as_ref()).cloned();
+                let name = capture_names.get(0).and_then(|opt| opt.as_ref()).cloned();

                serialized_captures.push(SerializableCapture {
                    name,
-                    match_number: i32::try_from(i).unwrap_or(0),
+                    match_number: 0,
                    start: cap.start(),
                    end: cap.end(),
                    value: interned,
                });
            }
        }
+        // If len == 0 (no match), loop is skipped, empty vec is returned.

        SerializableCaptures { captures: serialized_captures }
    }
@ -950,7 +1025,8 @@ pub struct DecodedData {
 }
 #[inline]
 fn is_base64_byte(b: u8) -> bool {
-    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/')
+    // Accept the standard alphabet ('+', '/') as well as the URL-safe one ('-', '_').
+    b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'-' | b'_')
 }

 pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
@ -975,7 +1051,14 @@ pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
        let len = end - start;
        if len >= 32 && len % 4 == 0 {
            let base64_slice = &input[start..end];
-            if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) {
+
+            // Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD
+            let decode_result = general_purpose::STANDARD
+                .decode(base64_slice)
+                .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
+                .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));
+
+            if let Ok(decoded) = decode_result {
                if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
                    if decoded_str.is_ascii() {
                        results.push(DecodedData {
@ -1521,10 +1604,10 @@ line2
            .map(|cap| (cap.name.as_deref(), cap.match_number, cap.value))
            .collect();

-        assert_eq!(entries.len(), 4);
-        assert_eq!(entries[0], (None, 0, "ghp_ABC12"));
-        assert_eq!(entries[1], (None, 1, "ghp_ABC12"));
-        assert_eq!(entries[2], (Some("body"), 2, "ABC"));
-        assert_eq!(entries[3], (Some("checksum"), 3, "12"));
+        assert_eq!(entries.len(), 3);
+
+        assert_eq!(entries[0], (None, 1, "ghp_ABC12"));
+        assert_eq!(entries[1], (Some("body"), 2, "ABC"));
+        assert_eq!(entries[2], (Some("checksum"), 3, "12"));
    }
 }
--- a/src/reporter.rs
+++ b/src/reporter.rs
@ -417,15 +417,19 @@ impl DetailsReporter {
        let source_span = &rm.m.location.source_span;
        let line_num = source_span.start.line;

+        // --- FIX IS HERE ---
+        // We now correctly serialize *only* the explicit capture groups (or group 0
+        // as a fallback). The primary "secret" is therefore always at index 0
+        // of the captures SmallVec.
        let snippet = Escaped(
            rm.m.groups
                .captures
-                .get(1)
-                .or_else(|| rm.m.groups.captures.get(0))
+                .get(0) // Get the first (and primary) serialized capture
                .map(|capture| capture.value.as_bytes())
                .unwrap_or_default(),
        )
        .to_string();
+        // --- END FIX ---

        let validation_status = if rm.validation_success {
            "Active Credential".to_string()
--- a/src/rules/rule.rs
+++ b/src/rules/rule.rs
@ -731,7 +731,7 @@ mod tests {
            }),
        };

-        let token = b"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";
+        let token = b"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";
        let regex =
            BytesRegex::new(r"(?x) ghp_(?P<body>[A-Za-z0-9]{30})(?P<checksum>[A-Za-z0-9]{6})")
                .unwrap();
--- a/src/validation.rs
+++ b/src/validation.rs
@ -321,6 +321,10 @@ async fn timed_validate_single_match<'a>(
    for dep in m.rule.syntax().depends_on_rule.iter().flatten() {
        if let Some(vals) = dependent_variables.get(&dep.variable.to_uppercase()) {
            for (val, span) in vals {
+                // Skip adding captured values for TOKEN dependencies
+                if dep.variable.eq_ignore_ascii_case("TOKEN") {
+                    continue;
+                }
                captured_values.push((
                    dep.variable.to_uppercase(),
                    val.clone(),
--- a/src/validation/aws.rs
+++ b/src/validation/aws.rs
@ -187,9 +187,7 @@ pub fn validate_aws_credentials_input(access_key_id: &str, secret_key: &str) ->
    if !access_key_id.chars().all(|c| c.is_ascii_alphanumeric()) {
        return Err("AWS access key ID contains invalid characters".to_string());
    }
-    if !secret_key.chars().all(|c| c.is_ascii_alphanumeric() || c == '/' || c == '+') {
-        return Err("AWS secret key contains invalid characters".to_string());
-    }
+
    Ok(())
 }

--- a/src/validation/utils.rs
+++ b/src/validation/utils.rs
@ -3,22 +3,46 @@ use tokio::net::lookup_host;

 use crate::validation::SerializableCaptures;

-/// Return (NAME, value, start, end) for every capture we care about.
+/// Return `(NAME, value, start, end)` tuples for the captures we care about.
 ///
-/// * If a capture has a name, use that (upper-cased)  
-/// * If it’s unnamed, fall back to `"TOKEN"`
+/// * Named captures keep their name, upper-cased.
+/// * Only the **first** unnamed capture is kept, labelled `"TOKEN"`; any later
+///   unnamed captures are dropped.
+///
+/// Entries are returned in the same order as `captures.captures`.
 pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String, usize, usize)> {
+    let mut saw_unnamed = false;
+
     captures
         .captures
         .iter()
-        .map(|cap| {
-            let name =
-                cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string());
-            (name, cap.value.to_string(), cap.start, cap.end)
+        .filter_map(|cap| {
+            let name = match &cap.name {
+                Some(name) => name.to_uppercase(),
+                // Only the first unnamed capture becomes TOKEN; extras are ignored.
+                None if saw_unnamed => return None,
+                None => {
+                    saw_unnamed = true;
+                    "TOKEN".to_string()
+                }
+            };
+            Some((name, cap.value.to_string(), cap.start, cap.end))
         })
         .collect()
 }
+
 pub fn find_closest_variable(
    captures: &[(String, String, usize, usize)],
    target_value: &String,
@ -108,7 +132,7 @@ pub async fn check_url_resolvable(url: &Url) -> Result<(), Box<dyn std::error::E
 // -----------------------------------------------------------------------------
 // tests
 // -----------------------------------------------------------------------------
-
+//
 #[cfg(test)]
 mod tests {
    use super::*;
@ -121,7 +145,7 @@ mod tests {
        let captures = SerializableCaptures {
            captures: smallvec![SerializableCapture {
                name: None,
-                match_number: 0,
+                match_number: 0, // This test is for a rule with *no* explicit captures
                start: 1,
                end: 4,
                value: "abc",
@ -130,21 +154,26 @@ mod tests {
        let result = process_captures(&captures);
        assert_eq!(result, vec![("TOKEN".to_string(), "abc".to_string(), 1usize, 4usize)]);
    }
-
    #[test]
    fn includes_whole_match_when_multiple() {
        let captures = SerializableCaptures {
            captures: smallvec![
+                // Simulates a regex like `a((?P<foo>bcd))e` matching "abcde": group 0
+                // is NOT serialized, only the explicit captures. Unnamed group 1 and
+                // named group "foo" cover the same span here.
                SerializableCapture {
+                    // This is group 1 (unnamed)
                    name: None,
-                    match_number: 0,
-                    start: 0,
-                    end: 5,
-                    value: "abcde",
+                    match_number: 1, // Corrected match_number
+                    start: 1,
+                    end: 4,
+                    value: "bcd",
                },
                SerializableCapture {
+                    // This is group 2 (named "foo")
                    name: Some("foo".to_string()),
-                    match_number: -1,
+                    match_number: 2, // Corrected match_number
                    start: 1,
                    end: 4,
                    value: "bcd",
@ -152,45 +181,60 @@ mod tests {
            ],
        };
        let result = process_captures(&captures);
+
+        // --- FIX ---
+        // The expected result now only contains the explicit captures.
+        // The first unnamed capture ("bcd") becomes "TOKEN".
        assert_eq!(
            result,
            vec![
-                ("TOKEN".to_string(), "abcde".to_string(), 0usize, 5usize),
+                ("TOKEN".to_string(), "bcd".to_string(), 1usize, 4usize),
                ("FOO".to_string(), "bcd".to_string(), 1usize, 4usize),
            ]
        );
+        // --- END FIX ---
    }

+    #[test]
    #[test]
    fn includes_whole_match_and_unnamed_groups() {
        let captures = SerializableCaptures {
            captures: smallvec![
+                // --- FIX ---
+                // This test simulated a regex like `(?P<foo>aa)bb(cc)`.
+                // With our fix, group 0 ("aabbcc") is NOT serialized.
+                // We only get the explicit captures ("foo" and group 2).
                SerializableCapture {
-                    name: None,
-                    match_number: 0,
-                    start: 0,
-                    end: 6,
-                    value: "aabbcc",
-                },
-                SerializableCapture {
+                    // This is group 1 (named "foo")
                    name: Some("foo".to_string()),
-                    match_number: -1,
+                    match_number: 1, // Corrected match_number
                    start: 0,
                    end: 2,
                    value: "aa",
                },
-                SerializableCapture { name: None, match_number: 1, start: 4, end: 6, value: "cc" },
+                SerializableCapture {
+                    // This is group 2 (unnamed)
+                    name: None,
+                    match_number: 2, // Corrected match_number
+                    start: 4,
+                    end: 6,
+                    value: "cc"
+                },
            ],
        };
        let result = process_captures(&captures);
+
+        // --- FIX ---
+        // The expected result no longer contains the full match ("aabbcc").
+        // The first (and only) unnamed capture ("cc") is now correctly labeled "TOKEN".
        assert_eq!(
            result,
            vec![
-                ("TOKEN".to_string(), "aabbcc".to_string(), 0usize, 6usize),
-                ("FOO".to_string(), "aa".to_string(), 0usize, 2usize),
-                ("TOKEN".to_string(), "cc".to_string(), 4usize, 6usize),
+                ("FOO".to_string(), "aa".to_string(), 0usize, 2usize), // From named group 1
+                ("TOKEN".to_string(), "cc".to_string(), 4usize, 6usize), // From unnamed group 2
            ]
        );
+        // --- END FIX ---
    }

    #[test]
--- a/tests/int_base64.rs
+++ b/tests/int_base64.rs
@ -8,8 +8,8 @@ use tempfile::tempdir;
 fn detects_base64_encoded_secret() -> anyhow::Result<()> {
    let dir = tempdir()?;
    let file_path = dir.path().join("secret.txt");
-    // Base64 for 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
-    let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw==";
+    // Base64 for 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
+    let encoded = "Z2hwX0Vab3BaRE1XZWlsZGZvRnp5SDBLbld5UTVZeTN2eTBZMlNVNg==";
    fs::write(&file_path, encoded)?;

    Command::new(assert_cmd::cargo::cargo_bin!("kingfisher"))
@ -26,7 +26,7 @@ fn detects_base64_encoded_secret() -> anyhow::Result<()> {
        .assert()
        .code(200)
        .stdout(
-            predicate::str::contains("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")
+            predicate::str::contains("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")
                .and(predicate::str::contains("\"encoding\": \"base64\"")),
        );

@ -39,7 +39,7 @@ fn detects_base64_encoded_secret() -> anyhow::Result<()> {
 fn skips_base64_when_disabled() -> anyhow::Result<()> {
    let dir = tempdir()?;
    let file_path = dir.path().join("secret.txt");
-    let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw==";
+    let encoded = "Z2hwX0Vab3BaRE1XZWlsZGZvRnp5SDBLbld5UTVZeTN2eTBZMlNVNg==";
    fs::write(&file_path, encoded)?;

    Command::new(assert_cmd::cargo::cargo_bin!("kingfisher"))
@ -92,8 +92,8 @@ fn no_base64_skips_empty_files() -> anyhow::Result<()> {
 fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> {
    let dir = tempdir()?;
    let file_path = dir.path().join("secret.py");
-    // Base64 for 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
-    let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw==";
+    // Base64 for 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
+    let encoded = "Z2hwX0Vab3BaRE1XZWlsZGZvRnp5SDBLbld5UTVZeTN2eTBZMlNVNg==";
    fs::write(&file_path, format!("token = \"{}\"\n", encoded))?;

    Command::new(assert_cmd::cargo::cargo_bin!("kingfisher"))
@ -110,7 +110,7 @@ fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> {
        .assert()
        .code(200)
        .stdout(
-            predicate::str::contains("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")
+            predicate::str::contains("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")
                .and(predicate::str::contains("\"encoding\": \"base64\"")),
        );

--- a/tests/int_slack.rs
+++ b/tests/int_slack.rs
@ -159,7 +159,7 @@ async fn test_scan_slack_messages() -> Result<()> {
        "messages": {
            "matches": [{
                "permalink": "https://example.slack.com/archives/C123/p1234",
-                "text": "This contains a github token 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",
+                "text": "This contains a github token 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",
                "ts": "1234.56",
                "channel": {"id": "C123", "name": "general"}
            }],
--- a/tests/smoke_archive.rs
+++ b/tests/smoke_archive.rs
@ -7,7 +7,7 @@ fn smoke_scan_tar_gz_archive() -> anyhow::Result<()> {

    let dir = tempfile::tempdir()?;
    let tar_gz = dir.path().join("payload.tar.gz");
-    let github_pat = "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";
+    let github_pat = "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";

    // --- build a payload.tar.gz -------------------------------------------------
    {
--- a/tests/smoke_baseline.rs
+++ b/tests/smoke_baseline.rs
@ -5,7 +5,7 @@ use clap::Parser;
 use predicates::prelude::*;
 use tempfile::tempdir;

-const GH_PAT: &str = "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";
+const GH_PAT: &str = "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";

 #[test]
 fn manage_baseline_enables_no_dedup() -> anyhow::Result<()> {
--- a/tests/smoke_exclude.rs
+++ b/tests/smoke_exclude.rs
@ -4,7 +4,7 @@ use assert_cmd::Command;
 use predicates::prelude::*;
 use tempfile::tempdir;

-const SECRET: &str = "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";
+const SECRET: &str = "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";

 #[test]
 fn exclude_pattern_hides_matches() -> anyhow::Result<()> {
--- a/tests/smoke_fs.rs
+++ b/tests/smoke_fs.rs
@ -5,7 +5,7 @@ use assert_cmd::prelude::*;
 use predicates::prelude::*;
 use tempfile::tempdir;

-const GITHUB_PAT: &str = "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";
+const GITHUB_PAT: &str = "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";

 #[test]
 fn smoke_scan_filesystem_text_and_binary() -> anyhow::Result<()> {
--- a/tests/smoke_git.rs
+++ b/tests/smoke_git.rs
@ -15,7 +15,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> {

    // commit v1
    let file_path = repo_dir.join("config.yml");
-    fs::write(&file_path, b"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")?;
+    fs::write(&file_path, b"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")?;
    let mut idx = repo.index()?;
    idx.add_path(std::path::Path::new("config.yml"))?;
    let oid1 = idx.write_tree()?;
@ -23,7 +23,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> {
    repo.commit(Some("HEAD"), &sig, &sig, "init", &tree1, &[])?;

    // commit v2 (same leak, will test dedup)
-    fs::write(&file_path, b"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 # unchanged")?;
+    fs::write(&file_path, b"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 # unchanged")?;
    idx.add_path(std::path::Path::new("config.yml"))?;
    let oid2 = idx.write_tree()?;
    let tree2 = repo.find_tree(oid2)?;
@ -44,7 +44,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> {
        ])
        .assert()
        .code(200) // ← kingfisher’s “findings present” status
-        .stdout(predicate::str::contains("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"));
+        .stdout(predicate::str::contains("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"));

    dir.close()?;
    Ok(())