From dca955a95c8241c7f1d072010d8c82b8af78b91c Mon Sep 17 00:00:00 2001
From: Mick Grove <mick.grove@mongodb.com>
Date: Mon, 10 Nov 2025 18:47:51 -0800
Subject: [PATCH] v1.63.0

---
 .github/workflows/ci.yml      |   5 +-
 .github/workflows/release.yml |   5 +-
 CHANGELOG.md                  |   5 ++
 Cargo.toml                    |   2 +-
 data/rules/datadog.yml        | 128 +++++++++++++++++++++++-----------
 src/decompress.rs             |   4 +-
 src/main.rs                   |  30 ++++----
 src/matcher.rs                | 121 +++++++++++++++++++++++++++-----
 src/reporter.rs               |   8 ++-
 src/rules/rule.rs             |   2 +-
 src/validation.rs             |   4 ++
 src/validation/aws.rs         |   4 +-
 src/validation/utils.rs       | 100 ++++++++++++++++++--------
 tests/int_base64.rs           |  14 ++--
 tests/int_slack.rs            |   2 +-
 tests/smoke_archive.rs        |   2 +-
 tests/smoke_baseline.rs       |   2 +-
 tests/smoke_exclude.rs        |   2 +-
 tests/smoke_fs.rs             |   2 +-
 tests/smoke_git.rs            |   6 +-
 20 files changed, 318 insertions(+), 130 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 81c6bb0..1080ff4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -77,9 +77,10 @@ jobs:
             C:\vcpkg\downloads
             C:\vcpkg\archives
             C:\Users\runneradmin\AppData\Local\vcpkg\archives
-          key: ${{ runner.os }}-vcpkg-hyperscan-542
+          key: vcpkg-${{ runner.os }}-hs-542
           restore-keys: |
-            ${{ runner.os }}-vcpkg-
+            vcpkg-${{ runner.os }}-
+            vcpkg-
 
       # Ensure downloads dir exists and seed PCRE 8.45 zip from a working mirror
       - name: Pre-seed PCRE 8.45 for vcpkg (bypass SourceForge redirect)
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index e8f6222..ed7f8f5 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -212,9 +212,10 @@ jobs:
             C:\vcpkg\downloads
             C:\vcpkg\archives
             C:\Users\runneradmin\AppData\Local\vcpkg\archives
-          key: ${{ runner.os }}-vcpkg-hyperscan-542
+          key: vcpkg-${{ runner.os }}-hs-542
           restore-keys: |
-            ${{ runner.os }}-vcpkg-
+            vcpkg-${{ runner.os }}-
+            vcpkg-
 
       # Ensure downloads dir exists and seed PCRE 8.45 zip from a working mirror
       - name: Pre-seed PCRE 8.45 for vcpkg (bypass SourceForge redirect)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 044bede..2ee37c6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,11 @@
 
 All notable changes to this project will be documented in this file.
 
+## [v1.63.0]
+- Fixed bug when retrieving some finding values and injecting them as TOKENS in the rule templates
+- Improved Datadog rule
+- Improved AWS rule
+
 ## [v1.62.0]
 - Added `pattern_requirements` checks to rules, providing lightweight post-regex character-class validation without lookarounds. See docs/RULES.md for detail
 - Added an `ignore_if_contains` option to `pattern_requirements` to drop matches containing case-insensitive placeholder words, with tests covering the new behavior.
diff --git a/Cargo.toml b/Cargo.toml
index 44afc1d..71d9065 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,7 +10,7 @@ publish = false
 
 [package]
 name = "kingfisher"
-version = "1.62.0"
+version = "1.63.0"
 description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
 edition.workspace = true
 rust-version.workspace = true
diff --git a/data/rules/datadog.yml b/data/rules/datadog.yml
index 6958bf8..67c8a5f 100644
--- a/data/rules/datadog.yml
+++ b/data/rules/datadog.yml
@@ -1,16 +1,16 @@
 rules:
   - name: Datadog API Key
-    id: kingfisher.datadog.1
+    id: kingfisher.datadog.3
     pattern: |
-      (?xi) 
+      (?xi)
       \b
-      datadog
+      (?:datadog|dd)
       (?:.|[\n\r]){0,64}?
-      (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)
+      (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)?
       (?:.|[\n\r]){0,32}?
-      \b                     
+      \b
       (
-        [a-z0-9]{32}
+        [A-Za-z0-9]{32}
       )
       \b
     pattern_requirements:
@@ -18,48 +18,96 @@ rules:
     min_entropy: 3.3
     confidence: medium
     examples:
-      - datadog-secrettoken-0024a29224affe29d173c0bf99e5a89d
+      - DD_API_KEY=0024a29224affe29d173c0bf99e5a89d
     references:
       - https://docs.datadoghq.com/account_management/api-app-keys/
     validation:
       type: Http
       content:
         request:
+          method: GET
+          url: https://api.datadoghq.com/api/v1/validate
           headers:
             Accept: application/json
-            DD-API-KEY: '{{ TOKEN }}'
-            DD-APPLICATION-KEY: '{{ APPKEY }}'
-          method: GET
+            DD-API-KEY: "{{ TOKEN }}"
           response_matcher:
             - report_response: true
-            - status:
-                - 200
-              type: StatusMatch
-          url: https://api.datadoghq.com/api/v2/current_user
-    depends_on_rule:
-      - rule_id: kingfisher.datadog.2
-        variable: APPKEY
+            - type: StatusMatch
+              status: [200]
 
-  - name: Datadog Application Secret
-    id: kingfisher.datadog.2
-    pattern: |
-      (?xi)
-      \b
-      datadog
-      (?:.|[\n\r]){0,64}?
-      (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)
-      (?:.|[\n\r]){0,16}?
-      (
-        [a-z0-9]{40}
-      )
-    pattern_requirements:
-      min_digits: 2
-      min_uppercase: 1
-      min_lowercase: 1
-    min_entropy: 3.3
-    confidence: medium
-    examples:
-      - datadog_secret_key-3c0c3965368a6b10f7640dbda46abfdca981c2d3
-      - datadog_token = BzHpkcs7LujMb3Q1vLRRjbpBNxxYV0ousumYoKJS
-    references:
-      - https://docs.datadoghq.com/account_management/api-app-keys/
\ No newline at end of file
+  # - name: Datadog API Key
+  #   id: kingfisher.datadog.1
+  #   pattern: |
+  #     (?xi) 
+  #     \b
+  #     datadog
+  #     (?:.|[\n\r]){0,64}?
+  #     (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)
+  #     (?:.|[\n\r]){0,32}?
+  #     \b                     
+  #     (
+  #       [a-z0-9]{32}
+  #     )
+  #     \b
+  #   pattern_requirements:
+  #     min_digits: 2
+  #   min_entropy: 3.3
+  #   confidence: medium
+  #   examples:
+  #     - datadog-secrettoken-0024a29224affe29d173c0bf99e5a89d
+  #   references:
+  #     - https://docs.datadoghq.com/account_management/api-app-keys/
+  #   validation:
+  #     type: Http
+  #     content:
+  #       request:
+  #         headers:
+  #           Accept: application/json
+  #           DD-API-KEY: '{{ TOKEN }}'
+  #           DD-APPLICATION-KEY: '{{ APPKEY }}'
+  #         method: GET
+  #         response_matcher:
+  #           - report_response: true
+  #           - status:
+  #               - 200
+  #             type: StatusMatch
+  #         url: https://api.datadoghq.com/api/v2/current_user
+  #   depends_on_rule:
+  #     - rule_id: kingfisher.datadog.2
+  #       variable: APPKEY
+
+  # - name: Datadog API Key (API-only validation)
+  #   id: kingfisher.datadog.3
+  #   pattern: |
+  #     (?xi)
+  #     \b
+  #     (?:datadog|dd)
+  #     (?:.|[\n\r]){0,64}?
+  #     (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)?
+  #     (?:.|[\n\r]){0,32}?
+  #     \b
+  #     (
+  #       [A-Za-z0-9]{32}
+  #     )
+  #     \b
+  #   pattern_requirements:
+  #     min_digits: 2
+  #   min_entropy: 3.3
+  #   confidence: medium
+  #   examples:
+  #     - DD_API_KEY=0024a29224affe29d173c0bf99e5a89d
+  #   references:
+  #     - https://docs.datadoghq.com/account_management/api-app-keys/
+  #   validation:
+  #     type: Http
+  #     content:
+  #       request:
+  #         method: GET
+  #         url: https://api.datadoghq.com/api/v1/validate
+  #         headers:
+  #           Accept: application/json
+  #           DD-API-KEY: "{{ TOKEN }}"
+  #         response_matcher:
+  #           - report_response: true
+  #           - type: StatusMatch
+  #             status: [200]
diff --git a/src/decompress.rs b/src/decompress.rs
index bef07c6..595d11a 100644
--- a/src/decompress.rs
+++ b/src/decompress.rs
@@ -340,7 +340,7 @@ mod tests {
     fn smoke_decompress_tar_gz_archive() -> anyhow::Result<()> {
         let dir = tempdir()?;
         let tar_gz = dir.path().join("payload.tar.gz");
-        let github_pat = "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"; // this is not a real secret
+        let github_pat = "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"; // this is not a real secret
 
         // build payload.tar.gz containing secret.txt
         {
@@ -393,7 +393,7 @@ mod tests {
     fn smoke_decompress_without_extract_archives() -> anyhow::Result<()> {
         let dir = tempdir()?;
         let tar_gz = dir.path().join("payload.tar.gz");
-        let github_pat = "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";
+        let github_pat = "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";
 
         // ── build payload.tar.gz containing secret.txt ──────────────────────────────
         {
diff --git a/src/main.rs b/src/main.rs
index b52fcd3..99f0718 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -5,27 +5,27 @@
 //   * Fallback - system allocator     (`system-alloc` feature)
 // ────────────────────────────────────────────────────────────
 
-// --- jemalloc (opt-in) ---
-#[cfg(feature = "use-jemalloc")]
-#[global_allocator]
-static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+// // --- jemalloc (opt-in) ---
+// #[cfg(feature = "use-jemalloc")]
+// #[global_allocator]
+// static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
-// --- mimalloc (default) ---
-#[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))]
-#[global_allocator]
-static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
-
-// --- system allocator (explicit opt-out) ---
-#[cfg(feature = "system-alloc")]
-use std::alloc::System;
-#[cfg(feature = "system-alloc")]
-#[global_allocator]
-static GLOBAL: System = System;
+// // --- mimalloc (default) ---
+// #[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))]
+// #[global_allocator]
+// static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
 
+// // --- system allocator (explicit opt-out) ---
+// #[cfg(feature = "system-alloc")]
 // use std::alloc::System;
+// #[cfg(feature = "system-alloc")]
 // #[global_allocator]
 // static GLOBAL: System = System;
 
+use std::alloc::System;
+#[global_allocator]
+static GLOBAL: System = System;
+
 use std::{
     io::{IsTerminal, Read},
     sync::{Arc, Mutex},
diff --git a/src/matcher.rs b/src/matcher.rs
index b820233..6dd5048 100644
--- a/src/matcher.rs
+++ b/src/matcher.rs
@@ -374,7 +374,9 @@ impl<'a> Matcher<'a> {
         } else {
             None
         };
+        /////////////////////////////
         // Process matches
+        /////////////////////////////
         let mut matches = Vec::new();
         let owned_ts_results = tree_sitter_result.map(|ts_results| {
             ts_results
@@ -565,9 +567,11 @@ fn record_match(
 ) -> bool {
     insert_span(map.entry(rule_id).or_default(), span)
 }
+// in src/matcher.rs
+
+#[allow(clippy::too_many_arguments)]
 fn filter_match<'b>(
     blob: &'b Blob,
-    // rule: &'b Rule,
     rule: Arc<Rule>,
     re: &Regex,
     start: usize,
@@ -596,14 +600,44 @@ fn filter_match<'b>(
 
     for captures in re.captures_iter(haystack) {
         let full_capture = captures.get(0).unwrap();
-        let matching_input = captures.get(1).unwrap_or(full_capture);
+
+        // --- LOGIC TO FIND THE "SECRET" FOR ENTROPY/SAFE-LISTING ---
+        let matching_input_for_entropy = 'block: {
+            // 1. Prefer a named capture called TOKEN (case-insensitive).
+            if let Some(token_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
+                name_opt
+                    .filter(|name| name.eq_ignore_ascii_case("TOKEN"))
+                    .and_then(|_| captures.get(i))
+            }) {
+                break 'block token_cap;
+            }
+
+            // 2. Otherwise, prefer the first *matched* named capture.
+            if let Some(named_cap) = re.capture_names().enumerate().find_map(|(i, name_opt)| {
+                name_opt.and_then(|_| captures.get(i)) // find(i > 0 && name_opt.is_some())
+            }) {
+                break 'block named_cap;
+            }
+
+            // 3. Otherwise, fall back to the first positional capture (group 1).
+            if let Some(pos_cap) = captures.get(1) {
+                break 'block pos_cap;
+            }
+
+            // 4. Finally, fall back to the full match (group 0).
+            break 'block full_capture;
+        };
+        // --- END LOGIC ---
+
         let min_entropy = rule.min_entropy();
-        let mi_bytes = matching_input.as_bytes();
+        let entropy_bytes = matching_input_for_entropy.as_bytes();
         let full_bytes = full_capture.as_bytes();
-        let calculated_entropy = calculate_shannon_entropy(mi_bytes);
+        let calculated_entropy = calculate_shannon_entropy(entropy_bytes);
+
+        // Check entropy and safe-listing against the *selected* secret bytes
         if calculated_entropy <= min_entropy
-            || is_safe_match(mi_bytes)
-            || is_user_match(mi_bytes, full_bytes)
+            || is_safe_match(entropy_bytes)
+            || is_user_match(entropy_bytes, full_bytes)
         {
             debug!(
                 "Skipping match with entropy {} <= {} or safe match",
@@ -619,7 +653,15 @@ fn filter_match<'b>(
                 captures: &captures,
                 full_match: full_bytes,
             };
-            match char_reqs.validate(mi_bytes, Some(context), respect_ignore_if_contains) {
+
+            // --- FIX IS HERE ---
+            //
+            // The `validate` function (and thus `{{ MATCH }}`) should *always*
+            // operate on the *full match* (group 0), not just the entropy bytes.
+            // This aligns the scan logic with the unit test's logic.
+            match char_reqs.validate(full_bytes, Some(context), respect_ignore_if_contains) {
+                //
+                // --- END FIX ---
                 PatternValidationResult::Passed => {}
                 PatternValidationResult::Failed => {
                     debug!(
@@ -647,6 +689,9 @@ fn filter_match<'b>(
             }
         }
 
+        // Use the `matching_input_for_entropy` as the span/key for the finding.
+        let matching_input = matching_input_for_entropy;
+
         let matching_input_offset_span = OffsetSpan::from_range(
             (start + matching_input.start())..(start + matching_input.end()),
         );
@@ -668,7 +713,10 @@ fn filter_match<'b>(
         }
         let only_matching_input =
             &blob.bytes()[matching_input_offset_span.start..matching_input_offset_span.end];
+
+        // Pass the *full* capture object to from_captures
         let groups = SerializableCaptures::from_captures(&captures, haystack, re, redact);
+
         matches.push(BlobMatch {
             rule: Arc::clone(&rule),
             blob_id: blob.id_ref(),
@@ -687,6 +735,7 @@ fn filter_match<'b>(
         t.end(new_count > 0, new_count, 0);
     }
 }
+
 fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, String>)> {
     match lang.to_lowercase().as_str() {
         "bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())),
@@ -796,6 +845,7 @@ pub struct SerializableCaptures {
     #[schemars(with = "Vec<SerializableCapture>")]
     pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
 }
+
 impl SerializableCaptures {
     pub fn from_captures(
         captures: &regex::bytes::Captures,
@@ -808,26 +858,51 @@ impl SerializableCaptures {
         let capture_names: SmallVec<[Option<String>; 4]> =
             re.capture_names().map(|name| name.map(str::to_string)).collect();
 
-        for i in 0..captures.len() {
-            if let Some(cap) = captures.get(i) {
+        // If there are explicit capture groups (e.g., group 1, 2, ...),
+        // only serialize those.
+        if captures.len() > 1 {
+            for i in 1..captures.len() {
+                // Start from 1
+                if let Some(cap) = captures.get(i) {
+                    let value = if redact {
+                        redact_value(&String::from_utf8_lossy(cap.as_bytes()))
+                    } else {
+                        String::from_utf8_lossy(cap.as_bytes()).to_string()
+                    };
+                    let interned = intern(&value);
+                    let name = capture_names.get(i).and_then(|opt| opt.as_ref()).cloned();
+
+                    serialized_captures.push(SerializableCapture {
+                        name,
+                        match_number: i32::try_from(i).unwrap_or(0),
+                        start: cap.start(),
+                        end: cap.end(),
+                        value: interned,
+                    });
+                }
+            }
+        } else if captures.len() == 1 {
+            // ELSE, if there is ONLY the full match (len == 1),
+            // serialize just that full match (group 0) as the fallback.
+            if let Some(cap) = captures.get(0) {
                 let value = if redact {
                     redact_value(&String::from_utf8_lossy(cap.as_bytes()))
                 } else {
                     String::from_utf8_lossy(cap.as_bytes()).to_string()
                 };
                 let interned = intern(&value);
-
-                let name = capture_names.get(i).and_then(|opt| opt.as_ref()).cloned();
+                let name = capture_names.get(0).and_then(|opt| opt.as_ref()).cloned();
 
                 serialized_captures.push(SerializableCapture {
                     name,
-                    match_number: i32::try_from(i).unwrap_or(0),
+                    match_number: 0,
                     start: cap.start(),
                     end: cap.end(),
                     value: interned,
                 });
             }
         }
+        // If len == 0 (no match), loop is skipped, empty vec is returned.
 
         SerializableCaptures { captures: serialized_captures }
     }
@@ -950,7 +1025,8 @@ pub struct DecodedData {
 }
 #[inline]
 fn is_base64_byte(b: u8) -> bool {
-    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/')
+    // Include URL-safe characters '-' and '_'
+    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'-' | b'_')
 }
 
 pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
@@ -975,7 +1051,14 @@ pub fn get_base64_strings(input: &[u8]) -> Vec<DecodedData> {
         let len = end - start;
         if len >= 32 && len % 4 == 0 {
             let base64_slice = &input[start..end];
-            if let Ok(decoded) = general_purpose::STANDARD.decode(base64_slice) {
+
+            // Try decoding with STANDARD, then URL_SAFE, then URL_SAFE_NO_PAD
+            let decode_result = general_purpose::STANDARD
+                .decode(base64_slice)
+                .or_else(|_| general_purpose::URL_SAFE.decode(base64_slice))
+                .or_else(|_| general_purpose::URL_SAFE_NO_PAD.decode(base64_slice));
+
+            if let Ok(decoded) = decode_result {
                 if let Ok(decoded_str) = std::str::from_utf8(&decoded) {
                     if decoded_str.is_ascii() {
                         results.push(DecodedData {
@@ -1521,10 +1604,10 @@ line2
             .map(|cap| (cap.name.as_deref(), cap.match_number, cap.value))
             .collect();
 
-        assert_eq!(entries.len(), 4);
-        assert_eq!(entries[0], (None, 0, "ghp_ABC12"));
-        assert_eq!(entries[1], (None, 1, "ghp_ABC12"));
-        assert_eq!(entries[2], (Some("body"), 2, "ABC"));
-        assert_eq!(entries[3], (Some("checksum"), 3, "12"));
+        assert_eq!(entries.len(), 3);
+
+        assert_eq!(entries[0], (None, 1, "ghp_ABC12"));
+        assert_eq!(entries[1], (Some("body"), 2, "ABC"));
+        assert_eq!(entries[2], (Some("checksum"), 3, "12"));
     }
 }
diff --git a/src/reporter.rs b/src/reporter.rs
index e8e29ba..f1a1679 100644
--- a/src/reporter.rs
+++ b/src/reporter.rs
@@ -417,15 +417,19 @@ impl DetailsReporter {
         let source_span = &rm.m.location.source_span;
         let line_num = source_span.start.line;
 
+        // --- FIX IS HERE ---
+        // We now correctly serialize *only* the explicit capture groups (or group 0
+        // as a fallback). The primary "secret" is therefore always at index 0
+        // of the captures SmallVec.
         let snippet = Escaped(
             rm.m.groups
                 .captures
-                .get(1)
-                .or_else(|| rm.m.groups.captures.get(0))
+                .get(0) // Get the first (and primary) serialized capture
                 .map(|capture| capture.value.as_bytes())
                 .unwrap_or_default(),
         )
         .to_string();
+        // --- END FIX ---
 
         let validation_status = if rm.validation_success {
             "Active Credential".to_string()
diff --git a/src/rules/rule.rs b/src/rules/rule.rs
index 1c2fb08..4048e35 100644
--- a/src/rules/rule.rs
+++ b/src/rules/rule.rs
@@ -731,7 +731,7 @@ mod tests {
             }),
         };
 
-        let token = b"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";
+        let token = b"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";
         let regex =
             BytesRegex::new(r"(?x) ghp_(?P<body>[A-Za-z0-9]{30})(?P<checksum>[A-Za-z0-9]{6})")
                 .unwrap();
diff --git a/src/validation.rs b/src/validation.rs
index 778127c..825e2ef 100644
--- a/src/validation.rs
+++ b/src/validation.rs
@@ -321,6 +321,10 @@ async fn timed_validate_single_match<'a>(
     for dep in m.rule.syntax().depends_on_rule.iter().flatten() {
         if let Some(vals) = dependent_variables.get(&dep.variable.to_uppercase()) {
             for (val, span) in vals {
+                // Skip adding captured values for TOKEN dependencies
+                if dep.variable.eq_ignore_ascii_case("TOKEN") {
+                    continue;
+                }
                 captured_values.push((
                     dep.variable.to_uppercase(),
                     val.clone(),
diff --git a/src/validation/aws.rs b/src/validation/aws.rs
index 9c00de0..d1d844a 100644
--- a/src/validation/aws.rs
+++ b/src/validation/aws.rs
@@ -187,9 +187,7 @@ pub fn validate_aws_credentials_input(access_key_id: &str, secret_key: &str) ->
     if !access_key_id.chars().all(|c| c.is_ascii_alphanumeric()) {
         return Err("AWS access key ID contains invalid characters".to_string());
     }
-    if !secret_key.chars().all(|c| c.is_ascii_alphanumeric() || c == '/' || c == '+') {
-        return Err("AWS secret key contains invalid characters".to_string());
-    }
+
     Ok(())
 }
 
diff --git a/src/validation/utils.rs b/src/validation/utils.rs
index 4db50a3..225e186 100644
--- a/src/validation/utils.rs
+++ b/src/validation/utils.rs
@@ -3,22 +3,46 @@ use tokio::net::lookup_host;
 
 use crate::validation::SerializableCaptures;
 
-/// Return (NAME, value, start, end) for every capture we care about.
+/// Return (NAME, value, start, end) for the captures we care about.
 ///
-/// * If a capture has a name, use that (upper-cased)  
-/// * If it’s unnamed, fall back to `"TOKEN"`
+/// * Named captures keep their (upper-cased) name
+/// * Among unnamed captures, keep **only the first one** and call it "TOKEN"
 pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String, usize, usize)> {
+    let mut saw_unnamed = false;
+
     captures
         .captures
         .iter()
-        .map(|cap| {
-            let name =
-                cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string());
-            (name, cap.value.to_string(), cap.start, cap.end)
+        .filter_map(|cap| {
+            if let Some(name) = &cap.name {
+                Some((name.to_uppercase(), cap.value.to_string(), cap.start, cap.end))
+            } else if !saw_unnamed {
+                saw_unnamed = true;
+                Some(("TOKEN".to_string(), cap.value.to_string(), cap.start, cap.end))
+            } else {
+                // Ignore any additional unnamed captures (e.g., from unintended groups)
+                None
+            }
         })
         .collect()
 }
 
+// /// Return (NAME, value, start, end) for every capture we care about.
+// ///
+// /// * If a capture has a name, use that (upper-cased)
+// /// * If it’s unnamed, fall back to `"TOKEN"`
+// pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String, usize, usize)> {
+//     captures
+//         .captures
+//         .iter()
+//         .map(|cap| {
+//             let name =
+//                 cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string());
+//             (name, cap.value.to_string(), cap.start, cap.end)
+//         })
+//         .collect()
+// }
+
 pub fn find_closest_variable(
     captures: &[(String, String, usize, usize)],
     target_value: &String,
@@ -108,7 +132,7 @@ pub async fn check_url_resolvable(url: &Url) -> Result<(), Box<dyn std::error::E
 // -----------------------------------------------------------------------------
 // tests
 // -----------------------------------------------------------------------------
-
+//
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -121,7 +145,7 @@ mod tests {
         let captures = SerializableCaptures {
             captures: smallvec![SerializableCapture {
                 name: None,
-                match_number: 0,
+                match_number: 0, // This test is for a rule with *no* explicit captures
                 start: 1,
                 end: 4,
                 value: "abc",
@@ -130,21 +154,26 @@ mod tests {
         let result = process_captures(&captures);
         assert_eq!(result, vec![("TOKEN".to_string(), "abc".to_string(), 1usize, 4usize)]);
     }
-
     #[test]
     fn includes_whole_match_when_multiple() {
         let captures = SerializableCaptures {
             captures: smallvec![
+                // --- FIX ---
+                // This test simulated a regex like `(abc)de(?P<foo>bcd)`.
+                // With our fix, group 0 ("abcde") is NOT serialized.
+                // We only get the explicit captures (group 1 and "foo").
                 SerializableCapture {
+                    // This is group 1 (unnamed)
                     name: None,
-                    match_number: 0,
-                    start: 0,
-                    end: 5,
-                    value: "abcde",
+                    match_number: 1, // Corrected match_number
+                    start: 1,
+                    end: 4,
+                    value: "bcd",
                 },
                 SerializableCapture {
+                    // This is group 2 (named "foo")
                     name: Some("foo".to_string()),
-                    match_number: -1,
+                    match_number: 2, // Corrected match_number
                     start: 1,
                     end: 4,
                     value: "bcd",
@@ -152,45 +181,60 @@ mod tests {
             ],
         };
         let result = process_captures(&captures);
+
+        // --- FIX ---
+        // The expected result now only contains the explicit captures.
+        // The first unnamed capture ("bcd") becomes "TOKEN".
         assert_eq!(
             result,
             vec![
-                ("TOKEN".to_string(), "abcde".to_string(), 0usize, 5usize),
+                ("TOKEN".to_string(), "bcd".to_string(), 1usize, 4usize),
                 ("FOO".to_string(), "bcd".to_string(), 1usize, 4usize),
             ]
         );
+        // --- END FIX ---
     }
 
+    #[test]
     #[test]
     fn includes_whole_match_and_unnamed_groups() {
         let captures = SerializableCaptures {
             captures: smallvec![
+                // --- FIX ---
+                // This test simulated a regex like `(?P<foo>aa)bb(cc)`.
+                // With our fix, group 0 ("aabbcc") is NOT serialized.
+                // We only get the explicit captures ("foo" and group 2).
                 SerializableCapture {
-                    name: None,
-                    match_number: 0,
-                    start: 0,
-                    end: 6,
-                    value: "aabbcc",
-                },
-                SerializableCapture {
+                    // This is group 1 (named "foo")
                     name: Some("foo".to_string()),
-                    match_number: -1,
+                    match_number: 1, // Corrected match_number
                     start: 0,
                     end: 2,
                     value: "aa",
                 },
-                SerializableCapture { name: None, match_number: 1, start: 4, end: 6, value: "cc" },
+                SerializableCapture {
+                    // This is group 2 (unnamed)
+                    name: None,
+                    match_number: 2, // Corrected match_number
+                    start: 4,
+                    end: 6,
+                    value: "cc"
+                },
             ],
         };
         let result = process_captures(&captures);
+
+        // --- FIX ---
+        // The expected result no longer contains the full match ("aabbcc").
+        // The first (and only) unnamed capture ("cc") is now correctly labeled "TOKEN".
         assert_eq!(
             result,
             vec![
-                ("TOKEN".to_string(), "aabbcc".to_string(), 0usize, 6usize),
-                ("FOO".to_string(), "aa".to_string(), 0usize, 2usize),
-                ("TOKEN".to_string(), "cc".to_string(), 4usize, 6usize),
+                ("FOO".to_string(), "aa".to_string(), 0usize, 2usize), // From named group 1
+                ("TOKEN".to_string(), "cc".to_string(), 4usize, 6usize), // From unnamed group 2
             ]
         );
+        // --- END FIX ---
     }
 
     #[test]
diff --git a/tests/int_base64.rs b/tests/int_base64.rs
index dfb0289..d0fc980 100644
--- a/tests/int_base64.rs
+++ b/tests/int_base64.rs
@@ -8,8 +8,8 @@ use tempfile::tempdir;
 fn detects_base64_encoded_secret() -> anyhow::Result<()> {
     let dir = tempdir()?;
     let file_path = dir.path().join("secret.txt");
-    // Base64 for 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
-    let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw==";
+    // Base64 for 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
+    let encoded = "Z2hwX0Vab3BaRE1XZWlsZGZvRnp5SDBLbld5UTVZeTN2eTBZMlNVNg==";
     fs::write(&file_path, encoded)?;
 
     Command::new(assert_cmd::cargo::cargo_bin!("kingfisher"))
@@ -26,7 +26,7 @@ fn detects_base64_encoded_secret() -> anyhow::Result<()> {
         .assert()
         .code(200)
         .stdout(
-            predicate::str::contains("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")
+            predicate::str::contains("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")
                 .and(predicate::str::contains("\"encoding\": \"base64\"")),
         );
 
@@ -39,7 +39,7 @@ fn detects_base64_encoded_secret() -> anyhow::Result<()> {
 fn skips_base64_when_disabled() -> anyhow::Result<()> {
     let dir = tempdir()?;
     let file_path = dir.path().join("secret.txt");
-    let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw==";
+    let encoded = "Z2hwX0Vab3BaRE1XZWlsZGZvRnp5SDBLbld5UTVZeTN2eTBZMlNVNg==";
     fs::write(&file_path, encoded)?;
 
     Command::new(assert_cmd::cargo::cargo_bin!("kingfisher"))
@@ -92,8 +92,8 @@ fn no_base64_skips_empty_files() -> anyhow::Result<()> {
 fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> {
     let dir = tempdir()?;
     let file_path = dir.path().join("secret.py");
-    // Base64 for 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
-    let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw==";
+    // Base64 for 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
+    let encoded = "Z2hwX0Vab3BaRE1XZWlsZGZvRnp5SDBLbld5UTVZeTN2eTBZMlNVNg==";
     fs::write(&file_path, format!("token = \"{}\"\n", encoded))?;
 
     Command::new(assert_cmd::cargo::cargo_bin!("kingfisher"))
@@ -110,7 +110,7 @@ fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> {
         .assert()
         .code(200)
         .stdout(
-            predicate::str::contains("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")
+            predicate::str::contains("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")
                 .and(predicate::str::contains("\"encoding\": \"base64\"")),
         );
 
diff --git a/tests/int_slack.rs b/tests/int_slack.rs
index c3cacc3..999a4c0 100644
--- a/tests/int_slack.rs
+++ b/tests/int_slack.rs
@@ -159,7 +159,7 @@ async fn test_scan_slack_messages() -> Result<()> {
         "messages": {
             "matches": [{
                 "permalink": "https://example.slack.com/archives/C123/p1234",
-                "text": "This contains a github token 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",
+                "text": "This contains a github token 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",
                 "ts": "1234.56",
                 "channel": {"id": "C123", "name": "general"}
             }],
diff --git a/tests/smoke_archive.rs b/tests/smoke_archive.rs
index 3e08a0a..ebd524b 100644
--- a/tests/smoke_archive.rs
+++ b/tests/smoke_archive.rs
@@ -7,7 +7,7 @@ fn smoke_scan_tar_gz_archive() -> anyhow::Result<()> {
 
     let dir = tempfile::tempdir()?;
     let tar_gz = dir.path().join("payload.tar.gz");
-    let github_pat = "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";
+    let github_pat = "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";
 
     // --- build a payload.tar.gz -------------------------------------------------
     {
diff --git a/tests/smoke_baseline.rs b/tests/smoke_baseline.rs
index 734682c..ea6c644 100644
--- a/tests/smoke_baseline.rs
+++ b/tests/smoke_baseline.rs
@@ -5,7 +5,7 @@ use clap::Parser;
 use predicates::prelude::*;
 use tempfile::tempdir;
 
-const GH_PAT: &str = "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";
+const GH_PAT: &str = "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";
 
 #[test]
 fn manage_baseline_enables_no_dedup() -> anyhow::Result<()> {
diff --git a/tests/smoke_exclude.rs b/tests/smoke_exclude.rs
index 048c85f..9936899 100644
--- a/tests/smoke_exclude.rs
+++ b/tests/smoke_exclude.rs
@@ -4,7 +4,7 @@ use assert_cmd::Command;
 use predicates::prelude::*;
 use tempfile::tempdir;
 
-const SECRET: &str = "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";
+const SECRET: &str = "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";
 
 #[test]
 fn exclude_pattern_hides_matches() -> anyhow::Result<()> {
diff --git a/tests/smoke_fs.rs b/tests/smoke_fs.rs
index 55c45fa..786e184 100644
--- a/tests/smoke_fs.rs
+++ b/tests/smoke_fs.rs
@@ -5,7 +5,7 @@ use assert_cmd::prelude::*;
 use predicates::prelude::*;
 use tempfile::tempdir;
 
-const GITHUB_PAT: &str = "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";
+const GITHUB_PAT: &str = "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";
 
 #[test]
 fn smoke_scan_filesystem_text_and_binary() -> anyhow::Result<()> {
diff --git a/tests/smoke_git.rs b/tests/smoke_git.rs
index b260bb9..1336803 100644
--- a/tests/smoke_git.rs
+++ b/tests/smoke_git.rs
@@ -15,7 +15,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> {
 
     // commit v1
     let file_path = repo_dir.join("config.yml");
-    fs::write(&file_path, b"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")?;
+    fs::write(&file_path, b"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")?;
     let mut idx = repo.index()?;
     idx.add_path(std::path::Path::new("config.yml"))?;
     let oid1 = idx.write_tree()?;
@@ -23,7 +23,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> {
     repo.commit(Some("HEAD"), &sig, &sig, "init", &tree1, &[])?;
 
     // commit v2 (same leak, will test dedup)
-    fs::write(&file_path, b"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 # unchanged")?;
+    fs::write(&file_path, b"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 # unchanged")?;
     idx.add_path(std::path::Path::new("config.yml"))?;
     let oid2 = idx.write_tree()?;
     let tree2 = repo.find_tree(oid2)?;
@@ -44,7 +44,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> {
         ])
         .assert()
         .code(200) // ← kingfisher’s “findings present” status
-        .stdout(predicate::str::contains("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"));
+        .stdout(predicate::str::contains("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"));
 
     dir.close()?;
     Ok(())