Merge pull request #11 from mongodb/development

Preparing for v1.14.0. This PR prepares for v1.14.0 by changing response_matcher to an optional field at the type level (defaulting to absent) so that a missing value can be detected, enforcing its presence when rules are loaded, and updating related tests and rule data. In effect response_matcher is now required: added a validation step that errors on a missing or empty response_matcher in any rule with HTTP validation. Also reformatted tests, scanner code, and rule data, bumped the version, and updated the changelog.
2025-06-25 23:53:59 -07:00 · 2025-06-25 23:53:59 -07:00 · ff0cfb1012
commit ff0cfb1012
parent d1834e976e 18e0b3c9b4
24 changed files with 191 additions and 143 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,10 @@

 All notable changes to this project will be documented in this file.

+## [1.14.0]
+- Fixed several malformed rules
+- Rule loading now fails if the HTTP validation section of a rule has a missing or empty `response_matcher`
+
 ## [1.13.0]
 - Added new rules for Planetscale, Postman, Openweather, opsgenie, pagerduty, pastebin, paypal, netlify, netrc, newrelic, ngrok, npm, nuget, mandrill, mapbox, microsoft teams, stripe, linkedin, mailchimp, mailgun, linear, line, huggingface, ibm cloud, intercom, ipstack, heroku, gradle, grafana
 - Added `--rule-stats` command-line flag that will display rule performance statistics during a scan. Useful when creating or debugging rules
--- a/Cargo.toml
+++ b/Cargo.toml
@ -10,7 +10,7 @@ publish = false

 [package]
 name = "kingfisher"
-version = "1.13.0"
+version = "1.14.0"
 edition.workspace = true
 rust-version.workspace = true
 license.workspace = true
--- a/data/rules/digitalocean.yml
+++ b/data/rules/digitalocean.yml
@ -14,9 +14,7 @@ rules:
    examples:
      - dop_v1_1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef
      - 'token         = "dop_v1_ef0e04edc13918192246e0c90f0735c7f4db7a5a036a857e48d6cc98f1c9576b"'
-    categories:
-      - api
-      - secret
+
    validation:
      type: Http
      content:
@ -60,9 +58,9 @@ rules:
              "grant_type": "refresh_token",
              "refresh_token": "{{ TOKEN }}"
            }
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch          
-            status: 
-              - 200
-          - type: JsonValid          
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status:
+                - 200
+            - type: JsonValid
--- a/data/rules/doppler.yml
+++ b/data/rules/doppler.yml
@ -23,11 +23,11 @@ rules:
          headers:
            Authorization: Bearer {{ TOKEN }}
            Accept: application/json
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status:
-              - 200
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status:
+                - 200
  - name: Doppler Personal Token
    id: kingfisher.doppler.2
    pattern: |
@ -52,11 +52,11 @@ rules:
          headers:
            Authorization: Bearer {{ TOKEN }}
            Accept: application/json
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status:
-              - 200
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status:
+                - 200

  - name: Doppler Service Token
    id: kingfisher.doppler.3
@ -82,11 +82,11 @@ rules:
          headers:
            Authorization: Bearer {{ TOKEN }}
            Accept: application/json
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status:
-              - 200
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status:
+                - 200

  - name: Doppler Service Account Token
    id: kingfisher.doppler.4
@ -112,11 +112,11 @@ rules:
          headers:
            Authorization: Bearer {{ TOKEN }}
            Accept: application/json
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status:
-              - 200
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status:
+                - 200

  - name: Doppler SCIM Token
    id: kingfisher.doppler.5
@ -142,11 +142,11 @@ rules:
          headers:
            Authorization: Bearer {{ TOKEN }}
            Accept: application/json
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status:
-              - 200
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status:
+                - 200

  - name: Doppler Audit Token
    id: kingfisher.doppler.6
@ -172,8 +172,8 @@ rules:
          headers:
            Authorization: Bearer {{ TOKEN }}
            Accept: application/json
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status:
-              - 200
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status:
+                - 200
--- a/data/rules/figma.yml
+++ b/data/rules/figma.yml
@ -23,6 +23,12 @@ rules:
            X-Figma-Token: '{{ TOKEN }}'
          method: GET
          url: https://api.figma.com/v1/me
+          response_matcher:
+            - report_response: true
+            - type: WordMatch
+              words: 
+                - "Invalid token"
+              negative: true

  - name: Figma Personal Access Header Token
    id: kingfisher.figma.2
--- a/data/rules/ibm.yml
+++ b/data/rules/ibm.yml
@ -3,8 +3,11 @@ rules:
    id: kingfisher.ibm.1
    pattern: |
      (?xi)
+      \b
      (?:ibm(?:cloud)?|bx)
      (?:.|[\n\r]){0,32}?
+      (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)
+      (?:.|[\n\r]){0,32}?
      \b
      (                             
        [0-9A-Z_-]{42,44}
--- a/data/rules/linear.yml
+++ b/data/rules/linear.yml
@ -30,9 +30,9 @@ rules:
              "query": "query { issues(first: 1) { nodes { id } } }"
            }
          url: https://api.linear.app/graphql 
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status: [200]
-          - type: WordMatch
-            words: ['"issues":', '"nodes":']
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status: [200]
+            - type: WordMatch
+              words: ['"issues":', '"nodes":']
--- a/data/rules/microsoft_teams.yml
+++ b/data/rules/microsoft_teams.yml
@ -42,11 +42,11 @@ rules:
          headers:
            Content-Type: application/json
          body: '{"text":""}'
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status:
-              - 400
-          - type: WordMatch
-            words:
-              - 'Text is required'
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status:
+                - 400
+            - type: WordMatch
+              words:
+                - 'Text is required'
--- a/data/rules/pagerdutyapikey.yml
+++ b/data/rules/pagerdutyapikey.yml
@ -2,35 +2,46 @@ rules:
  - name: PagerDuty API Key
    id: kingfisher.pagerduty.1
    pattern: |
-      (?xi)
+      (?xi)                                                     
      \b
-      (?:pagerduty|pager[_-]duty|pd[-_\]=\)]|pd\.webhook?)
-      (?:.|[\n\r]){0,16}?
-      (
-        u\+[A-Z0-9_+-]{18}        # new personal tokens
-        |
-        [A-Z0-9_-]{20}            # legacy personal tokens
-        |
-        [A-F0-9]{32}                 # integration keys / routing keys
+      (?:                                                       
+          Token |
+          Authorization |
+          pd[_-]? |
+          pd[_-]? |
+          pagerduty[_-]? |
+          pagerduty
      )
-      \b
-    min_entropy: 3.3
+      \W{0,20}
+      (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN)
+      (?:.|[\n\r]){0,16}?         
+      \b            
+      (                                                          
+        u\+[A-Z0-9_+-]{18}         |  # personal user token (20 chars)
+        [A-Z0-9_-]{20}             |  # legacy PAT (20 chars, mixed case)
+        [a-f0-9]{32}                     # integration / routing key (32 hex; matched case-insensitively)
+      )
+      \b                                          
+    min_entropy: 3.5
    confidence: medium
    examples:
-      - pagerduty_key = u+Lyhd2_N2MCy+ZoH-S5
+      - "Authorization: Token token=u+Lyhd2_N2MCy+ZoH-S5"
      - pd_key = u+3xVszZ-b4m+T6d23KA
+      - Token token=ABCDEF1234567890ABCDEF1234567890 
+    references:
+      - https://developer.pagerduty.com/api-reference/4555ca1c983d0-get-the-current-user    
    validation:
      type: Http
      content:
        request:
          method: GET
-          url: https://api.pagerduty.com/abilities 
+          url: https://api.pagerduty.com/users          
          headers:
            Authorization: Token token={{ TOKEN }}
-            Accept: application/vnd.pagerduty+json;version=2
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status: [200]
-          - type: WordMatch
-            words: ['"abilities":'] 
+            Accept: application/json
+          response_matcher:
+            - report_response: true
+            - type: JsonValid                              
+            - type: WordMatch                                
+              words: 
+                - '"users":'
--- a/data/rules/particle.io.yml
+++ b/data/rules/particle.io.yml
@ -29,13 +29,13 @@ rules:
        request:
          method: GET
          url: https://api.particle.io/v1/user?access_token={{ TOKEN }}
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status: [200]
-          - type: WordMatch
-            match_all_words: true
-            words: ['"username":']
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status: [200]
+            - type: WordMatch
+              match_all_words: true
+              words: ['"username":']

  - name: particle.io Access Token
    id: kingfisher.particleio.2
@ -65,10 +65,10 @@ rules:
        request:
          method: GET
          url: https://api.particle.io/v1/user?access_token={{ TOKEN }}
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status: [200]
-          - type: WordMatch
-            match_all_words: true
-            words: ['"username":']
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status: [200]
+            - type: WordMatch
+              match_all_words: true
+              words: ['"username":']
--- a/data/rules/pastebin.yml
+++ b/data/rules/pastebin.yml
@ -28,10 +28,10 @@ rules:
            Content-Type: application/x-www-form-urlencoded
          body: |
            api_dev_key={{ TOKEN }}&api_user_name=dummy&api_user_password=dummy
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status: [200]                                   
-          - type: WordMatch                                
-            words: ['invalid api_dev_key']
-            negative: true
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status: [200]                                   
+            - type: WordMatch                                
+              words: ['invalid api_dev_key']
+              negative: true
--- a/data/rules/paypal.yml
+++ b/data/rules/paypal.yml
@ -47,10 +47,10 @@ rules:
          Authorization: |
            Basic {{ CLIENTID | append: ':' | append: TOKEN | b64enc }}
        body: grant_type=client_credentials
-      response_matcher:
-        - report_response: true
-        - type: StatusMatch
-          status: [200]                                  
+        response_matcher:
+          - report_response: true
+          - type: StatusMatch
+            status: [200]                                  
    depends_on_rule:
      - rule_id: kingfisher.paypal.1
        variable: CLIENTID
--- a/data/rules/pypi.yml
+++ b/data/rules/pypi.yml
@ -18,11 +18,11 @@ rules:
        request:
          method: POST
          url: https://upload.pypi.org/legacy/
-          response_is_html: true 
+          response_is_html: true
          response_matcher:
            - report_response: true
            - type: WordMatch
-              words: 
+              words:
                - "isn't allowed to upload to project"
          headers:
            Authorization: 'Basic {{ "__token__:" | append: TOKEN | b64enc }}'
--- a/data/rules/tailscale.yml
+++ b/data/rules/tailscale.yml
@ -25,7 +25,7 @@ rules:
          headers:
            Authorization: "Bearer {{ TOKEN }}"
            Accept: application/json
-        response_matcher:
-          - report_response: true
-          - type: StatusMatch
-            status: [200]
+          response_matcher:
+            - report_response: true
+            - type: StatusMatch
+              status: [200]
--- a/data/rules/travisci.yml
+++ b/data/rules/travisci.yml
@ -28,7 +28,7 @@ rules:
              Authorization: token {{ TOKEN }}
              Accept: application/vnd.travis-ci.3+json
              Travis-API-Version: "3"
-          response_matcher:
-            - report_response: true
-            - type: StatusMatch
-              status: [200]
+            response_matcher:
+              - report_response: true
+              - type: StatusMatch
+                status: [200]
--- a/src/decompress.rs
+++ b/src/decompress.rs
@ -382,16 +382,11 @@ mod tests {
        Ok(())
    }

-    /// 3) Nested archive:
-    ///    outer.tar.gz  ──▶  outer.tar  (contains inner.tar.gz)
-    ///                               └──▶  inner.tar.gz  ──▶  inner.tar  (contains secret.txt)
+    /// 3) Nested archive: outer.tar.gz ──▶ outer.tar (contains inner.tar.gz);
+    ///    inner.tar.gz ──▶ inner.tar (contains secret.txt)
    #[test]
    fn smoke_decompress_nested_tar_gz_archives() -> anyhow::Result<()> {
-        use std::{
-            fs::File,
-            io::Read,
-            path::PathBuf,
-        };
+        use std::{fs::File, io::Read, path::PathBuf};

        use flate2::{write::GzEncoder, Compression};
        use tar::Builder;
@ -468,10 +463,7 @@ mod tests {
                for (logical, path) in files {
                    if logical.ends_with("!secret.txt") {
                        let txt = std::fs::read_to_string(&path)?;
-                        assert!(
-                            txt.contains("nested_secret=shh"),
-                            "secret.txt content corrupted"
-                        );
+                        assert!(txt.contains("nested_secret=shh"), "secret.txt content corrupted");
                        found = true;
                    }
                }
--- a/src/matcher.rs
+++ b/src/matcher.rs
@ -25,7 +25,6 @@ use smallvec::SmallVec;
 use tracing::debug;
 use xxhash_rust::xxh3::xxh3_64;

-use crate::rule_profiling::RuleTimer;
 use crate::{
    blob::{Blob, BlobId, BlobIdMap},
    entropy::calculate_shannon_entropy,
@ -33,7 +32,7 @@ use crate::{
    origin::OriginSet,
    parser,
    parser::{Checker, Language},
-    rule_profiling::{ConcurrentRuleProfiler, RuleStats},
+    rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer},
    rules::rule::Rule,
    rules_database::RulesDatabase,
    safe_list::is_safe_match,
@ -464,15 +463,8 @@ fn filter_match<'b>(
    filename: &str,
    profiler: Option<&Arc<ConcurrentRuleProfiler>>,
 ) {
-    let mut timer = profiler.map(|p| {
-        RuleTimer::new(
-            p,
-            rule.id(),
-            rule.name(),
-            &rule.syntax.pattern,
-            filename,
-        )
-    });
+    let mut timer =
+        profiler.map(|p| RuleTimer::new(p, rule.id(), rule.name(), &rule.syntax.pattern, filename));

    let initial_len = matches.len();

@ -989,7 +981,7 @@ mod test {
                    method: "GET".to_string(),
                    url: "https://example.com".to_string(),
                    headers: BTreeMap::new(),
-                    response_matcher: vec![],
+                    response_matcher: Some(vec![]),
                    multipart: None,
                    response_is_html: false,
                },
--- a/src/rules.rs
+++ b/src/rules.rs
@ -8,7 +8,7 @@ pub mod rule;
 use std::{fs::File, io::BufReader, path::Path};

 use anyhow::Context;
-use rule::{Confidence, RuleSyntax};
+use rule::{Confidence, RuleSyntax, Validation};
 use serde::de::DeserializeOwned;

 /// Custom error type for more granular rules loading errors.
@ -28,6 +28,9 @@ pub enum RulesError {

    #[error("Invalid ResponseMatcher variant in file: {0}, at line: {1}, column: {2}")]
    InvalidResponseMatcherVariant(String, usize, usize),
+
+    #[error("HTTP validation for rule `{rule_id}` in file {path} missing response_matcher")]
+    MissingResponseMatcher { path: String, rule_id: String },
 }

 /// Represents a collection of rule syntaxes.
@ -58,6 +61,21 @@ impl Rules {
            match serde_yaml::from_reader::<_, Rules>(contents) {
                Ok(mut rs) => {
                    rs.rules.retain(|rule| rule.confidence.is_at_least(&confidence));
+                    for rule_syntax in &rs.rules {
+                        if let Some(Validation::Http(http_val)) = &rule_syntax.validation {
+                            if http_val
+                                .request
+                                .response_matcher
+                                .as_ref()
+                                .map_or(true, |m| m.is_empty())
+                            {
+                                bail!(RulesError::MissingResponseMatcher {
+                                    path: path.display().to_string(),
+                                    rule_id: rule_syntax.id.clone(),
+                                });
+                            }
+                        }
+                    }
                    rules.update(rs);
                }
                Err(e) => {
--- a/src/rules/rule.rs
+++ b/src/rules/rule.rs
@ -65,7 +65,7 @@ pub struct HttpRequest {
    #[serde(default)]
    pub headers: BTreeMap<String, String>,
    #[serde(default)]
-    pub response_matcher: Vec<ResponseMatcher>,
+    pub response_matcher: Option<Vec<ResponseMatcher>>,
    #[serde(default)]
    pub multipart: Option<MultipartConfig>,
    // allow HTML only when explicitly set true
--- a/src/scanner/summary.rs
+++ b/src/scanner/summary.rs
@ -161,7 +161,7 @@ pub fn print_scan_summary(
            if !stats.is_empty() {
                // Calculate dynamic column widths
                let name_w = stats.iter().map(|s| s.rule_name.len()).max().unwrap_or(4);
-                let id_w   = stats.iter().map(|s| s.rule_id.len()).max().unwrap_or(2);
+                let id_w = stats.iter().map(|s| s.rule_id.len()).max().unwrap_or(2);

                // Header
                safe_println!("\n{:-^1$}", " Rule Performance Stats ", name_w + id_w + 47);
@ -173,7 +173,7 @@ pub fn print_scan_summary(
                    "Slowest",
                    "Average",
                    name_w = name_w,
-                    id_w   = id_w
+                    id_w = id_w
                );
                safe_println!("{:-<width$}", "", width = name_w + id_w + 49);

@ -187,14 +187,13 @@ pub fn print_scan_summary(
                        rs.slowest_match_time,
                        rs.average_match_time,
                        name_w = name_w,
-                        id_w   = id_w
+                        id_w = id_w
                    );
                }
            }
        }
    }

-
    debug!("\nAll Rules with Matches:");
    debug!("=======================");
    let max_rule_length = sorted_findings.iter().map(|(rule, _)| rule.len()).max().unwrap_or(0);
--- a/src/validation.rs
+++ b/src/validation.rs
@ -514,8 +514,14 @@ async fn timed_validate_single_match<'a>(

                    m.validation_response_status = status;
                    m.validation_response_body = body.clone();
+                    let matchers = http_validation
+                        .request
+                        .response_matcher
+                        .as_ref()
+                        .expect("missing response_matcher");
+
                    m.validation_success = httpvalidation::validate_response(
-                        &http_validation.request.response_matcher,
+                        matchers,
                        &body,
                        &status,
                        &headers,
@ -880,6 +886,12 @@ rules:
        request:
          method: POST
          url: https://upload.pypi.org/legacy/
+          response_is_html: true
+          response_matcher:
+            - report_response: true
+            - type: WordMatch
+              words:
+                - "isn't allowed to upload to project"
          headers:
            Authorization: 'Basic {{ "__token__:" | append: TOKEN | b64enc }}'
          multipart:
--- a/tests/cli_failure.rs
+++ b/tests/cli_failure.rs
@ -30,7 +30,7 @@ fn scan_fails_for_bad_rule_yaml() {
            "--rules-path",
            tmp.path().to_str().unwrap(), // point loader at bad YAML
            "--no-validate",              // keep the test fast
-             "--no-update-check", // skip update check to avoid network calls
+            "--no-update-check",          // skip update check to avoid network calls
        ])
        .assert()
        .failure()
@ -72,7 +72,7 @@ rules:
            tmp.path().to_str().unwrap(), // only the custom rule
            "--no-dedup",
            "--load-builtins=false", // skip the builtin rules
-            "--no-update-check", // skip update check to avoid network calls
+            "--no-update-check",     // skip update check to avoid network calls
        ])
        .assert()
        .failure() // CLI exits 0
--- a/tests/int_validation_cache.rs
+++ b/tests/int_validation_cache.rs
@ -76,6 +76,12 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
            request:
              method: GET
              url: '{base}/validate?token={{ {{ TOKEN }} }}'
+              response_matcher:
+                  - report_response: true
+                  - type: WordMatch
+                    words:
+                      - '"error_code":"403003"'
+                    negative: true
    "#,
        base = server.uri()
    );
--- a/tests/smoke_archive.rs
+++ b/tests/smoke_archive.rs
@ -30,7 +30,14 @@ fn smoke_scan_tar_gz_archive() -> anyhow::Result<()> {

    // ── 1) extraction ENABLED -- secret should be found ─────────────────────────
    Command::cargo_bin("kingfisher")?
-        .args(["scan", tar_gz.to_str().unwrap(), "--confidence=low", "--format", "json", "--no-update-check"])
+        .args([
+            "scan",
+            tar_gz.to_str().unwrap(),
+            "--confidence=low",
+            "--format",
+            "json",
+            "--no-update-check",
+        ])
        .assert()
        .code(findings_code)
        .stdout(predicates::str::contains(github_pat));