From 0d33dff196e993f4cf2a6b655a8b65c83a31581a Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Wed, 8 Apr 2026 11:09:36 -0700 Subject: [PATCH] changes in response to PR review --- crates/kingfisher-rules/src/rules_database.rs | 94 ++++++++++++++++--- .../src/validation/http_validation.rs | 39 +++++++- src/matcher/mod.rs | 43 +++++++++ src/validation.rs | 16 +++- tests/int_context_verification.rs | 80 +--------------- 5 files changed, 173 insertions(+), 99 deletions(-) diff --git a/crates/kingfisher-rules/src/rules_database.rs b/crates/kingfisher-rules/src/rules_database.rs index 3a28fac..5ba76ea 100644 --- a/crates/kingfisher-rules/src/rules_database.rs +++ b/crates/kingfisher-rules/src/rules_database.rs @@ -60,21 +60,7 @@ impl RulesDatabase { let mut reason_codes: Vec<&'static str> = Vec::new(); - let has_self_identifying_prefix = [ - "ccipat_", - "xoxb-", - "xoxa-", - "xoxp-", - "xapp-", - "ghp_", - "github_pat_", - "sk_live_", - "sk_test_", - "ltai", - "akia", - ] - .iter() - .any(|m| normalized.contains(m)); + let has_self_identifying_prefix = has_self_identifying_shape(&normalized); if has_self_identifying_prefix { reason_codes.push("self_identifying_prefix"); return RuleMatchProfile { @@ -307,6 +293,33 @@ impl RulesDatabase { } } +fn has_self_identifying_shape(normalized_pattern: &str) -> bool { + let literal_markers = [ + "ccipat_", + "xapp-", + "ghp_", + "github_pat_", + "sk_live_", + "sk_test_", + "ltai", + "akia", + "aizasy", + "pypi-ageichlwas5vcmc", + "https://hooks\\.slack\\.com/services/", + ]; + + literal_markers.iter().any(|needle| normalized_pattern.contains(needle)) + || normalized_pattern.contains("xox[pbarose]") + || normalized_pattern.contains("xoxe\\.xox[bparose]-") + || normalized_pattern.contains("xoxe-\\d-") + || (normalized_pattern.contains("-----begin\\s") + && normalized_pattern.contains("private\\skey") + && normalized_pattern.contains("-----end\\s")) + || (normalized_pattern.contains("-----begin\\ ") + && normalized_pattern.contains("private\\ key") + && normalized_pattern.contains("-----end\\ ")) +} + fn has_generic_token_class(normalized_pattern: &str) -> bool { [ "[a-za-z0-9]{", @@ -436,6 +449,57 @@ mod test_rule_match_profiles { assert!(profile.reason_codes.contains(&"self_identifying_prefix")); } + #[test] + fn classifies_google_api_key_rule_as_self_identifying() { + let rule = mk_rule("kingfisher.google.7", r"(?xi)\b(AIzaSy[A-Za-z0-9_-]{33})"); + let profile = RulesDatabase::classify_rule_profile(&rule); + assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); + } + + #[test] + fn classifies_slack_token_charclass_rule_as_self_identifying() { + let rule = mk_rule( + "kingfisher.slack.2", + r"(?xi)\b(xox[pbarose][-0-9]{0,3}-[0-9a-z]{6,15}-[0-9a-z]{6,15}-[-0-9a-z]{6,66})\b", + ); + let profile = RulesDatabase::classify_rule_profile(&rule); + assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); + } + + #[test] + fn classifies_slack_webhook_rule_as_self_identifying() { + let rule = mk_rule( + "kingfisher.slack.4", + r"(?xi)\b(https://hooks\.slack\.com/services/T[a-z0-9_-]{8,12}/B[a-z0-9_-]{8,12}/[a-z0-9_-]{20,30})", + ); + let profile = RulesDatabase::classify_rule_profile(&rule); + assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); + } + + #[test] + fn classifies_pypi_token_rule_as_self_identifying() { + let rule = mk_rule("kingfisher.pypi.1", r"(?x)(pypi-AgEIcHlwaS5vcmc[A-Za-z0-9_-]{50,})\b"); + let profile = RulesDatabase::classify_rule_profile(&rule); + assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); + } + + #[test] + fn classifies_private_key_envelope_rules_as_self_identifying() { + let rule = mk_rule( + "kingfisher.privkey.2", + r"(?xims)(-----BEGIN\s(?:RSA|PGP|DSA|OPENSSH|ENCRYPTED|EC)?\s{0,1}PRIVATE\sKEY-----[a-z0-9 /+=\r\n\\n]{32,}?-----END\s(?:RSA|PGP|DSA|OPENSSH|ENCRYPTED|EC)?\s{0,1}PRIVATE\sKEY-----)", + ); + let profile = RulesDatabase::classify_rule_profile(&rule); + assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); + + let pem_rule = mk_rule( + "kingfisher.pem.1", + r#"(?x)-----BEGIN\ .{0,20}\ ?PRIVATE\ KEY\ ?.{0,20}-----\s*((?:[a-zA-Z0-9+/=\s"',]|\\r|\\n){50,})\s*-----END\ .{0,20}\ ?PRIVATE\ KEY\ ?.{0,20}-----"#, + ); + let pem_profile = RulesDatabase::classify_rule_profile(&pem_rule); + assert_eq!(pem_profile.kind, RuleDetectionProfileKind::SelfIdentifying); + } + #[test] fn classifies_context_dependent_generic_rule() { let rule = mk_rule( diff --git a/crates/kingfisher-scanner/src/validation/http_validation.rs b/crates/kingfisher-scanner/src/validation/http_validation.rs index 0862323..255edd3 100644 --- a/crates/kingfisher-scanner/src/validation/http_validation.rs +++ b/crates/kingfisher-scanner/src/validation/http_validation.rs @@ -36,12 +36,11 @@ use kingfisher_rules::ResponseMatcher; /// Build a deterministic cache key from the immutable parts of an HTTP request. pub fn generate_http_cache_key_parts( method: &str, - url: &Url, + url: &str, headers: &BTreeMap, body: Option<&str>, ) -> String { let method = method.to_uppercase(); - let url = url.as_str(); let mut hasher = Sha1::new(); hasher.update(method.as_bytes()); @@ -101,6 +100,21 @@ pub fn with_request_template_globals(globals: &Object) -> Object { out } +/// Clone `globals` and add stable placeholder values for request-scoped template vars that +/// would otherwise make HTTP validation cache keys vary per execution. +pub fn with_cache_key_template_globals(globals: &Object) -> Object { + let mut out = globals.clone(); + + if !out.contains_key("REQUEST_RFC1123_DATE") { + out.insert("REQUEST_RFC1123_DATE".into(), Value::scalar("REQUEST_RFC1123_DATE")); + } + if !out.contains_key("REQUEST_UNIX_MILLIS") { + out.insert("REQUEST_UNIX_MILLIS".into(), Value::scalar("REQUEST_UNIX_MILLIS")); + } + + out +} + /// Build a reqwest RequestBuilder using the provided parameters. pub fn build_request_builder( client: &Client, @@ -630,6 +644,27 @@ mod tests { assert_eq!(rendered.get("REQUEST_UNIX_MILLIS").unwrap().to_kstr(), "123"); } + #[test] + fn cache_key_template_globals_use_stable_placeholders() { + let globals = Object::new(); + let rendered = with_cache_key_template_globals(&globals); + + assert_eq!(rendered.get("REQUEST_RFC1123_DATE").unwrap().to_kstr(), "REQUEST_RFC1123_DATE"); + assert_eq!(rendered.get("REQUEST_UNIX_MILLIS").unwrap().to_kstr(), "REQUEST_UNIX_MILLIS"); + } + + #[test] + fn cache_key_template_globals_preserve_explicit_overrides() { + let mut globals = Object::new(); + globals.insert("REQUEST_RFC1123_DATE".into(), Value::scalar("custom-date")); + globals.insert("REQUEST_UNIX_MILLIS".into(), Value::scalar("123")); + + let rendered = with_cache_key_template_globals(&globals); + + assert_eq!(rendered.get("REQUEST_RFC1123_DATE").unwrap().to_kstr(), "custom-date"); + assert_eq!(rendered.get("REQUEST_UNIX_MILLIS").unwrap().to_kstr(), "123"); + } + #[test] fn rejects_ipv4_loopback() { assert!(!is_ssrf_safe_ip(&IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)))); diff --git a/src/matcher/mod.rs b/src/matcher/mod.rs index 9fe0fa5..5e2b03c 100644 --- a/src/matcher/mod.rs +++ b/src/matcher/mod.rs @@ -1215,4 +1215,47 @@ line2 assert_eq!(found.len(), 1, "self-identifying tokens should remain raw-pass findings"); Ok(()) } + + #[test] + fn self_identifying_charclass_prefix_rule_remains_hyperscan_only() -> Result<()> { + let token = "xoxb-730191371696-1413868247813-IG7Z6nYevC2hdviE3aJhb5kY"; + let rule = Rule::new(RuleSyntax { + id: "kingfisher.slack.2".into(), + name: "slack token".into(), + pattern: + "(?xi)\\b(xox[pbarose][-0-9]{0,3}-[0-9a-z]{6,15}-[0-9a-z]{6,15}-[-0-9a-z]{6,66})\\b" + .into(), + confidence: crate::rules::rule::Confidence::Medium, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + revocation: None, + depends_on_rule: vec![], + pattern_requirements: None, + tls_mode: None, + }); + + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone()))); + let mut matcher = + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; + + let blob = Blob::from_bytes(format!("token={token}").into_bytes()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("slack.txt"))); + + let found = match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => matches, + _ => panic!("unexpected scan result"), + }; + assert_eq!( + found.len(), + 1, + "self-identifying token families should not require parser context" + ); + Ok(()) + } } diff --git a/src/validation.rs b/src/validation.rs index 5cdf823..804f7f2 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -615,6 +615,7 @@ async fn timed_validate_single_match<'a>( let multipart_timeout = validation_timeout; let max_retries: u32 = validation_retries; let request_globals = httpvalidation::with_request_template_globals(&globals); + let cache_globals = httpvalidation::with_cache_key_template_globals(&globals); // render URL let url = match render_and_parse_url( parser, @@ -661,10 +662,19 @@ async fn timed_validate_single_match<'a>( // old per-request cache (optional) if !is_multipart { + let cache_url = render_template( + parser, + &cache_globals, + &rule_syntax.name, + &http_validation.request.url, + ) + .await + .unwrap_or_else(|_| http_validation.request.url.clone()); + let rendered_headers = httpvalidation::process_headers( &http_validation.request.headers, parser, - &request_globals, + &cache_globals, &url, ) .unwrap_or_default(); @@ -682,12 +692,12 @@ async fn timed_validate_single_match<'a>( parser .parse(body_template) .ok() - .and_then(|template| template.render(&request_globals).ok()) + .and_then(|template| template.render(&cache_globals).ok()) }); cache_key = httpvalidation::generate_http_cache_key_parts( http_validation.request.method.as_str(), - &url, + &cache_url, &header_map, rendered_body.as_deref(), ); diff --git a/tests/int_context_verification.rs b/tests/int_context_verification.rs index 8737670..44f85c9 100644 --- a/tests/int_context_verification.rs +++ b/tests/int_context_verification.rs @@ -3,84 +3,6 @@ use std::{fs, process::Command}; use anyhow::{Context, Result}; use serde_json::{Deserializer, Value}; -fn macos_arm64_known_missing_findings() -> &'static [(&'static str, &'static str)] { - &[ - ("kingfisher.google.7", "AIzaSyBUPHAjZl3n8Eza66ka6B78iVyPteC5MgM"), - ( - "kingfisher.pem.1", - "MIICWQIBAAKBgHsSuRPLMDrxcwMB9P6ubGFGmlSvHvSXq2kfwycrcEKf/TCctShzA2HYo2IWed8n1rqazlESHnhNmCWlFWIMMFWagZyDBy9yy71MhWISvoTuQVyCx/z3q1v171fy+Ds5smKwZ8wK3bgwBTR7BTKfYNmearDZvPJgwK0jsYEJDZ/DAgElAoGAMeT+7FlK53akP31VfAFG4j83pcp0VVI+kmbSk1bMpWN0e33M5uKE1KPvNZpowkCVUpHJQ3YMWkj4ffbRUUM2L/jQmKkICf7vynIdq5cj+lF6lNXSzwq6pVR6/octdeKS/70DuGcVG+LiRTu2mRb6mPY9bIJIvcgenXajnVanx9UCQQDRwf6oyU/EH4x+kw/XQZi/RebtDPD1yIQuhVG8B1xkPxBsAywTwVDL7DSZ1BsbWJcl5HcXt/q0n/3NZ62XRr1VAkEAljSLsMOk5H7XCctEk3mCu1WgCsUvb/RRCBiBT+cic14OpVtytJMAeLeqcAhIj54ef4hQPGKbAsQZ3E/X4EsotwJAa7alXZfPA9jZcW4c5Ciai7wcoz3/MhrcF+OYrKnVf5YBg5LtHua6yZT4aqswg6oIbWd7bQty5yG5rqrcmcphOQJAHGrOUd/TFnjckyZ0wfRk11VjeG2Fg+IdKwuOFgkiMYB/T7da4+R1tfk7666KRK82M82uUJ0IkdISuvpZRhwOnwJBAI34lnrN4bNcUVB5kAXT9huyH8tJomNdsJOufS3vCi5tKaqKIc3jMIwtyuXsn4NhJNUFlgfPL70CPtb3x/eePqw=", - ), - ( - "kingfisher.privkey.2", - "-----BEGIN RSA PRIVATE KEY-----MIICWQIBAAKBgHsSuRPLMDrxcwMB9P6ubGFGmlSvHvSXq2kfwycrcEKf/TCctShzA2HYo2IWed8n1rqazlESHnhNmCWlFWIMMFWagZyDBy9yy71MhWISvoTuQVyCx/z3q1v171fy+Ds5smKwZ8wK3bgwBTR7BTKfYNmearDZvPJgwK0jsYEJDZ/DAgElAoGAMeT+7FlK53akP31VfAFG4j83pcp0VVI+kmbSk1bMpWN0e33M5uKE1KPvNZpowkCVUpHJQ3YMWkj4ffbRUUM2L/jQmKkICf7vynIdq5cj+lF6lNXSzwq6pVR6/octdeKS/70DuGcVG+LiRTu2mRb6mPY9bIJIvcgenXajnVanx9UCQQDRwf6oyU/EH4x+kw/XQZi/RebtDPD1yIQuhVG8B1xkPxBsAywTwVDL7DSZ1BsbWJcl5HcXt/q0n/3NZ62XRr1VAkEAljSLsMOk5H7XCctEk3mCu1WgCsUvb/RRCBiBT+cic14OpVtytJMAeLeqcAhIj54ef4hQPGKbAsQZ3E/X4EsotwJAa7alXZfPA9jZcW4c5Ciai7wcoz3/MhrcF+OYrKnVf5YBg5LtHua6yZT4aqswg6oIbWd7bQty5yG5rqrcmcphOQJAHGrOUd/TFnjckyZ0wfRk11VjeG2Fg+IdKwuOFgkiMYB/T7da4+R1tfk7666KRK82M82uUJ0IkdISuvpZRhwOnwJBAI34lnrN4bNcUVB5kAXT9huyH8tJomNdsJOufS3vCi5tKaqKIc3jMIwtyuXsn4NhJNUFlgfPL70CPtb3x/eePqw=-----END RSA PRIVATE KEY-----", - ), - ( - "kingfisher.pypi.1", - "pypi-AgEIcHlwaS5vcmcCAWEAAAYgNh9pJUqVF-EtMCwGaZYcStFR07RbE8hyb9h2vYxifO8", - ), - ( - "kingfisher.pypi.1", - "pypi-AgEIcHlwaS5vcmcCAWIAAAYgf_d_XvJfqkOhrkqbEBo-eW9UID46ABNJIdGfaO3n3_k", - ), - ( - "kingfisher.pypi.1", - "pypi-AgEIcHlwaS5vcmcCAWIAAAYgxbyLvb9egSCECeOdB3qW3h4oXEoNC6kJI0NtaFOQlUY", - ), - ( - "kingfisher.pypi.1", - "pypi-AgEIcHlwaS5vcmcCAWIAAi97InZlcnNpb24iOiAxLCAicGVybWlzc2lvbnMiOiB7InByb2plY3RzIjogW119fQAABiBWHBa1jsbY-iN-Swf3JCrxy8Q8eRCxMrc_1KkkDuB6KQ", - ), - ( - "kingfisher.pypi.1", - "pypi-AgEIcHlwaS5vcmcCAWIAAiV7InZlcnNpb24iOiAxLCAicGVybWlzc2lvbnMiOiAidXNlciJ9AAAGIBeIJGhXk8kPPref7vLuwlKbnSWusZKZivIh92GRUUX4", - ), - ( - "kingfisher.slack.1", - "xapp-1-A01C259PH2A-1440755929120-7d5241948a2cc1b464add85df8a8e75f9040ae2869f6599926ed0b9dcafdb32b", - ), - ( - "kingfisher.slack.2", - "xoxb-730191371696-1413868247813-IG7Z6nYevC2hdviE3aJhb5kY", - ), - ( - "kingfisher.slack.4", - "https://hooks.slack.com/services/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", - ), - ] -} - -fn is_known_macos_arm64_missing_finding(finding: &Value) -> bool { - let rule_id = finding.get("rule_id").and_then(Value::as_str).unwrap_or_default(); - let snippet = finding.get("snippet").and_then(Value::as_str).unwrap_or_default(); - macos_arm64_known_missing_findings().iter().any(|(known_rule_id, known_snippet)| { - rule_id == *known_rule_id && snippet == *known_snippet - }) -} - -fn assert_findings_match_for_platform(actual: Vec, expected: Vec) { - if cfg!(all(target_os = "macos", target_arch = "aarch64")) { - let missing = expected - .iter() - .filter(|finding| !actual.contains(finding)) - .cloned() - .collect::>(); - let extras = actual - .iter() - .filter(|finding| !expected.contains(finding)) - .cloned() - .collect::>(); - - assert!(extras.is_empty(), "unexpected extra findings on macOS ARM64: {extras:#?}"); - assert!( - missing.iter().all(is_known_macos_arm64_missing_finding), - "unexpected missing findings on macOS ARM64: {missing:#?}" - ); - return; - } - - assert_eq!(actual, expected); -} - #[test] fn scan_findings_match_pre_removal_baseline() -> Result<()> { let output = Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) @@ -156,6 +78,6 @@ fn scan_findings_match_pre_removal_baseline() -> Result<()> { .collect::>(); expected.sort_by(|left, right| left.to_string().cmp(&right.to_string())); - assert_findings_match_for_platform(actual, expected); + assert_eq!(actual, expected); Ok(()) }