diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cfebf6..43f3d60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file. ## [v1.83.0] +- Improved tree-sitter parsing + structured secret detection in source files. A Vectorscan pre-filter over the combined tree-sitter output avoids the O(results × rules) regex cost. - Access Map: added Hugging Face, Gitea, Bitbucket, PostgreSQL, and MongoDB providers. All perform read-only enumeration with severity classification. - Access Map: Hugging Face, Bitbucket, Postgres, and MongoDB credentials from scans are now auto-collected when using `--access-map`. - Access Map CLI: added providers `huggingface`/`hf`, `gitea`, `bitbucket`, `postgres`, `mongodb`/`mongo`. diff --git a/Cargo.toml b/Cargo.toml index b9e6e0f..a443814 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,35 @@ homepage = "https://github.com/mongodb/kingfisher" repository = "https://github.com/mongodb/kingfisher" publish = false +[workspace.dependencies] +anyhow = "1.0" +thiserror = "1.0.69" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +schemars = "0.8" +regex = "1.12" +bstr = { version = "1.12", features = ["serde"] } +smallvec = { version = "1", features = ["const_generics", "const_new", "union"] } +once_cell = "1.21" +parking_lot = "0.12" +tracing = "0.1" +vectorscan-rs = "0.0.5" +xxhash-rust = { version = "0.8", features = ["xxh3"] } +ignore = "0.4" +walkdir = "2.5" +include_dir = "0.7" +sha1 = "0.10" +sha2 = "0.10" +hmac = "0.12" +base64 = "0.22" +percent-encoding = "2.3" +time = "0.3" +uuid = "1.19" +rand = "0.9" +hex = "0.4" +rustc-hash = "2.1" +http = "1.4" + [package] name = "kingfisher" @@ -65,9 +94,9 @@ clap = { version = "4.5", features = [ anyhow = "1.0" bstr = { version = "1.12", features = ["serde"] } fixedbitset = "0.5" -gix = { version = "0.73", features = ["max-performance", "serde", "blocking-network-client"] } +gix = { version = "0.73", 
features = ["max-performance-safe", "serde", "blocking-network-client"] } ignore = "0.4" -petgraph = "0.6" +petgraph = "0.8" roaring = "0.10" schemars = "0.8" serde = { version = "1.0", features = ["derive", "rc"] } @@ -128,38 +157,38 @@ mysql_async = { version = "0.34.2", default-features = false, features = ["defau bson = "2.15.0" ring = "0.17.14" pem = "3.0.6" -aws-config = "1.8.12" -aws-credential-types = "1.2.11" -aws-sdk-sts = "1.95.0" -aws-types = "1.3.11" +aws-config = { version = "1.8.14", default-features = false, features = ["default-https-client", "rt-tokio", "credentials-process", "sso"] } +aws-credential-types = "1.2.12" +aws-sdk-sts = { version = "1.98.0", default-features = false, features = ["default-https-client", "rt-tokio"] } +aws-types = "1.3.12" byteorder = "1.5.0" parking_lot = "0.12.5" -octorust = "0.9.0" +octorust = "0.10.0" reqwest-middleware = "0.4.2" tracing-subscriber = {version = "0.3.22", features = ["env-filter"] } tracing-core = "0.1.35" -tree-sitter = "0.25.10" -aws-smithy-http-client = "1.1.5" -aws-smithy-runtime-api = "1.9.3" -aws-smithy-types = "1.3.5" +tree-sitter = "0.26.5" +aws-smithy-http-client = "1.1.10" +aws-smithy-runtime-api = "1.11.4" +aws-smithy-types = "1.4.4" tree-sitter-bash = "0.25.1" tree-sitter-c = "0.24.1" tree-sitter-c-sharp = "0.23.1" tree-sitter-cpp = "0.23.4" -tree-sitter-css = "0.23.2" -tree-sitter-go = "0.23.4" +tree-sitter-css = "0.25.0" +tree-sitter-go = "0.25.0" tree-sitter-html = "0.23.2" tree-sitter-java = "0.23.5" -tree-sitter-javascript = "0.23.1" -tree-sitter-php = "0.23.11" -tree-sitter-python = "0.23.6" +tree-sitter-javascript = "0.25.0" +tree-sitter-php = "0.24.2" +tree-sitter-python = "0.25.0" tree-sitter-ruby = "0.23.1" tree-sitter-rust = "0.24.0" tree-sitter-toml-ng = "0.7.0" tree-sitter-typescript = "0.23.2" tree-sitter-yaml = "0.7.2" streaming-iterator = "0.1.9" -tree-sitter-regex = "0.24.3" +tree-sitter-regex = "0.25.0" tree_magic_mini = "3.2" content_inspector = "0.2.4" rustc-hash = 
"2.1.1" @@ -207,19 +236,19 @@ oci-client = { version = "0.15", default-features = false, features = ["rustls-t walkdir = "2.5.0" p256 = "0.13.2" ed25519-dalek = { version = "2.2", features = ["pkcs8"] } -aws-sdk-s3 = "1.117.0" -aws-sdk-iam = "1.101.0" -aws-sdk-ec2 = "1.196.0" -aws-sdk-dynamodb = "1.101.0" -aws-sdk-lambda = "1.111.0" -aws-sdk-kms = "1.97.0" -aws-sdk-secretsmanager = "1.96.0" +aws-sdk-s3 = { version = "1.123.0", default-features = false, features = ["default-https-client", "rt-tokio"] } +aws-sdk-iam = { version = "1.104.0", default-features = false, features = ["default-https-client", "rt-tokio"] } +aws-sdk-ec2 = { version = "1.211.0", default-features = false, features = ["default-https-client", "rt-tokio"] } +aws-sdk-dynamodb = { version = "1.105.0", default-features = false, features = ["default-https-client", "rt-tokio"] } +aws-sdk-lambda = { version = "1.116.0", default-features = false, features = ["default-https-client", "rt-tokio"] } +aws-sdk-kms = { version = "1.100.0", default-features = false, features = ["default-https-client", "rt-tokio"] } +aws-sdk-secretsmanager = { version = "1.100.0", default-features = false, features = ["default-https-client", "rt-tokio"] } gcloud-storage = { version = "1.1.1", default-features = false, features = [ "rustls-tls", "auth", "jwt-aws-lc-rs", ] } -tokei = "12.1.2" +tokei = "14.0.0" crc32fast = "1.5.0" bytes = "1.11.1" tokio-rustls = "0.26.4" diff --git a/crates/kingfisher-core/Cargo.toml b/crates/kingfisher-core/Cargo.toml index dfd9513..ef532f3 100644 --- a/crates/kingfisher-core/Cargo.toml +++ b/crates/kingfisher-core/Cargo.toml @@ -12,30 +12,30 @@ publish.workspace = true [dependencies] # Serialization -serde = { version = "1.0", features = ["derive", "rc"] } -serde_json = "1.0" -schemars = "0.8" +serde = { workspace = true, features = ["rc"] } +serde_json.workspace = true +schemars.workspace = true # Error handling -anyhow = "1.0" -thiserror = "1.0" +anyhow.workspace = true +thiserror.workspace = 
true # Hashing and crypto -sha1 = "0.10" -hex = "0.4" +sha1.workspace = true +hex.workspace = true # Memory management memmap2 = "0.9" -once_cell = "1.21" -parking_lot = "0.12" +once_cell.workspace = true +parking_lot.workspace = true # Collections -smallvec = { version = "1", features = ["const_generics", "const_new", "union"] } -rustc-hash = "2.1" +smallvec.workspace = true +rustc-hash.workspace = true dashmap = "6.1" # Byte string handling -bstr = { version = "1.12", features = ["serde"] } +bstr.workspace = true memchr = "2.7" # Git types (minimal, for ObjectId and Time) diff --git a/crates/kingfisher-rules/Cargo.toml b/crates/kingfisher-rules/Cargo.toml index c63ade1..9b417ee 100644 --- a/crates/kingfisher-rules/Cargo.toml +++ b/crates/kingfisher-rules/Cargo.toml @@ -15,49 +15,49 @@ publish.workspace = true kingfisher-core = { path = "../kingfisher-core" } # Serialization -serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" +serde.workspace = true +serde_json.workspace = true serde_yaml = "0.9" -schemars = "0.8" +schemars.workspace = true # Error handling -anyhow = "1.0" -thiserror = "1.0" +anyhow.workspace = true +thiserror.workspace = true # Regex -regex = "1.12" +regex.workspace = true lazy_static = "1.5" # Hashing -xxhash-rust = { version = "0.8", features = ["xxh3"] } +xxhash-rust.workspace = true # Liquid templating (for checksum validation) liquid = "0.26" liquid-core = "0.26" # Crypto for liquid filters -base64 = "0.22" +base64.workspace = true crc32fast = "1.5" -hmac = "0.12" -sha1 = "0.10" -sha2 = "0.10" -percent-encoding = "2.3" -time = "0.3" -uuid = { version = "1.19", features = ["v4"] } -rand = "0.9" +hmac.workspace = true +sha1.workspace = true +sha2.workspace = true +percent-encoding.workspace = true +time.workspace = true +uuid = { workspace = true, features = ["v4"] } +rand.workspace = true # Vectorscan for pattern matching -vectorscan-rs = "0.0.5" +vectorscan-rs.workspace = true # Logging -tracing = "0.1" +tracing.workspace 
= true # File walking -walkdir = "2.5" -ignore = "0.4" +walkdir.workspace = true +ignore.workspace = true # Embedded rule files -include_dir = "0.7" +include_dir.workspace = true [dev-dependencies] pretty_assertions = "1.4" diff --git a/crates/kingfisher-scanner/Cargo.toml b/crates/kingfisher-scanner/Cargo.toml index edc9e84..bdeec18 100644 --- a/crates/kingfisher-scanner/Cargo.toml +++ b/crates/kingfisher-scanner/Cargo.toml @@ -111,44 +111,44 @@ kingfisher-core = { path = "../kingfisher-core" } kingfisher-rules = { path = "../kingfisher-rules" } # Error handling -anyhow = "1.0" -thiserror = "1.0" +anyhow.workspace = true +thiserror.workspace = true # Serialization -serde = { version = "1.0", features = ["derive"], optional = true } -serde_json = "1.0" -schemars = "0.8" +serde = { workspace = true, optional = true } +serde_json.workspace = true +schemars.workspace = true # Regex -regex = "1.12" +regex.workspace = true # Hashing -xxhash-rust = { version = "0.8", features = ["xxh3"] } +xxhash-rust.workspace = true # Vectorscan -vectorscan-rs = "0.0.5" +vectorscan-rs.workspace = true # Collections -smallvec = { version = "1", features = ["const_generics", "const_new", "union"] } -rustc-hash = "2.1" +smallvec.workspace = true +rustc-hash.workspace = true # Concurrency -parking_lot = "0.12" +parking_lot.workspace = true thread_local = "1.1" -once_cell = "1.21" +once_cell.workspace = true crossbeam-skiplist = "0.1.3" # HTTP status codes -http = "1.4" +http.workspace = true # Byte strings -bstr = { version = "1.12", features = ["serde"] } +bstr.workspace = true # Base64 for decoding -base64 = "0.22" +base64.workspace = true # Logging -tracing = "0.1" +tracing.workspace = true # ---- Optional validation dependencies ---- @@ -160,18 +160,18 @@ tokio = { version = "1.48", features = ["net", "time", "sync"], optional = true liquid = { version = "0.26", optional = true } liquid-core = { version = "0.26", optional = true } quick-xml = { version = "0.38", features = ["serde", 
"serialize"], optional = true } -sha1 = { version = "0.10", optional = true } +sha1 = { workspace = true, optional = true } chrono = { version = "0.4.42", optional = true } -hmac = { version = "0.12", optional = true } -sha2 = { version = "0.10", optional = true } +hmac = { workspace = true, optional = true } +sha2 = { workspace = true, optional = true } pem = { version = "3.0.6", optional = true } -percent-encoding = { version = "2.3.2", optional = true } +percent-encoding = { workspace = true, optional = true } ring = { version = "0.17", optional = true } ipnet = { version = "2.11", optional = true } jsonwebtoken = { version = "10.2.0", features = ["aws-lc-rs"], optional = true } p256 = { version = "0.13.2", optional = true } ed25519-dalek = { version = "2.2", features = ["pkcs8"], optional = true } -hex = { version = "0.4.3", optional = true } +hex = { workspace = true, optional = true } url = { version = "2.5.7", optional = true } bson = { version = "2.15.0", optional = true } mongodb = { version = "3.4", default-features = false, features = ["rustls-tls", "aws-auth", "compat-3-0-0", "dns-resolver"], optional = true } @@ -182,14 +182,14 @@ rustls = { version = "0.23.35", optional = true } rustls-native-certs = { version = "0.8.2", optional = true } # AWS validation -aws-config = { version = "1.8", optional = true } -aws-credential-types = { version = "1.2", optional = true } -aws-sdk-iam = { version = "1.101.0", optional = true } -aws-sdk-sts = { version = "1.95", optional = true } -aws-types = { version = "1.3", optional = true } -aws-smithy-http-client = { version = "1.1", optional = true } -aws-smithy-runtime-api = { version = "1.9", optional = true } -aws-smithy-types = { version = "1.3", optional = true } +aws-config = { version = "1.8.14", default-features = false, features = ["default-https-client", "rt-tokio"], optional = true } +aws-credential-types = { version = "1.2.12", optional = true } +aws-sdk-iam = { version = "1.104.0", default-features = 
false, features = ["default-https-client", "rt-tokio"], optional = true } +aws-sdk-sts = { version = "1.98.0", default-features = false, features = ["default-https-client", "rt-tokio"], optional = true } +aws-types = { version = "1.3.12", optional = true } +aws-smithy-http-client = { version = "1.1.10", optional = true } +aws-smithy-runtime-api = { version = "1.11.4", optional = true } +aws-smithy-types = { version = "1.4.4", optional = true } base32 = { version = "0.5", optional = true } byteorder = { version = "1.5", optional = true } rand = { version = "0.9", optional = true } diff --git a/src/content_type.rs b/src/content_type.rs index ee44da6..611badb 100644 --- a/src/content_type.rs +++ b/src/content_type.rs @@ -6,8 +6,8 @@ use tokei::LanguageType; // Sort longest-first so more specific shebangs win. static SHEBANG_PREFIXES: Lazy> = Lazy::new(|| { let mut v = Vec::new(); - for &lang in LanguageType::list() { - for &sb in lang.shebangs() { + for &(lang, shebangs) in LanguageType::list() { + for &sb in shebangs { v.push((sb.as_bytes(), lang)); } } diff --git a/src/git_binary.rs b/src/git_binary.rs index e2bc9b4..a475613 100644 --- a/src/git_binary.rs +++ b/src/git_binary.rs @@ -385,10 +385,21 @@ mod tests { #[test] fn test_git_new() { - let git = Git::new(false); - assert!(!git.ignore_certs); - assert!(git.credentials.is_empty()); - assert!(git.bitbucket_access_token.is_none()); + temp_env::with_vars( + &[ + ("KF_GITHUB_TOKEN", None::<&str>), + ("KF_BITBUCKET_OAUTH_TOKEN", None::<&str>), + ("KF_BITBUCKET_ACCESS_TOKEN", None::<&str>), + ("KF_BITBUCKET_USERNAME", None::<&str>), + ("KF_BITBUCKET_APP_PASSWORD", None::<&str>), + ], + || { + let git = Git::new(false); + assert!(!git.ignore_certs); + assert!(git.credentials.is_empty()); + assert!(git.bitbucket_access_token.is_none()); + }, + ); temp_env::with_var("KF_GITHUB_TOKEN", Some("test_token"), || { let git = Git::new(false); diff --git a/src/matcher.rs b/src/matcher.rs index 10dbe38..ac89bae 100644 --- 
a/src/matcher.rs +++ b/src/matcher.rs @@ -370,8 +370,7 @@ impl<'a> Matcher<'a> { && blob_len <= TREE_SITTER_MAX_LIMIT && blob_len >= TREE_SITTER_MIN_LIMIT && has_raw_matches - && lang_hint.is_some() - && !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled + && lang_hint.is_some(); let tree_sitter_result = if should_run_tree_sitter { lang_hint.and_then(|lang_str| { @@ -396,7 +395,6 @@ impl<'a> Matcher<'a> { let owned_ts_results = tree_sitter_result.map(|ts_results| { ts_results .into_iter() - .filter(|match_result| match_result.is_base64_decoded) .map(|match_result| { ( match_result.range, @@ -440,33 +438,66 @@ impl<'a> Matcher<'a> { &self.inline_ignore_config, ); } - // If tree-sitter produced base64-decoded matches, try them against all rules + // Pre-filter tree-sitter extracted key-value pairs through Vectorscan, + // then only run the anchored regex for rules that Vectorscan flags as candidates. if let Some(ref ts_results) = owned_ts_results { - for (ts_range, ts_match, is_base64_decoded, _original_base64) in ts_results.iter() { - if *is_base64_decoded { - for (rule_id_usize, rule) in rules_db.rules().iter().enumerate() { - let re = &rules_db.anchored_regexes()[rule_id_usize]; - filter_match( - blob, - rule.clone(), - re, - ts_range.start, - ts_range.end, - &mut matches, - &mut previous_matches, - rule_id_usize, - &mut seen_matches, - origin, - Some(ts_match.as_bytes()), - *is_base64_decoded, - redact, - &filename, - self.profiler.as_ref(), - self.respect_ignore_if_contains, - &self.inline_ignore_config, - ); + if !ts_results.is_empty() { + // Build a combined buffer of all tree-sitter texts separated by newlines + // so we can run a single Vectorscan pass instead of one per result. 
+ let mut combined_buf = Vec::new(); + let mut segment_ends: Vec<usize> = Vec::with_capacity(ts_results.len()); + for (_ts_range, ts_match, _is_base64_decoded, _original_base64) in ts_results.iter() + { + combined_buf.extend_from_slice(ts_match.as_bytes()); + segment_ends.push(combined_buf.len()); + combined_buf.push(b'\n'); + } + + // Single Vectorscan pass over the combined buffer + let mut ts_raw_matches: Vec<(u32, u64)> = Vec::new(); + self.scanner_pool.with(|scanner| { + scanner.scan(&combined_buf, |rule_id, _from, to, _flags| { + ts_raw_matches.push((rule_id, to)); + vectorscan_rs::Scan::Continue + }) + })?; + + // Map each Vectorscan hit back to its tree-sitter result and dedup + let mut rule_ts_pairs: FxHashSet<(usize, usize)> = FxHashSet::default(); + for &(rule_id, to) in &ts_raw_matches { + let to = to as usize; + let seg_idx = segment_ends.partition_point(|&end| end < to); + if seg_idx < ts_results.len() { + rule_ts_pairs.insert((rule_id as usize, seg_idx)); } } + + // Only run the anchored regex for (rule, ts_result) pairs Vectorscan flagged + for (rule_id_usize, ts_idx) in rule_ts_pairs { + let (ts_range, ts_match, is_base64_decoded, _original_base64) = + &ts_results[ts_idx]; + let rule = Arc::clone(&rules_db.rules()[rule_id_usize]); + let re = &rules_db.anchored_regexes()[rule_id_usize]; + filter_match( + blob, + rule, + re, + ts_range.start, + ts_range.end, + &mut matches, + &mut previous_matches, + rule_id_usize, + &mut seen_matches, + origin, + Some(ts_match.as_bytes()), + *is_base64_decoded, + redact, + &filename, + self.profiler.as_ref(), + self.respect_ignore_if_contains, + &self.inline_ignore_config, + ); + } } }