diff --git a/.gitignore b/.gitignore index 5e8e0e6..6ec14cd 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ *.json !webserver/static/sample-report.json !docs/access-map-viewer/sample-report.json +!testdata/parsers/tree_sitter_capture_baseline.json *.jsonl *.bson .prettierrc @@ -75,11 +76,11 @@ Cargo.lock ### VisualStudioCode ### .vscode/* -!.vscode/settings.json -!.vscode/tasks.json -!.vscode/launch.json -!.vscode/extensions.json -!.vscode/*.code-snippets +.vscode/settings.json +.vscode/tasks.json +.vscode/launch.json +.vscode/extensions.json +.vscode/*.code-snippets .vscode/launch.json # Local History for Visual Studio Code diff --git a/CHANGELOG.md b/CHANGELOG.md index b98d185..516c68e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ All notable changes to this project will be documented in this file. +## [v1.88.0] +- Tree-sitter fallback behavior changed to be strictly additive: when parser context is unavailable, findings now fall back to Hyperscan/Vectorscan matches instead of being suppressed. +- Fixed dependent-rule reporting gaps (for example Algolia API keys) by preserving regex findings when tree-sitter is unavailable, while still marking validation as skipped when dependency inputs are missing. +- Expanded parser queries for C, Go, Java, JavaScript, and TypeScript to improve assignment/literal capture coverage (including template/raw string handling in JS/TS/Go). +- Added parser query quality gates: compile-time query validation tests plus fixture-based capture-count regression tests backed by `testdata/parsers/tree_sitter_capture_baseline.json`. +- Added inline-ignore coverage for directives placed on the line immediately before a single-line secret match. +- Updated tree-sitter documentation wording to align with `--turbo` terminology. 
+ ## [v1.87.0] - Tree-sitter verification now runs for blobs from `0` bytes up to `128 KiB` (previously `1 KiB` to `64 KiB`), while remaining a post-regex verification step applied only to context-dependent candidate matches from Hyperscan/Vectorscan. - False-positive reduction: Hyperscan/Vectorscan still scans everything first, then tree-sitter performs a second-pass verification only on auto-classified context-dependent findings; self-identifying/token-explicit findings stay regex-first. diff --git a/Cargo.toml b/Cargo.toml index 0abc7fc..2c03628 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,7 +48,7 @@ http = "1.4" [package] name = "kingfisher" -version = "1.87.0" +version = "1.88.0" description = "MongoDB's blazingly fast and accurate secret scanning and validation tool" edition.workspace = true rust-version.workspace = true diff --git a/crates/kingfisher-rules/data/rules/redis.yml b/crates/kingfisher-rules/data/rules/redis.yml index 9c92f9b..3d6d5b8 100644 --- a/crates/kingfisher-rules/data/rules/redis.yml +++ b/crates/kingfisher-rules/data/rules/redis.yml @@ -90,7 +90,7 @@ rules: - "localhost" min_entropy: 3.0 - confidence: medium + confidence: low examples: - 'REDIS_PASSWORD="EXAMPLEp4ssw0rd123"' - 'redis_password=MyS3cur3R3d1sK3y' diff --git a/crates/kingfisher-rules/src/lib.rs b/crates/kingfisher-rules/src/lib.rs index 2c8164a..8455999 100644 --- a/crates/kingfisher-rules/src/lib.rs +++ b/crates/kingfisher-rules/src/lib.rs @@ -29,7 +29,6 @@ pub use rules::{Rules, RulesError}; // Re-export RulesDatabase pub use rules_database::{ format_regex_pattern, RuleDetectionProfileKind, RuleMatchProfile, RulesDatabase, - TreeSitterFallbackPolicy, }; // Re-export defaults diff --git a/crates/kingfisher-rules/src/rules_database.rs b/crates/kingfisher-rules/src/rules_database.rs index 3531273..3a28fac 100644 --- a/crates/kingfisher-rules/src/rules_database.rs +++ b/crates/kingfisher-rules/src/rules_database.rs @@ -7,12 +7,6 @@ use vectorscan_rs::{BlockDatabase, Flag, 
Pattern}; use crate::rule::{Rule, RULE_COMMENTS_PATTERN}; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum TreeSitterFallbackPolicy { - KeepRawWhenUnavailable, - SuppressWhenUnavailable, -} - #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum RuleDetectionProfileKind { SelfIdentifying, @@ -22,7 +16,6 @@ pub enum RuleDetectionProfileKind { #[derive(Debug, Clone, PartialEq, Eq)] pub struct RuleMatchProfile { pub kind: RuleDetectionProfileKind, - pub fallback_policy: TreeSitterFallbackPolicy, pub reason_codes: Vec<&'static str>, } @@ -86,7 +79,6 @@ impl RulesDatabase { reason_codes.push("self_identifying_prefix"); return RuleMatchProfile { kind: RuleDetectionProfileKind::SelfIdentifying, - fallback_policy: TreeSitterFallbackPolicy::KeepRawWhenUnavailable, reason_codes, }; } @@ -133,24 +125,14 @@ impl RulesDatabase { if !is_context_dependent { return RuleMatchProfile { kind: RuleDetectionProfileKind::SelfIdentifying, - fallback_policy: TreeSitterFallbackPolicy::KeepRawWhenUnavailable, reason_codes, }; } - - let fallback_policy = if looks_generic_token && has_distance_operator { - reason_codes.push("strict_fallback_suppress_when_unavailable"); - TreeSitterFallbackPolicy::SuppressWhenUnavailable - } else { - reason_codes.push("fallback_keep_when_unavailable"); - TreeSitterFallbackPolicy::KeepRawWhenUnavailable - }; - - RuleMatchProfile { - kind: RuleDetectionProfileKind::ContextDependent, - fallback_policy, - reason_codes, + if looks_generic_token && has_distance_operator { + reason_codes.push("strict_contextual_shape"); } + + RuleMatchProfile { kind: RuleDetectionProfileKind::ContextDependent, reason_codes } } pub fn get_rule_by_finding_fingerprint(&self, finding_fingerprint: &str) -> Option> { @@ -451,7 +433,6 @@ mod test_rule_match_profiles { mk_rule("kingfisher.circleci.1", r"(?x)\b(CCIPAT_[A-Za-z0-9]{22}_[a-z0-9]{40})\b"); let profile = RulesDatabase::classify_rule_profile(&rule); assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); - 
assert_eq!(profile.fallback_policy, TreeSitterFallbackPolicy::KeepRawWhenUnavailable); assert!(profile.reason_codes.contains(&"self_identifying_prefix")); } @@ -463,8 +444,8 @@ mod test_rule_match_profiles { ); let profile = RulesDatabase::classify_rule_profile(&rule); assert_eq!(profile.kind, RuleDetectionProfileKind::ContextDependent); - assert_eq!(profile.fallback_policy, TreeSitterFallbackPolicy::SuppressWhenUnavailable); assert!(profile.reason_codes.contains(&"generic_token_shape")); + assert!(profile.reason_codes.contains(&"strict_contextual_shape")); } #[test] @@ -476,4 +457,33 @@ mod test_rule_match_profiles { let profile = RulesDatabase::classify_rule_profile(&rule); assert_eq!(profile.kind, RuleDetectionProfileKind::ContextDependent); } + + #[test] + fn depends_on_rules_keep_raw_when_parser_unavailable() { + use crate::rule::DependsOnRule; + + let rule = Rule::new(RuleSyntax { + id: "kingfisher.algolia.1".to_string(), + name: "algolia".to_string(), + pattern: r"(?xi)algolia(?:.|[\n\r]){0,32}?([a-z0-9]{32})".to_string(), + confidence: Confidence::Medium, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + revocation: None, + depends_on_rule: vec![Some(DependsOnRule { + rule_id: "kingfisher.algolia.2".to_string(), + variable: "APPID".to_string(), + })], + pattern_requirements: None, + tls_mode: None, + }); + + let profile = RulesDatabase::classify_rule_profile(&rule); + assert_eq!(profile.kind, RuleDetectionProfileKind::ContextDependent); + assert!(profile.reason_codes.contains(&"depends_on_rule")); + } } diff --git a/docs/TREE_SITTER.md b/docs/TREE_SITTER.md index 50999ea..999826a 100644 --- a/docs/TREE_SITTER.md +++ b/docs/TREE_SITTER.md @@ -13,19 +13,19 @@ The goal is to confirm that a regex hit appears in a plausible code assignment/c ## Where It Runs in the Scan Pipeline 1. `BlobProcessor::run` decides whether to compute a language hint. 
- - It skips language hinting in `fast_mode`. + - It skips language hinting in `turbo_mode`. - It also skips when blob size is outside the Tree-sitter window. 2. `Matcher::scan_blob` performs the primary regex scan and other filtering. 3. `maybe_apply_tree_sitter_verification` runs near the end of `scan_blob`. 4. Only candidate matches are checked against Tree-sitter extracted text. -5. Matches that fail verification can be dropped, depending on rule profile and fallback policy. +5. Matches that fail verification are dropped for context-dependent rules. ## Size and Mode Gates Tree-sitter is attempted only when all of these are true: - Blob length is between `0 KiB` and `128 KiB` (`should_attempt_tree_sitter`). -- `fast_mode` is disabled. +- `turbo_mode` is disabled. - A language hint is available. - The language maps to a supported Tree-sitter grammar + query set. @@ -38,9 +38,9 @@ Tree-sitter verification is only applied to matches that are: - Classified as `ContextDependent` by rule profiling. - Not base64-derived findings (`is_base64 == false`). -Classification and fallback policy come from rule profiles in `kingfisher-rules`: +Classification comes from rule profiles in `kingfisher-rules`: -- `SelfIdentifying`: usually keep raw regex result. +- `SelfIdentifying`: keep raw regex result. - `ContextDependent`: may require Tree-sitter confirmation. ## How Verification Works @@ -61,14 +61,9 @@ When Tree-sitter is available: If no extracted fragment verifies the secret, that candidate finding is removed. -## Fallback Behavior When Tree-sitter Is Unavailable +## Behavior When Tree-sitter Is Unavailable -If Tree-sitter cannot run (size/mode/language/parse errors), behavior is rule-driven: - -- `KeepRawWhenUnavailable`: keep the regex finding. -- `SuppressWhenUnavailable`: drop the finding. - -`SuppressWhenUnavailable` is used for stricter generic-context patterns where false positives are likely without syntax-aware confirmation. 
+If Tree-sitter cannot run (size/mode/language/parse errors), Kingfisher keeps the original regex finding. ## Supported Languages in This Path @@ -100,6 +95,6 @@ Tree-sitter in Kingfisher is a conditional verifier, not the primary detector: - Regex finds candidates quickly. - Rule profiling decides which candidates need context verification. - Tree-sitter confirms contextual plausibility from parsed syntax. -- Fallback policy determines what to do when verification cannot run. +- If verification cannot run, scan results fall back to the regex pass. This keeps scanning fast while reducing noisy matches for context-dependent secret patterns. diff --git a/src/inline_ignore.rs b/src/inline_ignore.rs index 22f47ea..e42f6c9 100644 --- a/src/inline_ignore.rs +++ b/src/inline_ignore.rs @@ -390,6 +390,19 @@ mod tests { assert!(config.should_ignore(blob, &span)); } + #[test] + fn ignores_single_line_secret_with_directive_on_previous_line() { + let blob = b"# safe-secret\n123456\n"; + let matched = b"123456"; + let start = blob + .windows(matched.len()) + .position(|window| window == matched) + .expect("match bytes present"); + let span = OffsetSpan::from_range(start..start + matched.len()); + let config = InlineIgnoreConfig::new(&["safe-secret".to_string()]); + assert!(config.should_ignore(blob, &span)); + } + #[test] fn trim_ascii_whitespace_returns_inner_slice() { assert_eq!(trim_ascii_whitespace(b" abc "), b"abc"); diff --git a/src/matcher/mod.rs b/src/matcher/mod.rs index 6735e52..a1e862c 100644 --- a/src/matcher/mod.rs +++ b/src/matcher/mod.rs @@ -27,7 +27,7 @@ use crate::{ parser::{Checker, Language}, rule_profiling::{ConcurrentRuleProfiler, RuleStats}, rules::rule::Rule, - rules_database::{RuleDetectionProfileKind, RulesDatabase, TreeSitterFallbackPolicy}, + rules_database::{RuleDetectionProfileKind, RulesDatabase}, scanner_pool::ScannerPool, validation_body::ValidationResponseBody, }; @@ -446,7 +446,6 @@ fn maybe_apply_tree_sitter_verification<'a>( let Some(rule_idx) 
= match_rule_indices.get(idx).copied() else { continue; }; - let profile = &profiles[rule_idx]; let match_secret = matches[idx].matching_input; let re = &rules_db.anchored_regexes()[rule_idx]; @@ -460,9 +459,8 @@ fn maybe_apply_tree_sitter_verification<'a>( } } None => { - if profile.fallback_policy == TreeSitterFallbackPolicy::SuppressWhenUnavailable { - keep[idx] = false; - } + // Tree-sitter is an optional precision layer. If parser context + // is unavailable, always fall back to the original regex match. } } } @@ -1169,7 +1167,7 @@ line2 } #[test] - fn strict_context_rule_suppresses_when_tree_sitter_unavailable() -> Result<()> { + fn strict_context_rule_keeps_raw_when_tree_sitter_unavailable() -> Result<()> { let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234"; let rule = Rule::new(RuleSyntax { id: "kingfisher.auth0.2".into(), @@ -1202,9 +1200,10 @@ line2 ScanResult::New(matches) => matches, _ => panic!("unexpected scan result"), }; - assert!( - found.is_empty(), - "strict contextual rules should suppress when tree-sitter is unavailable for verification" + assert_eq!( + found.len(), + 1, + "strict contextual rules should fall back to raw regex findings when tree-sitter is unavailable" ); Ok(()) } diff --git a/src/parser.rs b/src/parser.rs index 255335a..4dcae85 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -306,3 +306,187 @@ impl Checker { }) } } + +#[cfg(test)] +mod tests { + use super::*; + use std::{collections::BTreeMap, fs, path::PathBuf}; + + fn fixture_cases() -> Vec<(Language, &'static str)> { + vec![ + (Language::Bash, "testdata/shell_vulnerable.sh"), + (Language::C, "testdata/c_vulnerable.c"), + (Language::CSharp, "testdata/csharp_vulnerable.cs"), + (Language::Cpp, "testdata/cpp_vulnerable.cpp"), + (Language::Go, "testdata/go_vulnerable.go"), + (Language::Java, "testdata/java_vulnerable.java"), + (Language::JavaScript, "testdata/javascript_vulnerable.js"), + (Language::Php, "testdata/php_vulnerable.php"), + 
(Language::Python, "testdata/python_vulnerable.py"), + (Language::Ruby, "testdata/ruby_vulnerable.rb"), + (Language::Rust, "testdata/rust_vulnerable.rs"), + (Language::Toml, "testdata/toml_vulnerable.toml"), + (Language::TypeScript, "testdata/typescript_vulnerable.ts"), + (Language::Yaml, "testdata/yaml_vulnerable.yaml"), + ] + } + + fn build_checker(language: &Language) -> Checker { + Checker { + language: language.clone(), + rules: match language { + Language::Bash => queries::bash::get_bash_queries(), + Language::C => queries::c::get_c_queries(), + Language::CSharp => queries::csharp::get_csharp_queries(), + Language::Cpp => queries::cpp::get_cpp_queries(), + Language::Css => queries::css::get_css_queries(), + Language::Go => queries::go::get_go_queries(), + Language::Html => queries::html::get_html_queries(), + Language::Java => queries::java::get_java_queries(), + Language::JavaScript => queries::javascript::get_javascript_queries(), + Language::Php => queries::php::get_php_queries(), + Language::Python => queries::python::get_python_queries(), + Language::Regex => queries::regex::get_regex_queries(), + Language::Ruby => queries::ruby::get_ruby_queries(), + Language::Rust => queries::rust::get_rust_queries(), + Language::Toml => queries::toml::get_toml_queries(), + Language::TypeScript => queries::typescript::get_typescript_queries(), + Language::Yaml => queries::yaml::get_yaml_queries(), + }, + } + } + + fn current_capture_counts( + root: &PathBuf, + cases: &[(Language, &'static str)], + ) -> BTreeMap<String, usize> { + let mut current = BTreeMap::new(); + for (language, rel_path) in cases { + let file_path = root.join(rel_path); + let source = fs::read(&file_path) + .unwrap_or_else(|e| panic!("failed to read fixture {}: {e}", file_path.display())); + let checker = build_checker(language); + let count = checker + .check(&source) + .unwrap_or_else(|e| panic!("checker failed for {}: {e}", rel_path)) + .len(); + current.insert(format!("{}:{}", language.name(), rel_path),
count); + } + current + } + + #[test] + fn queries_compile_for_supported_languages() { + let cases = vec![ + (Language::Bash, queries::bash::get_bash_queries()), + (Language::C, queries::c::get_c_queries()), + (Language::CSharp, queries::csharp::get_csharp_queries()), + (Language::Cpp, queries::cpp::get_cpp_queries()), + (Language::Css, queries::css::get_css_queries()), + (Language::Go, queries::go::get_go_queries()), + (Language::Html, queries::html::get_html_queries()), + (Language::Java, queries::java::get_java_queries()), + (Language::JavaScript, queries::javascript::get_javascript_queries()), + (Language::Php, queries::php::get_php_queries()), + (Language::Python, queries::python::get_python_queries()), + (Language::Regex, queries::regex::get_regex_queries()), + (Language::Ruby, queries::ruby::get_ruby_queries()), + (Language::Rust, queries::rust::get_rust_queries()), + (Language::Toml, queries::toml::get_toml_queries()), + (Language::TypeScript, queries::typescript::get_typescript_queries()), + (Language::Yaml, queries::yaml::get_yaml_queries()), + ]; + + for (language, rule_set) in cases { + let ts_language = language + .get_ts_language() + .unwrap_or_else(|e| panic!("failed to load language {}: {e}", language.name())); + for (name, query) in rule_set { + Query::new(&ts_language, &query).unwrap_or_else(|e| { + panic!("query '{name}' failed for language {}: {e}", language.name()) + }); + } + } + } + + #[test] + fn tree_sitter_capture_counts_do_not_regress() { + let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let baseline_path = root.join("testdata/parsers/tree_sitter_capture_baseline.json"); + let cases = fixture_cases(); + let current = current_capture_counts(&root, &cases); + + if std::env::var("UPDATE_TREE_SITTER_CAPTURE_BASELINE").as_deref() == Ok("1") { + let payload = serde_json::to_string_pretty(&current) + .unwrap_or_else(|e| panic!("failed to serialize baseline: {e}")); + fs::write(&baseline_path, format!("{payload}\n")).unwrap_or_else(|e| { +
panic!("failed to write baseline {}: {e}", baseline_path.display()) + }); + return; + } + + let baseline_raw = fs::read_to_string(&baseline_path).unwrap_or_else(|e| { + panic!( + "failed to read baseline {}: {e}. Run with UPDATE_TREE_SITTER_CAPTURE_BASELINE=1", + baseline_path.display() + ) + }); + let baseline: BTreeMap<String, usize> = serde_json::from_str(&baseline_raw) + .unwrap_or_else(|e| panic!("invalid baseline JSON {}: {e}", baseline_path.display())); + + let mut regressions = Vec::new(); + for (key, actual) in &current { + let expected = baseline.get(key).unwrap_or_else(|| { + panic!( + "missing baseline entry for {key}. Run with UPDATE_TREE_SITTER_CAPTURE_BASELINE=1" + ) + }); + if actual < expected { + regressions.push(format!("{key}: expected >= {expected}, got {actual}")); + } + } + + assert!( + regressions.is_empty(), + "tree-sitter capture regression(s):\n{}", + regressions.join("\n") + ); + } + + #[test] + fn report_tree_sitter_capture_count_deltas() { + let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let baseline_path = root.join("testdata/parsers/tree_sitter_capture_baseline.json"); + let cases = fixture_cases(); + let current = current_capture_counts(&root, &cases); + + let baseline_raw = match fs::read_to_string(&baseline_path) { + Ok(data) => data, + Err(e) => { + println!( + "capture-delta report unavailable: cannot read baseline {}: {e}", + baseline_path.display() + ); + return; + } + }; + + let baseline: BTreeMap<String, usize> = match serde_json::from_str(&baseline_raw) { + Ok(v) => v, + Err(e) => { + println!( + "capture-delta report unavailable: invalid baseline JSON {}: {e}", + baseline_path.display() + ); + return; + } + }; + + println!("tree-sitter capture delta report (current vs baseline):"); + for (key, actual) in &current { + let expected = baseline.get(key).copied().unwrap_or(0); + let delta = (*actual as isize) - (expected as isize); + println!(" {key}: current={actual}, baseline={expected}, delta={delta:+}"); + } + } +} diff --git a/src/parser/queries.rs
b/src/parser/queries.rs index b45ebbd..7213015 100644 --- a/src/parser/queries.rs +++ b/src/parser/queries.rs @@ -103,14 +103,6 @@ pub mod javascript { queries } } -pub mod kotlin { - use super::*; - pub fn get_kotlin_queries() -> FxHashMap<String, String> { - let mut queries = FxHashMap::default(); - queries.insert("combined_kotlin_query".to_string(), QUERIES_KOTLIN.to_string()); - queries - } -} pub mod php { use super::*; pub fn get_php_queries() -> FxHashMap<String, String> { @@ -220,9 +212,12 @@ pub const QUERIES_C: &str = r#" ) ; Query 7: Matches initializer lists containing string literals - declarator: (init_declarator - value: (initializer_list - (string_literal) @val + (declaration + declarator: (init_declarator + declarator: [(identifier)(array_declarator)(pointer_declarator)] @key + value: (initializer_list + (string_literal) @val + ) ) ) @@ -595,7 +590,7 @@ pub const QUERIES_GO: &str = r#" (var_spec name: (identifier) @key value: (expression_list - (interpreted_string_literal) @val + [(interpreted_string_literal)(raw_string_literal)] @val ) ) @@ -605,7 +600,7 @@ pub const QUERIES_GO: &str = r#" (identifier) @key ) right: (expression_list - (interpreted_string_literal) @val + [(interpreted_string_literal)(raw_string_literal)] @val ) ) @@ -615,7 +610,7 @@ pub const QUERIES_GO: &str = r#" (identifier) @key ) right: (expression_list - (interpreted_string_literal) @val + [(interpreted_string_literal)(raw_string_literal)] @val ) ) @@ -627,7 +622,7 @@ pub const QUERIES_GO: &str = r#" ) ) right: (expression_list - (interpreted_string_literal) @val + [(interpreted_string_literal)(raw_string_literal)] @val ) ) @@ -637,7 +632,7 @@ pub const QUERIES_GO: &str = r#" (selector_expression) @key ) right: (expression_list - (interpreted_string_literal) @val + [(interpreted_string_literal)(raw_string_literal)] @val ) ) @@ -647,7 +642,7 @@ pub const QUERIES_GO: &str = r#" (type_identifier)?
"=" (expression_list - (interpreted_string_literal) @val + [(interpreted_string_literal)(raw_string_literal)] @val )+ ) "#; @@ -673,127 +668,55 @@ pub const QUERIES_HTML: &str = r#" ) "#; pub const QUERIES_JAVA: &str = r#" - ; Query 1: Matches variable declarations with cast expressions - declarator: (variable_declarator - name: (identifier) @key - value: (parenthesized_expression - (cast_expression - value: [(string_literal)(decimal_integer_literal)(decimal_floating_point_literal)(hex_integer_literal)(hex_floating_point_literal)(binary_integer_literal)] @val - ) + ; Query 1: Local variable declarations with direct string assignments + (local_variable_declaration + declarator: (variable_declarator + name: (identifier) @key + value: (string_literal) @val ) ) - ; Query 2: Matches variable declarations with object creation or literal values - declarator: (variable_declarator - name: (identifier) @key - value: [(object_creation_expression - arguments: (argument_list - [(string_literal)(decimal_integer_literal)(decimal_floating_point_literal)(hex_integer_literal)(hex_floating_point_literal)(binary_integer_literal)] @val - ) - )[(string_literal)(decimal_integer_literal)(decimal_floating_point_literal)(hex_integer_literal)(hex_floating_point_literal)(binary_integer_literal)] @val] - ) - - ; Query 3: Matches variable declarations with method invocations - declarator: (variable_declarator - name: (identifier) @key - value: (method_invocation - arguments: (argument_list - [(string_literal)(decimal_integer_literal)(decimal_floating_point_literal)(hex_integer_literal)(hex_floating_point_literal)(binary_integer_literal)] @val - ) + ; Query 2: Field declarations with direct string assignments + (field_declaration + declarator: (variable_declarator + name: (identifier) @key + value: (string_literal) @val ) ) - ; Query 4: Matches variable declarations with lambda expressions - declarator: (variable_declarator - name: (identifier) @key - value: (lambda_expression - body: ( - (_ - 
object: (string_literal) @val - ) - ) - ) - ) - - ; Query 5: Matches assignment expressions with object creation + ; Query 3: Identifier assignment with direct string literal (assignment_expression left: (identifier) @key - right: (object_creation_expression - arguments: (argument_list - [(string_literal)(decimal_integer_literal)(decimal_floating_point_literal)(hex_integer_literal)(hex_floating_point_literal)(binary_integer_literal)] @val - ) - ) + right: (string_literal) @val ) - ; Query 6: Matches assignment expressions with field access + ; Query 4: Field assignment with direct string literal (assignment_expression left: (field_access field: (identifier) @key ) - right: [(string_literal)(decimal_integer_literal)(decimal_floating_point_literal)(hex_integer_literal)(hex_floating_point_literal)(binary_integer_literal)] @val + right: (string_literal) @val ) - ; Query 7: Matches simple assignment expressions - (assignment_expression - left: (identifier) @key - right: [(string_literal)(decimal_integer_literal)(decimal_floating_point_literal)(hex_integer_literal)(hex_floating_point_literal)(binary_integer_literal)] @val - ) - - ; Query 8: Matches field declarations - (field_declaration + ; Query 5: Local variable assignment from constructor call containing a string + (local_variable_declaration declarator: (variable_declarator name: (identifier) @key - value: [(string_literal)(decimal_integer_literal)(decimal_floating_point_literal)(hex_integer_literal)(hex_floating_point_literal)(binary_integer_literal)] @val - ) - ) - - ; Query 9: Matches element value pairs in annotations - (element_value_pair - key: (identifier) @key - value: [(string_literal)(decimal_integer_literal)(decimal_floating_point_literal)(hex_integer_literal)(hex_floating_point_literal)(binary_integer_literal)] @val - ) - - ; Query 10: Matches method arguments with field access and string literals - arguments: (argument_list - (field_access - field: (identifier) @key - ) - (string_literal) @val - ) - - ; 
Query 11: Matches local variable declarations with string literals - (local_variable_declaration - declarator: (_ - name: (identifier) @key - value: (_ - (string_literal - (string_fragment) @val + value: (object_creation_expression + arguments: (argument_list + (string_literal) @val ) ) ) ) - ; Query 12: Matches nested local variable declarations with string literals + ; Query 6: Local variable assignment from method call containing a string (local_variable_declaration - declarator: (_ + declarator: (variable_declarator name: (identifier) @key - value: (_ - value: (_ - (string_literal - (string_fragment) @val - ) - ) - ) - ) - ) - - ; Query 13: Matches method invocations with string literal arguments - (expression_statement - (method_invocation - name: (identifier) @key - arguments: (_ - (string_literal - (string_fragment) @val + value: (method_invocation + arguments: (argument_list + (string_literal) @val ) ) ) @@ -805,13 +728,19 @@ pub const QUERIES_JAVASCRIPT: &str = r#" left: (member_expression property: (property_identifier) @key ) - right: (string (string_fragment) @val) + right: [ + (string (string_fragment) @val) + (template_string) @val + ] ) ; Query 2: Matches variable declarations with literal values (variable_declarator name: (identifier) @key - value: (string (string_fragment) @val) + value: [ + (string (string_fragment) @val) + (template_string) @val + ] ) ; Query 3: Matches variable declarations with object literals @@ -820,7 +749,10 @@ pub const QUERIES_JAVASCRIPT: &str = r#" value: (object (pair key: (property_identifier) @key - value: (string (string_fragment) @val) + value: [ + (string (string_fragment) @val) + (template_string) @val + ] ) ) ) @@ -829,14 +761,20 @@ pub const QUERIES_JAVASCRIPT: &str = r#" (call_expression arguments: (arguments (identifier) @key - (string (string_fragment) @val) + [ + (string (string_fragment) @val) + (template_string) @val + ] ) ) ; Query 5: Matches object literal key-value pairs (pair key: (property_identifier) 
@key - value: (string (string_fragment) @val) + value: [ + (string (string_fragment) @val) + (template_string) @val + ] ) ; Query 6: Matches assignments to array or object elements @@ -844,7 +782,10 @@ pub const QUERIES_JAVASCRIPT: &str = r#" left: (subscript_expression index: [(string)(identifier)] @key ) - right: (string (string_fragment) @val) + right: [ + (string (string_fragment) @val) + (template_string) @val + ] ) ; Query 7: Matches method calls on objects with string arguments @@ -853,86 +794,12 @@ pub const QUERIES_JAVASCRIPT: &str = r#" object: (identifier) @key ) arguments: (arguments - (string - (string_fragment) @val - ) - ) - ) -"#; -pub const QUERIES_KOTLIN: &str = r#" - ; Query 1: Matches property declarations with string literals - (property_declaration - (variable_declaration - (simple_identifier) @key - ) - (string_literal) @val - ) - - ; Query 2: Matches property declarations with call expressions and string literals - (property_declaration - (variable_declaration - (simple_identifier) @key - ) - (call_expression - (navigation_expression - (string_literal) @val - ) - ) - ) - - ; Query 3: Matches property declarations with call expressions and value arguments - (property_declaration - (variable_declaration - (simple_identifier) @key - ) - (call_expression - (call_suffix - (value_arguments - (value_argument) @val + [ + (string + (string_fragment) @val ) - ) - ) - ) - - ; Query 4: Matches assignments with string literals - (assignment - (directly_assignable_expression - (simple_identifier) @key - ) - (string_literal) @val - ) - - ; Query 5: Matches property declarations with property delegates and string literals - (property_declaration - (variable_declaration - (simple_identifier) @key - ) - (property_delegate - (_ - (call_suffix - (_ - (_ - (_ - (string_literal) @val - ) - ) - ) - ) - ) - ) - ) - - ; Query 6: Matches secondary constructor assignments with string literals - (secondary_constructor - (statements - (assignment - 
(directly_assignable_expression - (navigation_suffix - (simple_identifier) @key - ) - ) - (string_literal) @val - ) + (template_string) @val + ] ) ) "#; @@ -1175,13 +1042,13 @@ pub const QUERIES_TYPESCRIPT: &str = r#" ; Query 1: Matches variable declarations with string or number values (variable_declarator name: (identifier) @key - value: [(string)(number)] @val + value: [(string)(template_string)(number)] @val ) ; Query 2: Matches assignments to variables or object properties (assignment_expression left: [(member_expression)(identifier)] @key - right: [(string)(number)] @val + right: [(string)(template_string)(number)] @val ) ; Query 3: Matches variable declarations with string literal type annotations @@ -1199,7 +1066,7 @@ pub const QUERIES_TYPESCRIPT: &str = r#" key: (property_identifier) @key value: ( (array - (string) @val + [(string)(template_string)] @val ) ) ) @@ -1207,7 +1074,7 @@ pub const QUERIES_TYPESCRIPT: &str = r#" ; Query 5: Matches object property definitions with string or number values (pair key: (property_identifier) @key - value: [(string)(number)] @val + value: [(string)(template_string)(number)] @val ) ; Query 6: Matches property signatures with literal types @@ -1232,7 +1099,7 @@ pub const QUERIES_TYPESCRIPT: &str = r#" property: (property_identifier) @key ) arguments: (arguments - (string) @val + [(string)(template_string)] @val ) ) "#; diff --git a/src/rules_database.rs b/src/rules_database.rs index 396fdf9..c37b0f4 100644 --- a/src/rules_database.rs +++ b/src/rules_database.rs @@ -4,5 +4,4 @@ pub use kingfisher_rules::rules_database::{ format_regex_pattern, RuleDetectionProfileKind, RuleMatchProfile, RulesDatabase, - TreeSitterFallbackPolicy, }; diff --git a/testdata/kotlin_vulnerable.kt b/testdata/kotlin_vulnerable.kt deleted file mode 100644 index ceb375a..0000000 --- a/testdata/kotlin_vulnerable.kt +++ /dev/null @@ -1,61 +0,0 @@ - -// Direct Assignment with Double Quotes -val greeting: String = "Hello, World!" 
- -// Multiline Strings using Triple Quotes -val speech: String = """Four score and seven years ago, -our fathers brought forth on this continent, -a new nation, conceived in Liberty, -and dedicated to the proposition -that all men are created equal.""".trimMargin() - -// Using String Templates -val password: String = "This is a sup3r s3cr3t p@ssw0rd!" -val interpolation: String = "Hello, $name!" - - -val passphrase: String = "This is a sup3r s3cr3t p@ssw0rd!" -val api_key: String = "somekey_29f3d2hbiuhlf203hewidd3" -import javax.naming.Context -import javax.naming.directory.InitialDirContext - -class HelloWorld { - var strPassword: String = "sunshine123" - var foobarPassword: String = "kingpin987" - var horsePassword: String = "kingpin987" - - companion object { - // It seems you attempted to redeclare these variables multiple times in Java, which is not valid in Kotlin. - // Here they're declared once. - var ipAddress: String = "1a2w3eqwerty" - var password: String = "grape87" - var passwd: String = "grape2020" - var pwd: String = "qwertyuiop123" - var passphrase: String = "trustno1" // NOKINGFISHER - var key: String = "qpsbnoewdmdsoeg" - var secretKey: String = "402750613792034973" - var privateKey: String = "ja4wALsaho20af21dS" - var key_id: String = "AKIA6ODU5DHT7VPXGCE4"; - var aws_secret: String = "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"; - var hidden_passphrase: String = "blink182"; - - @JvmStatic - fun main(args: Array) { - println("Hello, World") - - try { - val env = Hashtable() - env[Context.SECURITY_CREDENTIALS] = "412389uSwYkRm1Tg!" 
- env[Context.SECURITY_PRINCIPAL] = "fakefakefake@contoso.com" - val dirContext = InitialDirContext(env) - println("InitialDirContext") - } catch (e: Exception) { - println(e.message) - println(e) - } - } - } -} - - -val passwd = "9043hfdlasf023" \ No newline at end of file diff --git a/testdata/parsers/parsers_test.go b/testdata/parsers/parsers_test.go index a76581f..74006ba 100644 --- a/testdata/parsers/parsers_test.go +++ b/testdata/parsers/parsers_test.go @@ -55,7 +55,6 @@ func TestParseFiles(t *testing.T) { {"elixir_vulnerable.exs", 5, 0}, {"generic_secrets.py", 15, 0}, {"go_vulnerable.go", 10, 0}, - {"kotlin_vulnerable.kt", 10, 0}, {"java_vulnerable.java", 15, 0}, {"javascript_vulnerable.js", 7, 0}, {"json_vulnerable.json", 2, 0}, diff --git a/testdata/parsers/tree_sitter_capture_baseline.json b/testdata/parsers/tree_sitter_capture_baseline.json new file mode 100644 index 0000000..a977c1b --- /dev/null +++ b/testdata/parsers/tree_sitter_capture_baseline.json @@ -0,0 +1,16 @@ +{ + "bash:testdata/shell_vulnerable.sh": 10, + "c:testdata/c_vulnerable.c": 14, + "c_sharp:testdata/csharp_vulnerable.cs": 29, + "cpp:testdata/cpp_vulnerable.cpp": 5, + "go:testdata/go_vulnerable.go": 29, + "java:testdata/java_vulnerable.java": 48, + "javascript:testdata/javascript_vulnerable.js": 14, + "php:testdata/php_vulnerable.php": 25, + "python:testdata/python_vulnerable.py": 27, + "ruby:testdata/ruby_vulnerable.rb": 36, + "rust:testdata/rust_vulnerable.rs": 16, + "toml:testdata/toml_vulnerable.toml": 11, + "typescript:testdata/typescript_vulnerable.ts": 25, + "yaml:testdata/yaml_vulnerable.yaml": 19 +} diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 5dea201..824482c 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -367,7 +367,6 @@ async fn test_scan_vulnerable_files() -> Result<()> { TestCase { file_name: "testdata/java_vulnerable.java", min_expected_findings: 4 }, TestCase { file_name: 
"testdata/javascript_vulnerable.js", min_expected_findings: 4 }, TestCase { file_name: "testdata/json_vulnerable.json", min_expected_findings: 4 }, - TestCase { file_name: "testdata/kotlin_vulnerable.kt", min_expected_findings: 7 }, TestCase { file_name: "testdata/objc_vulnerable.m", min_expected_findings: 4 }, TestCase { file_name: "testdata/php_vulnerable.php", min_expected_findings: 5 }, TestCase { file_name: "testdata/python_vulnerable.py", min_expected_findings: 10 },