diff --git a/AGENTS.md b/AGENTS.md index 917c3f7..4d39dbc 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -24,7 +24,7 @@ Key capabilities: - `src/cli/commands/`: CLI command implementations - `src/matcher/`: pattern matching engine - `src/scanner/`: core scanning logic -- `src/parser/`: language-aware parsing (`tree-sitter`) +- `src/parser/`: language-aware context verification (lightweight lexers, `tl` for HTML, `cssparser` for CSS) - `src/reporter/`: TOON/JSON/SARIF/HTML report generation - `src/access_map/`: access mapping analysis - `crates/kingfisher-core/`: shared types and core logic diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ee1dde..94ec62d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file. ## [v1.95.0] - Added 80+ built-in rules, bringing the bundled ruleset to 820 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more. +- Replaced tree-sitter with a lighter parser-based context verifier built from handwritten lexers plus `tl`/`cssparser`, preserving context-dependent matching while cutting about 19 MB from the release binary. - Added a `validation: type: Raw` exception path for provider-specific checks, with new raw validators for Azure Batch, FTP, Kraken, LDAP, RabbitMQ, and Redis. Also added stable request-scoped template values plus new Liquid filters for HMAC-SHA384 hex output and timestamp generation. - Expanded live validation coverage for several built-in rules, including Agora, Bitfinex, DocuSign, Dwolla, GitLab, KuCoin, RingCentral, Snowflake, Tableau, Trello, and Webex. Also tightened newly added helper regex to avoid high-match scan regressions, and made preflight-blocked raw validations report as skipped/not attempted instead of failed. diff --git a/Cargo.lock b/Cargo.lock index 36fd53a..a8b5e17 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1994,6 +1994,17 @@ dependencies = [ "hybrid-array", ] +[[package]] +name = "cssparser" +version = "0.37.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c9cdaae01d5ed7882b04d795e7f752f46ff52d2fa3b50a20d28c464510bba98" +dependencies = [ + "dtoa-short", + "itoa", + "smallvec", +] + [[package]] name = "ctutils" version = "0.4.0" @@ -2418,6 +2429,21 @@ dependencies = [ "litrs", ] +[[package]] +name = "dtoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + [[package]] name = "dunce" version = "1.0.5" @@ -5097,6 +5123,7 @@ dependencies = [ "crc32fast", "crossbeam-channel", "crossbeam-skiplist", + "cssparser", "dashmap", "ed25519-dalek", "fixedbitset", @@ -5168,7 +5195,6 @@ dependencies = [ "sha1 0.11.0", "sha2 0.11.0", "smallvec", - "streaming-iterator", "strum 0.28.0", "strum_macros 0.28.0", "sysinfo", @@ -5181,6 +5207,7 @@ dependencies = [ "thread_local", "tikv-jemallocator", "time", + "tl", "tokei", "tokio", "tokio-postgres", @@ -5190,24 +5217,6 @@ dependencies = [ "tracing", "tracing-core", "tracing-subscriber", - "tree-sitter", - "tree-sitter-bash", - "tree-sitter-c", - "tree-sitter-c-sharp", - "tree-sitter-cpp", - "tree-sitter-css", - "tree-sitter-go", - "tree-sitter-html", - "tree-sitter-java", - "tree-sitter-javascript", - "tree-sitter-php", - "tree-sitter-python", - "tree-sitter-regex", - "tree-sitter-ruby", - "tree-sitter-rust", - "tree-sitter-toml-ng", - "tree-sitter-typescript", - "tree-sitter-yaml", "tree_magic_mini", "url", "uuid", @@ -8249,12 +8258,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "streaming-iterator" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" - [[package]] name = "stringprep" version = "0.1.5" @@ -8724,6 +8727,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tl" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b130bd8a58c163224b44e217b4239ca7b927d82bf6cc2fea1fc561d15056e3f7" + [[package]] name = "tls_codec" version = "0.4.2" @@ -9150,196 +9159,6 @@ dependencies = [ "tracing-log", ] -[[package]] -name = "tree-sitter" -version = "0.26.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "887bd495d0582c5e3e0d8ece2233666169fa56a9644d172fc22ad179ab2d0538" -dependencies = [ - "cc", - "regex", - "regex-syntax", - "serde_json", - "streaming-iterator", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-bash" -version = "0.25.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5ec769279cc91b561d3df0d8a5deb26b0ad40d183127f409494d6d8fc53062" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-c" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a3aad8f0129083a59fe8596157552d2bb7148c492d44c21558d68ca1c722707" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-c-sharp" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67f06accca7b45351758663b8215089e643d53bd9a660ce0349314263737fcb0" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-cpp" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df2196ea9d47b4ab4a31b9297eaa5a5d19a0b121dceb9f118f6790ad0ab94743" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-css" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5cbc5e18f29a2c6d6435891f42569525cf95435a3e01c2f1947abcde178686f" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-go" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8560a4d2f835cc0d4d2c2e03cbd0dde2f6114b43bc491164238d333e28b16ea" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-html" -version = "0.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "261b708e5d92061ede329babaaa427b819329a9d427a1d710abb0f67bbef63ee" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-java" -version = "0.23.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0aa6cbcdc8c679b214e616fd3300da67da0e492e066df01bcf5a5921a71e90d6" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-javascript" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68204f2abc0627a90bdf06e605f5c470aa26fdcb2081ea553a04bdad756693f5" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-language" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782" - -[[package]] -name = "tree-sitter-php" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d8c17c3ab69052c5eeaa7ff5cd972dd1bc25d1b97ee779fec391ad3b5df5592" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-python" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bf85fd39652e740bf60f46f4cda9492c3a9ad75880575bf14960f775cb74a1c" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-regex" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd8a59be9f0ac131fd8f062eaaba14882b2fa5a6a7882a20134cb1d60df2e625" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-ruby" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be0484ea4ef6bb9c575b4fdabde7e31340a8d2dbc7d52b321ac83da703249f95" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-rust" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439e577dbe07423ec2582ac62c7531120dbfccfa6e5f92406f93dd271a120e45" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-toml-ng" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9adc2c898ae49730e857d75be403da3f92bb81d8e37a2f918a08dd10de5ebb1" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-typescript" -version = "0.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5f76ed8d947a75cc446d5fccd8b602ebf0cde64ccf2ffa434d873d7a575eff" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-yaml" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53c223db85f05e34794f065454843b0668ebc15d240ada63e2b5939f43ce7c97" -dependencies = [ - "cc", - "tree-sitter-language", -] - [[package]] name = "tree_magic_mini" version = "3.2.2" diff --git a/Cargo.toml b/Cargo.toml index 3bc7011..a9cbb83 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -168,28 +168,11 @@ reqwest-middleware = "0.5.1" reqwest-middleware-octorust = { package = "reqwest-middleware", version = "0.4.2" } tracing-subscriber = {version = "0.3.22", features = ["env-filter"] } tracing-core = "0.1.35" -tree-sitter = "0.26.5" aws-smithy-http-client = "1.1.10" aws-smithy-runtime-api = "1.11.4" aws-smithy-types = "1.4.4" -tree-sitter-bash = "0.25.1" -tree-sitter-c = "0.24.1" -tree-sitter-c-sharp = "0.23.1" -tree-sitter-cpp = "0.23.4" -tree-sitter-css = "0.25.0" -tree-sitter-go = "0.25.0" -tree-sitter-html = "0.23.2" -tree-sitter-java = "0.23.5" -tree-sitter-javascript = "0.25.0" -tree-sitter-php = "0.24.2" -tree-sitter-python = "0.25.0" -tree-sitter-ruby = "0.23.1" -tree-sitter-rust = "0.24.0" -tree-sitter-toml-ng = "0.7.0" -tree-sitter-typescript = "0.23.2" -tree-sitter-yaml = "0.7.2" -streaming-iterator = "0.1.9" -tree-sitter-regex = "0.25.0" +cssparser = { version = "0.37.0", default-features = false } +tl = "0.7.8" tree_magic_mini = "3.2" content_inspector = "0.2.4" rustc-hash = "2.1.1" diff --git a/README.md b/README.md index 5e1aad6..c89ea8f 100644 --- a/README.md +++ b/README.md @@ -401,7 +401,7 @@ kingfisher scan /path/to/code kingfisher scan ~/src/myrepo --no-validate # Turbo mode: run as fast as possible by disabling Git commit metadata, Base64 decoding, -# MIME sniffing, language detection, and tree-sitter parsing +# MIME sniffing, language detection, and parser-based context verification # (findings omit commit context, Base64-only matches, MIME type, and language metadata) kingfisher scan ~/src/myrepo --turbo @@ -510,7 +510,7 @@ cat /path/to/file.py | kingfisher scan - kingfisher scan /some/file --max-file-size 500 # Turbo mode: equivalent to --commit-metadata=false --no-base64 and disables MIME sniffing, -# language detection/tree-sitter parsing for maximum speed +# language detection/parser-based context verification for maximum speed # No Git commit metadata (author, date, hash), Base64 decoding, MIME, or language metadata in findings kingfisher scan /path/to/repo --turbo @@ -725,7 +725,7 @@ kingfisher scan /tmp/repo --branch feature-1 \ | [FINGERPRINT.md](docs/FINGERPRINT.md) | Understanding finding fingerprints and deduplication | | [COMPARISON.md](docs/COMPARISON.md) | Benchmark results and performance comparisons | | [PARSING.md](docs/PARSING.md) | Language-aware parsing details | -| [TREE_SITTER.md](docs/TREE_SITTER.md) | Tree-sitter scanning flow, verification gates, and fallback behavior | +| [CONTEXT_VERIFICATION.md](docs/CONTEXT_VERIFICATION.md) | Context-verification flow, gates, and parser backends | # Library Usage @@ -751,7 +751,7 @@ Since then it has evolved far beyond that starting point, introducing live valid - **Live validation** of detected secrets directly within rules - **Hundreds of new built-in rules** and an expanded YAML rule schema - **Baseline management** to suppress known findings over time -- **Tree-sitter parsing** layered on Hyperscan for language-aware detection +- **Parser-based context verification** layered on Hyperscan for language-aware detection - **More scan targets** (GitLab, Bitbucket, Gitea, Jira, Confluence, Slack, Microsoft Teams, S3, GCS, Docker, Hugging Face, etc.) - **Compressed Files**, **SQLite database**, and **Python bytecode (.pyc)** scanning support - **New storage model** (in-memory + Bloom filter, replacing SQLite) diff --git a/crates/kingfisher-rules/data/rules/adobe.yml b/crates/kingfisher-rules/data/rules/adobe.yml index 63a2062..5983e12 100644 --- a/crates/kingfisher-rules/data/rules/adobe.yml +++ b/crates/kingfisher-rules/data/rules/adobe.yml @@ -70,7 +70,7 @@ rules: examples: - | { - "client_credentials": { + "adobe_client_credentials": { "client_id": "a65b0146769d433a835f36660881db50", "client_secret": "p8e-ibndcvsmAp9ZgPBZ606FSlYIZVlsZ-g5" }, @@ -119,7 +119,7 @@ rules: examples: - | { - "client_credentials": { + "adobe_client_credentials": { "client_id": "a65b0146769d433a835f36660881db50", "client_secret": "p8e-ibndcvsmAp9ZgPBZ606FSlYIZVlsZ-g5" }, diff --git a/crates/kingfisher-scanner/src/scanner.rs b/crates/kingfisher-scanner/src/scanner.rs index dc8ef6c..55dff65 100644 --- a/crates/kingfisher-scanner/src/scanner.rs +++ b/crates/kingfisher-scanner/src/scanner.rs @@ -26,7 +26,7 @@ pub struct ScannerConfig { /// Override the minimum entropy threshold for all rules. pub min_entropy_override: Option, - /// Language hint for tree-sitter parsing (e.g., "python", "javascript"). + /// Language hint for parser-based context verification (e.g., "python", "javascript"). pub language_hint: Option, /// Whether to redact secrets in findings. diff --git a/docs-site/docs/assets/images/binary-size-comparison.png b/docs-site/docs/assets/images/binary-size-comparison.png new file mode 100644 index 0000000..1353d6d Binary files /dev/null and b/docs-site/docs/assets/images/binary-size-comparison.png differ diff --git a/docs-site/docs/changelog.md b/docs-site/docs/changelog.md index 323e91b..0e5334f 100644 --- a/docs-site/docs/changelog.md +++ b/docs-site/docs/changelog.md @@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. ## [v1.95.0] - Added 80+ built-in rules, bringing the bundled ruleset to 820 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more. +- Replaced tree-sitter with a lighter parser-based context verifier built from handwritten lexers plus `tl`/`cssparser`, preserving context-dependent matching while cutting about 19 MB from the release binary. - Added a `validation: type: Raw` exception path for provider-specific checks, with new raw validators for Azure Batch, FTP, Kraken, LDAP, RabbitMQ, and Redis. Also added stable request-scoped template values plus new Liquid filters for HMAC-SHA384 hex output and timestamp generation. - Expanded live validation coverage for several built-in rules, including Agora, Bitfinex, DocuSign, Dwolla, GitLab, KuCoin, RingCentral, Snowflake, Tableau, Trello, and Webex. Also tightened newly added helper regex to avoid high-match scan regressions, and made preflight-blocked raw validations report as skipped/not attempted instead of failed. diff --git a/docs-site/docs/features/parsing.md b/docs-site/docs/features/parsing.md index fc90e8d..da71121 100644 --- a/docs-site/docs/features/parsing.md +++ b/docs-site/docs/features/parsing.md @@ -1,47 +1,54 @@ --- title: "Source Code Parsing" -description: "Language-aware secret detection using tree-sitter parsing for 13+ languages including Python, JavaScript, Go, Rust, and more." +description: "Language-aware secret detection using lightweight lexers for 16 languages including Python, JavaScript, Go, Rust, and more." --- # Kingfisher Source Code Parsing -Kingfisher leverages tree-sitter as an extra layer of analysis when scanning source files written in supported programming languages. In practice, after its initial regex-based scan (powered by Vectorscan/Hyperscan), Kingfisher can run a targeted verification pass for context-dependent rules. +Kingfisher uses a parser-based context verifier as a second pass on supported source files. After its initial regex scan (powered by Vectorscan/Hyperscan), it extracts assignment-style snippets from code and configuration files to confirm that generic keyword+token matches appear in plausible contexts. -If so, it creates a Checker (see below) that uses tree‐sitter to parse the file and run language‐specific queries. This additional pass refines the detection by capturing more structured patterns—such as secret-like tokens—that might be obscured or spread over code constructs. +The implementation favors lightweight extractors over full AST parsing: -## How It’s Called +- **Handwritten lexers** for common programming and config languages — comment-aware stripping followed by regex-based `key = value` extraction +- **`tl`** for HTML — attribute values, element text, and embedded `