Replaced tree-sitter with a lighter parser-based context verifier built from handwritten lexers plus tl/cssparser, preserving context-dependent matching while cutting about 19 MB from the release binary.

This commit is contained in:
Mick Grove 2026-04-07 23:20:17 -07:00
commit 0cb854872b
30 changed files with 2054 additions and 2026 deletions

View file

@ -24,7 +24,7 @@ Key capabilities:
- `src/cli/commands/`: CLI command implementations
- `src/matcher/`: pattern matching engine
- `src/scanner/`: core scanning logic
- `src/parser/`: language-aware parsing (`tree-sitter`)
- `src/parser/`: language-aware context verification (lightweight lexers, `tl` for HTML, `cssparser` for CSS)
- `src/reporter/`: TOON/JSON/SARIF/HTML report generation
- `src/access_map/`: access mapping analysis
- `crates/kingfisher-core/`: shared types and core logic

View file

@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
## [v1.95.0]
- Added 80+ built-in rules, bringing the bundled ruleset to 820 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more.
- Replaced tree-sitter with a lighter parser-based context verifier built from handwritten lexers plus `tl`/`cssparser`, preserving context-dependent matching while cutting about 19 MB from the release binary.
- Added a `validation: type: Raw` exception path for provider-specific checks, with new raw validators for Azure Batch, FTP, Kraken, LDAP, RabbitMQ, and Redis. Also added stable request-scoped template values plus new Liquid filters for HMAC-SHA384 hex output and timestamp generation.
- Expanded live validation coverage for several built-in rules, including Agora, Bitfinex, DocuSign, Dwolla, GitLab, KuCoin, RingCentral, Snowflake, Tableau, Trello, and Webex. Also tightened newly added helper regex to avoid high-match scan regressions, and made preflight-blocked raw validations report as skipped/not attempted instead of failed.

249
Cargo.lock generated
View file

@ -1994,6 +1994,17 @@ dependencies = [
"hybrid-array",
]
[[package]]
name = "cssparser"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c9cdaae01d5ed7882b04d795e7f752f46ff52d2fa3b50a20d28c464510bba98"
dependencies = [
"dtoa-short",
"itoa",
"smallvec",
]
[[package]]
name = "ctutils"
version = "0.4.0"
@ -2418,6 +2429,21 @@ dependencies = [
"litrs",
]
[[package]]
name = "dtoa"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
[[package]]
name = "dtoa-short"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
dependencies = [
"dtoa",
]
[[package]]
name = "dunce"
version = "1.0.5"
@ -5097,6 +5123,7 @@ dependencies = [
"crc32fast",
"crossbeam-channel",
"crossbeam-skiplist",
"cssparser",
"dashmap",
"ed25519-dalek",
"fixedbitset",
@ -5168,7 +5195,6 @@ dependencies = [
"sha1 0.11.0",
"sha2 0.11.0",
"smallvec",
"streaming-iterator",
"strum 0.28.0",
"strum_macros 0.28.0",
"sysinfo",
@ -5181,6 +5207,7 @@ dependencies = [
"thread_local",
"tikv-jemallocator",
"time",
"tl",
"tokei",
"tokio",
"tokio-postgres",
@ -5190,24 +5217,6 @@ dependencies = [
"tracing",
"tracing-core",
"tracing-subscriber",
"tree-sitter",
"tree-sitter-bash",
"tree-sitter-c",
"tree-sitter-c-sharp",
"tree-sitter-cpp",
"tree-sitter-css",
"tree-sitter-go",
"tree-sitter-html",
"tree-sitter-java",
"tree-sitter-javascript",
"tree-sitter-php",
"tree-sitter-python",
"tree-sitter-regex",
"tree-sitter-ruby",
"tree-sitter-rust",
"tree-sitter-toml-ng",
"tree-sitter-typescript",
"tree-sitter-yaml",
"tree_magic_mini",
"url",
"uuid",
@ -8249,12 +8258,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "streaming-iterator"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520"
[[package]]
name = "stringprep"
version = "0.1.5"
@ -8724,6 +8727,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tl"
version = "0.7.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b130bd8a58c163224b44e217b4239ca7b927d82bf6cc2fea1fc561d15056e3f7"
[[package]]
name = "tls_codec"
version = "0.4.2"
@ -9150,196 +9159,6 @@ dependencies = [
"tracing-log",
]
[[package]]
name = "tree-sitter"
version = "0.26.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "887bd495d0582c5e3e0d8ece2233666169fa56a9644d172fc22ad179ab2d0538"
dependencies = [
"cc",
"regex",
"regex-syntax",
"serde_json",
"streaming-iterator",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-bash"
version = "0.25.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e5ec769279cc91b561d3df0d8a5deb26b0ad40d183127f409494d6d8fc53062"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-c"
version = "0.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a3aad8f0129083a59fe8596157552d2bb7148c492d44c21558d68ca1c722707"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-c-sharp"
version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67f06accca7b45351758663b8215089e643d53bd9a660ce0349314263737fcb0"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-cpp"
version = "0.23.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df2196ea9d47b4ab4a31b9297eaa5a5d19a0b121dceb9f118f6790ad0ab94743"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-css"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5cbc5e18f29a2c6d6435891f42569525cf95435a3e01c2f1947abcde178686f"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-go"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8560a4d2f835cc0d4d2c2e03cbd0dde2f6114b43bc491164238d333e28b16ea"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-html"
version = "0.23.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "261b708e5d92061ede329babaaa427b819329a9d427a1d710abb0f67bbef63ee"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-java"
version = "0.23.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0aa6cbcdc8c679b214e616fd3300da67da0e492e066df01bcf5a5921a71e90d6"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-javascript"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68204f2abc0627a90bdf06e605f5c470aa26fdcb2081ea553a04bdad756693f5"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-language"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782"
[[package]]
name = "tree-sitter-php"
version = "0.24.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d8c17c3ab69052c5eeaa7ff5cd972dd1bc25d1b97ee779fec391ad3b5df5592"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-python"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bf85fd39652e740bf60f46f4cda9492c3a9ad75880575bf14960f775cb74a1c"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-regex"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd8a59be9f0ac131fd8f062eaaba14882b2fa5a6a7882a20134cb1d60df2e625"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-ruby"
version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be0484ea4ef6bb9c575b4fdabde7e31340a8d2dbc7d52b321ac83da703249f95"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-rust"
version = "0.24.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439e577dbe07423ec2582ac62c7531120dbfccfa6e5f92406f93dd271a120e45"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-toml-ng"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9adc2c898ae49730e857d75be403da3f92bb81d8e37a2f918a08dd10de5ebb1"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-typescript"
version = "0.23.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c5f76ed8d947a75cc446d5fccd8b602ebf0cde64ccf2ffa434d873d7a575eff"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-yaml"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53c223db85f05e34794f065454843b0668ebc15d240ada63e2b5939f43ce7c97"
dependencies = [
"cc",
"tree-sitter-language",
]
[[package]]
name = "tree_magic_mini"
version = "3.2.2"

View file

@ -168,28 +168,11 @@ reqwest-middleware = "0.5.1"
reqwest-middleware-octorust = { package = "reqwest-middleware", version = "0.4.2" }
tracing-subscriber = {version = "0.3.22", features = ["env-filter"] }
tracing-core = "0.1.35"
tree-sitter = "0.26.5"
aws-smithy-http-client = "1.1.10"
aws-smithy-runtime-api = "1.11.4"
aws-smithy-types = "1.4.4"
tree-sitter-bash = "0.25.1"
tree-sitter-c = "0.24.1"
tree-sitter-c-sharp = "0.23.1"
tree-sitter-cpp = "0.23.4"
tree-sitter-css = "0.25.0"
tree-sitter-go = "0.25.0"
tree-sitter-html = "0.23.2"
tree-sitter-java = "0.23.5"
tree-sitter-javascript = "0.25.0"
tree-sitter-php = "0.24.2"
tree-sitter-python = "0.25.0"
tree-sitter-ruby = "0.23.1"
tree-sitter-rust = "0.24.0"
tree-sitter-toml-ng = "0.7.0"
tree-sitter-typescript = "0.23.2"
tree-sitter-yaml = "0.7.2"
streaming-iterator = "0.1.9"
tree-sitter-regex = "0.25.0"
cssparser = { version = "0.37.0", default-features = false }
tl = "0.7.8"
tree_magic_mini = "3.2"
content_inspector = "0.2.4"
rustc-hash = "2.1.1"

View file

@ -401,7 +401,7 @@ kingfisher scan /path/to/code
kingfisher scan ~/src/myrepo --no-validate
# Turbo mode: run as fast as possible by disabling Git commit metadata, Base64 decoding,
# MIME sniffing, language detection, and tree-sitter parsing
# MIME sniffing, language detection, and parser-based context verification
# (findings omit commit context, Base64-only matches, MIME type, and language metadata)
kingfisher scan ~/src/myrepo --turbo
@ -510,7 +510,7 @@ cat /path/to/file.py | kingfisher scan -
kingfisher scan /some/file --max-file-size 500
# Turbo mode: equivalent to --commit-metadata=false --no-base64 and disables MIME sniffing,
# language detection/tree-sitter parsing for maximum speed
# language detection/parser-based context verification for maximum speed
# No Git commit metadata (author, date, hash), Base64 decoding, MIME, or language metadata in findings
kingfisher scan /path/to/repo --turbo
@ -725,7 +725,7 @@ kingfisher scan /tmp/repo --branch feature-1 \
| [FINGERPRINT.md](docs/FINGERPRINT.md) | Understanding finding fingerprints and deduplication |
| [COMPARISON.md](docs/COMPARISON.md) | Benchmark results and performance comparisons |
| [PARSING.md](docs/PARSING.md) | Language-aware parsing details |
| [TREE_SITTER.md](docs/TREE_SITTER.md) | Tree-sitter scanning flow, verification gates, and fallback behavior |
| [CONTEXT_VERIFICATION.md](docs/CONTEXT_VERIFICATION.md) | Context-verification flow, gates, and parser backends |
# Library Usage
@ -751,7 +751,7 @@ Since then it has evolved far beyond that starting point, introducing live valid
- **Live validation** of detected secrets directly within rules
- **Hundreds of new built-in rules** and an expanded YAML rule schema
- **Baseline management** to suppress known findings over time
- **Tree-sitter parsing** layered on Hyperscan for language-aware detection
- **Parser-based context verification** layered on Hyperscan for language-aware detection
- **More scan targets** (GitLab, Bitbucket, Gitea, Jira, Confluence, Slack, Microsoft Teams, S3, GCS, Docker, Hugging Face, etc.)
- **Compressed Files**, **SQLite database**, and **Python bytecode (.pyc)** scanning support
- **New storage model** (in-memory + Bloom filter, replacing SQLite)

View file

@ -70,7 +70,7 @@ rules:
examples:
- |
{
"client_credentials": {
"adobe_client_credentials": {
"client_id": "a65b0146769d433a835f36660881db50",
"client_secret": "p8e-ibndcvsmAp9ZgPBZ606FSlYIZVlsZ-g5"
},
@ -119,7 +119,7 @@ rules:
examples:
- |
{
"client_credentials": {
"adobe_client_credentials": {
"client_id": "a65b0146769d433a835f36660881db50",
"client_secret": "p8e-ibndcvsmAp9ZgPBZ606FSlYIZVlsZ-g5"
},

View file

@ -26,7 +26,7 @@ pub struct ScannerConfig {
/// Override the minimum entropy threshold for all rules.
pub min_entropy_override: Option<f32>,
/// Language hint for tree-sitter parsing (e.g., "python", "javascript").
/// Language hint for parser-based context verification (e.g., "python", "javascript").
pub language_hint: Option<String>,
/// Whether to redact secrets in findings.

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

View file

@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file.
## [v1.95.0]
- Added 80+ built-in rules, bringing the bundled ruleset to 820 total. New coverage includes Amazon OAuth, Asaas, multiple Azure credential families, Bitrise, Canva, CockroachDB, eBay, Elastic, hCaptcha, Highnote, Lichess, MailerSend, Onfido, Paddle, Pangea, Persona, Pinterest, Proof, Rootly, Runpod, Telnyx, Thunderstore, Valtown, Volcengine, and more.
- Replaced tree-sitter with a lighter parser-based context verifier built from handwritten lexers plus `tl`/`cssparser`, preserving context-dependent matching while cutting about 19 MB from the release binary.
- Added a `validation: type: Raw` exception path for provider-specific checks, with new raw validators for Azure Batch, FTP, Kraken, LDAP, RabbitMQ, and Redis. Also added stable request-scoped template values plus new Liquid filters for HMAC-SHA384 hex output and timestamp generation.
- Expanded live validation coverage for several built-in rules, including Agora, Bitfinex, DocuSign, Dwolla, GitLab, KuCoin, RingCentral, Snowflake, Tableau, Trello, and Webex. Also tightened newly added helper regex to avoid high-match scan regressions, and made preflight-blocked raw validations report as skipped/not attempted instead of failed.

View file

@ -1,47 +1,54 @@
---
title: "Source Code Parsing"
description: "Language-aware secret detection using tree-sitter parsing for 13+ languages including Python, JavaScript, Go, Rust, and more."
description: "Language-aware secret detection using lightweight lexers for 16 languages including Python, JavaScript, Go, Rust, and more."
---
# Kingfisher Source Code Parsing
Kingfisher leverages tree-sitter as an extra layer of analysis when scanning source files written in supported programming languages. In practice, after its initial regex-based scan (powered by Vectorscan/Hyperscan), Kingfisher can run a targeted verification pass for context-dependent rules.
Kingfisher uses a parser-based context verifier as a second pass on supported source files. After its initial regex scan (powered by Vectorscan/Hyperscan), it extracts assignment-style snippets from code and configuration files to confirm that generic keyword+token matches appear in plausible contexts.
If so, it creates a Checker (see below) that uses tree-sitter to parse the file and run language-specific queries. This additional pass refines the detection by capturing more structured patterns—such as secret-like tokens—that might be obscured or spread over code constructs.
The implementation favors lightweight extractors over full AST parsing:
## How It's Called
- **Handwritten lexers** for common programming and config languages — comment-aware stripping followed by regex-based `key = value` extraction
- **`tl`** for HTML — attribute values, element text, and embedded `<script>` / `<style>` delegation
- **`cssparser`** for CSS — declaration parsing via Mozilla's CSS tokenizer
> **History:** Earlier versions used tree-sitter with 17 statically-linked
> grammar crates. This added ~20 MB to the binary and required building a
> full syntax tree just to extract assignment pairs. The current lexer-based
> approach achieves the same extraction quality with near-zero binary overhead
> and no external grammar dependencies.
## How It's Called
In the scanning phase (in the Matcher's implementation), Kingfisher does the following:
- **Primary Regex Pass:** Kingfisher always scans the full blob with Vectorscan/Hyperscan first.
- **Candidate Selection:** Findings from rules classified as context-dependent become tree-sitter verification candidates.
- **Language Detection:** If a language string is provided (for example from metadata or extension), the code calls a helper (such as `get_language_and_queries`) to retrieve the corresponding tree-sitter language and queries.
- **Checker Creation:** With those values, a `Checker` is instantiated with the target language and query map.
- **Parsing and Querying:** The Checker retrieves a thread-local parser (to avoid recreating it on every call), sets language, parses source, and runs queries to extract structured snippets (for example `key = value` pairs).
- **Verification Decision:** Candidate findings are kept only if parser-extracted context verifies the matched secret. If tree-sitter is unavailable, fallback behavior is profile-driven (for strict generic keyword+token rules, findings are suppressed).
*(See the implementation details in the parser module — for example, the `modify_regex` function in the Checker, and the conditional tree-sitter call in Matcher::scan_blob)*
- **Candidate Selection:** Findings from rules classified as context-dependent become parser-verification candidates.
- **Language Detection:** If a language string is provided (for example from metadata or extension), the code maps it to a supported parser backend.
- **Parsing and Querying:** The parser streams normalized snippets such as `key = value` without materializing a full syntax tree.
- **Verification Decision:** Candidate findings are kept only if parser-extracted context verifies the matched secret.
## Supported Languages
The design supports many common source code languages. The Language enum (defined in the parser module) includes variants for:
- **Scripting:** Bash, Python, Ruby, PHP
- **Compiled languages:** C, C++, C#, Rust, Java
- **Web-related languages:** CSS, HTML, JavaScript, TypeScript, YAML, Toml
- **Others:** Go, and even a generic “Regex” mode
- **Scripting:** Bash, Python, Ruby, PHP
- **Compiled languages:** C, C++, C#, Rust, Java
- **Web-related languages:** CSS, HTML, JavaScript, TypeScript, YAML, TOML
- **Others:** Go
Each variant maps to its corresponding tree-sitter language through the `get_ts_language()` method.
## When Context Verification Is Not Called
## When Tree-sitter Is Not Called
Context verification is skipped in certain cases:
Tree-sitter won't be invoked in certain cases:
- **No Language Identified:** If the file isn't recognized as belonging to one of the supported languages or no language hint is provided, the Checker isn't even constructed.
- **Non-source Files:** Binary files or files that aren't expected to contain code (or aren't extracted from archives) bypass tree-sitter parsing.
- **Fallback on Errors:** If tree-sitter parsing fails (e.g. due to malformed code or other errors), Kingfisher will fall back on its regex/Vectorscan matches without the additional tree-sitter insights.
- **No Language Identified:** If the file isn't recognized as belonging to one of the supported languages or no language hint is provided, the context verifier isn't even constructed.
- **Non-source Files:** Binary files or files that aren't expected to contain code (or aren't extracted from archives) bypass parser-based context verification.
- **Large Blobs:** Files larger than 2 MiB skip context verification to avoid spending time on generated or minified content.
- **Verification Errors:** If extraction fails, context-dependent matches are suppressed instead of falling back to raw regex hits.
## Summary
In essence, Kingfisher's use of tree-sitter is conditional and complementary. It is called only when the scanned file is a source code file written in a supported language, and its role is to enrich the scanning results by leveraging the syntax tree and language-specific queries. When files are non-source, binary, or if no language is provided, tree-sitter is not invoked, and Kingfisher relies solely on its regex-based detection.
Parser-based context verification is conditional and complementary. It is called only when the scanned file is a supported source or config file, and its role is to reduce noisy context-dependent findings by checking them against extracted code/config structure.
This layered approach helps improve the accuracy of secret detection while maintaining high performance.

View file

@ -64,7 +64,7 @@ flowchart LR
subgraph Engines[Engines]
Vector[vectorscan]
ScanPool[scanner pool]
Tree[tree-sitter]
Context["context verifier"]
Liquid[Liquid templates]
end
@ -94,7 +94,7 @@ flowchart LR
ScannerLib --> Validate
Match --> Vector --> ScanPool
Match --> Tree
Match --> Context
Validate --> Liquid
Validate --> APIs
@ -112,7 +112,7 @@ flowchart LR
- `src/scanner/runner.rs`: the orchestration hub for `scan`, including repo enumeration, clone streaming, artifact fetching, validation setup, sequential or parallel scan execution (threshold: >10 git repos triggers parallel mode), reporting, and summary generation.
- `src/scanner/*`: input enumeration (`enumerate.rs`), repository handling and artifact fetching (`repos.rs`), blob processing (`processing.rs`), validation coordination (`validation.rs`), scan summaries (`summary.rs`), Docker image scanning (`docker.rs`), and utilities (`util.rs`).
- `src/matcher/*`: the main detection engine (`mod.rs`), including vectorscan callbacks, regex helpers, Base64 discovery (`base64_decode.rs`), capture group handling (`captures.rs`), dedup support (`dedup.rs`), filtering (`filter.rs`), and finding fingerprinting (`fingerprint.rs`).
- `src/parser.rs`: tree-sitter integration for language-aware parsing, supporting 17+ languages (Bash, C, C#, C++, CSS, Go, HTML, Java, JavaScript, PHP, Python, Ruby, Rust, TOML, TypeScript, YAML, and regex).
- `src/parser.rs` and `src/parser/*`: parser-based context verification for language-aware matching, with handwritten lexers plus lightweight HTML and CSS parsers.
- `src/scanner_pool.rs`: thread-local vectorscan `BlockScanner` pool, providing safe reuse of compiled pattern databases across scan threads.
- `src/reporter.rs` and `src/reporter/*`: report rendering for pretty, JSON, BSON, TOON, SARIF, and HTML outputs, plus the data model used by the viewer.
- `src/direct_validate.rs`: direct validation of a known secret without going through pattern matching. Supports HTTP, AWS, Azure, GCP, JDBC, MongoDB, MySQL, PostgreSQL, JWT, and Coinbase validators, with Liquid template integration for custom validation logic.
@ -125,6 +125,6 @@ flowchart LR
- `kingfisher-scanner` is still important: it provides the embeddable scanner API plus shared validation and primitive functionality reused by the application.
- Direct `validate`, `revoke`, and standalone `access-map` are sibling command paths. They are not downstream stages of `FindingsStore`.
- Reporting is downstream from the datastore, which lets Kingfisher emit multiple output formats and drive the local viewer from the same finding set.
- The matching layer is intentionally hybrid: vectorscan provides high-throughput SIMD-accelerated pattern detection, while regex helpers, Base64 support, and tree-sitter verification improve accuracy and reduce false positives.
- The matching layer is intentionally hybrid: vectorscan provides high-throughput SIMD-accelerated pattern detection, while regex helpers, Base64 support, and parser-based context verification improve accuracy and reduce false positives.
- `FindingsStore` uses an in-memory store with a Bloom filter for deduplication, replacing the earlier SQLite-based storage model.
- Validation and revocation templates are rendered via Liquid, allowing rule authors to define HTTP request sequences, variable extraction, and multi-step flows in YAML without touching Rust code.

View file

@ -7,18 +7,17 @@ description: "Benchmark results comparing Kingfisher performance against Truffle
## Runtime Comparison (seconds)
*Lower runtimes are better.*
| Repository | Kingfisher Runtime | TruffleHog Runtime | GitLeaks Runtime | detect-secrets Runtime |
|------------|--------------------|--------------------|------------------|------------------------|
| croc | 2.64 | 10.36 | 3.10 | 0.16 |
| rails | 8.75 | 24.19 | 24.24 | 0.48 |
| ruby | 22.93 | 132.68 | 61.37 | 0.79 |
| gitlab | 135.41 | 325.93 | 350.84 | 5.04 |
| django | 6.91 | 227.63 | 59.50 | 0.61 |
| lucene | 15.62 | 89.11 | 76.24 | 0.66 |
| mongodb | 25.37 | 174.93 | 175.80 | 2.74 |
| linux | 205.19 | 597.51 | 548.96 | 5.49 |
| typescript | 64.99 | 183.04 | 232.34 | 4.23 |
| Repository | Kingfisher Runtime | TruffleHog Runtime | GitLeaks Runtime |
|------------|--------------------|--------------------|------------------|
| croc | 2.64 | 10.36 | 3.10 |
| rails | 8.75 | 24.19 | 24.24 |
| ruby | 22.93 | 132.68 | 61.37 |
| gitlab | 135.41 | 325.93 | 350.84 |
| django | 6.91 | 227.63 | 59.50 |
| lucene | 15.62 | 89.11 | 76.24 |
| mongodb | 25.37 | 174.93 | 175.80 |
| linux | 205.19 | 597.51 | 548.96 |
| typescript | 64.99 | 183.04 | 232.34 |
<p align="center">
<img src="../assets/images/runtime-comparison.png" alt="Kingfisher Runtime Comparison" style="vertical-align: center;" />
@ -28,37 +27,52 @@ description: "Benchmark results comparing Kingfisher performance against Truffle
Note: For GitLeaks and detect-secrets, validated/verified counts are not available.
| Repository | Kingfisher Validated | TruffleHog Verified | GitLeaks Verified | detect-secrets Verified |
|------------|----------------------|---------------------|-------------------|-------------------------|
| croc | 0 | 0 | 0 | 0 |
| rails | 0 | 0 | 0 | 0 |
| ruby | 0 | 0 | 0 | 0 |
| gitlab | 6 | 6 | 0 | 0 |
| django | 0 | 0 | 0 | 0 |
| lucene | 0 | 0 | 0 | 0 |
| mongodb | 0 | 0 | 0 | 0 |
| linux | 0 | 0 | 0 | 0 |
| typescript | 0 | 0 | 0 | 0 |
| Repository | Kingfisher Validated | TruffleHog Verified | GitLeaks Verified |
|------------|----------------------|---------------------|-------------------|
| croc | 0 | 0 | 0 |
| rails | 0 | 0 | 0 |
| ruby | 0 | 0 | 0 |
| gitlab | **6** | **6** | 0 |
| django | 0 | 0 | 0 |
| lucene | 0 | 0 | 0 |
| mongodb | 0 | 0 | 0 |
| linux | 0 | 0 | 0 |
| typescript | 0 | 0 | 0 |
### Network Requests Comparison
*'Network Requests' shows the total number of HTTP calls made during a scan. Since Gitleaks and detect-secrets don't validate secrets, they never make any network requests.*
| Repository | Kingfisher Network Requests | TruffleHog Network Requests | GitLeaks Network Requests | detect-secrets Network Requests |
|------------|-----------------------------|-----------------------------|---------------------------|----------------------------------|
| croc | 0 | 17 | 0 | 0 |
| rails | 1 | 25 | 0 | 0 |
| ruby | 3 | 33 | 0 | 0 |
| gitlab | 17 | 15624 | 0 | 0 |
| django | 0 | 66 | 0 | 0 |
| lucene | 0 | 116 | 0 | 0 |
| mongodb | 1 | 191 | 0 | 0 |
| linux | 0 | 287 | 0 | 0 |
| typescript | 0 | 10 | 0 | 0 |
| Repository | Kingfisher Network Requests | TruffleHog Network Requests | GitLeaks Network Requests |
|------------|-----------------------------|-----------------------------|---------------------------|
| croc | 0 | 17 | 0 |
| rails | 1 | 25 | 0 |
| ruby | 3 | 33 | 0 |
| gitlab | 17 | **15624** | 0 |
| django | 0 | 66 | 0 |
| lucene | 0 | 116 | 0 |
| mongodb | 1 | 191 | 0 |
| linux | 0 | 287 | 0 |
| typescript | 0 | 10 | 0 |
*Lower runtimes are better. Validated/Verified counts are reported where available. 'Network Requests' indicates the number of HTTP requests made during scanning.*
OS: darwin
Architecture: arm64
CPU Cores: 16
RAM: 48.00 GB
### Binary Size Comparison (macOS arm64)
| Tool | Version | Binary Size |
|------|---------|-------------|
| Gitleaks | 8.30.0 | 14.5 MB |
| **Kingfisher** | **1.95.0** | **32.8 MB** |
| TruffleHog | 3.94.2 | 160.3 MB |
*Smaller binaries are easier to distribute, deploy in CI, and embed in container images.*
<p align="center">
<img src="../assets/images/binary-size-comparison.png" alt="Binary Size Comparison" />
</p>
## Benchmark Environment
OS: darwin
Architecture: arm64
CPU Cores: 16
RAM: 48.00 GB

View file

@ -59,7 +59,7 @@ flowchart LR
subgraph Engines[Engines]
Vector[vectorscan]
ScanPool[scanner pool]
Tree[tree-sitter]
Context["context verifier"]
Liquid[Liquid templates]
end
@ -89,7 +89,7 @@ flowchart LR
ScannerLib --> Validate
Match --> Vector --> ScanPool
Match --> Tree
Match --> Context
Validate --> Liquid
Validate --> APIs
@ -107,7 +107,7 @@ flowchart LR
- `src/scanner/runner.rs`: the orchestration hub for `scan`, including repo enumeration, clone streaming, artifact fetching, validation setup, sequential or parallel scan execution (threshold: >10 git repos triggers parallel mode), reporting, and summary generation.
- `src/scanner/*`: input enumeration (`enumerate.rs`), repository handling and artifact fetching (`repos.rs`), blob processing (`processing.rs`), validation coordination (`validation.rs`), scan summaries (`summary.rs`), Docker image scanning (`docker.rs`), and utilities (`util.rs`).
- `src/matcher/*`: the main detection engine (`mod.rs`), including vectorscan callbacks, regex helpers, Base64 discovery (`base64_decode.rs`), capture group handling (`captures.rs`), dedup support (`dedup.rs`), filtering (`filter.rs`), and finding fingerprinting (`fingerprint.rs`).
- `src/parser.rs`: tree-sitter integration for language-aware parsing, supporting 17+ languages (Bash, C, C#, C++, CSS, Go, HTML, Java, JavaScript, PHP, Python, Ruby, Rust, TOML, TypeScript, YAML, and regex).
- `src/parser.rs` and `src/parser/*`: parser-based context verification for language-aware matching, with handwritten lexers plus lightweight HTML and CSS parsers.
- `src/scanner_pool.rs`: thread-local vectorscan `BlockScanner` pool, providing safe reuse of compiled pattern databases across scan threads.
- `src/reporter.rs` and `src/reporter/*`: report rendering for pretty, JSON, BSON, TOON, SARIF, and HTML outputs, plus the data model used by the viewer.
- `src/direct_validate.rs`: direct validation of a known secret without going through pattern matching. Supports HTTP, gRPC, plus schema-level typed validators such as AWS, AzureStorage, GCP, JDBC, MongoDB, MySQL, PostgreSQL, JWT, and Coinbase, and delegates ad-hoc `Raw` validators to `crates/kingfisher-scanner/src/validation/raw.rs`.
@ -121,6 +121,6 @@ flowchart LR
- The shared validation layer in `crates/kingfisher-scanner/src/validation/` contains both reusable typed validator families and the `Raw` exception-path validators used by rule YAML.
- Direct `validate`, `revoke`, and standalone `access-map` are sibling command paths. They are not downstream stages of `FindingsStore`.
- Reporting is downstream from the datastore, which lets Kingfisher emit multiple output formats and drive the local viewer from the same finding set.
- The matching layer is intentionally hybrid: vectorscan provides high-throughput SIMD-accelerated pattern detection, while regex helpers, Base64 support, and tree-sitter verification improve accuracy and reduce false positives.
- The matching layer is intentionally hybrid: vectorscan provides high-throughput SIMD-accelerated pattern detection, while regex helpers, Base64 support, and parser-based context verification improve accuracy and reduce false positives.
- `FindingsStore` uses an in-memory store with a Bloom filter for deduplication, replacing the earlier SQLite-based storage model.
- Validation and revocation templates are rendered via Liquid, allowing rule authors to define HTTP request sequences, variable extraction, and multi-step flows in YAML without touching Rust code.

View file

@ -4,17 +4,17 @@
## Runtime Comparison (seconds)
*Lower runtimes are better.*
| Repository | Kingfisher Runtime | TruffleHog Runtime | GitLeaks Runtime | detect-secrets Runtime |
|------------|--------------------|--------------------|------------------|------------------------|
| croc | 2.64 | 10.36 | 3.10 | 0.16 |
| rails | 8.75 | 24.19 | 24.24 | 0.48 |
| ruby | 22.93 | 132.68 | 61.37 | 0.79 |
| gitlab | 135.41 | 325.93 | 350.84 | 5.04 |
| django | 6.91 | 227.63 | 59.50 | 0.61 |
| lucene | 15.62 | 89.11 | 76.24 | 0.66 |
| mongodb | 25.37 | 174.93 | 175.80 | 2.74 |
| linux | 205.19 | 597.51 | 548.96 | 5.49 |
| typescript | 64.99 | 183.04 | 232.34 | 4.23 |
| Repository | Kingfisher Runtime | TruffleHog Runtime | GitLeaks Runtime |
|------------|--------------------|--------------------|------------------|
| croc | 2.64 | 10.36 | 3.10 |
| rails | 8.75 | 24.19 | 24.24 |
| ruby | 22.93 | 132.68 | 61.37 |
| gitlab | 135.41 | 325.93 | 350.84 |
| django | 6.91 | 227.63 | 59.50 |
| lucene | 15.62 | 89.11 | 76.24 |
| mongodb | 25.37 | 174.93 | 175.80 |
| linux | 205.19 | 597.51 | 548.96 |
| typescript | 64.99 | 183.04 | 232.34 |
<p align="center">
<img src="./runtime-comparison.png" alt="Kingfisher Runtime Comparison" style="vertical-align: center;" />
@ -24,37 +24,52 @@
Note: For GitLeaks and detect-secrets, validated/verified counts are not available.
| Repository | Kingfisher Validated | TruffleHog Verified | GitLeaks Verified | detect-secrets Verified |
|------------|----------------------|---------------------|-------------------|-------------------------|
| croc | 0 | 0 | 0 | 0 |
| rails | 0 | 0 | 0 | 0 |
| ruby | 0 | 0 | 0 | 0 |
| gitlab | 6 | 6 | 0 | 0 |
| django | 0 | 0 | 0 | 0 |
| lucene | 0 | 0 | 0 | 0 |
| mongodb | 0 | 0 | 0 | 0 |
| linux | 0 | 0 | 0 | 0 |
| typescript | 0 | 0 | 0 | 0 |
| Repository | Kingfisher Validated | TruffleHog Verified | GitLeaks Verified |
|------------|----------------------|---------------------|-------------------|
| croc | 0 | 0 | 0 |
| rails | 0 | 0 | 0 |
| ruby | 0 | 0 | 0 |
| gitlab | **6** | **6** | 0 |
| django | 0 | 0 | 0 |
| lucene | 0 | 0 | 0 |
| mongodb | 0 | 0 | 0 |
| linux | 0 | 0 | 0 |
| typescript | 0 | 0 | 0 |
### Network Requests Comparison
*'Network Requests' shows the total number of HTTP calls made during a scan. Since Gitleaks and detect-secrets don't validate secrets, they never make any network requests.*
| Repository | Kingfisher Network Requests | TruffleHog Network Requests | GitLeaks Network Requests | detect-secrets Network Requests |
|------------|-----------------------------|-----------------------------|---------------------------|----------------------------------|
| croc | 0 | 17 | 0 | 0 |
| rails | 1 | 25 | 0 | 0 |
| ruby | 3 | 33 | 0 | 0 |
| gitlab | 17 | 15624 | 0 | 0 |
| django | 0 | 66 | 0 | 0 |
| lucene | 0 | 116 | 0 | 0 |
| mongodb | 1 | 191 | 0 | 0 |
| linux | 0 | 287 | 0 | 0 |
| typescript | 0 | 10 | 0 | 0 |
| Repository | Kingfisher Network Requests | TruffleHog Network Requests | GitLeaks Network Requests |
|------------|-----------------------------|-----------------------------|---------------------------|
| croc | 0 | 17 | 0 |
| rails | 1 | 25 | 0 |
| ruby | 3 | 33 | 0 |
| gitlab | 17 | **15624** | 0 |
| django | 0 | 66 | 0 |
| lucene | 0 | 116 | 0 |
| mongodb | 1 | 191 | 0 |
| linux | 0 | 287 | 0 |
| typescript | 0 | 10 | 0 |
*Lower runtimes are better. Validated/Verified counts are reported where available. 'Network Requests' indicates the number of HTTP requests made during scanning.*
OS: darwin
Architecture: arm64
CPU Cores: 16
RAM: 48.00 GB
### Binary Size Comparison (macOS arm64)
| Tool | Version | Binary Size |
|------|---------|-------------|
| Gitleaks | 8.30.0 | 14.5 MB |
| **Kingfisher** | **1.95.0** | **32.8 MB** |
| TruffleHog | 3.94.2 | 160.3 MB |
*Smaller binaries are easier to distribute, deploy in CI, and embed in container images.*
<p align="center">
<img src="./binary-size-comparison.png" alt="Binary Size Comparison" />
</p>
## Benchmark Environment
OS: darwin
Architecture: arm64
CPU Cores: 16
RAM: 48.00 GB

View file

@ -0,0 +1,49 @@
# Parser-Based Context Verification
[← Back to README](../README.md)
Kingfisher starts with a fast regex pass powered by Vectorscan/Hyperscan. For rules classified as `ContextDependent`, it then runs a lightweight parser-based verification pass that extracts likely assignment-style snippets such as `api_key = secret`.
> **Why not tree-sitter?** Earlier versions used tree-sitter for this step.
> Statically linking 17 grammar crates added roughly 20 MB to the binary and
> required a full AST parse just to extract `key = value` pairs. The current
> approach — handwritten regex-based lexers with comment-aware stripping —
> produces the same (or better) extraction quality at a fraction of the binary
> and runtime cost.
## Where It Runs
1. `BlobProcessor::run` decides whether to compute a language hint.
2. `Matcher::scan_blob` performs the primary regex scan and other filtering.
3. `maybe_apply_context_verification` streams parser candidates near the end of `scan_blob`.
4. Only context-dependent, non-Base64 matches are checked.
5. Candidates that cannot be verified are removed.
## Gates
Context verification runs only when all of these are true:
- Blob length is between `0 KiB` and `2 MiB` (`should_attempt_context_verification`).
- Turbo mode is disabled.
- A supported language hint is available.
If any gate fails, context-dependent matches are suppressed rather than falling back to raw regex hits.
## Backends
Kingfisher uses lightweight language-specific extractors instead of a full AST layer:
- Handwritten lexers for Bash, C, C#, C++, Go, Java, JavaScript, PHP, Python, Ruby, Rust, TOML, TypeScript, and YAML
- `tl` for HTML attributes, element text, and embedded `<script>` / `<style>` blocks
- `cssparser` for CSS declarations and function-style values
Each lexer runs a comment-aware stripping pass (tracking string boundaries to avoid false comment detection) followed by a small set of regex patterns that extract assignment-style pairs.
## Verification Model
- Rule profiling decides which matches are `ContextDependent`.
- The parser streams candidate text snippets like `secret_key = abcd1234`.
- Kingfisher re-runs the rule's anchored regex against each candidate snippet.
- Verification succeeds only when the regex secret capture exactly matches the original hit.
This keeps the fast regex engine on the hot path while still filtering noisy generic keyword+token matches with language-aware context.

View file

@ -1,43 +1,51 @@
# Kingfisher Source Code Parsing
[← Back to README](../README.md)
Kingfisher leverages tree-sitter as an extra layer of analysis when scanning source files written in supported programming languages. In practice, after its initial regex-based scan (powered by Vectorscan/Hyperscan), Kingfisher can run a targeted verification pass for context-dependent rules.
If so, it creates a Checker (see below) that uses tree-sitter to parse the file and run language-specific queries. This additional pass refines the detection by capturing more structured patterns—such as secret-like tokens—that might be obscured or spread over code constructs.
Kingfisher uses a parser-based context verifier as a second pass on supported source files. After its initial regex scan (powered by Vectorscan/Hyperscan), it extracts assignment-style snippets from code and configuration files to confirm that generic keyword+token matches appear in plausible contexts.
The implementation favors lightweight extractors over full AST parsing:
- **Handwritten lexers** for common programming and config languages — comment-aware stripping followed by regex-based `key = value` extraction
- **`tl`** for HTML — attribute values, element text, and embedded `<script>` / `<style>` delegation
- **`cssparser`** for CSS — declaration parsing via Mozilla's CSS tokenizer
> **History:** Earlier versions used tree-sitter with 17 statically-linked
> grammar crates. This added ~20 MB to the binary and required building a
> full syntax tree just to extract assignment pairs. The current lexer-based
> approach achieves the same extraction quality with near-zero binary overhead
> and no external grammar dependencies.
## How It's Called
In the scanning phase (in the Matcher's implementation), Kingfisher does the following:
In the scanning phase (in the Matcher's implementation), Kingfisher does the following:
- **Primary Regex Pass:** Kingfisher always scans the full blob with Vectorscan/Hyperscan first.
- **Candidate Selection:** Findings from rules classified as context-dependent become tree-sitter verification candidates.
- **Language Detection:** If a language string is provided (for example from metadata or extension), the code calls a helper (such as `get_language_and_queries`) to retrieve the corresponding tree-sitter language and queries.
- **Checker Creation:** With those values, a `Checker` is instantiated with the target language and query map.
- **Parsing and Querying:** The Checker retrieves a thread-local parser (to avoid recreating it on every call), sets language, parses source, and runs queries to extract structured snippets (for example `key = value` pairs).
- **Verification Decision:** Candidate findings are kept only if parser-extracted context verifies the matched secret. If tree-sitter is unavailable, fallback behavior is profile-driven (for strict generic keyword+token rules, findings are suppressed).
*(See the implementation details in the parser module — for example, the `modify_regex` function in the Checker, and the conditional tree-sitter call in `Matcher::scan_blob`.)*
- **Candidate Selection:** Findings from rules classified as context-dependent become parser-verification candidates.
- **Language Detection:** If a language string is provided (for example from metadata or extension), the code maps it to a supported parser backend.
- **Parsing and Querying:** The parser streams normalized snippets such as `key = value` without materializing a full syntax tree.
- **Verification Decision:** Candidate findings are kept only if parser-extracted context verifies the matched secret.
## Supported Languages
The design supports many common source code languages. The Language enum (defined in the parser module) includes variants for:
- **Scripting:** Bash, Python, Ruby, PHP
- **Compiled languages:** C, C++, C#, Rust, Java
- **Web-related languages:** CSS, HTML, JavaScript, TypeScript, YAML, Toml
- **Others:** Go, and even a generic “Regex” mode
- **Scripting:** Bash, Python, Ruby, PHP
- **Compiled languages:** C, C++, C#, Rust, Java
- **Web-related languages:** CSS, HTML, JavaScript, TypeScript, YAML, TOML
- **Others:** Go
Each variant maps to its corresponding tree-sitter language through the `get_ts_language()` method.
## When Context Verification Is Not Called
## When Tree-sitter Is Not Called
Context verification is skipped in certain cases:
Tree-sitter won't be invoked in certain cases:
- **No Language Identified:** If the file isn't recognized as belonging to one of the supported languages or no language hint is provided, the Checker isn't even constructed.
- **Non-source Files:** Binary files or files that aren't expected to contain code (or aren't extracted from archives) bypass tree-sitter parsing.
- **Fallback on Errors:** If tree-sitter parsing fails (e.g. due to malformed code or other errors), Kingfisher will fall back on its regex/Vectorscan matches without the additional tree-sitter insights.
- **No Language Identified:** If the file isn't recognized as belonging to one of the supported languages or no language hint is provided, the context verifier isn't even constructed.
- **Non-source Files:** Binary files or files that aren't expected to contain code (or aren't extracted from archives) bypass parser-based context verification.
- **Large Blobs:** Files larger than 2 MiB skip context verification to avoid spending time on generated or minified content.
- **Verification Errors:** If extraction fails, context-dependent matches are suppressed instead of falling back to raw regex hits.
## Summary
In essence, Kingfisher's use of tree-sitter is conditional and complementary. It is called only when the scanned file is a source code file written in a supported language, and its role is to enrich the scanning results by leveraging the syntax tree and language-specific queries. When files are non-source, binary, or if no language is provided, tree-sitter is not invoked, and Kingfisher relies solely on its regex-based detection.
Parser-based context verification is conditional and complementary. It is called only when the scanned file is a supported source or config file, and its role is to reduce noisy context-dependent findings by checking them against extracted code/config structure.
This layered approach helps improve the accuracy of secret detection while maintaining high performance.

View file

@ -1,100 +0,0 @@
# Tree-sitter in Kingfisher Scanning
[← Back to README](../README.md)
This document explains how Tree-sitter is used during scanning, and when it is intentionally skipped.
## What Tree-sitter Is Used For
Kingfisher always starts with a fast regex pass (Vectorscan/Hyperscan). Tree-sitter is a secondary verification layer used only for context-dependent findings.
The goal is to confirm that a regex hit appears in a plausible code assignment/config context (for example `api_key = "..."`) before keeping the finding.
## Where It Runs in the Scan Pipeline
1. `BlobProcessor::run` decides whether to compute a language hint.
- It skips language hinting in `turbo_mode`.
- It also skips when blob size is outside the Tree-sitter window.
2. `Matcher::scan_blob` performs the primary regex scan and other filtering.
3. `maybe_apply_tree_sitter_verification` runs near the end of `scan_blob`.
4. Only candidate matches are checked against Tree-sitter extracted text.
5. Matches that fail verification are dropped for context-dependent rules.
## Size and Mode Gates
Tree-sitter is attempted only when all of these are true:
- Blob length is between `0 KiB` and `128 KiB` (`should_attempt_tree_sitter`).
- `turbo_mode` is disabled.
- A language hint is available.
- The language maps to a supported Tree-sitter grammar + query set.
If any of these conditions fails, Tree-sitter verification is considered unavailable for that blob.
## Candidate Selection (Not Every Match)
Tree-sitter verification is only applied to matches that are:
- Classified as `ContextDependent` by rule profiling.
- Not base64-derived findings (`is_base64 == false`).
Classification comes from rule profiles in `kingfisher-rules`:
- `SelfIdentifying`: keep raw regex result.
- `ContextDependent`: may require Tree-sitter confirmation.
## How Verification Works
When Tree-sitter is available:
1. `load_tree_sitter_results` builds a `Checker` with:
- `Language` enum value
- language-specific queries from `src/parser/queries.rs`
2. `Checker::check`:
- Reuses a thread-local parser cache (`PARSER_CACHE`)
- Parses source into a syntax tree
- Runs language query patterns capturing `@key` and `@val`
- Produces normalized strings like `key = value`
- Attempts base64 decode of value and keeps decoded ASCII form when valid
3. For each candidate finding, Kingfisher re-runs that rule's anchored regex on each extracted Tree-sitter text fragment.
4. Verification succeeds only when the rule's secret capture equals the original matched secret bytes.
If no extracted fragment verifies the secret, that candidate finding is removed.
## Behavior When Tree-sitter Is Unavailable
If Tree-sitter cannot run (size/mode/language/parse errors), Kingfisher keeps the original regex finding.
## Supported Languages in This Path
Language mapping for verification currently includes:
- `bash`/`shell`
- `c`
- `c#`/`csharp`
- `c++`/`cpp`
- `css`
- `go`
- `html`
- `java`
- `javascript`/`js`
- `php`
- `python`/`py`/`starlark`
- `ruby`
- `rust`
- `toml`
- `typescript`/`ts`
- `yaml`
The Tree-sitter query definitions for these languages live in `src/parser/queries.rs`.
## Operational Summary
Tree-sitter in Kingfisher is a conditional verifier, not the primary detector:
- Regex finds candidates quickly.
- Rule profiling decides which candidates need context verification.
- Tree-sitter confirms contextual plausibility from parsed syntax.
- If verification cannot run, scan results fall back to the regex pass.
This keeps scanning fast while reducing noisy matches for context-dependent secret patterns.

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

View file

@ -166,7 +166,7 @@ pub struct ScanArgs {
#[arg(global = true, long, default_value_t = false)]
pub no_base64: bool,
/// Turbo mode: equivalent to --commit-metadata=false --no-base64 and disables MIME sniffing, language detection, and tree-sitter parsing
/// Turbo mode: equivalent to --commit-metadata=false --no-base64 and disables MIME sniffing, language detection, and parser-based context verification
#[arg(global = true, long = "turbo", default_value_t = false)]
pub turbo: bool,

View file

@ -24,7 +24,7 @@ use crate::{
location::OffsetSpan,
origin::OriginSet,
parser,
parser::{Checker, Language},
parser::Language,
rule_profiling::{ConcurrentRuleProfiler, RuleStats},
rules::rule::Rule,
rules_database::{RuleDetectionProfileKind, RulesDatabase},
@ -40,12 +40,16 @@ use self::{
const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
const TREE_SITTER_MAX_LIMIT: usize = 128 * 1024; // only run tree-sitter on blobs <= 128 KiB
const TREE_SITTER_MIN_LIMIT: usize = 0; // allow tree-sitter starting at 0 bytes
// The old tree-sitter limit was 128 KiB due to full-AST parsing cost.
// The lightweight regex-based lexer is O(n) line-by-line, so we can afford
// a much higher ceiling. We still cap it to avoid spending time on huge
// generated/minified blobs where context verification adds little value.
const CONTEXT_VERIFIER_MAX_LIMIT: usize = 2 * 1024 * 1024; // verify code context on blobs <= 2 MiB
const CONTEXT_VERIFIER_MIN_LIMIT: usize = 0; // allow context verification starting at 0 bytes
#[inline]
pub(crate) fn should_attempt_tree_sitter(blob_len: usize) -> bool {
blob_len <= TREE_SITTER_MAX_LIMIT && blob_len >= TREE_SITTER_MIN_LIMIT
/// Reports whether a blob of `blob_len` bytes is eligible for context
/// verification.
///
/// A blob qualifies when its length lies in the inclusive range
/// `CONTEXT_VERIFIER_MIN_LIMIT..=CONTEXT_VERIFIER_MAX_LIMIT`.
pub(crate) fn should_attempt_context_verification(blob_len: usize) -> bool {
    let eligible_sizes = CONTEXT_VERIFIER_MIN_LIMIT..=CONTEXT_VERIFIER_MAX_LIMIT;
    eligible_sizes.contains(&blob_len)
}
// -------------------------------------------------------------------------------------------------
@ -374,7 +378,7 @@ impl<'a> Matcher<'a> {
}
}
}
maybe_apply_tree_sitter_verification(
maybe_apply_context_verification(
rules_db,
blob,
lang_hint,
@ -407,7 +411,7 @@ impl<'a> Matcher<'a> {
}
}
fn maybe_apply_tree_sitter_verification<'a>(
fn maybe_apply_context_verification<'a>(
rules_db: &RulesDatabase,
blob: &'a Blob,
lang_hint: Option<&str>,
@ -439,36 +443,44 @@ fn maybe_apply_tree_sitter_verification<'a>(
return;
}
let ts_results = load_tree_sitter_results(blob, lang_hint, blob_len);
let mut keep = vec![true; matches.len()];
for idx in candidate_indices {
let Some(rule_idx) = match_rule_indices.get(idx).copied() else {
continue;
};
let match_secret = matches[idx].matching_input;
let re = &rules_db.anchored_regexes()[rule_idx];
match ts_results.as_ref() {
Some(results) => {
let verified = results.iter().any(|text| {
verify_match_in_tree_sitter_text(re, match_secret, text.as_bytes())
});
if !verified {
keep[idx] = false;
}
}
None => {
// Tree-sitter is an optional precision layer. If parser context
// is unavailable, always fall back to the original regex match.
}
let Some(language) = load_context_verifier_language(lang_hint, blob_len) else {
for idx in candidate_indices {
keep[idx] = false;
}
filter_kept_matches(matches, &keep);
return;
};
let mut remaining = candidate_indices.clone();
let verification = parser::stream_context_candidates(blob.bytes(), &language, |text| {
remaining.retain(|idx| {
let Some(rule_idx) = match_rule_indices.get(*idx).copied() else {
return false;
};
let re = &rules_db.anchored_regexes()[rule_idx];
let expected_secret = matches[*idx].matching_input;
!verify_match_in_context_text(re, expected_secret, text.as_bytes())
});
!remaining.is_empty()
});
if let Err(e) = verification {
debug!("context verification unavailable: {e}");
remaining = candidate_indices;
}
for idx in remaining {
keep[idx] = false;
}
filter_kept_matches(matches, &keep);
}
fn filter_kept_matches<'a>(matches: &mut Vec<BlobMatch<'a>>, keep: &[bool]) {
if keep.iter().all(|k| *k) {
return;
}
let mut filtered = Vec::with_capacity(matches.len());
for (idx, item) in std::mem::take(matches).into_iter().enumerate() {
if keep[idx] {
@ -478,27 +490,15 @@ fn maybe_apply_tree_sitter_verification<'a>(
*matches = filtered;
}
fn load_tree_sitter_results(
blob: &Blob,
lang_hint: Option<&str>,
blob_len: usize,
) -> Option<Vec<String>> {
if !should_attempt_tree_sitter(blob_len) {
fn load_context_verifier_language(lang_hint: Option<&str>, blob_len: usize) -> Option<Language> {
if !should_attempt_context_verification(blob_len) {
return None;
}
let lang = lang_hint?;
let (language, queries) = get_language_and_queries(lang)?;
let checker = Checker { language, rules: queries };
match checker.check(&blob.bytes()) {
Ok(results) => Some(results.into_iter().map(|m| m.text).collect()),
Err(e) => {
debug!("tree-sitter verification unavailable: {e}");
None
}
}
Language::from_hint(lang)
}
fn verify_match_in_tree_sitter_text(
fn verify_match_in_context_text(
re: &regex::bytes::Regex,
expected_secret: &[u8],
text: &[u8],
@ -507,34 +507,6 @@ fn verify_match_in_tree_sitter_text(
.any(|captures| find_secret_capture(re, &captures).as_bytes() == expected_secret)
}
fn get_language_and_queries(lang: &str) -> Option<(Language, FxHashMap<String, String>)> {
match lang.to_lowercase().as_str() {
"bash" | "shell" => Some((Language::Bash, parser::queries::bash::get_bash_queries())),
"c" => Some((Language::C, parser::queries::c::get_c_queries())),
"c#" | "csharp" => Some((Language::CSharp, parser::queries::csharp::get_csharp_queries())),
"c++" | "cpp" => Some((Language::Cpp, parser::queries::cpp::get_cpp_queries())),
"css" => Some((Language::Css, parser::queries::css::get_css_queries())),
"go" => Some((Language::Go, parser::queries::go::get_go_queries())),
"html" => Some((Language::Html, parser::queries::html::get_html_queries())),
"java" => Some((Language::Java, parser::queries::java::get_java_queries())),
"javascript" | "js" => {
Some((Language::JavaScript, parser::queries::javascript::get_javascript_queries()))
}
"php" => Some((Language::Php, parser::queries::php::get_php_queries())),
"python" | "py" | "starlark" => {
Some((Language::Python, parser::queries::python::get_python_queries()))
}
"ruby" => Some((Language::Ruby, parser::queries::ruby::get_ruby_queries())),
"rust" => Some((Language::Rust, parser::queries::rust::get_rust_queries())),
"toml" => Some((Language::Toml, parser::queries::toml::get_toml_queries())),
"typescript" | "ts" => {
Some((Language::TypeScript, parser::queries::typescript::get_typescript_queries()))
}
"yaml" => Some((Language::Yaml, parser::queries::yaml::get_yaml_queries())),
_ => None,
}
}
// -------------------------------------------------------------------------------------------------
// test
// -------------------------------------------------------------------------------------------------
@ -1161,13 +1133,13 @@ line2
};
assert!(
found.is_empty(),
"comment-only contextual hits should be suppressed when tree-sitter cannot verify assignment context"
"comment-only contextual hits should be suppressed when parser-based verification cannot confirm assignment context"
);
Ok(())
}
#[test]
fn strict_context_rule_keeps_raw_when_tree_sitter_unavailable() -> Result<()> {
fn strict_context_rule_suppresses_raw_when_context_verification_is_unavailable() -> Result<()> {
let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234";
let rule = Rule::new(RuleSyntax {
id: "kingfisher.auth0.2".into(),
@ -1200,10 +1172,9 @@ line2
ScanResult::New(matches) => matches,
_ => panic!("unexpected scan result"),
};
assert_eq!(
found.len(),
1,
"strict contextual rules should fall back to raw regex findings when tree-sitter is unavailable"
assert!(
found.is_empty(),
"strict contextual rules should be suppressed when parser-based verification cannot run"
);
Ok(())
}

View file

@ -1,34 +1,13 @@
use std::{cell::RefCell, error::Error as StdError, ops::Range, str, str::FromStr};
use std::str::FromStr;
use base64::{engine::general_purpose::STANDARD, Engine};
use rustc_hash::FxHashMap;
use anyhow::Result;
use regex::bytes::Regex;
use serde::Deserialize;
use streaming_iterator::StreamingIterator;
use tree_sitter::{Parser as TreeSitterParser, Query, QueryCursor};
use tree_sitter_bash;
use tree_sitter_c;
use tree_sitter_c_sharp;
use tree_sitter_cpp;
use tree_sitter_css;
use tree_sitter_go;
use tree_sitter_html;
use tree_sitter_java;
use tree_sitter_javascript;
use tree_sitter_php;
use tree_sitter_python;
use tree_sitter_regex;
use tree_sitter_ruby;
use tree_sitter_rust;
use tree_sitter_toml_ng;
use tree_sitter_typescript;
use tree_sitter_yaml;
// use tree_sitter_php;
use crate::util::is_base64;
//
pub mod queries;
// pub(crate) type Error = Box<dyn std::error::Error>;
type Result<T> = std::result::Result<T, Box<dyn StdError>>;
mod css;
mod html;
mod lexer;
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "lowercase")]
pub enum Language {
@ -43,22 +22,15 @@ pub enum Language {
JavaScript,
Php,
Python,
Regex,
Ruby,
Rust,
Toml,
TypeScript,
Yaml,
}
#[derive(Debug, Clone)]
pub struct MatchResult {
pub range: Range<usize>,
pub text: String,
pub is_base64_decoded: bool,
pub original_base64: Option<String>, // Store original base64 if decoded
}
impl Language {
fn name(&self) -> &'static str {
pub fn name(&self) -> &'static str {
match self {
Language::Bash => "bash",
Language::C => "c",
@ -71,7 +43,6 @@ impl Language {
Language::JavaScript => "javascript",
Language::Php => "php",
Language::Python => "python",
Language::Regex => "regex",
Language::Ruby => "ruby",
Language::Rust => "rust",
Language::Toml => "toml",
@ -80,248 +51,89 @@ impl Language {
}
}
pub fn get_ts_language(&self) -> Result<tree_sitter::Language> {
match self {
Language::Bash => Ok(tree_sitter_bash::LANGUAGE.into()),
Language::C => Ok(tree_sitter_c::LANGUAGE.into()),
Language::CSharp => Ok(tree_sitter_c_sharp::LANGUAGE.into()),
Language::Cpp => Ok(tree_sitter_cpp::LANGUAGE.into()),
Language::Css => Ok(tree_sitter_css::LANGUAGE.into()),
Language::Go => Ok(tree_sitter_go::LANGUAGE.into()),
Language::Html => Ok(tree_sitter_html::LANGUAGE.into()),
Language::Java => Ok(tree_sitter_java::LANGUAGE.into()),
Language::JavaScript => Ok(tree_sitter_javascript::LANGUAGE.into()),
Language::Php => Ok(tree_sitter_php::LANGUAGE_PHP.into()),
Language::Python => Ok(tree_sitter_python::LANGUAGE.into()),
Language::Regex => Ok(tree_sitter_regex::LANGUAGE.into()),
Language::Ruby => Ok(tree_sitter_ruby::LANGUAGE.into()),
Language::Rust => Ok(tree_sitter_rust::LANGUAGE.into()),
Language::Toml => Ok(tree_sitter_toml_ng::LANGUAGE.into()),
Language::TypeScript => Ok(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()),
Language::Yaml => Ok(tree_sitter_yaml::LANGUAGE.into()),
/// Maps a free-form language hint to a supported [`Language`].
///
/// The hint is lowercased before matching, so the lookup is
/// case-insensitive. Besides the canonical names, common aliases and
/// extension-style spellings are accepted (for example `shell`, `c#`,
/// `c++`, `js`, `py`, `rs`, `ts`, `yml`); `starlark` is treated as Python.
///
/// Returns `None` when the hint does not name a supported language.
pub fn from_hint(hint: &str) -> Option<Self> {
    match hint.to_lowercase().as_str() {
        "bash" | "shell" => Some(Language::Bash),
        "c" => Some(Language::C),
        // `c_sharp` was accepted by the earlier `FromStr` implementation;
        // keep it so string parsing that delegates here stays
        // backward-compatible.
        "c#" | "csharp" | "c_sharp" => Some(Language::CSharp),
        "c++" | "cpp" => Some(Language::Cpp),
        "css" => Some(Language::Css),
        "go" => Some(Language::Go),
        "html" => Some(Language::Html),
        "java" => Some(Language::Java),
        "javascript" | "js" => Some(Language::JavaScript),
        "php" => Some(Language::Php),
        "python" | "py" | "starlark" => Some(Language::Python),
        "ruby" => Some(Language::Ruby),
        "rust" | "rs" => Some(Language::Rust),
        "toml" => Some(Language::Toml),
        "typescript" | "ts" => Some(Language::TypeScript),
        "yaml" | "yml" => Some(Language::Yaml),
        _ => None,
    }
}
}
impl FromStr for Language {
// type Err = Box<dyn Error>;
type Err = Box<dyn StdError + Send + Sync>;
type Err = String;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"bash" => Ok(Language::Bash),
"c" => Ok(Language::C),
"csharp" | "c_sharp" => Ok(Language::CSharp),
"cpp" => Ok(Language::Cpp),
"css" => Ok(Language::Css),
"go" => Ok(Language::Go),
"html" => Ok(Language::Html),
"java" => Ok(Language::Java),
"javascript" | "js" => Ok(Language::JavaScript),
"php" => Ok(Language::Php),
"python" | "py" => Ok(Language::Python),
"ruby" => Ok(Language::Ruby),
"rust" | "rs" => Ok(Language::Rust),
"toml" => Ok(Language::Toml),
"typescript" | "ts" => Ok(Language::TypeScript),
"yaml" | "yml" => Ok(Language::Yaml),
_ => Err(format!("Unknown language: {}", s).into()),
}
Self::from_hint(s).ok_or_else(|| format!("Unknown language: {s}"))
}
}
thread_local! {
static PARSER_CACHE: RefCell<Option<TreeSitterParser>> = RefCell::new(None);
}
#[derive(Debug, Deserialize)]
pub struct Checker {
pub language: Language,
pub rules: FxHashMap<String, String>,
}
impl Checker {
/// Rewrites `source` by replacing every `key` capture produced by the
/// checker's queries with an alternation that also accepts a base64-looking
/// token, and returns the rewritten text.
///
/// The source is parsed with the thread-local tree-sitter parser using the
/// grammar for `self.language`. For each query match, the text captured as
/// `boundary` (if any) is removed and the text captured as `key` is replaced
/// by the generated pattern.
///
/// # Errors
/// Returns an error when `source` is empty, when the grammar cannot be
/// loaded or set on the parser, when parsing fails, or when a rule does not
/// compile as a query.
pub fn modify_regex(&self, source: &[u8]) -> Result<String> {
if source.is_empty() {
return Err("Source code is empty".into());
}
let tree_sitter_language = self.language.get_ts_language()?;
PARSER_CACHE.with(|cache| {
// Lazily create the per-thread parser on first use.
let mut cache = cache.borrow_mut();
if cache.is_none() {
*cache = Some(TreeSitterParser::new());
}
let parser = cache.as_mut().unwrap();
parser
.set_language(&tree_sitter_language)
.map_err(|e| format!("Failed to set language '{}': {}", self.language.name(), e))?;
let tree = parser.parse(source, None).ok_or_else(|| {
format!("Failed to parse source for language '{}'", self.language.name())
})?;
// NOTE(review): the capture ranges below are byte offsets into `source`,
// but they are applied to this lossy UTF-8 copy. If `source` contains
// invalid UTF-8 the replacement characters may shift offsets — confirm
// callers only pass valid UTF-8.
let mut modified_regex = String::from_utf8_lossy(source).to_string();
for (_name, rule) in &self.rules {
let query = Query::new(&tree_sitter_language, rule)
.map_err(|e| format!("Failed to create query: {}", e))?;
// Store matches in a Vec so we can process them in reverse order
let mut matches = Vec::new();
let mut query_cursor = QueryCursor::new();
let mut cursor = query_cursor.matches(&query, tree.root_node(), source);
// Collect matches, converting them into owned data structures
while cursor.next().is_some() {
if let Some(m) = cursor.get() {
let captures: Vec<_> = m
.captures
.iter()
.map(|capture| {
let range = capture.node.byte_range();
let text = source[range.clone()].to_vec();
(capture.index, range, text)
})
.collect();
matches.push(captures);
}
}
// Process matches in reverse order to maintain correct byte offsets
for captures in matches.iter().rev() {
let mut boundary_text = None;
let mut boundary_range = None;
let mut key_text = None;
let mut key_range = None;
// Pick out the captures named `key` and `boundary`; any other
// capture name is ignored.
for (index, range, _) in captures {
let capture_name = query.capture_names()[*index as usize];
let captured_text = &source[range.clone()];
if capture_name == "key" {
key_text = Some(String::from_utf8_lossy(captured_text).to_string());
key_range = Some(range.clone());
} else if capture_name == "boundary" {
boundary_text =
Some(String::from_utf8_lossy(captured_text).to_string());
boundary_range = Some(range.clone());
}
}
if let Some(key_str) = key_text {
// Include the boundary text if available
let new_pattern = if let Some(boundary_str) = boundary_text {
format!(
r#"(?:
{}
{}
)
|
(?:
[A-Za-z0-9+/]{{16,64}}={{0,3}}
)"#,
key_str, boundary_str
)
} else {
format!(
r#"(?:
{}
)
|
(?:
[A-Za-z0-9+/]{{16,64}}={{0,3}}
)"#,
key_str
)
};
// NOTE(review): both ranges were computed against the original
// text. Removing the boundary first only keeps the key range valid
// if the boundary lies after the key — confirm the queries
// guarantee that ordering.
// Remove the `boundary` part if it exists
if let Some(range) = boundary_range {
modified_regex.replace_range(range, "");
}
// Replace the captured part with the new pattern
if let Some(range) = key_range {
modified_regex.replace_range(range, &new_pattern);
}
}
}
}
Ok(modified_regex)
})
}
pub fn check(&self, source: &[u8]) -> Result<Vec<MatchResult>> {
if source.is_empty() {
return Err("Source code is empty".into());
}
let tree_sitter_language = self.language.get_ts_language()?;
PARSER_CACHE.with(|cache| {
let mut cache = cache.borrow_mut();
if cache.is_none() {
*cache = Some(TreeSitterParser::new());
}
let parser = cache.as_mut().unwrap();
parser
.set_language(&tree_sitter_language)
.map_err(|e| format!("Failed to set language '{}': {}", self.language.name(), e))?;
let tree = parser.parse(source, None).ok_or_else(|| {
format!("Failed to parse source for language '{}'", self.language.name())
})?;
let mut all_matches = Vec::new();
for (_name, rule) in &self.rules {
let query = Query::new(&tree_sitter_language, rule)
.map_err(|e| format!("Failed to create query: {}", e))?;
let mut rule_matches = Vec::new();
QueryCursor::new().matches(&query, tree.root_node(), source).for_each(|m| {
let captures: Vec<_> = m.captures.iter().collect();
if captures.len() >= 2 {
let first_range = captures[0].node.range();
let second_range = captures[1].node.range();
let first_text = String::from_utf8_lossy(
&source[first_range.start_byte..first_range.end_byte],
);
let second_text = String::from_utf8_lossy(
&source[second_range.start_byte..second_range.end_byte],
);
let second_trimmed = second_text.trim();
let mut is_base64_decoded = is_base64(second_trimmed);
let (final_text, original_base64) = if is_base64_decoded {
if let Some(decoded) =
STANDARD.decode(second_trimmed).ok().and_then(|decoded| {
if decoded.is_ascii() && std::str::from_utf8(&decoded).is_ok() {
Some(String::from_utf8_lossy(&decoded).to_string())
} else {
is_base64_decoded = false;
None
}
})
{
(
format!("{} = {}", first_text.trim(), decoded),
Some(second_trimmed.to_string()),
)
} else {
(format!("{} = {}", first_text.trim(), second_trimmed), None)
}
} else {
(format!("{} = {}", first_text.trim(), second_trimmed), None)
};
rule_matches.push(MatchResult {
range: first_range.start_byte..second_range.end_byte,
text: final_text,
is_base64_decoded,
original_base64,
});
}
});
all_matches.extend(rule_matches);
}
Ok(all_matches)
})
/// Feeds every context candidate extracted from `source` to `sink`.
///
/// CSS and HTML are handled by their dedicated extractors; every other
/// language goes through the shared lexer. `sink` returns `true` to keep
/// receiving candidates and `false` to ask the extractor to stop.
pub fn stream_context_candidates<F>(source: &[u8], language: &Language, mut sink: F) -> Result<()>
where
    F: FnMut(&str) -> bool,
{
    let sink = &mut sink;
    if matches!(language, Language::Css) {
        return css::stream_context_candidates(source, sink);
    }
    if matches!(language, Language::Html) {
        return html::stream_context_candidates(source, sink);
    }
    lexer::stream_context_candidates(source, language, sink)
}
/// Reports whether `re`, applied to any context candidate extracted from
/// `source`, captures exactly `expected_secret`.
///
/// Candidates are streamed one at a time and streaming is asked to stop as
/// soon as one of them verifies.
///
/// # Errors
/// Propagates any error returned by the candidate extractor.
pub fn verify_match_in_context(
    source: &[u8],
    language: &Language,
    re: &Regex,
    expected_secret: &[u8],
) -> Result<bool> {
    let mut verified = false;
    stream_context_candidates(source, language, |text| {
        // Latch the result: an extractor may still invoke the sink after it
        // was asked to stop, and a later non-matching candidate must not
        // erase an earlier successful verification.
        verified =
            verified || verify_match_in_context_text(re, expected_secret, text.as_bytes());
        !verified
    })?;
    Ok(verified)
}
/// Returns `true` when at least one match of `re` in `text` yields a secret
/// capture whose bytes equal `expected_secret`.
fn verify_match_in_context_text(re: &Regex, expected_secret: &[u8], text: &[u8]) -> bool {
    use kingfisher_scanner::primitives::find_secret_capture;

    for captures in re.captures_iter(text) {
        if find_secret_capture(re, &captures).as_bytes() == expected_secret {
            return true;
        }
    }
    false
}
#[cfg(test)]
mod tests {
use super::*;
use std::{collections::BTreeMap, fs, path::PathBuf};
use super::*;
fn fixture_cases() -> Vec<(Language, &'static str)> {
vec![
(Language::Bash, "testdata/shell_vulnerable.sh"),
(Language::C, "testdata/c_vulnerable.c"),
(Language::CSharp, "testdata/csharp_vulnerable.cs"),
(Language::Cpp, "testdata/cpp_vulnerable.cpp"),
(Language::Css, "testdata/css_vulnerable.css"),
(Language::Go, "testdata/go_vulnerable.go"),
(Language::Html, "testdata/html_embedded_vulnerable.html"),
(Language::Html, "testdata/html_vulnerable.html"),
(Language::Java, "testdata/java_vulnerable.java"),
(Language::JavaScript, "testdata/javascript_vulnerable.js"),
(Language::Php, "testdata/php_vulnerable.php"),
(Language::Python, "testdata/parsers/comment_only_context.py"),
(Language::Python, "testdata/python_vulnerable.py"),
(Language::Ruby, "testdata/ruby_vulnerable.rb"),
(Language::Rust, "testdata/rust_vulnerable.rs"),
@ -331,162 +143,106 @@ mod tests {
]
}
fn build_checker(language: &Language) -> Checker {
Checker {
language: language.clone(),
rules: match language {
Language::Bash => queries::bash::get_bash_queries(),
Language::C => queries::c::get_c_queries(),
Language::CSharp => queries::csharp::get_csharp_queries(),
Language::Cpp => queries::cpp::get_cpp_queries(),
Language::Css => queries::css::get_css_queries(),
Language::Go => queries::go::get_go_queries(),
Language::Html => queries::html::get_html_queries(),
Language::Java => queries::java::get_java_queries(),
Language::JavaScript => queries::javascript::get_javascript_queries(),
Language::Php => queries::php::get_php_queries(),
Language::Python => queries::python::get_python_queries(),
Language::Regex => queries::regex::get_regex_queries(),
Language::Ruby => queries::ruby::get_ruby_queries(),
Language::Rust => queries::rust::get_rust_queries(),
Language::Toml => queries::toml::get_toml_queries(),
Language::TypeScript => queries::typescript::get_typescript_queries(),
Language::Yaml => queries::yaml::get_yaml_queries(),
},
}
}
fn current_capture_counts(
fn current_capture_texts(
root: &PathBuf,
cases: &[(Language, &'static str)],
) -> BTreeMap<String, usize> {
) -> BTreeMap<String, Vec<String>> {
let mut current = BTreeMap::new();
for (language, rel_path) in cases {
let file_path = root.join(rel_path);
let source = fs::read(&file_path)
.unwrap_or_else(|e| panic!("failed to read fixture {}: {e}", file_path.display()));
let checker = build_checker(language);
let count = checker
.check(&source)
.unwrap_or_else(|e| panic!("checker failed for {}: {e}", rel_path))
.len();
current.insert(format!("{}:{}", language.name(), rel_path), count);
let mut texts = Vec::new();
stream_context_candidates(&source, language, |text| {
texts.push(text.to_string());
true
})
.unwrap_or_else(|e| panic!("context verifier failed for {}: {e}", rel_path));
current.insert(format!("{}:{}", language.name(), rel_path), texts);
}
current
}
/// The golden file records the minimum set of candidates each fixture must
/// produce. The current output must be a **superset** of the golden set
/// (every golden candidate must still appear), but new candidates are
/// allowed — this lets us improve extraction without regenerating the
/// golden file every time.
#[test]
fn queries_compile_for_supported_languages() {
    // Every bundled query must compile against the grammar of its language.
    let rule_sets = vec![
        (Language::Bash, queries::bash::get_bash_queries()),
        (Language::C, queries::c::get_c_queries()),
        (Language::CSharp, queries::csharp::get_csharp_queries()),
        (Language::Cpp, queries::cpp::get_cpp_queries()),
        (Language::Css, queries::css::get_css_queries()),
        (Language::Go, queries::go::get_go_queries()),
        (Language::Html, queries::html::get_html_queries()),
        (Language::Java, queries::java::get_java_queries()),
        (Language::JavaScript, queries::javascript::get_javascript_queries()),
        (Language::Php, queries::php::get_php_queries()),
        (Language::Python, queries::python::get_python_queries()),
        (Language::Regex, queries::regex::get_regex_queries()),
        (Language::Ruby, queries::ruby::get_ruby_queries()),
        (Language::Rust, queries::rust::get_rust_queries()),
        (Language::Toml, queries::toml::get_toml_queries()),
        (Language::TypeScript, queries::typescript::get_typescript_queries()),
        (Language::Yaml, queries::yaml::get_yaml_queries()),
    ];

    for (language, rule_set) in rule_sets {
        let ts_language = match language.get_ts_language() {
            Ok(loaded) => loaded,
            Err(e) => panic!("failed to load language {}: {e}", language.name()),
        };
        for (name, query) in rule_set {
            if let Err(e) = Query::new(&ts_language, &query) {
                panic!("query '{name}' failed for language {}: {e}", language.name());
            }
        }
    }
}
#[test]
fn tree_sitter_capture_counts_do_not_regress() {
fn context_verifier_outputs_are_superset_of_golden() {
let root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
let baseline_path = root.join("testdata/parsers/tree_sitter_capture_baseline.json");
let baseline_path = root.join("testdata/parsers/context_verifier_golden.json");
let cases = fixture_cases();
let current = current_capture_counts(&root, &cases);
let current = current_capture_texts(&root, &cases);
if std::env::var("UPDATE_TREE_SITTER_CAPTURE_BASELINE").as_deref() == Ok("1") {
if std::env::var("UPDATE_CONTEXT_VERIFIER_GOLDEN").as_deref() == Ok("1") {
let payload = serde_json::to_string_pretty(&current)
.unwrap_or_else(|e| panic!("failed to serialize baseline: {e}"));
.unwrap_or_else(|e| panic!("failed to serialize golden output: {e}"));
fs::write(&baseline_path, format!("{payload}\n")).unwrap_or_else(|e| {
panic!("failed to write baseline {}: {e}", baseline_path.display())
panic!("failed to write golden output {}: {e}", baseline_path.display())
});
return;
}
let baseline_raw = fs::read_to_string(&baseline_path).unwrap_or_else(|e| {
panic!(
"failed to read baseline {}: {e}. Run with UPDATE_TREE_SITTER_CAPTURE_BASELINE=1",
"failed to read golden output {}: {e}. Run with UPDATE_CONTEXT_VERIFIER_GOLDEN=1",
baseline_path.display()
)
});
let baseline: BTreeMap<String, usize> = serde_json::from_str(&baseline_raw)
.unwrap_or_else(|e| panic!("invalid baseline JSON {}: {e}", baseline_path.display()));
let baseline: BTreeMap<String, Vec<String>> = serde_json::from_str(&baseline_raw)
.unwrap_or_else(|e| panic!("invalid golden JSON {}: {e}", baseline_path.display()));
let mut regressions = Vec::new();
for (key, actual) in &current {
let expected = baseline.get(key).unwrap_or_else(|| {
panic!(
"missing baseline entry for {key}. Run with UPDATE_TREE_SITTER_CAPTURE_BASELINE=1"
)
for (key, expected_texts) in &baseline {
let actual_texts = current.get(key).unwrap_or_else(|| {
panic!("missing fixture key {key}. Run with UPDATE_CONTEXT_VERIFIER_GOLDEN=1")
});
if actual < expected {
regressions.push(format!("{key}: expected >= {expected}, got {actual}"));
for expected in expected_texts {
if !actual_texts.contains(expected) {
regressions.push(format!(" {key}: missing candidate: {expected:?}"));
}
}
}
assert!(
regressions.is_empty(),
"tree-sitter capture regression(s):\n{}",
"context verifier regression(s) — golden candidates no longer emitted:\n{}",
regressions.join("\n")
);
}
#[test]
fn report_tree_sitter_capture_count_deltas() {
fn html_embedded_context_extracts_script_and_style_candidates() {
let root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
let baseline_path = root.join("testdata/parsers/tree_sitter_capture_baseline.json");
let cases = fixture_cases();
let current = current_capture_counts(&root, &cases);
let source = fs::read(root.join("testdata/html_embedded_vulnerable.html")).unwrap();
let mut texts = Vec::new();
stream_context_candidates(&source, &Language::Html, |text| {
texts.push(text.to_string());
true
})
.unwrap();
let baseline_raw = match fs::read_to_string(&baseline_path) {
Ok(data) => data,
Err(e) => {
println!(
"capture-delta report unavailable: cannot read baseline {}: {e}",
baseline_path.display()
);
return;
}
};
assert!(
texts.iter().any(|text| text.contains("auth0_client_secret =")),
"expected script extraction to emit auth0_client_secret candidate"
);
assert!(
texts.iter().any(|text| text.contains("content =")),
"expected style extraction to emit CSS content candidate"
);
}
let baseline: BTreeMap<String, usize> = match serde_json::from_str(&baseline_raw) {
Ok(v) => v,
Err(e) => {
println!(
"capture-delta report unavailable: invalid baseline JSON {}: {e}",
baseline_path.display()
);
return;
}
};
println!("tree-sitter capture delta report (current vs baseline):");
for (key, actual) in &current {
let expected = baseline.get(key).copied().unwrap_or(0);
let delta = (*actual as isize) - (expected as isize);
println!(" {key}: current={actual}, baseline={expected}, delta={delta:+}");
}
#[test]
fn comment_only_python_context_is_ignored() {
    // The fixture consists solely of comment lines, so the extractor must
    // not emit a single candidate for it.
    let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("testdata/parsers/comment_only_context.py");
    let source = fs::read(fixture).unwrap();

    let mut emitted: Vec<String> = Vec::new();
    stream_context_candidates(&source, &Language::Python, |candidate| {
        emitted.push(candidate.to_string());
        true
    })
    .unwrap();

    assert!(emitted.is_empty());
}
}

173
src/parser/css.rs Normal file
View file

@ -0,0 +1,173 @@
use anyhow::Result;
use cssparser::{
parse_important, AtRuleParser, CowRcStr, DeclarationParser, ParseError, Parser, ParserInput,
ParserState, RuleBodyItemParser, RuleBodyParser, StyleSheetParser, ToCss, Token,
};
/// Streams a `name = value` candidate to `sink` for each value found in the
/// declarations of a CSS stylesheet.
///
/// `source` is decoded as UTF-8 lossily; blank input produces no candidates.
/// `sink` returns `true` to keep receiving candidates and `false` to stop.
/// Once it has asked to stop, parsing of the remaining top-level rules is
/// abandoned instead of walking the rest of the stylesheet for nothing.
pub(super) fn stream_context_candidates<F>(source: &[u8], sink: &mut F) -> Result<()>
where
    F: FnMut(&str) -> bool,
{
    let css = String::from_utf8_lossy(source);
    if css.trim().is_empty() {
        return Ok(());
    }

    let mut input = ParserInput::new(&css);
    let mut parser = Parser::new(&mut input);
    let mut collector = Collector { sink, stopped: false };

    // Individual rule results are irrelevant (candidates are emitted as a side
    // effect); only the collector's stop flag decides whether to continue.
    let mut rules = StyleSheetParser::new(&mut parser, &mut collector);
    while rules.next().is_some() {
        if rules.parser.stopped {
            break;
        }
    }
    Ok(())
}
/// cssparser callback object that forwards declaration values to a
/// caller-supplied sink.
struct Collector<'a, F> {
/// Callback receiving each `name = value` candidate; it returns `false` to
/// request that no further candidates be delivered.
sink: &'a mut F,
/// Set once the sink has returned `false`; later emissions are dropped.
stopped: bool,
}
impl<'a, F> Collector<'a, F>
where
    F: FnMut(&str) -> bool,
{
    /// Sends `name = value` to the sink unless a stop was already requested,
    /// and records whether the sink wants to stop from now on.
    fn emit(&mut self, name: &str, value: &str) {
        if !self.stopped {
            let keep_going = (self.sink)(&format!("{name} = {value}"));
            self.stopped = !keep_going;
        }
    }
}
impl<'i, F> DeclarationParser<'i> for Collector<'_, F>
where
    F: FnMut(&str) -> bool,
{
    type Declaration = ();
    type Error = ();

    /// Walks the value of one declaration and emits a `name = value`
    /// candidate for every value token collected from it.
    ///
    /// A trailing `!important` ends the value. If the declaration consists of
    /// nothing but `!important`, the single value `important` is emitted.
    /// Any other `!` (for example the `!ie` hack, or `!important` followed by
    /// more tokens) is skipped like any other delimiter.
    fn parse_value<'t>(
        &mut self,
        name: CowRcStr<'i>,
        input: &mut Parser<'i, 't>,
        _declaration_start: &ParserState,
    ) -> Result<(), ParseError<'i, ()>> {
        let mut values = Vec::new();
        let mut important = false;

        loop {
            let before_token = input.state();
            let token = match input.next_including_whitespace().cloned() {
                Ok(token) => token,
                Err(_) => break,
            };

            if token == Token::Delim('!') {
                // Remember the position just past the `!` so the scan can
                // resume there if this is not a trailing `!important`.
                let after_bang = input.state();
                input.reset(&before_token);
                if parse_important(input).is_ok() && input.is_exhausted() {
                    important = true;
                    break;
                }
                // Rewinding to `before_token` here would make the loop read
                // the same `!` again and never terminate.
                input.reset(&after_bang);
            }

            collect_token_values(token, input, &mut values);
        }

        if values.is_empty() {
            if !important {
                return Ok(());
            }
            values.push("important".to_string());
        }

        for value in values {
            self.emit(&name, &value);
            if self.stopped {
                break;
            }
        }
        Ok(())
    }
}
// No methods are overridden here, so at-rules are handled by cssparser's
// default `AtRuleParser` implementations.
// NOTE(review): those defaults presumably reject every at-rule, which would
// mean declarations nested inside e.g. `@media` blocks are never visited —
// confirm against the cssparser documentation.
impl<'i, F> AtRuleParser<'i> for Collector<'_, F>
where
F: FnMut(&str) -> bool,
{
type Prelude = ();
type AtRule = ();
type Error = ();
}
impl<'i, F> cssparser::QualifiedRuleParser<'i> for Collector<'_, F>
where
    F: FnMut(&str) -> bool,
{
    type Prelude = ();
    type QualifiedRule = ();
    type Error = ();

    /// Consumes the whole prelude (the selector) without inspecting it; only
    /// the declarations in the rule body are of interest.
    fn parse_prelude<'t>(&mut self, input: &mut Parser<'i, 't>) -> Result<(), ParseError<'i, ()>> {
        loop {
            if input.next_including_whitespace().is_err() {
                return Ok(());
            }
        }
    }

    /// Runs the rule-body parser over the block, using this collector for
    /// every item in it. Per-item results are discarded.
    fn parse_block<'t>(
        &mut self,
        _prelude: (),
        _start: &ParserState,
        input: &mut Parser<'i, 't>,
    ) -> Result<(), ParseError<'i, ()>> {
        RuleBodyParser::new(input, self).for_each(|_| {});
        Ok(())
    }
}
// Tells cssparser's rule-body parser to attempt both kinds of body items:
// nested qualified rules and plain declarations.
impl<F> RuleBodyItemParser<'_, (), ()> for Collector<'_, F>
where
F: FnMut(&str) -> bool,
{
// Nested qualified rules inside a block are parsed.
fn parse_qualified(&self) -> bool {
true
}
// Declarations inside a block are parsed.
fn parse_declarations(&self) -> bool {
true
}
}
/// Appends the textual value carried by `token` to `values` and, when the
/// token opens a nested block (a function call or a bracketed group), walks
/// that block recursively so its tokens are collected as well.
///
/// Strings, unquoted URLs, identifiers and hashes contribute their inner
/// text; numbers, percentages, dimensions and function heads contribute
/// their CSS serialization. All other tokens contribute nothing.
fn collect_token_values<'i, 't>(
    token: Token<'i>,
    input: &mut Parser<'i, 't>,
    values: &mut Vec<String>,
) {
    let opens_nested_block = matches!(
        token,
        Token::Function(_)
            | Token::ParenthesisBlock
            | Token::SquareBracketBlock
            | Token::CurlyBracketBlock
    );

    match &token {
        Token::QuotedString(text)
        | Token::UnquotedUrl(text)
        | Token::Ident(text)
        | Token::Hash(text)
        | Token::IDHash(text) => values.push(text.to_string()),
        Token::Number { .. }
        | Token::Percentage { .. }
        | Token::Dimension { .. }
        | Token::Function(_) => values.push(token.to_css_string()),
        _ => {}
    }

    if opens_nested_block {
        // Errors from the nested walk are deliberately ignored: extraction is
        // best-effort.
        let _ = input.parse_nested_block(|nested| {
            while let Ok(inner) = nested.next_including_whitespace().cloned() {
                collect_token_values(inner, nested, values);
            }
            Ok::<(), ParseError<'i, ()>>(())
        });
    }
}

67
src/parser/html.rs Normal file
View file

@ -0,0 +1,67 @@
use anyhow::Result;
use tl::{Node, ParserOptions};
use super::{css, lexer, Language};
/// Streams context candidates extracted from an HTML document to `sink`.
///
/// For every tag this emits one `attribute = value` candidate per attribute
/// that has a value, then handles the tag's trimmed inner text:
/// * `<script>`: emits `<script> = text` (when non-empty) and runs the
///   JavaScript lexer over the text;
/// * `<style>`: runs the CSS extractor over the text (when non-empty);
/// * any other tag: emits `tag = text` (when non-empty).
///
/// `source` is decoded as UTF-8 lossily. Blank input, or input the HTML
/// parser rejects, produces no candidates and is not an error.
///
/// `sink` returns `false` to stop. The request is honoured wherever it is
/// raised, including inside the nested script/style extraction, and `sink`
/// is never called again afterwards.
///
/// # Errors
/// Propagates errors from the nested lexer and CSS extractors.
pub(super) fn stream_context_candidates<F>(source: &[u8], sink: &mut F) -> Result<()>
where
    F: FnMut(&str) -> bool,
{
    let html = String::from_utf8_lossy(source);
    if html.trim().is_empty() {
        return Ok(());
    }
    let dom = match tl::parse(&html, ParserOptions::default()) {
        Ok(dom) => dom,
        Err(_) => return Ok(()),
    };
    let parser = dom.parser();

    // The nested extractors report only success or failure, not whether the
    // sink asked to stop. Route every candidate through a wrapper that
    // remembers the stop request so it can be observed after they return.
    let stopped = std::cell::Cell::new(false);
    let mut guarded = |candidate: &str| -> bool {
        if stopped.get() {
            return false;
        }
        let keep_going = sink(candidate);
        stopped.set(!keep_going);
        keep_going
    };

    for node in dom.nodes() {
        let Some(tag) = node.as_tag() else {
            continue;
        };
        let tag_name = tag.name().as_utf8_str().to_string();

        for (key, value) in tag.attributes().iter() {
            let Some(value) = value else {
                continue;
            };
            if !guarded(&format!("{key} = {value}")) {
                return Ok(());
            }
        }

        let inner_text = tag.inner_text(parser).trim().to_string();
        match tag_name.as_str() {
            "script" => {
                if !inner_text.is_empty() && !guarded(&format!("<script> = {inner_text}")) {
                    return Ok(());
                }
                lexer::stream_context_candidates(
                    inner_text.as_bytes(),
                    &Language::JavaScript,
                    &mut guarded,
                )?;
            }
            "style" => {
                if !inner_text.is_empty() {
                    css::stream_context_candidates(inner_text.as_bytes(), &mut guarded)?;
                }
            }
            _ => {
                // `node` is already known to be a tag at this point, so the
                // comment check is a defensive no-op.
                if !inner_text.is_empty()
                    && !matches!(node, Node::Comment(_))
                    && !guarded(&format!("{tag_name} = {inner_text}"))
                {
                    return Ok(());
                }
            }
        }

        // A stop raised inside the nested script/style extraction surfaces here.
        if stopped.get() {
            return Ok(());
        }
    }
    Ok(())
}

1276
src/parser/lexer.rs Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -6,7 +6,7 @@ use crate::{
blob::{Blob, BlobMetadata},
content_type::ContentInspector,
location::LocationMapping,
matcher::{should_attempt_tree_sitter, Match, Matcher, OwnedBlobMatch, ScanResult},
matcher::{should_attempt_context_verification, Match, Matcher, OwnedBlobMatch, ScanResult},
origin::{Origin, OriginSet},
scanner::repos::DatastoreMessage,
Path,
@ -32,7 +32,7 @@ impl<'a> BlobProcessor<'a> {
) -> Result<Option<DatastoreMessage>> {
let _span = debug_span!("matcher", temp_id = blob.temp_id()).entered();
let t1 = Instant::now();
let language_hint = if fast_mode || !should_attempt_tree_sitter(blob.len()) {
let language_hint = if fast_mode || !should_attempt_context_verification(blob.len()) {
None
} else {
origin

8
testdata/css_vulnerable.css vendored Normal file
View file

@ -0,0 +1,8 @@
.banner {
password: "blink182";
background-image: url("all-along-the-watchtower");
}
.secret-key {
content: "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234";
}

View file

@ -0,0 +1,2 @@
# auth0 token abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234
# password = "superSecret123"

View file

@ -61,7 +61,7 @@ fn skips_base64_when_disabled() -> anyhow::Result<()> {
Ok(())
}
// Ensure disabling Base64 decoding does not trigger tree-sitter errors on empty files
// Ensure disabling Base64 decoding does not trigger context verifier errors on empty files
#[test]
fn no_base64_skips_empty_files() -> anyhow::Result<()> {
let dir = tempdir()?;
@ -87,9 +87,9 @@ fn no_base64_skips_empty_files() -> anyhow::Result<()> {
Ok(())
}
// Ensure tree-sitter based decoding works even when the standalone base64 scanner is disabled
// Ensure parser-based context extraction still surfaces base64-looking code assignments
#[test]
fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> {
fn detects_base64_in_code_with_context_verifier() -> anyhow::Result<()> {
let dir = tempdir()?;
let file_path = dir.path().join("secret.py");
// Base64 for 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

View file

@ -0,0 +1,83 @@
use std::{fs, process::Command};
use anyhow::{Context, Result};
use serde_json::{Deserializer, Value};
/// Scans `testdata` with the built `kingfisher` binary and checks that the
/// reported `(rule_id, snippet)` pairs equal the recorded baseline in
/// `testdata/parsers/scan_findings_baseline.json`.
///
/// Findings whose path starts with `testdata/parsers/` are left out of the
/// actual set. Baseline entries with a missing or empty snippet are left out
/// of the expected set.
#[test]
fn scan_findings_match_pre_removal_baseline() -> Result<()> {
    let output = Command::new(assert_cmd::cargo::cargo_bin!("kingfisher"))
        .args(["scan", "testdata", "--format", "json", "--no-validate", "--no-update-check"])
        .output()
        .context("run kingfisher scan against testdata")?;

    // `code()` is `None` when the process was terminated by a signal. Keep
    // that case distinct from a clean exit instead of defaulting it to 0.
    let code = output.status.code();
    assert!(
        matches!(code, Some(0 | 200)),
        "expected exit code 0 or 200, got {code:?}. stderr:\n{}",
        String::from_utf8_lossy(&output.stderr)
    );

    let stdout = String::from_utf8(output.stdout).context("scan stdout is not valid utf-8")?;
    // Only the first JSON value on stdout is considered.
    let mut stream = Deserializer::from_str(&stdout).into_iter::<Value>();
    let value = stream
        .next()
        .transpose()
        .context("parse scan json output")?
        .context("scan output did not contain a json object")?;
    let findings = value
        .get("findings")
        .and_then(Value::as_array)
        .context("scan output missing findings array")?;

    let mut actual = findings
        .iter()
        .filter(|finding| {
            finding
                .get("finding")
                .and_then(Value::as_object)
                .and_then(|data| data.get("path"))
                .and_then(Value::as_str)
                .map(|path| !path.starts_with("testdata/parsers/"))
                .unwrap_or(true)
        })
        .map(|finding| {
            let rule = finding.get("rule").and_then(Value::as_object).cloned().unwrap_or_default();
            serde_json::json!({
                "rule_id": rule.get("id").and_then(Value::as_str),
                "snippet": finding
                    .get("finding")
                    .and_then(Value::as_object)
                    .and_then(|data| data.get("snippet"))
                    .and_then(Value::as_str),
            })
        })
        .collect::<Vec<_>>();
    // Order by serialized form; the cached-key sort serializes each value
    // once instead of on every comparison.
    actual.sort_by_cached_key(|entry| entry.to_string());

    let mut expected = serde_json::from_str::<Vec<Value>>(
        &fs::read_to_string("testdata/parsers/scan_findings_baseline.json")
            .context("read scan findings baseline")?,
    )
    .context("parse scan findings baseline json")?
    .into_iter()
    .filter(|finding| finding.get("snippet").and_then(Value::as_str).is_some())
    .map(|finding| {
        serde_json::json!({
            "rule_id": finding.get("rule_id").and_then(Value::as_str),
            "snippet": finding.get("snippet").and_then(Value::as_str),
        })
    })
    .filter(|finding| {
        finding
            .get("snippet")
            .and_then(Value::as_str)
            .map(|snippet| !snippet.is_empty())
            .unwrap_or(true)
    })
    .collect::<Vec<_>>();
    expected.sort_by_cached_key(|entry| entry.to_string());

    assert_eq!(actual, expected);
    Ok(())
}