performance improvements and rule improvements

This commit is contained in:
Mick Grove 2026-04-19 14:50:11 -07:00
commit a13b175fc5
14 changed files with 464 additions and 371 deletions

View file

@ -3,6 +3,7 @@
All notable changes to this project will be documented in this file.
## [v1.96.0]
- Added archive extraction for three Korean formats: HWPX (Hancom OWPML ZIP container), HWP (Hancom 5.x OLE2/CFBF binary — streams decoded via raw DEFLATE / zlib fallbacks), and EGG (ALZip; registered for enumeration and scanned as raw bytes since no open-source extractor exists).
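- Added live HTTP validation for 17 rules across 14 providers: Val Town, Polar, Thunderstore, Elastic Cloud (2 rules), LlamaCloud, Gemfury (2 rules), Vonage, ThingsBoard, Zapier, Facebook Access Token, GitLab Session Cookie, PostHog Feature Flags, Unkey API Key, and Hop.io (2 rules). hCaptcha validation is intentionally not included: its siteverify endpoint cannot distinguish active from inactive secrets without a genuine client response token.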
- Added revocation support for 7 rules across 6 providers: Discord webhooks (single-step DELETE), DigitalOcean PATs (self-revoke via OAuth), and multi-step HttpMultiStep revocation for LaunchDarkly, Resend, Linode, and Netlify (2 rules). Built-in revocation coverage is now 34 provider families with 53 revocation-enabled rules.
- Expanded Alibaba Cloud coverage with STS temporary credential detection for STS access key IDs, STS security tokens, and STS access key secrets. Built-in rule coverage is now 923 rules total.

12
Cargo.lock generated
View file

@ -1473,6 +1473,17 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
[[package]]
name = "cfb"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a347dcabdae9c31b0825fd6a8bed285ec9c2acb89c47827126d52fa4f59cece3"
dependencies = [
"fnv",
"uuid",
"web-time",
]
[[package]]
name = "cfg-if"
version = "1.0.4"
@ -5037,6 +5048,7 @@ dependencies = [
"bstr",
"bytes",
"bzip2-rs",
"cfb",
"chrono",
"clap",
"color-backtrace",

View file

@ -171,6 +171,7 @@ zip = { version = "8.5.0", default-features = false, features = ["deflate", "def
tar = "0.4.44"
lzma-rs = "0.3.0"
asar = "0.3.0"
cfb = "0.14"
rusqlite = { version = "0.39", features = ["bundled"] }
blake3 = "1.8.2"
memchr = "2.7"

View file

@ -56,7 +56,7 @@ Kingfisher is a high-performance, open source secret detection tool for source c
- **Revocation support matrix**: current built-in revocation coverage across providers and rule IDs ([docs/REVOCATION_PROVIDERS.md](/docs/REVOCATION_PROVIDERS.md))
- **Blast Radius Mapping**: instantly map leaked keys to their effective cloud identities and exposed resources with `--access-map`. Supports 39 providers (see table below).
- **Broad AI SaaS coverage**: finds and validates tokens for OpenAI, Anthropic, Google Gemini, Cohere, AWS Bedrock, Voyage AI, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and many more
- **Compressed Files**: Supports extracting and scanning compressed files for secrets
- **Compressed Files**: Supports extracting and scanning compressed files for secrets, including `tar.gz`/`bz2`/`xz`, ZIP-family containers (`zip`, `jar`, `docx`, `xlsx`, `pptx`, `odt`, `epub`, `hwpx`, and more), `asar`, HWP (Hancom OLE2/CFBF binary with DEFLATE/zlib stream decoding), and EGG (ALZip; raw-byte scanning)
- **SQLite Database Scanning**: Automatically extracts and scans SQLite database contents for secrets stored in table rows
- **Python Bytecode (.pyc) Scanning**: Extracts and scans string constants from compiled Python (`.pyc`, `.pyo`) files
- **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md))

View file

@ -22,21 +22,8 @@ rules:
- 'hcaptcha_secret: ES_abcdef1234567890abcdef1234567890'
references:
- https://docs.hcaptcha.com/
validation:
type: Http
content:
request:
method: POST
url: https://api.hcaptcha.com/siteverify
headers:
Content-Type: application/x-www-form-urlencoded
body: "secret={{ TOKEN }}&response=invalid_response&sitekey=00000000-0000-0000-0000-000000000000"
response_matcher:
- report_response: true
- type: WordMatch
words:
- '"success":'
- type: WordMatch
words:
- "invalid-input-secret"
negative: true
- https://docs.hcaptcha.com/#verify-the-user-response-server-side
# No standalone validation is added: hCaptcha's siteverify endpoint returns the
# same invalid-input-response / missing-input-response errors for both real and
# mutated secrets when no genuine client response token is available, so it is
# not reliable for distinguishing active from inactive secrets during scanning.

View file

@ -3,20 +3,19 @@ rules:
id: kingfisher.hetzner.1
pattern: |
(?xi)
\b(?:hetzner|hcloud)
\b
(?:hetzner|hcloud)
(?:.|[\n\r]){0,48}?
(?:API[_-]?TOKEN|TOKEN|SECRET|KEY)
(?:.|[\n\r]){0,24}?
\b
(
[A-Za-z0-9]{64}
[A-Z0-9]{64}
)
\b
pattern_requirements:
min_digits: 3
min_uppercase: 3
min_lowercase: 3
min_entropy: 4.0
min_entropy: 3.0
confidence: medium
examples:
- 'HETZNER_API_TOKEN=CqM049yakVZO8EndHyyawia3EjIboWMVeoEeMW0UVN0SpTXryKh0zrtnGpeyAjTs'
@ -37,3 +36,6 @@ rules:
- type: StatusMatch
status: [200]
- type: JsonValid
# Revocation not added: Hetzner documents API token management through the
# Console project Security > API tokens UI, and the public Cloud API
# reference does not expose a token deletion/revocation endpoint.

View file

@ -13,7 +13,6 @@ rules:
min_uppercase: 4
min_entropy: 3.0
confidence: medium
categories: [api, key]
examples:
- 'AWS_ACCESS_KEY_ID=LSIAQAAAAAAVNCBMPN59'
- 'aws_access_key=LKIAQAAAAAAVNCBMPN59'

View file

@ -27,9 +27,7 @@ pub use rule::{
pub use rules::{Rules, RulesError};
// Re-export RulesDatabase
pub use rules_database::{
RuleDetectionProfileKind, RuleMatchProfile, RulesDatabase, format_regex_pattern,
};
pub use rules_database::{RulesDatabase, format_regex_pattern};
// Re-export defaults
pub use defaults::get_builtin_rules;

View file

@ -7,23 +7,11 @@ use vectorscan_rs::{BlockDatabase, Flag, Pattern};
use crate::rule::{RULE_COMMENTS_PATTERN, Rule};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RuleDetectionProfileKind {
SelfIdentifying,
ContextDependent,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RuleMatchProfile {
pub kind: RuleDetectionProfileKind,
pub reason_codes: Vec<&'static str>,
}
pub struct RulesDatabase {
// pub(crate) rules: Vec<Rule,>,
pub(crate) rules: Vec<Arc<Rule>>,
pub(crate) anchored_regexes: Vec<Regex>,
pub(crate) rule_match_profiles: Vec<RuleMatchProfile>,
pub(crate) self_identifying_flags: Vec<bool>,
pub(crate) vsdb: BlockDatabase,
}
@ -40,86 +28,12 @@ pub fn format_regex_pattern(pattern: &str) -> String {
}
impl RulesDatabase {
fn classify_rule_profile(rule: &Rule) -> RuleMatchProfile {
Self::classify_rule_match_profile(rule)
}
fn build_rule_profiles(rules: &[Arc<Rule>]) -> Vec<RuleMatchProfile> {
rules.iter().map(|r| Self::classify_rule_profile(r.as_ref())).collect()
}
/// Looks up the anchored regex that belongs to the rule with id `rule_id`.
///
/// The rule's position in `self.rules` is used as the index into
/// `self.anchored_regexes`. Returns `None` when no rule has that id, or
/// when there is no regex stored at the rule's position.
pub fn get_regex_by_rule_id(&self, rule_id: &str) -> Option<&Regex> {
    let index = self.rules.iter().position(|rule| rule.syntax().id == rule_id)?;
    self.anchored_regexes.get(index)
}
pub fn classify_rule_match_profile(rule: &Rule) -> RuleMatchProfile {
let flattened = format_regex_pattern(&rule.syntax().pattern);
let normalized = flattened.to_lowercase();
let mut reason_codes: Vec<&'static str> = Vec::new();
let has_self_identifying_prefix = has_self_identifying_shape(&normalized);
if has_self_identifying_prefix {
reason_codes.push("self_identifying_prefix");
return RuleMatchProfile {
kind: RuleDetectionProfileKind::SelfIdentifying,
reason_codes,
};
}
let has_context_keywords =
["secret", "token", "key", "password", "private", "access", "client", "auth"]
.iter()
.any(|m| normalized.contains(m));
if has_context_keywords {
reason_codes.push("context_keywords");
}
let has_distance_operator = normalized.contains("(?:.|[\\n\\r]){0,");
if has_distance_operator {
reason_codes.push("distance_operator");
}
let has_depends_on = !rule.syntax().depends_on_rule.is_empty();
if has_depends_on {
reason_codes.push("depends_on_rule");
}
let max_quantifier = max_quantifier_min_value(&normalized);
let looks_generic_token = has_generic_token_class(&normalized) && max_quantifier >= 24;
if looks_generic_token {
reason_codes.push("generic_token_shape");
}
let mut score = 0usize;
if has_context_keywords {
score += 1;
}
if has_distance_operator {
score += 1;
}
if looks_generic_token {
score += 2;
}
if has_depends_on {
score += 1;
}
let is_context_dependent = score >= 3;
if !is_context_dependent {
return RuleMatchProfile {
kind: RuleDetectionProfileKind::SelfIdentifying,
reason_codes,
};
}
if looks_generic_token && has_distance_operator {
reason_codes.push("strict_contextual_shape");
}
RuleMatchProfile { kind: RuleDetectionProfileKind::ContextDependent, reason_codes }
}
pub fn get_rule_by_finding_fingerprint(&self, finding_fingerprint: &str) -> Option<Arc<Rule>> {
self.rules.iter().find(|r| r.finding_sha1_fingerprint() == finding_fingerprint).cloned()
@ -155,9 +69,9 @@ impl RulesDatabase {
Ok(vsdb) => {
let d1 = t1.elapsed().as_secs_f64();
let (anchored_regexes, d2) = Self::compile_regexes(&rules)?;
let rule_match_profiles = Self::build_rule_profiles(&rules);
let self_identifying_flags = Self::build_self_identifying_flags(&rules);
debug!("Compiled {} rules: vectorscan {}s; regex {}s", rules.len(), d1, d2);
Ok(RulesDatabase { rules, vsdb, anchored_regexes, rule_match_profiles })
Ok(RulesDatabase { rules, vsdb, anchored_regexes, self_identifying_flags })
}
Err(e) => {
error!(
@ -225,12 +139,12 @@ impl RulesDatabase {
bail!("Failed to compile the following rules:\n{}", error_messages.join("\n"));
}
let vsdb = BlockDatabase::new(compiled_patterns)?;
let rule_match_profiles = Self::build_rule_profiles(&compiled_rules);
let self_identifying_flags = Self::build_self_identifying_flags(&compiled_rules);
Ok(RulesDatabase {
rules: compiled_rules,
vsdb,
anchored_regexes: compiled_regexes,
rule_match_profiles,
self_identifying_flags,
})
}
@ -287,14 +201,26 @@ impl RulesDatabase {
&self.anchored_regexes
}
/// Returns true when the rule at `index` is recognised as
/// self-identifying by literal pattern shape (e.g. `ghp_`, `AIzaSy`,
/// `xox[pbarose]`, PEM envelopes, Slack webhook URLs). Self-identifying
/// rules bypass structural context gating — their regex shape already
/// provides strong precision.
#[inline]
pub fn rule_match_profiles(&self) -> &[RuleMatchProfile] {
&self.rule_match_profiles
pub fn is_rule_self_identifying(&self, index: usize) -> bool {
self.self_identifying_flags.get(index).copied().unwrap_or(false)
}
/// Computes one flag per rule, in rule order, telling whether the rule's
/// pattern has a self-identifying shape.
///
/// Each pattern is flattened with `format_regex_pattern` and lower-cased
/// before being handed to `has_self_identifying_shape`.
fn build_self_identifying_flags(rules: &[Arc<Rule>]) -> Vec<bool> {
    let mut flags = Vec::with_capacity(rules.len());
    for rule in rules {
        let normalized = format_regex_pattern(&rule.syntax().pattern).to_lowercase();
        flags.push(has_self_identifying_shape(&normalized));
    }
    flags
}
}
fn has_self_identifying_shape(normalized_pattern: &str) -> bool {
let literal_markers = [
const LITERAL_MARKERS: &[&str] = &[
"ccipat_",
"xapp-",
"ghp_",
@ -308,55 +234,21 @@ fn has_self_identifying_shape(normalized_pattern: &str) -> bool {
"https://hooks\\.slack\\.com/services/",
];
literal_markers.iter().any(|needle| normalized_pattern.contains(needle))
|| normalized_pattern.contains("xox[pbarose]")
|| normalized_pattern.contains("xoxe\\.xox[bparose]-")
|| normalized_pattern.contains("xoxe-\\d-")
|| (normalized_pattern.contains("-----begin\\s")
&& normalized_pattern.contains("private\\skey")
&& normalized_pattern.contains("-----end\\s"))
|| (normalized_pattern.contains("-----begin\\ ")
&& normalized_pattern.contains("private\\ key")
&& normalized_pattern.contains("-----end\\ "))
}
fn has_generic_token_class(normalized_pattern: &str) -> bool {
[
"[a-za-z0-9]{",
"[a-z0-9]{",
"[a-f0-9]{",
"[a-z0-9_-]{",
"[a-za-z0-9_-]{",
"[a-za-z0-9+/]{",
"[a-za-z0-9+/=]{",
]
.iter()
.any(|needle| normalized_pattern.contains(needle))
}
fn max_quantifier_min_value(normalized_pattern: &str) -> usize {
let mut max_seen = 0usize;
let bytes = normalized_pattern.as_bytes();
let mut i = 0usize;
while i < bytes.len() {
if bytes[i] != b'{' {
i += 1;
continue;
}
let mut j = i + 1;
let mut val = 0usize;
let mut saw_digit = false;
while j < bytes.len() && bytes[j].is_ascii_digit() {
saw_digit = true;
val = val.saturating_mul(10).saturating_add((bytes[j] - b'0') as usize);
j += 1;
}
if saw_digit && val > max_seen {
max_seen = val;
}
i = j.saturating_add(1);
if LITERAL_MARKERS.iter().any(|needle| normalized_pattern.contains(needle)) {
return true;
}
max_seen
if normalized_pattern.contains("xox[pbarose]") || normalized_pattern.contains("xoxe-\\d-") {
return true;
}
let has_pem_escaped_space = normalized_pattern.contains("-----begin\\s")
&& normalized_pattern.contains("private\\skey")
&& normalized_pattern.contains("-----end\\s");
let has_pem_literal_space = normalized_pattern.contains("-----begin\\ ")
&& normalized_pattern.contains("private\\ key")
&& normalized_pattern.contains("-----end\\ ");
has_pem_escaped_space || has_pem_literal_space
}
#[cfg(test)]
@ -416,138 +308,3 @@ mod test_regex_cleaning {
}
}
#[cfg(test)]
mod test_rule_match_profiles {
use super::*;
use crate::rule::{Confidence, RuleSyntax, Validation};
fn mk_rule(id: &str, pattern: &str) -> Rule {
Rule::new(RuleSyntax {
id: id.to_string(),
name: id.to_string(),
pattern: pattern.to_string(),
confidence: Confidence::Medium,
min_entropy: 0.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None::<Validation>,
revocation: None,
depends_on_rule: vec![],
pattern_requirements: None,
tls_mode: None,
})
}
#[test]
fn classifies_self_identifying_prefix_rule() {
let rule =
mk_rule("kingfisher.circleci.1", r"(?x)\b(CCIPAT_[A-Za-z0-9]{22}_[a-z0-9]{40})\b");
let profile = RulesDatabase::classify_rule_profile(&rule);
assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying);
assert!(profile.reason_codes.contains(&"self_identifying_prefix"));
}
#[test]
fn classifies_google_api_key_rule_as_self_identifying() {
let rule = mk_rule("kingfisher.google.7", r"(?xi)\b(AIzaSy[A-Za-z0-9_-]{33})");
let profile = RulesDatabase::classify_rule_profile(&rule);
assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying);
}
#[test]
fn classifies_slack_token_charclass_rule_as_self_identifying() {
let rule = mk_rule(
"kingfisher.slack.2",
r"(?xi)\b(xox[pbarose][-0-9]{0,3}-[0-9a-z]{6,15}-[0-9a-z]{6,15}-[-0-9a-z]{6,66})\b",
);
let profile = RulesDatabase::classify_rule_profile(&rule);
assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying);
}
#[test]
fn classifies_slack_webhook_rule_as_self_identifying() {
let rule = mk_rule(
"kingfisher.slack.4",
r"(?xi)\b(https://hooks\.slack\.com/services/T[a-z0-9_-]{8,12}/B[a-z0-9_-]{8,12}/[a-z0-9_-]{20,30})",
);
let profile = RulesDatabase::classify_rule_profile(&rule);
assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying);
}
#[test]
fn classifies_pypi_token_rule_as_self_identifying() {
let rule = mk_rule("kingfisher.pypi.1", r"(?x)(pypi-AgEIcHlwaS5vcmc[A-Za-z0-9_-]{50,})\b");
let profile = RulesDatabase::classify_rule_profile(&rule);
assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying);
}
#[test]
fn classifies_private_key_envelope_rules_as_self_identifying() {
let rule = mk_rule(
"kingfisher.privkey.2",
r"(?xims)(-----BEGIN\s(?:RSA|PGP|DSA|OPENSSH|ENCRYPTED|EC)?\s{0,1}PRIVATE\sKEY-----[a-z0-9 /+=\r\n\\n]{32,}?-----END\s(?:RSA|PGP|DSA|OPENSSH|ENCRYPTED|EC)?\s{0,1}PRIVATE\sKEY-----)",
);
let profile = RulesDatabase::classify_rule_profile(&rule);
assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying);
let pem_rule = mk_rule(
"kingfisher.pem.1",
r#"(?x)-----BEGIN\ .{0,20}\ ?PRIVATE\ KEY\ ?.{0,20}-----\s*((?:[a-zA-Z0-9+/=\s"',]|\\r|\\n){50,})\s*-----END\ .{0,20}\ ?PRIVATE\ KEY\ ?.{0,20}-----"#,
);
let pem_profile = RulesDatabase::classify_rule_profile(&pem_rule);
assert_eq!(pem_profile.kind, RuleDetectionProfileKind::SelfIdentifying);
}
#[test]
fn classifies_context_dependent_generic_rule() {
let rule = mk_rule(
"kingfisher.auth0.2",
r"(?xi)\bauth0(?:.|[\n\r]){0,16}?(?:secret|token)(?:.|[\n\r]){0,64}?\b([a-z0-9_-]{64,})\b",
);
let profile = RulesDatabase::classify_rule_profile(&rule);
assert_eq!(profile.kind, RuleDetectionProfileKind::ContextDependent);
assert!(profile.reason_codes.contains(&"generic_token_shape"));
assert!(profile.reason_codes.contains(&"strict_contextual_shape"));
}
#[test]
fn context_like_rule_is_parser_gated() {
let rule = mk_rule(
"kingfisher.example.1",
r"(?xi)\bexample(?:.|[\n\r]){0,16}?(?:secret|token)(?:.|[\n\r]){0,64}?\b([a-z0-9_-]{64,})\b",
);
let profile = RulesDatabase::classify_rule_profile(&rule);
assert_eq!(profile.kind, RuleDetectionProfileKind::ContextDependent);
}
#[test]
fn depends_on_rules_keep_raw_when_parser_unavailable() {
use crate::rule::DependsOnRule;
let rule = Rule::new(RuleSyntax {
id: "kingfisher.algolia.1".to_string(),
name: "algolia".to_string(),
pattern: r"(?xi)algolia(?:.|[\n\r]){0,32}?([a-z0-9]{32})".to_string(),
confidence: Confidence::Medium,
min_entropy: 0.0,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None::<Validation>,
revocation: None,
depends_on_rule: vec![Some(DependsOnRule {
rule_id: "kingfisher.algolia.2".to_string(),
variable: "APPID".to_string(),
})],
pattern_requirements: None,
tls_mode: None,
});
let profile = RulesDatabase::classify_rule_profile(&rule);
assert_eq!(profile.kind, RuleDetectionProfileKind::ContextDependent);
assert!(profile.reason_codes.contains(&"depends_on_rule"));
}
}

View file

@ -8,6 +8,7 @@ description: "Kingfisher release history: new features, rules, bug fixes, and im
All notable changes to this project will be documented in this file.
## [v1.96.0]
- Added archive extraction for three Korean formats: HWPX (Hancom OWPML ZIP container), HWP (Hancom 5.x OLE2/CFBF binary — streams decoded via raw DEFLATE / zlib fallbacks), and EGG (ALZip; registered for enumeration and scanned as raw bytes since no open-source extractor exists).
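- Added live HTTP validation for 17 rules across 14 providers: Val Town, Polar, Thunderstore, Elastic Cloud (2 rules), LlamaCloud, Gemfury (2 rules), Vonage, ThingsBoard, Zapier, Facebook Access Token, GitLab Session Cookie, PostHog Feature Flags, Unkey API Key, and Hop.io (2 rules). hCaptcha validation is intentionally not included: its siteverify endpoint cannot distinguish active from inactive secrets without a genuine client response token.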
- Added revocation support for 7 rules across 6 providers: Discord webhooks (single-step DELETE), DigitalOcean PATs (self-revoke via OAuth), and multi-step HttpMultiStep revocation for LaunchDarkly, Resend, Linode, and Netlify (2 rules). Built-in revocation coverage is now 34 provider families with 53 revocation-enabled rules.
- Expanded Alibaba Cloud coverage with STS temporary credential detection for STS access key IDs, STS security tokens, and STS access key secrets. Built-in rule coverage is now 923 rules total.

View file

@ -7,7 +7,8 @@ use std::{
use anyhow::Result;
use asar::AsarReader;
use bzip2_rs::DecoderReader;
use flate2::read::{GzDecoder, ZlibDecoder};
use cfb::CompoundFile;
use flate2::read::{DeflateDecoder, GzDecoder, ZlibDecoder};
use lzma_rs::xz_decompress;
use memmap2::Mmap;
use tar::Archive;
@ -19,7 +20,7 @@ use zip::ZipArchive;
pub const ZIP_BASED_FORMATS: &[&str] = &[
"zip", "zipx", "jar", "war", "ear", "aar", "jmod", "jhm", "jnlp", "nupkg", "vsix", "xap",
"docx", "xlsx", "pptx", "odt", "ods", "odp", "odg", "odf", "epub", "gadget", "kmz", "widget",
"xpi", "sketch", "pages", "key", "numbers",
"xpi", "sketch", "pages", "key", "numbers", "hwpx",
];
/// Break `<name>.<outer>.<inner>` into `(Some(outer), Some(inner))`.
@ -160,6 +161,61 @@ fn handle_zip_archive_streaming(
Ok(CompressedContent::ArchiveFiles(entries_on_disk))
}
/// Read every stream of an HWP (Hancom Word Processor) document into memory.
///
/// HWP 5.x files are Microsoft Compound File Binary (OLE2/CFBF) containers.
/// Body streams (e.g. `BodyText/Section*`) are typically raw DEFLATE with
/// no zlib header, other streams may be zlib-framed, and metadata streams
/// are plaintext UTF-16/ASCII. Each stream is therefore decoded as raw
/// DEFLATE first, then as zlib, and finally passed through unchanged so the
/// scanner always receives some content.
///
/// `path` is the file that is opened; `archive_path` is only used to build
/// the logical entry names, which have the form `<archive_path>!<stream path>`.
/// Streams that cannot be opened or read are logged at debug level and
/// skipped. Failing to open the file or parse the container is an error.
fn handle_hwp_archive_in_memory(path: &Path, archive_path: &Path) -> Result<CompressedContent> {
    /// Returns the decoded payload of one stream, or the input itself when
    /// neither decoder yields a non-empty result.
    fn inflate_or_passthrough(raw: Vec<u8>) -> Vec<u8> {
        if raw.is_empty() {
            return raw;
        }

        let mut inflated = Vec::new();
        let deflate_ok = DeflateDecoder::new(raw.as_slice()).read_to_end(&mut inflated).is_ok();
        if deflate_ok && !inflated.is_empty() {
            return inflated;
        }

        // Discard any partial output from the failed raw-DEFLATE attempt.
        inflated.clear();
        let zlib_ok = ZlibDecoder::new(raw.as_slice()).read_to_end(&mut inflated).is_ok();
        if zlib_ok && !inflated.is_empty() {
            return inflated;
        }

        raw
    }

    let mut compound = CompoundFile::open(safe_open_for_read(path)?)?;

    // Collect the stream paths up front so the directory walk is finished
    // before any stream is opened.
    let stream_paths: Vec<PathBuf> = compound
        .walk()
        .filter(|entry| entry.is_stream())
        .map(|entry| entry.path().to_path_buf())
        .collect();

    let mut entries = Vec::with_capacity(stream_paths.len());
    for stream_path in stream_paths {
        let mut raw = Vec::new();
        {
            let mut stream = match compound.open_stream(&stream_path) {
                Ok(stream) => stream,
                Err(e) => {
                    tracing::debug!("failed to open hwp stream {}: {}", stream_path.display(), e);
                    continue;
                }
            };
            if let Err(e) = stream.read_to_end(&mut raw) {
                tracing::debug!("failed to read hwp stream {}: {}", stream_path.display(), e);
                continue;
            }
        }

        let logical = format!("{}!{}", archive_path.display(), stream_path.display());
        entries.push((logical, inflate_or_passthrough(raw)));
    }

    Ok(CompressedContent::Archive(entries))
}
fn handle_asar_archive_in_memory(buffer: &[u8], archive_path: &Path) -> Result<CompressedContent> {
match AsarReader::new(buffer, None) {
Ok(reader) => {
@ -219,6 +275,17 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result<CompressedCon
let mmap = unsafe { Mmap::map(&file)? };
return handle_asar_archive_in_memory(&mmap, path);
}
"hwp" => {
return handle_hwp_archive_in_memory(path, path);
}
"egg" => {
// No open-source EGG (ALZip) extractor exists. Return the
// raw bytes so plaintext content inside the container is
// still scanned.
let mut buffer = Vec::new();
file.read_to_end(&mut buffer)?;
return Ok(CompressedContent::Raw(buffer));
}
"tar" => {
if let Some(base) = base_dir {
return handle_tar_archive_streaming(&mut file, path, base);
@ -584,4 +651,120 @@ mod tests {
Ok(())
}
/// An `.hwpx` file is a ZIP container, so `decompress_once` must extract it
/// to disk and expose its entries as `ArchiveFiles`.
#[test]
fn smoke_decompress_hwpx_archive() -> anyhow::Result<()> {
    let github_pat = "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"; // this is not a real secret
    let dir = tempdir()?;
    let hwpx_path = dir.path().join("document.hwpx");

    // Build a one-entry deflated ZIP that carries the token in an XML body.
    {
        let entry_options = SimpleFileOptions::default()
            .compression_method(CompressionMethod::Deflated)
            .unix_permissions(0o644);
        let mut writer = ZipWriter::new(File::create(&hwpx_path)?);
        writer.start_file("Contents/section0.xml", entry_options)?;
        let xml = format!("<?xml version=\"1.0\"?><doc>token={github_pat}</doc>");
        writer.write_all(xml.as_bytes())?;
        writer.finish()?;
    }

    let tmp = tempdir()?;
    let files = match decompress_once(&hwpx_path, Some(tmp.path()))? {
        CompressedContent::ArchiveFiles(files) => files,
        content => panic!("expected ArchiveFiles for hwpx archive, got {:?}", content),
    };

    // Every extracted entry with the expected logical suffix must contain the token.
    let mut matched = 0usize;
    for entry in files.iter().filter(|entry| entry.0.ends_with("!Contents/section0.xml")) {
        let txt = std::fs::read_to_string(&entry.1)?;
        assert!(txt.contains(github_pat));
        matched += 1;
    }
    assert!(matched > 0, "did not find Contents/section0.xml in hwpx ArchiveFiles");
    Ok(())
}
/// A `.hwp` file is a CFB container; `decompress_once` must return its
/// streams in memory, passing plaintext through and inflating zlib-framed data.
#[test]
fn smoke_decompress_hwp_archive() -> anyhow::Result<()> {
    use cfb::CompoundFile;
    use flate2::{Compression, write::ZlibEncoder};

    let github_pat = "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"; // this is not a real secret
    let dir = tempdir()?;
    let hwp_path = dir.path().join("document.hwp");

    // Build a minimal CFB with two streams: one plaintext, one zlib-framed.
    {
        let mut compound = CompoundFile::create(File::create(&hwp_path)?)?;
        compound.create_storage("/BodyText")?;

        {
            let mut doc_info = compound.create_stream("/DocInfo")?;
            doc_info.write_all(format!("metadata token={github_pat}").as_bytes())?;
        }

        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(format!("body token={github_pat}").as_bytes())?;
        let compressed = encoder.finish()?;
        {
            let mut section = compound.create_stream("/BodyText/Section0")?;
            section.write_all(&compressed)?;
        }

        compound.flush()?;
    }

    let entries = match decompress_once(&hwp_path, None)? {
        CompressedContent::Archive(entries) => entries,
        content => panic!("expected Archive for hwp, got {:?}", content),
    };

    // True when some entry whose logical name contains `stream_name` holds the token.
    let stream_has_pat = |stream_name: &str| {
        entries.iter().any(|(logical, bytes)| {
            logical.contains(stream_name) && String::from_utf8_lossy(bytes).contains(github_pat)
        })
    };

    assert!(stream_has_pat("DocInfo"), "plaintext DocInfo stream missing or not decoded");
    assert!(
        stream_has_pat("Section0"),
        "zlib-framed BodyText/Section0 stream missing or not decoded"
    );
    Ok(())
}
/// `.egg` files have no extractor, so `decompress_once` must hand back the
/// file's bytes unchanged as `Raw` content.
#[test]
fn smoke_decompress_egg_raw() -> anyhow::Result<()> {
    let github_pat = "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"; // this is not a real secret
    let dir = tempdir()?;
    let egg_path = dir.path().join("archive.egg");

    // The content is not a real EGG archive; only the extension matters here.
    {
        let payload = format!("EGG-pretend-header\ntoken={github_pat}\n");
        let mut egg_file = File::create(&egg_path)?;
        egg_file.write_all(payload.as_bytes())?;
    }

    let bytes = match decompress_once(&egg_path, None)? {
        CompressedContent::Raw(bytes) => bytes,
        other => panic!("expected Raw for egg, got {:?}", other),
    };

    let as_str = String::from_utf8_lossy(&bytes);
    assert!(as_str.contains(github_pat), "raw egg bytes did not contain the embedded pat");
    Ok(())
}
}

View file

@ -27,7 +27,7 @@ use crate::{
parser::Language,
rule_profiling::{ConcurrentRuleProfiler, RuleStats},
rules::rule::Rule,
rules_database::{RuleDetectionProfileKind, RuleMatchProfile, RulesDatabase},
rules_database::RulesDatabase,
scanner_pool::ScannerPool,
validation_body::ValidationResponseBody,
};
@ -403,7 +403,8 @@ impl<'a> Matcher<'a> {
}
}
}
maybe_apply_context_verification(
maybe_apply_markup_context_gate(
self.rules_db,
blob,
lang_hint,
@ -411,6 +412,7 @@ impl<'a> Matcher<'a> {
&mut matches,
&match_rule_indices,
);
// Finalize
if !no_dedup && !matches.is_empty() {
let blob_id = blob.id();
@ -436,7 +438,22 @@ impl<'a> Matcher<'a> {
}
}
fn maybe_apply_context_verification<'a>(
/// Apply parser-based context verification only for HTML and CSS blobs.
///
/// HTML and CSS are the one regime where regex can't easily express
/// "this capture is in a real value position" — attribute values, CSS
/// property values, and nested script/style content need structural
/// understanding. For every other language (and for blobs without a
/// language hint, e.g. logs, binaries), this function is a no-op.
///
/// Self-identifying rules (matched by literal shape — `ghp_`, `AIzaSy`,
/// `xox[pbarose]`, PEM envelopes, Slack webhook URLs, etc.) bypass the
/// gate even in HTML/CSS so plain-prose leaks are still caught.
///
/// The gate is subtractive only when the parser actually runs and rejects
/// a match. If the parser is unavailable (too-large blob, parser error),
/// all matches are kept — never silently dropped.
fn maybe_apply_markup_context_gate<'a>(
rules_db: &RulesDatabase,
blob: &'a Blob,
lang_hint: Option<&str>,
@ -447,8 +464,17 @@ fn maybe_apply_context_verification<'a>(
if matches.is_empty() {
return;
}
if !should_attempt_context_verification(blob_len) {
return;
}
let Some(hint) = lang_hint else {
return;
};
let language = match Language::from_hint(hint) {
Some(lang @ (Language::Html | Language::Css)) => lang,
_ => return,
};
let profiles = rules_db.rule_match_profiles();
let candidate_indices: Vec<usize> = matches
.iter()
.enumerate()
@ -456,10 +482,10 @@ fn maybe_apply_context_verification<'a>(
if m.is_base64 {
return false;
}
let Some(rule_idx) = match_rule_indices.get(*idx) else {
return false;
};
profiles[*rule_idx].kind == RuleDetectionProfileKind::ContextDependent
match match_rule_indices.get(*idx) {
Some(rule_idx) => !rules_db.is_rule_self_identifying(*rule_idx),
None => false,
}
})
.map(|(idx, _)| idx)
.collect();
@ -468,25 +494,6 @@ fn maybe_apply_context_verification<'a>(
return;
}
let mut keep = vec![true; matches.len()];
let mandatory_candidate_indices: Vec<usize> = candidate_indices
.iter()
.copied()
.filter(|idx| {
let Some(rule_idx) = match_rule_indices.get(*idx) else {
return false;
};
should_require_context_verification(&profiles[*rule_idx])
})
.collect();
let Some(language) = load_context_verifier_language(lang_hint, blob_len) else {
for idx in mandatory_candidate_indices {
keep[idx] = false;
}
filter_kept_matches(matches, &keep);
return;
};
let mut remaining = candidate_indices.clone();
let verification = parser::stream_context_candidates(blob.bytes(), &language, |text| {
remaining.retain(|idx| {
@ -501,25 +508,18 @@ fn maybe_apply_context_verification<'a>(
});
if let Err(e) = verification {
debug!("context verification unavailable: {e}");
remaining = mandatory_candidate_indices;
debug!("HTML/CSS context verification unavailable: {e}");
return;
}
if remaining.is_empty() {
return;
}
let mut keep = vec![true; matches.len()];
for idx in remaining {
keep[idx] = false;
}
filter_kept_matches(matches, &keep);
}
fn should_require_context_verification(profile: &RuleMatchProfile) -> bool {
profile.reason_codes.contains(&"strict_contextual_shape")
}
fn filter_kept_matches<'a>(matches: &mut Vec<BlobMatch<'a>>, keep: &[bool]) {
if keep.iter().all(|k| *k) {
return;
}
let mut filtered = Vec::with_capacity(matches.len());
for (idx, item) in std::mem::take(matches).into_iter().enumerate() {
if keep[idx] {
@ -529,14 +529,6 @@ fn filter_kept_matches<'a>(matches: &mut Vec<BlobMatch<'a>>, keep: &[bool]) {
*matches = filtered;
}
fn load_context_verifier_language(lang_hint: Option<&str>, blob_len: usize) -> Option<Language> {
if !should_attempt_context_verification(blob_len) {
return None;
}
let lang = lang_hint?;
Language::from_hint(lang)
}
fn verify_match_in_context_text(
re: &regex::bytes::Regex,
expected_secret: &[u8],
@ -1174,15 +1166,12 @@ line2
ScanResult::New(matches) => matches,
_ => panic!("unexpected scan result"),
};
assert!(
found.is_empty(),
"comment-only contextual hits should be suppressed when parser-based verification cannot confirm assignment context"
);
assert_eq!(found.len(), 1, "raw regex matches should remain findings without classifier gating");
Ok(())
}
#[test]
fn strict_context_rule_suppresses_raw_when_context_verification_is_unavailable() -> Result<()> {
fn strict_context_rule_survives_without_classifier_gating() -> Result<()> {
let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234";
let rule = Rule::new(RuleSyntax {
id: "kingfisher.auth0.2".into(),
@ -1215,10 +1204,7 @@ line2
ScanResult::New(matches) => matches,
_ => panic!("unexpected scan result"),
};
assert!(
found.is_empty(),
"strict contextual rules should be suppressed when parser-based verification cannot run"
);
assert_eq!(found.len(), 1, "strict contextual rules should still be reported without classifier gating");
Ok(())
}
@ -1259,7 +1245,7 @@ line2
assert_eq!(
found.len(),
1,
"assignment-style contextual rules should still scan raw text when parser context is unavailable"
"assignment-style contextual rules should still scan raw text without classifier gating"
);
Ok(())
}
@ -1306,7 +1292,7 @@ line2
assert_eq!(
found.len(),
1,
"depends_on assignment-style rules should still scan raw text when parser context is unavailable"
"depends_on assignment-style rules should still scan raw text without classifier gating"
);
Ok(())
}
@ -1386,7 +1372,173 @@ line2
assert_eq!(
found.len(),
1,
"self-identifying token families should not require parser context"
"self-identifying token families should still be reported without classifier gating"
);
Ok(())
}
/// Test fixture: a generic, keyword-anchored Auth0 rule.
///
/// The pattern requires `auth0`, then `secret` or `token` within a short
/// distance, then a capture of at least 64 characters from `[a-z0-9_-]`.
/// It carries no validation, revocation, dependencies or pattern requirements.
fn generic_auth0_rule() -> Rule {
    let syntax = RuleSyntax {
        id: String::from("kingfisher.auth0.2"),
        name: String::from("auth0 secret"),
        pattern: String::from(
            "(?xi)\\bauth0(?:.|[\\n\\r]){0,16}?(?:secret|token)(?:.|[\\n\\r]){0,64}?\\b([a-z0-9_-]{64,})\\b",
        ),
        confidence: crate::rules::rule::Confidence::Medium,
        min_entropy: 0.0,
        visible: true,
        examples: Vec::new(),
        negative_examples: Vec::new(),
        references: Vec::new(),
        validation: None::<Validation>,
        revocation: None,
        depends_on_rule: Vec::new(),
        pattern_requirements: None,
        tls_mode: None,
    };
    Rule::new(syntax)
}
/// With an `html` language hint, a generic contextual match that only
/// appears inside an HTML comment must be removed by the markup gate.
#[test]
fn html_gate_drops_generic_contextual_match_outside_value_position() -> Result<()> {
    let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234";
    let rules_db = RulesDatabase::from_rules(vec![generic_auth0_rule()])?;
    let seen = BlobIdMap::new();
    let vectorscan_db = Arc::new(rules_db.vectorscan_db().clone());
    let scanner_pool = Arc::new(ScannerPool::new(vectorscan_db));
    let mut matcher =
        Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?;

    // The keyword and the token sit in a comment, not in any value position.
    let body = format!(
        "<html><body><!-- auth0 secret {token} --></body></html>"
    );
    let blob = Blob::from_bytes(body.into_bytes());
    let origin = OriginSet::from(Origin::from_file(PathBuf::from("page.html")));

    let scan = matcher.scan_blob(&blob, &origin, Some("html".to_string()), false, false, false)?;
    let ScanResult::New(found) = scan else {
        panic!("unexpected scan result");
    };

    assert!(
        found.is_empty(),
        "HTML gate should drop generic contextual hits that sit outside any value position"
    );
    Ok(())
}
/// A generic contextual hit assigned to a variable inside `<script>` must be kept.
///
/// The blob is scanned with `"html"` as the third `scan_blob` argument
/// (presumably the language hint — confirm against `Matcher::scan_blob`),
/// and the test expects exactly one finding.
#[test]
fn html_gate_keeps_generic_contextual_match_inside_script_assignment() -> Result<()> {
    let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234";

    // Single-rule database and the matcher that runs against it.
    let db = RulesDatabase::from_rules(vec![generic_auth0_rule()])?;
    let seen_blobs = BlobIdMap::new();
    let vectorscan = Arc::new(db.vectorscan_db().clone());
    let pool = Arc::new(ScannerPool::new(vectorscan));
    let mut matcher =
        Matcher::new(&db, pool, &seen_blobs, None, false, None, &[], false, true)?;

    // The candidate is the right-hand side of a JavaScript `const` assignment.
    let page = format!(
        "<html><body><script>const auth0_client_secret = \"{token}\";</script></body></html>"
    );
    let page_blob = Blob::from_bytes(page.into_bytes());
    let origins = OriginSet::from(Origin::from_file(PathBuf::from("app.html")));

    let scan_outcome = matcher.scan_blob(
        &page_blob,
        &origins,
        Some(String::from("html")),
        false,
        false,
        false,
    )?;
    let ScanResult::New(findings) = scan_outcome else {
        panic!("unexpected scan result")
    };

    assert_eq!(
        findings.len(),
        1,
        "HTML gate should keep generic contextual hits that appear inside a script assignment"
    );
    Ok(())
}
/// A rule whose pattern matches a fixed-prefix key (`AIzaSy…`) must still fire in HTML prose.
///
/// The key sits in plain paragraph text. The blob is scanned with `"html"` as
/// the third `scan_blob` argument (presumably the language hint — confirm
/// against `Matcher::scan_blob`), and the test expects exactly one finding.
#[test]
fn html_gate_does_not_affect_self_identifying_rule_in_prose() -> Result<()> {
    // Rule keyed on the literal `AIzaSy` prefix plus 33 key characters; no surrounding context needed.
    let google_key_syntax = RuleSyntax {
        id: "kingfisher.google.7".into(),
        name: "google api key".into(),
        pattern: "(?xi)\\b(AIzaSy[A-Za-z0-9_-]{33})".into(),
        confidence: crate::rules::rule::Confidence::Medium,
        min_entropy: 0.0,
        visible: true,
        examples: Vec::new(),
        negative_examples: Vec::new(),
        references: Vec::new(),
        validation: None::<Validation>,
        revocation: None,
        depends_on_rule: Vec::new(),
        pattern_requirements: None,
        tls_mode: None,
    };
    let db = RulesDatabase::from_rules(vec![Rule::new(google_key_syntax)])?;

    let seen_blobs = BlobIdMap::new();
    let vectorscan = Arc::new(db.vectorscan_db().clone());
    let pool = Arc::new(ScannerPool::new(vectorscan));
    let mut matcher =
        Matcher::new(&db, pool, &seen_blobs, None, false, None, &[], false, true)?;

    // The key appears as visible paragraph text, not inside a script or attribute.
    let page =
        String::from("<html><body><p>Key: AIzaSyBUPHAjZl3n8Eza66ka6B78iVyPteC5MgM</p></body></html>");
    let page_blob = Blob::from_bytes(page.into_bytes());
    let origins = OriginSet::from(Origin::from_file(PathBuf::from("docs.html")));

    let scan_outcome = matcher.scan_blob(
        &page_blob,
        &origins,
        Some(String::from("html")),
        false,
        false,
        false,
    )?;
    let ScanResult::New(findings) = scan_outcome else {
        panic!("unexpected scan result")
    };

    assert_eq!(
        findings.len(),
        1,
        "self-identifying rules must bypass the HTML gate so prose leaks still fire"
    );
    Ok(())
}
/// The generic contextual rule must still fire when the blob is not HTML.
///
/// The candidate sits in a `#` comment of a `.py` file. The blob is scanned
/// with `"python"` as the third `scan_blob` argument (presumably the language
/// hint — confirm against `Matcher::scan_blob`), and the test expects exactly
/// one finding.
#[test]
fn html_gate_does_not_trigger_for_other_languages() -> Result<()> {
    let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234";

    // Single-rule database and the matcher that runs against it.
    let db = RulesDatabase::from_rules(vec![generic_auth0_rule()])?;
    let seen_blobs = BlobIdMap::new();
    let vectorscan = Arc::new(db.vectorscan_db().clone());
    let pool = Arc::new(ScannerPool::new(vectorscan));
    let mut matcher =
        Matcher::new(&db, pool, &seen_blobs, None, false, None, &[], false, true)?;

    // Same comment-style placement as the HTML-comment case, but in Python syntax.
    let script = format!("# auth0 secret {token}");
    let script_blob = Blob::from_bytes(script.into_bytes());
    let origins = OriginSet::from(Origin::from_file(PathBuf::from("notes.py")));

    let scan_outcome = matcher.scan_blob(
        &script_blob,
        &origins,
        Some(String::from("python")),
        false,
        false,
        false,
    )?;
    let ScanResult::New(findings) = scan_outcome else {
        panic!("unexpected scan result")
    };

    assert_eq!(
        findings.len(),
        1,
        "non-HTML/CSS blobs must bypass the gate even when parser hint is available"
    );
    Ok(())
}

View file

@ -2,6 +2,4 @@
//!
//! This module re-exports types from [`kingfisher_rules::rules_database`].
pub use kingfisher_rules::rules_database::{
RuleDetectionProfileKind, RuleMatchProfile, RulesDatabase, format_regex_pattern,
};
pub use kingfisher_rules::rules_database::{RulesDatabase, format_regex_pattern};

View file

@ -25,7 +25,9 @@ pub fn is_compressed_file(path: &Path) -> bool {
|| ext_lower == "tar"
|| ext_lower == "zlib"
|| ext_lower == "asar"
|| ZIP_BASED_FORMATS.iter().any(|z| *z == ext)
|| ext_lower == "hwp"
|| ext_lower == "egg"
|| ZIP_BASED_FORMATS.iter().any(|z| *z == ext_lower)
} else {
false
}