diff --git a/CHANGELOG.md b/CHANGELOG.md index d60336c..20451b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file. ## [v1.96.0] +- Added archive extraction for three Korean formats: HWPX (Hancom OWPML ZIP container), HWP (Hancom 5.x OLE2/CFBF binary — streams decoded via raw DEFLATE / zlib fallbacks), and EGG (ALZip; registered for enumeration and scanned as raw bytes since no open-source extractor exists). - Added live HTTP validation for 18 rules across 15 providers: Val Town, Polar, hCaptcha, Thunderstore, Elastic Cloud (2 rules), LlamaCloud, Gemfury (2 rules), Vonage, ThingsBoard, Zapier, Facebook Access Token, GitLab Session Cookie, PostHog Feature Flags, Unkey API Key, and Hop.io (2 rules). - Added revocation support for 7 rules across 6 providers: Discord webhooks (single-step DELETE), DigitalOcean PATs (self-revoke via OAuth), and multi-step HttpMultiStep revocation for LaunchDarkly, Resend, Linode, and Netlify (2 rules). Built-in revocation coverage is now 34 provider families with 53 revocation-enabled rules. - Expanded Alibaba Cloud coverage with STS temporary credential detection for STS access key IDs, STS security tokens, and STS access key secrets. Built-in rule coverage is now 923 rules total. diff --git a/Cargo.lock b/Cargo.lock index ea42061..2b5f156 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1473,6 +1473,17 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" +[[package]] +name = "cfb" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a347dcabdae9c31b0825fd6a8bed285ec9c2acb89c47827126d52fa4f59cece3" +dependencies = [ + "fnv", + "uuid", + "web-time", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -5037,6 +5048,7 @@ dependencies = [ "bstr", "bytes", "bzip2-rs", + "cfb", "chrono", "clap", "color-backtrace", diff --git a/Cargo.toml b/Cargo.toml index 4cdcd32..2eeee47 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -171,6 +171,7 @@ zip = { version = "8.5.0", default-features = false, features = ["deflate", "def tar = "0.4.44" lzma-rs = "0.3.0" asar = "0.3.0" +cfb = "0.14" rusqlite = { version = "0.39", features = ["bundled"] } blake3 = "1.8.2" memchr = "2.7" diff --git a/README.md b/README.md index 503dca8..3c01413 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ Kingfisher is a high-performance, open source secret detection tool for source c - **Revocation support matrix**: current built-in revocation coverage across providers and rule IDs ([docs/REVOCATION_PROVIDERS.md](/docs/REVOCATION_PROVIDERS.md)) - **Blast Radius Mapping**: instantly map leaked keys to their effective cloud identities and exposed resources with `--access-map`. Supports 39 providers (see table below). - **Broad AI SaaS coverage**: finds and validates tokens for OpenAI, Anthropic, Google Gemini, Cohere, AWS Bedrock, Voyage AI, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and many more -- **Compressed Files**: Supports extracting and scanning compressed files for secrets +- **Compressed Files**: Supports extracting and scanning compressed files for secrets, including `tar.gz`/`bz2`/`xz`, ZIP-family containers (`zip`, `jar`, `docx`, `xlsx`, `pptx`, `odt`, `epub`, `hwpx`, and more), `asar`, HWP (Hancom OLE2/CFBF binary with DEFLATE/zlib stream decoding), and EGG (ALZip; raw-byte scanning) - **SQLite Database Scanning**: Automatically extracts and scans SQLite database contents for secrets stored in table rows - **Python Bytecode (.pyc) Scanning**: Extracts and scans string constants from compiled Python (`.pyc`, `.pyo`) files - **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) diff --git a/crates/kingfisher-rules/data/rules/hcaptcha.yml b/crates/kingfisher-rules/data/rules/hcaptcha.yml index 42429d6..a7d54dc 100644 --- a/crates/kingfisher-rules/data/rules/hcaptcha.yml +++ b/crates/kingfisher-rules/data/rules/hcaptcha.yml @@ -22,21 +22,8 @@ rules: - 'hcaptcha_secret: ES_abcdef1234567890abcdef1234567890' references: - https://docs.hcaptcha.com/ - validation: - type: Http - content: - request: - method: POST - url: https://api.hcaptcha.com/siteverify - headers: - Content-Type: application/x-www-form-urlencoded - body: "secret={{ TOKEN }}&response=invalid_response&sitekey=00000000-0000-0000-0000-000000000000" - response_matcher: - - report_response: true - - type: WordMatch - words: - - '"success":' - - type: WordMatch - words: - - "invalid-input-secret" - negative: true + - https://docs.hcaptcha.com/#verify-the-user-response-server-side + # No standalone validation is added: hCaptcha's siteverify endpoint returns the + # same invalid-input-response / missing-input-response errors for both real and + # mutated secrets when no genuine client response token is available, so it is + # not reliable for distinguishing active from inactive secrets during scanning. diff --git a/crates/kingfisher-rules/data/rules/hetzner.yml b/crates/kingfisher-rules/data/rules/hetzner.yml index 4fffdd8..6d82dff 100644 --- a/crates/kingfisher-rules/data/rules/hetzner.yml +++ b/crates/kingfisher-rules/data/rules/hetzner.yml @@ -3,20 +3,19 @@ rules: id: kingfisher.hetzner.1 pattern: | (?xi) - \b(?:hetzner|hcloud) + \b + (?:hetzner|hcloud) (?:.|[\n\r]){0,48}? - (?:API[_-]?TOKEN|TOKEN|SECRET|KEY) - (?:.|[\n\r]){0,24}? \b ( - [A-Za-z0-9]{64} + [A-Z0-9]{64} ) \b pattern_requirements: min_digits: 3 min_uppercase: 3 min_lowercase: 3 - min_entropy: 4.0 + min_entropy: 3.0 confidence: medium examples: - 'HETZNER_API_TOKEN=CqM049yakVZO8EndHyyawia3EjIboWMVeoEeMW0UVN0SpTXryKh0zrtnGpeyAjTs' @@ -37,3 +36,6 @@ rules: - type: StatusMatch status: [200] - type: JsonValid + # Revocation not added: Hetzner documents API token management through the + # Console project Security > API tokens UI, and the public Cloud API + # reference does not expose a token deletion/revocation endpoint. diff --git a/crates/kingfisher-rules/data/rules/localstack.yml b/crates/kingfisher-rules/data/rules/localstack.yml index 7f610d4..d686996 100644 --- a/crates/kingfisher-rules/data/rules/localstack.yml +++ b/crates/kingfisher-rules/data/rules/localstack.yml @@ -13,7 +13,6 @@ rules: min_uppercase: 4 min_entropy: 3.0 confidence: medium - categories: [api, key] examples: - 'AWS_ACCESS_KEY_ID=LSIAQAAAAAAVNCBMPN59' - 'aws_access_key=LKIAQAAAAAAVNCBMPN59' diff --git a/crates/kingfisher-rules/src/lib.rs b/crates/kingfisher-rules/src/lib.rs index ac7e102..086ab7c 100644 --- a/crates/kingfisher-rules/src/lib.rs +++ b/crates/kingfisher-rules/src/lib.rs @@ -27,9 +27,7 @@ pub use rule::{ pub use rules::{Rules, RulesError}; // Re-export RulesDatabase -pub use rules_database::{ - RuleDetectionProfileKind, RuleMatchProfile, RulesDatabase, format_regex_pattern, -}; +pub use rules_database::{RulesDatabase, format_regex_pattern}; // Re-export defaults pub use defaults::get_builtin_rules; diff --git a/crates/kingfisher-rules/src/rules_database.rs b/crates/kingfisher-rules/src/rules_database.rs index 4a1c523..e883a66 100644 --- a/crates/kingfisher-rules/src/rules_database.rs +++ b/crates/kingfisher-rules/src/rules_database.rs @@ -7,23 +7,11 @@ use vectorscan_rs::{BlockDatabase, Flag, Pattern}; use crate::rule::{RULE_COMMENTS_PATTERN, Rule}; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum RuleDetectionProfileKind { - SelfIdentifying, - ContextDependent, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct RuleMatchProfile { - pub kind: RuleDetectionProfileKind, - pub reason_codes: Vec<&'static str>, -} - pub struct RulesDatabase { // pub(crate) rules: Vec, pub(crate) rules: Vec>, pub(crate) anchored_regexes: Vec, - pub(crate) rule_match_profiles: Vec, + pub(crate) self_identifying_flags: Vec, pub(crate) vsdb: BlockDatabase, } @@ -40,86 +28,12 @@ pub fn format_regex_pattern(pattern: &str) -> String { } impl RulesDatabase { - fn classify_rule_profile(rule: &Rule) -> RuleMatchProfile { - Self::classify_rule_match_profile(rule) - } - - fn build_rule_profiles(rules: &[Arc]) -> Vec { - rules.iter().map(|r| Self::classify_rule_profile(r.as_ref())).collect() - } - pub fn get_regex_by_rule_id(&self, rule_id: &str) -> Option<&Regex> { self.rules .iter() .position(|r| r.syntax().id == rule_id) .and_then(|index| self.anchored_regexes.get(index)) } - pub fn classify_rule_match_profile(rule: &Rule) -> RuleMatchProfile { - let flattened = format_regex_pattern(&rule.syntax().pattern); - let normalized = flattened.to_lowercase(); - - let mut reason_codes: Vec<&'static str> = Vec::new(); - - let has_self_identifying_prefix = has_self_identifying_shape(&normalized); - if has_self_identifying_prefix { - reason_codes.push("self_identifying_prefix"); - return RuleMatchProfile { - kind: RuleDetectionProfileKind::SelfIdentifying, - reason_codes, - }; - } - - let has_context_keywords = - ["secret", "token", "key", "password", "private", "access", "client", "auth"] - .iter() - .any(|m| normalized.contains(m)); - if has_context_keywords { - reason_codes.push("context_keywords"); - } - - let has_distance_operator = normalized.contains("(?:.|[\\n\\r]){0,"); - if has_distance_operator { - reason_codes.push("distance_operator"); - } - - let has_depends_on = !rule.syntax().depends_on_rule.is_empty(); - if has_depends_on { - reason_codes.push("depends_on_rule"); - } - - let max_quantifier = max_quantifier_min_value(&normalized); - let looks_generic_token = has_generic_token_class(&normalized) && max_quantifier >= 24; - if looks_generic_token { - reason_codes.push("generic_token_shape"); - } - - let mut score = 0usize; - if has_context_keywords { - score += 1; - } - if has_distance_operator { - score += 1; - } - if looks_generic_token { - score += 2; - } - if has_depends_on { - score += 1; - } - - let is_context_dependent = score >= 3; - if !is_context_dependent { - return RuleMatchProfile { - kind: RuleDetectionProfileKind::SelfIdentifying, - reason_codes, - }; - } - if looks_generic_token && has_distance_operator { - reason_codes.push("strict_contextual_shape"); - } - - RuleMatchProfile { kind: RuleDetectionProfileKind::ContextDependent, reason_codes } - } pub fn get_rule_by_finding_fingerprint(&self, finding_fingerprint: &str) -> Option> { self.rules.iter().find(|r| r.finding_sha1_fingerprint() == finding_fingerprint).cloned() @@ -155,9 +69,9 @@ impl RulesDatabase { Ok(vsdb) => { let d1 = t1.elapsed().as_secs_f64(); let (anchored_regexes, d2) = Self::compile_regexes(&rules)?; - let rule_match_profiles = Self::build_rule_profiles(&rules); + let self_identifying_flags = Self::build_self_identifying_flags(&rules); debug!("Compiled {} rules: vectorscan {}s; regex {}s", rules.len(), d1, d2); - Ok(RulesDatabase { rules, vsdb, anchored_regexes, rule_match_profiles }) + Ok(RulesDatabase { rules, vsdb, anchored_regexes, self_identifying_flags }) } Err(e) => { error!( @@ -225,12 +139,12 @@ impl RulesDatabase { bail!("Failed to compile the following rules:\n{}", error_messages.join("\n")); } let vsdb = BlockDatabase::new(compiled_patterns)?; - let rule_match_profiles = Self::build_rule_profiles(&compiled_rules); + let self_identifying_flags = Self::build_self_identifying_flags(&compiled_rules); Ok(RulesDatabase { rules: compiled_rules, vsdb, anchored_regexes: compiled_regexes, - rule_match_profiles, + self_identifying_flags, }) } @@ -287,14 +201,26 @@ impl RulesDatabase { &self.anchored_regexes } + /// Returns true when the rule at `index` is recognised as + /// self-identifying by literal pattern shape (e.g. `GHP_`, `AIzaSy`, + /// `xox[pbarose]`, PEM envelopes, Slack webhook URLs). Self-identifying + /// rules bypass structural context gating — their regex shape already + /// provides strong precision. #[inline] - pub fn rule_match_profiles(&self) -> &[RuleMatchProfile] { - &self.rule_match_profiles + pub fn is_rule_self_identifying(&self, index: usize) -> bool { + self.self_identifying_flags.get(index).copied().unwrap_or(false) + } + + fn build_self_identifying_flags(rules: &[Arc]) -> Vec { + rules + .iter() + .map(|rule| has_self_identifying_shape(&format_regex_pattern(&rule.syntax().pattern).to_lowercase())) + .collect() } } fn has_self_identifying_shape(normalized_pattern: &str) -> bool { - let literal_markers = [ + const LITERAL_MARKERS: &[&str] = &[ "ccipat_", "xapp-", "ghp_", @@ -308,55 +234,21 @@ fn has_self_identifying_shape(normalized_pattern: &str) -> bool { "https://hooks\\.slack\\.com/services/", ]; - literal_markers.iter().any(|needle| normalized_pattern.contains(needle)) - || normalized_pattern.contains("xox[pbarose]") - || normalized_pattern.contains("xoxe\\.xox[bparose]-") - || normalized_pattern.contains("xoxe-\\d-") - || (normalized_pattern.contains("-----begin\\s") - && normalized_pattern.contains("private\\skey") - && normalized_pattern.contains("-----end\\s")) - || (normalized_pattern.contains("-----begin\\ ") - && normalized_pattern.contains("private\\ key") - && normalized_pattern.contains("-----end\\ ")) -} - -fn has_generic_token_class(normalized_pattern: &str) -> bool { - [ - "[a-za-z0-9]{", - "[a-z0-9]{", - "[a-f0-9]{", - "[a-z0-9_-]{", - "[a-za-z0-9_-]{", - "[a-za-z0-9+/]{", - "[a-za-z0-9+/=]{", - ] - .iter() - .any(|needle| normalized_pattern.contains(needle)) -} - -fn max_quantifier_min_value(normalized_pattern: &str) -> usize { - let mut max_seen = 0usize; - let bytes = normalized_pattern.as_bytes(); - let mut i = 0usize; - while i < bytes.len() { - if bytes[i] != b'{' { - i += 1; - continue; - } - let mut j = i + 1; - let mut val = 0usize; - let mut saw_digit = false; - while j < bytes.len() && bytes[j].is_ascii_digit() { - saw_digit = true; - val = val.saturating_mul(10).saturating_add((bytes[j] - b'0') as usize); - j += 1; - } - if saw_digit && val > max_seen { - max_seen = val; - } - i = j.saturating_add(1); + if LITERAL_MARKERS.iter().any(|needle| normalized_pattern.contains(needle)) { + return true; } - max_seen + + if normalized_pattern.contains("xox[pbarose]") || normalized_pattern.contains("xoxe-\\d-") { + return true; + } + + let has_pem_escaped_space = normalized_pattern.contains("-----begin\\s") + && normalized_pattern.contains("private\\skey") + && normalized_pattern.contains("-----end\\s"); + let has_pem_literal_space = normalized_pattern.contains("-----begin\\ ") + && normalized_pattern.contains("private\\ key") + && normalized_pattern.contains("-----end\\ "); + has_pem_escaped_space || has_pem_literal_space } #[cfg(test)] @@ -416,138 +308,3 @@ mod test_regex_cleaning { } } -#[cfg(test)] -mod test_rule_match_profiles { - use super::*; - use crate::rule::{Confidence, RuleSyntax, Validation}; - - fn mk_rule(id: &str, pattern: &str) -> Rule { - Rule::new(RuleSyntax { - id: id.to_string(), - name: id.to_string(), - pattern: pattern.to_string(), - confidence: Confidence::Medium, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - revocation: None, - depends_on_rule: vec![], - pattern_requirements: None, - tls_mode: None, - }) - } - - #[test] - fn classifies_self_identifying_prefix_rule() { - let rule = - mk_rule("kingfisher.circleci.1", r"(?x)\b(CCIPAT_[A-Za-z0-9]{22}_[a-z0-9]{40})\b"); - let profile = RulesDatabase::classify_rule_profile(&rule); - assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); - assert!(profile.reason_codes.contains(&"self_identifying_prefix")); - } - - #[test] - fn classifies_google_api_key_rule_as_self_identifying() { - let rule = mk_rule("kingfisher.google.7", r"(?xi)\b(AIzaSy[A-Za-z0-9_-]{33})"); - let profile = RulesDatabase::classify_rule_profile(&rule); - assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); - } - - #[test] - fn classifies_slack_token_charclass_rule_as_self_identifying() { - let rule = mk_rule( - "kingfisher.slack.2", - r"(?xi)\b(xox[pbarose][-0-9]{0,3}-[0-9a-z]{6,15}-[0-9a-z]{6,15}-[-0-9a-z]{6,66})\b", - ); - let profile = RulesDatabase::classify_rule_profile(&rule); - assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); - } - - #[test] - fn classifies_slack_webhook_rule_as_self_identifying() { - let rule = mk_rule( - "kingfisher.slack.4", - r"(?xi)\b(https://hooks\.slack\.com/services/T[a-z0-9_-]{8,12}/B[a-z0-9_-]{8,12}/[a-z0-9_-]{20,30})", - ); - let profile = RulesDatabase::classify_rule_profile(&rule); - assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); - } - - #[test] - fn classifies_pypi_token_rule_as_self_identifying() { - let rule = mk_rule("kingfisher.pypi.1", r"(?x)(pypi-AgEIcHlwaS5vcmc[A-Za-z0-9_-]{50,})\b"); - let profile = RulesDatabase::classify_rule_profile(&rule); - assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); - } - - #[test] - fn classifies_private_key_envelope_rules_as_self_identifying() { - let rule = mk_rule( - "kingfisher.privkey.2", - r"(?xims)(-----BEGIN\s(?:RSA|PGP|DSA|OPENSSH|ENCRYPTED|EC)?\s{0,1}PRIVATE\sKEY-----[a-z0-9 /+=\r\n\\n]{32,}?-----END\s(?:RSA|PGP|DSA|OPENSSH|ENCRYPTED|EC)?\s{0,1}PRIVATE\sKEY-----)", - ); - let profile = RulesDatabase::classify_rule_profile(&rule); - assert_eq!(profile.kind, RuleDetectionProfileKind::SelfIdentifying); - - let pem_rule = mk_rule( - "kingfisher.pem.1", - r#"(?x)-----BEGIN\ .{0,20}\ ?PRIVATE\ KEY\ ?.{0,20}-----\s*((?:[a-zA-Z0-9+/=\s"',]|\\r|\\n){50,})\s*-----END\ .{0,20}\ ?PRIVATE\ KEY\ ?.{0,20}-----"#, - ); - let pem_profile = RulesDatabase::classify_rule_profile(&pem_rule); - assert_eq!(pem_profile.kind, RuleDetectionProfileKind::SelfIdentifying); - } - - #[test] - fn classifies_context_dependent_generic_rule() { - let rule = mk_rule( - "kingfisher.auth0.2", - r"(?xi)\bauth0(?:.|[\n\r]){0,16}?(?:secret|token)(?:.|[\n\r]){0,64}?\b([a-z0-9_-]{64,})\b", - ); - let profile = RulesDatabase::classify_rule_profile(&rule); - assert_eq!(profile.kind, RuleDetectionProfileKind::ContextDependent); - assert!(profile.reason_codes.contains(&"generic_token_shape")); - assert!(profile.reason_codes.contains(&"strict_contextual_shape")); - } - - #[test] - fn context_like_rule_is_parser_gated() { - let rule = mk_rule( - "kingfisher.example.1", - r"(?xi)\bexample(?:.|[\n\r]){0,16}?(?:secret|token)(?:.|[\n\r]){0,64}?\b([a-z0-9_-]{64,})\b", - ); - let profile = RulesDatabase::classify_rule_profile(&rule); - assert_eq!(profile.kind, RuleDetectionProfileKind::ContextDependent); - } - - #[test] - fn depends_on_rules_keep_raw_when_parser_unavailable() { - use crate::rule::DependsOnRule; - - let rule = Rule::new(RuleSyntax { - id: "kingfisher.algolia.1".to_string(), - name: "algolia".to_string(), - pattern: r"(?xi)algolia(?:.|[\n\r]){0,32}?([a-z0-9]{32})".to_string(), - confidence: Confidence::Medium, - min_entropy: 0.0, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None::, - revocation: None, - depends_on_rule: vec![Some(DependsOnRule { - rule_id: "kingfisher.algolia.2".to_string(), - variable: "APPID".to_string(), - })], - pattern_requirements: None, - tls_mode: None, - }); - - let profile = RulesDatabase::classify_rule_profile(&rule); - assert_eq!(profile.kind, RuleDetectionProfileKind::ContextDependent); - assert!(profile.reason_codes.contains(&"depends_on_rule")); - } -} diff --git a/docs-site/docs/changelog.md b/docs-site/docs/changelog.md index bc6b34e..034d588 100644 --- a/docs-site/docs/changelog.md +++ b/docs-site/docs/changelog.md @@ -8,6 +8,7 @@ description: "Kingfisher release history: new features, rules, bug fixes, and im All notable changes to this project will be documented in this file. ## [v1.96.0] +- Added archive extraction for three Korean formats: HWPX (Hancom OWPML ZIP container), HWP (Hancom 5.x OLE2/CFBF binary — streams decoded via raw DEFLATE / zlib fallbacks), and EGG (ALZip; registered for enumeration and scanned as raw bytes since no open-source extractor exists). - Added live HTTP validation for 18 rules across 15 providers: Val Town, Polar, hCaptcha, Thunderstore, Elastic Cloud (2 rules), LlamaCloud, Gemfury (2 rules), Vonage, ThingsBoard, Zapier, Facebook Access Token, GitLab Session Cookie, PostHog Feature Flags, Unkey API Key, and Hop.io (2 rules). - Added revocation support for 7 rules across 6 providers: Discord webhooks (single-step DELETE), DigitalOcean PATs (self-revoke via OAuth), and multi-step HttpMultiStep revocation for LaunchDarkly, Resend, Linode, and Netlify (2 rules). Built-in revocation coverage is now 34 provider families with 53 revocation-enabled rules. - Expanded Alibaba Cloud coverage with STS temporary credential detection for STS access key IDs, STS security tokens, and STS access key secrets. Built-in rule coverage is now 923 rules total. diff --git a/src/decompress.rs b/src/decompress.rs index 5545581..2d13c40 100644 --- a/src/decompress.rs +++ b/src/decompress.rs @@ -7,7 +7,8 @@ use std::{ use anyhow::Result; use asar::AsarReader; use bzip2_rs::DecoderReader; -use flate2::read::{GzDecoder, ZlibDecoder}; +use cfb::CompoundFile; +use flate2::read::{DeflateDecoder, GzDecoder, ZlibDecoder}; use lzma_rs::xz_decompress; use memmap2::Mmap; use tar::Archive; @@ -19,7 +20,7 @@ use zip::ZipArchive; pub const ZIP_BASED_FORMATS: &[&str] = &[ "zip", "zipx", "jar", "war", "ear", "aar", "jmod", "jhm", "jnlp", "nupkg", "vsix", "xap", "docx", "xlsx", "pptx", "odt", "ods", "odp", "odg", "odf", "epub", "gadget", "kmz", "widget", - "xpi", "sketch", "pages", "key", "numbers", + "xpi", "sketch", "pages", "key", "numbers", "hwpx", ]; /// Break `..` into `(Some(outer), Some(inner))`. @@ -160,6 +161,61 @@ fn handle_zip_archive_streaming( Ok(CompressedContent::ArchiveFiles(entries_on_disk)) } +/// Extract streams from an HWP (Hancom Word Processor) file. +/// +/// HWP 5.x uses the Microsoft Compound File Binary (OLE2/CFBF) container. +/// Body streams (e.g. `BodyText/Section*`) are typically raw DEFLATE +/// without a zlib header, others may be zlib-framed, and metadata +/// streams are plaintext UTF-16/ASCII. We try DEFLATE then zlib, and +/// fall back to the raw bytes so the scanner always sees content. +fn handle_hwp_archive_in_memory(path: &Path, archive_path: &Path) -> Result { + let file = safe_open_for_read(path)?; + let mut cf = CompoundFile::open(file)?; + let stream_paths: Vec = + cf.walk().filter(|e| e.is_stream()).map(|e| e.path().to_path_buf()).collect(); + + let mut out = Vec::with_capacity(stream_paths.len()); + for sp in stream_paths { + let mut raw = Vec::new(); + match cf.open_stream(&sp) { + Ok(mut s) => { + if let Err(e) = s.read_to_end(&mut raw) { + tracing::debug!("failed to read hwp stream {}: {}", sp.display(), e); + continue; + } + } + Err(e) => { + tracing::debug!("failed to open hwp stream {}: {}", sp.display(), e); + continue; + } + } + + let decoded = { + let mut buf = Vec::new(); + if !raw.is_empty() + && DeflateDecoder::new(&raw[..]).read_to_end(&mut buf).is_ok() + && !buf.is_empty() + { + buf + } else { + buf.clear(); + if !raw.is_empty() + && ZlibDecoder::new(&raw[..]).read_to_end(&mut buf).is_ok() + && !buf.is_empty() + { + buf + } else { + raw + } + } + }; + + let logical = format!("{}!{}", archive_path.display(), sp.display()); + out.push((logical, decoded)); + } + Ok(CompressedContent::Archive(out)) +} + fn handle_asar_archive_in_memory(buffer: &[u8], archive_path: &Path) -> Result { match AsarReader::new(buffer, None) { Ok(reader) => { @@ -219,6 +275,17 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result { + return handle_hwp_archive_in_memory(path, path); + } + "egg" => { + // No open-source EGG (ALZip) extractor exists. Return the + // raw bytes so plaintext content inside the container is + // still scanned. + let mut buffer = Vec::new(); + file.read_to_end(&mut buffer)?; + return Ok(CompressedContent::Raw(buffer)); + } "tar" => { if let Some(base) = base_dir { return handle_tar_archive_streaming(&mut file, path, base); @@ -584,4 +651,120 @@ mod tests { Ok(()) } + + #[test] + fn smoke_decompress_hwpx_archive() -> anyhow::Result<()> { + let dir = tempdir()?; + let hwpx_path = dir.path().join("document.hwpx"); + let github_pat = "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"; // this is not a real secret + + { + let file = File::create(&hwpx_path)?; + let mut zip = ZipWriter::new(file); + let options = SimpleFileOptions::default() + .compression_method(CompressionMethod::Deflated) + .unix_permissions(0o644); + + zip.start_file("Contents/section0.xml", options)?; + zip.write_all( + format!("token={github_pat}").as_bytes(), + )?; + zip.finish()?; + } + + let tmp = tempdir()?; + let content = decompress_once(&hwpx_path, Some(tmp.path()))?; + if let CompressedContent::ArchiveFiles(files) = content { + let mut found = false; + for (logical, path) in files { + if logical.ends_with("!Contents/section0.xml") { + let txt = std::fs::read_to_string(&path)?; + assert!(txt.contains(github_pat)); + found = true; + } + } + assert!(found, "did not find Contents/section0.xml in hwpx ArchiveFiles"); + } else { + panic!("expected ArchiveFiles for hwpx archive, got {:?}", content); + } + + Ok(()) + } + + #[test] + fn smoke_decompress_hwp_archive() -> anyhow::Result<()> { + use cfb::CompoundFile; + use flate2::{Compression, write::ZlibEncoder}; + + let dir = tempdir()?; + let hwp_path = dir.path().join("document.hwp"); + let github_pat = "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"; // this is not a real secret + + // Build a minimal CFB with two streams: one plaintext, one zlib-framed. + { + let file = File::create(&hwp_path)?; + let mut cf = CompoundFile::create(file)?; + cf.create_storage("/BodyText")?; + + let mut s_plain = cf.create_stream("/DocInfo")?; + s_plain.write_all(format!("metadata token={github_pat}").as_bytes())?; + drop(s_plain); + + let mut zencoder = ZlibEncoder::new(Vec::new(), Compression::default()); + zencoder.write_all(format!("body token={github_pat}").as_bytes())?; + let zbytes = zencoder.finish()?; + let mut s_body = cf.create_stream("/BodyText/Section0")?; + s_body.write_all(&zbytes)?; + drop(s_body); + + cf.flush()?; + } + + let content = decompress_once(&hwp_path, None)?; + if let CompressedContent::Archive(entries) = content { + let mut saw_plain = false; + let mut saw_body = false; + for (logical, bytes) in &entries { + let as_str = String::from_utf8_lossy(bytes); + if logical.contains("DocInfo") && as_str.contains(github_pat) { + saw_plain = true; + } + if logical.contains("Section0") && as_str.contains(github_pat) { + saw_body = true; + } + } + assert!(saw_plain, "plaintext DocInfo stream missing or not decoded"); + assert!(saw_body, "zlib-framed BodyText/Section0 stream missing or not decoded"); + } else { + panic!("expected Archive for hwp, got {:?}", content); + } + + Ok(()) + } + + #[test] + fn smoke_decompress_egg_raw() -> anyhow::Result<()> { + let dir = tempdir()?; + let egg_path = dir.path().join("archive.egg"); + let github_pat = "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"; // this is not a real secret + + { + let mut f = File::create(&egg_path)?; + f.write_all(format!("EGG-pretend-header\ntoken={github_pat}\n").as_bytes())?; + } + + let content = decompress_once(&egg_path, None)?; + match content { + CompressedContent::Raw(bytes) => { + let as_str = String::from_utf8_lossy(&bytes); + assert!( + as_str.contains(github_pat), + "raw egg bytes did not contain the embedded pat" + ); + } + other => panic!("expected Raw for egg, got {:?}", other), + } + + Ok(()) + } } diff --git a/src/matcher/mod.rs b/src/matcher/mod.rs index 30146da..cada686 100644 --- a/src/matcher/mod.rs +++ b/src/matcher/mod.rs @@ -27,7 +27,7 @@ use crate::{ parser::Language, rule_profiling::{ConcurrentRuleProfiler, RuleStats}, rules::rule::Rule, - rules_database::{RuleDetectionProfileKind, RuleMatchProfile, RulesDatabase}, + rules_database::RulesDatabase, scanner_pool::ScannerPool, validation_body::ValidationResponseBody, }; @@ -403,7 +403,8 @@ impl<'a> Matcher<'a> { } } } - maybe_apply_context_verification( + + maybe_apply_markup_context_gate( self.rules_db, blob, lang_hint, @@ -411,6 +412,7 @@ impl<'a> Matcher<'a> { &mut matches, &match_rule_indices, ); + // Finalize if !no_dedup && !matches.is_empty() { let blob_id = blob.id(); @@ -436,7 +438,22 @@ impl<'a> Matcher<'a> { } } -fn maybe_apply_context_verification<'a>( +/// Apply parser-based context verification only for HTML and CSS blobs. +/// +/// HTML and CSS are the one regime where regex can't easily express +/// "this capture is in a real value position" — attribute values, CSS +/// property values, and nested script/style content need structural +/// understanding. For every other language (and for blobs without a +/// language hint, e.g. logs, binaries), this function is a no-op. +/// +/// Self-identifying rules (matched by literal shape — `GHP_`, `AIzaSy`, +/// `xox[pbarose]`, PEM envelopes, Slack webhook URLs, etc.) bypass the +/// gate even in HTML/CSS so plain-prose leaks are still caught. +/// +/// The gate is subtractive only when the parser actually runs and rejects +/// a match. If the parser is unavailable (too-large blob, parser error), +/// all matches are kept — never silently dropped. +fn maybe_apply_markup_context_gate<'a>( rules_db: &RulesDatabase, blob: &'a Blob, lang_hint: Option<&str>, @@ -447,8 +464,17 @@ fn maybe_apply_context_verification<'a>( if matches.is_empty() { return; } + if !should_attempt_context_verification(blob_len) { + return; + } + let Some(hint) = lang_hint else { + return; + }; + let language = match Language::from_hint(hint) { + Some(lang @ (Language::Html | Language::Css)) => lang, + _ => return, + }; - let profiles = rules_db.rule_match_profiles(); let candidate_indices: Vec = matches .iter() .enumerate() @@ -456,10 +482,10 @@ fn maybe_apply_context_verification<'a>( if m.is_base64 { return false; } - let Some(rule_idx) = match_rule_indices.get(*idx) else { - return false; - }; - profiles[*rule_idx].kind == RuleDetectionProfileKind::ContextDependent + match match_rule_indices.get(*idx) { + Some(rule_idx) => !rules_db.is_rule_self_identifying(*rule_idx), + None => false, + } }) .map(|(idx, _)| idx) .collect(); @@ -468,25 +494,6 @@ fn maybe_apply_context_verification<'a>( return; } - let mut keep = vec![true; matches.len()]; - let mandatory_candidate_indices: Vec = candidate_indices - .iter() - .copied() - .filter(|idx| { - let Some(rule_idx) = match_rule_indices.get(*idx) else { - return false; - }; - should_require_context_verification(&profiles[*rule_idx]) - }) - .collect(); - let Some(language) = load_context_verifier_language(lang_hint, blob_len) else { - for idx in mandatory_candidate_indices { - keep[idx] = false; - } - filter_kept_matches(matches, &keep); - return; - }; - let mut remaining = candidate_indices.clone(); let verification = parser::stream_context_candidates(blob.bytes(), &language, |text| { remaining.retain(|idx| { @@ -501,25 +508,18 @@ fn maybe_apply_context_verification<'a>( }); if let Err(e) = verification { - debug!("context verification unavailable: {e}"); - remaining = mandatory_candidate_indices; + debug!("HTML/CSS context verification unavailable: {e}"); + return; } + if remaining.is_empty() { + return; + } + + let mut keep = vec![true; matches.len()]; for idx in remaining { keep[idx] = false; } - - filter_kept_matches(matches, &keep); -} - -fn should_require_context_verification(profile: &RuleMatchProfile) -> bool { - profile.reason_codes.contains(&"strict_contextual_shape") -} - -fn filter_kept_matches<'a>(matches: &mut Vec>, keep: &[bool]) { - if keep.iter().all(|k| *k) { - return; - } let mut filtered = Vec::with_capacity(matches.len()); for (idx, item) in std::mem::take(matches).into_iter().enumerate() { if keep[idx] { @@ -529,14 +529,6 @@ fn filter_kept_matches<'a>(matches: &mut Vec>, keep: &[bool]) { *matches = filtered; } -fn load_context_verifier_language(lang_hint: Option<&str>, blob_len: usize) -> Option { - if !should_attempt_context_verification(blob_len) { - return None; - } - let lang = lang_hint?; - Language::from_hint(lang) -} - fn verify_match_in_context_text( re: ®ex::bytes::Regex, expected_secret: &[u8], @@ -1174,15 +1166,12 @@ line2 ScanResult::New(matches) => matches, _ => panic!("unexpected scan result"), }; - assert!( - found.is_empty(), - "comment-only contextual hits should be suppressed when parser-based verification cannot confirm assignment context" - ); + assert_eq!(found.len(), 1, "raw regex matches should remain findings without classifier gating"); Ok(()) } #[test] - fn strict_context_rule_suppresses_raw_when_context_verification_is_unavailable() -> Result<()> { + fn strict_context_rule_survives_without_classifier_gating() -> Result<()> { let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234"; let rule = Rule::new(RuleSyntax { id: "kingfisher.auth0.2".into(), @@ -1215,10 +1204,7 @@ line2 ScanResult::New(matches) => matches, _ => panic!("unexpected scan result"), }; - assert!( - found.is_empty(), - "strict contextual rules should be suppressed when parser-based verification cannot run" - ); + assert_eq!(found.len(), 1, "strict contextual rules should still be reported without classifier gating"); Ok(()) } @@ -1259,7 +1245,7 @@ line2 assert_eq!( found.len(), 1, - "assignment-style contextual rules should still scan raw text when parser context is unavailable" + "assignment-style contextual rules should still scan raw text without classifier gating" ); Ok(()) } @@ -1306,7 +1292,7 @@ line2 assert_eq!( found.len(), 1, - "depends_on assignment-style rules should still scan raw text when parser context is unavailable" + "depends_on assignment-style rules should still scan raw text without classifier gating" ); Ok(()) } @@ -1386,7 +1372,173 @@ line2 assert_eq!( found.len(), 1, - "self-identifying token families should not require parser context" + "self-identifying token families should still be reported without classifier gating" + ); + Ok(()) + } + + fn generic_auth0_rule() -> Rule { + Rule::new(RuleSyntax { + id: "kingfisher.auth0.2".into(), + name: "auth0 secret".into(), + pattern: "(?xi)\\bauth0(?:.|[\\n\\r]){0,16}?(?:secret|token)(?:.|[\\n\\r]){0,64}?\\b([a-z0-9_-]{64,})\\b".into(), + confidence: crate::rules::rule::Confidence::Medium, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + revocation: None, + depends_on_rule: vec![], + pattern_requirements: None, + tls_mode: None, + }) + } + + #[test] + fn html_gate_drops_generic_contextual_match_outside_value_position() -> Result<()> { + let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234"; + let rules_db = RulesDatabase::from_rules(vec![generic_auth0_rule()])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone()))); + let mut matcher = + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; + + let body = format!( + "" + ); + let blob = Blob::from_bytes(body.into_bytes()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("page.html"))); + + let found = match matcher.scan_blob( + &blob, + &origin, + Some("html".to_string()), + false, + false, + false, + )? { + ScanResult::New(matches) => matches, + _ => panic!("unexpected scan result"), + }; + assert!( + found.is_empty(), + "HTML gate should drop generic contextual hits that sit outside any value position" + ); + Ok(()) + } + + #[test] + fn html_gate_keeps_generic_contextual_match_inside_script_assignment() -> Result<()> { + let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234"; + let rules_db = RulesDatabase::from_rules(vec![generic_auth0_rule()])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone()))); + let mut matcher = + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; + + let body = format!( + "" + ); + let blob = Blob::from_bytes(body.into_bytes()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("app.html"))); + + let found = match matcher.scan_blob( + &blob, + &origin, + Some("html".to_string()), + false, + false, + false, + )? { + ScanResult::New(matches) => matches, + _ => panic!("unexpected scan result"), + }; + assert_eq!( + found.len(), + 1, + "HTML gate should keep generic contextual hits that appear inside a script assignment" + ); + Ok(()) + } + + #[test] + fn html_gate_does_not_affect_self_identifying_rule_in_prose() -> Result<()> { + let rule = Rule::new(RuleSyntax { + id: "kingfisher.google.7".into(), + name: "google api key".into(), + pattern: "(?xi)\\b(AIzaSy[A-Za-z0-9_-]{33})".into(), + confidence: crate::rules::rule::Confidence::Medium, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None::, + revocation: None, + depends_on_rule: vec![], + pattern_requirements: None, + tls_mode: None, + }); + let rules_db = RulesDatabase::from_rules(vec![rule])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone()))); + let mut matcher = + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; + + let body = "

Key: AIzaSyBUPHAjZl3n8Eza66ka6B78iVyPteC5MgM

" + .to_string(); + let blob = Blob::from_bytes(body.into_bytes()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("docs.html"))); + + let found = match matcher.scan_blob( + &blob, + &origin, + Some("html".to_string()), + false, + false, + false, + )? { + ScanResult::New(matches) => matches, + _ => panic!("unexpected scan result"), + }; + assert_eq!( + found.len(), + 1, + "self-identifying rules must bypass the HTML gate so prose leaks still fire" + ); + Ok(()) + } + + #[test] + fn html_gate_does_not_trigger_for_other_languages() -> Result<()> { + let token = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234"; + let rules_db = RulesDatabase::from_rules(vec![generic_auth0_rule()])?; + let seen = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vectorscan_db().clone()))); + let mut matcher = + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; + + let body = format!("# auth0 secret {token}"); + let blob = Blob::from_bytes(body.into_bytes()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("notes.py"))); + + let found = match matcher.scan_blob( + &blob, + &origin, + Some("python".to_string()), + false, + false, + false, + )? { + ScanResult::New(matches) => matches, + _ => panic!("unexpected scan result"), + }; + assert_eq!( + found.len(), + 1, + "non-HTML/CSS blobs must bypass the gate even when parser hint is available" ); Ok(()) } diff --git a/src/rules_database.rs b/src/rules_database.rs index 73db985..e0c3a26 100644 --- a/src/rules_database.rs +++ b/src/rules_database.rs @@ -2,6 +2,4 @@ //! //! This module re-exports types from [`kingfisher_rules::rules_database`]. -pub use kingfisher_rules::rules_database::{ - RuleDetectionProfileKind, RuleMatchProfile, RulesDatabase, format_regex_pattern, -}; +pub use kingfisher_rules::rules_database::{RulesDatabase, format_regex_pattern}; diff --git a/src/scanner/util.rs b/src/scanner/util.rs index 649b58e..ac33c9c 100644 --- a/src/scanner/util.rs +++ b/src/scanner/util.rs @@ -25,7 +25,9 @@ pub fn is_compressed_file(path: &Path) -> bool { || ext_lower == "tar" || ext_lower == "zlib" || ext_lower == "asar" - || ZIP_BASED_FORMATS.iter().any(|z| *z == ext) + || ext_lower == "hwp" + || ext_lower == "egg" + || ZIP_BASED_FORMATS.iter().any(|z| *z == ext_lower) } else { false }