From 23102f4b5977249455afffdddc20d038a7b42d06 Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Tue, 2 Sep 2025 13:59:01 -0700
Subject: [PATCH 01/12] Improved error message when self-update cannot find the
current binary
---
CHANGELOG.md | 3 +++
src/update.rs | 33 +++++++++++++++++++++++----------
2 files changed, 26 insertions(+), 10 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6e57853..fda4998 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,9 @@
All notable changes to this project will be documented in this file.
+## [Unreleased]
+- Improved error message when self-update cannot find the current binary.
+
## [1.47.0]
- MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them
- Improved rules: github oauth2, diffbot, mailchimp, aws
diff --git a/src/update.rs b/src/update.rs
index 76629be..3ab5208 100644
--- a/src/update.rs
+++ b/src/update.rs
@@ -129,16 +129,29 @@ pub fn check_for_update(global_args: &GlobalArgs, base_url: Option<&str>) -> Opt
.apply_to(&format!("Updated to version {}", status.version()))
),
Err(e) => match e {
- UpdError::Io(ref io_err) if io_err.kind() == ErrorKind::PermissionDenied => {
- warn!(
- "{}",
- styles.style_finding_active_heading.apply_to(
- "Cannot replace the current binary - permission denied.\n\
- If you installed via a package manager, run its upgrade command.\n\
- Otherwise reinstall to a user-writable directory or re-run with sudo."
- )
- );
- }
+ UpdError::Io(ref io_err) => match io_err.kind() {
+ ErrorKind::PermissionDenied => {
+ warn!(
+ "{}",
+ styles.style_finding_active_heading.apply_to(
+ "Cannot replace the current binary - permission denied.\n\
+ If you installed via a package manager, run its upgrade command.\n\
+ Otherwise reinstall to a user-writable directory or re-run with sudo."
+ )
+ );
+ }
+ ErrorKind::NotFound => {
+ warn!(
+ "{}",
+ styles.style_finding_active_heading.apply_to(
+ "Cannot replace the current binary - file not found.\n\
+ If you installed via a package manager, run its upgrade command.\n\
+ Otherwise reinstall to a user-writable directory."
+ )
+ );
+ }
+ _ => error!("Failed to update: {e}"),
+ },
_ => error!("Failed to update: {e}"),
},
}
From c3513ea2067eec283293f7edb4eea8b49d8e9cd8 Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Tue, 2 Sep 2025 19:54:44 -0700
Subject: [PATCH 02/12] Optimized memory usage via string interning and
extensive data sharing
---
CHANGELOG.md | 3 ++-
src/content_type.rs | 3 +--
src/decompress.rs | 8 +++---
src/findings_store.rs | 17 ++++++------
src/matcher.rs | 40 ++++++++++------------------
src/reporter.rs | 8 +++---
src/reporter/json_format.rs | 43 ++++++++++++++++--------------
src/scanner/docker.rs | 10 ++++---
src/scanner/validation.rs | 53 +++++++++++--------------------------
src/validation.rs | 6 +++--
src/validation/utils.rs | 21 ++++++++-------
tests/fingerprint_dedup.rs | 27 ++++++++++++++-----
12 files changed, 114 insertions(+), 125 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index fda4998..2eae1b7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,8 @@
All notable changes to this project will be documented in this file.
## [Unreleased]
-- Improved error message when self-update cannot find the current binary.
+- Improved error message when self-update cannot find the current binary
+- Optimized memory usage via string interning and extensive data sharing
## [1.47.0]
- MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them
diff --git a/src/content_type.rs b/src/content_type.rs
index 197258d..ee44da6 100644
--- a/src/content_type.rs
+++ b/src/content_type.rs
@@ -96,9 +96,8 @@ impl ContentInspector {
#[inline]
#[must_use]
pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
- String::from_utf8(bytes.to_vec()).ok().map(|_| "UTF-8".to_string())
+ std::str::from_utf8(bytes).ok().map(|_| "UTF-8".to_string())
}
-
/// Guess programming language with broad coverage using `tokei`.
///
/// Strategy (no disk I/O):
diff --git a/src/decompress.rs b/src/decompress.rs
index 49dc7ef..09f327f 100644
--- a/src/decompress.rs
+++ b/src/decompress.rs
@@ -162,10 +162,10 @@ fn handle_zip_archive_streaming(
}
fn handle_asar_archive_in_memory(
- buffer: Vec<u8>,
+ buffer: &[u8],
archive_path: &Path,
) -> Result {
- match AsarReader::new(&buffer, None) {
+ match AsarReader::new(buffer, None) {
Ok(reader) => {
let mut contents = Vec::new();
for (path_in_asar, file) in reader.files() {
@@ -200,7 +200,7 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result {
let mmap = unsafe { Mmap::map(&file)? };
- return handle_asar_archive_in_memory(mmap.to_vec(), path);
+ return handle_asar_archive_in_memory(&mmap, path);
}
"tar" => {
if let Some(base) = base_dir {
@@ -525,4 +525,4 @@ mod tests {
Ok(())
}
-}
+}
\ No newline at end of file
diff --git a/src/findings_store.rs b/src/findings_store.rs
index fc610f2..23d8c7b 100644
--- a/src/findings_store.rs
+++ b/src/findings_store.rs
@@ -18,6 +18,7 @@ use crate::{
matcher::Match,
origin::{Origin, OriginSet},
rules::rule::Rule,
+ util::intern,
};
// share with Arc so every blob/origin is materialised once
@@ -151,7 +152,7 @@ impl FindingsStore {
.captures
.get(1)
.or_else(|| m.groups.captures.get(0))
- .map_or("", |c| c.value.as_ref());
+ .map_or("", |c| c.value);
let origin_kind = match origin.first() {
Origin::GitRepo(_) => "git",
@@ -160,7 +161,7 @@ impl FindingsStore {
};
let key = xxh3_64(
- format!("{}|{}|{}", m.rule_text_id.to_uppercase(), origin_kind, snippet)
+ format!("{}|{}|{}", m.rule.id().to_uppercase(), origin_kind, snippet)
.as_bytes(),
);
@@ -280,7 +281,7 @@ impl FindingsStore {
pub fn get_summary(&self) -> FxHashMap<&'static str, usize> {
self.matches.iter().fold(FxHashMap::default(), |mut acc, msg| {
let (_, _, m) = &**msg;
- *acc.entry(m.rule_name).or_insert(0) += 1; // borrow, no alloc
+ *acc.entry(intern(m.rule.name())).or_insert(0) += 1;
acc
})
}
@@ -342,13 +343,13 @@ impl FindingsStore {
self.matches.iter().map(|msg| {
let (_, _, match_item) = &**msg;
finding_data::FindingMetadata {
- rule_name: match_item.rule_name.to_string(),
+ rule_name: match_item.rule.name().to_string(),
num_matches: 1,
comment: None,
visible: match_item.visible,
finding_id: match_item.finding_id(),
- rule_finding_fingerprint: match_item.rule_finding_fingerprint.to_string(),
- rule_text_id: match_item.rule_text_id.to_string(),
+ rule_finding_fingerprint: match_item.rule.finding_sha1_fingerprint().to_string(),
+ rule_text_id: match_item.rule.id().to_string(),
}
})
}
@@ -362,7 +363,7 @@ impl FindingsStore {
.iter()
.filter(|msg| {
let (_, _, match_item) = &***msg;
- match_item.rule_name == metadata.rule_name
+ match_item.rule.name() == metadata.rule_name
})
.map(|msg| {
let (origin, blob_metadata, match_item) = &**msg;
@@ -373,7 +374,7 @@ impl FindingsStore {
match_id: MatchIdInt::from_str(&match_item.finding_id())?,
match_comment: None,
visible: match_item.visible,
- match_confidence: match_item.rule_confidence,
+ match_confidence: match_item.rule.confidence(),
validation_response_body: match_item.validation_response_body.clone(),
validation_response_status: match_item.validation_response_status,
validation_success: match_item.validation_success,
diff --git a/src/matcher.rs b/src/matcher.rs
index 1cbb16f..0ecc7ec 100644
--- a/src/matcher.rs
+++ b/src/matcher.rs
@@ -682,20 +682,20 @@ impl JsonSchema for Groups {
// pub end: usize, // End position of the match
// pub value: String, // The actual captured value
// }
-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCapture {
pub name: Option<String>,
pub match_number: i32,
pub start: usize,
pub end: usize,
- // Instead of storing an owned String, store a borrowed or interned value.
- // Here we use Cow to allow either borrowing or owning as needed.
- pub value: std::borrow::Cow<'static, str>,
+ /// Interned value of the capture.
+ pub value: &'static str,
}
-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[derive(Debug, Clone, Serialize, JsonSchema)]
pub struct SerializableCaptures {
- pub captures: Vec<SerializableCapture>, // All captures (named and unnamed)
+ #[schemars(with = "Vec<SerializableCapture>")]
+ pub captures: SmallVec<[SerializableCapture; 2]>, // All captures (named and unnamed)
}
impl SerializableCaptures {
pub fn from_captures(
@@ -704,7 +704,7 @@ impl SerializableCaptures {
re: &Regex,
redact: bool,
) -> Self {
- let mut serialized_captures = Vec::new();
+ let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new();
// Process named captures
for name in re.capture_names().flatten() {
if let Some(capture) = captures.name(name) {
@@ -718,7 +718,7 @@ impl SerializableCaptures {
match_number: -1,
start: capture.start(),
end: capture.end(),
- value: value.into(),
+ value: intern(&value),
});
}
}
@@ -735,7 +735,7 @@ impl SerializableCaptures {
match_number: i32::try_from(i).unwrap_or(0),
start: capture.start(),
end: capture.end(),
- value: value.into(),
+ value: intern(&value),
});
}
}
@@ -764,16 +764,9 @@ pub struct Match {
pub finding_fingerprint: u64,
/// The rule that produced this match
- pub rule_finding_fingerprint: &'static str,
-
- /// The text identifier of the rule that produced this match
- pub rule_text_id: &'static str,
-
- /// The name of the rule that produced this match
- pub rule_name: &'static str,
-
- /// The confidence property of the rule that produced this match
- pub rule_confidence: crate::rules::rule::Confidence,
+ #[serde(skip_serializing)]
+ #[schemars(skip)]
+ pub rule: Arc<Rule>,
/// Validation Body
pub validation_response_body: String,
@@ -813,8 +806,6 @@ impl Match {
let finding_value_for_fp = std::str::from_utf8(matching_finding_bytes).unwrap_or("");
let source_span = loc_mapping.get_source_span(&offset_span);
- let rule_finding_fingerprint = owned_blob_match.rule.finding_sha1_fingerprint().to_owned();
-
let offset_start: u64 =
owned_blob_match.matching_input_offset_span.start.try_into().unwrap();
let offset_end: u64 = owned_blob_match.matching_input_offset_span.end.try_into().unwrap();
@@ -828,10 +819,7 @@ impl Match {
// matching_snippet
Match {
- rule_finding_fingerprint: intern(&rule_finding_fingerprint),
- rule_name: intern(owned_blob_match.rule.name()),
- rule_confidence: owned_blob_match.rule.confidence(),
- rule_text_id: intern(owned_blob_match.rule.id()),
+ rule: owned_blob_match.rule.clone(),
visible: owned_blob_match.rule.visible().to_owned(),
location: Location { offset_span, source_span: source_span.clone() },
groups: owned_blob_match.captures.clone(),
@@ -852,7 +840,7 @@ impl Match {
pub fn finding_id(&self) -> String {
let mut h = Sha1::new();
- write!(&mut h, "{}\0", self.rule_finding_fingerprint)
+ write!(&mut h, "{}\0", self.rule.finding_sha1_fingerprint())
.expect("should be able to write to memory");
serde_json::to_writer(&mut h, &self.groups)
.expect("should be able to serialize groups as JSON");
diff --git a/src/reporter.rs b/src/reporter.rs
index c9c2a62..dacf75b 100644
--- a/src/reporter.rs
+++ b/src/reporter.rs
@@ -216,7 +216,7 @@ impl DetailsReporter {
m: match_item.clone(),
comment: None,
visible: match_item.visible,
- match_confidence: match_item.rule_confidence,
+ match_confidence: match_item.rule.confidence(),
validation_response_body: match_item.validation_response_body.clone(),
validation_response_status: match_item.validation_response_status,
validation_success: match_item.validation_success,
@@ -366,13 +366,13 @@ impl DetailsReporter {
FindingReporterRecord {
rule: RuleMetadata {
- name: rm.m.rule_name.to_string(),
- id: rm.m.rule_text_id.to_string(),
+ name: rm.m.rule.name().to_string(),
+ id: rm.m.rule.id().to_string(),
},
finding: FindingRecordData {
snippet,
fingerprint: rm.m.finding_fingerprint.to_string(),
- confidence: rm.match_confidence.to_string(),
+ confidence: rm.m.rule.confidence().to_string(),
entropy: format!("{:.2}", rm.m.calculated_entropy),
validation: ValidationInfo { status: validation_status, response: response_body },
language: rm
diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs
index 0b562d2..7cf90c8 100644
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@@ -35,6 +35,7 @@ mod tests {
use crate::cli::commands::github::GitHistoryMode;
use crate::cli::commands::rules::RuleSpecifierArgs;
use crate::matcher::{SerializableCapture, SerializableCaptures};
+ use crate::rules::rule::{Rule, RuleSyntax, Confidence};
use crate::util::intern;
use crate::{
blob::BlobId,
@@ -54,6 +55,7 @@ mod tests {
path::PathBuf,
sync::{Arc, Mutex},
};
+ use smallvec::smallvec;
use url::Url;
fn create_default_args() -> cli::commands::scan::ScanArgs {
use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope
@@ -132,12 +134,21 @@ mod tests {
}
}
- fn create_mock_match(
- rule_name: &str,
- rule_text_id: &str,
- rule_finding_fingerprint: &str,
- validation_success: bool,
- ) -> Match {
+ fn create_mock_match(rule_name: &str, rule_text_id: &str, validation_success: bool) -> Match {
+ let syntax = RuleSyntax {
+ name: rule_name.to_string(),
+ id: rule_text_id.to_string(),
+ pattern: "dummy".to_string(),
+ min_entropy: 0.0,
+ confidence: Confidence::Medium,
+ visible: true,
+ examples: vec![],
+ negative_examples: vec![],
+ references: vec![],
+ validation: None,
+ depends_on_rule: vec![],
+ };
+ let rule = Arc::new(Rule::new(syntax));
Match {
location: Location {
offset_span: OffsetSpan { start: 10, end: 20 },
@@ -147,20 +158,17 @@ mod tests {
},
},
groups: SerializableCaptures {
- captures: vec![SerializableCapture {
+ captures: smallvec![SerializableCapture {
name: Some("token".to_string()),
match_number: 1,
start: 10,
end: 20,
- value: "mock_token".into(),
+ value: intern("mock_token"),
}],
},
blob_id: BlobId::new(b"mock_blob"),
finding_fingerprint: 0123,
- rule_finding_fingerprint: intern(rule_finding_fingerprint),
- rule_text_id: intern(rule_text_id),
- rule_name: intern(rule_name),
- rule_confidence: Confidence::Medium,
+ rule,
validation_response_body: "validation response".to_string(),
validation_response_status: 200,
validation_success,
@@ -204,8 +212,7 @@ mod tests {
#[test]
fn test_json_format() -> Result<()> {
- let mock_match =
- create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
+ let mock_match = create_mock_match("MockRule", "mock_rule_1", true);
let matches = vec![ReportMatch {
origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]),
blob_metadata: BlobMetadata {
@@ -238,12 +245,8 @@ mod tests {
fn test_validation_status_in_json() -> Result<()> {
let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")];
for (validation_success, expected_status) in test_cases {
- let mock_match = create_mock_match(
- "MockRule",
- "mock_rule_1",
- "mock_finding_fingerprint",
- validation_success,
- );
+ let mock_match =
+ create_mock_match("MockRule", "mock_rule_1", validation_success);
let matches = vec![ReportMatch {
origin: OriginSet::new(
Origin::from_file(PathBuf::from("/mock/path/file.rs")),
diff --git a/src/scanner/docker.rs b/src/scanner/docker.rs
index b427a3b..405a553 100644
--- a/src/scanner/docker.rs
+++ b/src/scanner/docker.rs
@@ -175,9 +175,11 @@ impl Docker {
pb.set_length(layer_paths.len() as u64);
for p in layer_paths {
- let mut data = Vec::new();
- File::open(&p)?.read_to_end(&mut data)?;
- let digest = format!("{:x}", Sha256::digest(&data));
+ let mut file = File::open(&p)?;
+ let mut hasher = Sha256::new();
+ std::io::copy(&mut file, &mut hasher)?;
+ let digest = format!("{:x}", hasher.finalize());
+
let new_path = out_dir.join(format!("layer_{digest}.tar"));
std::fs::rename(&p, &new_path)?;
// extract layer contents so inner filenames appear in scan results
@@ -280,4 +282,4 @@ mod tests {
fn docker_struct_new() {
let _ = Docker::new();
}
-}
+}
\ No newline at end of file
diff --git a/src/scanner/validation.rs b/src/scanner/validation.rs
index 05865b2..f01872a 100644
--- a/src/scanner/validation.rs
+++ b/src/scanner/validation.rs
@@ -13,7 +13,7 @@ use futures::{stream, StreamExt};
use indicatif::{ProgressBar, ProgressStyle};
use liquid::Parser;
use reqwest::{Client, StatusCode};
-use rustc_hash::{FxHashMap, FxHashSet};
+use rustc_hash::FxHashMap;
use tokio::{sync::Notify, time::timeout};
use crate::{
@@ -21,7 +21,6 @@ use crate::{
findings_store::{FindingsStore, FindingsStoreMessage},
location::OffsetSpan,
matcher::{Match, OwnedBlobMatch},
- rules::rule,
validation::{collect_variables_and_dependencies, validate_single_match, CachedResponse},
};
@@ -40,7 +39,7 @@ pub async fn run_secret_validation(
let fail_count = Arc::new(AtomicUsize::new(0));
// ── 2. Fetch rules + matches ────────────────────────────────────────────
- let (all_rules, all_matches_by_blob) = {
+ let (_all_rules, all_matches_by_blob) = {
let ds = datastore.lock().unwrap();
let rules = ds.get_rules()?;
let mut map: FxHashMap<BlobId, Vec<Arc<FindingsStoreMessage>>> = FxHashMap::default();
@@ -51,16 +50,13 @@ pub async fn run_secret_validation(
};
// ── 3. Partition blobs ──────────────────────────────────────────────────
- let rules_with_deps: FxHashSet<&str> = all_rules
- .iter()
- .filter(|r| !r.syntax().depends_on_rule.is_empty())
- .map(|r| r.id())
- .collect();
-
let mut simple_matches = Vec::new();
let mut dependent_blobs = FxHashMap::default(); // blob_id -- Vec>
for (blob_id, matches) in all_matches_by_blob {
- if matches.iter().any(|m| rules_with_deps.contains(m.2.rule_text_id)) {
+ if matches
+ .iter()
+ .any(|m| !m.2.rule.syntax().depends_on_rule.is_empty())
+ {
dependent_blobs.insert(blob_id, matches);
} else {
simple_matches.extend(matches);
@@ -80,9 +76,9 @@ pub async fn run_secret_validation(
.captures
.get(1)
.or_else(|| arc_msg.2.groups.captures.get(0))
- .map_or("", |c| c.value.as_ref());
+ .map_or("", |c| c.value);
groups
- .entry(format!("{}|{}", arc_msg.2.rule_text_id, secret))
+ .entry(format!("{}|{}", arc_msg.2.rule.id(), secret))
.or_default()
.push(arc_msg);
}
@@ -109,7 +105,6 @@ pub async fn run_secret_validation(
let client = client.clone();
let cache_glob = cache.clone();
let val_res = &validation_results;
- let rules = &all_rules;
let success = success_count.clone();
let fail = fail_count.clone();
// *** FIX: Clone the progress bar for each concurrent task ***
@@ -122,8 +117,8 @@ pub async fn run_secret_validation(
.captures
.get(1)
.or_else(|| rep_arc.2.groups.captures.get(0))
- .map_or("", |c| c.value.as_ref());
- let key = format!("{}|{}", rep_arc.2.rule_text_id, secret);
+ .map_or("", |c| c.value);
+ let key = format!("{}|{}", rep_arc.2.rule.id(), secret);
match val_res.entry(key.clone()) {
dashmap::mapref::entry::Entry::Occupied(_) => return,
@@ -138,8 +133,8 @@ pub async fn run_secret_validation(
}
}
- let rule = find_rule_for_match(rules, rep_arc.2.rule_text_id).unwrap();
- let mut om = OwnedBlobMatch::convert_match_to_owned_blobmatch(&rep_arc.2, rule);
+ let mut om =
+ OwnedBlobMatch::convert_match_to_owned_blobmatch(&rep_arc.2, rep_arc.2.rule.clone());
validate_single(
&mut om,
@@ -211,7 +206,6 @@ pub async fn run_secret_validation(
let val_cache = Arc::new(DashMap::::new());
let in_flight = Arc::new(DashMap::::new());
- let rules_ref = Arc::new(all_rules.clone());
for chunk in blob_ids.chunks(chunk_size) {
let tasks: Vec<_> = chunk
@@ -225,15 +219,15 @@ pub async fn run_secret_validation(
let success = success_count.clone();
let fail = fail_count.clone();
let cache_glob = cache.clone();
- let rules = rules_ref.clone();
async move {
let owned = matches_for_blob
.iter()
.map(|arc_msg| {
- let rule = find_rule_for_match(&rules, arc_msg.2.rule_text_id)
- .expect("rule");
- OwnedBlobMatch::convert_match_to_owned_blobmatch(&arc_msg.2, rule)
+ OwnedBlobMatch::convert_match_to_owned_blobmatch(
+ &arc_msg.2,
+ arc_msg.2.rule.clone(),
+ )
})
.collect::<Vec<_>>();
@@ -338,21 +332,6 @@ pub async fn run_secret_validation(
Ok(())
}
-/// Returns `Some(Arc<Rule>)` if a matching rule is found; otherwise returns `None`.
-/// Callers can decide how to handle the `None` case (e.g., skip processing).
-fn find_rule_for_match(
- all_rules: &[Arc<rule::Rule>],
- rule_text_id: &str,
-) -> Option<Arc<rule::Rule>> {
- match all_rules.iter().find(|r| r.syntax().id == rule_text_id).cloned() {
- Some(rule) => Some(rule),
- None => {
- eprintln!("Warning: no rule found with id '{}'. Skipping.", rule_text_id);
- None
- }
- }
-}
-
// ---------------------------------------------------
// The core validation logic, used in an async pipeline
// ---------------------------------------------------
diff --git a/src/validation.rs b/src/validation.rs
index 3548d33..a645ce7 100644
--- a/src/validation.rs
+++ b/src/validation.rs
@@ -910,6 +910,7 @@ mod tests {
use crossbeam_skiplist::SkipMap;
use http::StatusCode;
use rustc_hash::FxHashMap;
+ use smallvec::smallvec;
use crate::{
blob::BlobId,
@@ -920,6 +921,7 @@ mod tests {
rule::{Confidence, Rule},
Rules,
},
+ util::intern,
validation::{validate_single_match, Cache},
};
#[tokio::test]
@@ -1016,12 +1018,12 @@ rules:
// matching_input: token.as_bytes().to_vec(),
matching_input_offset_span: OffsetSpan { start: 0, end: token.len() },
captures: SerializableCaptures {
- captures: vec![SerializableCapture {
+ captures: smallvec![SerializableCapture {
name: Some("TOKEN".to_string()),
match_number: -1,
start: 0,
end: token.len(),
- value: token.into(),
+ value: intern(token),
}],
},
validation_response_body: String::new(),
diff --git a/src/validation/utils.rs b/src/validation/utils.rs
index ee118c8..cb2068d 100644
--- a/src/validation/utils.rs
+++ b/src/validation/utils.rs
@@ -22,7 +22,7 @@ pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String,
.map(|cap| {
let name =
cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string());
- (name, cap.value.clone().into_owned(), cap.start, cap.end)
+ (name, cap.value.to_string(), cap.start, cap.end)
})
.collect()
}
@@ -75,17 +75,18 @@ pub async fn check_url_resolvable(url: &Url) -> Result<(), Box Match {
+ let syntax = RuleSyntax {
+ name: "Example Rule".to_string(),
+ id: "RULE.1".to_string(),
+ pattern: "dummy".to_string(),
+ min_entropy: 0.0,
+ confidence: Confidence::Medium,
+ visible: true,
+ examples: vec![],
+ negative_examples: vec![],
+ references: vec![],
+ validation: None,
+ depends_on_rule: vec![],
+ };
+ let rule = Arc::new(Rule::new(syntax));
Match {
location: Location {
offset_span: OffsetSpan { start: 0, end: 10 },
@@ -28,20 +44,17 @@ fn make_match(fp: u64) -> Match {
},
},
groups: SerializableCaptures {
- captures: vec![SerializableCapture {
+ captures: smallvec![SerializableCapture {
name: None,
match_number: 0,
start: 0,
end: 10,
- value: "dummy".into(),
+ value: intern("dummy"),
}],
},
blob_id: BlobId::new(b"dummy"),
finding_fingerprint: fp,
- rule_finding_fingerprint: "structural.1".into(),
- rule_text_id: "RULE.1".into(),
- rule_name: "Example Rule".into(),
- rule_confidence: Confidence::Medium,
+ rule,
validation_response_body: String::new(),
validation_response_status: 0,
validation_success: false,
From 52b2c02ee9731e1380b3ae9763c833fc6b878f1b Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Wed, 3 Sep 2025 09:52:49 -0700
Subject: [PATCH 03/12] Optimized memory usage via string interning and
extensive data sharing
---
Cargo.toml | 3 ++-
src/blob.rs | 52 +++++++++++++++++++++++++------------
src/location.rs | 3 +--
src/matcher.rs | 36 ++++++++-----------------
src/reporter/json_format.rs | 3 ---
src/scanner/enumerate.rs | 9 +++++--
src/scanner/processing.rs | 9 +++----
tests/fingerprint_dedup.rs | 2 --
8 files changed, 59 insertions(+), 58 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 4e774bf..ac3b6a6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -65,7 +65,7 @@ smallvec = { version = "1", features = [
tracing = "0.1.41"
indicatif = { version = "0.17", features = ["improved_unicode"] }
rayon = "1.10"
-sha1 = "0.10.6"
+sha1 = { version = "0.10.6", features = ["asm"] }
hex = "0.4.3"
vectorscan-rs = "0.0.5"
regex = "1.11.1"
@@ -151,6 +151,7 @@ tar = "0.4.44"
xz2 = "0.1.7"
asar = "0.3.0"
blake3 = "1.8.2"
+memchr = "2.7"
memmap2 = "0.9.7"
futures = "0.3.31"
dashmap = "6.1.0"
diff --git a/src/blob.rs b/src/blob.rs
index a54a4a8..9530882 100644
--- a/src/blob.rs
+++ b/src/blob.rs
@@ -10,16 +10,20 @@ use anyhow::Result;
use bstr::{BString, ByteSlice};
use gix::ObjectId;
use hex;
+use once_cell::sync::OnceCell;
use parking_lot::Mutex;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use sha1::{Digest, Sha1};
use smallvec::SmallVec;
+use std::sync::atomic::{AtomicU64, Ordering};
use crate::git_commit_metadata::CommitMetadata;
// const LARGE_FILE_THRESHOLD: u64 = 512 * 1024; // 512 KB
const LARGE_FILE_THRESHOLD: u64 = 0; // always mmap
+static NEXT_ID: AtomicU64 = AtomicU64::new(1);
+
/// The data of a blob, either owned (small files) or memory mapped (large files).
pub enum BlobData<'a> {
/// Small blobs – remains as-is.
@@ -75,47 +79,68 @@ pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
/// A Git blob, storing its SHA-1 id and its contents.
pub struct Blob<'a> {
- pub id: BlobId,
- pub data: BlobData<'a>,
+ id: OnceCell<BlobId>,
+ data: BlobData<'a>,
+ temp_id: u64,
}
impl Blob<'_> {
#[inline]
-
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
let mut file = File::open(&path)?;
let file_size = file.metadata()?.len();
+ let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
if file_size > LARGE_FILE_THRESHOLD {
// Large files: one mmap, zero extra copies.
let mmap = unsafe { memmap2::Mmap::map(&file)? };
- let id = BlobId::new(mmap.as_ref());
- Ok(Blob { id, data: BlobData::Mapped(mmap) })
+ Ok(Blob { id: OnceCell::new(), data: BlobData::Mapped(mmap), temp_id })
} else {
// Small files: reuse the same handle and pre-allocate exact capacity
let mut bytes = Vec::with_capacity(file_size as usize);
file.read_to_end(&mut bytes)?;
- let id = BlobId::new(&bytes);
- Ok(Blob { id, data: BlobData::Owned(bytes) })
+ Ok(Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id })
}
}
+
/// Returns the blob's bytes as a slice.
#[inline]
pub fn bytes(&self) -> &[u8] {
self.data.as_ref()
}
+ /// Lazily compute and return the blob's SHA-1 `BlobId`.
+ #[inline]
+ pub fn id(&self) -> BlobId {
+ *self.id.get_or_init(|| BlobId::new(self.bytes()))
+ }
+
+ /// Get a reference to the blob's SHA-1 `BlobId`, computing it if necessary.
+ #[inline]
+ pub fn id_ref(&self) -> &BlobId {
+ self.id.get_or_init(|| BlobId::new(self.bytes()))
+ }
+
+ /// Return the temporary identifier assigned on blob creation.
+ #[inline]
+ pub fn temp_id(&self) -> u64 {
+ self.temp_id
+ }
+
/// Create a new `Blob` from a vector of bytes.
#[inline]
pub fn from_bytes(bytes: Vec<u8>) -> Self {
- let id = BlobId::compute_from_bytes(&bytes);
- Blob { id, data: BlobData::Owned(bytes) }
+ let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+ Blob { id: OnceCell::new(), data: BlobData::Owned(bytes), temp_id }
}
/// Create a new `Blob` with the given id and data.
#[inline]
pub fn new(id: BlobId, bytes: Vec<u8>) -> Self {
- Blob { id, data: BlobData::Owned(bytes) }
+ let temp_id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+ let cell = OnceCell::new();
+ let _ = cell.set(id);
+ Blob { id: cell, data: BlobData::Owned(bytes), temp_id }
}
#[inline]
@@ -360,9 +385,6 @@ pub struct BlobMetadata {
/// The guessed multimedia type of the blob
pub mime_essence: Option<String>,
- /// The guessed charset of the blob
- pub charset: Option<String>,
-
/// The guessed programming language of the blob
pub language: Option<String>,
}
@@ -385,8 +407,4 @@ impl BlobMetadata {
self.mime_essence.as_deref()
}
- #[inline]
- pub fn charset(&self) -> Option<&str> {
- self.charset.as_deref()
- }
}
diff --git a/src/location.rs b/src/location.rs
index 4af7021..05a7586 100644
--- a/src/location.rs
+++ b/src/location.rs
@@ -94,8 +94,7 @@ pub struct LocationMapping {
impl LocationMapping {
/// Scan once for all `\n` positions.
pub fn new(input: &[u8]) -> Self {
- let newline_offsets =
- input.iter().enumerate().filter_map(|(i, &b)| (b == b'\n').then_some(i)).collect();
+ let newline_offsets = memchr::memchr_iter(b'\n', input).collect();
LocationMapping { newline_offsets }
}
diff --git a/src/matcher.rs b/src/matcher.rs
index 0ecc7ec..07f53d3 100644
--- a/src/matcher.rs
+++ b/src/matcher.rs
@@ -288,17 +288,6 @@ impl<'a> Matcher<'a> {
self.local_stats.blobs_scanned += 1;
self.local_stats.bytes_scanned += blob.bytes().len() as u64;
- // Check if blob was already seen and respect no_dedup flag
- if !no_dedup {
- if let Some(had_matches) = self.seen_blobs.get(&blob.id) {
- return Ok(if had_matches {
- ScanResult::SeenWithMatches
- } else {
- ScanResult::SeenSansMatches
- });
- }
- }
-
// Extract filename from origin
let filename = origin
.first()
@@ -316,16 +305,7 @@ impl<'a> Matcher<'a> {
let mut b64_items = if no_base64 { Vec::new() } else { get_base64_strings(blob.bytes()) };
if self.user_data.raw_matches_scratch.is_empty() && b64_items.is_empty() {
- // Only record in seen_blobs if deduplication is enabled
- if !no_dedup {
- return Ok(match self.seen_blobs.insert(blob.id, false) {
- None => ScanResult::New(Vec::new()),
- Some(true) => ScanResult::SeenWithMatches,
- Some(false) => ScanResult::SeenSansMatches,
- });
- } else {
- return Ok(ScanResult::New(Vec::new()));
- }
+ return Ok(ScanResult::New(Vec::new()));
}
let rules_db = self.rules_db;
@@ -472,9 +452,15 @@ impl<'a> Matcher<'a> {
}
}
// Finalize
- // Only record in seen_blobs if deduplication is enabled
- if !no_dedup {
- self.seen_blobs.insert(blob.id, !matches.is_empty());
+ if !no_dedup && !matches.is_empty() {
+ let blob_id = blob.id();
+ if let Some(had_matches) = self.seen_blobs.insert(blob_id, true) {
+ return Ok(if had_matches {
+ ScanResult::SeenWithMatches
+ } else {
+ ScanResult::SeenSansMatches
+ });
+ }
}
// --- opportunistic capacity cap ---------------------------------
@@ -571,7 +557,7 @@ fn filter_match<'b>(
SerializableCaptures::from_captures(&captures, byte_slice.as_ref(), re, redact);
matches.push(BlobMatch {
rule: Arc::clone(&rule),
- blob_id: &blob.id,
+ blob_id: blob.id_ref(),
matching_input: only_matching_input,
matching_input_offset_span,
captures: groups,
diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs
index 7cf90c8..f2b065e 100644
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@@ -185,7 +185,6 @@ mod tests {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
- charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
};
let dedup = true;
@@ -219,7 +218,6 @@ mod tests {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
- charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
},
m: mock_match,
@@ -256,7 +254,6 @@ mod tests {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
- charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
},
m: mock_match,
diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs
index 14861ff..970ca5c 100644
--- a/src/scanner/enumerate.rs
+++ b/src/scanner/enumerate.rs
@@ -172,10 +172,15 @@ pub fn enumerate_filesystem_inputs(
};
if should_skip {
progress.suspend(|| {
+ let path = origin
+ .first()
+ .blob_path()
+ .map(|p| p.display().to_string())
+ .unwrap_or_else(|| blob.temp_id().to_string());
if is_archive {
- debug!("Skipping archive: {}", blob.id);
+ debug!("Skipping archive: {path}");
} else {
- debug!("Skipping binary blob: {}", blob.id);
+ debug!("Skipping binary blob: {path}");
}
});
return Ok(());
diff --git a/src/scanner/processing.rs b/src/scanner/processing.rs
index 9447dbf..410233c 100644
--- a/src/scanner/processing.rs
+++ b/src/scanner/processing.rs
@@ -27,8 +27,7 @@ impl<'a> BlobProcessor<'a> {
redact: bool,
no_base64: bool,
) -> Result
+- [Kingfisher](#kingfisher)
+ - [Key Features](#key-features)
+- [Benchmark Results](#benchmark-results)
+- [Getting Started](#getting-started)
+ - [Installation](#installation)
+ - [Run Kingfisher in Docker](#run-kingfisher-in-docker)
+- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance)
+ - [Write Custom Rules!](#write-custom-rules)
+- [Usage](#usage)
+ - [Basic Examples](#basic-examples)
+ - [Scan with secret validation](#scan-with-secret-validation)
+ - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories)
+ - [Scan a Git repository without validation](#scan-a-git-repository-without-validation)
+ - [Display only secrets confirmed active by third‑party APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis)
+ - [Output JSON and capture to a file](#output-json-and-capture-to-a-file)
+ - [Output SARIF directly to disk](#output-sarif-directly-to-disk)
+ - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--)
+ - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size)
+ - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag)
+ - [Display rule performance statistics](#display-rule-performance-statistics)
+ - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files)
+ - [Exclude specific paths](#exclude-specific-paths)
+ - [Scan an S3 bucket](#scan-an-s3-bucket)
+ - [Scanning Docker Images](#scanning-docker-images)
+ - [Scanning GitHub](#scanning-github)
+ - [Scan GitHub organisation (requires `KF_GITHUB_TOKEN`)](#scan-github-organisation-requires-kf_github_token)
+ - [Scan remote GitHub repository](#scan-remote-github-repository)
+ - [Scanning GitLab](#scanning-gitlab)
+ - [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token)
+ - [Scan GitLab user](#scan-gitlab-user)
+ - [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url)
+ - [List GitLab repositories](#list-gitlab-repositories)
+ - [Scanning Jira](#scanning-jira)
+ - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query)
+ - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues)
+ - [Scanning Confluence](#scanning-confluence)
+ - [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query)
+ - [Scanning Slack](#scanning-slack)
+ - [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query)
+ - [Environment Variables for Tokens](#environment-variables-for-tokens)
+ - [Exit Codes](#exit-codes)
+ - [Update Checks](#update-checks)
+- [Advanced Options](#advanced-options)
+ - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets)
+ - [List Builtin Rules](#list-builtin-rules)
+ - [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run)
+ - [To add your rules alongside the built‑ins:](#to-add-your-rules-alongside-the-builtins)
+ - [Other Examples](#other-examples)
+ - [Notable Scan Options](#notable-scan-options)
+ - [Understanding `--confidence`](#understanding---confidence)
+ - [Ignore known false positives](#ignore-known-false-positives)
+ - [Finding Fingerprint](#finding-fingerprint)
+ - [Rule Performance Profiling](#rule-performance-profiling)
+ - [CLI Options](#cli-options)
+ - [Origins and Divergence](#origins-and-divergence)
+- [Roadmap](#roadmap)
+- [License](#license)
+
# Getting Started
## Installation
From b70df66efd52f5dbaa5e909045c7c2037b0eecc2 Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Fri, 5 Sep 2025 07:57:14 -0700
Subject: [PATCH 09/12] updated readme
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 69c9e0c..1da58dd 100644
--- a/README.md
+++ b/README.md
@@ -294,7 +294,7 @@ kingfisher scan /some/file --max-file-size 250
### Scan using a rule _family_ with one flag
-_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws._`)\*
+_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_
```bash
# Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2)
From 80aef7e6d75fef5a29b65d539239b9029da2a396 Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Fri, 5 Sep 2025 09:31:43 -0700
Subject: [PATCH 10/12] preparing for v1.48.0
---
.github/workflows/ci.yml | 2 -
src/matcher.rs | 3 +-
src/scanner/processing.rs | 4 +-
src/validation.rs.orig | 1052 -------------------------------------
4 files changed, 2 insertions(+), 1059 deletions(-)
delete mode 100644 src/validation.rs.orig
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 27e1f98..389b9c3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,8 +24,6 @@ jobs:
run: make ubuntu-arm64
- name: Run tests
run: make tests
- env:
- CARGO_BUILD_JOBS: 1
macos-arm64:
name: macOS arm64
diff --git a/src/matcher.rs b/src/matcher.rs
index 1211768..d30b0d8 100644
--- a/src/matcher.rs
+++ b/src/matcher.rs
@@ -925,13 +925,12 @@ pub fn get_base64_strings(input: &[u8]) -> Vec {
i += 1;
}
- let mut end = i;
let mut eq_count = 0;
while i < input.len() && input[i] == b'=' && eq_count < 2 {
i += 1;
eq_count += 1;
}
- end = i;
+ let end = i;
let len = end - start;
if len >= 32 && len % 4 == 0 {
diff --git a/src/scanner/processing.rs b/src/scanner/processing.rs
index b3535d7..5132209 100644
--- a/src/scanner/processing.rs
+++ b/src/scanner/processing.rs
@@ -110,7 +110,6 @@ impl<'a> BlobProcessor<'a> {
struct MetadataResult {
mime_essence: Option,
language: Option,
- charset: Option,
}
impl MetadataResult {
fn from_blob_and_origin(blob: &Blob, origin: &OriginSet) -> MetadataResult {
@@ -119,7 +118,6 @@ impl MetadataResult {
let mime_essence = Some(tree_magic_mini::from_u8(bytes).to_string());
let inspector = ContentInspector::default();
let language = blob_path.and_then(|p| inspector.guess_language(p, bytes));
- let charset = inspector.guess_charset(bytes);
- MetadataResult { mime_essence, language, charset }
+ MetadataResult { mime_essence, language }
}
}
diff --git a/src/validation.rs.orig b/src/validation.rs.orig
deleted file mode 100644
index ca76943..0000000
--- a/src/validation.rs.orig
+++ /dev/null
@@ -1,1052 +0,0 @@
-use std::{
- collections::BTreeMap,
- fs,
- hash::{Hash, Hasher},
- sync::Arc,
- time::{Duration, Instant},
-};
-
-use anyhow::Result;
-use crossbeam_skiplist::SkipMap;
-use dashmap::DashMap;
-use http::StatusCode;
-use liquid::Object;
-use liquid_core::{Value, ValueView};
-use once_cell::sync::OnceCell;
-use reqwest::{header, header::HeaderValue, multipart, Client, Url};
-use rustc_hash::FxHashMap;
-use tokio::{sync::Notify, time};
-use tracing::debug;
-
-use crate::{
- location::OffsetSpan,
- matcher::{OwnedBlobMatch, SerializableCaptures},
- rules::rule::Validation,
-};
-
-mod aws;
-mod azure;
-mod coinbase;
-mod gcp;
-mod httpvalidation;
-mod jwt;
-mod mongodb;
-mod postgres;
-mod utils;
-
-const VALIDATION_CACHE_SECONDS: u64 = 1200; // 20 minutes
-const MAX_VALIDATION_BODY_LEN: usize = 2048;
-
-// Use SkipMap-based cache instead of a mutex-wrapped FxHashMap.
-type Cache = Arc>;
-
-/// Returns an opaque 64-bit fingerprint for “same secret under the same rule”.
-fn secret_fingerprint(m: &OwnedBlobMatch) -> u64 {
- let mut hasher = xxhash_rust::xxh3::Xxh3::new();
- m.rule.syntax().id.hash(&mut hasher);
-
- // first capture = the secret text itself
- if let Some(c0) = m.captures.captures.get(0) {
- c0.value.hash(&mut hasher);
- }
- hasher.finish()
-}
-
-static VALIDATION_CACHE: OnceCell> = OnceCell::new();
-static IN_FLIGHT: OnceCell>> = OnceCell::new();
-
-/// Call this once near program start (e.g. in `main()`)
-pub fn init_validation_caches() {
- VALIDATION_CACHE.set(DashMap::new()).ok();
- IN_FLIGHT.set(DashMap::new()).ok();
-}
-
-#[derive(Clone)]
-pub struct CachedResponse {
- pub body: String,
- pub status: StatusCode,
- pub is_valid: bool,
- pub timestamp: Instant,
-}
-
-impl CachedResponse {
- pub fn new(body: String, status: StatusCode, is_valid: bool) -> Self {
- Self { body, status, is_valid, timestamp: Instant::now() }
- }
-
- pub fn is_still_valid(&self, cache_duration: Duration) -> bool {
- self.timestamp.elapsed() < cache_duration
- }
-}
-
-/// Collect dependent variables and missing dependencies from the provided matches.
-pub fn collect_variables_and_dependencies(
- matches: &[OwnedBlobMatch],
-) -> (FxHashMap>, FxHashMap>) {
- let mut variable_map: FxHashMap> = FxHashMap::default();
- let mut missing_deps: FxHashMap> = FxHashMap::default();
-
- for m in matches {
- let rule_id = m.rule.syntax().id.clone();
- for dependency in m.rule.syntax().depends_on_rule.iter().flatten() {
- let dependency_rule_id = &dependency.rule_id;
- // Use iterator adapter to get all matching dependencies.
- let matching_dependencies: Vec<_> =
- matches.iter().filter(|x| x.rule.syntax().id == *dependency_rule_id).collect();
-
- if !matching_dependencies.is_empty() {
- for other_match in matching_dependencies {
- let matching_input = other_match
- .captures
- .captures
- .get(1)
- .or_else(|| other_match.captures.captures.get(0))
- .expect("Expected at least one capture");
- variable_map
- .entry(dependency.variable.to_uppercase())
- .or_insert_with(Vec::new)
- .push((
- matching_input.value.to_string(),
- other_match.matching_input_offset_span,
- ));
- }
- } else {
- missing_deps.entry(rule_id.clone()).or_default().push(dependency.rule_id.clone());
- }
- }
- }
- (variable_map, missing_deps)
-}
-
-/// Render a template and parse the resulting string as a URL.
-async fn render_and_parse_url(
- parser: &liquid::Parser,
- globals: &liquid::Object,
- rule_name: &str,
- template_url: &str,
-) -> Result {
- let rendered_url_str =
- render_template(parser, globals, rule_name, template_url).await.map_err(|e| {
- let error_msg = format!("Error rendering URL template: <{}> {}", rule_name, e);
- debug!("{}", error_msg);
- error_msg
- })?;
-
- let url = Url::parse(&rendered_url_str).map_err(|e| {
- let error_msg = format!("Error parsing rendered URL: {}", e);
- debug!("{}", error_msg);
- error_msg
- })?;
-
- // Check if the URL is resolvable.
- utils::check_url_resolvable(&url).await.map_err(|e| {
- let error_msg = format!("URL resolution failed: {}", e);
- error_msg
- })?;
-
- Ok(url)
-}
-
-/// Render a template string using Liquid.
-async fn render_template(
- parser: &liquid::Parser,
- globals: &liquid::Object,
- rule_name: &str,
- template_str: &str,
-) -> Result {
- parser
- .parse(template_str)
- .map_err(|e| {
- let msg = format!("Error parsing template for rule <{}>: {}", rule_name, e);
- debug!("{}", msg);
- msg
- })
- .and_then(|template| {
- template.render(globals).map_err(|e| {
- let msg = format!("Error rendering template for rule <{}>: {}", rule_name, e);
- debug!("{}", msg);
- msg
- })
- })
-}
-
-/// Validate a single match with a timeout of 60 seconds.
-pub async fn validate_single_match(
- m: &mut OwnedBlobMatch,
- parser: &liquid::Parser,
- client: &Client,
- dependent_variables: &FxHashMap>,
- missing_dependencies: &FxHashMap>,
- cache: &Cache,
-) {
- let timeout_result = time::timeout(Duration::from_secs(60), async {
- timed_validate_single_match(
- m,
- parser,
- client,
- dependent_variables,
- missing_dependencies,
- cache,
- )
- .await
- })
- .await;
-
- if timeout_result.is_err() {
- m.validation_success = false;
- m.validation_response_body = "Validation timed out after 60 seconds".to_string();
- m.validation_response_status = StatusCode::REQUEST_TIMEOUT;
- }
-}
-
-/// Perform the actual validation of a match.
-/// Guarantees that each | is validated only once per process,
-/// even when `--no-dedup` is used.
-async fn timed_validate_single_match<'a>(
- m: &mut OwnedBlobMatch,
- parser: &liquid::Parser,
- client: &Client,
- dependent_variables: &FxHashMap>,
- missing_dependencies: &FxHashMap>,
- cache: &Cache,
-) {
- // ──────────────────────────────────────────────────────────
- // 1. process-wide fingerprint de-dup
- // ──────────────────────────────────────────────────────────
- let fp = secret_fingerprint(m);
-
- if let Some(entry) = VALIDATION_CACHE.get_or_init(DashMap::new).get(&fp) {
- if entry.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
- m.validation_success = entry.is_valid;
- m.validation_response_body = entry.body.clone();
- m.validation_response_status = entry.status;
- return;
- }
- }
- if let Some(wait) = IN_FLIGHT.get_or_init(DashMap::new).get(&fp) {
- wait.notified().await;
- if let Some(entry) = VALIDATION_CACHE.get().unwrap().get(&fp) {
- m.validation_success = entry.is_valid;
- m.validation_response_body = entry.body.clone();
- m.validation_response_status = entry.status;
- }
- return;
- }
- let notify = Arc::new(Notify::new());
- IN_FLIGHT.get().unwrap().insert(fp, notify.clone());
-
- // helper to persist result + notify waiters
- let commit_and_return = |m: &OwnedBlobMatch| {
- VALIDATION_CACHE.get().unwrap().insert(
- fp,
- CachedResponse {
- body: m.validation_response_body.clone(),
- status: m.validation_response_status,
- is_valid: m.validation_success,
- timestamp: Instant::now(),
- },
- );
- IN_FLIGHT.get().unwrap().remove(&fp);
- notify.notify_waiters();
- };
- // ──────────────────────────────────────────────────────────
-
- // 2. dependency check
- if let Some(missing) = missing_dependencies.get(&m.rule.syntax().id) {
- if !missing.is_empty() {
- m.validation_success = false;
- m.validation_response_body =
- format!("Validation skipped - missing dependent rules: {}", missing.join(", "));
- m.validation_response_status = StatusCode::PRECONDITION_REQUIRED;
- commit_and_return(m);
- return;
- }
- }
-
- // 3. capture processing
- let match_re_result = m.rule.syntax().as_anchored_regex();
- let mut captured_values: Vec<(String, String, usize, usize)> = match match_re_result {
- Ok(_) => utils::process_captures(&m.captures),
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = format!("Regex error: {}", e);
- m.validation_response_status = StatusCode::INTERNAL_SERVER_ERROR;
- commit_and_return(m);
- return;
- }
- };
-
- for dep in m.rule.syntax().depends_on_rule.iter().flatten() {
- if let Some(vals) = dependent_variables.get(&dep.variable.to_uppercase()) {
- for (val, span) in vals {
- captured_values.push((
- dep.variable.to_uppercase(),
- val.clone(),
- span.start,
- span.end,
- ));
- }
- }
- }
-
- let mut globals = Object::new();
- for (k, v, ..) in &captured_values {
- globals.insert(k.to_uppercase().into(), Value::scalar(v.clone()));
- }
-
- let rule_syntax = m.rule.syntax();
-
- // ──────────────────────────────────────────────────────────
- // 4. validator switch
- // ──────────────────────────────────────────────────────────
- match &rule_syntax.validation {
- // ---------------------------------------------------- HTTP validator
- Some(Validation::Http(http_validation)) => {
- // render URL
- let url = match render_and_parse_url(
- parser,
- &globals,
- &rule_syntax.name,
- &http_validation.request.url,
- )
- .await
- {
- Ok(u) => u,
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = e;
- m.validation_response_status = StatusCode::BAD_REQUEST;
- commit_and_return(m);
- return;
- }
- };
-
- // build request builder
- let request_builder = match httpvalidation::build_request_builder(
- client,
- &http_validation.request.method,
- &url,
- &http_validation.request.headers,
- &http_validation.request.body,
- parser,
- &globals,
- ) {
- Ok(rb) => rb,
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = e;
- m.validation_response_status = StatusCode::BAD_REQUEST;
- commit_and_return(m);
- return;
- }
- };
-
- let is_multipart = http_validation.request.multipart.is_some();
- let mut cache_key = String::new();
-
- // old per-request cache (optional)
- if !is_multipart {
- let rendered_headers = httpvalidation::process_headers(
- &http_validation.request.headers,
- parser,
- &globals,
- &url,
- )
- .unwrap_or_default();
-
- let mut header_map = BTreeMap::new();
- for (name, value) in rendered_headers.iter() {
- if let Ok(v) = value.to_str() {
- header_map.insert(name.as_str().to_string(), v.to_string());
- }
- }
- cache_key = httpvalidation::generate_http_cache_key_parts(
- http_validation.request.method.as_str(),
- &url,
- &header_map,
- );
- if let Some(cached) = cache.get(&cache_key) {
- let c = cached.value();
- if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
- m.validation_success = c.is_valid;
- m.validation_response_body = c.body.clone();
- m.validation_response_status = c.status;
- commit_and_return(m);
- return;
- }
- }
- }
-
- // helper to execute single non-multipart request with retry
- let exec_single = |builder: reqwest::RequestBuilder| async {
- httpvalidation::retry_request(
- builder,
- 1,
- Duration::from_millis(500),
- Duration::from_secs(2),
- )
- .await
- };
-
- // run request (multipart vs non-multipart)
- let resp_res = if is_multipart {
- // build multipart request each retry
- let build_request = || async {
- let method = httpvalidation::parse_http_method(&http_validation.request.method)
- .unwrap_or(reqwest::Method::GET);
-
- let mut fresh_builder =
- client.request(method, url.clone()).timeout(Duration::from_secs(5));
-
- if let Ok(mut headers) = httpvalidation::process_headers(
- &http_validation.request.headers,
- parser,
- &globals,
- &url,
- ) {
- // add realistic UA & accept headers
- let ua = format!(
- "{} {}/{}",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
- AppleWebKit/537.36 (KHTML, like Gecko) \
- Chrome/132.0.0.0 Safari/537.36",
- env!("CARGO_PKG_NAME"),
- env!("CARGO_PKG_VERSION")
- );
- let std_headers = [
- (header::USER_AGENT, ua.as_str()),
- (header::ACCEPT , "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
- (header::ACCEPT_LANGUAGE, "en-US,en;q=0.5"),
- (header::ACCEPT_ENCODING, "gzip, deflate, br"),
- (header::CONNECTION, "keep-alive"),
- ];
- for (hn, hv) in &std_headers {
- if let Ok(v) = HeaderValue::from_str(hv) {
- headers.insert(hn.clone(), v);
- }
- }
- fresh_builder = fresh_builder.headers(headers);
- }
-
- // build multipart form
- let mut form = multipart::Form::new();
- for part in http_validation.request.multipart.as_ref().unwrap().parts.iter() {
- match part.part_type.as_str() {
- "file" => {
- let path = render_template(
- parser,
- &globals,
- &rule_syntax.name,
- &part.content,
- )
- .await
- .unwrap_or_default();
- let bytes = fs::read(path).unwrap_or_default();
- let p = multipart::Part::bytes(bytes)
- .mime_str(
- part.content_type
- .as_deref()
- .unwrap_or("application/octet-stream"),
- )
- .unwrap_or_else(|_| multipart::Part::text("invalid"));
- form = form.part(part.name.clone(), p);
- }
- "text" => {
- let txt = render_template(
- parser,
- &globals,
- &rule_syntax.name,
- &part.content,
- )
- .await
- .unwrap_or_default();
- let p = multipart::Part::text(txt)
- .mime_str(part.content_type.as_deref().unwrap_or("text/plain"))
- .unwrap_or_else(|_| multipart::Part::text("invalid"));
- form = form.part(part.name.clone(), p);
- }
- _ => { /* ignore */ }
- }
- }
- fresh_builder.multipart(form)
- };
-
- httpvalidation::retry_multipart_request(
- build_request,
- 1,
- Duration::from_millis(500),
- Duration::from_secs(2),
- )
- .await
- } else {
- exec_single(request_builder).await
- };
-
- // handle result
- match resp_res {
- Ok(resp) => {
- let status = resp.status();
- let headers = resp.headers().clone();
- let mut body = match resp.text().await {
- Ok(b) => b,
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = format!("Error reading response: {}", e);
- m.validation_response_status = StatusCode::BAD_GATEWAY;
- commit_and_return(m);
- return;
- }
- };
- if body.len() > MAX_VALIDATION_BODY_LEN {
- body.truncate(MAX_VALIDATION_BODY_LEN);
- }
-
- m.validation_response_status = status;
- m.validation_response_body = body.clone();
- let matchers = http_validation
- .request
- .response_matcher
- .as_ref()
- .expect("missing response_matcher");
-
- m.validation_success = httpvalidation::validate_response(
- matchers,
- &body,
- &status,
- &headers,
- http_validation.request.response_is_html,
- );
-
- if !is_multipart && !cache_key.is_empty() {
- cache.insert(
- cache_key,
- CachedResponse {
- body,
- status,
- is_valid: m.validation_success,
- timestamp: Instant::now(),
- },
- );
- }
- }
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = format!("HTTP error: {:?}", e);
- m.validation_response_status = StatusCode::BAD_GATEWAY;
- }
- }
- }
-
- // ---------------------------------------------------- MongoDB validator
- Some(Validation::MongoDB) => {
- let uri = globals
- .get("TOKEN")
- .and_then(|v| v.as_scalar())
- .map(|s| s.into_owned().to_kstr().to_string())
- .unwrap_or_default();
-
- if uri.is_empty() {
- m.validation_success = false;
- m.validation_response_body = "MongoDB URI not found.".to_string();
- m.validation_response_status = StatusCode::BAD_REQUEST;
- commit_and_return(m);
- return;
- }
-
- let cache_key = mongodb::generate_mongodb_cache_key(&uri);
- if let Some(cached) = cache.get(&cache_key) {
- let c = cached.value();
- if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
- m.validation_success = c.is_valid;
- m.validation_response_body = c.body.clone();
- m.validation_response_status = c.status;
- commit_and_return(m);
- return;
- }
- }
-
- match mongodb::validate_mongodb(&uri).await {
- Ok((ok, msg)) => {
- m.validation_success = ok;
- m.validation_response_body = msg;
- m.validation_response_status =
- if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
- }
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = format!("MongoDB validation error: {}", e);
- m.validation_response_status = StatusCode::BAD_GATEWAY;
- }
- }
- }
-
- // ------------------------------------------------ Azure Storage validator
- Some(Validation::AzureStorage) => {
- let storage_key = captured_values
- .iter()
- .find(|(n, ..)| n == "TOKEN")
- .map(|(_, v, ..)| v.clone())
- .unwrap_or_default();
- let storage_account =
- utils::find_closest_variable(&captured_values, &storage_key, "TOKEN", "AZURENAME")
- .unwrap_or_default();
-
- if storage_account.is_empty() || storage_key.is_empty() {
- m.validation_success = false;
- m.validation_response_body = "Missing Azure Storage account or key.".to_string();
- m.validation_response_status = StatusCode::BAD_REQUEST;
- commit_and_return(m);
- return;
- }
-
- let creds_json = format!(
- r#"{{"storage_account":"{}","storage_key":"{}"}}"#,
- storage_account, storage_key
- );
- let cache_key = azure::generate_azure_cache_key(&creds_json);
-
- if let Some(cached) = cache.get(&cache_key) {
- let c = cached.value();
- if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
- m.validation_success = c.is_valid;
- m.validation_response_body = c.body.clone();
- m.validation_response_status = c.status;
- commit_and_return(m);
- return;
- }
- }
-
- match azure::validate_azure_storage_credentials(&creds_json, cache).await {
- Ok((ok, msg)) => {
- m.validation_success = ok;
- m.validation_response_body = msg;
- m.validation_response_status =
- if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
- }
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = format!("Azure Storage error: {}", e);
- m.validation_response_status = StatusCode::BAD_GATEWAY;
- }
- }
- cache.insert(
- cache_key,
- CachedResponse {
- body: m.validation_response_body.clone(),
- status: m.validation_response_status,
- is_valid: m.validation_success,
- timestamp: Instant::now(),
- },
- );
- }
-
- // ------------------------------------------------ Postgres validator
- Some(Validation::Postgres) => {
- let pg_url = globals
- .get("TOKEN")
- .and_then(|v| v.as_scalar())
- .map(|s| s.into_owned().to_kstr().to_string())
- .unwrap_or_default();
-
- if pg_url.is_empty() {
- m.validation_success = false;
- m.validation_response_body = "Postgres URL not found.".to_string();
- m.validation_response_status = StatusCode::BAD_REQUEST;
- commit_and_return(m);
- return;
- }
-
- let cache_key = postgres::generate_postgres_cache_key(&pg_url);
- if let Some(cached) = cache.get(&cache_key) {
- let c = cached.value();
- if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
- m.validation_success = c.is_valid;
- m.validation_response_body = c.body.clone();
- m.validation_response_status = c.status;
- commit_and_return(m);
- return;
- }
- }
-
- match postgres::validate_postgres(&pg_url).await {
- Ok((ok, meta)) => {
- m.validation_success = ok;
- m.validation_response_body = if ok {
- format!("Postgres connection is valid. Metadata: {:?}", meta)
- } else {
- "Postgres connection failed.".to_string()
- };
- m.validation_response_status =
- if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
- }
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = format!("Postgres error: {}", e);
- m.validation_response_status = StatusCode::BAD_GATEWAY;
- }
- }
- cache.insert(
- cache_key,
- CachedResponse {
- body: m.validation_response_body.clone(),
- status: m.validation_response_status,
- is_valid: m.validation_success,
- timestamp: Instant::now(),
- },
- );
- }
- // ---------------------------------------------------- JWT validator
- Some(Validation::JWT) => {
- let token = captured_values
- .iter()
- .find(|(n, ..)| n == "TOKEN")
- .map(|(_, v, ..)| v.clone())
- .unwrap_or_default();
-
- if token.is_empty() {
- m.validation_success = false;
- m.validation_response_body = "JWT token not found.".to_string();
- m.validation_response_status = StatusCode::BAD_REQUEST;
- commit_and_return(m);
- return;
- }
-
- match jwt::validate_jwt(&token).await {
- Ok((ok, msg)) => {
- m.validation_success = ok;
- m.validation_response_body = msg;
- m.validation_response_status =
- if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
- }
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = format!("JWT validation error: {}", e);
- m.validation_response_status = StatusCode::BAD_REQUEST;
- }
- }
- }
- // ---------------------------------------------------- AWS validator
- Some(Validation::AWS) => {
- let secret = captured_values
- .iter()
- .find(|(n, ..)| n == "TOKEN")
- .map(|(_, v, ..)| v.clone())
- .unwrap_or_default();
- let akid = utils::find_closest_variable(&captured_values, &secret, "TOKEN", "AKID")
- .unwrap_or_default();
-
- if akid.is_empty() || secret.is_empty() {
- m.validation_success = false;
- m.validation_response_body = "Missing AWS access-key ID or secret.".to_string();
- m.validation_response_status = StatusCode::BAD_REQUEST;
- commit_and_return(m);
- return;
- }
-
- let cache_key = aws::generate_aws_cache_key(&akid, &secret);
- if let Some(cached) = cache.get(&cache_key) {
- let c = cached.value();
- if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
- m.validation_success = c.is_valid;
- m.validation_response_body = c.body.clone();
- m.validation_response_status = c.status;
- commit_and_return(m);
- return;
- }
- }
-
- if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
- m.validation_success = false;
- m.validation_response_body = format!("Invalid AWS credentials ({}): {}", akid, e);
- m.validation_response_status = StatusCode::BAD_REQUEST;
- commit_and_return(m);
- return;
- }
-
- match aws::validate_aws_credentials(&akid, &secret, cache).await {
- Ok((ok, arn)) => {
- m.validation_success = ok;
- m.validation_response_body = format!("{} --- ARN: {}", akid, arn);
- m.validation_response_status =
- if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
- if let Ok(acct) = aws::aws_key_to_account_number(&akid) {
- m.validation_response_body
- .push_str(&format!(" --- AWS Account Number: {:012}", acct));
- }
- }
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = format!("AWS validation error ({}): {}", akid, e);
- m.validation_response_status = StatusCode::BAD_GATEWAY;
- }
- }
- cache.insert(
- cache_key,
- CachedResponse {
- body: m.validation_response_body.clone(),
- status: m.validation_response_status,
- is_valid: m.validation_success,
- timestamp: Instant::now(),
- },
- );
- }
-
- // ----------------------------------------------------- GCP validator
- Some(Validation::GCP) => {
- let gcp_json = globals
- .get("TOKEN")
- .and_then(|v| v.as_scalar())
- .map(|s| s.into_owned().to_kstr().to_string())
- .unwrap_or_default();
-
- if gcp_json.is_empty() {
- m.validation_success = false;
- m.validation_response_body = "GCP JSON not found.".to_string();
- m.validation_response_status = StatusCode::BAD_REQUEST;
- commit_and_return(m);
- return;
- }
-
- let cache_key = gcp::generate_gcp_cache_key(&gcp_json);
- if let Some(cached) = cache.get(&cache_key) {
- let c = cached.value();
- if c.timestamp.elapsed() < Duration::from_secs(VALIDATION_CACHE_SECONDS) {
- m.validation_success = c.is_valid;
- m.validation_response_body = c.body.clone();
- m.validation_response_status = c.status;
- commit_and_return(m);
- return;
- }
- }
-
- match gcp::GcpValidator::global() {
- Ok(validator) => {
- match validator.validate_gcp_credentials(&gcp_json.as_bytes()).await {
- Ok((ok, meta)) => {
- m.validation_success = ok;
- m.validation_response_body = meta.join("\n");
- m.validation_response_status =
- if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
- }
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = format!("GCP validation error: {}", e);
- m.validation_response_status = StatusCode::BAD_GATEWAY;
- }
- }
- }
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = format!("Failed to create GCP validator: {}", e);
- m.validation_response_status = StatusCode::INTERNAL_SERVER_ERROR;
- }
- }
- cache.insert(
- cache_key,
- CachedResponse {
- body: m.validation_response_body.clone(),
- status: m.validation_response_status,
- is_valid: m.validation_success,
- timestamp: Instant::now(),
- },
- );
- }
- // ----------------------------------------------------- Coinbase validator
- Some(Validation::Coinbase) => {
- let cred_name = globals
- .get("CRED_NAME")
- .and_then(|v| v.as_scalar())
- .map(|s| s.into_owned().to_kstr().to_string())
- .unwrap_or_default();
- let private_key = globals
- .get("PRIVATE_KEY")
- .and_then(|v| v.as_scalar())
- .map(|s| s.into_owned().to_kstr().to_string())
- .unwrap_or_default();
-
- if cred_name.is_empty() || private_key.is_empty() {
- m.validation_success = false;
- m.validation_response_body = "Missing key name or private key.".to_string();
- m.validation_response_status = StatusCode::BAD_REQUEST;
- commit_and_return(m);
- return;
- }
-
- match coinbase::validate_cdp_api_key(&cred_name, &private_key, client, parser, cache)
- .await
- {
- Ok((ok, msg)) => {
- m.validation_success = ok;
- m.validation_response_body = msg;
- m.validation_response_status =
- if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
- }
- Err(e) => {
- m.validation_success = false;
- m.validation_response_body = format!("Coinbase validation error: {}", e);
- m.validation_response_status = StatusCode::BAD_GATEWAY;
- }
- }
- }
- // --------------------------------------------------------- Raw / none
- Some(Validation::Raw(raw)) => {
- debug!("Raw validation not implemented: {}", raw);
- m.validation_success = false;
- m.validation_response_body = "Validator not implemented".to_string();
- m.validation_response_status = StatusCode::NOT_IMPLEMENTED;
- }
- None => { /* no validation specified */ }
- }
-
- // 5. persist result for success path
- commit_and_return(m);
-}
-
-#[cfg(test)]
-mod tests {
- use std::sync::Arc;
-
- use anyhow::Result;
- use crossbeam_skiplist::SkipMap;
- use http::StatusCode;
- use rustc_hash::FxHashMap;
-
- use crate::{
- blob::BlobId,
- liquid_filters::register_all,
- location::OffsetSpan,
- matcher::{OwnedBlobMatch, SerializableCapture, SerializableCaptures},
- rules::{
- rule::{Confidence, Rule},
- Rules,
- },
- validation::{validate_single_match, Cache},
- };
- #[tokio::test]
- async fn test_actual_pypi_token_validation() -> Result<()> {
- // Minimal PyPI YAML snippet for testing
- let pypi_yaml = r#"
-rules:
- - name: PyPI Upload Token
- id: kingfisher.pypi.1
- pattern: |
- (?x)
- \b
- (
- pypi-AgEIcHlwaS5vcmc[a-zA-Z0-9_-]{50,}
- )
- (?:[^a-zA-Z0-9_-]|$)
- min_entropy: 4.0
- confidence: medium
- examples:
- - '# password = pypi-AgEIcHlwaS5vcmcCJDkwNzYwNzU1LWMwOTUtNGNkOC1iYjQzLTU3OWNhZjI1NDQ1MwACJXsicGVybWCf99lvbnMiOiAidXNlciIsICJ2ZXJzaW9uIjogMX0AAAYgSpW5PAywXvchMUQnkF5H6-SolJysfUvIWopMsxE4hCM'
- - 'password: pypi-AgEIcHlwaS5vcmcCJGExMDIxZjRhLTFhZDMtNDc4YS1iOWNmLWQwCf99OTIwZjFjNwACSHsicGVybWlzc2lvbnMiOiB7InByb2plY3RzIjogWyJkamFuZ28tY2hhbm5lbHMtanNvbnJwYyJdfSwgInZlcnNpb24iOiAxfQAABiBZg48cIBQt7HckwM4G3q-462xphsLbm7IZvjqMS4jvQw'
- validation:
- type: Http
- content:
- request:
- method: POST
- url: https://upload.pypi.org/legacy/
- response_is_html: true
- response_matcher:
- - report_response: true
- - type: WordMatch
- words:
- - "isn't allowed to upload to project"
- headers:
- Authorization: 'Basic {{ "__token__:" | append: TOKEN | b64enc }}'
- multipart:
- parts:
- - name: name
- type: text
- content: "my-package"
- - name: version
- type: text
- content: "0.0.1"
- - name: filetype
- type: text
- content: "sdist"
- - name: metadata_version
- type: text
- content: "2.1"
- - name: summary
- type: text
- content: "A simple example package"
- - name: home_page
- type: text
- content: "https://github.com/yourusername/my_package"
- - name: sha256_digest
- type: text
- content: "0447379dd46c4ca8b8992bda56d07b358d015efb9300e6e16f224f4536e71d64"
- - name: md5_digest
- type: text
- content: "9b4036ab91a71124ab9f1d32a518e2bb"
- - name: :action
- type: text
- content: "file_upload"
- - name: protocol_version
- type: text
- content: "1"
- - name: content
- type: file
- content: "path/to/my_package-0.0.1.tar.gz"
- content_type: "application/octet-stream"
- "#;
- // Use from_paths_and_contents to parse the YAML snippet into a Rules object
- let data = vec![(std::path::Path::new("pypi_test.yaml"), pypi_yaml.as_bytes())];
- let rules = Rules::from_paths_and_contents(data, Confidence::Low)?;
- // Find the PyPI rule we just loaded
- let pypi_rule_syntax = rules
- .iter_rules()
- .find(|r| r.id == "kingfisher.pypi.1")
- .expect("Failed to find PyPI rule in test YAML")
- .clone(); // Clone so we can create a `Rule` from it
- // Wrap that into a `Rule` object
- let pypi_rule = Rule::new(pypi_rule_syntax);
- //////////////////////////////////////////
- //
- // Your actual PyPI token to test
- let token = "";
- let id = BlobId::new(&pypi_yaml.as_bytes());
- // Construct an `OwnedBlobMatch` (all fields needed):
- let mut owned_blob_match = OwnedBlobMatch {
- rule: pypi_rule.into(),
- blob_id: id,
- finding_fingerprint: 0, // dummy value
- // matching_input: token.as_bytes().to_vec(),
- matching_input_offset_span: OffsetSpan { start: 0, end: token.len() },
- captures: SerializableCaptures {
- captures: vec![SerializableCapture {
- name: Some("TOKEN".to_string()),
- match_number: -1,
- start: 0,
- end: token.len(),
- value: token.into(),
- }],
- },
- validation_response_body: String::new(),
- validation_response_status: StatusCode::OK,
- validation_success: false,
- calculated_entropy: 0.0, // or compute your own
- };
- let parser = register_all(liquid::ParserBuilder::with_stdlib()).build()?;
- let client = reqwest::Client::new();
- let cache: Cache = Arc::new(SkipMap::new());
- let dependent_vars = FxHashMap::default();
- let missing_deps = FxHashMap::default();
- // Run the validation
- validate_single_match(
- &mut owned_blob_match,
- &parser,
- &client,
- &dependent_vars,
- &missing_deps,
- &cache,
- )
- .await;
- println!("Success? {:?}", owned_blob_match.validation_success);
- println!("Status: {:?}", owned_blob_match.validation_response_status);
- println!("Body: {:?}", owned_blob_match.validation_response_body);
- Ok(())
- }
-}
From ba12a5b2beda9ac286f24a5d1fb4fdccd7205e58 Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Fri, 5 Sep 2025 09:31:52 -0700
Subject: [PATCH 11/12] preparing for v1.48.0
---
src/decompress.rs | 7 ++-----
src/reporter/json_format.rs | 7 +++----
src/scanner/docker.rs | 2 +-
src/scanner/validation.rs | 16 ++++++----------
src/validation/utils.rs | 10 ++--------
tests/fingerprint_dedup.rs | 2 +-
6 files changed, 15 insertions(+), 29 deletions(-)
diff --git a/src/decompress.rs b/src/decompress.rs
index 09f327f..46ae5d5 100644
--- a/src/decompress.rs
+++ b/src/decompress.rs
@@ -161,10 +161,7 @@ fn handle_zip_archive_streaming(
Ok(CompressedContent::ArchiveFiles(entries_on_disk))
}
-fn handle_asar_archive_in_memory(
- buffer: &[u8],
- archive_path: &Path,
-) -> Result {
+fn handle_asar_archive_in_memory(buffer: &[u8], archive_path: &Path) -> Result {
match AsarReader::new(buffer, None) {
Ok(reader) => {
let mut contents = Vec::new();
@@ -525,4 +522,4 @@ mod tests {
Ok(())
}
-}
\ No newline at end of file
+}
diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs
index f2b065e..1ee7fb7 100644
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@@ -35,7 +35,7 @@ mod tests {
use crate::cli::commands::github::GitHistoryMode;
use crate::cli::commands::rules::RuleSpecifierArgs;
use crate::matcher::{SerializableCapture, SerializableCaptures};
- use crate::rules::rule::{Rule, RuleSyntax, Confidence};
+ use crate::rules::rule::{Confidence, Rule, RuleSyntax};
use crate::util::intern;
use crate::{
blob::BlobId,
@@ -50,12 +50,12 @@ mod tests {
origin::Origin,
reporter::styles::Styles,
};
+ use smallvec::smallvec;
use std::{
io::Cursor,
path::PathBuf,
sync::{Arc, Mutex},
};
- use smallvec::smallvec;
use url::Url;
fn create_default_args() -> cli::commands::scan::ScanArgs {
use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope
@@ -243,8 +243,7 @@ mod tests {
fn test_validation_status_in_json() -> Result<()> {
let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")];
for (validation_success, expected_status) in test_cases {
- let mock_match =
- create_mock_match("MockRule", "mock_rule_1", validation_success);
+ let mock_match = create_mock_match("MockRule", "mock_rule_1", validation_success);
let matches = vec![ReportMatch {
origin: OriginSet::new(
Origin::from_file(PathBuf::from("/mock/path/file.rs")),
diff --git a/src/scanner/docker.rs b/src/scanner/docker.rs
index 405a553..7ebb2d4 100644
--- a/src/scanner/docker.rs
+++ b/src/scanner/docker.rs
@@ -282,4 +282,4 @@ mod tests {
fn docker_struct_new() {
let _ = Docker::new();
}
-}
\ No newline at end of file
+}
diff --git a/src/scanner/validation.rs b/src/scanner/validation.rs
index f01872a..d69dbeb 100644
--- a/src/scanner/validation.rs
+++ b/src/scanner/validation.rs
@@ -53,10 +53,7 @@ pub async fn run_secret_validation(
let mut simple_matches = Vec::new();
let mut dependent_blobs = FxHashMap::default(); // blob_id -- Vec>
for (blob_id, matches) in all_matches_by_blob {
- if matches
- .iter()
- .any(|m| !m.2.rule.syntax().depends_on_rule.is_empty())
- {
+ if matches.iter().any(|m| !m.2.rule.syntax().depends_on_rule.is_empty()) {
dependent_blobs.insert(blob_id, matches);
} else {
simple_matches.extend(matches);
@@ -77,10 +74,7 @@ pub async fn run_secret_validation(
.get(1)
.or_else(|| arc_msg.2.groups.captures.get(0))
.map_or("", |c| c.value);
- groups
- .entry(format!("{}|{}", arc_msg.2.rule.id(), secret))
- .or_default()
- .push(arc_msg);
+ groups.entry(format!("{}|{}", arc_msg.2.rule.id(), secret)).or_default().push(arc_msg);
}
let validation_results = DashMap::::new();
@@ -133,8 +127,10 @@ pub async fn run_secret_validation(
}
}
- let mut om =
- OwnedBlobMatch::convert_match_to_owned_blobmatch(&rep_arc.2, rep_arc.2.rule.clone());
+ let mut om = OwnedBlobMatch::convert_match_to_owned_blobmatch(
+ &rep_arc.2,
+ rep_arc.2.rule.clone(),
+ );
validate_single(
&mut om,
diff --git a/src/validation/utils.rs b/src/validation/utils.rs
index cb2068d..ea0b020 100644
--- a/src/validation/utils.rs
+++ b/src/validation/utils.rs
@@ -75,8 +75,8 @@ pub async fn check_url_resolvable(url: &Url) -> Result<(), Box Match {
From 82caff531fee2869a55e881202efb8dbe8846579 Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Fri, 5 Sep 2025 10:20:41 -0700
Subject: [PATCH 12/12] preparing for v1.48.0
---
.github/workflows/ci.yml | 2 ++
1 file changed, 2 insertions(+)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 389b9c3..27e1f98 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,6 +24,8 @@ jobs:
run: make ubuntu-arm64
- name: Run tests
run: make tests
+ env:
+ CARGO_BUILD_JOBS: 1
macos-arm64:
name: macOS arm64