kingfisher/tests/fingerprint_dedup.rs
Mick Grove 078fa16e6a - Reduced per-match memory usage by compacting stored source locations and interning repeated capture names.
- Stored optional validation response bodies as boxed strings to avoid allocating empty payloads and to streamline validator caches.
- Parallelized git cloning based on the configured job count and begin scanning repositories as soon as each clone finishes to reduce end-to-end scan times.
- Combined per-repository results into a single aggregate summary after scans complete.
- Added initial access-map support and report viewer html file. Currently beta features.
2025-12-04 22:02:30 -08:00

214 lines
6.8 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! tests/fingerprint_dedup.rs -- tests for fingerprint-based match deduplication in the details reporter.
use std::{
path::PathBuf,
sync::{Arc, Mutex},
};
use anyhow::Result;
use gix::{date, ObjectId};
use kingfisher::{
blob::{BlobId, BlobMetadata},
findings_store::FindingsStore,
git_commit_metadata::CommitMetadata,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{Match, SerializableCapture, SerializableCaptures},
origin::{Origin, OriginSet},
reporter::{styles::Styles, DetailsReporter, ReportMatch},
rules::rule::{Confidence, Rule, RuleSyntax},
util::intern,
};
use smallvec::smallvec;
// ---- helpers -------------------------------------------------------------------------------
fn make_match(fp: u64, rule_id: &str) -> Match {
let syntax = RuleSyntax {
name: "Example Rule".to_string(),
id: rule_id.to_string(),
pattern: "dummy".to_string(),
min_entropy: 0.0,
confidence: Confidence::Medium,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
depends_on_rule: vec![],
pattern_requirements: None,
};
let rule = Arc::new(Rule::new(syntax));
Match {
location: Location::with_source_span(
OffsetSpan { start: 0, end: 10 },
Some(SourceSpan {
start: SourcePoint { line: 1, column: 0 },
end: SourcePoint { line: 1, column: 10 },
}),
),
groups: SerializableCaptures {
captures: smallvec![SerializableCapture {
name: None,
match_number: 0,
start: 0,
end: 10,
value: intern("dummy"),
}],
},
blob_id: BlobId::new(b"dummy"),
finding_fingerprint: fp,
rule,
validation_response_body: None,
validation_response_status: 0,
validation_success: false,
calculated_entropy: 0.0,
visible: true,
is_base64: false,
}
}
/// Return a dummy commit object whose types match the current struct.
///
/// `commit_id` is parsed as a hexadecimal Git object id; the committer name,
/// e-mail and timestamp are fixed placeholder values.
///
/// # Panics
///
/// Panics if `commit_id` cannot be parsed as a hex object id, or if the fixed
/// timestamp literal fails to parse.
fn dummy_commit(commit_id: &str) -> CommitMetadata {
    // Parse the supplied hex string into a Git object id.
    let oid = ObjectId::from_hex(commit_id.as_bytes())
        .expect("commit_id must be a valid 40-character hex string");
    // A zero-epoch timestamp is fine for tests.
    let ts = date::parse("1970-01-01 00:00:00 +0000", None)
        .expect("fixed epoch timestamp literal must parse");
    CommitMetadata {
        commit_id: oid,
        committer_name: "tester".into(),
        committer_email: "tester@example.com".into(),
        committer_timestamp: ts,
    }
}
/// Create a single-element Git [`OriginSet`] whose only varying input is the commit id.
///
/// The repository path (`/tmp/repo`) and blob path (`dummy.txt`) are fixed.
fn git_origin(commit_id: &str) -> OriginSet {
    // Most fields are irrelevant for this test; we just need a commit carrying `commit_id`.
    let first_commit = Arc::new(dummy_commit(commit_id));
    let repo_path = Arc::new(PathBuf::from("/tmp/repo"));
    let blob_path = "dummy.txt".to_owned();

    let origin = Origin::from_git_repo_with_first_commit(repo_path, first_commit, blob_path);
    OriginSet::single(origin)
}
// ---- the actual test -----------------------------------------------------------------------
/// Two matches that share a fingerprint and rule id but come from different
/// commits must be reported as a single finding when deduplication is enabled.
#[test]
fn reporter_deduplicates_across_git_commits() -> Result<()> {
    // Same fingerprint and same rule id for both matches.
    let first = make_match(0xBADC0FFE, "RULE.1");
    let second = make_match(0xBADC0FFE, "RULE.1");

    // The origins differ only in their commit id.
    let commit_a = git_origin("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    let commit_b = git_origin("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb");

    // The reporter is driven directly with a freshly created, empty store.
    let store = FindingsStore::new(PathBuf::from("/tmp"));
    let reporter = DetailsReporter {
        datastore: Arc::new(Mutex::new(store)),
        styles: Styles::new(false),
        only_valid: false,
    };

    // Wraps a match and its origin in a report entry with fixed placeholder metadata.
    let report_for = |origin, m| ReportMatch {
        origin,
        blob_metadata: BlobMetadata {
            id: BlobId::new(b"dummy"),
            num_bytes: 10,
            mime_essence: None,
            language: None,
        },
        m,
        comment: None,
        match_confidence: Confidence::Medium,
        visible: true,
        validation_response_body: None,
        validation_response_status: 0,
        validation_success: false,
    };
    let matches = vec![report_for(commit_a, first), report_for(commit_b, second)];

    // Deduplication is enabled (no_dedup = false), so the pair must collapse into one entry.
    let no_dedup = false;
    let deduped = reporter.deduplicate_matches(matches, no_dedup);

    assert_eq!(deduped.len(), 1, "identical findings across commits must be merged");
    Ok(())
}
/// Two matches that share a fingerprint and an origin but belong to different
/// rules must both survive deduplication.
#[test]
fn dedup_preserves_distinct_rules_with_same_fingerprint() -> Result<()> {
    // One fingerprint, two different rule ids.
    let shared_fp = 0xDEADC0DE;
    let openai_match = make_match(shared_fp, "RULE.OPENAI");
    let deepseek_match = make_match(shared_fp, "RULE.DEEPSEEK");

    // Both matches are attributed to the same commit.
    let origin = git_origin("cccccccccccccccccccccccccccccccccccccccc");

    let store = FindingsStore::new(PathBuf::from("/tmp"));
    let reporter = DetailsReporter {
        datastore: Arc::new(Mutex::new(store)),
        styles: Styles::new(false),
        only_valid: false,
    };

    // Wraps a match and its origin in a report entry with fixed placeholder metadata.
    let report_for = |origin, m| ReportMatch {
        origin,
        blob_metadata: BlobMetadata {
            id: BlobId::new(b"dummy"),
            num_bytes: 10,
            mime_essence: None,
            language: None,
        },
        m,
        comment: None,
        match_confidence: Confidence::Medium,
        visible: true,
        validation_response_body: None,
        validation_response_status: 0,
        validation_success: false,
    };
    let matches = vec![
        report_for(origin.clone(), openai_match),
        report_for(origin, deepseek_match),
    ];

    // Deduplication is enabled (no_dedup = false), yet both entries must remain.
    let no_dedup = false;
    let deduped = reporter.deduplicate_matches(matches, no_dedup);

    assert_eq!(deduped.len(), 2, "matches from distinct rules must not be deduplicated");
    Ok(())
}