kingfisher/tests/fingerprint_dedup.rs

219 lines
7 KiB
Rust
Raw Normal View History

2025-06-24 17:17:16 -07:00
//! tests/fingerprint_dedup.rs
use std::{
path::PathBuf,
sync::{Arc, Mutex},
};
use anyhow::Result;
use gix::{date, ObjectId};
use kingfisher::{
blob::{BlobId, BlobMetadata},
findings_store::FindingsStore,
git_commit_metadata::CommitMetadata,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{Match, SerializableCapture, SerializableCaptures},
origin::{Origin, OriginSet},
reporter::{styles::Styles, DetailsReporter, ReportMatch},
rules::rule::{Confidence, Rule, RuleSyntax},
util::intern,
2025-06-24 17:17:16 -07:00
};
2025-09-05 09:31:52 -07:00
use smallvec::smallvec;
2025-06-24 17:17:16 -07:00
// ---- helpers -------------------------------------------------------------------------------
fn make_match(fp: u64, rule_id: &str) -> Match {
let syntax = RuleSyntax {
name: "Example Rule".to_string(),
id: rule_id.to_string(),
pattern: "dummy".to_string(),
min_entropy: 0.0,
confidence: Confidence::Medium,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
revocation: None,
depends_on_rule: vec![],
pattern_requirements: None,
2026-02-02 23:22:08 -08:00
tls_mode: None,
};
let rule = Arc::new(Rule::new(syntax));
2025-06-24 17:17:16 -07:00
Match {
location: Location::with_source_span(
OffsetSpan { start: 0, end: 10 },
Some(SourceSpan {
2025-06-24 17:17:16 -07:00
start: SourcePoint { line: 1, column: 0 },
end: SourcePoint { line: 1, column: 10 },
}),
),
2025-06-24 17:17:16 -07:00
groups: SerializableCaptures {
captures: smallvec![SerializableCapture {
2025-06-24 17:17:16 -07:00
name: None,
match_number: 0,
start: 0,
end: 10,
value: intern("dummy"),
2025-06-24 17:17:16 -07:00
}],
},
blob_id: BlobId::new(b"dummy"),
finding_fingerprint: fp,
rule,
validation_response_body: None,
2025-06-24 17:17:16 -07:00
validation_response_status: 0,
validation_success: false,
calculated_entropy: 0.0,
visible: true,
is_base64: false,
2026-02-02 23:22:08 -08:00
dependent_captures: std::collections::BTreeMap::new(),
2025-06-24 17:17:16 -07:00
}
}
/// Build placeholder `CommitMetadata` for the commit identified by `commit_id`.
///
/// # Panics
///
/// Panics if `commit_id` cannot be parsed as a hex object id (the tests pass 40-character
/// hex strings), or if the hard-coded timestamp fails to parse.
fn dummy_commit(commit_id: &str) -> CommitMetadata {
    // Turn the caller's hex string into a Git object id; shadowing keeps the field name handy.
    let commit_id = ObjectId::from_hex(commit_id.as_bytes())
        .expect("commit_id must be a valid 40character hex string");

    // The Unix epoch is a perfectly good committer timestamp for these tests.
    let committer_timestamp = date::parse("1970-01-01 00:00:00 +0000", None).unwrap();

    CommitMetadata {
        commit_id,
        committer_name: "tester".into(),
        committer_email: "tester@exmple.com".into(),
        committer_timestamp,
    }
}
/// Create a Git origin whose only difference is the commitid.
fn git_origin(commit_id: &str) -> OriginSet {
// Most fields are irrelevant for this test we just need a publicly visible commit_id.
let md = dummy_commit(commit_id);
OriginSet::single(Origin::from_git_repo_with_first_commit(
Arc::new(PathBuf::from("/tmp/repo")),
Arc::new(md),
String::from("dummy.txt"),
2025-06-24 17:17:16 -07:00
))
}
// ---- the actual test -----------------------------------------------------------------------
/// Two matches with the same rule and fingerprint, found in different commits, must collapse
/// into a single reported finding when deduplication is enabled.
#[test]
fn reporter_deduplicates_across_git_commits() -> Result<()> {
    // Same fingerprint and same rule id for both matches.
    let fingerprint: u64 = 0xBADC0FFE;
    let first = make_match(fingerprint, "RULE.1");
    let second = make_match(fingerprint, "RULE.1");

    // The origins differ only in commit id -- the case the old dedup logic failed to merge.
    let origin_a = git_origin("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    let origin_b = git_origin("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb");

    // The reporter gets a freshly created store; the matches below are handed to
    // `deduplicate_matches` directly rather than being recorded in that store first.
    let store = FindingsStore::new(PathBuf::from("/tmp"));
    let reporter = DetailsReporter {
        datastore: Arc::new(Mutex::new(store)),
        styles: Styles::new(false),
        only_valid: false,
        audit_context: None,
    };

    // Wrap each (origin, match) pair in an otherwise identical `ReportMatch`.
    let matches: Vec<ReportMatch> = vec![(origin_a, first), (origin_b, second)]
        .into_iter()
        .map(|(origin, m)| ReportMatch {
            origin,
            blob_metadata: BlobMetadata {
                id: BlobId::new(b"dummy"),
                num_bytes: 10,
                mime_essence: None,
                language: None,
            },
            m,
            comment: None,
            match_confidence: Confidence::Medium,
            visible: true,
            validation_response_body: None,
            validation_response_status: 0,
            validation_success: false,
        })
        .collect();

    // `no_dedup = false` asks for real deduplication: one entry must remain, not two.
    let deduped = reporter.deduplicate_matches(matches, /* no_dedup= */ false);
    assert_eq!(deduped.len(), 1, "identical findings across commits must be merged");
    Ok(())
}
/// Matches that share a fingerprint but come from different rules must both survive
/// deduplication, even when they share the same origin.
#[test]
fn dedup_preserves_distinct_rules_with_same_fingerprint() -> Result<()> {
    // One fingerprint, two different rule ids.
    let fingerprint = 0xDEADC0DE;
    let openai = make_match(fingerprint, "RULE.OPENAI");
    let deepseek = make_match(fingerprint, "RULE.DEEPSEEK");

    // Both matches are attributed to the very same origin.
    let origin = git_origin("cccccccccccccccccccccccccccccccccccccccc");

    let store = FindingsStore::new(PathBuf::from("/tmp"));
    let reporter = DetailsReporter {
        datastore: Arc::new(Mutex::new(store)),
        styles: Styles::new(false),
        only_valid: false,
        audit_context: None,
    };

    // Wrap each (origin, match) pair in an otherwise identical `ReportMatch`.
    let matches: Vec<ReportMatch> = vec![(origin.clone(), openai), (origin, deepseek)]
        .into_iter()
        .map(|(origin, m)| ReportMatch {
            origin,
            blob_metadata: BlobMetadata {
                id: BlobId::new(b"dummy"),
                num_bytes: 10,
                mime_essence: None,
                language: None,
            },
            m,
            comment: None,
            match_confidence: Confidence::Medium,
            visible: true,
            validation_response_body: None,
            validation_response_status: 0,
            validation_success: false,
        })
        .collect();

    // Deduplication is enabled, yet the differing rule ids must keep both entries.
    let deduped = reporter.deduplicate_matches(matches, /* no_dedup= */ false);
    assert_eq!(deduped.len(), 2, "matches from distinct rules must not be deduplicated");
    Ok(())
}