2025-06-24 17:17:16 -07:00
|
|
|
|
//! tests/dedup_git.rs
|
|
|
|
|
|
use std::{
|
|
|
|
|
|
path::PathBuf,
|
|
|
|
|
|
sync::{Arc, Mutex},
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
use anyhow::Result;
|
|
|
|
|
|
use gix::{date, ObjectId};
|
|
|
|
|
|
use kingfisher::{
|
|
|
|
|
|
blob::{BlobId, BlobMetadata},
|
|
|
|
|
|
findings_store::FindingsStore,
|
|
|
|
|
|
git_commit_metadata::CommitMetadata,
|
|
|
|
|
|
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
|
|
|
|
|
|
matcher::{Match, SerializableCapture, SerializableCaptures},
|
|
|
|
|
|
origin::{Origin, OriginSet},
|
|
|
|
|
|
reporter::{styles::Styles, DetailsReporter, ReportMatch},
|
2025-09-02 19:54:44 -07:00
|
|
|
|
rules::rule::{Confidence, Rule, RuleSyntax},
|
|
|
|
|
|
util::intern,
|
2025-06-24 17:17:16 -07:00
|
|
|
|
};
|
2025-09-05 09:31:52 -07:00
|
|
|
|
use smallvec::smallvec;
|
2025-06-24 17:17:16 -07:00
|
|
|
|
// ---- helpers -------------------------------------------------------------------------------
|
|
|
|
|
|
|
2025-11-23 11:32:27 -08:00
|
|
|
|
fn make_match(fp: u64, rule_id: &str) -> Match {
|
2025-09-02 19:54:44 -07:00
|
|
|
|
let syntax = RuleSyntax {
|
|
|
|
|
|
name: "Example Rule".to_string(),
|
2025-11-23 11:32:27 -08:00
|
|
|
|
id: rule_id.to_string(),
|
2025-09-02 19:54:44 -07:00
|
|
|
|
pattern: "dummy".to_string(),
|
|
|
|
|
|
min_entropy: 0.0,
|
|
|
|
|
|
confidence: Confidence::Medium,
|
|
|
|
|
|
visible: true,
|
|
|
|
|
|
examples: vec![],
|
|
|
|
|
|
negative_examples: vec![],
|
|
|
|
|
|
references: vec![],
|
|
|
|
|
|
validation: None,
|
2026-01-29 12:45:32 -08:00
|
|
|
|
revocation: None,
|
2025-09-02 19:54:44 -07:00
|
|
|
|
depends_on_rule: vec![],
|
2025-11-04 13:55:31 -05:00
|
|
|
|
pattern_requirements: None,
|
2026-02-02 23:22:08 -08:00
|
|
|
|
tls_mode: None,
|
2025-09-02 19:54:44 -07:00
|
|
|
|
};
|
|
|
|
|
|
let rule = Arc::new(Rule::new(syntax));
|
2025-06-24 17:17:16 -07:00
|
|
|
|
Match {
|
2025-12-04 22:02:30 -08:00
|
|
|
|
location: Location::with_source_span(
|
|
|
|
|
|
OffsetSpan { start: 0, end: 10 },
|
|
|
|
|
|
Some(SourceSpan {
|
2025-06-24 17:17:16 -07:00
|
|
|
|
start: SourcePoint { line: 1, column: 0 },
|
|
|
|
|
|
end: SourcePoint { line: 1, column: 10 },
|
2025-12-04 22:02:30 -08:00
|
|
|
|
}),
|
|
|
|
|
|
),
|
2025-06-24 17:17:16 -07:00
|
|
|
|
groups: SerializableCaptures {
|
2025-09-02 19:54:44 -07:00
|
|
|
|
captures: smallvec![SerializableCapture {
|
2025-06-24 17:17:16 -07:00
|
|
|
|
name: None,
|
|
|
|
|
|
match_number: 0,
|
|
|
|
|
|
start: 0,
|
|
|
|
|
|
end: 10,
|
2025-09-02 19:54:44 -07:00
|
|
|
|
value: intern("dummy"),
|
2025-06-24 17:17:16 -07:00
|
|
|
|
}],
|
|
|
|
|
|
},
|
|
|
|
|
|
blob_id: BlobId::new(b"dummy"),
|
|
|
|
|
|
finding_fingerprint: fp,
|
2025-09-02 19:54:44 -07:00
|
|
|
|
rule,
|
2025-12-04 22:02:30 -08:00
|
|
|
|
validation_response_body: None,
|
2025-06-24 17:17:16 -07:00
|
|
|
|
validation_response_status: 0,
|
|
|
|
|
|
validation_success: false,
|
|
|
|
|
|
calculated_entropy: 0.0,
|
|
|
|
|
|
visible: true,
|
2025-08-30 16:44:55 -07:00
|
|
|
|
is_base64: false,
|
2026-02-02 23:22:08 -08:00
|
|
|
|
dependent_captures: std::collections::BTreeMap::new(),
|
2025-06-24 17:17:16 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Return a dummy commit object whose types match the current struct.
|
|
|
|
|
|
fn dummy_commit(commit_id: &str) -> CommitMetadata {
|
|
|
|
|
|
// Parse the supplied hex string into a Git object‑id.
|
|
|
|
|
|
let oid = ObjectId::from_hex(commit_id.as_bytes())
|
|
|
|
|
|
.expect("commit_id must be a valid 40‑character hex string");
|
|
|
|
|
|
|
|
|
|
|
|
// A zero‑epoch timestamp is fine for tests.
|
|
|
|
|
|
let ts = date::parse("1970-01-01 00:00:00 +0000", None).unwrap();
|
|
|
|
|
|
|
|
|
|
|
|
CommitMetadata {
|
|
|
|
|
|
commit_id: oid,
|
2025-08-16 07:33:36 -07:00
|
|
|
|
committer_name: "tester".into(),
|
2026-01-01 22:24:57 -08:00
|
|
|
|
committer_email: "tester@exmple.com".into(),
|
2025-06-24 17:17:16 -07:00
|
|
|
|
committer_timestamp: ts,
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Create a Git origin whose only difference is the commit‐id.
|
|
|
|
|
|
fn git_origin(commit_id: &str) -> OriginSet {
|
|
|
|
|
|
// Most fields are irrelevant for this test – we just need a publicly visible commit_id.
|
|
|
|
|
|
let md = dummy_commit(commit_id);
|
|
|
|
|
|
|
|
|
|
|
|
OriginSet::single(Origin::from_git_repo_with_first_commit(
|
|
|
|
|
|
Arc::new(PathBuf::from("/tmp/repo")),
|
|
|
|
|
|
Arc::new(md),
|
2025-08-16 07:33:36 -07:00
|
|
|
|
String::from("dummy.txt"),
|
2025-06-24 17:17:16 -07:00
|
|
|
|
))
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- the actual test -----------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
/// Two matches with the same fingerprint and rule id, reported from two
/// different commits, must collapse into a single entry after deduplication.
#[test]
fn reporter_deduplicates_across_git_commits() -> Result<()> {
    // Identical fingerprint and rule id for both matches.
    const FINGERPRINT: u64 = 0xBADC0FFE;
    let first = make_match(FINGERPRINT, "RULE.1");
    let second = make_match(FINGERPRINT, "RULE.1");

    // The origins differ only in commit id -- the case the old dedup logic
    // *failed* to merge.
    let origin_a = git_origin("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    let origin_b = git_origin("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb");

    // Nothing is recorded in the findings store; the matches are handed
    // straight to the reporter's dedup routine below.
    let datastore = Arc::new(Mutex::new(FindingsStore::new(PathBuf::from("/tmp"))));
    let styles = Styles::new(false);
    let reporter = DetailsReporter { datastore, styles, only_valid: false, audit_context: None };

    // Both report entries share everything except their origin and match.
    let report = |origin: OriginSet, m: Match| ReportMatch {
        origin,
        blob_metadata: BlobMetadata {
            id: BlobId::new(b"dummy"),
            num_bytes: 10,
            mime_essence: None,
            language: None,
        },
        m,
        comment: None,
        match_confidence: Confidence::Medium,
        visible: true,
        validation_response_body: None,
        validation_response_status: 0,
        validation_success: false,
    };
    let matches = vec![report(origin_a, first), report(origin_b, second)];

    // Deduplication is enabled (no_dedup = false), so the pair must be merged.
    let no_dedup = false;
    let deduped = reporter.deduplicate_matches(matches, no_dedup);

    // Regression check: the old code left 2 entries here, the fixed code leaves 1.
    assert_eq!(deduped.len(), 1, "identical findings across commits must be merged");

    Ok(())
}
|
2025-11-23 11:32:27 -08:00
|
|
|
|
|
|
|
|
|
|
/// Two matches that share a fingerprint and an origin but come from different
/// rules must both survive deduplication.
#[test]
fn dedup_preserves_distinct_rules_with_same_fingerprint() -> Result<()> {
    // Same fingerprint, different rule ids.
    const SHARED_FP: u64 = 0xDEADC0DE;
    let openai_match = make_match(SHARED_FP, "RULE.OPENAI");
    let deepseek_match = make_match(SHARED_FP, "RULE.DEEPSEEK");

    // A single origin is used for both entries, so only the rule id differs.
    let origin = git_origin("cccccccccccccccccccccccccccccccccccccccc");

    let datastore = Arc::new(Mutex::new(FindingsStore::new(PathBuf::from("/tmp"))));
    let styles = Styles::new(false);
    let reporter = DetailsReporter { datastore, styles, only_valid: false, audit_context: None };

    // Both report entries share everything except the match they carry.
    let report = |origin: OriginSet, m: Match| ReportMatch {
        origin,
        blob_metadata: BlobMetadata {
            id: BlobId::new(b"dummy"),
            num_bytes: 10,
            mime_essence: None,
            language: None,
        },
        m,
        comment: None,
        match_confidence: Confidence::Medium,
        visible: true,
        validation_response_body: None,
        validation_response_status: 0,
        validation_success: false,
    };
    let matches = vec![report(origin.clone(), openai_match), report(origin, deepseek_match)];

    // Deduplication is enabled (no_dedup = false); both entries must remain.
    let no_dedup = false;
    let deduped = reporter.deduplicate_matches(matches, no_dedup);

    assert_eq!(deduped.len(), 2, "matches from distinct rules must not be deduplicated");

    Ok(())
}
|