kingfisher/src/baseline.rs
2026-04-24 00:14:56 -07:00

451 lines
16 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use std::{
collections::HashSet,
fs,
path::{Path, PathBuf},
sync::Arc,
};
use anyhow::{Context, Result};
use chrono::Local;
use serde::{Deserialize, Serialize};
use smallvec::{SmallVec, smallvec};
use tracing::debug;
type FingerprintForms = SmallVec<[u64; 2]>;
use crate::findings_store::FindingsStore;
/// Top-level document stored in the baseline YAML file.
///
/// The `default` attribute lets a document that lacks the `ExactFindings`
/// key deserialize into an empty baseline.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct BaselineFile {
/// Findings suppressed by exact fingerprint; stored under the
/// `ExactFindings` key.
#[serde(rename = "ExactFindings", default)]
pub exact_findings: ExactFindings,
}
/// Wrapper around the list of baseline entries.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct ExactFindings {
/// Individual baseline entries; an absent `matches` key deserializes to
/// an empty list.
#[serde(default)]
pub matches: Vec<BaselineFinding>,
}
/// One suppressed finding as stored in the baseline file.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct BaselineFinding {
/// Path of the file the finding was reported in. `apply_baseline` writes
/// it normalized against the scan roots, using `/` as the separator.
pub filepath: String,
/// Finding fingerprint as text. `apply_baseline` writes the decimal form;
/// `parse_fingerprint` additionally accepts hex forms when reading.
pub fingerprint: String,
/// Line number recorded for the finding; `apply_baseline` stores the start
/// line of the match's resolved source span.
pub linenum: usize,
/// Timestamp text; `apply_baseline` stores the local time formatted as
/// RFC 2822. Nothing in this file parses the value back.
pub lastupdated: String,
}
/// Reads the baseline file at `path` and deserializes it from YAML.
///
/// # Errors
///
/// Fails with the context `read baseline file` when the file cannot be read,
/// and with `parse baseline yaml` when its contents do not deserialize into a
/// [`BaselineFile`].
pub fn load_baseline(path: &Path) -> Result<BaselineFile> {
    let yaml = fs::read_to_string(path).context("read baseline file")?;
    serde_yaml::from_str(&yaml).context("parse baseline yaml")
}
/// Parse a baseline fingerprint string into its canonical u64 form(s).
///
/// Accepts either the decimal form users see in scan output (JSON/pretty/SARIF)
/// or the 16-char zero-padded hex form previously written by `--manage-baseline`.
/// Returns 02 canonical u64 interpretations: ambiguous 16-digit all-digit
/// strings — which could be either a decimal fingerprint or a legacy hex
/// fingerprint whose value happens to contain no `a-f` — yield both so either
/// form matches.
///
/// Detection:
/// 1. A `0x`/`0X` prefix is stripped and the rest parsed as hex.
/// 2. Exactly 16 hex chars containing at least one `a-f`/`A-F` letter are parsed as hex
/// (unambiguous legacy canonical form).
/// 3. Exactly 16 digits: ambiguous — try both decimal and hex and return whichever
/// interpretations parse successfully, so old baselines keep matching.
/// 4. Otherwise the string is parsed as decimal u64.
fn parse_fingerprint(s: &str) -> FingerprintForms {
let trimmed = s.trim();
if let Some(rest) = trimmed.strip_prefix("0x").or_else(|| trimmed.strip_prefix("0X")) {
return match u64::from_str_radix(rest, 16) {
Ok(v) => smallvec![v],
Err(_) => SmallVec::new(),
};
}
if trimmed.len() == 16 && trimmed.chars().all(|c| c.is_ascii_hexdigit()) {
if trimmed.chars().any(|c| c.is_ascii_alphabetic()) {
return match u64::from_str_radix(trimmed, 16) {
Ok(v) => smallvec![v],
Err(_) => SmallVec::new(),
};
}
let mut out: FingerprintForms = SmallVec::new();
if let Ok(v) = trimmed.parse::<u64>() {
out.push(v);
}
if let Ok(v) = u64::from_str_radix(trimmed, 16) {
if !out.contains(&v) {
out.push(v);
}
}
return out;
}
match trimmed.parse::<u64>() {
Ok(v) => smallvec![v],
Err(_) => SmallVec::new(),
}
}
/// Serializes `baseline` to YAML and writes it to `path`, replacing any
/// existing file contents.
///
/// # Errors
///
/// Fails with the context `serialize baseline` when YAML serialization fails,
/// and with `write baseline file` when the file cannot be written.
pub fn save_baseline(path: &Path, baseline: &BaselineFile) -> Result<()> {
    serde_yaml::to_string(baseline)
        .context("serialize baseline")
        .and_then(|yaml| fs::write(path, yaml).context("write baseline file"))
}
/// Renders `p` as a `/`-separated string, relative to the first matching scan root.
///
/// `roots` are tried in order. For the first root that is a prefix of `p` and
/// has a final path component, the result is that component followed by the
/// remainder of `p`; for example root `/work/repo` and path
/// `/work/repo/src/a.rs` give `repo/src/a.rs`. When `p` is the root itself
/// (a single file used as a scan target) the result is just the root's name,
/// with no trailing separator. If no root qualifies, `p` is rendered unchanged.
///
/// Non-UTF-8 path data is converted lossily, and every backslash is replaced
/// with `/` so the output looks the same across platforms.
fn normalize_path(p: &Path, roots: &[PathBuf]) -> String {
    let relative = roots.iter().find_map(|root| {
        let remainder = p.strip_prefix(root).ok()?;
        // Roots such as `/` or `.` have no final component; try the next root.
        let name = root.file_name()?;
        // Joining an empty remainder would append a trailing separator
        // ("secret.txt/"), so a path equal to its root maps to the bare name.
        Some(if remainder.as_os_str().is_empty() {
            PathBuf::from(name)
        } else {
            Path::new(name).join(remainder)
        })
    });
    relative.as_deref().unwrap_or(p).to_string_lossy().replace('\\', "/")
}
/// Hides findings that are already recorded in the baseline and, when asked,
/// brings the baseline file up to date.
///
/// The baseline at `baseline_path` is loaded if the file exists; otherwise an
/// empty baseline is used. Every match in `store` whose `finding_fingerprint`
/// equals any parsed reading of a baseline fingerprint gets `visible = false`.
/// Baseline fingerprints that cannot be parsed are ignored for matching.
///
/// When `manage` is `true` the baseline is also rewritten:
/// - a match that has a file path and an unknown fingerprint is appended as a
///   new entry (path normalized against `roots`, fingerprint in decimal). That
///   match stays visible, but later matches with the same fingerprint in the
///   same call count as known and are hidden;
/// - existing entries none of whose fingerprint readings were encountered in
///   `store` are dropped, which includes entries that fail to parse;
/// - the file is saved only if an entry was dropped or added.
///
/// # Errors
///
/// Returns an error when an existing baseline file cannot be loaded or the
/// updated baseline cannot be saved.
pub fn apply_baseline(
    store: &mut FindingsStore,
    baseline_path: &Path,
    manage: bool,
    roots: &[PathBuf],
) -> Result<()> {
    let mut baseline = if baseline_path.exists() {
        load_baseline(baseline_path)?
    } else {
        BaselineFile::default()
    };

    // Every reading of every parseable baseline fingerprint counts as known.
    let mut known: HashSet<u64> = HashSet::new();
    for entry in &baseline.exact_findings.matches {
        let forms = parse_fingerprint(&entry.fingerprint);
        if forms.is_empty() {
            debug!("Ignoring unparseable baseline fingerprint {:?}", entry.fingerprint);
        } else {
            known.extend(forms);
        }
    }

    let mut encountered: HashSet<u64> = HashSet::new();
    let mut additions = Vec::new();
    for shared in store.get_matches_mut() {
        let (origin, _blob, finding) = Arc::make_mut(shared);
        let source_path = origin.iter().find_map(|o| o.full_path());
        let fingerprint = finding.finding_fingerprint;

        if known.contains(&fingerprint) {
            // Already baselined: hide it, with or without a file path.
            if let Some(fp) = source_path {
                let normalized = normalize_path(&fp, roots);
                debug!("Skipping {} due to baseline (fingerprint {})", normalized, fingerprint);
            }
            finding.visible = false;
            if manage {
                encountered.insert(fingerprint);
            }
            continue;
        }

        if !manage {
            continue;
        }

        // New finding while managing: only findings with a file path are recorded.
        if let Some(fp) = source_path {
            // Marking it known makes later duplicates in this run take the branch above.
            known.insert(fingerprint);
            encountered.insert(fingerprint);
            additions.push(BaselineFinding {
                filepath: normalize_path(&fp, roots),
                fingerprint: fingerprint.to_string(),
                linenum: finding.location.resolved_source_span().start.line,
                lastupdated: Local::now().to_rfc2822(),
            });
        }
    }

    if manage {
        let entries = &mut baseline.exact_findings.matches;
        let before = entries.len();
        // Keep an entry when any reading of its fingerprint was seen in this scan.
        entries.retain(|entry| {
            parse_fingerprint(&entry.fingerprint).iter().any(|form| encountered.contains(form))
        });
        let pruned = entries.len() != before;
        let added = !additions.is_empty();
        entries.extend(additions);
        if pruned || added {
            save_baseline(baseline_path, &baseline)?;
        }
    }
    Ok(())
}
// Unit tests for fingerprint parsing and for applying/managing a baseline
// against a single-match findings store backed by a temporary directory.
#[cfg(test)]
mod tests {
use super::*;
use crate::{
blob::{BlobId, BlobMetadata},
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{Match, SerializableCapture, SerializableCaptures},
origin::{Origin, OriginSet},
rules::rule::{Confidence, Rule, RuleSyntax},
};
use anyhow::Result;
use smallvec::SmallVec;
use std::{path::Path, sync::Arc};
use tempfile::TempDir;
// Builds a minimal low-confidence rule (`test.rule`) with no examples,
// validation, or dependencies; it only exists so a `Match` can reference a rule.
fn test_rule() -> Arc<Rule> {
Arc::new(Rule::new(RuleSyntax {
name: "test".to_string(),
id: "test.rule".to_string(),
pattern: "test".to_string(),
min_entropy: 0.0,
confidence: Confidence::Low,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
revocation: None,
depends_on_rule: vec![],
pattern_requirements: None,
tls_mode: None,
}))
}
// Returns a capture set with no captures.
fn empty_captures() -> SerializableCaptures {
SerializableCaptures { captures: SmallVec::<[SerializableCapture; 2]>::new() }
}
// Creates a store holding exactly one visible match with the given
// fingerprint, reported at line 1 and originating from `file_path`.
fn make_store_with_match(fingerprint: u64, file_path: &Path) -> FindingsStore {
let mut store = FindingsStore::new(PathBuf::from("."));
let rule = test_rule();
let match_item = Match {
location: Location::with_source_span(
OffsetSpan { start: 0, end: 1 },
Some(SourceSpan {
start: SourcePoint { line: 1, column: 0 },
end: SourcePoint { line: 1, column: 1 },
}),
),
groups: empty_captures(),
blob_id: BlobId::default(),
finding_fingerprint: fingerprint,
rule: Arc::clone(&rule),
validation_response_body: None,
validation_response_status: 0,
validation_success: false,
calculated_entropy: 0.0,
visible: true,
is_base64: false,
dependent_captures: std::collections::BTreeMap::new(),
};
let origin = OriginSet::from(Origin::from_file(file_path.to_path_buf()));
let blob_meta = Arc::new(BlobMetadata {
id: BlobId::default(),
num_bytes: 0,
mime_essence: None,
language: None,
});
let entry = Arc::new((Arc::new(origin), blob_meta, match_item));
store.get_matches_mut().push(entry);
store
}
// Computes the path string the baseline is expected to record: the root's
// final component joined with `file` relative to `root`, using `/` separators.
// Panics if `root` has no final component.
fn expected_relative_path(root: &Path, file: &Path) -> String {
let mut expected = PathBuf::from(root.file_name().unwrap());
if let Ok(stripped) = file.strip_prefix(root) {
expected = expected.join(stripped);
}
expected.to_string_lossy().replace('\\', "/")
}
// Managing a not-yet-existing baseline records the finding (decimal
// fingerprint, normalized path) and leaves it visible; a later run without
// `manage` hides the same finding.
#[test]
fn apply_baseline_filters_existing_fingerprints() -> Result<()> {
let tmp = TempDir::new()?;
let roots = [tmp.path().to_path_buf()];
let secret_file = tmp.path().join("secret.txt");
fs::write(&secret_file, "dummy")?;
let baseline_path = tmp.path().join("baseline.yaml");
let fingerprint = 0x1234_u64;
let mut store = make_store_with_match(fingerprint, &secret_file);
apply_baseline(&mut store, &baseline_path, true, &roots)?;
let baseline = load_baseline(&baseline_path)?;
assert_eq!(baseline.exact_findings.matches.len(), 1);
let entry = &baseline.exact_findings.matches[0];
assert_eq!(entry.fingerprint, fingerprint.to_string());
assert_eq!(entry.filepath, expected_relative_path(roots[0].as_path(), &secret_file));
let (_, _, recorded) = store.get_matches()[0].as_ref();
assert!(recorded.visible);
let mut follow_up = make_store_with_match(fingerprint, &secret_file);
apply_baseline(&mut follow_up, &baseline_path, false, &roots)?;
let (_, _, filtered) = follow_up.get_matches()[0].as_ref();
assert!(!filtered.visible);
Ok(())
}
// A second managing run over the same finding must leave the baseline file
// contents unchanged and hide the finding.
#[test]
fn managing_baseline_is_idempotent() -> Result<()> {
let tmp = TempDir::new()?;
let roots = [tmp.path().to_path_buf()];
let secret_file = tmp.path().join("secret.txt");
fs::write(&secret_file, "dummy")?;
let baseline_path = tmp.path().join("baseline.yaml");
let fingerprint = 0xfeed_beef_dade_f00d_u64;
let mut initial = make_store_with_match(fingerprint, &secret_file);
apply_baseline(&mut initial, &baseline_path, true, &roots)?;
let baseline_before = fs::read_to_string(&baseline_path)?;
let mut rerun = make_store_with_match(fingerprint, &secret_file);
apply_baseline(&mut rerun, &baseline_path, true, &roots)?;
let baseline_after = fs::read_to_string(&baseline_path)?;
assert_eq!(baseline_before, baseline_after);
let (_, _, suppressed) = rerun.get_matches()[0].as_ref();
assert!(!suppressed.visible);
Ok(())
}
// Each unambiguous spelling (padded hex, `0x`/`0X`-prefixed hex, decimal,
// whitespace-padded decimal) parses to a single value; empty and non-numeric
// input parse to nothing.
#[test]
fn parse_fingerprint_accepts_all_forms() {
let value: u64 = 0xfeed_beef_dade_f00d;
assert_eq!(parse_fingerprint(&format!("{:016x}", value)).as_slice(), &[value]);
assert_eq!(parse_fingerprint(&format!("0x{:016x}", value)).as_slice(), &[value]);
assert_eq!(parse_fingerprint(&format!("0X{:X}", value)).as_slice(), &[value]);
assert_eq!(parse_fingerprint(&value.to_string()).as_slice(), &[value]);
assert_eq!(parse_fingerprint(" 42 ").as_slice(), &[42]);
assert_eq!(parse_fingerprint("0").as_slice(), &[0]);
assert!(parse_fingerprint("").is_empty());
assert!(parse_fingerprint("notahex").is_empty());
}
#[test]
fn parse_fingerprint_all_digit_16_chars_is_ambiguous() {
// A 16-char all-digit string could be either a decimal fingerprint
// (scan output) or a legacy hex fingerprint whose value contains no
// a-f. Both interpretations must be returned so old baselines keep
// matching while new decimal fingerprints round-trip.
let s = "1234567890123456";
let parsed = parse_fingerprint(s);
assert!(parsed.contains(&1234567890123456_u64));
assert!(parsed.contains(&u64::from_str_radix(s, 16).unwrap()));
}
#[test]
fn decimal_fingerprint_from_output_roundtrips() -> Result<()> {
// Regression for issue #344: a fingerprint copied (in decimal) from
// scan output into a hand-written baseline file must suppress the match.
let tmp = TempDir::new()?;
let roots = [tmp.path().to_path_buf()];
let secret_file = tmp.path().join("secret.txt");
fs::write(&secret_file, "dummy")?;
let baseline_path = tmp.path().join("baseline.yaml");
let fingerprint = 0xfeed_beef_dade_f00d_u64;
let hand_written = BaselineFile {
exact_findings: ExactFindings {
matches: vec![BaselineFinding {
filepath: expected_relative_path(roots[0].as_path(), &secret_file),
fingerprint: fingerprint.to_string(),
linenum: 1,
lastupdated: "now".to_string(),
}],
},
};
save_baseline(&baseline_path, &hand_written)?;
let mut store = make_store_with_match(fingerprint, &secret_file);
apply_baseline(&mut store, &baseline_path, false, &roots)?;
let (_, _, m) = store.get_matches()[0].as_ref();
assert!(!m.visible);
Ok(())
}
#[test]
fn legacy_hex_baseline_still_matches() -> Result<()> {
// A baseline file written by an older kingfisher (hex-padded) must
// still suppress matches after the decimal switchover.
let tmp = TempDir::new()?;
let roots = [tmp.path().to_path_buf()];
let secret_file = tmp.path().join("secret.txt");
fs::write(&secret_file, "dummy")?;
let baseline_path = tmp.path().join("baseline.yaml");
let fingerprint = 0xfeed_beef_dade_f00d_u64;
let legacy = BaselineFile {
exact_findings: ExactFindings {
matches: vec![BaselineFinding {
filepath: expected_relative_path(roots[0].as_path(), &secret_file),
fingerprint: format!("{:016x}", fingerprint),
linenum: 1,
lastupdated: "then".to_string(),
}],
},
};
save_baseline(&baseline_path, &legacy)?;
let mut store = make_store_with_match(fingerprint, &secret_file);
apply_baseline(&mut store, &baseline_path, false, &roots)?;
let (_, _, m) = store.get_matches()[0].as_ref();
assert!(!m.visible);
Ok(())
}
// A single baseline file may mix a legacy hex entry and a decimal entry; each
// must suppress its own finding.
#[test]
fn mixed_format_baseline_matches_both_entries() -> Result<()> {
let tmp = TempDir::new()?;
let roots = [tmp.path().to_path_buf()];
let file_hex = tmp.path().join("hex.txt");
let file_dec = tmp.path().join("dec.txt");
fs::write(&file_hex, "dummy")?;
fs::write(&file_dec, "dummy")?;
let baseline_path = tmp.path().join("baseline.yaml");
// fp_hex must contain at least one hex letter so its 16-char hex form is
// unambiguously hex (an all-digit 16-char string is ambiguous and is
// matched under both its decimal and its hex reading).
let fp_hex = 0x1a2b_3c4d_5e6f_7890_u64;
let fp_dec = 0xaaaa_bbbb_cccc_dddd_u64;
let mixed = BaselineFile {
exact_findings: ExactFindings {
matches: vec![
BaselineFinding {
filepath: expected_relative_path(roots[0].as_path(), &file_hex),
fingerprint: format!("{:016x}", fp_hex),
linenum: 1,
lastupdated: "then".to_string(),
},
BaselineFinding {
filepath: expected_relative_path(roots[0].as_path(), &file_dec),
fingerprint: fp_dec.to_string(),
linenum: 1,
lastupdated: "now".to_string(),
},
],
},
};
save_baseline(&baseline_path, &mixed)?;
let mut store_hex = make_store_with_match(fp_hex, &file_hex);
apply_baseline(&mut store_hex, &baseline_path, false, &roots)?;
assert!(!store_hex.get_matches()[0].as_ref().2.visible);
let mut store_dec = make_store_with_match(fp_dec, &file_dec);
apply_baseline(&mut store_dec, &baseline_path, false, &roots)?;
assert!(!store_dec.get_matches()[0].as_ref().2.visible);
Ok(())
}
}