diff --git a/CHANGELOG.md b/CHANGELOG.md index ebfc95f..2e1179b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file. ## [1.21.0] - Improved Azure Storage rule - Added rule to detect TravisCI encrypted values +- Added baseline feature with `--baseline-file` and `--manage-baseline` flags +- Introduced `--exclude` option for skipping paths +- Added tests covering baseline and exclude workflow + ## [1.20.0] - Removed confirmation prompt when user provides --self-update flag diff --git a/Cargo.toml b/Cargo.toml index 179f285..dc2db13 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -161,6 +161,7 @@ trust-dns-resolver = { version = "0.23.2", default-features = false, features atty = "0.2.14" self_update = { version = "0.42.0", default-features = false, features = ["rustls", "archive-tar", "archive-zip", "compression-flate2"] } semver = "1.0.26" +globset = "0.4.16" [dependencies.tikv-jemallocator] version = "0.6" diff --git a/README.md b/README.md index 40bbce6..d7ae8b6 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,24 @@ kingfisher scan /path/to/repo --rule-stats kingfisher scan ./my-project --ignore-tests ``` +### Exclude specific paths +```bash +# Skip all Python files and any directory named tests +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude tests +``` + +If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude): + +```bash +# Skip all Python files and any directory named tests, and report to stderr any skipped files +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude tests \ + -v +``` + --- ## Scanning GitHub @@ -287,6 +305,32 @@ kingfisher github repos list --organization my-org - `--extraction-depth `: Specifies how deep nested archives should be extracted and scanned (default: 2) - `--redact`: Replaces discovered secrets with a one-way 
hash for secure output - `--ignore-tests`: Skip files or directories whose path component contains _test_, _spec_, _fixture_, _example_, or _sample_ (case-insensitive) +- `--exclude <PATTERN>`: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax) +- `--baseline-file <FILE>`: Ignore matches listed in a baseline YAML file +- `--manage-baseline`: Create or update the baseline file with current findings + +## Build a Baseline / Detect New Secrets + +There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so future scans only report anything that is not already in that list. + +The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches): + +```bash +kingfisher scan /path/to/code \ + --confidence low \ + --manage-baseline \ + --baseline-file ./baseline-file.yml +``` + +Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: + +```bash +kingfisher scan /path/to/code \ + --baseline-file /path/to/baseline-file.yml +``` + +See [docs/BASELINE.md](docs/BASELINE.md) for full details. + ## Finding Fingerprint diff --git a/docs/BASELINE.md b/docs/BASELINE.md new file mode 100644 index 0000000..aef0274 --- /dev/null +++ b/docs/BASELINE.md @@ -0,0 +1,55 @@ + +# Build a Baseline / Detect Only New Secrets + +There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so future scans only report anything that is not already in that list. 
+ +The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches): + +```bash +kingfisher scan /path/to/code \ + --confidence low \ + --manage-baseline \ + --baseline-file ./baseline-file.yml +``` + +This generates a YAML file named `baseline-file.yml` in the current directory. The file tracks each finding under an `ExactFindings` section: + +```yaml +ExactFindings: + matches: + - filepath: ruby_vulnerable.rb/ + fingerprint: 056876f00ffd0622 + linenum: 52 + lastupdated: Mon, 14 Jul 2025 10:17:56 -0700 + - filepath: ruby_vulnerable.rb/ + fingerprint: ce41d19b83b2b1b0 + linenum: 53 + lastupdated: Mon, 14 Jul 2025 10:17:56 -0700 + - filepath: ruby_vulnerable.rb/ + fingerprint: e8644d91fa6654f5 + linenum: 40 + lastupdated: Mon, 14 Jul 2025 10:17:56 -0700 +``` + +`fingerprint` reuses Kingfisher's 64-bit *finding fingerprint* algorithm with offsets set to zero. It hashes the secret value together with the normalized filepath, so moving a secret to a different line within the same file does not create a new entry, whereas moving it to a different file does. 
+ +Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: + +```bash +kingfisher scan /path/to/code \ + --baseline-file /path/to/baseline-file.yaml +``` + +If you intentionally add a new secret that should be ignored later, rerun the scan with both `--manage-baseline` and `--baseline-file` to append new matches to the file without removing existing entries: + +```bash +kingfisher scan /path/to/code \ + --manage-baseline \ + --baseline-file /path/to/baseline-file.yml +``` + +If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via `--exclude`): + +```bash +kingfisher scan /path/to/project -v +``` \ No newline at end of file diff --git a/src/baseline.rs b/src/baseline.rs new file mode 100644 index 0000000..c2588f2 --- /dev/null +++ b/src/baseline.rs @@ -0,0 +1,115 @@ +use std::{ + collections::HashSet, + fs, + path::{Path, PathBuf}, + sync::Arc, +}; + +use anyhow::{Context, Result}; +use chrono::Local; +use serde::{Deserialize, Serialize}; +use tracing::debug; + +use crate::{findings_store::FindingsStore, matcher::compute_finding_fingerprint}; + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct BaselineFile { + #[serde(rename = "ExactFindings", default)] + pub exact_findings: ExactFindings, +} + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct ExactFindings { + #[serde(default)] + pub matches: Vec, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct BaselineFinding { + pub filepath: String, + pub fingerprint: String, + pub linenum: usize, + pub lastupdated: String, +} + +pub fn load_baseline(path: &Path) -> Result { + let data = fs::read_to_string(path).context("read baseline file")?; + Ok(serde_yaml::from_str(&data).context("parse baseline yaml")?) 
+} + +pub fn save_baseline(path: &Path, baseline: &BaselineFile) -> Result<()> { + let data = serde_yaml::to_string(baseline).context("serialize baseline")?; + fs::write(path, data).context("write baseline file") +} + +fn normalize_path(p: &Path, roots: &[PathBuf]) -> String { + for root in roots { + if let Ok(stripped) = p.strip_prefix(root) { + if let Some(name) = root.file_name() { + return PathBuf::from(name).join(stripped).to_string_lossy().into(); + } + } + } + p.to_string_lossy().into() +} + +fn compute_hash(secret: &str, path: &str) -> String { + let fp = compute_finding_fingerprint(secret, path, 0, 0); + format!("{:016x}", fp) +} + +fn extract_secret(m: &crate::matcher::Match) -> String { + m.groups + .captures + .get(1) + .or_else(|| m.groups.captures.get(0)) + .map(|c| c.value.to_string()) + .unwrap_or_default() +} + +pub fn apply_baseline( + store: &mut FindingsStore, + baseline_path: &Path, + manage: bool, + roots: &[PathBuf], +) -> Result<()> { + let mut baseline = if baseline_path.exists() { + load_baseline(baseline_path)? 
+ } else { + BaselineFile::default() + }; + + let mut known: HashSet = + baseline.exact_findings.matches.iter().map(|m| m.fingerprint.clone()).collect(); + + let mut new_entries = Vec::new(); + for arc_msg in store.get_matches_mut() { + let (origin, _blob, m) = Arc::make_mut(arc_msg); + let file_path = origin.iter().filter_map(|o| o.full_path()).next(); + if let Some(fp) = file_path { + let normalized = normalize_path(&fp, roots); + let secret = extract_secret(m); + let hash = compute_hash(&secret, &normalized); + if known.contains(&hash) { + debug!("Skipping {} due to baseline (hash {})", normalized, hash); + m.visible = false; + } else if manage { + known.insert(hash.clone()); + let entry = BaselineFinding { + filepath: normalized, + fingerprint: hash, + linenum: m.location.source_span.start.line, + lastupdated: Local::now().to_rfc2822(), + }; + new_entries.push(entry); + } + } + } + + if manage && !new_entries.is_empty() { + baseline.exact_findings.matches.extend(new_entries); + save_baseline(baseline_path, &baseline)?; + } + + Ok(()) +} diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs index acfc220..20d3dde 100644 --- a/src/cli/commands/inputs.rs +++ b/src/cli/commands/inputs.rs @@ -111,9 +111,13 @@ pub struct ContentFilteringArgs { #[arg(long("max-file-size"), default_value_t = 25.0)] pub max_file_size_mb: f64, - /// Use custom path-based ignore rules from the given file(s) - #[arg(long, short, value_hint = ValueHint::FilePath)] - pub ignore: Vec, + // /// Use custom path-based ignore rules from the given file(s) + // #[arg(long, short, value_hint = ValueHint::FilePath)] + // pub ignore: Vec, + /// Skip any file or directory whose path matches this glob pattern. Multiple + /// patterns may be provided by repeating the flag. 
+ #[arg(long, value_name = "PATTERN")] + pub exclude: Vec, /// If true, do NOT extract archive files #[arg(long("no-extract-archives"), default_value_t = false)] diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 36726e0..406c896 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -106,6 +106,14 @@ pub struct ScanArgs { /// Bytes of context before and after each match #[arg(long, default_value_t = 256, value_name = "BYTES")] pub snippet_length: usize, + + /// Baseline file to filter known secrets + #[arg(long, value_name = "FILE")] + pub baseline_file: Option, + + /// Create or update the baseline file with current findings + #[arg(long, default_value_t = false)] + pub manage_baseline: bool, } /// Confidence levels for findings diff --git a/src/lib.rs b/src/lib.rs index 4395b89..9f04d46 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +pub mod baseline; pub mod binary; pub mod blob; pub mod bstring_escape; @@ -43,6 +44,7 @@ use crossbeam_channel::Sender; pub use git_repo_enumerator::{GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator}; pub use gix::{self, Repository, ThreadSafeRepository}; use gix::{open::Options, open_opts}; +use globset::{Glob, GlobSet, GlobSetBuilder}; pub use ignore::gitignore::{Gitignore, GitignoreBuilder}; use ignore::{DirEntry, WalkBuilder, WalkState}; use tokio::time::Duration; @@ -204,6 +206,7 @@ pub struct FilesystemEnumerator { extract_archives: bool, extraction_depth: usize, no_dedup: bool, + exclude_globset: Option>, } impl FilesystemEnumerator { @@ -234,6 +237,7 @@ impl FilesystemEnumerator { extract_archives: !args.content_filtering_args.no_extract_archives, extraction_depth: args.content_filtering_args.extraction_depth as usize, no_dedup: args.no_dedup, + exclude_globset: None, }) } @@ -287,6 +291,27 @@ impl FilesystemEnumerator { self } + pub fn set_exclude_patterns(&mut self, patterns: &[String]) -> Result<&mut Self> { + if patterns.is_empty() { + return Ok(self); + } + 
let mut builder = GlobSetBuilder::new(); + for pat in patterns { + builder.add(Glob::new(pat)?); + } + let globset = std::sync::Arc::new(builder.build()?); + self.exclude_globset = Some(globset.clone()); + self.filter_entry(move |entry| { + let path = entry.path(); + let matches = globset.is_match(path); + if matches { + debug!("Skipping {} due to --exclude", path.display()); + } + !matches + }); + Ok(self) + } + pub fn gitignore(&self) -> Result { Ok(self.gitignore_builder.build()?) } diff --git a/src/main.rs b/src/main.rs index 280422c..2567187 100644 --- a/src/main.rs +++ b/src/main.rs @@ -132,7 +132,18 @@ pub fn determine_exit_code(datastore: &Arc> // exit with code 0 if there are NO findings discovered let ds = datastore.lock().unwrap(); // Get all matches - let all_matches = ds.get_matches(); + // let all_matches = ds.get_matches(); + + // Only consider visible matches when determining the exit code + let all_matches = ds + .get_matches() + .iter() + .filter(|msg| { + let (_, _, match_item) = &***msg; + match_item.visible + }) + .collect::>(); + if all_matches.is_empty() { // No findings discovered 0 @@ -141,7 +152,7 @@ pub fn determine_exit_code(datastore: &Arc> let validated_matches = all_matches .iter() .filter(|msg| { - let (_, _, match_item) = &***msg; + let (_, _, match_item) = &****msg; match_item.validation_success }) .count(); @@ -278,7 +289,7 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { max_file_size_mb: 25.0, no_extract_archives: true, extraction_depth: 2, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns no_binary: true, }, confidence: ConfidenceLevel::Medium, @@ -291,6 +302,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { no_dedup: false, ignore_tests: false, snippet_length: 256, + baseline_file: None, + manage_baseline: false, output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, } } diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 
ec45e83..c1185b2 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -383,6 +383,7 @@ mod tests { rules::rule::Confidence, util::intern, }; + fn create_default_args() -> cli::commands::scan::ScanArgs { use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope @@ -424,7 +425,7 @@ mod tests { max_file_size_mb: 25.0, no_extract_archives: false, extraction_depth: 2, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns no_binary: true, }, confidence: ConfidenceLevel::Medium, @@ -436,6 +437,8 @@ mod tests { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, snippet_length: 256, + baseline_file: None, + manage_baseline: false, } } diff --git a/src/reporter/pretty_format.rs b/src/reporter/pretty_format.rs index 0e78477..7a6212c 100644 --- a/src/reporter/pretty_format.rs +++ b/src/reporter/pretty_format.rs @@ -351,7 +351,7 @@ fn test_pretty_format_with_nan_entropy_panics() { max_file_size_mb: 25.0, no_extract_archives: false, extraction_depth: 2, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns no_binary: true, }, @@ -369,6 +369,8 @@ fn test_pretty_format_with_nan_entropy_panics() { // display snippet_length: 256, + baseline_file: None, + manage_baseline: false, }; // This will panic if the entropy isn't checked for NaN diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index d9ca0f8..62dbd56 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -246,13 +246,7 @@ fn make_fs_enumerator( // Pass no_dedup when enumerating git history ie.no_dedup(args.no_dedup); - // Load any specified ignore files - for ignore_path in args.content_filtering_args.ignore.iter() { - debug!("Using ignore rules from {}", ignore_path.display()); - ie.add_ignore(ignore_path).with_context(|| { - format!("Failed to load ignore rules from {}", ignore_path.display()) - })?; - } + 
ie.set_exclude_patterns(&args.content_filtering_args.exclude)?; // Determine whether to collect git metadata or not let collect_git_metadata = false; ie.collect_git_metadata(collect_git_metadata); diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs index 112a04e..7c2f964 100644 --- a/src/scanner/runner.rs +++ b/src/scanner/runner.rs @@ -100,6 +100,16 @@ pub async fn run_async_scan( ds.replace_matches(deduped_arcs); } + // If baseline management is enabled, apply the baseline + if args.baseline_file.is_some() || args.manage_baseline { + let path = args + .baseline_file + .clone() + .unwrap_or_else(|| std::path::PathBuf::from("baseline-file.yaml")); + let mut ds = datastore.lock().unwrap(); + crate::baseline::apply_baseline(&mut ds, &path, args.manage_baseline, &input_roots)?; + } + // If validation is enabled, run it as a second phase if !args.no_validate { info!("Starting secret validation phase..."); diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index 94206d0..3c41afd 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -90,7 +90,7 @@ rules: extraction_depth: 1, no_binary: true, no_extract_archives: false, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Low, no_validate: true, @@ -103,6 +103,8 @@ rules: no_dedup, ignore_tests: false, snippet_length: 64, + baseline_file: None, + manage_baseline: false, }; let global_args = GlobalArgs { diff --git a/tests/int_github.rs b/tests/int_github.rs index e541812..c66fdb8 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -77,7 +77,7 @@ fn test_github_remote_scan() -> Result<()> { no_extract_archives: false, extraction_depth: 2, no_binary: true, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Medium, no_validate: false, @@ -90,6 +90,8 @@ fn test_github_remote_scan() -> Result<()> { no_dedup: true, ignore_tests: false, snippet_length: 256, + baseline_file: None, + manage_baseline: false, }; // 
Create global arguments let global_args = GlobalArgs { diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index ea0acaf..308f6a0 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -74,7 +74,7 @@ fn test_gitlab_remote_scan() -> Result<()> { no_extract_archives: false, extraction_depth: 2, no_binary: true, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Medium, no_validate: false, @@ -87,6 +87,8 @@ fn test_gitlab_remote_scan() -> Result<()> { no_dedup: true, ignore_tests: false, snippet_length: 256, + baseline_file: None, + manage_baseline: false, }; let global_args = GlobalArgs { diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index 7f9ca80..06bc55a 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -133,7 +133,7 @@ async fn test_validation_cache_and_depvars() -> Result<()> { extraction_depth: 2, no_binary: true, no_extract_archives: false, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Low, no_validate: false, @@ -146,6 +146,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> { no_dedup: true, // keep duplicates so the cache is stressed ignore_tests: false, snippet_length: 128, + baseline_file: None, + manage_baseline: false, }; /* --------------------------------------------------------- * diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 20ad438..032a51b 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -76,7 +76,7 @@ impl TestContext { extraction_depth: 2, no_binary: true, no_extract_archives: false, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Low, no_validate: true, @@ -89,6 +89,8 @@ impl TestContext { no_dedup: true, ignore_tests: false, snippet_length: 256, + baseline_file: None, + manage_baseline: false, }; let loaded = 
RuleLoader::from_rule_specifiers(&scan_args.rules) @@ -140,7 +142,7 @@ impl TestContext { extraction_depth: 2, no_binary: true, no_extract_archives: false, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Low, no_validate: true, @@ -153,6 +155,8 @@ impl TestContext { no_dedup: true, ignore_tests: false, snippet_length: 256, + baseline_file: None, + manage_baseline: false, }; let global_args = GlobalArgs { diff --git a/tests/smoke_baseline.rs b/tests/smoke_baseline.rs new file mode 100644 index 0000000..d5d1962 --- /dev/null +++ b/tests/smoke_baseline.rs @@ -0,0 +1,56 @@ +use std::fs; + +use assert_cmd::Command; +use predicates::prelude::*; +use tempfile::tempdir; + +const GH_PAT: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; + +#[test] +fn baseline_create_and_filter() -> anyhow::Result<()> { + let dir = tempdir()?; + let file = dir.path().join("leak.txt"); + fs::write(&file, format!("token = \"{}\"\n", GH_PAT))?; + let baseline = dir.path().join("baseline.yaml"); + + // Create baseline with manage flag + Command::cargo_bin("kingfisher")? + .args([ + "scan", + dir.path().to_str().unwrap(), + "--no-binary", + "--confidence=low", + "--no-validate", + "--format", + "json", + "--manage-baseline", + "--baseline-file", + baseline.to_str().unwrap(), + "--no-update-check", + ]) + .assert() + .code(200) + .stdout(predicate::str::contains(GH_PAT)); + + assert!(baseline.exists(), "baseline file created"); + + // Scan again using the baseline + Command::cargo_bin("kingfisher")? 
+ .args([ + "scan", + dir.path().to_str().unwrap(), + "--no-binary", + "--confidence=low", + "--no-validate", + "--format", + "json", + "--baseline-file", + baseline.to_str().unwrap(), + "--no-update-check", + ]) + .assert() + .code(0) + .stdout(predicate::str::contains(GH_PAT).not()); + + Ok(()) +} diff --git a/tests/smoke_exclude.rs b/tests/smoke_exclude.rs new file mode 100644 index 0000000..76e647d --- /dev/null +++ b/tests/smoke_exclude.rs @@ -0,0 +1,34 @@ +use std::fs; + +use assert_cmd::Command; +use predicates::prelude::*; +use tempfile::tempdir; + +const SECRET: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; + +#[test] +fn exclude_pattern_hides_matches() -> anyhow::Result<()> { + let dir = tempdir()?; + let py = dir.path().join("foo.py"); + let txt = dir.path().join("bar.txt"); + fs::write(&py, format!("token = \"{}\"\n", SECRET))?; + fs::write(&txt, format!("token = \"{}\"\n", SECRET))?; + + Command::cargo_bin("kingfisher")? + .args([ + "scan", + dir.path().to_str().unwrap(), + "--confidence=low", + "--no-binary", + "--no-validate", + "--format", + "json", + "--exclude=*.py", + "--no-update-check", + ]) + .assert() + .code(200) + .stdout(predicate::str::contains("bar.txt").and(predicate::str::contains("foo.py").not())); + + Ok(()) +}