Merge pull request #36 from mongodb/development

v1.21.0
This commit is contained in:
Mick Grove 2025-07-14 18:07:58 -07:00 committed by GitHub
commit 9bee9e6b5d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
29 changed files with 743 additions and 110 deletions

View file

@ -2,6 +2,17 @@
All notable changes to this project will be documented in this file.
## [1.21.0]
- Improved Azure Storage rule
- Added rule to detect TravisCI encrypted values
- Added baseline feature with `--baseline-file` and `--manage-baseline` flags
- Introduced `--exclude` option for skipping paths
- Added tests covering baseline and exclude workflow
- Added validation for JWT tokens that checks `exp` and `nbf` claims
- JWT validation performs OpenID Connect discovery using the `iss` claim and verifies signatures via JWKS
- Removed `--ignore-tests` argument, because the `--exclude` flag provides more granular functionality
- DigitalOcean rule update
- Adafruit rule update
## [1.20.0]
- Removed confirmation prompt when user provides --self-update flag

View file

@ -10,7 +10,7 @@ publish = false
[package]
name = "kingfisher"
version = "1.20.0"
version = "1.21.0"
edition.workspace = true
rust-version.workspace = true
license.workspace = true
@ -161,6 +161,9 @@ trust-dns-resolver = { version = "0.23.2", default-features = false, features
atty = "0.2.14"
self_update = { version = "0.42.0", default-features = false, features = ["rustls", "archive-tar", "archive-zip", "compression-flate2"] }
semver = "1.0.26"
globset = "0.4.16"
jsonwebtoken = "9.3.1"
ipnet = "2.11.0"
[dependencies.tikv-jemallocator]
version = "0.6"

View file

@ -140,9 +140,34 @@ kingfisher scan /path/to/repo --rule-stats
### Scan while ignoring likely test files
`--exclude` skips any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive)
```bash
# Scan source but skip likely unit / integration tests
kingfisher scan ./my-project --ignore-tests
kingfisher scan ./my-project \
--exclude='[Tt]est' \
--exclude='spec' \
--exclude='[Ff]ixture' \
--exclude='example' \
--exclude='sample'
```
### Exclude specific paths
```bash
# Skip all Python files and any directory named tests
kingfisher scan ./my-project \
--exclude '*.py' \
--exclude '[Tt]ests'
```
If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude):
```bash
# Skip all Python files and any directory named tests, and report to stderr any skipped files
kingfisher scan ./my-project \
--exclude '*.py' \
--exclude tests \
-v
```
---
@ -286,7 +311,32 @@ kingfisher github repos list --organization my-org
- `--no-extract-archives`: Do not scan inside archives
- `--extraction-depth <N>`: Specifies how deep nested archives should be extracted and scanned (default: 2)
- `--redact`: Replaces discovered secrets with a one-way hash for secure output
- `--ignore-tests`:Skip files or directories whose path component contains _test_, _spec_, _fixture_, _example_, or _sample_ (case-insensitive)
- `--exclude <PATTERN>`: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive)
- `--baseline-file <FILE>`: Ignore matches listed in a baseline YAML file
- `--manage-baseline`: Create or update the baseline file with current findings
## Build a Baseline / Detect New Secrets
There are situations where a repository already contains checked-in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so that future scans report only findings that are not already in that list.
The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches):
```bash
kingfisher scan /path/to/code \
--confidence low \
--manage-baseline \
--baseline-file ./baseline-file.yml
```
Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings:
```bash
kingfisher scan /path/to/code \
--baseline-file /path/to/baseline-file.yaml
```
See [docs/BASELINE.md](docs/BASELINE.md) for full details.
## Finding Fingerprint

View file

@ -1,9 +1,12 @@
rules:
- name: Azure Storage Account Name
id: kingfisher.azurestorage.name.1
id: kingfisher.azurestorage.1
pattern: |
(?xi)
(?:
\b
azure
(?:.|[\n\r]){0,32}?
(?i:
(?:Account|Storage)
(?:[._-]Account)?
@ -19,13 +22,16 @@ rules:
visible: false
confidence: medium
examples:
- storage_name=mystorageaccount123
- azure_storage_name=mystorageaccount123
- mystorageaccount.blob.core.windows.net
- name: Azure Storage Account Key
id: kingfisher.azurestorage.key.1
id: kingfisher.azurestorage.2
pattern: |
(?xi)
\b
azure
(?:.|[\n\r]){0,32}?
(?i:(?:Access|Account|Storage)[_.-]?Key)
(?:.|[\n\r]){0,25}?
(
@ -34,9 +40,9 @@ rules:
min_entropy: 4.0
confidence: medium
examples:
- AccountKey=Xy9aB8cD7eF6gH5iJ4kL3mN2oP1qR0sT9uV8wX7yZ6aB5cD4eF3gH2iJ1kL0mN9oP8qR7sT6uV5wX4yZ3aB2cD1eF0gH9iJ8kL7mN6oP5q==\
- Azure AccountKey=Xy9aB8cD7eF6gH5iJ4kL3mN2oP1qR0sT9uV8wX7yZ6aB5cD4eF3gH2iJ1kL0mN9oP8qR7sT6uV5wX4yZ3aB2cD1eF0gH9iJ8kL7mN6oP5q==\
validation:
type: AzureStorage
depends_on_rule:
- rule_id: kingfisher.azurestorage.name.1
- rule_id: kingfisher.azurestorage.1
variable: AZURENAME

View file

@ -22,4 +22,6 @@ rules:
- https://datatracker.ietf.org/doc/html/rfc7519
- https://en.wikipedia.org/wiki/Base64#URL_applications
- https://datatracker.ietf.org/doc/html/rfc4648
- https://developer.okta.com/blog/2018/06/20/what-happens-if-your-jwt-is-stolen
- https://developer.okta.com/blog/2018/06/20/what-happens-if-your-jwt-is-stolen
validation:
type: JWT

View file

@ -31,3 +31,24 @@ rules:
- report_response: true
- type: StatusMatch
status: [200]
- name: Travis CI Encrypted Variable
id: kingfisher.travisci.1
# Finds a `secure:` value that appears within 256 characters after a
# `language:`, `env:`, `deploy:` or `script:` key. The captured group is the
# whole `secure: "..."` entry.
pattern: |
  (?xis)
  \b
  (?:language|env|deploy|script):[\r\n]
  (?:.|[\n\r]){0,256}?
  \b
  (
    secure:\s*"?[A-Za-z0-9+/=\\]+"?\s*
  )
  \b
min_entropy: 3.0
confidence: medium
examples:
- |
env:
global:
# This sets FOO=super-secret, but the plaintext never appears here.
- secure: "VJh0l9gOb+6AVNDk6cziZSs1AqVM8CqtZU6ot9ZQeJ+KfL1pxnGQ4qQF8Cz9\M1q85c3l1N1+qkQ0uV12QG6O6ylq6Qq1l3VjAJM3h2pY3jdmrA8kX2ZIxRjC/\8+Xj1wVtKQ0R+owM/6i5Y6cyx4hRb3VvSeYlC0lD1iTzQ2vgMyE="

55
docs/BASELINE.md Normal file
View file

@ -0,0 +1,55 @@
# Build a Baseline / Detect Only New Secrets
There are situations where a repository already contains checked-in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so that future scans report only findings that are not already in that list.
The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches):
```bash
kingfisher scan /path/to/code \
--confidence low \
--manage-baseline \
--baseline-file ./baseline-file.yml
```
This generates a YAML file named `baseline-file.yml` in the current directory. The file tracks each finding under an `ExactFindings` section:
```yaml
ExactFindings:
matches:
- filepath: ruby_vulnerable.rb/
fingerprint: 056876f00ffd0622
linenum: 52
lastupdated: Mon, 14 Jul 2025 10:17:56 -0700
- filepath: ruby_vulnerable.rb/
fingerprint: ce41d19b83b2b1b0
linenum: 53
lastupdated: Mon, 14 Jul 2025 10:17:56 -0700
- filepath: ruby_vulnerable.rb/
fingerprint: e8644d91fa6654f5
linenum: 40
lastupdated: Mon, 14 Jul 2025 10:17:56 -0700
```
`fingerprint` reuses Kingfisher's 64-bit *finding fingerprint* algorithm with offsets set to zero. It hashes the secret value together with the normalized filepath, so moving a secret to a different position within the same file does not create a new entry.
Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings:
```bash
kingfisher scan /path/to/code \
--baseline-file /path/to/baseline-file.yaml
```
If you intentionally add a new secret that should be ignored later, rerun the scan with both `--manage-baseline` and `--baseline-file` to append new matches to the file without removing existing entries:
```bash
kingfisher scan /path/to/code \
--manage-baseline \
--baseline-file /path/to/baseline-file.yml
```
If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via `--exclude`):
```bash
kingfisher scan /path/to/project -v
```

115
src/baseline.rs Normal file
View file

@ -0,0 +1,115 @@
use std::{
collections::HashSet,
fs,
path::{Path, PathBuf},
sync::Arc,
};
use anyhow::{Context, Result};
use chrono::Local;
use serde::{Deserialize, Serialize};
use tracing::debug;
use crate::{findings_store::FindingsStore, matcher::compute_finding_fingerprint};
/// Top-level document stored in the baseline YAML file.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct BaselineFile {
/// Findings stored under the `ExactFindings` key; `default` lets a file
/// without that key deserialize to an empty section.
#[serde(rename = "ExactFindings", default)]
pub exact_findings: ExactFindings,
}
/// The `ExactFindings` section of a baseline file: a flat list of recorded findings.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct ExactFindings {
/// Recorded findings; `default` lets a missing `matches` key deserialize to an empty list.
#[serde(default)]
pub matches: Vec<BaselineFinding>,
}
/// One recorded finding in the baseline file.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct BaselineFinding {
/// Path of the file the finding came from, as produced by `normalize_path`.
pub filepath: String,
/// 16-digit, zero-padded hexadecimal fingerprint produced by `compute_hash`;
/// this is the only field `apply_baseline` compares against.
pub fingerprint: String,
/// Starting line of the match at the time the entry was recorded.
pub linenum: usize,
/// RFC 2822 local timestamp taken when the entry was recorded.
pub lastupdated: String,
}
pub fn load_baseline(path: &Path) -> Result<BaselineFile> {
let data = fs::read_to_string(path).context("read baseline file")?;
Ok(serde_yaml::from_str(&data).context("parse baseline yaml")?)
}
pub fn save_baseline(path: &Path, baseline: &BaselineFile) -> Result<()> {
let data = serde_yaml::to_string(baseline).context("serialize baseline")?;
fs::write(path, data).context("write baseline file")
}
/// Rewrites `p` relative to the first scan root that contains it, keeping the
/// root's final component as the leading segment (`/tmp/repo/src/a.rs` with
/// root `/tmp/repo` becomes `repo/src/a.rs`).
///
/// Roots that are not a prefix of `p`, or that have no final component (for
/// example `/`), are skipped. If no root qualifies, `p` is returned unchanged.
/// When `p` equals the root itself the result ends with a path separator,
/// because joining an empty remainder appends one.
fn normalize_path(p: &Path, roots: &[PathBuf]) -> String {
    roots
        .iter()
        .find_map(|root| {
            let relative = p.strip_prefix(root).ok()?;
            let root_name = root.file_name()?;
            Some(PathBuf::from(root_name).join(relative))
        })
        .map(|joined| joined.to_string_lossy().into_owned())
        .unwrap_or_else(|| p.to_string_lossy().into_owned())
}
/// Fingerprints `secret` together with `path`, passing zero for both offset
/// arguments, and renders the result as 16 zero-padded lowercase hex digits.
fn compute_hash(secret: &str, path: &str) -> String {
    let fingerprint = compute_finding_fingerprint(secret, path, 0, 0);
    format!("{fingerprint:016x}")
}
/// Returns the text used as the secret for baseline hashing: capture 1 when
/// present, otherwise capture 0, otherwise an empty string.
fn extract_secret(m: &crate::matcher::Match) -> String {
    let captures = &m.groups.captures;
    match captures.get(1).or_else(|| captures.get(0)) {
        Some(capture) => capture.value.to_string(),
        None => String::new(),
    }
}
/// Hides findings already recorded in the baseline and, when `manage` is set,
/// appends previously unseen findings to the baseline file.
///
/// * `store` - findings to filter; matches found in the baseline get `visible = false`.
/// * `baseline_path` - baseline YAML file; a missing file is treated as an empty baseline.
/// * `manage` - when true, new findings are recorded and the file is rewritten
///   (only if at least one new entry was collected).
/// * `roots` - scan roots used to normalize file paths before hashing.
///
/// Findings without a file path are left untouched.
///
/// # Errors
/// Propagates failures from loading or saving the baseline file.
pub fn apply_baseline(
    store: &mut FindingsStore,
    baseline_path: &Path,
    manage: bool,
    roots: &[PathBuf],
) -> Result<()> {
    let mut baseline = if !baseline_path.exists() {
        BaselineFile::default()
    } else {
        load_baseline(baseline_path)?
    };

    // Fingerprints already present in the file, plus any added during this run.
    let mut seen: HashSet<String> = baseline
        .exact_findings
        .matches
        .iter()
        .map(|entry| entry.fingerprint.clone())
        .collect();
    let mut additions = Vec::new();

    for record in store.get_matches_mut() {
        let (origin, _blob, finding) = Arc::make_mut(record);
        let full_path = match origin.iter().find_map(|o| o.full_path()) {
            Some(path) => path,
            None => continue,
        };

        let normalized = normalize_path(&full_path, roots);
        let hash = compute_hash(&extract_secret(finding), &normalized);

        if seen.contains(&hash) {
            debug!("Skipping {} due to baseline (hash {})", normalized, hash);
            finding.visible = false;
            continue;
        }

        if manage {
            seen.insert(hash.clone());
            additions.push(BaselineFinding {
                filepath: normalized,
                fingerprint: hash,
                linenum: finding.location.source_span.start.line,
                lastupdated: Local::now().to_rfc2822(),
            });
        }
    }

    if manage && !additions.is_empty() {
        baseline.exact_findings.matches.extend(additions);
        save_baseline(baseline_path, &baseline)?;
    }
    Ok(())
}

View file

@ -111,9 +111,13 @@ pub struct ContentFilteringArgs {
#[arg(long("max-file-size"), default_value_t = 25.0)]
pub max_file_size_mb: f64,
/// Use custom path-based ignore rules from the given file(s)
#[arg(long, short, value_hint = ValueHint::FilePath)]
pub ignore: Vec<PathBuf>,
// /// Use custom path-based ignore rules from the given file(s)
// #[arg(long, short, value_hint = ValueHint::FilePath)]
// pub ignore: Vec<PathBuf>,
/// Skip any file or directory whose path matches this glob pattern. Multiple
/// patterns may be provided by repeating the flag.
#[arg(long, value_name = "PATTERN")]
pub exclude: Vec<String>,
/// If true, do NOT extract archive files
#[arg(long("no-extract-archives"), default_value_t = false)]

View file

@ -88,10 +88,6 @@ pub struct ScanArgs {
#[arg(long, default_value_t = false)]
pub no_dedup: bool,
/// Ignore matches that appear to come from test files
#[arg(long, default_value_t = false)]
pub ignore_tests: bool,
/// Redact findings values using a secure hash
#[arg(long, short = 'r', default_value_t = false)]
pub redact: bool,
@ -106,6 +102,14 @@ pub struct ScanArgs {
/// Bytes of context before and after each match
#[arg(long, default_value_t = 256, value_name = "BYTES")]
pub snippet_length: usize,
/// Baseline file to filter known secrets
#[arg(long, value_name = "FILE")]
pub baseline_file: Option<std::path::PathBuf>,
/// Create or update the baseline file with current findings
#[arg(long, default_value_t = false)]
pub manage_baseline: bool,
}
/// Confidence levels for findings

View file

@ -147,9 +147,9 @@ impl RepositoryIndex {
.with_ordering(Ordering::PackLexicographicalThenLooseLexicographical)
{
let oid = unwrap_ok_or_continue!(oid, |e| debug!("Failed to read object id: {e}"));
if count % 100000 == 0 {
debug!("Indexed {} objects in RepositoryIndex::new", count);
}
// if count % 100000 == 0 {
// debug!("Indexed {} objects in RepositoryIndex::new", count);
// }
let hdr = unwrap_ok_or_continue!(odb.header(oid), |e| {
debug!("Failed to read object header for {oid}: {e}")
});

View file

@ -73,11 +73,16 @@ pub struct GitBlobMetadata {
pub struct GitRepoWithMetadataEnumerator<'a> {
path: &'a Path,
repo: Repository,
exclude_globset: Option<std::sync::Arc<globset::GlobSet>>,
}
impl<'a> GitRepoWithMetadataEnumerator<'a> {
pub fn new(path: &'a Path, repo: Repository) -> Self {
Self { path, repo }
pub fn new(
path: &'a Path,
repo: Repository,
exclude_globset: Option<std::sync::Arc<globset::GlobSet>>,
) -> Self {
Self { path, repo, exclude_globset }
}
pub fn run(self) -> Result<GitRepoResult> {
@ -173,12 +178,19 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
}
let filtered = appearances
.into_iter()
.filter(|entry| {
// Apply your actual ignore-logic here:
match entry.path.to_path() {
Ok(_path) => true,
Err(_) => true,
.filter(|entry| match entry.path.to_path() {
Ok(p) => {
if let Some(gs) = &self.exclude_globset {
let m = gs.is_match(p);
if m {
debug!("Skipping {} due to --exclude", p.display());
}
!m
} else {
true
}
}
Err(_) => true,
})
.collect::<SmallVec<_>>();
if filtered.is_empty() {

View file

@ -1,3 +1,4 @@
pub mod baseline;
pub mod binary;
pub mod blob;
pub mod bstring_escape;
@ -43,6 +44,7 @@ use crossbeam_channel::Sender;
pub use git_repo_enumerator::{GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator};
pub use gix::{self, Repository, ThreadSafeRepository};
use gix::{open::Options, open_opts};
use globset::{Glob, GlobSet, GlobSetBuilder};
pub use ignore::gitignore::{Gitignore, GitignoreBuilder};
use ignore::{DirEntry, WalkBuilder, WalkState};
use tokio::time::Duration;
@ -80,7 +82,7 @@ struct EnumeratorConfig {
enumerate_git_history: bool,
collect_git_metadata: bool,
repo_scan_timeout: Duration,
// gitignore: Gitignore,
exclude_globset: Option<std::sync::Arc<GlobSet>>,
}
pub enum FoundInput {
@ -204,6 +206,7 @@ pub struct FilesystemEnumerator {
extract_archives: bool,
extraction_depth: usize,
no_dedup: bool,
exclude_globset: Option<std::sync::Arc<GlobSet>>,
}
impl FilesystemEnumerator {
@ -234,6 +237,7 @@ impl FilesystemEnumerator {
extract_archives: !args.content_filtering_args.no_extract_archives,
extraction_depth: args.content_filtering_args.extraction_depth as usize,
no_dedup: args.no_dedup,
exclude_globset: None,
})
}
@ -287,6 +291,31 @@ impl FilesystemEnumerator {
self
}
pub fn set_exclude_patterns(&mut self, patterns: &[String]) -> Result<&mut Self> {
if patterns.is_empty() {
return Ok(self);
}
let mut builder = GlobSetBuilder::new();
for pat in patterns {
builder.add(Glob::new(pat)?);
}
let globset = std::sync::Arc::new(builder.build()?);
self.exclude_globset = Some(globset.clone());
self.filter_entry(move |entry| {
let path = entry.path();
let matches = globset.is_match(path);
if matches {
debug!("Skipping {} due to --exclude", path.display());
}
!matches
});
Ok(self)
}
/// Returns a shared handle to the compiled `--exclude` glob set, or `None`
/// when no exclude patterns have been set.
pub fn exclude_globset(&self) -> Option<std::sync::Arc<GlobSet>> {
    self.exclude_globset.as_ref().map(std::sync::Arc::clone)
}
/// Builds a [`Gitignore`] matcher from the enumerator's gitignore builder.
///
/// # Errors
/// Propagates any error reported while building the matcher.
pub fn gitignore(&self) -> Result<Gitignore> {
    let built = self.gitignore_builder.build()?;
    Ok(built)
}

View file

@ -132,7 +132,18 @@ pub fn determine_exit_code(datastore: &Arc<Mutex<findings_store::FindingsStore>>
// exit with code 0 if there are NO findings discovered
let ds = datastore.lock().unwrap();
// Get all matches
let all_matches = ds.get_matches();
// let all_matches = ds.get_matches();
// Only consider visible matches when determining the exit code
let all_matches = ds
.get_matches()
.iter()
.filter(|msg| {
let (_, _, match_item) = &***msg;
match_item.visible
})
.collect::<Vec<_>>();
if all_matches.is_empty() {
// No findings discovered
0
@ -141,7 +152,7 @@ pub fn determine_exit_code(datastore: &Arc<Mutex<findings_store::FindingsStore>>
let validated_matches = all_matches
.iter()
.filter(|msg| {
let (_, _, match_item) = &***msg;
let (_, _, match_item) = &****msg;
match_item.validation_success
})
.count();
@ -278,7 +289,7 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
max_file_size_mb: 25.0,
no_extract_archives: true,
extraction_depth: 2,
ignore: Vec::new(),
exclude: Vec::new(), // Exclude patterns
no_binary: true,
},
confidence: ConfidenceLevel::Medium,
@ -289,8 +300,9 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
redact: false,
git_repo_timeout: 1800,
no_dedup: false,
ignore_tests: false,
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
}
}

View file

@ -383,13 +383,13 @@ mod tests {
rules::rule::Confidence,
util::intern,
};
fn create_default_args() -> cli::commands::scan::ScanArgs {
use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope
cli::commands::scan::ScanArgs {
num_jobs: 1,
no_dedup: false,
ignore_tests: false,
rules: RuleSpecifierArgs {
rules_path: Vec::new(),
rule: vec!["all".into()],
@ -424,7 +424,7 @@ mod tests {
max_file_size_mb: 25.0,
no_extract_archives: false,
extraction_depth: 2,
ignore: Vec::new(),
exclude: Vec::new(), // Exclude patterns
no_binary: true,
},
confidence: ConfidenceLevel::Medium,
@ -436,6 +436,8 @@ mod tests {
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
}
}

View file

@ -163,7 +163,9 @@ impl<'a> Display for PrettyFinding<'a> {
writeln!(f, " |Fingerprint...: {}", rm.m.finding_fingerprint)?;
writeln!(f, " |Confidence....: {}", rm.match_confidence.to_string())?;
writeln!(f, " |Entropy.......: {:.2}", rm.m.calculated_entropy)?;
let validation_status = if rm.validation_response_status == StatusCode::CONTINUE.as_u16() {
let validation_status = if rm.validation_response_status == StatusCode::CONTINUE.as_u16()
|| rm.validation_response_status == StatusCode::PRECONDITION_REQUIRED.as_u16()
{
"Not Attempted".to_string()
} else if rm.validation_success {
"Active Credential".to_string()
@ -312,7 +314,6 @@ fn test_pretty_format_with_nan_entropy_panics() {
// core execution / performance
num_jobs: 1,
no_dedup: false,
ignore_tests: false,
// rule selection
rules: RuleSpecifierArgs {
@ -349,7 +350,7 @@ fn test_pretty_format_with_nan_entropy_panics() {
max_file_size_mb: 25.0,
no_extract_archives: false,
extraction_depth: 2,
ignore: Vec::new(),
exclude: Vec::new(), // Exclude patterns
no_binary: true,
},
@ -367,6 +368,8 @@ fn test_pretty_format_with_nan_entropy_panics() {
// display
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
};
// This will panic if the entropy isn't checked for NaN

View file

@ -38,6 +38,7 @@ pub enum Validation {
GCP,
MongoDB,
Postgres,
JWT,
Raw(String),
Http(HttpValidation),
}

View file

@ -37,7 +37,6 @@ use crate::{
util::is_compressed_file,
},
scanner_pool::ScannerPool,
util::is_test_like_path,
EnumeratorConfig, EnumeratorFileResult, FileResult, FilesystemEnumerator, FoundInput,
GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator, PathBuf,
};
@ -82,9 +81,10 @@ pub fn enumerate_filesystem_inputs(
}()
.context("Failed to initialize filesystem enumerator")?;
let (enum_thread, input_recv) = {
let (enum_thread, input_recv, exclude_globset) = {
let fs_enumerator = make_fs_enumerator(args, input_roots.into())
.context("Failed to initialize filesystem enumerator")?;
let exclude_globset = fs_enumerator.as_ref().and_then(|ie| ie.exclude_globset());
let channel_size = std::cmp::max(args.num_jobs * 128, 1024);
let (input_send, input_recv) = crossbeam_channel::bounded(channel_size);
@ -97,7 +97,7 @@ pub fn enumerate_filesystem_inputs(
Ok(())
})
.context("Failed to enumerate filesystem inputs")?;
(input_enumerator_thread, input_recv)
(input_enumerator_thread, input_recv, exclude_globset)
};
let enum_cfg = EnumeratorConfig {
@ -107,6 +107,7 @@ pub fn enumerate_filesystem_inputs(
},
collect_git_metadata: args.input_specifier_args.commit_metadata,
repo_scan_timeout,
exclude_globset,
};
let (send_ds, recv_ds) = create_datastore_channel(args.num_jobs);
let datastore_writer_thread =
@ -189,23 +190,11 @@ pub fn enumerate_filesystem_inputs(
Ok(Some((origin_set, blob_metadata, vec_of_matches))) => {
for (_, single_match) in vec_of_matches {
// Send each match
let is_test = if args.ignore_tests {
origin_set
.iter()
.filter_map(|o| o.full_path())
.any(|p| is_test_like_path(&p))
} else {
false
};
if !is_test {
// Send each match
send_ds.send((
Arc::new(origin_set.clone()),
Arc::new(blob_metadata.clone()),
single_match,
))?;
}
send_ds.send((
Arc::new(origin_set.clone()),
Arc::new(blob_metadata.clone()),
single_match,
))?;
}
}
Err(e) => {
@ -246,13 +235,7 @@ fn make_fs_enumerator(
// Pass no_dedup when enumerating git history
ie.no_dedup(args.no_dedup);
// Load any specified ignore files
for ignore_path in args.content_filtering_args.ignore.iter() {
debug!("Using ignore rules from {}", ignore_path.display());
ie.add_ignore(ignore_path).with_context(|| {
format!("Failed to load ignore rules from {}", ignore_path.display())
})?;
}
ie.set_exclude_patterns(&args.content_filtering_args.exclude)?;
// Determine whether to collect git metadata or not
let collect_git_metadata = false;
ie.collect_git_metadata(collect_git_metadata);
@ -610,9 +593,15 @@ impl<'cfg> ParallelBlobIterator for (&'cfg EnumeratorConfig, FoundInput) {
// Spawn an enumerator thread so we can time-out cleanly
let path_clone = path.to_path_buf();
let (tx, rx) = std::sync::mpsc::channel();
let exclude_globset = cfg.exclude_globset.clone();
let handle = std::thread::spawn(move || {
let res = if collect_git_metadata {
GitRepoWithMetadataEnumerator::new(&path_clone, repository).run()
GitRepoWithMetadataEnumerator::new(
&path_clone,
repository,
exclude_globset.clone(),
)
.run()
} else {
GitRepoEnumerator::new(&path_clone, repository).run()
};

View file

@ -100,6 +100,16 @@ pub async fn run_async_scan(
ds.replace_matches(deduped_arcs);
}
// If baseline management is enabled, apply the baseline
if args.baseline_file.is_some() || args.manage_baseline {
let path = args
.baseline_file
.clone()
.unwrap_or_else(|| std::path::PathBuf::from("baseline-file.yaml"));
let mut ds = datastore.lock().unwrap();
crate::baseline::apply_baseline(&mut ds, &path, args.manage_baseline, &input_roots)?;
}
// If validation is enabled, run it as a second phase
if !args.no_validate {
info!("Starting secret validation phase...");

View file

@ -27,6 +27,7 @@ mod aws;
mod azure;
mod gcp;
mod httpvalidation;
mod jwt;
mod mongodb;
mod postgres;
mod utils;
@ -58,35 +59,6 @@ pub fn init_validation_caches() {
IN_FLIGHT.set(DashMap::new()).ok();
}
// #[derive(Clone, FilterReflection, ParseFilter)]
// #[filter(
// name = "b64enc",
// description = "Encodes the input string using Base64 encoding",
// parsed(B64EncFilter)
// )]
// pub struct B64EncFilterParser;
// #[derive(Debug, Default, Clone)]
// pub struct B64EncFilter;
// impl std::fmt::Display for B64EncFilter {
// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// write!(f, "b64enc")
// }
// }
// impl Filter for B64EncFilter {
// fn evaluate(
// &self,
// input: &dyn ValueView,
// _runtime: &dyn Runtime,
// ) -> Result<Value, LiquidError> {
// let input_str = input.to_kstr().into_owned();
// let encoded = general_purpose::STANDARD.encode(input_str.as_bytes());
// Ok(Value::scalar(encoded))
// }
// }
#[derive(Clone)]
pub struct CachedResponse {
pub body: String,
@ -700,7 +672,36 @@ async fn timed_validate_single_match<'a>(
},
);
}
// ---------------------------------------------------- JWT validator
Some(Validation::JWT) => {
let token = captured_values
.iter()
.find(|(n, ..)| n == "TOKEN")
.map(|(_, v, ..)| v.clone())
.unwrap_or_default();
if token.is_empty() {
m.validation_success = false;
m.validation_response_body = "JWT token not found.".to_string();
m.validation_response_status = StatusCode::BAD_REQUEST;
commit_and_return(m);
return;
}
match jwt::validate_jwt(&token).await {
Ok((ok, msg)) => {
m.validation_success = ok;
m.validation_response_body = msg;
m.validation_response_status =
if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED };
}
Err(e) => {
m.validation_success = false;
m.validation_response_body = format!("JWT validation error: {}", e);
m.validation_response_status = StatusCode::BAD_REQUEST;
}
}
}
// ---------------------------------------------------- AWS validator
Some(Validation::AWS) => {
let secret = captured_values

212
src/validation/jwt.rs Normal file
View file

@ -0,0 +1,212 @@
use anyhow::{anyhow, Result};
use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _};
use chrono::Utc;
use ipnet::IpNet;
use jsonwebtoken::{decode, decode_header, jwk::JwkSet, DecodingKey, Validation as JwtValidation};
use once_cell::sync::Lazy;
use reqwest::{redirect::Policy, Client, Url};
use serde::Deserialize;
use tokio::net::lookup_host;
use super::utils::check_url_resolvable;
/// Shared HTTP client that never follows redirects.
///
/// It is built once on first use and reused for every JWT validation, so the
/// cost of constructing a `Client` is paid a single time. Initialisation
/// panics if the client cannot be built.
static NO_REDIRECT_CLIENT: Lazy<Client> = Lazy::new(|| {
    // Redirects are disabled so a response cannot steer us to another host.
    let builder = Client::builder().redirect(Policy::none());
    builder.build().expect("failed to build no-redirect Client")
});
/// Networks the JWT validator refuses to contact, as CIDR strings parsed by
/// `is_blocked_ip`.
///
/// Covers both address families: a hostname taken from an untrusted token can
/// resolve to an IPv6 address just as easily as an IPv4 one, so an IPv4-only
/// list would let loopback or internal IPv6 targets through.
const BLOCKED_NETS: &[&str] = &[
    // --- IPv4 ---
    "0.0.0.0/8",      // "this network" / unspecified
    "10.0.0.0/8",     // RFC 1918 private
    "172.16.0.0/12",  // RFC 1918 private
    "192.168.0.0/16", // RFC 1918 private
    "127.0.0.0/8",    // loopback
    "169.254.0.0/16", // link-local
    // --- IPv6 ---
    "::/128",    // unspecified
    "::1/128",   // loopback
    "fc00::/7",  // unique local addresses (RFC 4193)
    "fe80::/10", // link-local
];
// `aud` may be either a single string or an array of strings. The
// `#[serde(untagged)]` attribute lets Serde accept whichever shape is present.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum Aud {
// A single audience value.
Str(String),
// A list of audience values.
Arr(Vec<String>),
}
// The subset of JWT claims this validator reads. Every field is optional, so
// a payload missing any of them still deserializes.
#[derive(Debug, Deserialize)]
struct Claims {
// Expiry time; compared against `Utc::now().timestamp()` in `validate_jwt`.
exp: Option<i64>,
// Not-before time; compared against `Utc::now().timestamp()` in `validate_jwt`.
nbf: Option<i64>,
// Issuer; when present it is used as the base URL for OpenID Connect discovery.
iss: Option<String>,
// Audience, either a single string or a list of strings.
aud: Option<Aud>,
}
/// Validates a JWT's time window and, when it names an issuer, its signature.
///
/// Steps:
/// 1. Decode the payload *without* verification and check `nbf` / `exp`
///    against the current time.
/// 2. If the token has no `iss` claim, stop and report it as not expired.
/// 3. Otherwise vet the issuer URL (must be `https`, must not resolve to a
///    blocked network) **before** contacting it, fetch the OpenID Connect
///    discovery document, fetch the JWKS from the same host, and verify the
///    signature with the key selected by the header's `kid`.
///
/// # Returns
/// `Ok((valid, message))` when a verdict was reached; `message` explains it.
///
/// # Errors
/// Returns `Err` when the token or a remote document is malformed, a network
/// or DNS step fails, the key cannot be found, or signature verification fails.
pub async fn validate_jwt(token: &str) -> Result<(bool, String)> {
    // --- insecure payload decode -------------------------------------------------
    // The claims are needed to find the issuer, so they are read before the
    // signature can be checked. Nothing here is trusted yet.
    let claims: Claims = {
        let payload_b64 = token.split('.').nth(1).ok_or_else(|| anyhow!("invalid JWT format"))?;
        let payload_json = URL_SAFE_NO_PAD
            .decode(payload_b64)
            .map_err(|e| anyhow!("invalid base64 in payload: {e}"))?;
        serde_json::from_slice(&payload_json).map_err(|e| anyhow!("invalid JSON claims: {e}"))?
    };

    // temporal checks
    let now = Utc::now().timestamp();
    if let Some(nbf) = claims.nbf {
        if now < nbf {
            return Ok((false, format!("Token not valid before {nbf}")));
        }
    }
    if let Some(exp) = claims.exp {
        if now > exp {
            return Ok((false, format!("Token expired at {exp}")));
        }
    }

    let audiences = extract_aud_strings(&claims);

    // Without an issuer there is nothing to verify the signature against.
    let iss = match claims.iss.as_deref() {
        Some(iss) => iss,
        None => {
            return Ok((true, format!("JWT not expired (iss: , aud: {audiences:?})")));
        }
    };

    // parse header now (kid, alg)
    let header = decode_header(token).map_err(|e| anyhow!("decode header: {e}"))?;

    // ---------------------------------------------------------------------------
    // Vet the issuer BEFORE any request is sent to it. `iss` comes from an
    // unverified token, so fetching it blindly would let a crafted token make
    // us call internal or plaintext endpoints.
    let iss_url = Url::parse(iss).map_err(|e| anyhow!("invalid iss: {e}"))?;
    if iss_url.scheme() != "https" {
        return Ok((false, "issuer must use https".to_string()));
    }
    let iss_host = iss_url.host_str().unwrap_or_default().to_ascii_lowercase();
    if resolves_to_blocked_ip(&iss_host).await? {
        return Ok((false, "issuer resolves to private or link-local IP".to_string()));
    }

    // build discovery URL and fetch it (redirects disabled)
    let config_url = format!("{}/.well-known/openid-configuration", iss.trim_end_matches('/'));
    let cfg_resp = NO_REDIRECT_CLIENT
        .get(&config_url)
        .send()
        .await
        .map_err(|e| anyhow!("issuer discovery failed: {e}"))?;
    if !cfg_resp.status().is_success() {
        return Ok((false, format!("issuer discovery failed: {}", cfg_resp.status())));
    }
    let cfg_json: serde_json::Value =
        cfg_resp.json().await.map_err(|e| anyhow!("invalid discovery JSON: {e}"))?;

    // extract jwks_uri
    let jwks_uri = cfg_json
        .get("jwks_uri")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow!("jwks_uri missing"))?;

    // must be HTTPS
    let url = Url::parse(jwks_uri).map_err(|e| anyhow!("invalid jwks_uri: {e}"))?;
    if url.scheme() != "https" {
        return Ok((false, "jwks_uri must use https".to_string()));
    }

    // host must match issuer host — prevents the discovery document from
    // pointing us at some other host
    let jwks_host = url.host_str().unwrap_or_default().to_ascii_lowercase();
    if jwks_host != iss_host {
        return Ok((
            false,
            format!("jwks_uri host ({jwks_host}) must match issuer host ({iss_host})"),
        ));
    }

    // Re-check resolution right before the JWKS fetch.
    // NOTE(review): the HTTP client resolves the name again on its own, so a
    // DNS answer that changes between this check and the request is not
    // covered here.
    if resolves_to_blocked_ip(&jwks_host).await? {
        return Ok((false, "jwks_uri resolves to private or link-local IP".to_string()));
    }

    // reachability check (existing helper)
    check_url_resolvable(&url).await.map_err(|e| anyhow!("jwks uri unresolvable: {e}"))?;

    // fetch JWKS with redirect-free client
    let jwks_resp = NO_REDIRECT_CLIENT
        .get(url)
        .send()
        .await
        .map_err(|e| anyhow!("jwks fetch failed: {e}"))?;
    if !jwks_resp.status().is_success() {
        return Ok((false, format!("jwks fetch failed: {}", jwks_resp.status())));
    }
    let jwk_set: JwkSet = jwks_resp.json().await.map_err(|e| anyhow!("invalid jwks json: {e}"))?;

    // select key by kid
    let kid = header.kid.ok_or_else(|| anyhow!("no kid in header"))?;
    let jwk = jwk_set
        .keys
        .iter()
        .find(|k| k.common.key_id.as_deref() == Some(kid.as_str()))
        .ok_or_else(|| anyhow!("kid not found in jwks"))?;

    // verify signature
    let decoding_key = DecodingKey::from_jwk(jwk).map_err(|e| anyhow!("invalid jwk: {e}"))?;
    let mut validation = JwtValidation::new(header.alg);
    // The expected audience is the token's own `aud`, so this check cannot
    // reject on audience; only the signature is effectively verified here.
    validation.set_audience(&audiences);
    // exp / nbf were already checked above against the unverified claims.
    // NOTE(review): the library may still require an `exp` claim to be present
    // even with `validate_exp = false` — confirm how tokens without `exp`
    // should be treated.
    validation.validate_exp = false;
    validation.validate_nbf = false;

    decode::<Claims>(token, &decoding_key, &validation)
        .map_err(|e| anyhow!("signature verification failed: {e}"))?;

    Ok((true, format!("JWT valid (iss: {iss}, aud: {audiences:?})")))
}

/// Resolves `host` and reports whether any resolved address falls inside a
/// blocked network. The port is only there because the resolver API needs
/// one; it does not affect the address check.
async fn resolves_to_blocked_ip(host: &str) -> Result<bool> {
    let mut addrs = lookup_host((host, 443))
        .await
        .map_err(|e| anyhow!("DNS lookup for {host} failed: {e}"))?;
    Ok(addrs.any(|addr| is_blocked_ip(addr.ip())))
}
/// Flattens the optional `aud` claim into a list: one element for a single
/// string, the cloned list for an array, and an empty list when absent.
fn extract_aud_strings(claims: &Claims) -> Vec<String> {
    claims
        .aud
        .as_ref()
        .map(|aud| match aud {
            Aud::Str(single) => vec![single.clone()],
            Aud::Arr(many) => many.clone(),
        })
        .unwrap_or_default()
}
/// Reports whether `ip` lies inside any network listed in `BLOCKED_NETS`.
/// Entries that fail to parse as a CIDR are ignored.
fn is_blocked_ip(ip: std::net::IpAddr) -> bool {
    BLOCKED_NETS.iter().any(|cidr| match cidr.parse::<IpNet>() {
        Ok(net) => net.contains(&ip),
        Err(_) => false,
    })
}
#[cfg(test)]
mod tests {
    use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _};
    use chrono::{Duration as ChronoDuration, Utc};

    use super::validate_jwt;

    /// Unix timestamp `offset_secs` seconds from now (negative for the past).
    fn timestamp_in(offset_secs: i64) -> i64 {
        (Utc::now() + ChronoDuration::seconds(offset_secs)).timestamp()
    }

    /// Builds an unsigned token around `payload_json`. None of these tokens
    /// carry an `iss` claim, so validation never touches the network.
    fn build_token_from_payload(payload_json: &str) -> String {
        let header = URL_SAFE_NO_PAD.encode(r#"{"alg":"none"}"#);
        let payload = URL_SAFE_NO_PAD.encode(payload_json);
        format!("{header}.{payload}.")
    }

    /// Builds an unsigned token whose `exp` is `exp_offset` seconds from now.
    fn build_token(exp_offset: i64) -> String {
        let exp = timestamp_in(exp_offset);
        build_token_from_payload(&format!("{{\"exp\":{exp}}}"))
    }

    #[tokio::test]
    async fn valid_token() {
        let token = build_token(60);
        let res = validate_jwt(&token).await.unwrap();
        assert!(res.0);
    }

    #[tokio::test]
    async fn expired_token() {
        let token = build_token(-60);
        let res = validate_jwt(&token).await.unwrap();
        assert!(!res.0);
    }

    #[tokio::test]
    async fn not_yet_valid_token() {
        let nbf = timestamp_in(60);
        let token = build_token_from_payload(&format!("{{\"nbf\":{nbf}}}"));
        let res = validate_jwt(&token).await.unwrap();
        assert!(!res.0);
    }

    #[tokio::test]
    async fn malformed_token_is_error() {
        // No '.' separators, so there is no payload segment to decode.
        assert!(validate_jwt("not-a-jwt").await.is_err());
    }
}

View file

@ -90,7 +90,7 @@ rules:
extraction_depth: 1,
no_binary: true,
no_extract_archives: false,
ignore: Vec::new(),
exclude: Vec::new(), // Exclude patterns
},
confidence: ConfidenceLevel::Low,
no_validate: true,
@ -101,8 +101,9 @@ rules:
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup,
ignore_tests: false,
snippet_length: 64,
baseline_file: None,
manage_baseline: false,
};
let global_args = GlobalArgs {

View file

@ -77,7 +77,7 @@ fn test_github_remote_scan() -> Result<()> {
no_extract_archives: false,
extraction_depth: 2,
no_binary: true,
ignore: Vec::new(),
exclude: Vec::new(), // Exclude patterns
},
confidence: ConfidenceLevel::Medium,
no_validate: false,
@ -88,8 +88,9 @@ fn test_github_remote_scan() -> Result<()> {
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true,
ignore_tests: false,
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
};
// Create global arguments
let global_args = GlobalArgs {

View file

@ -74,7 +74,7 @@ fn test_gitlab_remote_scan() -> Result<()> {
no_extract_archives: false,
extraction_depth: 2,
no_binary: true,
ignore: Vec::new(),
exclude: Vec::new(), // Exclude patterns
},
confidence: ConfidenceLevel::Medium,
no_validate: false,
@ -85,8 +85,9 @@ fn test_gitlab_remote_scan() -> Result<()> {
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true,
ignore_tests: false,
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
};
let global_args = GlobalArgs {

View file

@ -133,7 +133,7 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
extraction_depth: 2,
no_binary: true,
no_extract_archives: false,
ignore: Vec::new(),
exclude: Vec::new(), // Exclude patterns
},
confidence: ConfidenceLevel::Low,
no_validate: false,
@ -144,8 +144,9 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true, // keep duplicates so the cache is stressed
ignore_tests: false,
snippet_length: 128,
baseline_file: None,
manage_baseline: false,
};
/* --------------------------------------------------------- *

View file

@ -76,7 +76,7 @@ impl TestContext {
extraction_depth: 2,
no_binary: true,
no_extract_archives: false,
ignore: Vec::new(),
exclude: Vec::new(), // Exclude patterns
},
confidence: ConfidenceLevel::Low,
no_validate: true,
@ -87,8 +87,9 @@ impl TestContext {
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true,
ignore_tests: false,
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
};
let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules)
@ -140,7 +141,7 @@ impl TestContext {
extraction_depth: 2,
no_binary: true,
no_extract_archives: false,
ignore: Vec::new(),
exclude: Vec::new(), // Exclude patterns
},
confidence: ConfidenceLevel::Low,
no_validate: true,
@ -151,8 +152,9 @@ impl TestContext {
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true,
ignore_tests: false,
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
};
let global_args = GlobalArgs {

56
tests/smoke_baseline.rs Normal file
View file

@ -0,0 +1,56 @@
use std::fs;
use assert_cmd::Command;
use predicates::prelude::*;
use tempfile::tempdir;
/// Token-shaped string (`ghp_…`) planted in the scanned file so the scan has
/// something to report.
const GH_PAT: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa";

/// Scans a temp directory twice: first with `--manage-baseline` to write a
/// baseline file, then again with only `--baseline-file`, asserting that the
/// planted token is printed by the first run and absent from the second.
#[test]
fn baseline_create_and_filter() -> anyhow::Result<()> {
    let workdir = tempdir()?;
    let leak_path = workdir.path().join("leak.txt");
    fs::write(&leak_path, format!("token = \"{}\"\n", GH_PAT))?;

    let baseline = workdir.path().join("baseline.yaml");
    let scan_target = workdir.path().to_str().unwrap();
    let baseline_arg = baseline.to_str().unwrap();

    // Arguments common to both invocations, kept in their original order.
    let leading_args =
        ["scan", scan_target, "--no-binary", "--confidence=low", "--no-validate", "--format", "json"];
    let trailing_args = ["--baseline-file", baseline_arg, "--no-update-check"];

    // First run: create the baseline; the token is still expected in stdout
    // and the process is expected to exit with code 200.
    let mut create_run = Command::cargo_bin("kingfisher")?;
    create_run.args(leading_args).arg("--manage-baseline").args(trailing_args);
    create_run.assert().code(200).stdout(predicate::str::contains(GH_PAT));

    assert!(baseline.exists(), "baseline file created");

    // Second run: same scan against the baseline; expect exit code 0 and no
    // mention of the token.
    let mut filtered_run = Command::cargo_bin("kingfisher")?;
    filtered_run.args(leading_args).args(trailing_args);
    filtered_run.assert().code(0).stdout(predicate::str::contains(GH_PAT).not());

    Ok(())
}

34
tests/smoke_exclude.rs Normal file
View file

@ -0,0 +1,34 @@
use std::fs;
use assert_cmd::Command;
use predicates::prelude::*;
use tempfile::tempdir;
/// Token-shaped string (`ghp_…`) written into every fixture file.
const SECRET: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa";

/// Writes the same token into `foo.py` and `bar.txt`, scans with
/// `--exclude=*.py`, and asserts that stdout mentions `bar.txt` but not
/// `foo.py`.
#[test]
fn exclude_pattern_hides_matches() -> anyhow::Result<()> {
    let workdir = tempdir()?;
    let py = workdir.path().join("foo.py");
    let txt = workdir.path().join("bar.txt");

    // Both files receive identical content; only the file names differ.
    for fixture in [&py, &txt] {
        fs::write(fixture, format!("token = \"{}\"\n", SECRET))?;
    }

    let reports_txt = predicate::str::contains("bar.txt");
    let omits_py = predicate::str::contains("foo.py").not();

    let mut scan = Command::cargo_bin("kingfisher")?;
    scan.args([
        "scan",
        workdir.path().to_str().unwrap(),
        "--confidence=low",
        "--no-binary",
        "--no-validate",
        "--format",
        "json",
        "--exclude=*.py",
        "--no-update-check",
    ]);
    scan.assert().code(200).stdout(reports_txt.and(omits_py));

    Ok(())
}

View file

@ -1,9 +1,4 @@
use std::fs::{self, File};
use flate2::{write::GzEncoder, Compression};
use kingfisher::{cli::global::GlobalArgs, update::check_for_update};
use tar::Builder;
use tempfile::tempdir;
use tokio;
use wiremock::{
matchers::{method, path},