From 0ab21ace9906ec4253c7b753ff6a4885cf4f61e0 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 12 Jul 2025 22:44:34 -0700 Subject: [PATCH 1/8] improved azure storage rule. Added rule to detect TravisCI encrypted values --- CHANGELOG.md | 3 +++ Cargo.toml | 2 +- data/rules/azurestorage.yml | 16 +++++++++++----- data/rules/travisci.yml | 21 +++++++++++++++++++++ src/reporter/pretty_format.rs | 4 +++- 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b14fae..ebfc95f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ All notable changes to this project will be documented in this file. +## [1.21.0] +- Improved Azure Storage rule +- Added rule to detect TravisCI encrypted values ## [1.20.0] - Removed confirmation prompt when user provides --self-update flag diff --git a/Cargo.toml b/Cargo.toml index 19c6f95..179f285 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.20.0" +version = "1.21.0" edition.workspace = true rust-version.workspace = true license.workspace = true diff --git a/data/rules/azurestorage.yml b/data/rules/azurestorage.yml index 8f2921f..01dad46 100644 --- a/data/rules/azurestorage.yml +++ b/data/rules/azurestorage.yml @@ -1,9 +1,12 @@ rules: - name: Azure Storage Account Name - id: kingfisher.azurestorage.name.1 + id: kingfisher.azurestorage.1 pattern: | (?xi) (?: + \b + azure + (?:.|[\n\r]){0,32}? (?i: (?:Account|Storage) (?:[._-]Account)? @@ -19,13 +22,16 @@ rules: visible: false confidence: medium examples: - - storage_name=mystorageaccount123 + - azure_storage_name=mystorageaccount123 - mystorageaccount.blob.core.windows.net - name: Azure Storage Account Key - id: kingfisher.azurestorage.key.1 + id: kingfisher.azurestorage.2 pattern: | (?xi) + \b + azure + (?:.|[\n\r]){0,32}? (?i:(?:Access|Account|Storage)[_.-]?Key) (?:.|[\n\r]){0,25}? 
( @@ -34,9 +40,9 @@ rules: min_entropy: 4.0 confidence: medium examples: - - AccountKey=Xy9aB8cD7eF6gH5iJ4kL3mN2oP1qR0sT9uV8wX7yZ6aB5cD4eF3gH2iJ1kL0mN9oP8qR7sT6uV5wX4yZ3aB2cD1eF0gH9iJ8kL7mN6oP5q==\ + - Azure AccountKey=Xy9aB8cD7eF6gH5iJ4kL3mN2oP1qR0sT9uV8wX7yZ6aB5cD4eF3gH2iJ1kL0mN9oP8qR7sT6uV5wX4yZ3aB2cD1eF0gH9iJ8kL7mN6oP5q==\ validation: type: AzureStorage depends_on_rule: - - rule_id: kingfisher.azurestorage.name.1 + - rule_id: kingfisher.azurestorage.1 variable: AZURENAME diff --git a/data/rules/travisci.yml b/data/rules/travisci.yml index 3461fbb..5a61c0a 100644 --- a/data/rules/travisci.yml +++ b/data/rules/travisci.yml @@ -31,3 +31,24 @@ rules: - report_response: true - type: StatusMatch status: [200] + - name: Travis CI Encrypted Variable + id: kingfisher.travisci.1 + pattern: | + (?xis) + \b + (?:language|env|deploy|script):[\r\n] + (?:.|[\n\r]){0,256}? + \b + ( + secure:\s*"?[A-Za-z0-9+/=\\]+"?\s* + ) + \b + min_entropy: 3.0 + confidence: medium + examples: + - | + env: + global: + # This sets FOO=super-secret, but the plaintext never appears here. 
+ - secure: "VJh0l9gOb+6AVNDk6cziZSs1AqVM8CqtZU6ot9ZQeJ+KfL1pxnGQ4qQF8Cz9\M1q85c3l1N1+qkQ0uV12QG6O6ylq6Qq1l3VjAJM3h2pY3jdmrA8kX2ZIxRjC/\8+Xj1wVtKQ0R+owM/6i5Y6cyx4hRb3VvSeYlC0lD1iTzQ2vgMyE=" + diff --git a/src/reporter/pretty_format.rs b/src/reporter/pretty_format.rs index 740f82a..0e78477 100644 --- a/src/reporter/pretty_format.rs +++ b/src/reporter/pretty_format.rs @@ -163,7 +163,9 @@ impl<'a> Display for PrettyFinding<'a> { writeln!(f, " |Fingerprint...: {}", rm.m.finding_fingerprint)?; writeln!(f, " |Confidence....: {}", rm.match_confidence.to_string())?; writeln!(f, " |Entropy.......: {:.2}", rm.m.calculated_entropy)?; - let validation_status = if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { + let validation_status = if rm.validation_response_status == StatusCode::CONTINUE.as_u16() + || rm.validation_response_status == StatusCode::PRECONDITION_REQUIRED.as_u16() + { "Not Attempted".to_string() } else if rm.validation_success { "Active Credential".to_string() From 3520c5fba53b0c404b3c262d04143812cb5ec2bb Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 14 Jul 2025 13:18:24 -0700 Subject: [PATCH 2/8] Added baseline feature with --baseline-file and --manage-baseline flags. 
Introduced --exclude option for skipping paths --- CHANGELOG.md | 4 ++ Cargo.toml | 1 + README.md | 44 +++++++++++++ docs/BASELINE.md | 55 ++++++++++++++++ src/baseline.rs | 115 ++++++++++++++++++++++++++++++++++ src/cli/commands/inputs.rs | 10 ++- src/cli/commands/scan.rs | 8 +++ src/lib.rs | 25 ++++++++ src/main.rs | 19 +++++- src/reporter/json_format.rs | 5 +- src/reporter/pretty_format.rs | 4 +- src/scanner/enumerate.rs | 8 +-- src/scanner/runner.rs | 10 +++ tests/int_dedup.rs | 4 +- tests/int_github.rs | 4 +- tests/int_gitlab.rs | 4 +- tests/int_validation_cache.rs | 4 +- tests/int_vulnerable_files.rs | 8 ++- tests/smoke_baseline.rs | 56 +++++++++++++++++ tests/smoke_exclude.rs | 34 ++++++++++ 20 files changed, 401 insertions(+), 21 deletions(-) create mode 100644 docs/BASELINE.md create mode 100644 src/baseline.rs create mode 100644 tests/smoke_baseline.rs create mode 100644 tests/smoke_exclude.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index ebfc95f..2e1179b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file. 
## [1.21.0] - Improved Azure Storage rule - Added rule to detect TravisCI encrypted values +- Added baseline feature with `--baseline-file` and `--manage-baseline` flags +- Introduced `--exclude` option for skipping paths +- Added tests covering baseline and exclude workflow + ## [1.20.0] - Removed confirmation prompt when user provides --self-update flag diff --git a/Cargo.toml b/Cargo.toml index 179f285..dc2db13 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -161,6 +161,7 @@ trust-dns-resolver = { version = "0.23.2", default-features = false, features atty = "0.2.14" self_update = { version = "0.42.0", default-features = false, features = ["rustls", "archive-tar", "archive-zip", "compression-flate2"] } semver = "1.0.26" +globset = "0.4.16" [dependencies.tikv-jemallocator] version = "0.6" diff --git a/README.md b/README.md index 40bbce6..d7ae8b6 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,24 @@ kingfisher scan /path/to/repo --rule-stats kingfisher scan ./my-project --ignore-tests ``` +### Exclude specific paths +```bash +# Skip all Python files and any directory named tests +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude tests +``` + +If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude): + +```bash +# Skip all Python files and any directory named tests, and report to stderr any skipped files +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude tests \ + -v +``` + --- ## Scanning GitHub @@ -287,6 +305,32 @@ kingfisher github repos list --organization my-org - `--extraction-depth `: Specifies how deep nested archives should be extracted and scanned (default: 2) - `--redact`: Replaces discovered secrets with a one-way hash for secure output - `--ignore-tests`: Skip files or directories whose path component contains _test_, _spec_, _fixture_, _example_, or _sample_ (case-insensitive) +- `--exclude `: Skip any 
file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax) +- `--baseline-file <FILE>`: Ignore matches listed in a baseline YAML file +- `--manage-baseline`: Create or update the baseline file with current findings + +## Build a Baseline / Detect New Secrets + +There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so that future scans report only findings that are not already in that list. + +The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches): + +```bash +kingfisher scan /path/to/code \ + --confidence low \ + --manage-baseline \ + --baseline-file ./baseline-file.yml +``` + +Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: + +```bash +kingfisher scan /path/to/code \ + --baseline-file ./baseline-file.yml +``` + +See [docs/BASELINE.md](docs/BASELINE.md) for full details. + ## Finding Fingerprint diff --git a/docs/BASELINE.md b/docs/BASELINE.md new file mode 100644 index 0000000..aef0274 --- /dev/null +++ b/docs/BASELINE.md @@ -0,0 +1,55 @@ + +# Build a Baseline / Detect Only New Secrets + +There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so that future scans report only findings that are not already in that list. + +The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches): + +```bash +kingfisher scan /path/to/code \ + --confidence low \ + --manage-baseline \ + --baseline-file ./baseline-file.yml +``` + +This generates a YAML file named `baseline-file.yml` in the current directory. 
The file tracks each finding under an `ExactFindings` section: + +```yaml +ExactFindings: + matches: + - filepath: ruby_vulnerable.rb/ + fingerprint: 056876f00ffd0622 + linenum: 52 + lastupdated: Mon, 14 Jul 2025 10:17:56 -0700 + - filepath: ruby_vulnerable.rb/ + fingerprint: ce41d19b83b2b1b0 + linenum: 53 + lastupdated: Mon, 14 Jul 2025 10:17:56 -0700 + - filepath: ruby_vulnerable.rb/ + fingerprint: e8644d91fa6654f5 + linenum: 40 + lastupdated: Mon, 14 Jul 2025 10:17:56 -0700 +``` + +`fingerprint` reuses Kingfisher's 64-bit *finding fingerprint* algorithm with offsets set to zero. It hashes the secret value together with the normalized filepath, so moving a secret to a different line within the same file does not create a new entry. + +Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: + +```bash +kingfisher scan /path/to/code \ + --baseline-file /path/to/baseline-file.yml +``` + +If you intentionally add a new secret that should be ignored later, rerun the scan with both `--manage-baseline` and `--baseline-file` to append new matches to the file without removing existing entries: + +```bash +kingfisher scan /path/to/code \ + --manage-baseline \ + --baseline-file /path/to/baseline-file.yml +``` + +If you want to know what is being skipped, enable verbose debugging (-v) when scanning, which will report any findings being skipped by the baseline file (or files skipped via `--exclude`): + +```bash +kingfisher scan /path/to/project -v +``` \ No newline at end of file diff --git a/src/baseline.rs b/src/baseline.rs new file mode 100644 index 0000000..c2588f2 --- /dev/null +++ b/src/baseline.rs @@ -0,0 +1,115 @@ +use std::{ + collections::HashSet, + fs, + path::{Path, PathBuf}, + sync::Arc, +}; + +use anyhow::{Context, Result}; +use chrono::Local; +use serde::{Deserialize, Serialize}; +use tracing::debug; + +use crate::{findings_store::FindingsStore, matcher::compute_finding_fingerprint}; + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct 
BaselineFile { + #[serde(rename = "ExactFindings", default)] + pub exact_findings: ExactFindings, +} + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct ExactFindings { + #[serde(default)] + pub matches: Vec<BaselineFinding>, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct BaselineFinding { + pub filepath: String, + pub fingerprint: String, + pub linenum: usize, + pub lastupdated: String, +} + +pub fn load_baseline(path: &Path) -> Result<BaselineFile> { + let data = fs::read_to_string(path).context("read baseline file")?; + serde_yaml::from_str(&data).context("parse baseline yaml") +} + +pub fn save_baseline(path: &Path, baseline: &BaselineFile) -> Result<()> { + let data = serde_yaml::to_string(baseline).context("serialize baseline")?; + fs::write(path, data).context("write baseline file") +} + +fn normalize_path(p: &Path, roots: &[PathBuf]) -> String { + for root in roots { + if let Ok(stripped) = p.strip_prefix(root) { + if let Some(name) = root.file_name() { + return PathBuf::from(name).join(stripped).to_string_lossy().into(); + } + } + } + p.to_string_lossy().into() +} + +fn compute_hash(secret: &str, path: &str) -> String { + let fp = compute_finding_fingerprint(secret, path, 0, 0); + format!("{:016x}", fp) +} + +fn extract_secret(m: &crate::matcher::Match) -> String { + m.groups + .captures + .get(1) + .or_else(|| m.groups.captures.get(0)) + .map(|c| c.value.to_string()) + .unwrap_or_default() +} + +pub fn apply_baseline( + store: &mut FindingsStore, + baseline_path: &Path, + manage: bool, + roots: &[PathBuf], +) -> Result<()> { + let mut baseline = if baseline_path.exists() { + load_baseline(baseline_path)? 
+ } else { + BaselineFile::default() + }; + + let mut known: HashSet = + baseline.exact_findings.matches.iter().map(|m| m.fingerprint.clone()).collect(); + + let mut new_entries = Vec::new(); + for arc_msg in store.get_matches_mut() { + let (origin, _blob, m) = Arc::make_mut(arc_msg); + let file_path = origin.iter().filter_map(|o| o.full_path()).next(); + if let Some(fp) = file_path { + let normalized = normalize_path(&fp, roots); + let secret = extract_secret(m); + let hash = compute_hash(&secret, &normalized); + if known.contains(&hash) { + debug!("Skipping {} due to baseline (hash {})", normalized, hash); + m.visible = false; + } else if manage { + known.insert(hash.clone()); + let entry = BaselineFinding { + filepath: normalized, + fingerprint: hash, + linenum: m.location.source_span.start.line, + lastupdated: Local::now().to_rfc2822(), + }; + new_entries.push(entry); + } + } + } + + if manage && !new_entries.is_empty() { + baseline.exact_findings.matches.extend(new_entries); + save_baseline(baseline_path, &baseline)?; + } + + Ok(()) +} diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs index acfc220..20d3dde 100644 --- a/src/cli/commands/inputs.rs +++ b/src/cli/commands/inputs.rs @@ -111,9 +111,13 @@ pub struct ContentFilteringArgs { #[arg(long("max-file-size"), default_value_t = 25.0)] pub max_file_size_mb: f64, - /// Use custom path-based ignore rules from the given file(s) - #[arg(long, short, value_hint = ValueHint::FilePath)] - pub ignore: Vec, + // /// Use custom path-based ignore rules from the given file(s) + // #[arg(long, short, value_hint = ValueHint::FilePath)] + // pub ignore: Vec, + /// Skip any file or directory whose path matches this glob pattern. Multiple + /// patterns may be provided by repeating the flag. 
+ #[arg(long, value_name = "PATTERN")] + pub exclude: Vec, /// If true, do NOT extract archive files #[arg(long("no-extract-archives"), default_value_t = false)] diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 36726e0..406c896 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -106,6 +106,14 @@ pub struct ScanArgs { /// Bytes of context before and after each match #[arg(long, default_value_t = 256, value_name = "BYTES")] pub snippet_length: usize, + + /// Baseline file to filter known secrets + #[arg(long, value_name = "FILE")] + pub baseline_file: Option, + + /// Create or update the baseline file with current findings + #[arg(long, default_value_t = false)] + pub manage_baseline: bool, } /// Confidence levels for findings diff --git a/src/lib.rs b/src/lib.rs index 4395b89..9f04d46 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +pub mod baseline; pub mod binary; pub mod blob; pub mod bstring_escape; @@ -43,6 +44,7 @@ use crossbeam_channel::Sender; pub use git_repo_enumerator::{GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator}; pub use gix::{self, Repository, ThreadSafeRepository}; use gix::{open::Options, open_opts}; +use globset::{Glob, GlobSet, GlobSetBuilder}; pub use ignore::gitignore::{Gitignore, GitignoreBuilder}; use ignore::{DirEntry, WalkBuilder, WalkState}; use tokio::time::Duration; @@ -204,6 +206,7 @@ pub struct FilesystemEnumerator { extract_archives: bool, extraction_depth: usize, no_dedup: bool, + exclude_globset: Option>, } impl FilesystemEnumerator { @@ -234,6 +237,7 @@ impl FilesystemEnumerator { extract_archives: !args.content_filtering_args.no_extract_archives, extraction_depth: args.content_filtering_args.extraction_depth as usize, no_dedup: args.no_dedup, + exclude_globset: None, }) } @@ -287,6 +291,27 @@ impl FilesystemEnumerator { self } + pub fn set_exclude_patterns(&mut self, patterns: &[String]) -> Result<&mut Self> { + if patterns.is_empty() { + return Ok(self); + } + 
let mut builder = GlobSetBuilder::new(); + for pat in patterns { + builder.add(Glob::new(pat)?); + } + let globset = std::sync::Arc::new(builder.build()?); + self.exclude_globset = Some(globset.clone()); + self.filter_entry(move |entry| { + let path = entry.path(); + let matches = globset.is_match(path); + if matches { + debug!("Skipping {} due to --exclude", path.display()); + } + !matches + }); + Ok(self) + } + pub fn gitignore(&self) -> Result { Ok(self.gitignore_builder.build()?) } diff --git a/src/main.rs b/src/main.rs index 280422c..2567187 100644 --- a/src/main.rs +++ b/src/main.rs @@ -132,7 +132,18 @@ pub fn determine_exit_code(datastore: &Arc> // exit with code 0 if there are NO findings discovered let ds = datastore.lock().unwrap(); // Get all matches - let all_matches = ds.get_matches(); + // let all_matches = ds.get_matches(); + + // Only consider visible matches when determining the exit code + let all_matches = ds + .get_matches() + .iter() + .filter(|msg| { + let (_, _, match_item) = &***msg; + match_item.visible + }) + .collect::>(); + if all_matches.is_empty() { // No findings discovered 0 @@ -141,7 +152,7 @@ pub fn determine_exit_code(datastore: &Arc> let validated_matches = all_matches .iter() .filter(|msg| { - let (_, _, match_item) = &***msg; + let (_, _, match_item) = &****msg; match_item.validation_success }) .count(); @@ -278,7 +289,7 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { max_file_size_mb: 25.0, no_extract_archives: true, extraction_depth: 2, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns no_binary: true, }, confidence: ConfidenceLevel::Medium, @@ -291,6 +302,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { no_dedup: false, ignore_tests: false, snippet_length: 256, + baseline_file: None, + manage_baseline: false, output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, } } diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 
ec45e83..c1185b2 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -383,6 +383,7 @@ mod tests { rules::rule::Confidence, util::intern, }; + fn create_default_args() -> cli::commands::scan::ScanArgs { use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope @@ -424,7 +425,7 @@ mod tests { max_file_size_mb: 25.0, no_extract_archives: false, extraction_depth: 2, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns no_binary: true, }, confidence: ConfidenceLevel::Medium, @@ -436,6 +437,8 @@ mod tests { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, snippet_length: 256, + baseline_file: None, + manage_baseline: false, } } diff --git a/src/reporter/pretty_format.rs b/src/reporter/pretty_format.rs index 0e78477..7a6212c 100644 --- a/src/reporter/pretty_format.rs +++ b/src/reporter/pretty_format.rs @@ -351,7 +351,7 @@ fn test_pretty_format_with_nan_entropy_panics() { max_file_size_mb: 25.0, no_extract_archives: false, extraction_depth: 2, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns no_binary: true, }, @@ -369,6 +369,8 @@ fn test_pretty_format_with_nan_entropy_panics() { // display snippet_length: 256, + baseline_file: None, + manage_baseline: false, }; // This will panic if the entropy isn't checked for NaN diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index d9ca0f8..62dbd56 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -246,13 +246,7 @@ fn make_fs_enumerator( // Pass no_dedup when enumerating git history ie.no_dedup(args.no_dedup); - // Load any specified ignore files - for ignore_path in args.content_filtering_args.ignore.iter() { - debug!("Using ignore rules from {}", ignore_path.display()); - ie.add_ignore(ignore_path).with_context(|| { - format!("Failed to load ignore rules from {}", ignore_path.display()) - })?; - } + 
ie.set_exclude_patterns(&args.content_filtering_args.exclude)?; // Determine whether to collect git metadata or not let collect_git_metadata = false; ie.collect_git_metadata(collect_git_metadata); diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs index 112a04e..7c2f964 100644 --- a/src/scanner/runner.rs +++ b/src/scanner/runner.rs @@ -100,6 +100,16 @@ pub async fn run_async_scan( ds.replace_matches(deduped_arcs); } + // If baseline management is enabled, apply the baseline + if args.baseline_file.is_some() || args.manage_baseline { + let path = args + .baseline_file + .clone() + .unwrap_or_else(|| std::path::PathBuf::from("baseline-file.yaml")); + let mut ds = datastore.lock().unwrap(); + crate::baseline::apply_baseline(&mut ds, &path, args.manage_baseline, &input_roots)?; + } + // If validation is enabled, run it as a second phase if !args.no_validate { info!("Starting secret validation phase..."); diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index 94206d0..3c41afd 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -90,7 +90,7 @@ rules: extraction_depth: 1, no_binary: true, no_extract_archives: false, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Low, no_validate: true, @@ -103,6 +103,8 @@ rules: no_dedup, ignore_tests: false, snippet_length: 64, + baseline_file: None, + manage_baseline: false, }; let global_args = GlobalArgs { diff --git a/tests/int_github.rs b/tests/int_github.rs index e541812..c66fdb8 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -77,7 +77,7 @@ fn test_github_remote_scan() -> Result<()> { no_extract_archives: false, extraction_depth: 2, no_binary: true, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Medium, no_validate: false, @@ -90,6 +90,8 @@ fn test_github_remote_scan() -> Result<()> { no_dedup: true, ignore_tests: false, snippet_length: 256, + baseline_file: None, + manage_baseline: false, }; // 
Create global arguments let global_args = GlobalArgs { diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index ea0acaf..308f6a0 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -74,7 +74,7 @@ fn test_gitlab_remote_scan() -> Result<()> { no_extract_archives: false, extraction_depth: 2, no_binary: true, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Medium, no_validate: false, @@ -87,6 +87,8 @@ fn test_gitlab_remote_scan() -> Result<()> { no_dedup: true, ignore_tests: false, snippet_length: 256, + baseline_file: None, + manage_baseline: false, }; let global_args = GlobalArgs { diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index 7f9ca80..06bc55a 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -133,7 +133,7 @@ async fn test_validation_cache_and_depvars() -> Result<()> { extraction_depth: 2, no_binary: true, no_extract_archives: false, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Low, no_validate: false, @@ -146,6 +146,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> { no_dedup: true, // keep duplicates so the cache is stressed ignore_tests: false, snippet_length: 128, + baseline_file: None, + manage_baseline: false, }; /* --------------------------------------------------------- * diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 20ad438..032a51b 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -76,7 +76,7 @@ impl TestContext { extraction_depth: 2, no_binary: true, no_extract_archives: false, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Low, no_validate: true, @@ -89,6 +89,8 @@ impl TestContext { no_dedup: true, ignore_tests: false, snippet_length: 256, + baseline_file: None, + manage_baseline: false, }; let loaded = 
RuleLoader::from_rule_specifiers(&scan_args.rules) @@ -140,7 +142,7 @@ impl TestContext { extraction_depth: 2, no_binary: true, no_extract_archives: false, - ignore: Vec::new(), + exclude: Vec::new(), // Exclude patterns }, confidence: ConfidenceLevel::Low, no_validate: true, @@ -153,6 +155,8 @@ impl TestContext { no_dedup: true, ignore_tests: false, snippet_length: 256, + baseline_file: None, + manage_baseline: false, }; let global_args = GlobalArgs { diff --git a/tests/smoke_baseline.rs b/tests/smoke_baseline.rs new file mode 100644 index 0000000..d5d1962 --- /dev/null +++ b/tests/smoke_baseline.rs @@ -0,0 +1,56 @@ +use std::fs; + +use assert_cmd::Command; +use predicates::prelude::*; +use tempfile::tempdir; + +const GH_PAT: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; + +#[test] +fn baseline_create_and_filter() -> anyhow::Result<()> { + let dir = tempdir()?; + let file = dir.path().join("leak.txt"); + fs::write(&file, format!("token = \"{}\"\n", GH_PAT))?; + let baseline = dir.path().join("baseline.yaml"); + + // Create baseline with manage flag + Command::cargo_bin("kingfisher")? + .args([ + "scan", + dir.path().to_str().unwrap(), + "--no-binary", + "--confidence=low", + "--no-validate", + "--format", + "json", + "--manage-baseline", + "--baseline-file", + baseline.to_str().unwrap(), + "--no-update-check", + ]) + .assert() + .code(200) + .stdout(predicate::str::contains(GH_PAT)); + + assert!(baseline.exists(), "baseline file created"); + + // Scan again using the baseline + Command::cargo_bin("kingfisher")? 
+ .args([ + "scan", + dir.path().to_str().unwrap(), + "--no-binary", + "--confidence=low", + "--no-validate", + "--format", + "json", + "--baseline-file", + baseline.to_str().unwrap(), + "--no-update-check", + ]) + .assert() + .code(0) + .stdout(predicate::str::contains(GH_PAT).not()); + + Ok(()) +} diff --git a/tests/smoke_exclude.rs b/tests/smoke_exclude.rs new file mode 100644 index 0000000..76e647d --- /dev/null +++ b/tests/smoke_exclude.rs @@ -0,0 +1,34 @@ +use std::fs; + +use assert_cmd::Command; +use predicates::prelude::*; +use tempfile::tempdir; + +const SECRET: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; + +#[test] +fn exclude_pattern_hides_matches() -> anyhow::Result<()> { + let dir = tempdir()?; + let py = dir.path().join("foo.py"); + let txt = dir.path().join("bar.txt"); + fs::write(&py, format!("token = \"{}\"\n", SECRET))?; + fs::write(&txt, format!("token = \"{}\"\n", SECRET))?; + + Command::cargo_bin("kingfisher")? + .args([ + "scan", + dir.path().to_str().unwrap(), + "--confidence=low", + "--no-binary", + "--no-validate", + "--format", + "json", + "--exclude=*.py", + "--no-update-check", + ]) + .assert() + .code(200) + .stdout(predicate::str::contains("bar.txt").and(predicate::str::contains("foo.py").not())); + + Ok(()) +} From 93f1e3b1da14ee91680f3e82de31450d9d5ce94e Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 14 Jul 2025 15:31:44 -0700 Subject: [PATCH 3/8] JWT validation performs OpenID Connect discovery using the iss claim and verifies signatures via JWKS --- CHANGELOG.md | 2 + Cargo.toml | 2 + data/rules/jwt.yml | 4 +- src/rules/rule.rs | 1 + src/validation.rs | 59 ++++++------ src/validation/jwt.rs | 207 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 245 insertions(+), 30 deletions(-) create mode 100644 src/validation/jwt.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e1179b..032379c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ All notable changes to this project will be documented in this 
file. - Added baseline feature with `--baseline-file` and `--manage-baseline` flags - Introduced `--exclude` option for skipping paths - Added tests covering baseline and exclude workflow +- Added validation for JWT tokens that checks `exp` and `nbf` claims +- JWT validation performs OpenID Connect discovery using the `iss` claim and verifies signatures via JWKS ## [1.20.0] diff --git a/Cargo.toml b/Cargo.toml index dc2db13..e220d39 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -162,6 +162,8 @@ atty = "0.2.14" self_update = { version = "0.42.0", default-features = false, features = ["rustls", "archive-tar", "archive-zip", "compression-flate2"] } semver = "1.0.26" globset = "0.4.16" +jsonwebtoken = "9.3.1" +ipnet = "2.11.0" [dependencies.tikv-jemallocator] version = "0.6" diff --git a/data/rules/jwt.yml b/data/rules/jwt.yml index cd3f78d..e596027 100644 --- a/data/rules/jwt.yml +++ b/data/rules/jwt.yml @@ -22,4 +22,6 @@ rules: - https://datatracker.ietf.org/doc/html/rfc7519 - https://en.wikipedia.org/wiki/Base64#URL_applications - https://datatracker.ietf.org/doc/html/rfc4648 - - https://developer.okta.com/blog/2018/06/20/what-happens-if-your-jwt-is-stolen \ No newline at end of file + - https://developer.okta.com/blog/2018/06/20/what-happens-if-your-jwt-is-stolen + validation: + type: JWT \ No newline at end of file diff --git a/src/rules/rule.rs b/src/rules/rule.rs index bf923f3..a301a09 100644 --- a/src/rules/rule.rs +++ b/src/rules/rule.rs @@ -38,6 +38,7 @@ pub enum Validation { GCP, MongoDB, Postgres, + JWT, Raw(String), Http(HttpValidation), } diff --git a/src/validation.rs b/src/validation.rs index 44a7a6f..172888d 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -27,6 +27,7 @@ mod aws; mod azure; mod gcp; mod httpvalidation; +mod jwt; mod mongodb; mod postgres; mod utils; @@ -58,35 +59,6 @@ pub fn init_validation_caches() { IN_FLIGHT.set(DashMap::new()).ok(); } -// #[derive(Clone, FilterReflection, ParseFilter)] -// #[filter( -// name = "b64enc", -// 
description = "Encodes the input string using Base64 encoding", -// parsed(B64EncFilter) -// )] -// pub struct B64EncFilterParser; - -// #[derive(Debug, Default, Clone)] -// pub struct B64EncFilter; - -// impl std::fmt::Display for B64EncFilter { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// write!(f, "b64enc") -// } -// } - -// impl Filter for B64EncFilter { -// fn evaluate( -// &self, -// input: &dyn ValueView, -// _runtime: &dyn Runtime, -// ) -> Result { -// let input_str = input.to_kstr().into_owned(); -// let encoded = general_purpose::STANDARD.encode(input_str.as_bytes()); -// Ok(Value::scalar(encoded)) -// } -// } - #[derive(Clone)] pub struct CachedResponse { pub body: String, @@ -700,7 +672,36 @@ async fn timed_validate_single_match<'a>( }, ); } + // ---------------------------------------------------- JWT validator + Some(Validation::JWT) => { + let token = captured_values + .iter() + .find(|(n, ..)| n == "TOKEN") + .map(|(_, v, ..)| v.clone()) + .unwrap_or_default(); + if token.is_empty() { + m.validation_success = false; + m.validation_response_body = "JWT token not found.".to_string(); + m.validation_response_status = StatusCode::BAD_REQUEST; + commit_and_return(m); + return; + } + + match jwt::validate_jwt(&token, client).await { + Ok((ok, msg)) => { + m.validation_success = ok; + m.validation_response_body = msg; + m.validation_response_status = + if ok { StatusCode::OK } else { StatusCode::UNAUTHORIZED }; + } + Err(e) => { + m.validation_success = false; + m.validation_response_body = format!("JWT validation error: {}", e); + m.validation_response_status = StatusCode::BAD_REQUEST; + } + } + } // ---------------------------------------------------- AWS validator Some(Validation::AWS) => { let secret = captured_values diff --git a/src/validation/jwt.rs b/src/validation/jwt.rs new file mode 100644 index 0000000..ed2bd3c --- /dev/null +++ b/src/validation/jwt.rs @@ -0,0 +1,207 @@ +use anyhow::{anyhow, Result}; +use 
base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _}; +use chrono::Utc; +use ipnet::IpNet; +use jsonwebtoken::{decode, decode_header, jwk::JwkSet, DecodingKey, Validation as JwtValidation}; +use reqwest::{redirect::Policy, Client, Url}; +use serde::Deserialize; +use tokio::net::lookup_host; + +use super::utils::check_url_resolvable; + +/// RFC 1918 + loopback + link-local nets we refuse to contact +const BLOCKED_NETS: &[&str] = &[ + "10.0.0.0/8", + "172.16.0.0/12", + "192.168.0.0/16", // private + "127.0.0.0/8", + "169.254.0.0/16", // loopback / link-local +]; + +// aud is allowed to be either a string or an array, so let Serde flatten it. +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum Aud { + Str(String), + Arr(Vec), +} + +#[derive(Debug, Deserialize)] +struct Claims { + exp: Option, + nbf: Option, + iss: Option, + aud: Option, +} + +pub async fn validate_jwt(token: &str, client: &Client) -> Result<(bool, String)> { + // --- insecure payload decode ------------------------------------------------- + let claims: Claims = { + let payload_b64 = token.split('.').nth(1).ok_or_else(|| anyhow!("invalid JWT format"))?; + let payload_json = URL_SAFE_NO_PAD + .decode(payload_b64) + .map_err(|e| anyhow!("invalid base64 in payload: {e}"))?; + serde_json::from_slice(&payload_json).map_err(|e| anyhow!("invalid JSON claims: {e}"))? 
+ }; + + // temporal checks + let now = Utc::now().timestamp(); + if let Some(nbf) = claims.nbf { + if now < nbf { + return Ok((false, format!("Token not valid before {nbf}"))); + } + } + if let Some(exp) = claims.exp { + if now > exp { + return Ok((false, format!("Token expired at {exp}"))); + } + } + + // --------------------------------------------------------------------------- + let issuer = claims.iss.clone().unwrap_or_default(); + + if let Some(iss) = claims.iss.clone() { + // parse header now (kid, alg) + let header = decode_header(token).map_err(|e| anyhow!("decode header: {e}"))?; + + // build discovery URL and fetch it (redirects disabled) + let config_url = format!("{}/.well-known/openid-configuration", iss.trim_end_matches('/')); + let no_redirect_client = Client::builder() + .redirect(Policy::none()) + .build() + .map_err(|e| anyhow!("client build: {e}"))?; + + let cfg_resp = no_redirect_client + .get(&config_url) + .send() + .await + .map_err(|e| anyhow!("issuer discovery failed: {e}"))?; + + if !cfg_resp.status().is_success() { + return Ok((false, format!("issuer discovery failed: {}", cfg_resp.status()))); + } + + let cfg_json: serde_json::Value = + cfg_resp.json().await.map_err(|e| anyhow!("invalid discovery JSON: {e}"))?; + + // extract jwks_uri + let jwks_uri = cfg_json + .get("jwks_uri") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("jwks_uri missing"))?; + + // must be HTTPS + let url = Url::parse(jwks_uri).map_err(|e| anyhow!("invalid jwks_uri: {e}"))?; + if url.scheme() != "https" { + return Ok((false, "jwks_uri must use https".to_string())); + } + + // host must match issuer host  —  prevents open redirects / SSRF-on-other-host + let iss_host = Url::parse(&iss) + .map_err(|e| anyhow!("invalid iss: {e}"))? 
+ .host_str() + .unwrap_or_default() + .to_ascii_lowercase(); + let jwks_host = url.host_str().unwrap_or_default().to_ascii_lowercase(); + if jwks_host != iss_host { + return Ok(( + false, + format!("jwks_uri host ({jwks_host}) must match issuer host ({iss_host})"), + )); + } + + // ----------------------------------------------------------------------- + // DNS resolution + private-range block + for addr in lookup_host((jwks_host.as_str(), 443)).await? { + if is_blocked_ip(addr.ip()) { + return Ok((false, "jwks_uri resolves to private or link-local IP".to_string())); + } + } + + // reachability check (existing helper) + check_url_resolvable(&url).await.map_err(|e| anyhow!("jwks uri unresolvable: {e}"))?; + + // fetch JWKS with redirect-free client + let jwks_resp = no_redirect_client + .get(url) + .send() + .await + .map_err(|e| anyhow!("jwks fetch failed: {e}"))?; + if !jwks_resp.status().is_success() { + return Ok((false, format!("jwks fetch failed: {}", jwks_resp.status()))); + } + + let jwk_set: JwkSet = + jwks_resp.json().await.map_err(|e| anyhow!("invalid jwks json: {e}"))?; + + // select key by kid + let kid = header.kid.ok_or_else(|| anyhow!("no kid in header"))?; + let jwk = jwk_set + .keys + .iter() + .find(|k| k.common.key_id.as_deref() == Some(&kid)) + .ok_or_else(|| anyhow!("kid not found in jwks"))?; + + // verify signature + let decoding_key = DecodingKey::from_jwk(jwk).map_err(|e| anyhow!("invalid jwk: {e}"))?; + let mut validation = JwtValidation::new(header.alg); + validation.set_audience(&extract_aud_strings(&claims)); + validation.validate_exp = false; + validation.validate_nbf = false; + + decode::(token, &decoding_key, &validation) + .map_err(|e| anyhow!("signature verification failed: {e}"))?; + + return Ok(( + true, + format!("JWT valid (iss: {issuer}, aud: {:?})", extract_aud_strings(&claims)), + )); + } + + Ok((true, format!("JWT not expired (iss: {issuer}, aud: {:?})", extract_aud_strings(&claims)))) +} + +/// Helper: normalize aud into 
a flat Vec +fn extract_aud_strings(claims: &Claims) -> Vec { + match &claims.aud { + Some(Aud::Str(s)) => vec![s.clone()], + Some(Aud::Arr(v)) => v.clone(), + None => vec![], + } +} +/// returns true if IP is in a blocked network +fn is_blocked_ip(ip: std::net::IpAddr) -> bool { + BLOCKED_NETS.iter().filter_map(|cidr| cidr.parse::().ok()).any(|net| net.contains(&ip)) +} + +#[cfg(test)] +mod tests { + use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _}; + use chrono::{Duration as ChronoDuration, Utc}; + use reqwest::Client; + + use super::validate_jwt; + + fn build_token(exp_offset: i64) -> String { + let header = URL_SAFE_NO_PAD.encode(r#"{"alg":"none"}"#); + let exp = (Utc::now() + ChronoDuration::seconds(exp_offset)).timestamp(); + let payload = URL_SAFE_NO_PAD.encode(format!("{{\"exp\":{exp}}}")); + format!("{header}.{payload}.") + } + + #[tokio::test] + async fn valid_token() { + let token = build_token(60); + let client = Client::new(); + let res = validate_jwt(&token, &client).await.unwrap(); + assert!(res.0); + } + + #[tokio::test] + async fn expired_token() { + let token = build_token(-60); + let client = Client::new(); + let res = validate_jwt(&token, &client).await.unwrap(); + assert!(!res.0); + } +} From 45e735ec8974ab0a3a4a74c123f362f44d0e9ad2 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 14 Jul 2025 16:55:19 -0700 Subject: [PATCH 4/8] Removed --ignore-tests argument, because the --exclude flag provides more granular functionality --- CHANGELOG.md | 1 + README.md | 8 ++++++-- src/cli/commands/scan.rs | 4 ---- src/git_repo_enumerator.rs | 26 ++++++++++++++++++------- src/lib.rs | 6 +++++- src/main.rs | 1 - src/reporter/pretty_format.rs | 1 - src/scanner/enumerate.rs | 36 ++++++++++++++++------------------- tests/int_dedup.rs | 1 - tests/int_github.rs | 1 - tests/int_gitlab.rs | 1 - tests/int_validation_cache.rs | 1 - tests/int_vulnerable_files.rs | 2 -- 13 files changed, 47 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 032379c..fb1ede4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file. - Added tests covering baseline and exclude workflow - Added validation for JWT tokens that checks `exp` and `nbf` claims - JWT validation performs OpenID Connect discovery using the `iss` claim and verifies signatures via JWKS +- Removed `--ignore-tests` argument, because the `--exclude` flag provides more granular functionality ## [1.20.0] diff --git a/README.md b/README.md index d7ae8b6..0797344 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,12 @@ kingfisher scan /path/to/repo --rule-stats ```bash # Scan source but skip likely unit / integration tests -kingfisher scan ./my-project --ignore-tests +kingfisher scan ./my-project \ + --exclude='test' \ + --exclude='spec' \ + --exclude='fixture' \ + --exclude='example' \ + --exclude='sample' ``` ### Exclude specific paths @@ -304,7 +309,6 @@ kingfisher github repos list --organization my-org - `--no-extract-archives`: Do not scan inside archives - `--extraction-depth `: Specifies how deep nested archives should be extracted and scanned (default: 2) - `--redact`: Replaces discovered secrets with a one-way hash for secure output -- `--ignore-tests`: Skip files or directories whose path component contains _test_, _spec_, _fixture_, _example_, or _sample_ (case-insensitive) - `--exclude `: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax) - `--baseline-file `: Ignore matches listed in a baseline YAML file - `--manage-baseline`: Create or update the baseline file with current findings diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 406c896..fe4d5d6 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -88,10 +88,6 @@ pub struct ScanArgs { #[arg(long, default_value_t = false)] pub no_dedup: bool, - /// Ignore matches that appear to come from test files - 
#[arg(long, default_value_t = false)] - pub ignore_tests: bool, - /// Redact findings values using a secure hash #[arg(long, short = 'r', default_value_t = false)] pub redact: bool, diff --git a/src/git_repo_enumerator.rs b/src/git_repo_enumerator.rs index 77e506e..9b3151e 100644 --- a/src/git_repo_enumerator.rs +++ b/src/git_repo_enumerator.rs @@ -73,11 +73,16 @@ pub struct GitBlobMetadata { pub struct GitRepoWithMetadataEnumerator<'a> { path: &'a Path, repo: Repository, + exclude_globset: Option>, } impl<'a> GitRepoWithMetadataEnumerator<'a> { - pub fn new(path: &'a Path, repo: Repository) -> Self { - Self { path, repo } + pub fn new( + path: &'a Path, + repo: Repository, + exclude_globset: Option>, + ) -> Self { + Self { path, repo, exclude_globset } } pub fn run(self) -> Result { @@ -173,12 +178,19 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { } let filtered = appearances .into_iter() - .filter(|entry| { - // Apply your actual ignore-logic here: - match entry.path.to_path() { - Ok(_path) => true, - Err(_) => true, + .filter(|entry| match entry.path.to_path() { + Ok(p) => { + if let Some(gs) = &self.exclude_globset { + let m = gs.is_match(p); + if m { + debug!("Skipping {} due to --exclude", p.display()); + } + !m + } else { + true + } } + Err(_) => true, }) .collect::>(); if filtered.is_empty() { diff --git a/src/lib.rs b/src/lib.rs index 9f04d46..26703fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -82,7 +82,7 @@ struct EnumeratorConfig { enumerate_git_history: bool, collect_git_metadata: bool, repo_scan_timeout: Duration, - // gitignore: Gitignore, + exclude_globset: Option>, } pub enum FoundInput { @@ -312,6 +312,10 @@ impl FilesystemEnumerator { Ok(self) } + pub fn exclude_globset(&self) -> Option> { + self.exclude_globset.clone() + } + pub fn gitignore(&self) -> Result { Ok(self.gitignore_builder.build()?) 
} diff --git a/src/main.rs b/src/main.rs index 2567187..44a3ec3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -300,7 +300,6 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { redact: false, git_repo_timeout: 1800, no_dedup: false, - ignore_tests: false, snippet_length: 256, baseline_file: None, manage_baseline: false, diff --git a/src/reporter/pretty_format.rs b/src/reporter/pretty_format.rs index 7a6212c..b9c868c 100644 --- a/src/reporter/pretty_format.rs +++ b/src/reporter/pretty_format.rs @@ -314,7 +314,6 @@ fn test_pretty_format_with_nan_entropy_panics() { // core execution / performance num_jobs: 1, no_dedup: false, - ignore_tests: false, // rule selection rules: RuleSpecifierArgs { diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index 62dbd56..8cfa9ff 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -82,9 +82,10 @@ pub fn enumerate_filesystem_inputs( }() .context("Failed to initialize filesystem enumerator")?; - let (enum_thread, input_recv) = { + let (enum_thread, input_recv, exclude_globset) = { let fs_enumerator = make_fs_enumerator(args, input_roots.into()) .context("Failed to initialize filesystem enumerator")?; + let exclude_globset = fs_enumerator.as_ref().and_then(|ie| ie.exclude_globset()); let channel_size = std::cmp::max(args.num_jobs * 128, 1024); let (input_send, input_recv) = crossbeam_channel::bounded(channel_size); @@ -97,7 +98,7 @@ pub fn enumerate_filesystem_inputs( Ok(()) }) .context("Failed to enumerate filesystem inputs")?; - (input_enumerator_thread, input_recv) + (input_enumerator_thread, input_recv, exclude_globset) }; let enum_cfg = EnumeratorConfig { @@ -107,6 +108,7 @@ pub fn enumerate_filesystem_inputs( }, collect_git_metadata: args.input_specifier_args.commit_metadata, repo_scan_timeout, + exclude_globset, }; let (send_ds, recv_ds) = create_datastore_channel(args.num_jobs); let datastore_writer_thread = @@ -189,23 +191,11 @@ pub fn enumerate_filesystem_inputs( 
Ok(Some((origin_set, blob_metadata, vec_of_matches))) => { for (_, single_match) in vec_of_matches { // Send each match - let is_test = if args.ignore_tests { - origin_set - .iter() - .filter_map(|o| o.full_path()) - .any(|p| is_test_like_path(&p)) - } else { - false - }; - - if !is_test { - // Send each match - send_ds.send(( - Arc::new(origin_set.clone()), - Arc::new(blob_metadata.clone()), - single_match, - ))?; - } + send_ds.send(( + Arc::new(origin_set.clone()), + Arc::new(blob_metadata.clone()), + single_match, + ))?; } } Err(e) => { @@ -604,9 +594,15 @@ impl<'cfg> ParallelBlobIterator for (&'cfg EnumeratorConfig, FoundInput) { // Spawn an enumerator thread so we can time-out cleanly let path_clone = path.to_path_buf(); let (tx, rx) = std::sync::mpsc::channel(); + let exclude_globset = cfg.exclude_globset.clone(); let handle = std::thread::spawn(move || { let res = if collect_git_metadata { - GitRepoWithMetadataEnumerator::new(&path_clone, repository).run() + GitRepoWithMetadataEnumerator::new( + &path_clone, + repository, + exclude_globset.clone(), + ) + .run() } else { GitRepoEnumerator::new(&path_clone, repository).run() }; diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index 3c41afd..64c2c92 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -101,7 +101,6 @@ rules: git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup, - ignore_tests: false, snippet_length: 64, baseline_file: None, manage_baseline: false, diff --git a/tests/int_github.rs b/tests/int_github.rs index c66fdb8..330299b 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -88,7 +88,6 @@ fn test_github_remote_scan() -> Result<()> { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup: true, - ignore_tests: false, snippet_length: 256, baseline_file: None, manage_baseline: false, diff --git a/tests/int_gitlab.rs 
b/tests/int_gitlab.rs index 308f6a0..66a7f37 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -85,7 +85,6 @@ fn test_gitlab_remote_scan() -> Result<()> { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup: true, - ignore_tests: false, snippet_length: 256, baseline_file: None, manage_baseline: false, diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index 06bc55a..7e422e1 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -144,7 +144,6 @@ async fn test_validation_cache_and_depvars() -> Result<()> { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup: true, // keep duplicates so the cache is stressed - ignore_tests: false, snippet_length: 128, baseline_file: None, manage_baseline: false, diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 032a51b..bb1d064 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -87,7 +87,6 @@ impl TestContext { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup: true, - ignore_tests: false, snippet_length: 256, baseline_file: None, manage_baseline: false, @@ -153,7 +152,6 @@ impl TestContext { git_repo_timeout: 1800, // 30 minutes output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_dedup: true, - ignore_tests: false, snippet_length: 256, baseline_file: None, manage_baseline: false, From 77c8940ba2d87b6bbdd6ca4f876b2e297235af09 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 14 Jul 2025 17:09:04 -0700 Subject: [PATCH 5/8] Removed --ignore-tests argument, because the --exclude flag provides more granular functionality --- CHANGELOG.md | 3 ++- README.md | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
fb1ede4..d9bce79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,8 @@ All notable changes to this project will be documented in this file. - Added validation for JWT tokens that checks `exp` and `nbf` claims - JWT validation performs OpenID Connect discovery using the `iss` claim and verifies signatures via JWKS - Removed `--ignore-tests` argument, because the `--exclude` flag provides more granular functionality - +- DigitalOcean rule update +- Adafruit rule update ## [1.20.0] - Removed confirmation prompt when user provides --self-update flag diff --git a/README.md b/README.md index 0797344..f94fc36 100644 --- a/README.md +++ b/README.md @@ -140,12 +140,14 @@ kingfisher scan /path/to/repo --rule-stats ### Scan while ignoring likely test files +`--exclude` skips any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) + ```bash # Scan source but skip likely unit / integration tests kingfisher scan ./my-project \ - --exclude='test' \ + --exclude='[Tt]est' \ --exclude='spec' \ - --exclude='fixture' \ + --exclude='[Ff]ixture' \ --exclude='example' \ --exclude='sample' ``` @@ -155,7 +157,7 @@ kingfisher scan ./my-project \ # Skip all Python files and any directory named tests kingfisher scan ./my-project \ --exclude '*.py' \ - --exclude tests + --exclude '[Tt]ests' ``` If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude): @@ -309,7 +311,7 @@ kingfisher github repos list --organization my-org - `--no-extract-archives`: Do not scan inside archives - `--extraction-depth `: Specifies how deep nested archives should be extracted and scanned (default: 2) - `--redact`: Replaces discovered secrets with a one-way hash for secure output -- `--exclude `: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax) +- `--exclude `: Skip any 
file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) - `--baseline-file `: Ignore matches listed in a baseline YAML file - `--manage-baseline`: Create or update the baseline file with current findings From d72452159bb30e631010ee7d17229fb88204e6dd Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 14 Jul 2025 17:10:53 -0700 Subject: [PATCH 6/8] Removed --ignore-tests argument, because the --exclude flag provides more granular functionality --- src/git_metadata_graph.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/git_metadata_graph.rs b/src/git_metadata_graph.rs index 90c28fd..9144068 100644 --- a/src/git_metadata_graph.rs +++ b/src/git_metadata_graph.rs @@ -147,9 +147,9 @@ impl RepositoryIndex { .with_ordering(Ordering::PackLexicographicalThenLooseLexicographical) { let oid = unwrap_ok_or_continue!(oid, |e| debug!("Failed to read object id: {e}")); - if count % 100000 == 0 { - debug!("Indexed {} objects in RepositoryIndex::new", count); - } + // if count % 100000 == 0 { + // debug!("Indexed {} objects in RepositoryIndex::new", count); + // } let hdr = unwrap_ok_or_continue!(odb.header(oid), |e| { debug!("Failed to read object header for {oid}: {e}") }); From ee6332a78d9ec7070bfd594708efbdc038811ef4 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 14 Jul 2025 17:22:37 -0700 Subject: [PATCH 7/8] change that hoists the redirect-free reqwest::Client into a single, lazily-initialized static so every call to validate_jwt re-uses the same handle (and therefore the same connection-pool, DNS cache, TLS session cache, etc) --- src/reporter/json_format.rs | 1 - src/scanner/enumerate.rs | 1 - src/validation/jwt.rs | 21 ++++++++++++++------- tests/smoke_update.rs | 5 ----- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index c1185b2..a4e8730 100644 --- a/src/reporter/json_format.rs +++ 
b/src/reporter/json_format.rs @@ -390,7 +390,6 @@ mod tests { cli::commands::scan::ScanArgs { num_jobs: 1, no_dedup: false, - ignore_tests: false, rules: RuleSpecifierArgs { rules_path: Vec::new(), rule: vec!["all".into()], diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index 8cfa9ff..0e0c795 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -37,7 +37,6 @@ use crate::{ util::is_compressed_file, }, scanner_pool::ScannerPool, - util::is_test_like_path, EnumeratorConfig, EnumeratorFileResult, FileResult, FilesystemEnumerator, FoundInput, GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator, PathBuf, }; diff --git a/src/validation/jwt.rs b/src/validation/jwt.rs index ed2bd3c..ca0ac43 100644 --- a/src/validation/jwt.rs +++ b/src/validation/jwt.rs @@ -3,12 +3,24 @@ use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _}; use chrono::Utc; use ipnet::IpNet; use jsonwebtoken::{decode, decode_header, jwk::JwkSet, DecodingKey, Validation as JwtValidation}; +use once_cell::sync::Lazy; use reqwest::{redirect::Policy, Client, Url}; use serde::Deserialize; use tokio::net::lookup_host; use super::utils::check_url_resolvable; +/// One global, redirect-free client. Building a `Client` is comparatively +/// expensive; re-using it lets reqwest share its internal connection pool +/// and TLS sessions across JWT validations. `Lazy` ensures thread-safe, +/// one-time initialisation. 
+static NO_REDIRECT_CLIENT: Lazy = Lazy::new(|| { + Client::builder() + .redirect(Policy::none()) // disable all redirects + .build() + .expect("failed to build no-redirect Client") +}); + /// RFC 1918 + loopback + link-local nets we refuse to contact const BLOCKED_NETS: &[&str] = &[ "10.0.0.0/8", @@ -66,12 +78,7 @@ pub async fn validate_jwt(token: &str, client: &Client) -> Result<(bool, String) // build discovery URL and fetch it (redirects disabled) let config_url = format!("{}/.well-known/openid-configuration", iss.trim_end_matches('/')); - let no_redirect_client = Client::builder() - .redirect(Policy::none()) - .build() - .map_err(|e| anyhow!("client build: {e}"))?; - - let cfg_resp = no_redirect_client + let cfg_resp = NO_REDIRECT_CLIENT .get(&config_url) .send() .await @@ -122,7 +129,7 @@ pub async fn validate_jwt(token: &str, client: &Client) -> Result<(bool, String) check_url_resolvable(&url).await.map_err(|e| anyhow!("jwks uri unresolvable: {e}"))?; // fetch JWKS with redirect-free client - let jwks_resp = no_redirect_client + let jwks_resp = NO_REDIRECT_CLIENT .get(url) .send() .await diff --git a/tests/smoke_update.rs b/tests/smoke_update.rs index 45573fa..8224d42 100644 --- a/tests/smoke_update.rs +++ b/tests/smoke_update.rs @@ -1,9 +1,4 @@ -use std::fs::{self, File}; - -use flate2::{write::GzEncoder, Compression}; use kingfisher::{cli::global::GlobalArgs, update::check_for_update}; -use tar::Builder; -use tempfile::tempdir; use tokio; use wiremock::{ matchers::{method, path}, From 352d8ff65959bbce5475941f5d33dbcc46a6da72 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 14 Jul 2025 17:22:51 -0700 Subject: [PATCH 8/8] change that hoists the redirect-free reqwest::Client into a single, lazily-initialized static so every call to validate_jwt re-uses the same handle (and therefore the same connection-pool, DNS cache, TLS session cache, etc) --- src/validation.rs | 2 +- src/validation/jwt.rs | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) 
diff --git a/src/validation.rs b/src/validation.rs index 172888d..6cb3711 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -688,7 +688,7 @@ async fn timed_validate_single_match<'a>( return; } - match jwt::validate_jwt(&token, client).await { + match jwt::validate_jwt(&token).await { Ok((ok, msg)) => { m.validation_success = ok; m.validation_response_body = msg; diff --git a/src/validation/jwt.rs b/src/validation/jwt.rs index ca0ac43..6449fd2 100644 --- a/src/validation/jwt.rs +++ b/src/validation/jwt.rs @@ -46,7 +46,7 @@ struct Claims { aud: Option, } -pub async fn validate_jwt(token: &str, client: &Client) -> Result<(bool, String)> { +pub async fn validate_jwt(token: &str) -> Result<(bool, String)> { // --- insecure payload decode ------------------------------------------------- let claims: Claims = { let payload_b64 = token.split('.').nth(1).ok_or_else(|| anyhow!("invalid JWT format"))?; @@ -199,16 +199,14 @@ mod tests { #[tokio::test] async fn valid_token() { let token = build_token(60); - let client = Client::new(); - let res = validate_jwt(&token, &client).await.unwrap(); + let res = validate_jwt(&token).await.unwrap(); assert!(res.0); } #[tokio::test] async fn expired_token() { let token = build_token(-60); - let client = Client::new(); - let res = validate_jwt(&token, &client).await.unwrap(); + let res = validate_jwt(&token).await.unwrap(); assert!(!res.0); } }