From 343c08ed00b356f2b6cb6439da2c531fafaa236a Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Tue, 19 Aug 2025 09:30:26 -0700 Subject: [PATCH 1/4] Fixed issue with self-update on Linux --- CHANGELOG.md | 4 ++++ Cargo.toml | 2 +- README.md | 33 ++++----------------------------- src/update.rs | 11 +++++++++++ 4 files changed, 20 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5973cff..62d9cf8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ All notable changes to this project will be documented in this file. +## [1.44.0] +- Fixed issue with self-update on Linux + + ## [1.43.0] - Added rules for clearbit, kickbox, azure container registry, improved Azure Storage key - Grouped JSON and JSONL outputs by rule, restoring `matches` arrays in reports diff --git a/Cargo.toml b/Cargo.toml index 1a8eebe..ba14c38 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.43.0" +version = "1.44.0" description = "MongoDB's blazingly fast secret scanning and validation tool" edition.workspace = true rust-version.workspace = true diff --git a/README.md b/README.md index 1785749..7fa2626 100644 --- a/README.md +++ b/README.md @@ -158,8 +158,8 @@ Kingfisher ships with hundreds of rules that cover everything from classic cloud |----------|---------------| | **AI / LLM APIs** | OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), and more | **Cloud Providers** | AWS, Azure, GCP, Alibaba Cloud, DigitalOcean, IBM Cloud, Cloudflare, and more -| **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm & PyPI publish token, and more -| **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun/SendGrid/Mailchimp, and more +| **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm, PyPI, and more +| **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun, SendGrid, 
Mailchimp, and more | **Databases & Data Ops** | MongoDB Atlas, PlanetScale, Postgres DSNs, Grafana Cloud, Datadog, Dynatrace, and more | **Payments & Billing** | Stripe, PayPal, Square, GoCardless, and more | **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more @@ -179,7 +179,7 @@ Once you've done that, you can provide your custom rules (defined in a YAML file ## Basic Examples -> **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory—no extra flags required. +> **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory, no extra flags required. ### Scan with secret validation @@ -614,36 +614,11 @@ Use `--rule-stats` to collect timing information for every rule. After scanning, kingfisher scan --help ``` -## Business Value - -By integrating Kingfisher into your development lifecycle, you can: - -- **Prevent Costly Breaches** - Early detection of embedded credentials avoids expensive incident response, legal fees, and reputation damage -- **Automate Compliance** - Enforce secret‑scanning policies across GitOps, CI/CD, and pull requests to help satisfy SOC 2, PCI‑DSS, GDPR, and other standards -- **Reduce Noise, Focus on Real Threats** - Validation logic filters out false positives and highlights only active, valid secrets (`--only-valid`) -- **Accelerate Dev Workflows** - Run in parallel across dozens of languages, integrate with GitHub Actions or any pipeline, and shift security left to minimize delays - -## The Risk of Leaked Secrets - -Real breaches show how one exposed key can snowball into a full-scale incident: - -- **Uber (2016):** GitHub-hosted AWS key let attackers access data on 57 M riders and 600 k drivers. 
[[BBC](https://www.bbc.com/news/technology-42075306)] [[Ars](https://arstechnica.com/tech-policy/2017/11/report-uber-paid-hackers-100000-to-keep-2016-data-breach-quiet/)] -- **AWS engineer (2020):** Pushed log files with root credentials to GitHub. [[Register](https://www.theregister.com/2020/01/23/aws_engineer_credentials_github/)] [[UpGuard](https://www.upguard.com/breaches/identity-and-access-misstep-how-an-amazon-engineer-exposed-credentials-and-more)] -- **Infosys (2023):** Full-admin AWS key left in a public PyPI package for a year. [[Stack](https://www.thestack.technology/infosys-leak-aws-key-exposed-on-pypi/)] [[Blog](https://tomforb.es/blog/infosys-leak/)] -- **Microsoft (2023):** Azure SAS token in an AI repo exposed 38 TB of internal data. [[Wiz](https://www.wiz.io/blog/38-terabytes-of-private-data-accidentally-exposed-by-microsoft-ai-researchers)] [[TechCrunch](https://techcrunch.com/2023/09/18/microsoft-ai-researchers-accidentally-exposed-terabytes-of-internal-sensitive-data/)] -- **GitHub (2023):** RSA SSH host key briefly went public; company rotated it. [[GitHub](https://github.blog/news-insights/company-news/we-updated-our-rsa-ssh-host-key/)] - -Leaked secrets fuel unauthorized access, lateral movement, regulatory fines, and brand-damaging incident-response costs. 
- # Roadmap - More rules - More targets -- Please file a [feature request](https://github.com/mongodb/kingfisher/issues) if you have specific features you'd like added +- Please file a [feature request](https://github.com/mongodb/kingfisher/issues), or open a PR, if you have features you'd like added # License diff --git a/src/update.rs b/src/update.rs index a1dbe60..ea3e221 100644 --- a/src/update.rs +++ b/src/update.rs @@ -95,6 +95,17 @@ pub fn check_for_update(global_args: &GlobalArgs, base_url: Option<&str>) -> Opt #[cfg(all(target_os = "windows", target_arch = "x86_64"))] builder.target("windows-x64"); + // ────────────────────────────────────────────────────── + // Disambiguate archive format to avoid picking .deb packages. + // Linux and macOS releases use `.tgz`; Windows uses `.zip`. + // ────────────────────────────────────────────────────── + #[cfg(target_os = "windows")] + builder.identifier("zip"); + + // Linux releases also ship as .deb and .rpm packages; select the .tgz asset for self‑updates + #[cfg(not(target_os = "windows"))] + builder.identifier("tgz"); + // Build the updater. let Ok(updater) = builder.build() else { warn!("Failed to configure update checker"); From 2411b86b7807b847479c3c460d818eb58f4d66da Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Tue, 19 Aug 2025 11:55:28 -0700 Subject: [PATCH 2/4] - Fixed issue with self-update on Linux - Reverted the change to json and jsonl outputs by rule --- CHANGELOG.md | 2 +- src/reporter.rs | 6 ----- src/reporter/json_format.rs | 33 +++++------------------- tests/int_rules_no_validated_findings.rs | 24 ++++++----------- 4 files changed, 16 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62d9cf8..a618e46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. 
## [1.44.0] - Fixed issue with self-update on Linux - +- Reverted the change to json and jsonl outputs by rule ## [1.43.0] - Added rules for clearbit, kickbox, azure container registry, improved Azure Storage key diff --git a/src/reporter.rs b/src/reporter.rs index 0606669..a42e682 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -517,12 +517,6 @@ pub struct FindingRecordData { pub git_metadata: Option, } -#[derive(Serialize, JsonSchema, Clone, Debug)] -pub struct RuleMatches { - pub id: String, - pub matches: Vec, -} - impl From for ReportMatch { fn from(e: finding_data::FindingDataEntry) -> Self { ReportMatch { diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index f43637e..4020cee 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -1,5 +1,4 @@ use super::*; -use std::collections::BTreeMap; impl DetailsReporter { pub fn json_format( @@ -9,13 +8,7 @@ impl DetailsReporter { ) -> Result<()> { let records = self.build_finding_records(args)?; if !records.is_empty() { - let mut grouped: BTreeMap> = BTreeMap::new(); - for record in records { - grouped.entry(record.rule.id.clone()).or_default().push(record); - } - let groups: Vec = - grouped.into_iter().map(|(id, matches)| RuleMatches { id, matches }).collect(); - serde_json::to_writer_pretty(&mut writer, &groups)?; + serde_json::to_writer_pretty(&mut writer, &records)?; writeln!(writer)?; } Ok(()) @@ -27,16 +20,9 @@ impl DetailsReporter { args: &cli::commands::scan::ScanArgs, ) -> Result<()> { let records = self.build_finding_records(args)?; - if !records.is_empty() { - let mut grouped: BTreeMap> = BTreeMap::new(); - for record in records { - grouped.entry(record.rule.id.clone()).or_default().push(record); - } - for (id, matches) in grouped { - let group = RuleMatches { id, matches }; - serde_json::to_writer(&mut writer, &group)?; - writeln!(writer)?; - } + for record in records { + serde_json::to_writer(&mut writer, &record)?; + writeln!(writer)?; } Ok(()) } @@ 
-237,10 +223,7 @@ mod tests { reporter.json_format(&mut output, &create_default_args())?; let json_output: Vec = serde_json::from_slice(&output.into_inner())?; assert!(!json_output.is_empty(), "JSON output should not be empty"); - let first_group = &json_output[0]; - assert_eq!(first_group["id"], "mock_rule_1"); - let matches = first_group["matches"].as_array().unwrap(); - let first = &matches[0]; + let first = &json_output[0]; assert_eq!(first["rule"]["name"], "MockRule"); assert_eq!(first["finding"]["language"], "Rust"); Ok(()) @@ -281,10 +264,8 @@ mod tests { reporter.json_format(&mut output, &create_default_args())?; let json_output: Vec = serde_json::from_slice(&output.into_inner())?; assert!(!json_output.is_empty(), "JSON output should not be empty"); - let first_group = &json_output[0]; - let first_match = &first_group["matches"][0]; - let validation_status = - first_match["finding"]["validation"]["status"].as_str().unwrap(); + let first = &json_output[0]; + let validation_status = first["finding"]["validation"]["status"].as_str().unwrap(); assert_eq!(validation_status, expected_status); } Ok(()) diff --git a/tests/int_rules_no_validated_findings.rs b/tests/int_rules_no_validated_findings.rs index 5cc4359..bd0f4f7 100644 --- a/tests/int_rules_no_validated_findings.rs +++ b/tests/int_rules_no_validated_findings.rs @@ -41,23 +41,15 @@ fn scan_rules_has_no_validated_findings() -> Result<()> { return Ok(()); } - let groups: Vec = serde_json::from_str(json_array_str)?; + let findings: Vec = serde_json::from_str(json_array_str)?; + for finding in findings { + let rule_id = finding["rule"]["id"].as_str().unwrap_or("unknown"); - for group in groups { - let rule_id = group["id"].as_str().unwrap_or("unknown"); - if let Some(matches) = group["matches"].as_array() { - for finding in matches { - let status = finding["finding"]["validation"]["status"] - .as_str() - .unwrap_or("") - .to_ascii_lowercase(); - // Fail only on genuinely validated secrets - assert_ne!( - 
&status, "active credential", - "Validated finding detected in rule {rule_id}" - ); - } - } + let status = + finding["finding"]["validation"]["status"].as_str().unwrap_or("").to_ascii_lowercase(); + + // Fail only on genuinely validated secrets + assert_ne!(&status, "active credential", "Validated finding detected in rule {rule_id}"); } Ok(()) From d3bf941c5faf530b135bf4f838d1eb91aeffa520 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Tue, 19 Aug 2025 19:18:25 -0700 Subject: [PATCH 3/4] Added '--skip-regex' and '--skip-word' flags to ignore secrets matching custom patterns or skipwords --- CHANGELOG.md | 1 + README.md | 25 ++ README.md.orig | 625 +++++++++++++++++++++++++++++++ README.md.rej | 53 +++ src/cli/commands/scan.rs | 8 + src/main.rs | 2 + src/matcher.rs | 16 +- src/reporter/json_format.rs | 2 + src/reporter/json_format.rs.orig | 273 ++++++++++++++ src/safe_list.rs | 58 +++ src/scanner/runner.rs | 10 + tests/int_allowlist.rs | 143 +++++++ tests/int_dedup.rs | 2 + tests/int_github.rs | 2 + tests/int_gitlab.rs | 4 + tests/int_redact.rs | 2 + tests/int_slack.rs | 4 + tests/int_validation_cache.rs | 2 + tests/int_vulnerable_files.rs | 4 + 19 files changed, 1228 insertions(+), 8 deletions(-) create mode 100644 README.md.orig create mode 100644 README.md.rej create mode 100644 src/reporter/json_format.rs.orig create mode 100644 tests/int_allowlist.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index a618e46..47af7f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file. 
## [1.44.0] - Fixed issue with self-update on Linux - Reverted the change to json and jsonl outputs by rule +- Added `--skip-regex` and `--skip-word` flags to ignore secrets matching custom patterns or skipwords ## [1.43.0] - Added rules for clearbit, kickbox, azure container registry, improved Azure Storage key diff --git a/README.md b/README.md index 7fa2626..054dfea 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Kingfisher originated as a fork of Praetorian's Nosey Parker, and is built atop - **Extra targets**: GitLab repos, S3 buckets, Docker images, Jira issues, Confluence pages, and Slack messages - **Compressed Files**: Supports extracting and scanning compressed files for secrets - **Baseline mode**: ignore known secrets, flag only new ones +- **Allowlist support**: suppress false positives with custom regexes or words - **Language-aware detection** (source-code parsing) for ~20 languages - **Native Windows** binary @@ -597,7 +598,31 @@ kingfisher github repos list --organization my-org - `--exclude `: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) - `--baseline-file `: Ignore matches listed in a baseline YAML file - `--manage-baseline`: Create or update the baseline file with current findings +- `--skip-regex `: Ignore findings whose text matches this regex (repeatable) +- `--skip-word `: Ignore findings containing this case-insensitive word (repeatable) +### Ignore known false positives + +Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. 
+ +Patterns passed to `--skip-regex` must be Rust-compatible regular expressions, which you can test at [regex101](https://regex101.com). + +```bash +# Skip any finding that mentions TEST_KEY (case-insensitive) +kingfisher scan --skip-regex '(?i)TEST_KEY' path/ + +# Skip findings that contain the word "dummy" anywhere in the match +kingfisher scan --skip-word dummy path/ + +# Combine multiple patterns +kingfisher scan \ + --skip-regex 'AKIA[0-9A-Z]{16}' \ + --skip-word placeholder \ + --skip-word dummy \ + path/ +``` + +If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early. ## Finding Fingerprint diff --git a/README.md.orig b/README.md.orig new file mode 100644 index 0000000..7fa2626 --- /dev/null +++ b/README.md.orig @@ -0,0 +1,625 @@ +# Kingfisher + +

+ Kingfisher Logo + +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware parsing via Tree‑Sitter, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production. +

+ +Kingfisher originated as a fork of Praetorian's Nosey Parker, and is built atop their incredible work and the work contributed by the Nosey Parker community. + +## What Kingfisher Adds +- **Live validation** via cloud-provider APIs +- **Extra targets**: GitLab repos, S3 buckets, Docker images, Jira issues, Confluence pages, and Slack messages +- **Compressed Files**: Supports extracting and scanning compressed files for secrets +- **Baseline mode**: ignore known secrets, flag only new ones +- **Language-aware detection** (source-code parsing) for ~20 languages +- **Native Windows** binary + + +## Key Features +- **Performance**: multithreaded, Hyperscan‑powered scanning built for huge codebases +- **Extensible rules**: hundreds of built-in detectors plus YAML-defined custom rules ([docs/RULES.md](/docs/RULES.md)) +- **Multiple targets**: + - **Git history**: local repos or GitHub/GitLab orgs/users + - **Docker images**: public or private via `--docker-image` + - **Jira issues**: JQL‑driven scans with `--jira-url` and `--jql` + - **Confluence pages**: CQL‑driven scans with `--confluence-url` and `--cql` + - **Slack messages**: query‑based scans with `--slack-query` + - **AWS S3**: bucket scans via `--s3-bucket`/`--s3-prefix` with credentials from `KF_AWS_KEY`/`KF_AWS_SECRET`, `--role-arn`, `--aws-local-profile`, or anonymous +- **Compressed Files**: Supports extracting and scanning compressed files for secrets +- **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) + +**Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) + +# Benchmark Results + +See ([docs/COMPARISON.md](docs/COMPARISON.md)) + +

+ Kingfisher Runtime Comparison +

+ +# Getting Started +## Installation + +On macOS, you can simply + +```bash +brew install kingfisher +``` + +Pre-built binaries are also available on the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page. + +You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform: + +```bash +# Linux, macOS +curl --silent --location \ + https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \ + sh && \ + ubi --project mongodb/kingfisher --in "$HOME/bin" +``` + +```powershell +# Windows +powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in . +``` + +This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like +systems (or the current directory on Windows). + +Or you may compile for your platform via `make`: + +```bash +# NOTE: Requires Docker +make linux + +# macOS --- must build from a macOS host +make darwin + +# Windows x64 --- requires building from a Windows host with Visual Studio installed +./buildwin.bat -force +``` + +```bash +# Build all targets +make linux-all # builds both x64 and arm64 +make darwin-all # builds both x64 and arm64 +make all # builds for every OS and architecture supported +``` + +### Run Kingfisher in Docker + +Run the dockerized Kingfisher container: +```bash +# GitHub Container Registry +docker run --rm ghcr.io/mongodb/kingfisher:latest --version + +# Scan the current working directory +# (mounts your code at /src and scans it) +docker run --rm \ + -v "$PWD":/src \ + ghcr.io/mongodb/kingfisher:latest scan /src + + +# Scan while providing a GitHub token +# Mounts your working dir at /proj and passes in the token: +docker run --rm \ + -e KF_GITHUB_TOKEN=ghp_… \ + -v "$PWD":/proj \ + ghcr.io/mongodb/kingfisher:latest \ + scan --git-url 
https://github.com/org/private_repo.git + +# Scan an S3 bucket +# Credentials can come from KF_AWS_KEY/KF_AWS_SECRET, --role-arn, or --aws-local-profile +docker run --rm \ + -e KF_AWS_KEY=AKIA... \ + -e KF_AWS_SECRET=g5nYW... \ + ghcr.io/mongodb/kingfisher:latest \ + scan --s3-bucket bucket-name + + +# Scan and write a JSON report locally +# Here we: +# 1. Mount $PWD → /proj +# 2. Tell Kingfisher to write findings.json inside /proj/reports +# 3. Ensure ./reports exists on your host so Docker can mount it +mkdir -p reports + +# run and output into host’s ./reports directory +docker run --rm \ + -v "$PWD":/proj \ + ghcr.io/mongodb/kingfisher:latest \ + scan /proj \ + --format json \ + --output /proj/reports/findings.json + + +# Tip: you can combine multiple mounts if you prefer separating source vs. output: +# Here /src is read‑only, and /out holds your generated reports +docker run --rm \ + -v "$PWD":/src:ro \ + -v "$PWD/reports":/out \ + ghcr.io/mongodb/kingfisher:latest \ + scan /src \ + --format json \ + --output /out/findings.json + +``` + +# 🔐 Detection Rules at a Glance + +Kingfisher ships with hundreds of rules that cover everything from classic cloud keys to the latest LLM-API secrets. 
Below is an overview: + +| Category | What we catch | +|----------|---------------| +| **AI / LLM APIs** | OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), and more +| **Cloud Providers** | AWS, Azure, GCP, Alibaba Cloud, DigitalOcean, IBM Cloud, Cloudflare, and more +| **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm, PyPI, and more +| **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun, SendGrid, Mailchimp, and more +| **Databases & Data Ops** | MongoDB Atlas, PlanetScale, Postgres DSNs, Grafana Cloud, Datadog, Dynatrace, and more +| **Payments & Billing** | Stripe, PayPal, Square, GoCardless, and more +| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more +| **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more + +## Write Custom Rules! + +Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential. + +However, you may want to add your own custom rules, or modify a detection to better suit your needs / environment. + +First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules. + +Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! + +# Usage + +## Basic Examples + +> **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory, no extra flags required. + +### Scan with secret validation + +```bash +kingfisher scan /path/to/code +## NOTE: This path can refer to: +# 1. a local git repo +# 2. a directory with many git repos +# 3. 
or just a folder with files and subdirectories + +## To explicitly prevent scanning git commit history add: +# `--git-history=none` +``` + +### Scan a directory containing multiple Git repositories + +```bash +kingfisher scan /projects/mono‑repo‑dir +``` + +### Scan a Git repository without validation + +```bash +kingfisher scan ~/src/myrepo --no-validate +``` + +### Display only secrets confirmed active by third‑party APIs + +```bash +kingfisher scan /path/to/repo --only-valid +``` + +### Output JSON and capture to a file + +```bash +kingfisher scan . --format json | tee kingfisher.json +``` + +### Output SARIF directly to disk + +```bash +kingfisher scan /path/to/repo --format sarif --output findings.sarif +``` + +### Pipe any text directly into Kingfisher by passing `-` + +```bash +cat /path/to/file.py | kingfisher scan - + +``` + +### Scan using a rule _family_ with one flag + +_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws._`)\* + +```bash +# Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2) +kingfisher scan /path/to/repo --rule kingfisher.aws +``` + +### Display rule performance statistics + +```bash +kingfisher scan /path/to/repo --rule-stats +``` + +### Scan while ignoring likely test files + +`--exclude` skips any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) + +```bash +# Scan source but skip likely unit / integration tests +kingfisher scan ./my-project \ + --exclude='[Tt]est' \ + --exclude='spec' \ + --exclude='[Ff]ixture' \ + --exclude='example' \ + --exclude='sample' +``` + +### Exclude specific paths +```bash +# Skip all Python files and any directory named tests +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude '[Tt]ests' +``` + +If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude): + +```bash +# Skip all Python 
files and any directory named tests, and report to stderr any skipped files +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude tests \ + -v +``` +## Scan an S3 bucket +You can scan S3 objects directly: + +```bash +kingfisher scan --s3-bucket bucket-name [--s3-prefix path/] +``` + +Credential resolution happens in this order: + +1. `KF_AWS_KEY` and `KF_AWS_SECRET` environment variables +2. `--aws-local-profile` pointing to a profile in `~/.aws/config` (works with AWS SSO) +3. anonymous access for public buckets + +If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role. + +Examples: + +```bash +# using explicit keys +export KF_AWS_KEY=AKIA... +export KF_AWS_SECRET=g5nYW... +kingfisher scan --s3-bucket some-example-bucket + +# Above can also be run as: +KF_AWS_KEY=AKIA... KF_AWS_SECRET=g5nYW... kingfisher scan --s3-bucket some-example-bucket + +# using a local profile (e.g., SSO) that exists in your AWS profile (~/.aws/config) +kingfisher scan --s3-bucket some-example-bucket --aws-local-profile default + +# anonymous scan of a bucket, while providing an object prefix to only scan subset of the s3 bucket +kingfisher scan \ + --s3-bucket awsglue-datasets \ + --s3-prefix examples/us-legislators/all + +# assuming a role when scanning +kingfisher scan --s3-bucket some-example-bucket \ + --role-arn arn:aws:iam::123456789012:role/MyRole + +# anonymous scan of a public bucket +kingfisher scan --s3-bucket some-example-bucket +``` + +Docker example: + +```bash +docker run --rm \ + -e KF_AWS_KEY=AKIA... \ + -e KF_AWS_SECRET=g5nYW... \ + ghcr.io/mongodb/kingfisher:latest \ + scan --s3-bucket bucket-name +``` +## Scanning Docker Images + +Kingfisher will first try to use any locally available image, then fall back to pulling via OCI. + +Authentication happens *in this order*: + +1. **`KF_DOCKER_TOKEN`** env var + - If it contains `user:pass`, it’s used as Basic auth + - Otherwise it’s sent as a Bearer token +2. 
**Docker CLI credentials** + - Checks `credHelpers` (per-registry) and `credsStore` in `~/.docker/config.json`. + - Falls back to the legacy `auths` → `auth` (base64) entries. +3. **Anonymous** (no credentials) + + +```bash +# 1) Scan public or already-pulled image +kingfisher scan --docker-image ghcr.io/owasp/wrongsecrets/wrongsecrets-master:latest-master + +# 2) For private registries, explicitly set KF_DOCKER_TOKEN: +# - Basic auth: "user:pass" +# - Bearer only: "TOKEN" +export KF_DOCKER_TOKEN="AWS:$(aws ecr get-login-password --region us-east-1)" +kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws.com/base/amazonlinux2023:latest + +# 3) Or rely on your Docker CLI login/keychain: +# (e.g. aws ecr get-login-password … | docker login …) +kingfisher scan --docker-image private.registry.example.com/my-image:tag +``` + +## Scanning GitHub + +### Scan GitHub organisation (requires `KF_GITHUB_TOKEN`) + +```bash +kingfisher scan --github-organization my-org +``` + +### Scan remote GitHub repository + +```bash +kingfisher scan --git-url https://github.com/org/repo.git + +# Optionally provide a GitHub Token +KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/private_repo.git + +``` + +--- + +## Scanning GitLab + +### Scan GitLab group (requires `KF_GITLAB_TOKEN`) + +```bash +kingfisher scan --gitlab-group my-group +# include repositories from all nested subgroups +kingfisher scan --gitlab-group my-group --gitlab-include-subgroups +``` + +### Scan GitLab user + +```bash +kingfisher scan --gitlab-user johndoe +``` + +### Scan remote GitLab repository by URL + +```bash +kingfisher scan --git-url https://gitlab.com/group/project.git +``` + +### List GitLab repositories + +```bash +kingfisher gitlab repos list --group my-group +# include repositories from all nested subgroups +kingfisher gitlab repos list --group my-group --include-subgroups +``` + +## Scanning Jira + +### Scan Jira issues matching a JQL query + +```bash 
+KF_JIRA_TOKEN="token" kingfisher scan \ + --jira-url https://jira.company.com \ + --jql "project = TEST AND status = Open" \ + --max-results 500 +``` + +### Scan the last 1,000 Jira issues: +```bash +KF_JIRA_TOKEN="token" kingfisher scan \ + --jira-url https://jira.mongodb.org \ + --jql 'ORDER BY created DESC' \ + --max-results 1000 +``` + +## Scanning Confluence + +### Scan Confluence pages matching a CQL query + +```bash +# Bearer token +KF_CONFLUENCE_TOKEN="token" kingfisher scan \ + --confluence-url https://confluence.company.com \ + --cql "label = secret" \ + --max-results 500 + +# Basic auth with username and token +KF_CONFLUENCE_USER="user@example.com" KF_CONFLUENCE_TOKEN="token" kingfisher scan \ + --confluence-url https://confluence.company.com \ + --cql "text ~ 'password'" \ + --max-results 500 +``` + +Use the base URL of your Confluence site for `--confluence-url`. Kingfisher +automatically adds `/rest/api` to the end, so `https://example.com/wiki` and +`https://example.com` both work depending on your server configuration. + +Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environment variable. By default, Kingfisher sends the token as a bearer token in the `Authorization` header. + +To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions. + +## Scanning Slack + +### Scan Slack messages matching a search query + +```bash +KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \ + --slack-query "from:username has:link" \ + --max-results 1000 + +KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \ + --slack-query "akia" \ + --max-results 1000 +``` +*The Slack token must be a user token with the `search:read` scope. 
Bot tokens (those beginning with `xoxb-`) cannot call the Slack search API.* + +## Environment Variables for Tokens + +| Variable | Purpose | +| ----------------- | ---------------------------- | +| `KF_GITHUB_TOKEN` | GitHub Personal Access Token | +| `KF_GITLAB_TOKEN` | GitLab Personal Access Token | +| `KF_JIRA_TOKEN` | Jira API token | +| `KF_CONFLUENCE_TOKEN` | Confluence API token | +| `KF_SLACK_TOKEN` | Slack API token | +| `KF_DOCKER_TOKEN` | Docker registry token (`user:pass` or bearer token). If unset, credentials from the Docker keychain are used | +| `KF_AWS_KEY` and `KF_AWS_SECRET` | AWS Credentials to use with S3 bucket scanning | + +Set them temporarily per command: + +```bash +KF_GITLAB_TOKEN="glpat-…" kingfisher scan --gitlab-group my-group +``` + +Or export for the session: + +```bash +export KF_GITLAB_TOKEN="glpat-…" +``` + +To authenticate Jira requests: +```bash +export KF_JIRA_TOKEN="token" +``` + +To authenticate Confluence requests: +```bash +export KF_CONFLUENCE_TOKEN="token" +``` + +_If no token is provided Kingfisher still works for public repositories._ + +--- + +## Exit Codes + +| Code | Meaning | +| ---- | ----------------------------- | +| 0 | No findings | +| 200 | Findings discovered | +| 205 | Validated findings discovered | + +## Update Checks + +Kingfisher automatically queries GitHub for a newer release when it starts and tells you whether an update is available. + +- **Hands-free updates** – Add `--self-update` to any Kingfisher command + + * If a newer version exists, Kingfisher will download it, replace the running binary, and re-launch itself with the **exact same arguments**. 
+ * If the update fails or no newer release is found, the current run proceeds as normal + +- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely + +# Advanced Options + +## Build a Baseline / Detect New Secrets + +There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so future scans only report anything that is not already in that list. + +The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches): + +```bash +kingfisher scan /path/to/code \ + --confidence low \ + --manage-baseline \ + --baseline-file ./baseline-file.yml +``` + +Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: + +```bash +kingfisher scan /path/to/code \ + --baseline-file /path/to/baseline-file.yaml +``` + +Running the scan again with `--manage-baseline` refreshes the baseline by adding new findings and pruning entries for secrets that no longer appear. See [docs/BASELINE.md](docs/BASELINE.md) for full detail. 
+ +## List Builtin Rules + +```bash +kingfisher rules list +``` + +## To scan using **only** your own `my_rules.yaml` you could run: + +```bash +kingfisher scan \ + --load-builtins=false \ + --rules-path path/to/my_rules.yaml \ + ./src/ +``` + +## To add your rules alongside the built‑ins: + +```bash +kingfisher scan \ + --rules-path ./custom-rules/ \ + --rules-path my_rules.yml \ + ~/path/to/project-dir/ +``` + +## Other Examples + +```bash +# Check custom rules - this ensures all regular expressions compile, and can match the rule's `examples` in the YML file +kingfisher rules check --rules-path ./my_rules.yml + +# List GitHub repos +kingfisher github repos list --user my-user +kingfisher github repos list --organization my-org + +``` + +## Notable Scan Options + +- `--no-dedup`: Report every occurrence of a finding (disable the default de-duplicate behavior) +- `--confidence `: (low|medium|high) +- `--min-entropy `: Override default threshold +- `--no-binary`: Skip binary files +- `--no-extract-archives`: Do not scan inside archives +- `--extraction-depth `: Specifies how deep nested archives should be extracted and scanned (default: 2) +- `--redact`: Replaces discovered secrets with a one-way hash for secure output +- `--exclude `: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) +- `--baseline-file `: Ignore matches listed in a baseline YAML file +- `--manage-baseline`: Create or update the baseline file with current findings + + +## Finding Fingerprint + +The document below details the four-field formula (rule SHA-1, origin label, start & end offsets) hashed with XXH3-64 to create Kingfisher’s 64-bit finding fingerprint, and explains how this ID powers safe deduplication, plus how `--no-dedup` can be used to show every raw match. +See ([docs/FINGERPRINT.md](docs/FINGERPRINT.md)) + +## Rule Performance Profiling + +Use `--rule-stats` to collect timing information for every rule. 
After scanning, the summary prints a **Rule Performance Stats** section showing how many matches each rule produced along with its slowest and average match times. Useful when creating rules or debugging rules. + +## CLI Options + +```bash +kingfisher scan --help +``` + +# Roadmap + +- More rules +- More targets +- Please file a [feature request](https://github.com/mongodb/kingfisher/issues), or open a PR, if you have features you'd like added + +# License + +[Apache2 License](LICENSE) diff --git a/README.md.rej b/README.md.rej new file mode 100644 index 0000000..40ab324 --- /dev/null +++ b/README.md.rej @@ -0,0 +1,53 @@ +@@ -575,50 +576,52 @@ + + ## Other Examples + + ```bash + # Check custom rules - this ensures all regular expressions compile, and can match the rule's `examples` in the YML file + kingfisher rules check --rules-path ./my_rules.yml + + # List GitHub repos + kingfisher github repos list --user my-user + kingfisher github repos list --organization my-org + + ``` + + ## Notable Scan Options + + - `--no-dedup`: Report every occurrence of a finding (disable the default de-duplicate behavior) + - `--confidence `: (low|medium|high) + - `--min-entropy `: Override default threshold + - `--no-binary`: Skip binary files + - `--no-extract-archives`: Do not scan inside archives + - `--extraction-depth `: Specifies how deep nested archives should be extracted and scanned (default: 2) + - `--redact`: Replaces discovered secrets with a one-way hash for secure output + - `--exclude `: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) + - `--baseline-file `: Ignore matches listed in a baseline YAML file + - `--manage-baseline`: Create or update the baseline file with current findings ++- `--skip-regex `: Ignore findings whose text matches this regex (repeatable) ++- `--skip-word `: Ignore findings containing this case-insensitive word (repeatable) + + + ## Finding Fingerprint + + The document 
below details the four-field formula (rule SHA-1, origin label, start & end offsets) hashed with XXH3-64 to create Kingfisher’s 64-bit finding fingerprint, and explains how this ID powers safe deduplication; plus how `--no-dedup` can be used shows every raw match. + See ([docs/FINGERPRINT.md](docs/FINGERPRINT.md)) + + ## Rule Performance Profiling + + Use `--rule-stats` to collect timing information for every rule. After scanning, the summary prints a **Rule Performance Stats** section showing how many matches each rule produced along with its slowest and average match times. Useful when creating rules or debugging rules. + + ## CLI Options + + ```bash + kingfisher scan --help + ``` + + ## Business Value + + By integrating Kingfisher into your development lifecycle, you can: + + - **Prevent Costly Breaches** + Early detection of embedded credentials avoids expensive incident response, legal fees, and reputation damage + - **Automate Compliance** + Enforce secret‑scanning policies across GitOps, CI/CD, and pull requests to help satisfy SOC 2, PCI‑DSS, GDPR, and other standards diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index ae2b4f0..5a4d22b 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -106,6 +106,14 @@ pub struct ScanArgs { /// Create or update the baseline file with current findings #[arg(long, default_value_t = false)] pub manage_baseline: bool, + + /// Regex patterns to allow-list secret matches (repeatable) + #[arg(long = "skip-regex", value_name = "PATTERN")] + pub skip_regex: Vec, + + /// Skipwords to allow-list secret matches (case-insensitive, repeatable) + #[arg(long = "skip-word", value_name = "WORD")] + pub skip_word: Vec, } /// Confidence levels for findings diff --git a/src/main.rs b/src/main.rs index 2e9b6e2..eff2a57 100644 --- a/src/main.rs +++ b/src/main.rs @@ -325,6 +325,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { no_dedup: false, baseline_file: None, manage_baseline: false, 
+ skip_regex: Vec::new(), + skip_word: Vec::new(), output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, } } diff --git a/src/matcher.rs b/src/matcher.rs index 21f7dec..b09429f 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -34,7 +34,7 @@ use crate::{ rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, rules::rule::Rule, rules_database::RulesDatabase, - safe_list::is_safe_match, + safe_list::{is_safe_match, is_user_match}, scanner_pool::ScannerPool, snippet::Base64BString, util::{intern, redact_value}, @@ -472,16 +472,16 @@ fn filter_match<'b>( None => Cow::Borrowed(&blob.bytes()[start..end]), }; for captures in re.captures_iter(byte_slice.as_ref()) { - let matching_input = captures.get(1).or_else(|| captures.get(0)).unwrap(); - // let str_input = std::str::from_utf8(matching_input.as_bytes()).unwrap_or(""); - // let calculated_entropy = calculate_shannon_entropy(str_input); - // if calculated_entropy <= rule.min_entropy() || is_safe_match(str_input) { - // continue; - // } + let full_capture = captures.get(0).unwrap(); + let matching_input = captures.get(1).unwrap_or(full_capture); let min_entropy = rule.min_entropy(); let mi_bytes = matching_input.as_bytes(); + let full_bytes = full_capture.as_bytes(); let calculated_entropy = calculate_shannon_entropy(mi_bytes); - if calculated_entropy <= min_entropy || is_safe_match(mi_bytes) { + if calculated_entropy <= min_entropy + || is_safe_match(mi_bytes) + || is_user_match(mi_bytes, full_bytes) + { debug!( "Skipping match with entropy {} <= {} or safe match", calculated_entropy, min_entropy diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 4020cee..d1e78f9 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -125,6 +125,8 @@ mod tests { output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, baseline_file: None, manage_baseline: false, + skip_regex: Vec::new(), + skip_word: Vec::new(), } } diff --git 
a/src/reporter/json_format.rs.orig b/src/reporter/json_format.rs.orig new file mode 100644 index 0000000..4020cee --- /dev/null +++ b/src/reporter/json_format.rs.orig @@ -0,0 +1,273 @@ +use super::*; + +impl DetailsReporter { + pub fn json_format( + &self, + mut writer: W, + args: &cli::commands::scan::ScanArgs, + ) -> Result<()> { + let records = self.build_finding_records(args)?; + if !records.is_empty() { + serde_json::to_writer_pretty(&mut writer, &records)?; + writeln!(writer)?; + } + Ok(()) + } + + pub fn jsonl_format( + &self, + mut writer: W, + args: &cli::commands::scan::ScanArgs, + ) -> Result<()> { + let records = self.build_finding_records(args)?; + for record in records { + serde_json::to_writer(&mut writer, &record)?; + writeln!(writer)?; + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cli::commands::github::GitCloneMode; + use crate::cli::commands::github::GitHistoryMode; + use crate::cli::commands::rules::RuleSpecifierArgs; + use crate::matcher::{SerializableCapture, SerializableCaptures}; + use crate::util::intern; + use crate::{ + blob::BlobId, + cli::commands::github::GitHubRepoType, + cli::commands::inputs::ContentFilteringArgs, + cli::commands::inputs::InputSpecifierArgs, + cli::commands::output::{OutputArgs, ReportOutputFormat}, + cli::commands::scan::ConfidenceLevel, + findings_store::FindingsStore, + location::{Location, OffsetSpan, SourcePoint, SourceSpan}, + matcher::Match, + origin::Origin, + reporter::styles::Styles, + }; + use std::{ + io::Cursor, + path::PathBuf, + sync::{Arc, Mutex}, + }; + use url::Url; + fn create_default_args() -> cli::commands::scan::ScanArgs { + use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope + + cli::commands::scan::ScanArgs { + num_jobs: 1, + no_dedup: false, + rules: RuleSpecifierArgs { + rules_path: Vec::new(), + rule: vec!["all".into()], + load_builtins: true, + }, + input_specifier_args: InputSpecifierArgs { + // local path / git URL inputs + 
path_inputs: Vec::new(), + git_url: Vec::new(), + + // GitHub + github_user: Vec::new(), + github_organization: Vec::new(), + all_github_organizations: false, + github_api_url: Url::parse("https://api.github.com/").unwrap(), + github_repo_type: GitHubRepoType::Source, + + // GitLab + gitlab_user: Vec::new(), + gitlab_group: Vec::new(), + all_gitlab_groups: false, + gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(), + gitlab_repo_type: GitLabRepoType::All, + gitlab_include_subgroups: false, + // Jira options + jira_url: None, + jql: None, + // Confluence options + confluence_url: None, + cql: None, + max_results: 100, + // Slack options + slack_query: None, + slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + // s3 + s3_bucket: None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, + + docker_image: Vec::new(), + // clone / history options + git_clone: GitCloneMode::Bare, + git_history: GitHistoryMode::Full, + scan_nested_repos: true, + commit_metadata: true, + }, + content_filtering_args: ContentFilteringArgs { + max_file_size_mb: 25.0, + no_extract_archives: false, + extraction_depth: 2, + exclude: Vec::new(), // Exclude patterns + no_binary: true, + }, + confidence: ConfidenceLevel::Medium, + no_validate: false, + rule_stats: false, + only_valid: false, + min_entropy: None, + redact: false, + git_repo_timeout: 1800, // 30 minutes + output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, + baseline_file: None, + manage_baseline: false, + } + } + + fn create_mock_match( + rule_name: &str, + rule_text_id: &str, + rule_finding_fingerprint: &str, + validation_success: bool, + ) -> Match { + Match { + location: Location { + offset_span: OffsetSpan { start: 10, end: 20 }, + source_span: SourceSpan { + start: SourcePoint { line: 5, column: 10 }, + end: SourcePoint { line: 5, column: 20 }, + }, + }, + groups: SerializableCaptures { + captures: vec![SerializableCapture { + name: Some("token".to_string()), + 
match_number: 1, + start: 10, + end: 20, + value: "mock_token".into(), + }], + }, + blob_id: BlobId::new(b"mock_blob"), + finding_fingerprint: 0123, + rule_finding_fingerprint: intern(rule_finding_fingerprint), + rule_text_id: intern(rule_text_id), + rule_name: intern(rule_name), + rule_confidence: Confidence::Medium, + validation_response_body: "validation response".to_string(), + validation_response_status: 200, + validation_success, + calculated_entropy: 4.5, + visible: true, + } + } + + fn setup_mock_reporter(matches: Vec) -> DetailsReporter { + let mut datastore = FindingsStore::new(PathBuf::from("/tmp")); + if !matches.is_empty() { + let blob_metadata = BlobMetadata { + id: BlobId::new(b"mock_blob"), + num_bytes: 1024, + mime_essence: Some("text/plain".to_string()), + charset: Some("UTF-8".to_string()), + language: Some("Rust".to_string()), + }; + let dedup = true; + for m in matches.clone() { + datastore.record( + vec![( + Arc::new(OriginSet::new( + Origin::from_file(PathBuf::from("/mock/path/file.rs")), + vec![], + )), + Arc::new(blob_metadata.clone()), + m.m.clone(), + )], + dedup, + ); + } + } + DetailsReporter { + datastore: Arc::new(Mutex::new(datastore)), + styles: Styles::new(false), + only_valid: false, + } + } + + #[test] + fn test_json_format() -> Result<()> { + let mock_match = + create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); + let matches = vec![ReportMatch { + origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]), + blob_metadata: BlobMetadata { + id: BlobId::new(b"mock_blob"), + num_bytes: 1024, + mime_essence: Some("text/plain".to_string()), + charset: Some("UTF-8".to_string()), + language: Some("Rust".to_string()), + }, + m: mock_match, + comment: None, + match_confidence: Confidence::Medium, + visible: true, + validation_response_body: "validation response".to_string(), + validation_response_status: 200, + validation_success: true, + }]; + let reporter = 
setup_mock_reporter(matches); + let mut output = Cursor::new(Vec::new()); + reporter.json_format(&mut output, &create_default_args())?; + let json_output: Vec = serde_json::from_slice(&output.into_inner())?; + assert!(!json_output.is_empty(), "JSON output should not be empty"); + let first = &json_output[0]; + assert_eq!(first["rule"]["name"], "MockRule"); + assert_eq!(first["finding"]["language"], "Rust"); + Ok(()) + } + + #[test] + fn test_validation_status_in_json() -> Result<()> { + let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")]; + for (validation_success, expected_status) in test_cases { + let mock_match = create_mock_match( + "MockRule", + "mock_rule_1", + "mock_finding_fingerprint", + validation_success, + ); + let matches = vec![ReportMatch { + origin: OriginSet::new( + Origin::from_file(PathBuf::from("/mock/path/file.rs")), + vec![], + ), + blob_metadata: BlobMetadata { + id: BlobId::new(b"mock_blob"), + num_bytes: 1024, + mime_essence: Some("text/plain".to_string()), + charset: Some("UTF-8".to_string()), + language: Some("Rust".to_string()), + }, + m: mock_match, + comment: None, + match_confidence: Confidence::Medium, + visible: true, + validation_response_body: "validation response".to_string(), + validation_response_status: 200, + validation_success, + }]; + let reporter = setup_mock_reporter(matches); + let mut output = Cursor::new(Vec::new()); + reporter.json_format(&mut output, &create_default_args())?; + let json_output: Vec = serde_json::from_slice(&output.into_inner())?; + assert!(!json_output.is_empty(), "JSON output should not be empty"); + let first = &json_output[0]; + let validation_status = first["finding"]["validation"]["status"].as_str().unwrap(); + assert_eq!(validation_status, expected_status); + } + Ok(()) + } +} diff --git a/src/safe_list.rs b/src/safe_list.rs index 60d88aa..ba7f852 100644 --- a/src/safe_list.rs +++ b/src/safe_list.rs @@ -12,6 +12,7 @@ use once_cell::sync::Lazy; use 
regex::bytes::Regex; +use std::sync::Mutex; use tracing::debug; /// A rule that describes *why* a match is considered safe/benign. @@ -131,6 +132,63 @@ static SAFE_LIST_FILTER_RULES: Lazy> = Lazy::new(|| { ] }); +// User-supplied allow-list patterns (regexes) and skipwords. These are empty by +// default and populated via CLI flags at runtime. +static USER_SAFE_REGEXES: Lazy>> = Lazy::new(|| Mutex::new(Vec::new())); +static USER_SAFE_SKIPWORDS: Lazy>> = Lazy::new(|| Mutex::new(Vec::new())); + +/// Register an additional allow-list regex provided by the user. +/// If the pattern fails to compile, the error is returned so the caller can +/// surface it. +pub fn add_user_regex(pattern: &str) -> std::result::Result<(), regex::Error> { + let re = Regex::new(pattern)?; + USER_SAFE_REGEXES.lock().unwrap().push(re); + Ok(()) +} + +/// Register an allow-list skipword provided by the user. Comparisons are +/// case-insensitive. +pub fn add_user_skipword(word: &str) { + USER_SAFE_SKIPWORDS.lock().unwrap().push(word.to_lowercase()); +} + +/// Returns `true` if the given input matches any user-supplied allow-list +/// patterns (regexes or skipwords). +/// +/// `secret` is the primary capture group (typically just the secret value) +/// while `full_match` includes the entire match, allowing regexes to target +/// surrounding context such as variable names. 
+pub fn is_user_match(secret: &[u8], full_match: &[u8]) -> bool { + { + let regexes = USER_SAFE_REGEXES.lock().unwrap(); + if regexes.iter().any(|re| re.is_match(secret) || re.is_match(full_match)) { + debug!("Safe match: user skip-regex"); + return true; + } + } + + let skipwords = USER_SAFE_SKIPWORDS.lock().unwrap(); + if skipwords.is_empty() { + return false; + } + + // Check skipwords against both the secret and full match (case-insensitive) + let contains_skipword = |bytes: &[u8]| -> bool { + if let Ok(s) = std::str::from_utf8(bytes) { + let lower = s.to_lowercase(); + return skipwords.iter().any(|w| lower.contains(w)); + } + false + }; + + if contains_skipword(secret) || contains_skipword(full_match) { + debug!("Safe match: user skip-word"); + return true; + } + + false +} + /// Returns `Some(&'static str)` with the rule description if the input likely /// contains *benign* placeholder/test strings; otherwise `None`. pub fn is_safe_match_reason(input: &[u8]) -> Option<&'static str> { diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs index e389543..08bac87 100644 --- a/src/scanner/runner.rs +++ b/src/scanner/runner.rs @@ -16,6 +16,7 @@ use crate::{ rule_loader::RuleLoader, rule_profiling::ConcurrentRuleProfiler, rules_database::RulesDatabase, + safe_list, scanner::{ clone_or_update_git_repos, enumerate_filesystem_inputs, enumerate_github_repos, repos::{ @@ -52,6 +53,15 @@ pub async fn run_async_scan( } } + // Register user-provided allow-list patterns + for pattern in &args.skip_regex { + safe_list::add_user_regex(pattern) + .map_err(|e| anyhow::anyhow!("Invalid skip-regex '{pattern}': {e}"))?; + } + for word in &args.skip_word { + safe_list::add_user_skipword(word); + } + let start_time = Instant::now(); trace!("Args:\n{global_args:#?}\n{args:#?}"); diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs new file mode 100644 index 0000000..2dfa70b --- /dev/null +++ b/tests/int_allowlist.rs @@ -0,0 +1,143 @@ +use std::{ + fs, + sync::{Arc, 
Mutex}, +}; + +use anyhow::Result; +use kingfisher::{ + cli::{ + commands::{ + github::{GitCloneMode, GitHistoryMode, GitHubRepoType}, + gitlab::GitLabRepoType, + inputs::{ContentFilteringArgs, InputSpecifierArgs}, + output::{OutputArgs, ReportOutputFormat}, + rules::RuleSpecifierArgs, + scan::{ConfidenceLevel, ScanArgs}, + }, + global::{AdvancedArgs, Mode}, + GlobalArgs, + }, + findings_store::FindingsStore, + rule_loader::RuleLoader, + rules_database::RulesDatabase, + scanner::run_async_scan, +}; +use tempfile::TempDir; +use tokio::runtime::Runtime; +use url::Url; + +fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result { + let rt = Runtime::new().unwrap(); + let work = TempDir::new()?; + let rules_dir = work.path().join("rules"); + fs::create_dir_all(&rules_dir)?; + let inputs_dir = work.path().join("in"); + fs::create_dir_all(&inputs_dir)?; + + fs::write( + rules_dir.join("demo.yml"), + r#"rules: + - id: demo.token + name: Demo token + pattern: 'token_(\w+)' + confidence: low +"#, + )?; + + fs::write(inputs_dir.join("a.txt"), "token_realvalue\ntoken_testvalue\n")?; + + let scan_args = ScanArgs { + num_jobs: 2, + rules: RuleSpecifierArgs { + rules_path: vec![rules_dir.clone()], + rule: vec!["all".into()], + load_builtins: false, + }, + input_specifier_args: InputSpecifierArgs { + path_inputs: vec![inputs_dir.join("a.txt")], + git_url: Vec::new(), + github_user: Vec::new(), + github_organization: Vec::new(), + all_github_organizations: false, + github_api_url: Url::parse("https://api.github.com/").unwrap(), + github_repo_type: GitHubRepoType::Source, + gitlab_user: Vec::new(), + gitlab_group: Vec::new(), + all_gitlab_groups: false, + gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(), + gitlab_repo_type: GitLabRepoType::Owner, + gitlab_include_subgroups: false, + jira_url: None, + jql: None, + confluence_url: None, + cql: None, + slack_query: None, + slack_api_url: Url::parse("https://slack.com/api/").unwrap(), + max_results: 100, + s3_bucket: 
None, + s3_prefix: None, + role_arn: None, + aws_local_profile: None, + docker_image: Vec::new(), + git_clone: GitCloneMode::Bare, + git_history: GitHistoryMode::Full, + scan_nested_repos: true, + commit_metadata: true, + }, + content_filtering_args: ContentFilteringArgs { + max_file_size_mb: 5.0, + exclude: Vec::new(), + no_extract_archives: false, + extraction_depth: 1, + no_binary: true, + }, + confidence: ConfidenceLevel::Low, + no_validate: true, + rule_stats: false, + only_valid: false, + min_entropy: Some(0.0), + redact: false, + git_repo_timeout: 1800, + output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, + no_dedup: false, + baseline_file: None, + manage_baseline: false, + skip_regex: skip_regex, + skip_word: skip_skipword, + }; + + let global_args = GlobalArgs { + verbose: 0, + quiet: true, + color: Mode::Never, + progress: Mode::Never, + no_update_check: true, + self_update: false, + ignore_certs: false, + advanced: AdvancedArgs { rlimit_nofile: 8192 }, + }; + + let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?; + let resolved = loaded.resolve_enabled_rules()?; + let rules_db = Arc::new(RulesDatabase::from_rules(resolved.into_iter().cloned().collect())?); + + let datastore = Arc::new(Mutex::new(FindingsStore::new(work.path().join("store")))); + + rt.block_on(run_async_scan(&global_args, &scan_args, Arc::clone(&datastore), &rules_db))?; + + let x = Ok(datastore.lock().unwrap().get_matches().len()); x +} + +#[test] +fn skip_regex_filters_match() -> Result<()> { + let count = run_skiplist(vec!["token_realvalue".into()], Vec::new())?; + assert_eq!(count, 1); + Ok(()) +} + +#[test] +fn skip_skipword_filters_match() -> Result<()> { + let count = run_skiplist(Vec::new(), vec!["test".into()])?; + assert_eq!(count, 1); + Ok(()) +} \ No newline at end of file diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index 090effa..578c2bf 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -118,6 +118,8 
@@ rules: no_dedup, baseline_file: None, manage_baseline: false, + skip_regex: Vec::new(), + skip_word: Vec::new(), }; let global_args = GlobalArgs { diff --git a/tests/int_github.rs b/tests/int_github.rs index dbedcb5..2156087 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -105,6 +105,8 @@ fn test_github_remote_scan() -> Result<()> { no_dedup: true, baseline_file: None, manage_baseline: false, + skip_regex: Vec::new(), + skip_word: Vec::new(), }; // Create global arguments let global_args = GlobalArgs { diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index 5a72ce5..601262f 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -103,6 +103,8 @@ fn test_gitlab_remote_scan() -> Result<()> { no_dedup: true, baseline_file: None, manage_baseline: false, + skip_regex: Vec::new(), + skip_word: Vec::new(), }; let global_args = GlobalArgs { @@ -207,6 +209,8 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { no_dedup: true, baseline_file: None, manage_baseline: false, + skip_regex: Vec::new(), + skip_word: Vec::new(), }; let global_args = GlobalArgs { diff --git a/tests/int_redact.rs b/tests/int_redact.rs index 6bd97c7..b309a54 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -85,6 +85,8 @@ async fn test_redact_hashes_finding_values() -> Result<()> { no_dedup: true, baseline_file: None, manage_baseline: false, + skip_regex: Vec::new(), + skip_word: Vec::new(), }; let global_args = GlobalArgs { diff --git a/tests/int_slack.rs b/tests/int_slack.rs index 86cea6f..8726231 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -91,6 +91,8 @@ impl TestContext { no_dedup: true, baseline_file: None, manage_baseline: false, + skip_regex: Vec::new(), + skip_word: Vec::new(), }; let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?; @@ -185,6 +187,8 @@ async fn test_scan_slack_messages() -> Result<()> { no_dedup: true, baseline_file: None, manage_baseline: false, + skip_regex: Vec::new(), + skip_word: 
Vec::new(), }; let global_args = GlobalArgs { diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index 6e2cc6a..a220f88 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -161,6 +161,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> { no_dedup: true, // keep duplicates so the cache is stressed baseline_file: None, manage_baseline: false, + skip_regex: Vec::new(), + skip_word: Vec::new(), }; /* --------------------------------------------------------- * diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 31a74ac..e301143 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -104,6 +104,8 @@ impl TestContext { no_dedup: true, baseline_file: None, manage_baseline: false, + skip_regex: Vec::new(), + skip_word: Vec::new(), }; let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules) @@ -183,6 +185,8 @@ impl TestContext { no_dedup: true, baseline_file: None, manage_baseline: false, + skip_regex: Vec::new(), + skip_word: Vec::new(), }; let global_args = GlobalArgs { From 32b11dd40968cfbd338ed198f1400b81018625d7 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Tue, 19 Aug 2025 19:19:13 -0700 Subject: [PATCH 4/4] Added '--skip-regex' and '--skip-word' flags to ignore secrets matching custom patterns or skipwords --- README.md.orig | 625 ------------------------------- README.md.rej | 53 --- src/reporter/json_format.rs.orig | 273 -------------- 3 files changed, 951 deletions(-) delete mode 100644 README.md.orig delete mode 100644 README.md.rej delete mode 100644 src/reporter/json_format.rs.orig diff --git a/README.md.orig b/README.md.orig deleted file mode 100644 index 7fa2626..0000000 --- a/README.md.orig +++ /dev/null @@ -1,625 +0,0 @@ -# Kingfisher - -

- Kingfisher Logo - -[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) - -Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware parsing via Tree‑Sitter, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production -

- -Kingfisher originated as a fork of Praetorian's Nosey Parker, and is built atop their incredible work and the work contributed by the Nosey Parker community. - -## What Kingfisher Adds -- **Live validation** via cloud-provider APIs -- **Extra targets**: GitLab repos, S3 buckets, Docker images, Jira issues, Confluence pages, and Slack messages -- **Compressed Files**: Supports extracting and scanning compressed files for secrets -- **Baseline mode**: ignore known secrets, flag only new ones -- **Language-aware detection** (source-code parsing) for ~20 languages -- **Native Windows** binary - - -## Key Features -- **Performance**: multithreaded, Hyperscan‑powered scanning built for huge codebases -- **Extensible rules**: hundreds of built-in detectors plus YAML-defined custom rules ([docs/RULES.md](/docs/RULES.md)) -- **Multiple targets**: - - **Git history**: local repos or GitHub/GitLab orgs/users - - **Docker images**: public or private via `--docker-image` - - **Jira issues**: JQL‑driven scans with `--jira-url` and `--jql` - - **Confluence pages**: CQL‑driven scans with `--confluence-url` and `--cql` - - **Slack messages**: query‑based scans with `--slack-query` - - **AWS S3**: bucket scans via `--s3-bucket`/`--s3-prefix` with credentials from `KF_AWS_KEY`/`KF_AWS_SECRET`, `--role-arn`, `--aws-local-profile`, or anonymous -- **Compressed Files**: Supports extracting and scanning compressed files for secrets -- **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) - -**Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) - -# Benchmark Results - -See ([docs/COMPARISON.md](docs/COMPARISON.md)) - -

- Kingfisher Runtime Comparison -

- -# Getting Started -## Installation - -On macOS, you can simply - -```bash -brew install kingfisher -``` - -Pre-built binaries are also available on the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page. - -You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform: - -```bash -# Linux, macOS -curl --silent --location \ - https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \ - sh && \ - ubi --project mongodb/kingfisher --in "$HOME/bin" -``` - -```powershell -# Windows -powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in . -``` - -This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like -systems (or the current directory on Windows). - -Or you may compile for your platform via `make`: - -```bash -# NOTE: Requires Docker -make linux - -# macOS --- must build from a macOS host -make darwin - -# Windows x64 --- requires building from a Windows host with Visual Studio installed -./buildwin.bat -force -``` - -```bash -# Build all targets -make linux-all # builds both x64 and arm64 -make darwin-all # builds both x64 and arm64 -make all # builds for every OS and architecture supported -``` - -### Run Kingfisher in Docker - -Run the dockerized Kingfisher container: -```bash -# GitHub Container Registry -docker run --rm ghcr.io/mongodb/kingfisher:latest --version - -# Scan the current working directory -# (mounts your code at /src and scans it) -docker run --rm \ - -v "$PWD":/src \ - ghcr.io/mongodb/kingfisher:latest scan /src - - -# Scan while providing a GitHub token -# Mounts your working dir at /proj and passes in the token: -docker run --rm \ - -e KF_GITHUB_TOKEN=ghp_… \ - -v "$PWD":/proj \ - ghcr.io/mongodb/kingfisher:latest \ - scan --git-url 
https://github.com/org/private_repo.git - -# Scan an S3 bucket -# Credentials can come from KF_AWS_KEY/KF_AWS_SECRET, --role-arn, or --aws-local-profile -docker run --rm \ - -e KF_AWS_KEY=AKIA... \ - -e KF_AWS_SECRET=g5nYW... \ - ghcr.io/mongodb/kingfisher:latest \ - scan --s3-bucket bucket-name - - -# Scan and write a JSON report locally -# Here we: -# 1. Mount $PWD → /proj -# 2. Tell Kingfisher to write findings.json inside /proj/reports -# 3. Ensure ./reports exists on your host so Docker can mount it -mkdir -p reports - -# run and output into host’s ./reports directory -docker run --rm \ - -v "$PWD":/proj \ - ghcr.io/mongodb/kingfisher:latest \ - scan /proj \ - --format json \ - --output /proj/reports/findings.json - - -# Tip: you can combine multiple mounts if you prefer separating source vs. output: -# Here /src is read‑only, and /out holds your generated reports -docker run --rm \ - -v "$PWD":/src:ro \ - -v "$PWD/reports":/out \ - ghcr.io/mongodb/kingfisher:latest \ - scan /src \ - --format json \ - --output /out/findings.json - -``` - -# 🔐 Detection Rules at a Glance - -Kingfisher ships with hundreds of rules that cover everything from classic cloud keys to the latest LLM-API secrets. 
Below is an overview: - -| Category | What we catch | -|----------|---------------| -| **AI / LLM APIs** | OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), and more -| **Cloud Providers** | AWS, Azure, GCP, Alibaba Cloud, DigitalOcean, IBM Cloud, Cloudflare, and more -| **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm, PyPI, and more -| **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun, SendGrid, Mailchimp, and more -| **Databases & Data Ops** | MongoDB Atlas, PlanetScale, Postgres DSNs, Grafana Cloud, Datadog, Dynatrace, and more -| **Payments & Billing** | Stripe, PayPal, Square, GoCardless, and more -| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more -| **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more - -## Write Custom Rules! - -Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential. - -However, you may want to add your own custom rules, or modify a detection to better suit your needs / environment. - -First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules. - -Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! - -# Usage - -## Basic Examples - -> **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory, no extra flags required. - -### Scan with secret validation - -```bash -kingfisher scan /path/to/code -## NOTE: This path can refer to: -# 1. a local git repo -# 2. a directory with many git repos -# 3. 
or just a folder with files and subdirectories - -## To explicitly prevent scanning git commit history add: -# `--git-history=none` -``` - -### Scan a directory containing multiple Git repositories - -```bash -kingfisher scan /projects/mono‑repo‑dir -``` - -### Scan a Git repository without validation - -```bash -kingfisher scan ~/src/myrepo --no-validate -``` - -### Display only secrets confirmed active by third‑party APIs - -```bash -kingfisher scan /path/to/repo --only-valid -``` - -### Output JSON and capture to a file - -```bash -kingfisher scan . --format json | tee kingfisher.json -``` - -### Output SARIF directly to disk - -```bash -kingfisher scan /path/to/repo --format sarif --output findings.sarif -``` - -### Pipe any text directly into Kingfisher by passing `-` - -```bash -cat /path/to/file.py | kingfisher scan - - -``` - -### Scan using a rule _family_ with one flag - -_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws._`)\* - -```bash -# Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2) -kingfisher scan /path/to/repo --rule kingfisher.aws -``` - -### Display rule performance statistics - -```bash -kingfisher scan /path/to/repo --rule-stats -``` - -### Scan while ignoring likely test files - -`--exclude` skips any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) - -```bash -# Scan source but skip likely unit / integration tests -kingfisher scan ./my-project \ - --exclude='[Tt]est' \ - --exclude='spec' \ - --exclude='[Ff]ixture' \ - --exclude='example' \ - --exclude='sample' -``` - -### Exclude specific paths -```bash -# Skip all Python files and any directory named tests -kingfisher scan ./my-project \ - --exclude '*.py' \ - --exclude '[Tt]ests' -``` - -If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude): - -```bash -# Skip all Python 
files and any directory named tests, and report to stderr any skipped files -kingfisher scan ./my-project \ - --exclude '*.py' \ - --exclude tests \ - -v -``` -## Scan an S3 bucket -You can scan S3 objects directly: - -```bash -kingfisher scan --s3-bucket bucket-name [--s3-prefix path/] -``` - -Credential resolution happens in this order: - -1. `KF_AWS_KEY` and `KF_AWS_SECRET` environment variables -2. `--aws-local-profile` pointing to a profile in `~/.aws/config` (works with AWS SSO) -3. anonymous access for public buckets - -If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role. - -Examples: - -```bash -# using explicit keys -export KF_AWS_KEY=AKIA... -export KF_AWS_SECRET=g5nYW... -kingfisher scan --s3-bucket some-example-bucket - -# Above can also be run as: -KF_AWS_KEY=AKIA... KF_AWS_SECRET=g5nYW... kingfisher scan --s3-bucket some-example-bucket - -# using a local profile (e.g., SSO) that exists in your AWS profile (~/.aws/config) -kingfisher scan --s3-bucket some-example-bucket --aws-local-profile default - -# anonymous scan of a bucket, while providing an object prefix to only scan subset of the s3 bucket -kingfisher scan \ - --s3-bucket awsglue-datasets \ - --s3-prefix examples/us-legislators/all - -# assuming a role when scanning -kingfisher scan --s3-bucket some-example-bucket \ - --role-arn arn:aws:iam::123456789012:role/MyRole - -# anonymous scan of a public bucket -kingfisher scan --s3-bucket some-example-bucket -``` - -Docker example: - -```bash -docker run --rm \ - -e KF_AWS_KEY=AKIA... \ - -e KF_AWS_SECRET=g5nYW... \ - ghcr.io/mongodb/kingfisher:latest \ - scan --s3-bucket bucket-name -``` -## Scanning Docker Images - -Kingfisher will first try to use any locally available image, then fall back to pulling via OCI. - -Authentication happens *in this order*: - -1. **`KF_DOCKER_TOKEN`** env var - - If it contains `user:pass`, it’s used as Basic auth - - Otherwise it’s sent as a Bearer token -2. 
**Docker CLI credentials** - - Checks `credHelpers` (per-registry) and `credsStore` in `~/.docker/config.json`. - - Falls back to the legacy `auths` → `auth` (base64) entries. -3. **Anonymous** (no credentials) - - -```bash -# 1) Scan public or already-pulled image -kingfisher scan --docker-image ghcr.io/owasp/wrongsecrets/wrongsecrets-master:latest-master - -# 2) For private registries, explicitly set KF_DOCKER_TOKEN: -# - Basic auth: "user:pass" -# - Bearer only: "TOKEN" -export KF_DOCKER_TOKEN="AWS:$(aws ecr get-login-password --region us-east-1)" -kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws.com/base/amazonlinux2023:latest - -# 3) Or rely on your Docker CLI login/keychain: -# (e.g. aws ecr get-login-password … | docker login …) -kingfisher scan --docker-image private.registry.example.com/my-image:tag -``` - -## Scanning GitHub - -### Scan GitHub organisation (requires `KF_GITHUB_TOKEN`) - -```bash -kingfisher scan --github-organization my-org -``` - -### Scan remote GitHub repository - -```bash -kingfisher scan --git-url https://github.com/org/repo.git - -# Optionally provide a GitHub Token -KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/private_repo.git - -``` - ---- - -## Scanning GitLab - -### Scan GitLab group (requires `KF_GITLAB_TOKEN`) - -```bash -kingfisher scan --gitlab-group my-group -# include repositories from all nested subgroups -kingfisher scan --gitlab-group my-group --gitlab-include-subgroups -``` - -### Scan GitLab user - -```bash -kingfisher scan --gitlab-user johndoe -``` - -### Scan remote GitLab repository by URL - -```bash -kingfisher scan --git-url https://gitlab.com/group/project.git -``` - -### List GitLab repositories - -```bash -kingfisher gitlab repos list --group my-group -# include repositories from all nested subgroups -kingfisher gitlab repos list --group my-group --include-subgroups -``` - -## Scanning Jira - -### Scan Jira issues matching a JQL query - -```bash 
-KF_JIRA_TOKEN="token" kingfisher scan \ - --jira-url https://jira.company.com \ - --jql "project = TEST AND status = Open" \ - --max-results 500 -``` - -### Scan the last 1,000 Jira issues: -```bash -KF_JIRA_TOKEN="token" kingfisher scan \ - --jira-url https://jira.mongodb.org \ - --jql 'ORDER BY created DESC' \ - --max-results 1000 -``` - -## Scanning Confluence - -### Scan Confluence pages matching a CQL query - -```bash -# Bearer token -KF_CONFLUENCE_TOKEN="token" kingfisher scan \ - --confluence-url https://confluence.company.com \ - --cql "label = secret" \ - --max-results 500 - -# Basic auth with username and token -KF_CONFLUENCE_USER="user@example.com" KF_CONFLUENCE_TOKEN="token" kingfisher scan \ - --confluence-url https://confluence.company.com \ - --cql "text ~ 'password'" \ - --max-results 500 -``` - -Use the base URL of your Confluence site for `--confluence-url`. Kingfisher -automatically adds `/rest/api` to the end, so `https://example.com/wiki` and -`https://example.com` both work depending on your server configuration. - -Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environment variable. By default, Kingfisher sends the token as a bearer token in the `Authorization` header. - -To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions. - -## Scanning Slack - -### Scan Slack messages matching a search query - -```bash -KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \ - --slack-query "from:username has:link" \ - --max-results 1000 - -KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \ - --slack-query "akia" \ - --max-results 1000 -``` -*The Slack token must be a user token with the `search:read` scope. 
Bot tokens (those beginning with `xoxb-`) cannot call the Slack search API.* - -## Environment Variables for Tokens - -| Variable | Purpose | -| ----------------- | ---------------------------- | -| `KF_GITHUB_TOKEN` | GitHub Personal Access Token | -| `KF_GITLAB_TOKEN` | GitLab Personal Access Token | -| `KF_JIRA_TOKEN` | Jira API token | -| `KF_CONFLUENCE_TOKEN` | Confluence API token | -| `KF_SLACK_TOKEN` | Slack API token | -| `KF_DOCKER_TOKEN` | Docker registry token (`user:pass` or bearer token). If unset, credentials from the Docker keychain are used | -| `KF_AWS_KEY` and `KF_AWS_SECRET` | AWS Credentials to use with S3 bucket scanning | - -Set them temporarily per command: - -```bash -KF_GITLAB_TOKEN="glpat-…" kingfisher scan --gitlab-group my-group -``` - -Or export for the session: - -```bash -export KF_GITLAB_TOKEN="glpat-…" -``` - -To authenticate Jira requests: -```bash -export KF_JIRA_TOKEN="token" -``` - -To authenticate Confluence requests: -```bash -export KF_CONFLUENCE_TOKEN="token" -``` - -_If no token is provided Kingfisher still works for public repositories._ - ---- - -## Exit Codes - -| Code | Meaning | -| ---- | ----------------------------- | -| 0 | No findings | -| 200 | Findings discovered | -| 205 | Validated findings discovered | - -## Update Checks - -Kingfisher automatically queries GitHub for a newer release when it starts and tells you whether an update is available. - -- **Hands-free updates** – Add `--self-update` to any Kingfisher command - - * If a newer version exists, Kingfisher will download it, replace the running binary, and re-launch itself with the **exact same arguments**. 
- * If the update fails or no newer release is found, the current run proceeds as normal - -- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely - -# Advanced Options - -## Build a Baseline / Detect New Secrets - -There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so future scans only report anything that is not already in that list. - -The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches): - -```bash -kingfisher scan /path/to/code \ - --confidence low \ - --manage-baseline \ - --baseline-file ./baseline-file.yml -``` - -Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: - -```bash -kingfisher scan /path/to/code \ - --baseline-file /path/to/baseline-file.yaml -``` - -Running the scan again with `--manage-baseline` refreshes the baseline by adding new findings and pruning entries for secrets that no longer appear. See [docs/BASELINE.md](docs/BASELINE.md) for full detail. 
- -## List Builtin Rules - -```bash -kingfisher rules list -``` - -## To scan using **only** your own `my_rules.yaml` you could run: - -```bash -kingfisher scan \ - --load-builtins=false \ - --rules-path path/to/my_rules.yaml \ - ./src/ -``` - -## To add your rules alongside the built‑ins: - -```bash -kingfisher scan \ - --rules-path ./custom-rules/ \ - --rules-path my_rules.yml \ - ~/path/to/project-dir/ -``` - -## Other Examples - -```bash -# Check custom rules - this ensures all regular expressions compile, and can match the rule's `examples` in the YML file -kingfisher rules check --rules-path ./my_rules.yml - -# List GitHub repos -kingfisher github repos list --user my-user -kingfisher github repos list --organization my-org - -``` - -## Notable Scan Options - -- `--no-dedup`: Report every occurrence of a finding (disable the default de-duplicate behavior) -- `--confidence `: (low|medium|high) -- `--min-entropy `: Override default threshold -- `--no-binary`: Skip binary files -- `--no-extract-archives`: Do not scan inside archives -- `--extraction-depth `: Specifies how deep nested archives should be extracted and scanned (default: 2) -- `--redact`: Replaces discovered secrets with a one-way hash for secure output -- `--exclude `: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) -- `--baseline-file `: Ignore matches listed in a baseline YAML file -- `--manage-baseline`: Create or update the baseline file with current findings - - -## Finding Fingerprint - -The document below details the four-field formula (rule SHA-1, origin label, start & end offsets) hashed with XXH3-64 to create Kingfisher’s 64-bit finding fingerprint, and explains how this ID powers safe deduplication; plus how `--no-dedup` can be used shows every raw match. -See ([docs/FINGERPRINT.md](docs/FINGERPRINT.md)) - -## Rule Performance Profiling - -Use `--rule-stats` to collect timing information for every rule. 
After scanning, the summary prints a **Rule Performance Stats** section showing how many matches each rule produced along with its slowest and average match times. Useful when creating rules or debugging rules. - -## CLI Options - -```bash -kingfisher scan --help -``` - -# Roadmap - -- More rules -- More targets -- Please file a [feature request](https://github.com/mongodb/kingfisher/issues), or open a PR, if you have features you'd like added - -# License - -[Apache2 License](LICENSE) diff --git a/README.md.rej b/README.md.rej deleted file mode 100644 index 40ab324..0000000 --- a/README.md.rej +++ /dev/null @@ -1,53 +0,0 @@ -@@ -575,50 +576,52 @@ - - ## Other Examples - - ```bash - # Check custom rules - this ensures all regular expressions compile, and can match the rule's `examples` in the YML file - kingfisher rules check --rules-path ./my_rules.yml - - # List GitHub repos - kingfisher github repos list --user my-user - kingfisher github repos list --organization my-org - - ``` - - ## Notable Scan Options - - - `--no-dedup`: Report every occurrence of a finding (disable the default de-duplicate behavior) - - `--confidence `: (low|medium|high) - - `--min-entropy `: Override default threshold - - `--no-binary`: Skip binary files - - `--no-extract-archives`: Do not scan inside archives - - `--extraction-depth `: Specifies how deep nested archives should be extracted and scanned (default: 2) - - `--redact`: Replaces discovered secrets with a one-way hash for secure output - - `--exclude `: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) - - `--baseline-file `: Ignore matches listed in a baseline YAML file - - `--manage-baseline`: Create or update the baseline file with current findings -+- `--skip-regex `: Ignore findings whose text matches this regex (repeatable) -+- `--skip-word `: Ignore findings containing this case-insensitive word (repeatable) - - - ## Finding Fingerprint - - The 
document below details the four-field formula (rule SHA-1, origin label, start & end offsets) hashed with XXH3-64 to create Kingfisher’s 64-bit finding fingerprint, and explains how this ID powers safe deduplication; plus how `--no-dedup` can be used shows every raw match. - See ([docs/FINGERPRINT.md](docs/FINGERPRINT.md)) - - ## Rule Performance Profiling - - Use `--rule-stats` to collect timing information for every rule. After scanning, the summary prints a **Rule Performance Stats** section showing how many matches each rule produced along with its slowest and average match times. Useful when creating rules or debugging rules. - - ## CLI Options - - ```bash - kingfisher scan --help - ``` - - ## Business Value - - By integrating Kingfisher into your development lifecycle, you can: - - - **Prevent Costly Breaches** - Early detection of embedded credentials avoids expensive incident response, legal fees, and reputation damage - - **Automate Compliance** - Enforce secret‑scanning policies across GitOps, CI/CD, and pull requests to help satisfy SOC 2, PCI‑DSS, GDPR, and other standards diff --git a/src/reporter/json_format.rs.orig b/src/reporter/json_format.rs.orig deleted file mode 100644 index 4020cee..0000000 --- a/src/reporter/json_format.rs.orig +++ /dev/null @@ -1,273 +0,0 @@ -use super::*; - -impl DetailsReporter { - pub fn json_format( - &self, - mut writer: W, - args: &cli::commands::scan::ScanArgs, - ) -> Result<()> { - let records = self.build_finding_records(args)?; - if !records.is_empty() { - serde_json::to_writer_pretty(&mut writer, &records)?; - writeln!(writer)?; - } - Ok(()) - } - - pub fn jsonl_format( - &self, - mut writer: W, - args: &cli::commands::scan::ScanArgs, - ) -> Result<()> { - let records = self.build_finding_records(args)?; - for record in records { - serde_json::to_writer(&mut writer, &record)?; - writeln!(writer)?; - } - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::cli::commands::github::GitCloneMode; - 
use crate::cli::commands::github::GitHistoryMode; - use crate::cli::commands::rules::RuleSpecifierArgs; - use crate::matcher::{SerializableCapture, SerializableCaptures}; - use crate::util::intern; - use crate::{ - blob::BlobId, - cli::commands::github::GitHubRepoType, - cli::commands::inputs::ContentFilteringArgs, - cli::commands::inputs::InputSpecifierArgs, - cli::commands::output::{OutputArgs, ReportOutputFormat}, - cli::commands::scan::ConfidenceLevel, - findings_store::FindingsStore, - location::{Location, OffsetSpan, SourcePoint, SourceSpan}, - matcher::Match, - origin::Origin, - reporter::styles::Styles, - }; - use std::{ - io::Cursor, - path::PathBuf, - sync::{Arc, Mutex}, - }; - use url::Url; - fn create_default_args() -> cli::commands::scan::ScanArgs { - use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope - - cli::commands::scan::ScanArgs { - num_jobs: 1, - no_dedup: false, - rules: RuleSpecifierArgs { - rules_path: Vec::new(), - rule: vec!["all".into()], - load_builtins: true, - }, - input_specifier_args: InputSpecifierArgs { - // local path / git URL inputs - path_inputs: Vec::new(), - git_url: Vec::new(), - - // GitHub - github_user: Vec::new(), - github_organization: Vec::new(), - all_github_organizations: false, - github_api_url: Url::parse("https://api.github.com/").unwrap(), - github_repo_type: GitHubRepoType::Source, - - // GitLab - gitlab_user: Vec::new(), - gitlab_group: Vec::new(), - all_gitlab_groups: false, - gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(), - gitlab_repo_type: GitLabRepoType::All, - gitlab_include_subgroups: false, - // Jira options - jira_url: None, - jql: None, - // Confluence options - confluence_url: None, - cql: None, - max_results: 100, - // Slack options - slack_query: None, - slack_api_url: Url::parse("https://slack.com/api/").unwrap(), - // s3 - s3_bucket: None, - s3_prefix: None, - role_arn: None, - aws_local_profile: None, - - docker_image: Vec::new(), - // clone / history options 
- git_clone: GitCloneMode::Bare, - git_history: GitHistoryMode::Full, - scan_nested_repos: true, - commit_metadata: true, - }, - content_filtering_args: ContentFilteringArgs { - max_file_size_mb: 25.0, - no_extract_archives: false, - extraction_depth: 2, - exclude: Vec::new(), // Exclude patterns - no_binary: true, - }, - confidence: ConfidenceLevel::Medium, - no_validate: false, - rule_stats: false, - only_valid: false, - min_entropy: None, - redact: false, - git_repo_timeout: 1800, // 30 minutes - output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, - baseline_file: None, - manage_baseline: false, - } - } - - fn create_mock_match( - rule_name: &str, - rule_text_id: &str, - rule_finding_fingerprint: &str, - validation_success: bool, - ) -> Match { - Match { - location: Location { - offset_span: OffsetSpan { start: 10, end: 20 }, - source_span: SourceSpan { - start: SourcePoint { line: 5, column: 10 }, - end: SourcePoint { line: 5, column: 20 }, - }, - }, - groups: SerializableCaptures { - captures: vec![SerializableCapture { - name: Some("token".to_string()), - match_number: 1, - start: 10, - end: 20, - value: "mock_token".into(), - }], - }, - blob_id: BlobId::new(b"mock_blob"), - finding_fingerprint: 0123, - rule_finding_fingerprint: intern(rule_finding_fingerprint), - rule_text_id: intern(rule_text_id), - rule_name: intern(rule_name), - rule_confidence: Confidence::Medium, - validation_response_body: "validation response".to_string(), - validation_response_status: 200, - validation_success, - calculated_entropy: 4.5, - visible: true, - } - } - - fn setup_mock_reporter(matches: Vec) -> DetailsReporter { - let mut datastore = FindingsStore::new(PathBuf::from("/tmp")); - if !matches.is_empty() { - let blob_metadata = BlobMetadata { - id: BlobId::new(b"mock_blob"), - num_bytes: 1024, - mime_essence: Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), - language: Some("Rust".to_string()), - }; - let dedup = true; - for m in 
matches.clone() { - datastore.record( - vec![( - Arc::new(OriginSet::new( - Origin::from_file(PathBuf::from("/mock/path/file.rs")), - vec![], - )), - Arc::new(blob_metadata.clone()), - m.m.clone(), - )], - dedup, - ); - } - } - DetailsReporter { - datastore: Arc::new(Mutex::new(datastore)), - styles: Styles::new(false), - only_valid: false, - } - } - - #[test] - fn test_json_format() -> Result<()> { - let mock_match = - create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); - let matches = vec![ReportMatch { - origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]), - blob_metadata: BlobMetadata { - id: BlobId::new(b"mock_blob"), - num_bytes: 1024, - mime_essence: Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), - language: Some("Rust".to_string()), - }, - m: mock_match, - comment: None, - match_confidence: Confidence::Medium, - visible: true, - validation_response_body: "validation response".to_string(), - validation_response_status: 200, - validation_success: true, - }]; - let reporter = setup_mock_reporter(matches); - let mut output = Cursor::new(Vec::new()); - reporter.json_format(&mut output, &create_default_args())?; - let json_output: Vec = serde_json::from_slice(&output.into_inner())?; - assert!(!json_output.is_empty(), "JSON output should not be empty"); - let first = &json_output[0]; - assert_eq!(first["rule"]["name"], "MockRule"); - assert_eq!(first["finding"]["language"], "Rust"); - Ok(()) - } - - #[test] - fn test_validation_status_in_json() -> Result<()> { - let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")]; - for (validation_success, expected_status) in test_cases { - let mock_match = create_mock_match( - "MockRule", - "mock_rule_1", - "mock_finding_fingerprint", - validation_success, - ); - let matches = vec![ReportMatch { - origin: OriginSet::new( - Origin::from_file(PathBuf::from("/mock/path/file.rs")), - vec![], - ), - blob_metadata: 
BlobMetadata { - id: BlobId::new(b"mock_blob"), - num_bytes: 1024, - mime_essence: Some("text/plain".to_string()), - charset: Some("UTF-8".to_string()), - language: Some("Rust".to_string()), - }, - m: mock_match, - comment: None, - match_confidence: Confidence::Medium, - visible: true, - validation_response_body: "validation response".to_string(), - validation_response_status: 200, - validation_success, - }]; - let reporter = setup_mock_reporter(matches); - let mut output = Cursor::new(Vec::new()); - reporter.json_format(&mut output, &create_default_args())?; - let json_output: Vec = serde_json::from_slice(&output.into_inner())?; - assert!(!json_output.is_empty(), "JSON output should not be empty"); - let first = &json_output[0]; - let validation_status = first["finding"]["validation"]["status"].as_str().unwrap(); - assert_eq!(validation_status, expected_status); - } - Ok(()) - } -}