From 394d05dd4d40f7460b80a7b83c53e7189e647aee Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 4 May 2026 23:10:16 -0700 Subject: [PATCH] preparing for v1.99.0 --- AGENTS.md | 2 +- CHANGELOG.md | 2 +- README.md | 12 +- docs-site/docs/changelog.md | 9 +- docs-site/docs/features/access-map.md | 3 +- docs-site/docs/index.md | 2 +- docs-site/docs/rules/builtin-rules.md | 46 +++++- docs-site/docs/usage/advanced.md | 12 +- docs-site/docs/usage/basic-scanning.md | 40 ++++++ docs-site/docs/usage/configuration.md | 63 +++++++-- docs-site/docs/usage/integrations.md | 149 +++++++++++++++++++- docs-site/mkdocs.yml | 2 +- docs-site/overrides/home.html | 2 +- docs-site/overrides/main.html | 2 +- docs-site/scripts/prepare-docs.py | 1 + docs/CONFIG.md | 67 ++++++--- docs/USAGE.md | 187 +++++++++++++++++++++++-- src/cli/config.rs | 85 ++++++----- src/cli/global.rs | 7 +- src/main.rs | 65 ++++++--- 20 files changed, 622 insertions(+), 136 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 03db3f5..0f5beac 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -7,7 +7,7 @@ Guidance for coding agents working in this repository. Kingfisher is an open-source secret scanner and live secret validator written in Rust by MongoDB. It detects, validates, and helps remediate leaked API keys, tokens, and credentials across code repositories, git history, and integrated platforms. 
Key capabilities: -- Secret detection with 942 built-in rules (820 standalone detectors + 122 dependent rules; 484 standalone detectors include live validation as of 2026-04-24) +- Secret detection with 950 built-in rules (826 standalone detectors + 124 dependent rules; 485 standalone detectors include live validation as of 2026-05-04) - Live credential validation against provider APIs - Direct secret revocation from CLI - Blast radius mapping (AWS, GCP, Azure, GitHub, GitLab, Slack) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d7f95d..c3dbe05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ All notable changes to this project will be documented in this file. - **Webhook alerting — `--alert-report-url` pivot link:** pass a CI run URL (or set `KINGFISHER_ALERT_REPORT_URL`) to embed a one-click "Full report →" link in every chat payload. In GitHub Actions, pair with `github.server_url/${{ github.repository }}/actions/runs/${{ github.run_id }}` to land the responder directly in the SARIF view for that run. - **Webhook alerting — fingerprints in chat payloads:** every finding rendered in detail mode now includes its stable `fingerprint` ID (e.g. `fp:1635470773610661884`), matching the value emitted in JSON/JSONL/SARIF/baseline outputs. SOAR playbooks and SIEM rules can use these IDs to dedupe across runs without a separate correlation step. - **Webhook alerting — scan target in all alert modes:** the "Target" line in chat payloads now correctly reflects the actual scan target for all input modes (GitHub org/user, GitLab group, Bitbucket workspace, S3/GCS bucket, Docker image, Jira/Confluence, Slack, Teams, Postman, etc.), not just local path scans. 
-- **`kingfisher.yaml` reaches near-CLI parity:** scalar overrides for `--confidence`, `--redact`, `--format`, `--baseline-file`, `--tls-mode`, validation tuning (timeout / retries / rps / per-rule rps), filters (`--max-file-size`, `--no-binary`, `--extraction-depth`, `--skip-aws-account*`), output (`--output`), git options (`--git-clone-dir`, `--keep-clones`, `--repo-clone-limit`, `--include-contributors`), `alerts.defaults.*`, and global flags (`--allow-internal-ips`, `--no-update-check`, `--user-agent-suffix`, `--endpoint`). Precedence is `CLI > env > config > built-in default` (clap `ValueSource` decides per-flag); list-typed values stay additive. Scan-target inputs (paths, `--git-url`, provider user/org/bucket flags) remain CLI-only by design. See `docs/CONFIG.md`. +- **`kingfisher.yaml` reaches near-CLI parity:** scalar overrides for `--confidence`, `--redact`, `--format`, `--baseline-file`, `--tls-mode`, validation tuning (timeout / retries / rps / per-rule rps), filters (`--max-file-size`, `--no-binary`, `--extraction-depth`, `--skip-aws-account*`), output (`--output`), git options (`--git-clone-dir`, `--keep-clones`, `--repo-clone-limit`, `--include-contributors`), `alerts.defaults.*`, and global flags (`--allow-internal-ips`, `--no-update-check`, `--user-agent-suffix`, `--endpoint`). Precedence is `CLI > env > config > built-in default` (clap `ValueSource` decides per-flag); list-typed values stay additive. Scan-target inputs (paths, `--git-url`, provider user/org/bucket flags) remain CLI-only by design. The config is loaded **only** when `--config FILE` is passed explicitly — there is no auto-discovery, so scan results never depend on which directory the binary was launched from. See `docs/CONFIG.md`. - **`kingfisher config init` subcommand:** convert an existing `kingfisher scan ...` invocation into a reusable `kingfisher.yaml` by replacing `scan` with `config init` (e.g. 
`kingfisher config init --confidence high --redact --exclude vendor/ > kingfisher.yaml`). Only flags the user actually supplied appear in the output — clap defaults are stripped — and scan-target inputs are dropped. Writes to stdout by default, or to `--out FILE` (with `--force` to overwrite). - **Access Map UI redesign** in the report viewer: identities are now grouped into collapsible per-provider sections (admin-bearing providers first); permissions are classified by severity (admin / privilege escalation / risky / read-only) with color-coded badges and rollup chips on each card header; the expanded card body renders permissions **once per group** with a "These permissions apply to all N resources above" banner instead of repeating the same 50+ badges per resource; duplicate-named identities (e.g., multiple MongoDB `admin` tokens) now display a discriminator subtitle (`identity_id · access_type`) so they're tellable apart; new "Critical only" toolbar toggle (persisted in `localStorage`) hides read-only permissions and zero-risk identities; the stats bar gained an admin-permission count. Imported TruffleHog/Gitleaks reports keep the previous flat rendering as a backwards-compatible fallback. Underlying JSON now includes `permissions_by_severity` and an `identity.context` discriminator on each `AccessMapEntry`. diff --git a/README.md b/README.md index a1d3ebd..00fe70c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ License - Detection Rules + Detection Rules
@@ -20,7 +20,7 @@ Kingfisher is an open source secret scanner and **live secret validation** tool built in Rust. -It combines Intel's SIMD-accelerated regex engine (Hyperscan) with language-aware parsing to achieve high accuracy at massive scale, and ships with [945 built-in rules](https://mongodb.github.io/kingfisher/rules/builtin-rules/) to detect, **validate**, and triage leaked API keys, tokens, and credentials before they ever reach production. +It combines Intel's SIMD-accelerated regex engine (Hyperscan) with language-aware parsing to achieve high accuracy at massive scale, and ships with [950 built-in rules](https://mongodb.github.io/kingfisher/rules/builtin-rules/) to detect, **validate**, and triage leaked API keys, tokens, and credentials before they ever reach production. Kingfisher also ships a **browser-based report viewer** that visualizes and triages findings from Kingfisher **and** from Gitleaks and TruffleHog JSON reports — so you can import scans from other tools and triage them in the same UI. 
A [hosted copy of the viewer](https://mongodb.github.io/kingfisher/viewer/) is published on the Kingfisher docs site [or run locally](#3-scan-and-view-results-in-browser) @@ -54,9 +54,9 @@ Kingfisher is a high-performance, open source secret detection tool for source c -### Performance, Accuracy, and 945 Rules +### Performance, Accuracy, and 950 Rules - **Performance**: multithreaded, Hyperscan‑powered scanning built for huge codebases -- **Extensible rules**: 945 built-in rules (485 with live validation) plus YAML-defined custom rules ([docs/RULES.md](/docs/RULES.md)) +- **Extensible rules**: 950 built-in rules plus YAML-defined custom rules ([docs/RULES.md](/docs/RULES.md)) - **Validate & Revoke**: live validation of discovered secrets, plus direct revocation for supported platforms (GitHub, GitLab, Slack, AWS, GCP, and more) ([docs/USAGE.md](/docs/USAGE.md)) - **Revocation support matrix**: current built-in revocation coverage across providers and rule IDs ([docs/REVOCATION_PROVIDERS.md](/docs/REVOCATION_PROVIDERS.md)) - **Blast Radius Mapping**: instantly map leaked keys to their effective cloud identities and exposed resources with `--access-map` (alias `--blast-radius`). Supports 43 providers (see table below). @@ -399,7 +399,7 @@ kingfisher scan /path/to/scan --access-map --view-report # Detection Rules -Kingfisher ships with [945 built-in rules](crates/kingfisher-rules/data/rules/) covering cloud keys, AI tokens, CI/CD secrets, database credentials, and SaaS API keys. Below is an overview — see the full list in [crates/kingfisher-rules/data/rules/](crates/kingfisher-rules/data/rules/): +Kingfisher ships with [950 built-in rules](crates/kingfisher-rules/data/rules/) covering cloud keys, AI tokens, CI/CD secrets, database credentials, and SaaS API keys. 
Below is an overview — see the full list in [crates/kingfisher-rules/data/rules/](crates/kingfisher-rules/data/rules/): | Category | What we catch | |----------|---------------| @@ -416,7 +416,7 @@ Kingfisher ships with [945 built-in rules](crates/kingfisher-rules/data/rules/) ## Write Custom Rules -Of Kingfisher's 945 built-in rules, 485 include HTTP and service-specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential. +Kingfisher ships with 950 built-in rules. However, you may want to add your own custom rules, or modify a detection to better suit your needs / environment. diff --git a/docs-site/docs/changelog.md b/docs-site/docs/changelog.md index eae6633..05ab499 100644 --- a/docs-site/docs/changelog.md +++ b/docs-site/docs/changelog.md @@ -7,7 +7,7 @@ description: "Kingfisher release history: new features, rules, bug fixes, and im All notable changes to this project will be documented in this file. -## [unreleased v1.99.0] +## [v1.99.0] - Fixed [#371](https://github.com/mongodb/kingfisher/issues/371): `pip install kingfisher-bin` on glibc Linux distros (Ubuntu, Debian, RHEL, Fedora, …) installed a macOS Mach-O binary and failed with `OSError: [Errno 8] Exec format error`. Linux wheels are now tagged `manylinux_2_17_.musllinux_1_2_` (instead of `musllinux_1_2_` only), so pip accepts them on both glibc-2.17+ and musl distros. The `pypi/hatch_build.py` hook now hard-fails when `KINGFISHER_PYPI_WHEEL_TAG` is unset, and the publish workflow refuses to upload any `py3-none-any.whl`, so the v1.92.0-era pure-Python wheel cannot recur. - `--self-update` (alias `--update`) on a scan or other command now **re-execs into the freshly installed binary** so the current invocation completes with the new code and the latest detection rules. Previously the on-disk binary was replaced but the running process kept using the old in-memory version, requiring a second invocation to pick up the changes. 
On Unix this is a true `exec()` (same PID); on Windows the new binary is spawned and the parent exits with its status code. The explicit `kingfisher self-update` subcommand still updates and exits without re-execing. Self-update now also covers Windows arm64 (the asset was already published; the runtime cfg map gained the missing arm). See `docs/ADVANCED.md` → *Update Checks*. - `--include-contributors` now respects `--github-repo-type` when enumerating contributor-owned repositories: by default contributor forks are excluded (matching the existing `Source` default), previously they were always included regardless of the flag. Added a new `--github-repo-type all` option to opt into the prior behavior of scanning both source and fork repos for contributors, organizations, and users. @@ -18,10 +18,15 @@ All notable changes to this project will be documented in this file. - **Webhook alerting — `--alert-report-url` pivot link:** pass a CI run URL (or set `KINGFISHER_ALERT_REPORT_URL`) to embed a one-click "Full report →" link in every chat payload. In GitHub Actions, pair with `github.server_url/${{ github.repository }}/actions/runs/${{ github.run_id }}` to land the responder directly in the SARIF view for that run. - **Webhook alerting — fingerprints in chat payloads:** every finding rendered in detail mode now includes its stable `fingerprint` ID (e.g. `fp:1635470773610661884`), matching the value emitted in JSON/JSONL/SARIF/baseline outputs. SOAR playbooks and SIEM rules can use these IDs to dedupe across runs without a separate correlation step. - **Webhook alerting — scan target in all alert modes:** the "Target" line in chat payloads now correctly reflects the actual scan target for all input modes (GitHub org/user, GitLab group, Bitbucket workspace, S3/GCS bucket, Docker image, Jira/Confluence, Slack, Teams, Postman, etc.), not just local path scans. 
-- **`kingfisher.yaml` reaches near-CLI parity:** scalar overrides for `--confidence`, `--redact`, `--format`, `--baseline-file`, `--tls-mode`, validation tuning (timeout / retries / rps / per-rule rps), filters (`--max-file-size`, `--no-binary`, `--extraction-depth`, `--skip-aws-account*`), output (`--output`), git options (`--git-clone-dir`, `--keep-clones`, `--repo-clone-limit`, `--include-contributors`), `alerts.defaults.*`, and global flags (`--allow-internal-ips`, `--no-update-check`, `--user-agent-suffix`, `--endpoint`). Precedence is `CLI > env > config > built-in default` (clap `ValueSource` decides per-flag); list-typed values stay additive. Scan-target inputs (paths, `--git-url`, provider user/org/bucket flags) remain CLI-only by design. See `docs/CONFIG.md` and `docs-site/docs/usage/configuration.md`. +- **`kingfisher.yaml` reaches near-CLI parity:** scalar overrides for `--confidence`, `--redact`, `--format`, `--baseline-file`, `--tls-mode`, validation tuning (timeout / retries / rps / per-rule rps), filters (`--max-file-size`, `--no-binary`, `--extraction-depth`, `--skip-aws-account*`), output (`--output`), git options (`--git-clone-dir`, `--keep-clones`, `--repo-clone-limit`, `--include-contributors`), `alerts.defaults.*`, and global flags (`--allow-internal-ips`, `--no-update-check`, `--user-agent-suffix`, `--endpoint`). Precedence is `CLI > env > config > built-in default` (clap `ValueSource` decides per-flag); list-typed values stay additive. Scan-target inputs (paths, `--git-url`, provider user/org/bucket flags) remain CLI-only by design. The config is loaded **only** when `--config FILE` is passed explicitly — there is no auto-discovery, so scan results never depend on which directory the binary was launched from. See `docs/CONFIG.md`. - **`kingfisher config init` subcommand:** convert an existing `kingfisher scan ...` invocation into a reusable `kingfisher.yaml` by replacing `scan` with `config init` (e.g. 
`kingfisher config init --confidence high --redact --exclude vendor/ > kingfisher.yaml`). Only flags the user actually supplied appear in the output — clap defaults are stripped — and scan-target inputs are dropped. Writes to stdout by default, or to `--out FILE` (with `--force` to overwrite). +- **Access Map UI redesign** in the report viewer: identities are now grouped into collapsible per-provider sections (admin-bearing providers first); permissions are classified by severity (admin / privilege escalation / risky / read-only) with color-coded badges and rollup chips on each card header; the expanded card body renders permissions **once per group** with a "These permissions apply to all N resources above" banner instead of repeating the same 50+ badges per resource; duplicate-named identities (e.g., multiple MongoDB `admin` tokens) now display a discriminator subtitle (`identity_id · access_type`) so they're tellable apart; new "Critical only" toolbar toggle (persisted in `localStorage`) hides read-only permissions and zero-risk identities; the stats bar gained an admin-permission count. Imported TruffleHog/Gitleaks reports keep the previous flat rendering as a backwards-compatible fallback. Underlying JSON now includes `permissions_by_severity` and an `identity.context` discriminator on each `AccessMapEntry`. ## [v1.98.0] +- Bounded disk usage for large multi-repo scans (e.g. `--include-contributors --repo-artifacts` against orgs with thousands of repos): cloning, artifact fetching, and scanning now run concurrently through bounded channels, and each cloned repo is removed from the temp directory as soon as its scan completes. On-disk footprint stays roughly `O(num_jobs)` regardless of total repo count instead of growing without bound. `--keep-clones` and `--git-clone-dir` opt out of the per-repo cleanup as before. 
+- Parallelized `--repo-artifacts` fetching with `buffer_unordered(num_jobs)` so issue/PR/wiki API calls run concurrently and stream into the scan loop, replacing the previous per-repo serial loop that delayed the start of scanning by hours on large fan-outs. +- Streamed `--format json` output as compact one-envelope-per-line so concatenated per-repo emits from the parallel scan path produce valid JSONL that `kingfisher view` can load. Pipe through `jq .` for pretty-printed output. +- Fixed a panic in the lexer when a string literal ends in a trailing backslash (`'... \`); the escape handling now clamps past-EOF so `extract_literal_values` returns instead of slicing out of bounds. - Added first-class **Postman** scanning target: new `kingfisher scan postman` subcommand (and equivalent `--postman-*` flags) fetches workspaces, collections, and environments via the Postman API and scans them for hard-coded credentials in request `auth` blocks, pre-request/test scripts, saved example responses, and — notably — `secret`-typed environment variables, which the API returns in plaintext despite the UI mask. Selectors: `--workspace`, `--collection`, `--environment`, `--all`, with optional `--include-mocks-monitors` and `--api-url` for self-hosted endpoints. Authenticates via `KF_POSTMAN_TOKEN` (or `POSTMAN_API_KEY`) sent as `X-Api-Key`; honors `X-RateLimit-RetryAfter` on 429s. Findings link back to `https://go.postman.co/...` URLs in reports. - Fixed [#359](https://github.com/mongodb/kingfisher/issues/359): added `kingfisher.github.9` to detect the new ~520-character stateless GitHub App installation token format (`ghs__`). The legacy 36-character `ghs_` rule (`kingfisher.github.5`) is retained for older / GHES-issued tokens that are still in circulation. 
- Added provider endpoint overrides for validation and revocation via global `--endpoint PROVIDER=URL` and `--endpoint-config FILE`, with built-in support for self-hosted GitHub, GitLab, Gitea, Jira, Confluence, and Artifactory instances. diff --git a/docs-site/docs/features/access-map.md b/docs-site/docs/features/access-map.md index fe27ee2..619f990 100644 --- a/docs-site/docs/features/access-map.md +++ b/docs-site/docs/features/access-map.md @@ -572,6 +572,5 @@ The `kingfisher blast-radius` and `kingfisher blast_radius` aliases also work fo ## Notes on access-map generation during `scan --access-map` - Access-map entries are only recorded for **validated** findings. -- The `--blast-radius` flag is an alias for `--access-map`. The `kingfisher blast-radius ` subcommand is also an alias for `kingfisher access-map `. - Some providers require extra context that Kingfisher infers from the finding context or validation response (for example, Azure DevOps organization name). -- Validated Hugging Face, Gitea, Bitbucket, Buildkite, Harness, OpenAI, Anthropic, Salesforce, Weights & Biases, Microsoft Teams, monday.com, Asana, and Pinecone credentials discovered during scans with `--access-map` (or `--blast-radius`) are automatically collected and mapped, matching the existing behavior for other platforms. +- Validated Hugging Face, Gitea, Bitbucket, Buildkite, Harness, OpenAI, Anthropic, Salesforce, Weights & Biases, Microsoft Teams, monday.com, Asana, and Pinecone credentials discovered during scans with `--access-map` (or the `--blast-radius` alias) are automatically collected and mapped, matching the existing behavior for other platforms. 
diff --git a/docs-site/docs/index.md b/docs-site/docs/index.md index b4ae0e0..e9f0847 100644 --- a/docs-site/docs/index.md +++ b/docs-site/docs/index.md @@ -2,7 +2,7 @@ title: Kingfisher — Open Source Secret Scanner with Live Validation description: >- Kingfisher is an open source secret scanner with live validation, blast radius - mapping, and credential revocation. 945 detection rules (485 with live validation), + mapping, and credential revocation. 950 detection rules (485 with live validation), plus a browser-based report viewer that also triages Gitleaks and TruffleHog output. Built in Rust by MongoDB. template: home.html diff --git a/docs-site/docs/rules/builtin-rules.md b/docs-site/docs/rules/builtin-rules.md index 19308a8..ff627f1 100644 --- a/docs-site/docs/rules/builtin-rules.md +++ b/docs-site/docs/rules/builtin-rules.md @@ -1,12 +1,12 @@ --- title: "Built-in Rules List" -description: "Complete list of all 945 built-in secret detection rules in Kingfisher. Searchable and filterable by provider, confidence level, and validation support." +description: "Complete list of all 950 built-in secret detection rules in Kingfisher. Searchable and filterable by provider, confidence level, and validation support." --- # Built-in Rules -Kingfisher ships with **945 detection rules** across **581 providers** -(822 detectors + 123 dependent rules). +Kingfisher ships with **950 detection rules** across **583 providers** +(826 detectors + 124 dependent rules). Of these, **485** include live validation and **50** support direct revocation. !!! tip "Search" @@ -508,6 +508,22 @@ Of these, **485** include live validation and **50** support direct revocation. +Auth +HTTP Basic Authorization Header +kingfisher.auth.1 +Medium + + + + +Auth +HTTP Bearer Authorization Header (non-JWT) +kingfisher.auth.2 +Medium + + + + Auth0 Auth0 Client ID kingfisher.auth0.1 @@ -4836,6 +4852,30 @@ Of these, **485** include live validation and **50** support direct revocation. 
+Onelogin +OneLogin Client ID +kingfisher.onelogin.1 +Medium + + + + +Onelogin +OneLogin Client Secret +kingfisher.onelogin.2 +Medium +Yes + + + +Onelogin +OneLogin Tenant Domain +kingfisher.onelogin.3 +Medium + + + + Onepassword 1Password Service-Account Token kingfisher.1password.1 diff --git a/docs-site/docs/usage/advanced.md b/docs-site/docs/usage/advanced.md index 57ade08..2c7edcb 100644 --- a/docs-site/docs/usage/advanced.md +++ b/docs-site/docs/usage/advanced.md @@ -438,11 +438,17 @@ See [FINGERPRINT.md](../features/fingerprints.md) for complete details. ## Update Checks -Kingfisher automatically queries GitHub for a newer release when it starts and tells you whether an update is available. +Kingfisher automatically queries GitHub for a newer release when it starts and tells you whether an update is available. The check is informational only — the binary is not modified unless you explicitly opt in. -- **Manual update** – Run `kingfisher update` to update the binary without scanning +- **Update and exit** – Run `kingfisher self-update` (alias `kingfisher update`) to download the latest release, replace the running binary in place, and exit. No scanning occurs. -- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely +- **Update then run with the new version** – Pass the global `--self-update` flag (alias `--update`) on any scan or other command. If a newer release exists, Kingfisher downloads it, replaces the on-disk binary, and **re-execs into the freshly installed binary** so the current invocation completes with the new code (including the latest detection rules). On Unix this is a true `exec()` (same PID); on Windows the new binary is spawned and the parent exits with its status code. If no update is available, the command runs normally with no extra steps. + +- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely. 
Recommended for CI runs to keep behavior reproducible. + +Self-update writes to wherever the running binary lives, so it requires the calling user to have write access to that location. If you installed Kingfisher via a package manager (Homebrew, the `.deb`/`.rpm` packages, the PyPI wrapper, etc.), use that package manager's upgrade command instead — Kingfisher will detect the permission error and tell you so. + +Self-update supports all six release platforms: Linux x64/arm64, macOS x64/arm64, and Windows x64/arm64. ## Exit Codes diff --git a/docs-site/docs/usage/basic-scanning.md b/docs-site/docs/usage/basic-scanning.md index 3172424..17bdde2 100644 --- a/docs-site/docs/usage/basic-scanning.md +++ b/docs-site/docs/usage/basic-scanning.md @@ -422,6 +422,46 @@ kingfisher scan ./my-project \ --exclude '[Tt]ests' ``` +### Project configuration file (`kingfisher.yaml`) + +Most `kingfisher scan` flags can be set as project defaults via a +`kingfisher.yaml` file passed with `--config FILE` (no auto-discovery). CLI +flags always win; config values fill in defaults. Lists are concatenated. + +```yaml +# kingfisher.yaml +scan: + confidence: high + redact: true +output: + format: sarif + path: ./kingfisher.sarif +filters: + exclude: + - vendor/ + - "**/node_modules/**" +alerts: + webhooks: + - url: https://hooks.slack.com/services/T0/B0/AAA + format: slack +``` + +```bash +kingfisher scan . --config ./kingfisher.yaml # config is loaded only when passed explicitly +kingfisher scan . --config ./kingfisher.yaml --confidence low # CLI flag overrides the config value +``` + +Don't write the YAML by hand. If you already have a long `kingfisher scan` +command, run the same flags under `kingfisher config init` to generate it: + +```bash +kingfisher config init \ + --confidence high --redact --exclude vendor/ --format sarif \ + > kingfisher.yaml +``` + +See [`docs/CONFIG.md`](../usage/configuration.md) for the full schema and precedence rules.
+ ### Scan changes in CI pipelines Limit scanning to the delta between your default branch and a pull request branch by combining `--since-commit` with `--branch` (defaults to `HEAD`). This only scans files that differ between the two references, which keeps CI runs fast while still blocking new secrets. diff --git a/docs-site/docs/usage/configuration.md b/docs-site/docs/usage/configuration.md index 0fff723..a6429f3 100644 --- a/docs-site/docs/usage/configuration.md +++ b/docs-site/docs/usage/configuration.md @@ -11,10 +11,18 @@ stays short. The file is **additive** for list/map values and **default-only** for scalars: a config value applies only when the user did not pass the matching `--flag`. CLI flags always win. -## Discovery +## Loading a config -- `--config FILE` overrides everything; an explicit path that fails to parse is fatal. -- Otherwise Kingfisher walks up from the current working directory looking for `kingfisher.yaml`. Missing config is silent. +Kingfisher does **not** auto-discover `kingfisher.yaml`. The file is loaded +only when you pass `--config FILE` explicitly: + +```bash +kingfisher scan . --config ./kingfisher.yaml +``` + +A missing or malformed file is a fatal error — there is no silent fallback, +so a typo in the path or a broken YAML block fails fast instead of running +with surprising defaults. ## Precedence @@ -68,15 +76,36 @@ GitHub/GitLab/etc. flags, S3/GCS buckets) are stripped — they describe | `baseline` | `--baseline-file`, `--manage-baseline` | | `alerts` | per-webhook entries + global `--alert-*` defaults | | `global` | TLS mode, internal-IP allow-list, endpoint overrides | -| `git` | clone dir, keep-clones, repo-clone-limit, include-contributors | +| `git` | clone dir, keep-clones, repo-clone-limit, include-contributors, GHE / self-hosted GitLab API URLs | A complete worked example, with every field annotated, lives in [`docs/CONFIG.md`](https://github.com/mongodb/kingfisher/blob/main/docs/CONFIG.md). 
-## Example: a minimal CI config +## End-to-end: create a config and scan with it + +### Step 1 — generate the config from your existing CLI command + +Don't write the YAML by hand. Run your existing `kingfisher scan ...` +flags under `kingfisher config init` and capture the output: + +```bash +kingfisher config init \ + --confidence high \ + --redact \ + --exclude vendor/ \ + --exclude '**/node_modules/**' \ + --exclude '**/__snapshots__/**' \ + --format sarif \ + --output ./kingfisher.sarif \ + --alert-min-confidence high \ + --alert-webhook https://hooks.slack.com/services/T0/B0/AAA \ + > kingfisher.yaml +``` + +The result, ready to commit: ```yaml -# kingfisher.yaml — checked into the repo root +# kingfisher.yaml — generated by `kingfisher config init`. scan: confidence: high redact: true @@ -93,13 +122,21 @@ alerts: min_confidence: high webhooks: - url: https://hooks.slack.com/services/T0/B0/AAA - format: slack ``` +### Step 2 — run the scan, passing the config explicitly + ```bash -kingfisher scan . # auto-discovers ./kingfisher.yaml -kingfisher scan . --config /etc/kf.yaml # explicit path -kingfisher scan . --confidence low # CLI overrides the config value +kingfisher scan . --config ./kingfisher.yaml +``` + +`--config FILE` is required — Kingfisher does not auto-discover +`kingfisher.yaml`. You can override any config value on the CLI for a +single run; CLI always wins: + +```bash +kingfisher scan . --config ./kingfisher.yaml --confidence low +# scan.confidence: high in YAML → CLI flag wins, runs at low confidence ``` ## What is *not* config-overridable @@ -125,6 +162,6 @@ checked-in config files. `kingfisher.yaml` is rejected at startup if it has unknown fields, malformed URLs in webhook entries, invalid regex, out-of-range numeric values, or -`endpoints` that don't follow `provider=url`. Use `--config /path/to/file.yaml` -to surface parse errors when iterating; auto-discovered configs that fail to -parse are also fatal. 
+`endpoints` that don't follow `provider=url`. There is no auto-discovery — +pass `--config FILE` explicitly so a missing or malformed file fails fast +instead of being silently skipped. diff --git a/docs-site/docs/usage/integrations.md b/docs-site/docs/usage/integrations.md index b6891b7..bdd12e2 100644 --- a/docs-site/docs/usage/integrations.md +++ b/docs-site/docs/usage/integrations.md @@ -214,6 +214,60 @@ kingfisher scan https://github.com/org/repo.git --repo-artifacts KF_GITHUB_TOKEN="ghp_…" kingfisher scan https://github.com/org/private_repo.git --repo-artifacts ``` +### Scan a GitHub Enterprise / self-hosted GitHub instance + +For GitHub Enterprise Server (GHES) or any self-hosted GitHub install, you +need two flags: + +- `--github-api-url <URL>` — points the **enumeration / clone** flow at the + custom API root (typically `https://ghe.example.com/api/v3/`). +- `--endpoint github=<URL>` — points the **token validation / revocation** + flow at the same instance, so any GitHub PATs Kingfisher discovers in the + scanned source are checked against your GHE rather than `api.github.com`. + +```bash +# 1. Scan every org repo on GHE and validate discovered tokens against the same instance +KF_GITHUB_TOKEN="ghp_…" kingfisher scan github \ + --organization my-org \ + --github-api-url https://ghe.corp.example.com/api/v3/ \ + --endpoint github=https://ghe.corp.example.com + +# 2. Scan a single GHE repo by URL (positional target) +KF_GITHUB_TOKEN="ghp_…" kingfisher scan https://ghe.corp.example.com/org/repo.git \ + --endpoint github=https://ghe.corp.example.com + +# 3. Scan ALL orgs on a GHE instance (requires non-default --github-api-url) +KF_GITHUB_TOKEN="ghp_…" kingfisher scan github \ + --all-orgs \ + --github-api-url https://ghe.corp.example.com/api/v3/ \ + --endpoint github=https://ghe.corp.example.com + +# 4. GHE on a private network — add --allow-internal-ips so the validator +# can reach RFC1918 / loopback hosts (SSRF guard is on by default).
+KF_GITHUB_TOKEN="ghp_…" kingfisher scan github \ + --organization my-org \ + --github-api-url https://ghe.internal/api/v3/ \ + --endpoint github=https://ghe.internal \ + --allow-internal-ips + +# 5. Validate a single PAT against GHE without scanning anything +kingfisher validate --rule github \ + --endpoint github=https://ghe.corp.example.com \ + "ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + +# 6. Revoke (delete) a confirmed-leaked PAT against GHE +kingfisher revoke --rule github \ + --endpoint github=https://ghe.corp.example.com \ + "ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +``` + +`--github-api-url` is the GHE *cloning* root that Kingfisher walks to +enumerate orgs, repos, and contributors. `--endpoint github=…` is the +*validator* root used to live-check discovered tokens. They are usually the +same host, but they're separate flags because some deployments front-load +auth (an SSO portal for repo access vs. a direct API endpoint for token +validation). + ## GitLab ### Scan GitLab group (requires `KF_GITLAB_TOKEN`) @@ -284,6 +338,99 @@ kingfisher scan https://gitlab.com/group/project.git --repo-artifacts KF_GITLAB_TOKEN="glpat-…" kingfisher scan https://gitlab.com/group/private_project.git --repo-artifacts ``` +### Scan a self-hosted (Omnibus / Cloud Native) GitLab instance + +For GitLab self-hosted (Omnibus, Helm, or Cloud Native), pair the +enumeration flag with a matching validation endpoint: + +- `--gitlab-api-url ` — points the **enumeration / clone** flow at + the custom GitLab root (typically `https://gitlab.example.com/`). +- `--endpoint gitlab=` — points the **token validation / revocation** + flow at the same instance, so any GitLab PATs found in the scanned + source are checked against your self-hosted GitLab rather than + `gitlab.com`. + +```bash +# 1. 
Scan a self-hosted group and validate discovered tokens against the same instance +KF_GITLAB_TOKEN="glpat-…" kingfisher scan gitlab \ + --group my-group \ + --include-subgroups \ + --gitlab-api-url https://gitlab.corp.example.com/ \ + --endpoint gitlab=https://gitlab.corp.example.com + +# 2. Scan a single self-hosted GitLab project by URL +KF_GITLAB_TOKEN="glpat-…" kingfisher scan https://gitlab.corp.example.com/group/project.git \ + --endpoint gitlab=https://gitlab.corp.example.com + +# 3. Scan ALL groups on a self-hosted GitLab (requires non-default --gitlab-api-url) +KF_GITLAB_TOKEN="glpat-…" kingfisher scan gitlab \ + --all-groups \ + --gitlab-api-url https://gitlab.corp.example.com/ \ + --endpoint gitlab=https://gitlab.corp.example.com + +# 4. Self-hosted GitLab on a private network — add --allow-internal-ips so +# the validator can reach RFC1918 / loopback hosts. +KF_GITLAB_TOKEN="glpat-…" kingfisher scan gitlab \ + --group my-group \ + --gitlab-api-url https://gitlab.internal/ \ + --endpoint gitlab=https://gitlab.internal \ + --allow-internal-ips + +# 5. Validate a single PAT against self-hosted GitLab without scanning anything +kingfisher validate --rule gitlab \ + --endpoint gitlab=https://gitlab.corp.example.com \ + "glpat-xxxxxxxxxxxxxxxxxxxx" + +# 6. 
Revoke (delete) a confirmed-leaked PAT against self-hosted GitLab +kingfisher revoke --rule gitlab \ + --endpoint gitlab=https://gitlab.corp.example.com \ + "glpat-xxxxxxxxxxxxxxxxxxxx" +``` + +### Many endpoints at once: `--endpoint-config` + +If you maintain a fleet of self-hosted instances (GHE, self-hosted GitLab, +Gitea, Jira DC, Confluence, Artifactory), put them in a single YAML file +and reference it instead of repeating `--endpoint` on every command: + +```yaml +# kingfisher-endpoints.yml +endpoints: + github: https://ghe.corp.example.com + gitlab: https://gitlab.corp.example.com + gitea: https://gitea.corp.example.com + jira: https://jira.corp.example.com + confluence: https://wiki.corp.example.com + artifactory: http://artifactory.internal:8081 +``` + +```bash +KF_GITHUB_TOKEN="ghp_…" KF_GITLAB_TOKEN="glpat-…" kingfisher scan github \ + --organization my-org \ + --github-api-url https://ghe.corp.example.com/api/v3/ \ + --endpoint-config ./kingfisher-endpoints.yml \ + --allow-internal-ips +``` + +### Tip: bake the endpoints into `kingfisher.yaml` + +Once you've worked out the right flags, capture them as project defaults +so every scan uses the same config: + +```bash +kingfisher config init \ + --github-api-url https://ghe.corp.example.com/api/v3/ \ + --gitlab-api-url https://gitlab.corp.example.com/ \ + --endpoint github=https://ghe.corp.example.com \ + --endpoint gitlab=https://gitlab.corp.example.com \ + --allow-internal-ips \ + > kingfisher.yaml + +# Then every scan inherits the same self-hosted defaults: +KF_GITHUB_TOKEN="ghp_…" kingfisher scan github --organization my-org \ + --config ./kingfisher.yaml +``` + ### List GitLab repositories ```bash @@ -658,7 +805,7 @@ The token is sent as the `X-Api-Key` header. Either `KF_POSTMAN_TOKEN` or `POSTM > Top-level `kingfisher scan --postman-*` flags remain accepted as hidden aliases for backward compatibility, but new usage should prefer the `kingfisher scan postman` subcommand shown above. 
-**Out of scope:** Postman Vault secrets are client-side and not reachable via the API. The Postman API Network does not expose a search endpoint; supply specific public-workspace IDs via `kingfisher scan postman --workspace` to scan public surfaces. +**Out of scope:** Postman Vault secrets are client-side and not reachable via the API. The Postman API Network does not expose a search endpoint; supply specific public-workspace IDs via `kingfisher scan postman --workspace` (hidden legacy alias: `--postman-workspace`) to scan public surfaces. ## Environment Variables diff --git a/docs-site/mkdocs.yml b/docs-site/mkdocs.yml index ff5f578..cd58d1b 100644 --- a/docs-site/mkdocs.yml +++ b/docs-site/mkdocs.yml @@ -1,7 +1,7 @@ site_name: Kingfisher site_url: https://mongodb.github.io/kingfisher site_description: >- - Open source secret scanner with live validation. 945 detection rules, + Open source secret scanner with live validation. 950 detection rules, blast radius mapping, credential revocation, and a browser-based report viewer that also imports Gitleaks and TruffleHog output. Built in Rust by MongoDB. diff --git a/docs-site/overrides/home.html b/docs-site/overrides/home.html index fed8406..a74fbc7 100644 --- a/docs-site/overrides/home.html +++ b/docs-site/overrides/home.html @@ -36,7 +36,7 @@
- 945 + 950 Detection Rules
diff --git a/docs-site/overrides/main.html b/docs-site/overrides/main.html index 2804a17..3d2331f 100644 --- a/docs-site/overrides/main.html +++ b/docs-site/overrides/main.html @@ -7,7 +7,7 @@ "@context": "https://schema.org", "@type": "SoftwareApplication", "name": "Kingfisher", - "description": "Open source secret scanner with live validation. 945 detection rules, blast radius mapping, and credential revocation.", + "description": "Open source secret scanner with live validation. 950 detection rules, blast radius mapping, and credential revocation.", "applicationCategory": "DeveloperApplication", "operatingSystem": "Linux, macOS, Windows", "license": "https://opensource.org/licenses/Apache-2.0", diff --git a/docs-site/scripts/prepare-docs.py b/docs-site/scripts/prepare-docs.py index 37fca39..8dea79e 100644 --- a/docs-site/scripts/prepare-docs.py +++ b/docs-site/scripts/prepare-docs.py @@ -108,6 +108,7 @@ LINK_REWRITES = { "ADVANCED.md": "../usage/advanced.md", "BASELINE.md": "../usage/baseline.md", "DEPLOYMENT.md": "../usage/deployment.md", + "CONFIG.md": "../usage/configuration.md", "ACCESS_MAP.md": "../features/access-map.md", "REVOCATION_PROVIDERS.md": "../features/revocation.md", "TOKEN_REVOCATION_SUPPORT.md": "../features/revocation.md", diff --git a/docs/CONFIG.md b/docs/CONFIG.md index c9ed36d..7c2ab14 100644 --- a/docs/CONFIG.md +++ b/docs/CONFIG.md @@ -7,11 +7,20 @@ concatenated); scalars are **default-only** — a config value applies only when the user did not pass the matching `--flag`. This keeps CI overrides predictable and makes the CLI authoritative. -## Discovery +## Loading a config -- `--config FILE` overrides everything; an explicit path that fails to parse is fatal. -- Otherwise Kingfisher walks up from the current working directory looking for - `kingfisher.yaml`. Missing config is silent. +Kingfisher does **not** auto-discover `kingfisher.yaml`. The file is loaded +only when you pass `--config FILE` explicitly: + +```bash +kingfisher scan . 
--config ./kingfisher.yaml +``` + +A missing or malformed file is a fatal error — there is no silent fallback, +so a typo in the path or a broken YAML block fails fast instead of running +with surprising defaults. Auto-discovery was rejected because it makes scan +results depend on where the binary was launched from, which is too easy to +get wrong in CI. ## Precedence @@ -25,14 +34,16 @@ but redundant. The one nuance: `rules.enabled` *replaces* the synthetic `["all"]` default when you don't pass `--rule`, so a config that lists `["custom"]` actually narrows the selection. -## Generating a config from an existing CLI invocation +## End-to-end: create a config and scan with it -Don't write the YAML by hand. If you already have a long -`kingfisher scan ...` command (or a CI step assembling flags), run the same -flags under `kingfisher config init` and capture the YAML: +### Step 1 — generate the config + +Don't write the YAML by hand. Take your existing `kingfisher scan ...` +command (or the CI step that builds it) and run the same flags under +`kingfisher config init`: ```bash -# Print to stdout, redirect to file +# Print to stdout, redirect to file: kingfisher config init \ --confidence high \ --redact \ @@ -45,9 +56,8 @@ kingfisher config init \ --tls-mode lax \ > kingfisher.yaml -# Or write directly: +# Or write the file directly (pass --force to overwrite): kingfisher config init [...flags...] --out kingfisher.yaml -# Pass --force to overwrite an existing file. ``` Only flags you actually supply appear in the output; clap defaults are @@ -55,6 +65,20 @@ omitted to keep the file minimal. Scan-target inputs (paths, `--git-url`, GitHub/GitLab/etc. flags, S3/GCS buckets) are stripped — they describe *what* this run scans and don't belong in shared project policy. +### Step 2 — run the scan, passing the config explicitly + +```bash +kingfisher scan . --config ./kingfisher.yaml +``` + +`--config FILE` is required: there is no auto-discovery. 
CLI flags can +still override any individual value for a single run: + +```bash +kingfisher scan . --config ./kingfisher.yaml --confidence low +# scan.confidence: high in YAML → CLI flag wins, runs at low confidence +``` + ## Webhook URL policy `alerts.webhooks[].url` (and `--alert-webhook URL`) **must use `https://`**. @@ -184,6 +208,8 @@ git: keep_clones: false # bool (--keep-clones) repo_clone_limit: null # int (--repo-clone-limit) include_contributors: false # bool (--include-contributors) + github_api_url: null # URL GHE / self-hosted GH (--github-api-url) + gitlab_api_url: null # URL self-hosted GitLab (--gitlab-api-url) ``` Unknown fields are rejected (typo protection). Empty sections and a missing @@ -191,15 +217,20 @@ top-level file are both fine. ## Example: CI workflow -```yaml -# .github/workflows/secrets.yml -- uses: mongodb/kingfisher/.github/actions/kingfisher@main - with: - config: ./kingfisher.yaml - alert-webhook: ${{ secrets.SLACK_SECURITY_WEBHOOK }} +A typical `kingfisher.yaml` for a CI repo, paired with a workflow step +that runs `kingfisher scan` against it: + +```bash +# .github/workflows/secrets.yml — run step +kingfisher scan . \ + --config ./kingfisher.yaml \ + --alert-webhook "$SLACK_SECURITY_WEBHOOK" +# `--alert-webhook` here is appended to any webhooks already in +# kingfisher.yaml (lists are additive). Everything else comes from the +# config file. ``` -A typical `kingfisher.yaml` for a CI repo: +The committed `kingfisher.yaml`: ```yaml scan: diff --git a/docs/USAGE.md b/docs/USAGE.md index 915fe49..74f6ed4 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -420,11 +420,31 @@ kingfisher scan ./my-project \ ### Project configuration file (`kingfisher.yaml`) Most `kingfisher scan` flags can be set as project defaults via a -`kingfisher.yaml` file in the repo root (or any ancestor directory). CLI -flags always win; config values fill in defaults. Lists are concatenated. +`kingfisher.yaml` file. 
CLI flags always win; config values fill in +defaults. Lists are concatenated. + +The config file is **never auto-discovered** — pass `--config FILE` +explicitly or it is not loaded. + +**Step 1 — generate the config from your existing CLI command** (don't +write the YAML by hand): + +```bash +kingfisher config init \ + --confidence high \ + --redact \ + --exclude vendor/ \ + --exclude '**/node_modules/**' \ + --format sarif \ + --output ./kingfisher.sarif \ + --alert-webhook https://hooks.slack.com/services/T0/B0/AAA \ + > kingfisher.yaml +``` + +The resulting `kingfisher.yaml`: ```yaml -# kingfisher.yaml +# kingfisher.yaml — generated by `kingfisher config init`. scan: confidence: high redact: true @@ -438,21 +458,19 @@ filters: alerts: webhooks: - url: https://hooks.slack.com/services/T0/B0/AAA - format: slack ``` +**Step 2 — run the scan, passing the config explicitly:** + ```bash -kingfisher scan . # auto-discovers ./kingfisher.yaml -kingfisher scan . --config /etc/kf.yaml # explicit path +kingfisher scan . --config ./kingfisher.yaml ``` -Don't write the YAML by hand. If you already have a long `kingfisher scan` -command, run the same flags under `kingfisher config init` to generate it: +You can override any config value on the CLI for a single run: ```bash -kingfisher config init \ - --confidence high --redact --exclude vendor/ --format sarif \ - > kingfisher.yaml +kingfisher scan . --config ./kingfisher.yaml --confidence low +# scan.confidence: high in YAML → CLI flag wins, runs at low confidence ``` See [`docs/CONFIG.md`](CONFIG.md) for the full schema and precedence rules. 
@@ -723,6 +741,60 @@ kingfisher scan https://github.com/org/repo.git --repo-artifacts KF_GITHUB_TOKEN="ghp_…" kingfisher scan https://github.com/org/private_repo.git --repo-artifacts ``` +### Scan a GitHub Enterprise / self-hosted GitHub instance + +For GitHub Enterprise Server (GHES) or any self-hosted GitHub install, you +need two flags: + +- `--github-api-url ` — points the **enumeration / clone** flow at the + custom API root (typically `https://ghe.example.com/api/v3/`). +- `--endpoint github=` — points the **token validation / revocation** + flow at the same instance, so any GitHub PATs Kingfisher discovers in the + scanned source are checked against your GHE rather than `api.github.com`. + +```bash +# 1. Scan every org repo on GHE and validate discovered tokens against the same instance +KF_GITHUB_TOKEN="ghp_…" kingfisher scan github \ + --organization my-org \ + --github-api-url https://ghe.corp.example.com/api/v3/ \ + --endpoint github=https://ghe.corp.example.com + +# 2. Scan a single GHE repo by URL (positional target) +KF_GITHUB_TOKEN="ghp_…" kingfisher scan https://ghe.corp.example.com/org/repo.git \ + --endpoint github=https://ghe.corp.example.com + +# 3. Scan ALL orgs on a GHE instance (requires non-default --github-api-url) +KF_GITHUB_TOKEN="ghp_…" kingfisher scan github \ + --all-orgs \ + --github-api-url https://ghe.corp.example.com/api/v3/ \ + --endpoint github=https://ghe.corp.example.com + +# 4. GHE on a private network — add --allow-internal-ips so the validator +# can reach RFC1918 / loopback hosts (SSRF guard is on by default). +KF_GITHUB_TOKEN="ghp_…" kingfisher scan github \ + --organization my-org \ + --github-api-url https://ghe.internal/api/v3/ \ + --endpoint github=https://ghe.internal \ + --allow-internal-ips + +# 5. Validate a single PAT against GHE without scanning anything +kingfisher validate --rule github \ + --endpoint github=https://ghe.corp.example.com \ + "ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + +# 6. 
Revoke (delete) a confirmed-leaked PAT against GHE +kingfisher revoke --rule github \ + --endpoint github=https://ghe.corp.example.com \ + "ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +``` + +> **Why two URLs?** `--github-api-url` is the GHE *cloning* root that +> Kingfisher walks to enumerate orgs, repos, and contributors. +> `--endpoint github=…` is the *validator* root used to live-check +> discovered tokens. They are usually the same host, but they're separate +> flags because some deployments front-load auth (an SSO portal for repo +> access vs. a direct API endpoint for token validation). + --- ## GitLab @@ -783,6 +855,99 @@ kingfisher scan https://gitlab.com/group/project.git --repo-artifacts KF_GITLAB_TOKEN="glpat-…" kingfisher scan https://gitlab.com/group/private_project.git --repo-artifacts ``` +### Scan a self-hosted (Omnibus / Cloud Native) GitLab instance + +For GitLab self-hosted (Omnibus, Helm, or Cloud Native), pair the +enumeration flag with a matching validation endpoint, just like with GHE: + +- `--gitlab-api-url ` — points the **enumeration / clone** flow at + the custom GitLab root (typically `https://gitlab.example.com/`). +- `--endpoint gitlab=` — points the **token validation / revocation** + flow at the same instance, so any GitLab PATs found in the scanned + source are checked against your self-hosted GitLab rather than + `gitlab.com`. + +```bash +# 1. Scan a self-hosted group and validate discovered tokens against the same instance +KF_GITLAB_TOKEN="glpat-…" kingfisher scan gitlab \ + --group my-group \ + --include-subgroups \ + --gitlab-api-url https://gitlab.corp.example.com/ \ + --endpoint gitlab=https://gitlab.corp.example.com + +# 2. Scan a single self-hosted GitLab project by URL +KF_GITLAB_TOKEN="glpat-…" kingfisher scan https://gitlab.corp.example.com/group/project.git \ + --endpoint gitlab=https://gitlab.corp.example.com + +# 3. 
Scan ALL groups on a self-hosted GitLab (requires non-default --gitlab-api-url) +KF_GITLAB_TOKEN="glpat-…" kingfisher scan gitlab \ + --all-groups \ + --gitlab-api-url https://gitlab.corp.example.com/ \ + --endpoint gitlab=https://gitlab.corp.example.com + +# 4. Self-hosted GitLab on a private network — add --allow-internal-ips so +# the validator can reach RFC1918 / loopback hosts. +KF_GITLAB_TOKEN="glpat-…" kingfisher scan gitlab \ + --group my-group \ + --gitlab-api-url https://gitlab.internal/ \ + --endpoint gitlab=https://gitlab.internal \ + --allow-internal-ips + +# 5. Validate a single PAT against self-hosted GitLab without scanning anything +kingfisher validate --rule gitlab \ + --endpoint gitlab=https://gitlab.corp.example.com \ + "glpat-xxxxxxxxxxxxxxxxxxxx" + +# 6. Revoke (delete) a confirmed-leaked PAT against self-hosted GitLab +kingfisher revoke --rule gitlab \ + --endpoint gitlab=https://gitlab.corp.example.com \ + "glpat-xxxxxxxxxxxxxxxxxxxx" +``` + +### Many endpoints at once: `--endpoint-config` + +If you maintain a fleet of self-hosted instances (GHE, self-hosted GitLab, +Gitea, Jira DC, Confluence, Artifactory), put them in a single YAML file +and reference it instead of repeating `--endpoint` on every command: + +```yaml +# kingfisher-endpoints.yml +endpoints: + github: https://ghe.corp.example.com + gitlab: https://gitlab.corp.example.com + gitea: https://gitea.corp.example.com + jira: https://jira.corp.example.com + confluence: https://wiki.corp.example.com + artifactory: http://artifactory.internal:8081 +``` + +```bash +KF_GITHUB_TOKEN="ghp_…" KF_GITLAB_TOKEN="glpat-…" kingfisher scan github \ + --organization my-org \ + --github-api-url https://ghe.corp.example.com/api/v3/ \ + --endpoint-config ./kingfisher-endpoints.yml \ + --allow-internal-ips +``` + +### Tip: bake the endpoints into `kingfisher.yaml` + +Once you've worked out the right flags, capture them as project defaults +so every scan uses the same config: + +```bash +kingfisher 
config init \ + --github-api-url https://ghe.corp.example.com/api/v3/ \ + --gitlab-api-url https://gitlab.corp.example.com/ \ + --endpoint github=https://ghe.corp.example.com \ + --endpoint gitlab=https://gitlab.corp.example.com \ + --allow-internal-ips \ + > kingfisher.yaml + +# Then every scan inherits the same self-hosted defaults: +KF_GITHUB_TOKEN="ghp_…" kingfisher scan github --organization my-org \ + --config ./kingfisher.yaml +``` + ### List GitLab repositories ```bash diff --git a/src/cli/config.rs b/src/cli/config.rs index 7174cd6..7e8f1e8 100644 --- a/src/cli/config.rs +++ b/src/cli/config.rs @@ -61,7 +61,9 @@ use crate::cli::commands::output::ReportOutputFormat; use crate::cli::commands::scan::ConfidenceLevel; use crate::cli::global::TlsMode; -/// File name auto-discovered when the user does not pass `--config`. +/// Conventional file name when users save a project-local config. The path +/// must still be passed explicitly via `--config`; nothing in the binary +/// auto-loads a file with this name. pub const DEFAULT_CONFIG_NAME: &str = "kingfisher.yaml"; #[derive(Debug, Clone, Default, Serialize, Deserialize)] @@ -252,7 +254,7 @@ pub struct GlobalConfig { } // ---------------------------------------------------------------------------- -// git: clone behavior for git scans. +// git: clone behavior + provider API roots for git scans. // ---------------------------------------------------------------------------- #[derive(Debug, Clone, Default, Serialize, Deserialize)] #[serde(deny_unknown_fields)] @@ -261,6 +263,17 @@ pub struct GitConfig { pub keep_clones: Option, pub repo_clone_limit: Option, pub include_contributors: Option, + /// GitHub Enterprise / self-hosted GitHub API root used during enumeration + /// and cloning. Equivalent to `--github-api-url` on the bare `scan` form + /// or `--api-url` on `kingfisher scan github`. For *validation* of + /// discovered tokens against the same instance, set + /// `global.endpoints` (e.g. 
`github=https://ghe.example.com`). + pub github_api_url: Option, + /// Self-hosted GitLab API root used during enumeration and cloning. + /// Equivalent to `--gitlab-api-url`. Pair with a matching + /// `global.endpoints` `gitlab=...` entry to also redirect token + /// validation to the same instance. + pub gitlab_api_url: Option, } // ---------------------------------------------------------------------------- @@ -364,10 +377,6 @@ impl From for ConfigReportFormat { } } -/// Cap on `discover_path` upward walks. Avoids unbounded directory traversal -/// on networked filesystems or pathological mount layouts. -const DISCOVER_MAX_DEPTH: usize = 32; - /// Parse YAML text into a config struct, validating webhook URLs, regex /// patterns, range-bounded scalars, and endpoint formats so config errors /// surface at the `--config` site rather than mid-scan. @@ -440,33 +449,22 @@ fn validate(cfg: &KingfisherConfig) -> Result<()> { .with_context(|| format!("global.endpoints[{idx}] URL is not valid"))?; } + // git.github_api_url / git.gitlab_api_url — must parse as URLs. + if let Some(u) = &cfg.git.github_api_url { + url::Url::parse(u).context("git.github_api_url is not a valid URL")?; + } + if let Some(u) = &cfg.git.gitlab_api_url { + url::Url::parse(u).context("git.gitlab_api_url is not a valid URL")?; + } + // alerts.defaults.report_url already checked above. Ok(()) } -/// Walk upward from `start` looking for `kingfisher.yaml` in each ancestor -/// directory. Returns the absolute path when found. Performs *no* file reads — -/// the caller does the read once it has decided which file to use. Capped at -/// [`DISCOVER_MAX_DEPTH`] levels to bound the walk on networked filesystems. 
-pub fn discover_path(start: &std::path::Path) -> Option { - let mut current = start.to_path_buf(); - for _ in 0..=DISCOVER_MAX_DEPTH { - let candidate = current.join(DEFAULT_CONFIG_NAME); - if candidate.is_file() { - return Some(candidate); - } - if !current.pop() { - return None; - } - } - None -} - #[cfg(test)] mod tests { use super::*; - use tempfile::TempDir; #[test] fn parse_minimal_alerts() { @@ -547,6 +545,8 @@ git: clone_dir: "./clones" keep_clones: true repo_clone_limit: 50 + github_api_url: https://ghe.example.com/api/v3/ + gitlab_api_url: https://gitlab.example.com/ "#; let cfg = parse_str(yaml).unwrap(); assert!(matches!(cfg.scan.confidence, Some(ConfigConfidence::High))); @@ -569,6 +569,20 @@ git: assert_eq!(cfg.global.endpoints.len(), 1); assert_eq!(cfg.git.clone_dir.as_deref().map(|p| p.to_str().unwrap()), Some("./clones")); assert_eq!(cfg.git.keep_clones, Some(true)); + assert_eq!(cfg.git.github_api_url.as_deref(), Some("https://ghe.example.com/api/v3/")); + assert_eq!(cfg.git.gitlab_api_url.as_deref(), Some("https://gitlab.example.com/")); + } + + #[test] + fn invalid_git_github_api_url_is_rejected() { + let err = parse_str("git:\n github_api_url: 'not_a_url'\n").unwrap_err(); + assert!(format!("{err:#}").contains("git.github_api_url")); + } + + #[test] + fn invalid_git_gitlab_api_url_is_rejected() { + let err = parse_str("git:\n gitlab_api_url: 'also not a url'\n").unwrap_err(); + assert!(format!("{err:#}").contains("git.gitlab_api_url")); } #[test] @@ -700,25 +714,4 @@ git: {} let err = parse_str("global:\n endpoints:\n - '=https://example.com/'\n").unwrap_err(); assert!(format!("{err:#}").contains("global.endpoints[0]")); } - - #[test] - fn discover_walks_upward() { - let temp = TempDir::new().unwrap(); - let nested = temp.path().join("a/b/c"); - std::fs::create_dir_all(&nested).unwrap(); - let cfg_path = temp.path().join(DEFAULT_CONFIG_NAME); - std::fs::write(&cfg_path, "alerts: { webhooks: [] }\n").unwrap(); - let found = 
discover_path(&nested).unwrap(); - assert_eq!( - std::fs::canonicalize(&found).unwrap(), - std::fs::canonicalize(&cfg_path).unwrap() - ); - } - - #[test] - fn discover_returns_none_when_absent() { - let temp = TempDir::new().unwrap(); - let found = discover_path(temp.path()); - assert!(found.is_none()); - } } diff --git a/src/cli/global.rs b/src/cli/global.rs index ddb852e..cceca20 100644 --- a/src/cli/global.rs +++ b/src/cli/global.rs @@ -170,9 +170,10 @@ pub struct GlobalArgs { pub endpoint_config: Option, /// Path to a `kingfisher.yaml` project config file. - /// If omitted, Kingfisher walks up from the current working directory - /// looking for `kingfisher.yaml`. List-typed config values are concatenated - /// onto matching CLI flags; scalar config values are applied only when the + /// + /// **No auto-discovery** — the file is loaded only when this flag is + /// passed explicitly. List-typed config values are concatenated onto + /// matching CLI flags; scalar config values are applied only when the /// matching `--flag` was not passed (precedence: CLI > env > config > /// built-in default). See `docs/CONFIG.md` for the full schema. #[arg(global = true, long = "config", value_name = "FILE")] diff --git a/src/main.rs b/src/main.rs index c66e698..dd04318 100644 --- a/src/main.rs +++ b/src/main.rs @@ -298,31 +298,24 @@ fn setup_logging(global_args: &GlobalArgs) { } /// Resolve and read a `kingfisher.yaml` project config. -/// - Explicit `--config ` is required to exist; missing/unreadable is an error. -/// - Otherwise we walk up from CWD looking for `kingfisher.yaml`. Missing is fine. +/// +/// The config file is loaded **only** when the user passes `--config ` +/// explicitly. There is intentionally no auto-discovery — relying on a +/// `kingfisher.yaml` that happens to sit in the cwd (or any ancestor +/// directory) makes scan results depend on where the binary was invoked +/// from, which is too easy to get wrong in CI. 
If the explicit path is +/// missing or fails to parse, that is a fatal error. fn load_project_config( explicit: Option<&std::path::Path>, ) -> Result> { - let path = match explicit { - Some(p) => Some(p.to_path_buf()), - None => { - let cwd = std::env::current_dir().context("read CWD for config discovery")?; - kingfisher::cli::config::discover_path(&cwd) - } - }; - match path { - Some(p) => { - let bytes = - std::fs::read(&p).with_context(|| format!("read config {}", p.display()))?; - let yaml = String::from_utf8(bytes) - .with_context(|| format!("config {} is not UTF-8", p.display()))?; - let cfg = kingfisher::cli::config::parse_str(&yaml) - .with_context(|| format!("parse config {}", p.display()))?; - info!("loaded config from {}", p.display()); - Ok(Some(cfg)) - } - None => Ok(None), - } + let Some(p) = explicit else { return Ok(None) }; + let bytes = std::fs::read(p).with_context(|| format!("read config {}", p.display()))?; + let yaml = + String::from_utf8(bytes).with_context(|| format!("config {} is not UTF-8", p.display()))?; + let cfg = kingfisher::cli::config::parse_str(&yaml) + .with_context(|| format!("parse config {}", p.display()))?; + info!("loaded config from {}", p.display()); + Ok(Some(cfg)) } /// Merge config-file values into clap-parsed args. @@ -632,6 +625,24 @@ fn apply_config( scan_args.input_specifier_args.include_contributors = v; } } + // Provider API roots for enumeration / cloning. We accept the YAML value + // as `String` (the schema serializer keeps it stable across `Url`'s + // trailing-slash normalization), then parse to a `Url` for the runtime + // field. parse_str() already validated this, so the re-parse below is + // expected to succeed; if it somehow fails, the config value is + // skipped and the CLI / built-in default stays in effect. 
+ if let Some(u) = &cfg.git.github_api_url + && config_wins(scan_matches, "github_api_url") + && let Ok(parsed) = url::Url::parse(u) + { + scan_args.input_specifier_args.github_api_url = parsed; + } + if let Some(u) = &cfg.git.gitlab_api_url + && config_wins(scan_matches, "gitlab_api_url") + && let Ok(parsed) = url::Url::parse(u) + { + scan_args.input_specifier_args.gitlab_api_url = parsed; + } } /// Run `kingfisher config `. @@ -935,6 +946,16 @@ fn build_config_yaml( if user_set(sub_matches, "include_contributors") { git.include_contributors = Some(scan_args.input_specifier_args.include_contributors); } + // Provider API roots are stored as `Url` on the runtime side; the YAML + // schema is a `String`. The value emitted here is `Url`'s normalized + // form, so it can differ from what the user typed (e.g. `Url` adds a + // trailing `/` on bare-host URLs). + if user_set(sub_matches, "github_api_url") { + git.github_api_url = Some(scan_args.input_specifier_args.github_api_url.to_string()); + } + if user_set(sub_matches, "gitlab_api_url") { + git.gitlab_api_url = Some(scan_args.input_specifier_args.gitlab_api_url.to_string()); + } cfg.git = git; // Serialize, then prune null/empty mappings so the YAML is concise.