diff --git a/.clusterfuzzlite/build.sh b/.clusterfuzzlite/build.sh index c9712bc..7abfc4e 100755 --- a/.clusterfuzzlite/build.sh +++ b/.clusterfuzzlite/build.sh @@ -1,9 +1,28 @@ #!/bin/bash -eu -# Install build dependencies required by vendored vectorscan (C/C++) -apt-get update -qq -apt-get install -y --no-install-recommends \ - cmake pkg-config libboost-dev patch ragel +# Install build dependencies required by vendored vectorscan (C/C++). +# +# OSS-Fuzz's Ubuntu 20.04 mirrors are intermittently flaky from the +# ClusterFuzzLite runners — a single connection-failed on archive.ubuntu.com +# while fetching e.g. libxml2 used to fail the entire build. Retry the +# update + install up to 5 times with `--fix-missing` so a transient hiccup +# doesn't block a PR. +APT_PACKAGES=(cmake pkg-config libboost-dev patch ragel) +apt_install_with_retry() { + local attempt + for attempt in 1 2 3 4 5; do + if apt-get update -qq \ + && apt-get install -y --no-install-recommends --fix-missing \ + "${APT_PACKAGES[@]}"; then + return 0 + fi + echo "apt-get attempt ${attempt} failed; retrying after backoff..." >&2 + sleep $((attempt * 5)) + done + echo "apt-get failed after 5 attempts" >&2 + return 1 +} +apt_install_with_retry cd "$SRC/kingfisher" diff --git a/docs-site/docs/blog/posts/2026-05-04-real-time-secret-alerts-webhooks.md b/docs-site/docs/blog/posts/2026-05-04-real-time-secret-alerts-webhooks.md index 1b0c489..9db1839 100644 --- a/docs-site/docs/blog/posts/2026-05-04-real-time-secret-alerts-webhooks.md +++ b/docs-site/docs/blog/posts/2026-05-04-real-time-secret-alerts-webhooks.md @@ -166,8 +166,9 @@ choice: ## Declarative setup with `kingfisher.yaml` -Long CLI invocations get awkward in CI. Drop a `kingfisher.yaml` next to the -repo root and Kingfisher auto-discovers it: +Long CLI invocations get awkward in CI. 
Drop a `kingfisher.yaml` at +the repo root and pass `--config ./kingfisher.yaml` so Kingfisher loads +it (the file is never auto-discovered — the path must be explicit): ```yaml alerts: diff --git a/docs-site/docs/usage/configuration.md b/docs-site/docs/usage/configuration.md index a6429f3..6d63219 100644 --- a/docs-site/docs/usage/configuration.md +++ b/docs-site/docs/usage/configuration.md @@ -1,15 +1,16 @@ --- title: "Project Configuration (kingfisher.yaml)" -description: "Set project-wide defaults for kingfisher scan via a kingfisher.yaml file. Most CLI flags can live in config; CLI flags always win over config values." +description: "Use kingfisher.yaml as project-default policy: confidence, filters, output, alerts, and global flags. Loaded only via --config FILE." --- # Project Configuration (`kingfisher.yaml`) -Long `kingfisher scan` invocations get awkward in CI. A project-local -`kingfisher.yaml` lets you set most flags as defaults so the actual command -stays short. The file is **additive** for list/map values and **default-only** -for scalars: a config value applies only when the user did not pass the -matching `--flag`. CLI flags always win. +Long CLI invocations are awkward in CI. Kingfisher loads a project-local +`kingfisher.yaml` to provide defaults for nearly every `kingfisher scan` flag, +plus alert webhooks and filter lists. Lists are **additive** (config + CLI +concatenated); scalars are **default-only** — a config value applies only when +the user did not pass the matching `--flag`. This keeps CI overrides +predictable and makes the CLI authoritative. ## Loading a config @@ -22,7 +23,9 @@ kingfisher scan . --config ./kingfisher.yaml A missing or malformed file is a fatal error — there is no silent fallback, so a typo in the path or a broken YAML block fails fast instead of running -with surprising defaults. 
Auto-discovery was rejected because it makes scan +results depend on where the binary was launched from, which is too easy to +get wrong in CI. ## Precedence @@ -32,16 +35,21 @@ CLI flag > environment variable > kingfisher.yaml > built-in default For list-typed values both sources are concatenated, so passing `--skip-word EXAMPLE` and listing `EXAMPLE` again in `kingfisher.yaml` is safe -but redundant. +but redundant. The one nuance: `rules.enabled` *replaces* the synthetic +`["all"]` default when you don't pass `--rule`, so a config that lists +`["custom"]` actually narrows the selection. -## Generating a config from an existing CLI invocation +## End-to-end: create a config and scan with it -Don't write the YAML by hand. If you already have a long `kingfisher scan` -command, run the same flags under `kingfisher config init` and capture the -output: +### Step 1 — generate the config + +Don't write the YAML by hand. Start from the **scan-default flags** you +already pass to `kingfisher scan` (the policy-shaped ones — confidence, +redaction, filters, output, alerts, TLS, self-hosted API roots) and pass +them to `kingfisher config init`: ```bash -# Print to stdout, redirect to a file +# Print to stdout, redirect to file: kingfisher config init \ --confidence high \ --redact \ @@ -52,60 +60,197 @@ kingfisher config init \ --alert-min-confidence high \ --alert-webhook https://hooks.slack.com/services/T0/B0/AAA \ --tls-mode lax \ + --github-api-url https://ghe.corp.example.com/api/v3/ \ + --gitlab-api-url https://gitlab.corp.example.com/ \ > kingfisher.yaml -# Or write directly: +# Or write the file directly (pass --force to overwrite): kingfisher config init [...flags...] --out kingfisher.yaml -# Pass --force to overwrite an existing file. ``` Only flags you actually supply appear in the output; clap defaults are omitted to keep the file minimal. Scan-target inputs (paths, `--git-url`, -GitHub/GitLab/etc. 
flags, S3/GCS buckets) are stripped — they describe -*what* this run scans and don't belong in shared project policy. +GitHub/GitLab/etc. user/org/group flags, S3/GCS buckets) are stripped — +they describe *what* this run scans and don't belong in shared project +policy. -## Schema (top-level sections) +> **Important:** `config init` does **not** accept the provider-subcommand +> form. `kingfisher scan gitlab --group my-group --api-url https://...` +> cannot be pasted verbatim — `config init` has no `gitlab` subcommand, +> and `--group` / the subcommand-scoped `--api-url` are not accepted at +> the top level. Use the top-level aliases instead: `--gitlab-api-url` +> for the GitLab API root and `--github-api-url` for GHE. Target +> selectors like `--group` / `--organization` are intentionally CLI-only +> and have no config-file equivalent. -| Section | What it sets | -|--------------|---------------------------------------------------------------------| -| `scan` | confidence, redact, only-valid, turbo, jobs, etc. | -| `rules` | `--rule` ruleset selection, `--rules-path`, `--load-builtins` | -| `validation` | timeout, retries, RPS limits (global + per-rule) | -| `filters` | skip-words / skip-regex / exclude / max-file-size / archive depth | -| `output` | `--format`, `--output` (report destination) | -| `baseline` | `--baseline-file`, `--manage-baseline` | -| `alerts` | per-webhook entries + global `--alert-*` defaults | -| `global` | TLS mode, internal-IP allow-list, endpoint overrides | -| `git` | clone dir, keep-clones, repo-clone-limit, include-contributors, GHE / self-hosted GitLab API URLs | - -A complete worked example, with every field annotated, lives in -[`docs/CONFIG.md`](https://github.com/mongodb/kingfisher/blob/main/docs/CONFIG.md). - -## End-to-end: create a config and scan with it - -### Step 1 — generate the config from your existing CLI command - -Don't write the YAML by hand. 
Run your existing `kingfisher scan ...` -flags under `kingfisher config init` and capture the output: +### Step 2 — run the scan, passing the config explicitly ```bash -kingfisher config init \ - --confidence high \ - --redact \ - --exclude vendor/ \ - --exclude '**/node_modules/**' \ - --exclude '**/__snapshots__/**' \ - --format sarif \ - --output ./kingfisher.sarif \ - --alert-min-confidence high \ - --alert-webhook https://hooks.slack.com/services/T0/B0/AAA \ - > kingfisher.yaml +kingfisher scan . --config ./kingfisher.yaml ``` -The result, ready to commit: +`--config FILE` is required: there is no auto-discovery. CLI flags can +still override any individual value for a single run: + +```bash +kingfisher scan . --config ./kingfisher.yaml --confidence low +# scan.confidence: high in YAML → CLI flag wins, runs at low confidence +``` + +## Webhook URL policy + +`alerts.webhooks[].url` (and `--alert-webhook URL`) **must use `https://`**. +Webhook URLs typically embed a secret token in the path and the alert +payload contains finding metadata, so cleartext transport is never the right +default. `http://` is allowed only when the host is a loopback address +(`localhost`, `127.0.0.0/8`, `::1`) — useful for local development against an +on-host receiver. Loopback decisions are made on the literal hostname / IP +in the URL; we do not consult DNS, so a resolver cannot trick the validator +into permitting `http://` for a remote host. + +## Caveats + +- **`scan.jobs` and the Tokio runtime.** The Tokio runtime is sized from the + CLI value of `--jobs` *before* `kingfisher.yaml` is loaded, so config-only + `scan.jobs` will resize the scanner's job pool but not the underlying async + worker pool. If you want both to match, pass `--jobs N` on the CLI (or set + the same value in both places). This only affects parallelism, never + correctness. +- **Subcommand scope.** Project config only applies to `kingfisher scan`. 
+ `validate`, `revoke`, `access-map`, `view`, and `rules` commands ignore + `kingfisher.yaml`; pass their flags on the CLI directly. + +## What is *not* config-overridable + +Scan-target inputs are intentionally CLI-only — they describe *what* this +invocation is scanning, not project policy: + +- positional paths, `--git-url` +- `--github-user` / `--github-org`, `--gitlab-user` / `--gitlab-group` and + the equivalent Gitea / Bitbucket / Azure / Hugging Face flags +- `--s3-bucket`, `--gcs-bucket`, `--docker-image` +- `--jira-url`, `--confluence-url`, `--slack-query`, `--teams-query`, + `--postman-*` + +Auth tokens are also intentionally not in YAML; they continue to come from +env vars (`KINGFISHER_GITHUB_TOKEN`, etc.) so secrets stay out of +checked-in config files. + +## Schema + +```yaml +scan: + confidence: medium # low | medium | high (--confidence) + min_entropy: 3.5 # float (--min-entropy) + no_validate: false # bool (--no-validate) + only_valid: false # bool (--only-valid) + redact: false # bool (--redact) + no_dedup: false # bool (--no-dedup) + turbo: false # bool (--turbo) + no_base64: false # bool (--no-base64) + access_map: false # bool (--access-map) + rule_stats: false # bool (--rule-stats) + jobs: 8 # int (--jobs) + git_repo_timeout: 1800 # seconds (--git-repo-timeout) + +rules: + enabled: ["all"] # list, additive (--rule) + paths: # list, additive (--rules-path) + - ./custom-rules/ + load_builtins: true # bool (--load-builtins) + +validation: + timeout: 10 # seconds, 1..=60 (--validation-timeout) + retries: 1 # int, 0..=5 (--validation-retries) + rps: 5.0 # float (--validation-rps) + rps_per_rule: # map, additive (--validation-rps-rule) + kingfisher.aws: 1.0 + full_response: false # bool (--full-validation-response) + max_response_length: 2048 # bytes (--max-validation-response-length) + +filters: + skip_words: # list, additive (--skip-word) + - EXAMPLE + - PLACEHOLDER + skip_regex: # list, additive (--skip-regex) + - '^DUMMY_[A-Z]+$' + exclude: # 
list, additive (--exclude) + - vendor/ + - "**/node_modules/**" + max_file_size_mb: 256.0 # float (--max-file-size) + no_binary: false # bool (--no-binary) + no_extract_archives: false # bool (--no-extract-archives) + extraction_depth: 2 # int, 1..=25 (--extraction-depth) + no_inline_ignore: false # bool (--no-ignore) + no_ignore_if_contains: false # bool (--no-ignore-if-contains) + extra_ignore_comments: [] # list, additive (--ignore-comment) + skip_aws_accounts: [] # list, additive (--skip-aws-account) + skip_aws_account_file: null # path (--skip-aws-account-file) + +output: + format: pretty # pretty|json|jsonl|bson|toon|sarif|html (--format) + path: ./kingfisher-report.json # path (--output) + +baseline: + file: ./baseline.json # path (--baseline-file) + manage: false # bool (--manage-baseline) + +alerts: + defaults: # global defaults; per-webhook overrides still win + format: null # null = auto-infer (--alert-format) + on: findings # findings | always (--alert-on) + min_confidence: medium # low | medium | high (--alert-min-confidence) + include_secret: false # bool (--alert-include-secret) + report_url: null # URL (--alert-report-url) + detail: auto # summary | detail | auto (--alert-detail) + webhooks: + - url: https://hooks.slack.com/services/T0/B0/AAA # required + format: slack # slack | teams | generic | discord | mattermost | googlechat + on: findings # findings | always + min_confidence: medium # low | medium | high + include_secret: false # default false + report_url: https://ci.example/run/42 # optional pivot link rendered in payload + detail: auto # summary | detail | auto (default auto) + +global: + tls_mode: strict # strict | lax | off (--tls-mode) + allow_internal_ips: false # bool (--allow-internal-ips) + no_update_check: false # bool (--no-update-check) + user_agent_suffix: null # string (--user-agent-suffix) + endpoints: # list, additive (--endpoint) + - github=https://ghe.example.com/api/v3 + endpoint_config: null # path (--endpoint-config) + 
+git: + clone_dir: null # path (--git-clone-dir) + keep_clones: false # bool (--keep-clones) + repo_clone_limit: null # int (--repo-clone-limit) + include_contributors: false # bool (--include-contributors) + github_api_url: null # URL GHE / self-hosted GH (--github-api-url) + gitlab_api_url: null # URL self-hosted GitLab (--gitlab-api-url) +``` + +Unknown fields are rejected (typo protection). Empty sections and omitted +top-level sections are both fine; a missing file is still a fatal error. + +## Example: CI workflow + +A typical `kingfisher.yaml` for a CI repo, paired with a workflow step +that runs `kingfisher scan` against it: + +```bash +# .github/workflows/secrets.yml — run step +kingfisher scan . \ + --config ./kingfisher.yaml \ + --alert-webhook "$SLACK_SECURITY_WEBHOOK" +# `--alert-webhook` here is appended to any webhooks already in +# kingfisher.yaml (lists are additive). Everything else comes from the +# config file. +``` + +The committed `kingfisher.yaml`: ```yaml -# kingfisher.yaml — generated by `kingfisher config init`. scan: confidence: high redact: true @@ -117,51 +262,15 @@ filters: - vendor/ - "**/node_modules/**" - "**/__snapshots__/**" + skip_aws_accounts: + - "111122223333" # a test account whose creds we tolerate in test fixtures alerts: defaults: min_confidence: high webhooks: - url: https://hooks.slack.com/services/T0/B0/AAA + format: slack ``` -### Step 2 — run the scan, passing the config explicitly - -```bash -kingfisher scan . --config ./kingfisher.yaml -``` - -`--config FILE` is required — Kingfisher does not auto-discover -`kingfisher.yaml`. You can override any config value on the CLI for a -single run; CLI always wins: - -```bash -kingfisher scan . 
--config ./kingfisher.yaml --confidence low -# scan.confidence: high in YAML → CLI flag wins, runs at low confidence -``` - -## What is *not* config-overridable - -Scan-target inputs are intentionally CLI-only — they describe *what* this -invocation is scanning, not project policy: - -- positional paths, `--git-url` -- `--github-user` / `--github-org`, `--gitlab-user` / `--gitlab-group`, and the equivalent Gitea / Bitbucket / Azure / Hugging Face flags -- `--s3-bucket`, `--gcs-bucket`, `--docker-image` -- `--jira-url`, `--confluence-url`, `--slack-query`, `--teams-query`, `--postman-*` - -Auth tokens are also intentionally not in YAML; they continue to come from -env vars (`KINGFISHER_GITHUB_TOKEN`, etc.) so secrets stay out of -checked-in config files. - -## Caveats - -- **`scan.jobs` and the Tokio runtime.** The Tokio runtime is sized from the CLI value of `--jobs` *before* `kingfisher.yaml` is loaded, so config-only `scan.jobs` will resize the scanner's job pool but not the underlying async worker pool. If you want both to match, pass `--jobs N` on the CLI. -- **Subcommand scope.** Project config only applies to `kingfisher scan`. `validate`, `revoke`, `access-map`, `view`, and `rules` ignore `kingfisher.yaml`; pass their flags on the CLI directly. - -## Validation - -`kingfisher.yaml` is rejected at startup if it has unknown fields, malformed -URLs in webhook entries, invalid regex, out-of-range numeric values, or -`endpoints` that don't follow `provider=url`. There is no auto-discovery — -pass `--config FILE` explicitly so a missing or malformed file fails fast -instead of being silently skipped. +Combined with [`docs/ALERTS.md`](ALERTS.md), this lets one repo own its +webhook configuration and CI policy without baking it into command-line strings. 
diff --git a/docs-site/scripts/prepare-docs.py b/docs-site/scripts/prepare-docs.py index 8dea79e..cace510 100644 --- a/docs-site/scripts/prepare-docs.py +++ b/docs-site/scripts/prepare-docs.py @@ -52,6 +52,11 @@ DOC_MAP = { "Deployment", "Deployment strategies for Kingfisher: self-serve CLI, CI/pre-commit enforcement, centralized scanning, and embedded library.", ), + "CONFIG.md": ( + "usage/configuration.md", + "Project Configuration (kingfisher.yaml)", + "Use kingfisher.yaml as project-default policy: confidence, filters, output, alerts, and global flags. Loaded only via --config FILE.", + ), "ACCESS_MAP.md": ( "features/access-map.md", "Access Map (Blast Radius)", diff --git a/docs/CONFIG.md b/docs/CONFIG.md index 7c2ab14..d4ffd84 100644 --- a/docs/CONFIG.md +++ b/docs/CONFIG.md @@ -38,9 +38,10 @@ but redundant. The one nuance: `rules.enabled` *replaces* the synthetic ### Step 1 — generate the config -Don't write the YAML by hand. Take your existing `kingfisher scan ...` -command (or the CI step that builds it) and run the same flags under -`kingfisher config init`: +Don't write the YAML by hand. Start from the **scan-default flags** you +already pass to `kingfisher scan` (the policy-shaped ones — confidence, +redaction, filters, output, alerts, TLS, self-hosted API roots) and pass +them to `kingfisher config init`: ```bash # Print to stdout, redirect to file: @@ -54,6 +55,8 @@ kingfisher config init \ --alert-min-confidence high \ --alert-webhook https://hooks.slack.com/services/T0/B0/AAA \ --tls-mode lax \ + --github-api-url https://ghe.corp.example.com/api/v3/ \ + --gitlab-api-url https://gitlab.corp.example.com/ \ > kingfisher.yaml # Or write the file directly (pass --force to overwrite): @@ -62,8 +65,18 @@ kingfisher config init [...flags...] --out kingfisher.yaml Only flags you actually supply appear in the output; clap defaults are omitted to keep the file minimal. Scan-target inputs (paths, `--git-url`, -GitHub/GitLab/etc. 
flags, S3/GCS buckets) are stripped — they describe -*what* this run scans and don't belong in shared project policy. +GitHub/GitLab/etc. user/org/group flags, S3/GCS buckets) are stripped — +they describe *what* this run scans and don't belong in shared project +policy. + +> **Important:** `config init` does **not** accept the provider-subcommand +> form. `kingfisher scan gitlab --group my-group --api-url https://...` +> cannot be pasted verbatim — `config init` has no `gitlab` subcommand, +> and `--group` / the subcommand-scoped `--api-url` are not accepted at +> the top level. Use the top-level aliases instead: `--gitlab-api-url` +> for the GitLab API root and `--github-api-url` for GHE. Target +> selectors like `--group` / `--organization` are intentionally CLI-only +> and have no config-file equivalent. ### Step 2 — run the scan, passing the config explicitly diff --git a/src/main.rs b/src/main.rs index 9d6933e..7752885 100644 --- a/src/main.rs +++ b/src/main.rs @@ -973,13 +973,23 @@ fn build_config_yaml( } // Provider API roots are stored as `Url` on the runtime side; the YAML // schema is a `String` so the emitted file matches exactly what the - // user typed (Url adds a trailing `/` on bare-host URLs which would - // surprise diff-watchers). + // user typed. `Url::to_string()` adds a trailing `/` on bare-host URLs + // (e.g. `https://gitlab.example.com` → `https://gitlab.example.com/`), + // which would silently rewrite the user's input on every `config init` + // round-trip. Pull the raw CLI/env string from `ArgMatches` instead so + // the emitted YAML matches what the user actually passed. 
+ fn raw_arg_string(matches: &clap::ArgMatches, id: &str) -> Option<String> { + matches + .get_raw(id) + .and_then(|mut v| v.next()) + .and_then(|s| s.to_str()) + .map(str::to_owned) + } if user_set(sub_matches, "github_api_url") { - git.github_api_url = Some(scan_args.input_specifier_args.github_api_url.to_string()); + git.github_api_url = raw_arg_string(sub_matches, "github_api_url"); } if user_set(sub_matches, "gitlab_api_url") { - git.gitlab_api_url = Some(scan_args.input_specifier_args.gitlab_api_url.to_string()); + git.gitlab_api_url = raw_arg_string(sub_matches, "gitlab_api_url"); } cfg.git = git; @@ -2243,6 +2253,82 @@ alerts: assert!(matches!(cfg.global.tls_mode, Some(kingfisher::cli::config::ConfigTlsMode::Lax))); } + /// Regression: `config init --github-api-url ... --gitlab-api-url ...` + /// must round-trip the strings the user typed. `Url::to_string()` adds + /// a trailing `/` to bare-host URLs, so re-serializing the parsed `Url` + /// would silently rewrite `https://gitlab.example.com` → + /// `https://gitlab.example.com/` on every `config init` run. + #[test] + fn config_init_preserves_raw_api_url_strings() { + use kingfisher::cli::config::parse_str; + + let argv = &[ + "kingfisher", + "config", + "init", + // Bare host (no trailing slash) — `Url::to_string()` would add one. 
+ "--github-api-url", + "https://ghe.corp.example.com/api/v3", + "--gitlab-api-url", + "https://gitlab.corp.example.com", + ]; + let matches = CommandLineArgs::command().try_get_matches_from(argv).unwrap(); + let parsed = CommandLineArgs::from_arg_matches(&matches).unwrap(); + let global_args = parsed.global_args.clone(); + let init_matches = + matches.subcommand_matches("config").unwrap().subcommand_matches("init").unwrap(); + let scan_args = match parsed.command { + Command::Config(c) => match c.command { + kingfisher::cli::commands::config_command::ConfigSubcommand::Init(args) => { + args.scan_args + } + }, + _ => panic!("expected config init"), + }; + + let yaml = super::build_config_yaml(&scan_args, &global_args, init_matches).unwrap(); + let cfg = parse_str(&yaml).expect("emitted YAML must round-trip"); + + assert_eq!( + cfg.git.github_api_url.as_deref(), + Some("https://ghe.corp.example.com/api/v3"), + "github_api_url must preserve user input verbatim, no trailing-slash rewrite", + ); + assert_eq!( + cfg.git.gitlab_api_url.as_deref(), + Some("https://gitlab.corp.example.com"), + "gitlab_api_url must preserve user input verbatim, no trailing-slash rewrite", + ); + + // Sanity: when the user *does* pass a trailing slash, that's preserved too. 
+ let argv = &[ + "kingfisher", + "config", + "init", + "--github-api-url", + "https://ghe.corp.example.com/api/v3/", + ]; + let matches = CommandLineArgs::command().try_get_matches_from(argv).unwrap(); + let parsed = CommandLineArgs::from_arg_matches(&matches).unwrap(); + let global_args = parsed.global_args.clone(); + let init_matches = + matches.subcommand_matches("config").unwrap().subcommand_matches("init").unwrap(); + let scan_args = match parsed.command { + Command::Config(c) => match c.command { + kingfisher::cli::commands::config_command::ConfigSubcommand::Init(args) => { + args.scan_args + } + }, + _ => panic!("expected config init"), + }; + let yaml = super::build_config_yaml(&scan_args, &global_args, init_matches).unwrap(); + let cfg = parse_str(&yaml).expect("emitted YAML must round-trip"); + assert_eq!( + cfg.git.github_api_url.as_deref(), + Some("https://ghe.corp.example.com/api/v3/"), + ); + } + #[test] fn config_init_with_no_flags_emits_placeholder_comment() { // Edge case: user runs `kingfisher config init` with no flags. The