From 8aced005b8b77c49647bc6a6dcbf03dae273aedb Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Wed, 15 Oct 2025 22:47:40 -0700
Subject: [PATCH 1/8] - Added first-class Hugging Face scanning support,
including CLI enumeration, token authentication, and integration with remote
scans. - Condensed GitError formatting to report the exit status and the
first informative lines from stdout/stderr, producing concise git clone
failure logs. - Added support for scanning Google Cloud Storage buckets via
--gcs-bucket, including optional prefixes and service-account authentication.
- Added --skip-aws-account (now accepting comma-separated values) and
--skip-aws-account-file to bypass live AWS validation for known
canary/honey-token account IDs without triggering alerts. Kingfisher now
ships with several canary AWS account IDs pre-seeded in the skip list and now
reports matching findings as "Not Attempted" with the "Response" containing
"(skip list entry)" so it's clear that validation was intentionally skipped
and why.
---
CHANGELOG.md | 6 +
Cargo.toml | 5 +-
Makefile | 18 +-
README.md | 238 +++++++++--
docs/assets/icons/huggingface.svg | 41 ++
src/cli/commands/huggingface.rs | 73 ++++
src/cli/commands/inputs.rs | 45 ++-
src/cli/commands/mod.rs | 1 +
src/cli/commands/scan.rs | 9 +
src/cli/global.rs | 6 +-
src/gcs.rs | 87 ++++
src/git_binary.rs | 48 ++-
src/huggingface.rs | 636 ++++++++++++++++++++++++++++++
src/lib.rs | 2 +
src/main.rs | 38 +-
src/reporter.rs | 197 +++++----
src/reporter/json_format.rs | 13 +
src/reporter/pretty_format.rs | 2 +-
src/scanner/mod.rs | 2 +-
src/scanner/repos.rs | 145 ++++++-
src/scanner/runner.rs | 48 ++-
src/validation.rs | 28 ++
src/validation/aws.rs | 193 +++++++--
src/validation/utils.rs | 97 ++++-
tests/int_allowlist.rs | 11 +
tests/int_bitbucket.rs | 12 +
tests/int_dedup.rs | 12 +
tests/int_github.rs | 12 +
tests/int_gitlab.rs | 24 ++
tests/int_redact.rs | 11 +
tests/int_slack.rs | 24 ++
tests/int_validation_cache.rs | 12 +
tests/int_vulnerable_files.rs | 25 ++
33 files changed, 1939 insertions(+), 182 deletions(-)
create mode 100644 docs/assets/icons/huggingface.svg
create mode 100644 src/cli/commands/huggingface.rs
create mode 100644 src/gcs.rs
create mode 100644 src/huggingface.rs
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1be8150..0f7df0d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
All notable changes to this project will be documented in this file.
+## [v1.58.0]
+- Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans.
+- Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs.
+- Added support for scanning Google Cloud Storage buckets via `--gcs-bucket`, including optional prefixes and service-account authentication.
+- Added `--skip-aws-account` (now accepting comma-separated values) and `--skip-aws-account-file` to bypass live AWS validation for known canary/honey-token account IDs without triggering alerts. Kingfisher now ships with several canary AWS account IDs pre-seeded in the skip list and now reports matching findings as "Not Attempted" with the "Response" containing "(skip list entry)" so it's clear that validation was intentionally skipped and why.
+
## [v1.57.0]
- Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling
- Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported
diff --git a/Cargo.toml b/Cargo.toml
index 852194d..b7144c8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[workspace.package]
edition = "2021"
-rust-version = "1.88"
+rust-version = "1.90"
license = "Apache-2.0"
authors = ["Mick Grove "]
homepage = "https://github.com/mongodb/kingfisher"
@@ -10,7 +10,7 @@ publish = false
[package]
name = "kingfisher"
-version = "1.57.0"
+version = "1.58.0"
description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
edition.workspace = true
rust-version.workspace = true
@@ -192,6 +192,7 @@ walkdir = "2.5.0"
p256 = "0.13.2"
ed25519-dalek = { version = "2.2", features = ["pkcs8"] }
aws-sdk-s3 = "1.100.0"
+gcloud-storage = "1.1.1"
tokei = "12.1.2"
[target.'cfg(not(windows))'.dependencies]
diff --git a/Makefile b/Makefile
index 14a4c6a..10b6f43 100644
--- a/Makefile
+++ b/Makefile
@@ -110,11 +110,11 @@ setup-zig:
ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist
@echo "Checking Rust toolchain…"
@$(MAKE) check-rust || { \
- echo "🦀 Installing Rust 1.88.0 …"; \
+ echo "🦀 Installing Rust 1.90.0 …"; \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
. $$HOME/.cargo/env; \
- rustup toolchain install 1.88.0; \
- rustup default 1.88.0; \
+ rustup toolchain install 1.90.0; \
+ rustup default 1.90.0; \
}
@echo "📦 Installing build dependencies (musl, cmake, etc.)…"
@@ -150,11 +150,11 @@ ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist
ubuntu-arm64: setup-zig # ensures Zig & cargo-zigbuild exist
@echo "Checking Rust toolchain…"
@$(MAKE) check-rust || { \
- echo "🦀 Installing Rust 1.88.0 …"; \
+ echo "🦀 Installing Rust 1.90.0 …"; \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
. $$HOME/.cargo/env; \
- rustup toolchain install 1.88.0; \
- rustup default 1.88.0; \
+ rustup toolchain install 1.90.0; \
+ rustup default 1.90.0; \
}
@echo "📦 Installing build dependencies (musl, cmake, etc.)…"
@@ -245,7 +245,7 @@ endif
linux-x64: check-docker create-dockerignore
@mkdir -p target/release
docker run --platform linux/amd64 --rm \
- -v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\
+ -v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\
apk add --no-cache \
musl-dev \
gcc g++ make cmake pkgconfig \
@@ -274,7 +274,7 @@ linux-x64: check-docker create-dockerignore
linux-arm64: check-docker create-dockerignore
@mkdir -p target/release
docker run --platform linux/arm64 --rm \
- -v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\
+ -v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\
apk add --no-cache \
musl-dev \
gcc g++ make cmake pkgconfig \
@@ -385,7 +385,7 @@ check-rust:
echo "Rust not found."; \
exit 1; \
fi; \
- required=1.88.0; \
+ required=1.90.0; \
if [ $$(printf '%s\n' "$$required" "$$version" | sort -V | head -n1) != "$$required" ]; then \
echo "Rust version $$version is older than required $$required."; \
exit 1; \
diff --git a/README.md b/README.md
index d77d01e..56b85be 100644
--- a/README.md
+++ b/README.md
@@ -5,22 +5,23 @@
[](https://opensource.org/licenses/Apache-2.0)
-Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production
+Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s SIMD-accelerated regex engine (Hyperscan) with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production.
-Originally forked from Praetorian’s Nosey Parker, Kingfisher **adds** live cloud-API validation; many more targets (GitLab, BitBucket, Gitea, S3, Docker, Jira, Confluence, Slack); compressed-file extraction and scanning; baseline and allowlist controls; language-aware detection (~20 languages); and a native Windows binary. See [Origins and Divergence](#origins-and-divergence) for details.
+Originally forked from Praetorian’s Nosey Parker, Kingfisher has since significantly expanded and diverged, adding live validation, 10+ new scan targets, and major architectural enhancements. See [Origins and Divergence](#origins-and-divergence) for details.
+
## Key Features
### Multiple Scan Targets
-| Files / Dirs | Local Git | GitHub | GitLab | Azure DevOps | Bitbucket | Gitea |
-|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|
-|
Files / Dirs |
Local Git |
GitHub |
GitLab |
Azure DevOps |
Bitbucket |
Gitea |
+| Files / Dirs | Local Git | GitHub | GitLab | Azure Repos | Bitbucket | Gitea | Hugging Face |
+|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|:-------------:|
+|
Files / Dirs |
Local Git |
GitHub |
GitLab |
Azure Repos |
Bitbucket |
Gitea |
Hugging Face |
-| Docker | Jira | Confluence | Slack | AWS S3 |
-|:------:|:----:|:-----------:|:-----:|:------:|
-|
Docker |
Jira |
Confluence |
Slack |
AWS S3 |
+| Docker | Jira | Confluence | Slack | AWS S3 | Google Cloud |
+|:------:|:----:|:-----------:|:-----:|:------:|:---:|
+|
Docker |
Jira |
Confluence |
Slack |
AWS S3 |
Cloud Storage |
@@ -41,17 +42,27 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
+# Table of Contents
+
+
+
- [Kingfisher](#kingfisher)
- [Key Features](#key-features)
- [Multiple Scan Targets](#multiple-scan-targets)
- [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules)
- [Benchmark Results](#benchmark-results)
+- [Table of Contents](#table-of-contents)
- [Getting Started](#getting-started)
- [Installation](#installation)
- - [Run Kingfisher in Docker](#run-kingfisher-in-docker)
+ - [Pre-built Releases](#pre-built-releases)
+ - [Homebrew](#homebrew)
+ - [Linux and macOS](#linux-and-macos)
+ - [Windows](#windows)
+ - [Compile](#compile)
+ - [ Run Kingfisher in Docker](#-run-kingfisher-in-docker)
- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance)
- - [Write Custom Rules!](#write-custom-rules)
-- [Usage](#usage)
+ - [📝 Write Custom Rules!](#-write-custom-rules)
+- [🎉 Usage](#-usage)
- [Basic Examples](#basic-examples)
- [Scan with secret validation](#scan-with-secret-validation)
- [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories)
@@ -67,6 +78,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Exclude specific paths](#exclude-specific-paths)
- [Scan changes in CI pipelines](#scan-changes-in-ci-pipelines)
- [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket)
+ - [ Scanning a Google Cloud Storage Bucket](#-scanning-a-google-cloud-storage-bucket)
- [ Scanning Docker Images](#-scanning-docker-images)
- [ Scanning GitHub](#-scanning-github)
- [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token)
@@ -79,8 +91,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url)
- [List GitLab repositories](#list-gitlab-repositories)
- [ Scanning Azure Repos](#-scanning-azure-repos)
- - [Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-devops-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat)
- - [Scan specific Azure DevOps projects](#scan-specific-azure-devops-projects)
+ - [Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-repos-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat)
+ - [Scan specific Azure Repos projects](#scan-specific-azure-repos-projects)
- [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration)
- [List Azure repositories](#list-azure-repositories)
- [ Scanning Gitea](#-scanning-gitea)
@@ -97,6 +109,12 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [List Bitbucket repositories](#list-bitbucket-repositories)
- [Authenticate to Bitbucket](#authenticate-to-bitbucket)
- [Self-hosted Bitbucket Server](#self-hosted-bitbucket-server)
+ - [ Scanning Hugging Face](#-scanning-hugging-face)
+ - [Scan Hugging Face user](#scan-hugging-face-user)
+ - [Scan Hugging Face organization](#scan-hugging-face-organization)
+ - [Scan specific Hugging Face resources](#scan-specific-hugging-face-resources)
+ - [List Hugging Face repositories](#list-hugging-face-repositories)
+ - [Authenticate to Hugging Face](#authenticate-to-hugging-face)
- [ Scanning Jira](#-scanning-jira)
- [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query)
- [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues)
@@ -107,7 +125,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Environment Variables for Tokens](#environment-variables-for-tokens)
- [Exit Codes](#exit-codes)
- [Update Checks](#update-checks)
-- [Advanced Options](#advanced-options)
+- [🤓 Advanced Options](#-advanced-options)
- [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets)
- [List Builtin Rules](#list-builtin-rules)
- [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run)
@@ -117,6 +135,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Notable Scan Options](#notable-scan-options)
- [Understanding `--confidence`](#understanding---confidence)
- [Ignore known false positives](#ignore-known-false-positives)
+ - [Skip Canary Tokens](#skip-canary-tokens)
+ - [Common CLI flows](#common-cli-flows)
- [Inline ignore directives](#inline-ignore-directives)
- [Finding Fingerprint](#finding-fingerprint)
- [Rule Performance Profiling](#rule-performance-profiling)
@@ -125,36 +145,58 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Roadmap](#roadmap)
- [License](#license)
+
+
+
# Getting Started
## Installation
+### Pre-built Releases
+Pre-built binaries are available from the [Releases](https://github.com/mongodb/kingfisher/releases) section.
+
+### Homebrew
-On macOS, you can simply
```bash
brew install kingfisher
```
-Pre-built binaries are also available on the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page.
+### Linux and macOS
-You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform:
+
+
+You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform.
```bash
# Linux, macOS
curl --silent --location \
https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \
sh && \
- ubi --project mongodb/kingfisher --in "$HOME/bin"
+ ubi --project mongodb/kingfisher --in "$HOME/.local/bin"
```
+This installs and runs `ubi` and then places the `kingfisher` executable in `~/.local/bin` on Unix-like systems.
+
+
+
+### Windows
+
+
+
+You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform.
+
```powershell
# Windows
powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in .
```
-This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like
-systems (or the current directory on Windows).
+This installs and runs `ubi` and then places the `kingfisher` executable in the current directory on Windows.
+
-Or you may compile for your platform via `make`:
+
+### Compile
+You may compile for your platform via `make`
+
+
```bash
# NOTE: Requires Docker
@@ -174,9 +216,14 @@ make darwin-all # builds both x64 and arm64
make all # builds for every OS and architecture supported
```
-### Run Kingfisher in Docker
+
+
+### Run Kingfisher in Docker
+
+Run the dockerized Kingfisher container
+
+
-Run the dockerized Kingfisher container:
```bash
# GitHub Container Registry
docker run --rm ghcr.io/mongodb/kingfisher:latest --version
@@ -233,6 +280,8 @@ docker run --rm \
```
+
+
# 🔐 Detection Rules at a Glance
Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview:
@@ -248,7 +297,7 @@ Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything fr
| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more |
| **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more |
-## Write Custom Rules!
+## 📝 Write Custom Rules!
Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential.
@@ -258,7 +307,7 @@ First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom King
Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required!
-# Usage
+# 🎉 Usage
## Basic Examples
@@ -400,7 +449,7 @@ kingfisher scan ./my-project \
-v
```
-## Scanning an AWS S3 Bucket
+## Scanning an AWS S3 Bucket
You can scan S3 objects directly:
```bash
@@ -415,7 +464,7 @@ Credential resolution happens in this order:
If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role.
-Examples:
+Examples
```bash
# using explicit keys
@@ -452,7 +501,30 @@ docker run --rm \
scan --s3-bucket bucket-name
```
-## Scanning Docker Images
+## Scanning a Google Cloud Storage Bucket
+
+The `--gcs-bucket` flag streams objects directly from Google Cloud Storage. Authentication uses
+Application Default Credentials, so you can provide a service-account JSON file via the
+`GOOGLE_APPLICATION_CREDENTIALS` environment variable or by passing `--gcs-service-account`. Public
+buckets work without credentials.
+
+```bash
+kingfisher scan --gcs-bucket bucket-name
+
+# scan a sub-tree inside the bucket
+kingfisher scan --gcs-bucket bucket-name --gcs-prefix path/to/data/
+
+# supply a service-account key explicitly
+kingfisher scan --gcs-bucket bucket-name --gcs-service-account /path/to/key.json
+```
+
+Functional example:
+```bash
+kingfisher scan --gcs-bucket cloud-samples-data --gcs-prefix "storage/"
+```
+
+
+## Scanning Docker Images
Kingfisher will first try to use any locally available image, then fall back to pulling via OCI.
@@ -482,7 +554,7 @@ kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws
kingfisher scan --docker-image private.registry.example.com/my-image:tag
```
-## Scanning GitHub
+## Scanning GitHub
### Scan GitHub organization (requires `KF_GITHUB_TOKEN`)
@@ -524,7 +596,7 @@ KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/priva
---
-## Scanning GitLab
+## Scanning GitLab
### Scan GitLab group (requires `KF_GITLAB_TOKEN`)
@@ -580,18 +652,18 @@ kingfisher gitlab repos list --group my-group --include-subgroups
# skip specific projects when listing or scanning (supports glob patterns)
kingfisher gitlab repos list --group my-group --gitlab-exclude my-group/**/legacy-*
```
-## Scanning Azure Repos
+## Scanning Azure Repos
-### Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)
+### Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)
```bash
kingfisher scan --azure-organization my-org
-# Azure DevOps Server example
+# Azure DevOps Server (self-hosted) example
KF_AZURE_PAT="pat" kingfisher scan --azure-organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/
```
-### Scan specific Azure DevOps projects
+### Scan specific Azure Repos projects
Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects.
@@ -621,7 +693,7 @@ kingfisher azure repos list --project my-org/app --project my-org/api
# skip specific repositories while listing (supports glob patterns)
kingfisher azure repos list --organization my-org --azure-exclude my-org/**/experimental-*
```
-## Scanning Gitea
+## Scanning Gitea
### Scan Gitea organization (requires `KF_GITEA_TOKEN`)
@@ -673,7 +745,7 @@ KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --all-gitea-organizations
# self-hosted example
KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/
```
-## Scanning Bitbucket
+## Scanning Bitbucket
### Scan Bitbucket workspace
```bash
@@ -745,7 +817,45 @@ Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, fo
`https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with
`--bitbucket-username` and `--bitbucket-token`, and pass `--ignore-certs` when
connecting to HTTP or otherwise insecure instances.
-## Scanning Jira
+## Scanning Hugging Face
+
+Hugging Face hosts git repositories for models, datasets, and Spaces. Kingfisher can enumerate and scan all three resource types.
+
+### Scan Hugging Face user
+
+```bash
+kingfisher scan --huggingface-user <username>
+```
+
+### Scan Hugging Face organization
+
+```bash
+kingfisher scan --huggingface-organization <organization>
+```
+
+### Scan specific Hugging Face resources
+
+Scan individual repositories by ID (owner/name) or by passing the full HTTPS URL:
+
+```bash
+kingfisher scan --huggingface-model <owner>/<name>
+kingfisher scan --huggingface-dataset https://huggingface.co/datasets/<owner>/<name>
+kingfisher scan --huggingface-space <owner>/<name>
+```
+
+Use `--huggingface-exclude` to omit results returned by user or organization enumeration. Prefix values with `model:`, `dataset:`, or `space:` when you only want to skip a specific resource type.
+
+### List Hugging Face repositories
+
+```bash
+kingfisher huggingface repos list --huggingface-user <username>
+```
+
+### Authenticate to Hugging Face
+
+Private repositories require an access token provided through the `KF_HUGGINGFACE_TOKEN` environment variable. For git authentication the helper also honours `KF_HUGGINGFACE_USERNAME` (default `hf_user`).
+
+## Scanning Jira
### Scan Jira issues matching a JQL query
@@ -764,7 +874,7 @@ KF_JIRA_TOKEN="token" kingfisher scan \
--max-results 1000
```
-## Scanning Confluence
+## Scanning Confluence
### Scan Confluence pages matching a CQL query
```bash
@@ -789,7 +899,7 @@ Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environ
To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions.
-## Scanning Slack
+## Scanning Slack
### Scan Slack messages matching a search query
```bash
@@ -811,11 +921,13 @@ KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \
| `KF_GITLAB_TOKEN` | GitLab Personal Access Token |
| `KF_GITEA_TOKEN` | Gitea Personal Access Token |
| `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) |
-| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure DevOps Personal Access Token |
-| `KF_AZURE_USERNAME` | Username to use with Azure DevOps PATs (defaults to `pat` when unset) |
+| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure Repos Personal Access Token |
+| `KF_AZURE_USERNAME` | Username to use with Azure Repos PATs (defaults to `pat` when unset) |
| `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication |
| `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token |
| `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token |
+| `KF_HUGGINGFACE_TOKEN` | Hugging Face access token for API enumeration and git cloning |
+| `KF_HUGGINGFACE_USERNAME` | Optional username for Hugging Face git operations (defaults to `hf_user`) |
| `KF_JIRA_TOKEN` | Jira API token |
| `KF_CONFLUENCE_TOKEN` | Confluence API token |
| `KF_SLACK_TOKEN` | Slack API token |
@@ -869,7 +981,7 @@ Kingfisher automatically queries GitHub for a newer release when it starts and t
- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely
-# Advanced Options
+# 🤓 Advanced Options
## Build a Baseline / Detect New Secrets
@@ -963,6 +1075,8 @@ leaves the default unchanged.
- `--manage-baseline`: Create or update the baseline file with current findings
- `--skip-regex `: Ignore findings whose text matches this regex (repeatable)
- `--skip-word `: Ignore findings containing this case-insensitive word (repeatable)
+- `--skip-aws-account <ACCOUNT_ID>`: Skip live AWS validation for findings tied to the specified AWS account number (repeatable, accepts comma-separated lists)
+- `--skip-aws-account-file <FILE>`: Load AWS account numbers to skip from a file (one account per line; `#` comments allowed)
- `--ignore-comment `: Honor additional inline directives from other scanners (repeatable; e.g. `--ignore-comment "gitleaks:allow"`)
- `--no-ignore`: Disable inline directives entirely so every match is reported
## Understanding `--confidence`
@@ -996,6 +1110,46 @@ kingfisher scan \
If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early.
+### Skip Canary Tokens
+
+Canary tokens are intentionally leaked credentials that defenders sprinkle across infrastructure to catch adversaries. Research has highlighted that static identification of AWS canary tokens is now table stakes for both attackers avoiding noisy traps and responders running hygiene scans. Kingfisher ships with the same awareness, so you can clean up environments without setting off the tripwires that many canary tokens deploy.
+
+To avoid triggering AWS honey tokens, provide `--skip-aws-account` with the 12-digit account numbers associated with your canaries (you can pass a comma-separated list or use `--skip-aws-account-file` to read from disk—blank lines and `#` comments are ignored). Kingfisher pre-seeds its skip list with a number of (but not every) Thinkst Canary AWS account IDs that back canarytokens.org, so routine scans bypass them automatically.
+
+```bash
+kingfisher scan /path/to/file \
+ --skip-aws-account "171436882533,534261010715"
+
+```
+
+By default, any finding tied to a skip-listed account is marked as **Not Attempted** and the validation response explains that AWS verification was not performed. This keeps routine hygiene scans from triggering alerts in production telemetry while making it clear that the credential's status still needs manual verification if required.
+
+#### Common CLI flows
+
+```bash
+# Skip a few in-house canaries during a filesystem scan
+kingfisher scan repo/ \
+ --skip-aws-account 111122223333,444455556666
+
+# Read a longer list from disk while also ignoring test fixtures
+kingfisher scan repo/ \
+ --skip-aws-account-file /tmp/scripts/canary_accounts.txt \
+ --skip-word fixture
+
+# Combine preloaded canary IDs with a just-created decoy account
+printf '999900001111\n' > /tmp/new_canary.txt
+
+kingfisher scan /path/to/repo \
+ --skip-aws-account-file /tmp/new_canary.txt
+
+# Validate only newly introduced secrets in CI without firing alerts
+kingfisher scan --git-diff origin/main \
+ --skip-aws-account-file .ci/canaries.txt \
+ --exit-on-validation
+```
+
+Tip: if you manage multiple canary fleets (Thinkst, self-hosted alternatives, or bespoke decoys), checkpoint the account IDs alongside your infrastructure-as-code so security teams can rotate or expand the skip list without editing pipelines.
+
### Inline ignore directives
Add `kingfisher:ignore` anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value:
diff --git a/docs/assets/icons/huggingface.svg b/docs/assets/icons/huggingface.svg
new file mode 100644
index 0000000..43711df
--- /dev/null
+++ b/docs/assets/icons/huggingface.svg
@@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/cli/commands/huggingface.rs b/src/cli/commands/huggingface.rs
new file mode 100644
index 0000000..f0879ab
--- /dev/null
+++ b/src/cli/commands/huggingface.rs
@@ -0,0 +1,73 @@
+use clap::{Args, Subcommand};
+
+use crate::cli::commands::output::OutputArgs;
+
+use super::github::GitHubOutputFormat;
+
+/// Top-level Hugging Face command group
+#[derive(Args, Debug)]
+pub struct HuggingFaceArgs {
+ #[command(subcommand)]
+ pub command: HuggingFaceCommand,
+}
+
+#[derive(Subcommand, Debug)]
+pub enum HuggingFaceCommand {
+ /// Interact with Hugging Face repositories
+ #[command(subcommand)]
+ Repos(HuggingFaceReposCommand),
+}
+
+#[derive(Subcommand, Debug)]
+pub enum HuggingFaceReposCommand {
+ /// List Hugging Face repositories
+ List(HuggingFaceReposListArgs),
+}
+
+#[derive(Args, Debug, Clone)]
+pub struct HuggingFaceReposListArgs {
+ #[command(flatten)]
+ pub repo_specifiers: HuggingFaceRepoSpecifiers,
+
+ #[command(flatten)]
+ pub output_args: OutputArgs,
+}
+
+#[derive(Args, Debug, Clone, Default)]
+pub struct HuggingFaceRepoSpecifiers {
+ /// Models, datasets, and Spaces owned by these users
+ #[arg(long = "huggingface-user")]
+ pub user: Vec,
+
+ /// Models, datasets, and Spaces owned by these organizations
+ #[arg(long = "huggingface-organization", alias = "huggingface-org")]
+ pub organization: Vec,
+
+ /// Specific models to scan (format: owner/name or full URL)
+ #[arg(long = "huggingface-model")]
+ pub model: Vec,
+
+ /// Specific datasets to scan (format: owner/name or full URL)
+ #[arg(long = "huggingface-dataset")]
+ pub dataset: Vec,
+
+ /// Specific Spaces to scan (format: owner/name or full URL)
+ #[arg(long = "huggingface-space")]
+ pub space: Vec,
+
+ /// Skip specific repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
+ #[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
+ pub exclude: Vec,
+}
+
+impl HuggingFaceRepoSpecifiers {
+ pub fn is_empty(&self) -> bool {
+ self.user.is_empty()
+ && self.organization.is_empty()
+ && self.model.is_empty()
+ && self.dataset.is_empty()
+ && self.space.is_empty()
+ }
+}
+
+pub type HuggingFaceOutputFormat = GitHubOutputFormat;
diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs
index 4bab9d1..a41cf82 100644
--- a/src/cli/commands/inputs.rs
+++ b/src/cli/commands/inputs.rs
@@ -28,6 +28,11 @@ pub struct InputSpecifierArgs {
"gitlab_group",
"gitea_user",
"gitea_organization",
+ "huggingface_user",
+ "huggingface_organization",
+ "huggingface_model",
+ "huggingface_dataset",
+ "huggingface_space",
"bitbucket_user",
"bitbucket_workspace",
"bitbucket_project",
@@ -43,7 +48,8 @@ pub struct InputSpecifierArgs {
"confluence_url",
"docker_image",
"slack_query",
- "s3_bucket"
+ "s3_bucket",
+ "gcs_bucket"
]),
num_args = 0..,
value_hint = ValueHint::AnyPath
@@ -120,6 +126,30 @@ pub struct InputSpecifierArgs {
#[arg(long, alias = "include-subgroups")]
pub gitlab_include_subgroups: bool,
+ /// Scan models, datasets, and Spaces belonging to the specified Hugging Face users
+ #[arg(long = "huggingface-user")]
+ pub huggingface_user: Vec,
+
+ /// Scan models, datasets, and Spaces belonging to the specified Hugging Face organizations
+ #[arg(long = "huggingface-organization", alias = "huggingface-org")]
+ pub huggingface_organization: Vec,
+
+ /// Scan a specific Hugging Face model (format: owner/name or full URL)
+ #[arg(long = "huggingface-model")]
+ pub huggingface_model: Vec,
+
+ /// Scan a specific Hugging Face dataset (format: owner/name or full URL)
+ #[arg(long = "huggingface-dataset")]
+ pub huggingface_dataset: Vec,
+
+ /// Scan a specific Hugging Face Space (format: owner/name or full URL)
+ #[arg(long = "huggingface-space")]
+ pub huggingface_space: Vec,
+
+ /// Skip specific Hugging Face repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
+ #[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
+ pub huggingface_exclude: Vec,
+
// Gitea Options
/// Scan repositories belonging to the specified Gitea user
#[arg(long)]
@@ -256,6 +286,18 @@ pub struct InputSpecifierArgs {
#[arg(long, requires = "s3_bucket")]
pub aws_local_profile: Option,
+ /// Scan the specified Google Cloud Storage bucket
+ #[arg(long)]
+ pub gcs_bucket: Option,
+
+ /// Optional prefix within the GCS bucket
+ #[arg(long, requires = "gcs_bucket")]
+ pub gcs_prefix: Option,
+
+ /// Path to a service account JSON file for GCS authentication
+ #[arg(long, value_hint = ValueHint::FilePath, requires = "gcs_bucket")]
+ pub gcs_service_account: Option,
+
/// Docker/OCI images to scan (no local Docker required)
#[arg(long = "docker-image")]
pub docker_image: Vec,
@@ -299,7 +341,6 @@ pub struct ContentFilteringArgs {
#[arg(
long = "max-file-size",
visible_alias = "max-filesize", // also show in --help
- // alias = "max-filesize", // use this instead if you DON’T want it shown in --help
default_value_t = 256.0,
value_name = "MB"
)]
diff --git a/src/cli/commands/mod.rs b/src/cli/commands/mod.rs
index 0434af9..4b25b89 100644
--- a/src/cli/commands/mod.rs
+++ b/src/cli/commands/mod.rs
@@ -3,6 +3,7 @@ pub mod bitbucket;
pub mod gitea;
pub mod github;
pub mod gitlab;
+pub mod huggingface;
pub mod inputs;
pub mod output;
pub mod rules;
diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs
index 78312d8..f4d7921 100644
--- a/src/cli/commands/scan.rs
+++ b/src/cli/commands/scan.rs
@@ -1,4 +1,5 @@
use clap::{Args, ValueEnum};
+use std::path::PathBuf;
use strum::Display;
use tracing::debug;
@@ -119,6 +120,14 @@ pub struct ScanArgs {
#[arg(long = "skip-word", value_name = "WORD")]
pub skip_word: Vec,
+    /// AWS account IDs whose findings should skip live credential validation (repeatable or comma-separated)
+ #[arg(long = "skip-aws-account", value_name = "ACCOUNT_ID", value_delimiter = ',')]
+ pub skip_aws_account: Vec,
+
+ /// File containing AWS account IDs to skip (one per line, `#` comments ignored)
+ #[arg(long = "skip-aws-account-file", value_name = "FILE")]
+ pub skip_aws_account_file: Option,
+
/// Additional inline ignore directives to recognise (repeatable)
#[arg(long = "ignore-comment", value_name = "DIRECTIVE")]
pub extra_ignore_comments: Vec,
diff --git a/src/cli/global.rs b/src/cli/global.rs
index a03d3d4..3c2fe3f 100644
--- a/src/cli/global.rs
+++ b/src/cli/global.rs
@@ -8,7 +8,7 @@ use tracing::Level;
use crate::cli::commands::{
azure::AzureArgs, bitbucket::BitbucketArgs, gitea::GiteaArgs, github::GitHubArgs,
- gitlab::GitLabArgs, rules::RulesArgs, scan::ScanArgs,
+ gitlab::GitLabArgs, huggingface::HuggingFaceArgs, rules::RulesArgs, scan::ScanArgs,
};
#[deny(missing_docs)]
@@ -81,6 +81,10 @@ pub enum Command {
#[command(name = "azure")]
Azure(AzureArgs),
+ /// Interact with the Hugging Face Hub
+ #[command(name = "huggingface")]
+ HuggingFace(HuggingFaceArgs),
+
/// Manage rules
#[command(alias = "rule")]
Rules(RulesArgs),
diff --git a/src/gcs.rs b/src/gcs.rs
new file mode 100644
index 0000000..b93caf3
--- /dev/null
+++ b/src/gcs.rs
@@ -0,0 +1,87 @@
+use std::path::Path;
+
+use anyhow::{Context, Result};
+use gcloud_storage::{
+ client::{Client, ClientConfig},
+ http::objects::{
+ download::Range,
+ get::GetObjectRequest,
+ list::{ListObjectsRequest, ListObjectsResponse},
+ },
+};
+use tracing::debug;
+
+/// Visit every object in the given GCS bucket, optionally filtered by prefix.
+///
+/// `visitor` is called once per listed object with the object's name and its
+/// complete contents. The first error it returns aborts the traversal.
+///
+/// Authentication is attempted via Application Default Credentials
+/// (`ClientConfig::with_auth`). When that fails, the client falls back to
+/// anonymous access so public buckets can still be scanned, unless credentials
+/// were explicitly requested through `service_account_path` or an already-set
+/// `GOOGLE_APPLICATION_CREDENTIALS`; in that case the error is returned.
+///
+/// # Errors
+///
+/// Fails when explicitly requested credentials cannot be used, when listing
+/// or downloading an object fails, or when `visitor` returns an error.
+pub async fn visit_bucket_objects<F>(
+    bucket: &str,
+    prefix: Option<&str>,
+    service_account_path: Option<&Path>,
+    mut visitor: F,
+) -> Result<()>
+where
+    F: FnMut(String, Vec<u8>) -> Result<()>,
+{
+    if let Some(path) = service_account_path {
+        // NOTE(review): this mutates process-wide state, presumably so that
+        // `with_auth()` below picks the file up. `std::env::set_var` is not safe
+        // while other threads read the environment (and is `unsafe` in the 2024
+        // edition) - confirm this runs before worker threads start, or hand the
+        // credentials to the client configuration directly.
+        std::env::set_var("GOOGLE_APPLICATION_CREDENTIALS", path);
+    }
+
+    let config = match ClientConfig::default().with_auth().await {
+        Ok(config) => config,
+        Err(err) => {
+            // Only degrade to anonymous access when the caller never asked for
+            // credentials; otherwise a silent fallback would hide a misconfiguration.
+            if service_account_path.is_some()
+                || std::env::var("GOOGLE_APPLICATION_CREDENTIALS").is_ok()
+            {
+                return Err(err)
+                    .context("Failed to authenticate with GCS using provided credentials");
+            }
+            debug!("Falling back to anonymous GCS access: {err}");
+            ClientConfig::default().anonymous()
+        }
+    };
+
+    let client = Client::new(config);
+    let mut page_token: Option<String> = None;
+
+    // Page through the listing until the service stops returning a continuation token.
+    loop {
+        let request = ListObjectsRequest {
+            bucket: bucket.to_string(),
+            prefix: prefix.map(|p| p.to_string()),
+            page_token: page_token.clone(),
+            ..ListObjectsRequest::default()
+        };
+
+        let mut response: ListObjectsResponse = client
+            .list_objects(&request)
+            .await
+            .with_context(|| format!("Failed to list objects in bucket {bucket}"))?;
+
+        if let Some(items) = response.items.take() {
+            for object in items.into_iter().filter(|o| !o.name.is_empty()) {
+                // The default range requests the whole object.
+                let data = client
+                    .download_object(
+                        &GetObjectRequest {
+                            bucket: bucket.to_string(),
+                            object: object.name.clone(),
+                            ..GetObjectRequest::default()
+                        },
+                        &Range::default(),
+                    )
+                    .await
+                    .with_context(|| format!("Failed to fetch object {}", object.name))?;
+
+                visitor(object.name, data)?;
+            }
+        }
+
+        match response.next_page_token {
+            Some(token) if !token.is_empty() => page_token = Some(token),
+            _ => break,
+        }
+    }
+
+    Ok(())
+}
diff --git a/src/git_binary.rs b/src/git_binary.rs
index 82fd990..a629373 100644
--- a/src/git_binary.rs
+++ b/src/git_binary.rs
@@ -40,6 +40,15 @@ const AZURE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_azcreds() {
fi
}; _azcreds"#;
+/// Git `credential.helper` configuration value that defines an inline shell helper
+/// for Hugging Face.
+///
+/// When `KF_HUGGINGFACE_TOKEN` is non-empty the helper prints it as the password,
+/// using `KF_HUGGINGFACE_USERNAME` (default `hf_user`) as the username; otherwise it
+/// prints nothing.
+const HUGGINGFACE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_hfcreds() {
+    token="$KF_HUGGINGFACE_TOKEN";
+    if [ -n "$token" ]; then
+        user="${KF_HUGGINGFACE_USERNAME:-hf_user}";
+        echo username="$user";
+        echo password="$token";
+    fi
+}; _hfcreds"#;
+
/// Represents errors that can occur when interacting with the `git` CLI.
#[derive(Debug, thiserror::Error)]
pub enum GitError {
@@ -47,14 +56,37 @@ pub enum GitError {
IOError(#[from] std::io::Error),
#[error(
- "git execution failed\ncode={}\nstdout=```\n{}```\nstderr=```\n{}```",
- .status,
- String::from_utf8_lossy(.stdout),
- String::from_utf8_lossy(.stderr)
+ "git execution failed (status: {status}){summary}",
+ status = format_exit_status(.status),
+ summary = format_git_error_summary(.stdout.as_slice(), .stderr.as_slice())
)]
GitError { stdout: Vec, stderr: Vec, status: ExitStatus },
}
+/// Render an exit status compactly: the numeric exit code when there is one,
+/// otherwise the status' own `Display` text.
+fn format_exit_status(status: &ExitStatus) -> String {
+    match status.code() {
+        Some(code) => code.to_string(),
+        None => status.to_string(),
+    }
+}
+
+/// Build the `: <stderr line> | <stdout line>` suffix of a git failure message.
+///
+/// Each stream contributes its summary line, stderr first; the result is an
+/// empty string when neither stream has one.
+fn format_git_error_summary(stdout: &[u8], stderr: &[u8]) -> String {
+    let lines: Vec<String> =
+        [stderr, stdout].iter().filter_map(|stream| summarize_output(stream)).collect();
+    match lines.as_slice() {
+        [] => String::new(),
+        found => format!(": {}", found.join(" | ")),
+    }
+}
+
+/// Return the first non-blank line of `output`, trimmed of surrounding whitespace.
+///
+/// The bytes are decoded lossily as UTF-8. `None` is returned when every line
+/// is empty or whitespace-only.
+fn summarize_output(output: &[u8]) -> Option<String> {
+    let text = String::from_utf8_lossy(output);
+    text.lines().map(str::trim).find(|line| !line.is_empty()).map(str::to_owned)
+}
+
/// A helper struct for running `git` commands.
///
/// It supports optional GitHub, GitLab, Gitea, and Bitbucket credentials passed via
@@ -91,6 +123,8 @@ impl Git {
let has_azure_token = ["KF_AZURE_TOKEN", "KF_AZURE_PAT"]
.iter()
.any(|key| matches!(std::env::var(key), Ok(value) if !value.is_empty()));
+ let has_huggingface_token =
+ matches!(std::env::var("KF_HUGGINGFACE_TOKEN"), Ok(value) if !value.is_empty());
// If credentials are provided via environment variables, clear existing helpers first.
if has_github_token
@@ -98,6 +132,7 @@ impl Git {
|| has_gitea_token
|| has_bitbucket_credentials
|| has_azure_token
+ || has_huggingface_token
{
credentials.push("-c".into());
credentials.push(r#"credential.helper="#.into());
@@ -136,6 +171,11 @@ impl Git {
credentials.push(AZURE_CREDENTIAL_HELPER.into());
}
+ if has_huggingface_token {
+ credentials.push("-c".into());
+ credentials.push(HUGGINGFACE_CREDENTIAL_HELPER.into());
+ }
+
Self { credentials, ignore_certs }
}
diff --git a/src/huggingface.rs b/src/huggingface.rs
new file mode 100644
index 0000000..42bbf03
--- /dev/null
+++ b/src/huggingface.rs
@@ -0,0 +1,636 @@
+use std::{collections::HashSet, env, time::Duration};
+
+use anyhow::{anyhow, Result};
+use indicatif::{ProgressBar, ProgressStyle};
+use reqwest::{header::LINK, StatusCode, Url};
+use serde::Deserialize;
+use serde_json::Value;
+use tracing::{debug, warn};
+
+use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT};
+
+/// Selection of Hugging Face repositories to enumerate.
+#[derive(Debug, Clone, Default)]
+pub struct RepoSpecifiers {
+    /// Users whose models, datasets, and Spaces are enumerated.
+    pub user: Vec<String>,
+    /// Organizations whose models, datasets, and Spaces are enumerated.
+    pub organization: Vec<String>,
+    /// Explicit model identifiers (`owner/name` or full URL).
+    pub model: Vec<String>,
+    /// Explicit dataset identifiers (`owner/name` or full URL).
+    pub dataset: Vec<String>,
+    /// Explicit Space identifiers (`owner/name` or full URL).
+    pub space: Vec<String>,
+    /// Identifiers to drop from the result, optionally prefixed with a kind
+    /// such as `model:`, `dataset:`, or `space:`.
+    pub exclude: Vec<String>,
+}
+
+impl RepoSpecifiers {
+    /// Returns `true` when no user, organization, model, dataset, or Space was requested.
+    ///
+    /// `exclude` is not consulted.
+    pub fn is_empty(&self) -> bool {
+        self.user.is_empty()
+            && self.organization.is_empty()
+            && self.model.is_empty()
+            && self.dataset.is_empty()
+            && self.space.is_empty()
+    }
+}
+
+/// Credentials used for Hugging Face API requests.
+#[derive(Clone, Default)]
+pub struct AuthConfig {
+    /// Access token, if one was configured.
+    token: Option<String>,
+}
+
+// Hand-written so that the full token never ends up in debug output or logs:
+// only its first four characters are shown, followed by an ellipsis.
+impl std::fmt::Debug for AuthConfig {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("AuthConfig")
+            .field(
+                "token",
+                &self
+                    .token
+                    .as_ref()
+                    .map(|token| format!("{}…", token.chars().take(4).collect::<String>())),
+            )
+            .finish()
+    }
+}
+
+impl AuthConfig {
+    /// Read the access token from the `KF_HUGGINGFACE_TOKEN` environment variable.
+    ///
+    /// Surrounding whitespace is removed and a blank value is treated as "no
+    /// token". Storing the trimmed value keeps stray whitespace (for example a
+    /// trailing newline) out of the `Authorization` header.
+    pub fn from_env() -> Self {
+        let token = env::var("KF_HUGGINGFACE_TOKEN")
+            .ok()
+            .map(|raw| raw.trim().to_owned())
+            .filter(|token| !token.is_empty());
+        Self { token }
+    }
+
+    /// Attach the token as a bearer credential; without a token the request is
+    /// returned unchanged.
+    fn apply(&self, request: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
+        match &self.token {
+            Some(token) => request.bearer_auth(token),
+            None => request,
+        }
+    }
+
+    /// Whether a token is configured.
+    fn has_token(&self) -> bool {
+        self.token.is_some()
+    }
+}
+
+/// The three kinds of repository hosted on the Hugging Face Hub.
+#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
+enum ResourceKind {
+    Model,
+    Dataset,
+    Space,
+}
+
+impl ResourceKind {
+    /// Path segment of the listing endpoint for this kind.
+    fn api_path(self) -> &'static str {
+        match self {
+            Self::Model => "models",
+            Self::Dataset => "datasets",
+            Self::Space => "spaces",
+        }
+    }
+
+    /// Git clone URL for the repository identified by `slug` (`owner/name`).
+    ///
+    /// Models live at the site root; datasets and Spaces under their own prefix.
+    fn git_url(self, slug: &str) -> String {
+        match self {
+            Self::Model => format!("https://huggingface.co/{slug}.git"),
+            Self::Dataset => format!("https://huggingface.co/datasets/{slug}.git"),
+            Self::Space => format!("https://huggingface.co/spaces/{slug}.git"),
+        }
+    }
+
+    /// Prefix used in canonical `kind:slug` keys.
+    fn canonical_prefix(self) -> &'static str {
+        match self {
+            Self::Model => "model",
+            Self::Dataset => "dataset",
+            Self::Space => "space",
+        }
+    }
+
+    /// Singular name for log messages; the same words as the canonical prefix.
+    fn display_name_singular(self) -> &'static str {
+        self.canonical_prefix()
+    }
+
+    /// Plural name for log and progress messages.
+    fn display_name_plural(self) -> &'static str {
+        match self {
+            Self::Model => "models",
+            Self::Dataset => "datasets",
+            Self::Space => "spaces",
+        }
+    }
+}
+
+/// A single Hub repository: its kind plus its `owner/name` slug.
+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
+struct ResourceRef {
+    kind: ResourceKind,
+    slug: String,
+}
+
+impl ResourceRef {
+    /// Pair a kind with a slug.
+    fn new(kind: ResourceKind, slug: String) -> Self {
+        ResourceRef { kind, slug }
+    }
+
+    /// Case-insensitive `kind:slug` identity used for de-duplication.
+    fn canonical_key(&self) -> String {
+        let prefix = self.kind.canonical_prefix();
+        let lowered_slug = self.slug.to_lowercase();
+        format!("{}:{}", prefix, lowered_slug)
+    }
+
+    /// Git clone URL of this repository.
+    fn git_url(&self) -> String {
+        let ResourceRef { kind, slug } = self;
+        kind.git_url(slug)
+    }
+}
+
+/// One entry of a Hub listing response.
+///
+/// Deserialized untagged: an object carrying `id` matches the first variant,
+/// otherwise one carrying `modelId` matches the second.
+#[derive(Debug, Deserialize)]
+#[serde(untagged)]
+enum HuggingFaceItem {
+    Id {
+        id: String,
+    },
+    ModelId {
+        #[serde(rename = "modelId")]
+        model_id: String,
+    },
+}
+
+impl HuggingFaceItem {
+    /// Consume the item and return whichever identifier field it carried.
+    fn into_identifier(self) -> String {
+        match self {
+            Self::Id { id: identifier } | Self::ModelId { model_id: identifier } => identifier,
+        }
+    }
+}
+
+/// Repository exclusions, matched case-insensitively.
+#[derive(Default)]
+struct ExcludeSet {
+    /// `kind:owner/name` keys that exclude one specific resource kind.
+    typed: HashSet<String>,
+    /// `owner/name` keys that exclude a slug regardless of its kind.
+    untyped: HashSet<String>,
+}
+
+impl ExcludeSet {
+    /// Lookup key for a kind-qualified exclusion: canonical prefix plus lowercase slug.
+    fn typed_key(kind: ResourceKind, slug: &str) -> String {
+        format!("{}:{}", kind.canonical_prefix(), slug.to_lowercase())
+    }
+
+    /// Parse user-supplied exclusions.
+    ///
+    /// An entry containing `:` is read as `kind:identifier`; any other entry
+    /// applies to every kind. Blank entries are skipped silently and entries
+    /// that cannot be parsed are skipped with a warning.
+    fn from_list(values: &[String]) -> Self {
+        let mut typed = HashSet::new();
+        let mut untyped = HashSet::new();
+        for raw in values {
+            let trimmed = raw.trim();
+            if trimmed.is_empty() {
+                continue;
+            }
+            if let Some((prefix, rest)) = trimmed.split_once(':') {
+                match normalize_kind(prefix) {
+                    Some(kind) => {
+                        if let Some(slug) = parse_slug_for_kind(kind, rest) {
+                            typed.insert(Self::typed_key(kind, &slug));
+                        } else {
+                            warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
+                        }
+                    }
+                    None => warn!("Ignoring invalid Hugging Face exclusion '{raw}' (unknown type)"),
+                }
+            } else if let Some(slug) = normalize_untyped_slug(trimmed) {
+                // `normalize_untyped_slug` already lowercases its result.
+                untyped.insert(slug);
+            } else {
+                warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
+            }
+        }
+        Self { typed, untyped }
+    }
+
+    /// Whether the resource of `kind` with `slug` matches a typed or untyped exclusion.
+    fn should_exclude(&self, kind: ResourceKind, slug: &str) -> bool {
+        self.typed.contains(&Self::typed_key(kind, slug))
+            || self.untyped.contains(&slug.to_lowercase())
+    }
+}
+
+/// Map a user-supplied kind name to a `ResourceKind`.
+///
+/// Matching ignores ASCII case and surrounding whitespace and accepts both
+/// singular and plural spellings; anything else yields `None`.
+fn normalize_kind(raw: &str) -> Option<ResourceKind> {
+    match raw.trim().to_ascii_lowercase().as_str() {
+        "model" | "models" => Some(ResourceKind::Model),
+        "dataset" | "datasets" => Some(ResourceKind::Dataset),
+        "space" | "spaces" => Some(ResourceKind::Space),
+        _ => None,
+    }
+}
+
+/// Normalize an exclusion written without a `kind:` qualifier into a lowercase
+/// `owner/name` slug.
+///
+/// The input is split on `/` with empty segments dropped; `None` is returned
+/// when the segments do not form an `owner/name` pair (including blank input).
+fn normalize_untyped_slug(raw: &str) -> Option<String> {
+    let segments: Vec<&str> =
+        raw.trim().split('/').filter(|segment| !segment.is_empty()).collect();
+    // An empty segment list is rejected by the callee, which covers blank input.
+    normalize_untyped_segments(&segments)
+}
+
+/// Turn path segments into a lowercase `owner/name` slug, ignoring the resource kind.
+///
+/// A leading kind segment (`model(s)`, `dataset(s)`, `space(s)`, any ASCII case)
+/// is dropped. The first remaining segment is the owner and all following
+/// segments, joined with `/` and stripped of trailing `.git`, form the name.
+/// Returns `None` when fewer than two segments remain or either part is empty.
+fn normalize_untyped_segments(segments: &[&str]) -> Option<String> {
+    let has_kind_prefix = segments.first().map_or(false, |first| {
+        matches!(
+            first.trim().to_ascii_lowercase().as_str(),
+            "models" | "model" | "datasets" | "dataset" | "spaces" | "space"
+        )
+    });
+    // Work on a sub-slice instead of copying the segments into a new vector.
+    let parts = if has_kind_prefix { &segments[1..] } else { segments };
+    if parts.len() < 2 {
+        return None;
+    }
+    let owner = parts[0].trim();
+    let joined = parts[1..].join("/");
+    let name = joined.trim_end_matches(".git").trim();
+
+    if owner.is_empty() || name.is_empty() {
+        return None;
+    }
+    Some(format!("{}/{}", owner, name).to_lowercase())
+}
+
+/// Extract the `owner/name` slug of a `kind` resource from an identifier.
+///
+/// `raw` may be a bare identifier (`owner/name`, optionally with a kind path
+/// prefix) or a full `http://` / `https://` URL, in which case only the URL
+/// path is considered. Returns `None` for blank input, an unparsable URL, or
+/// segments that do not form an `owner/name` pair.
+fn parse_slug_for_kind(kind: ResourceKind, raw: &str) -> Option<String> {
+    let trimmed = raw.trim();
+    if trimmed.is_empty() {
+        return None;
+    }
+    if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
+        let url = Url::parse(trimmed).ok()?;
+        // The segments borrow from `url`, so they are consumed inside this branch.
+        let segments: Vec<&str> = url
+            .path_segments()
+            .map(|segments| segments.filter(|s| !s.is_empty()).collect())
+            .unwrap_or_default();
+        return parse_slug_segments(kind, &segments);
+    }
+    let segments: Vec<&str> = trimmed.split('/').filter(|s| !s.is_empty()).collect();
+    parse_slug_segments(kind, &segments)
+}
+
+/// Turn path segments into an `owner/name` slug for a resource of `kind`.
+///
+/// A leading segment naming `kind` itself (singular or plural, any ASCII case)
+/// is dropped. The first remaining segment is the owner and all following
+/// segments, joined with `/` and stripped of trailing `.git`, form the name.
+/// Case is preserved. Returns `None` when fewer than two segments remain or
+/// either part is empty.
+fn parse_slug_segments(kind: ResourceKind, segments: &[&str]) -> Option<String> {
+    let has_kind_prefix = segments.first().map_or(false, |first| {
+        let lowered = first.trim().to_ascii_lowercase();
+        match kind {
+            ResourceKind::Model => matches!(lowered.as_str(), "models" | "model"),
+            ResourceKind::Dataset => matches!(lowered.as_str(), "datasets" | "dataset"),
+            ResourceKind::Space => matches!(lowered.as_str(), "spaces" | "space"),
+        }
+    });
+    // Work on a sub-slice instead of copying the segments into a new vector.
+    let parts = if has_kind_prefix { &segments[1..] } else { segments };
+    if parts.len() < 2 {
+        return None;
+    }
+    let owner = parts[0].trim();
+    let joined = parts[1..].join("/");
+    let name = joined.trim_end_matches(".git").trim();
+
+    if owner.is_empty() || name.is_empty() {
+        return None;
+    }
+    Some(format!("{owner}/{name}"))
+}
+
+/// Extract the `rel="next"` target from an HTTP `Link` header value.
+///
+/// The header is split on `,` into `<url>; params` entries and the first entry
+/// whose `rel` parameter contains the relation type `next` wins. Both the
+/// quoted (`rel="next"`, `rel="prev next"`) and the unquoted (`rel=next`) forms
+/// are recognised. Returns `None` when there is no such entry or its URL does
+/// not parse.
+fn parse_next_link(value: &str) -> Option<Url> {
+    value.split(',').find_map(|part| {
+        let (url_part, params) = part.trim().split_once('>')?;
+        let is_next = params.split(';').any(|param| {
+            param.trim().strip_prefix("rel=").map_or(false, |relations| {
+                // A `rel` value is a whitespace-separated list of relation types.
+                relations.trim_matches('"').split_whitespace().any(|rel| rel == "next")
+            })
+        });
+        if !is_next {
+            return None;
+        }
+        Url::parse(url_part.trim_start_matches('<').trim()).ok()
+    })
+}
+
+/// Maximum number of characters of a response body or JSON item quoted in diagnostics.
+const BODY_SNIPPET_LIMIT: usize = 200;
+
+/// Fetch every page of a Hub listing, starting at `current_url`.
+///
+/// Pages are followed through the `rel="next"` entry of the `Link` response
+/// header until a page has none. Array elements that cannot be decoded as a
+/// `HuggingFaceItem` are skipped with a warning. `context` describes what is
+/// being enumerated and only appears in messages.
+///
+/// # Errors
+///
+/// Fails on transport errors, on a non-success status (with a hint about
+/// `KF_HUGGINGFACE_TOKEN` for 401/403 when no token is configured), and when a
+/// page body is not a JSON array.
+async fn fetch_paginated(
+    client: &reqwest::Client,
+    mut current_url: Url,
+    auth: &AuthConfig,
+    context: &str,
+) -> Result<Vec<HuggingFaceItem>> {
+    let mut items = Vec::new();
+    loop {
+        let request =
+            client.get(current_url.clone()).header("User-Agent", GLOBAL_USER_AGENT.as_str());
+        let response = auth.apply(request).send().await?;
+        let status = response.status();
+        // Read the header now: consuming the body below takes the response by value.
+        let link_header = response
+            .headers()
+            .get(LINK)
+            .and_then(|value| value.to_str().ok())
+            .map(|value| value.to_string());
+        if !status.is_success() {
+            let body = response.text().await.unwrap_or_default();
+            // Error bodies can be whole HTML pages; quote a bounded snippet only,
+            // like the other diagnostics in this function.
+            let snippet = truncate_for_display(&body, BODY_SNIPPET_LIMIT);
+            let mut message = format!(
+                "Hugging Face API request failed while enumerating {context} ({status}): {snippet}"
+            );
+            if matches!(status, StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN)
+                && !auth.has_token()
+            {
+                message.push_str(
+                    "\nProvide a Hugging Face access token via the KF_HUGGINGFACE_TOKEN environment variable.",
+                );
+            }
+            return Err(anyhow!(message));
+        }
+        let body = response.bytes().await?;
+        let value: Value = serde_json::from_slice(&body).map_err(|err| {
+            let snippet = body_snippet(&body);
+            anyhow!(
+                "Failed to parse Hugging Face response while enumerating {context}: {err}. Body snippet: {snippet}",
+                context = context,
+                err = err,
+                snippet = snippet
+            )
+        })?;
+
+        let array = value.as_array().ok_or_else(|| {
+            let snippet = body_snippet(&body);
+            anyhow!(
+                "Unexpected Hugging Face response format while enumerating {context} (expected array). Body snippet: {snippet}",
+                context = context,
+                snippet = snippet
+            )
+        })?;
+
+        for (index, element) in array.iter().enumerate() {
+            // `&Value` is itself a deserializer, so no per-item clone is needed.
+            match HuggingFaceItem::deserialize(element) {
+                Ok(item) => items.push(item),
+                Err(err) => {
+                    let snippet = value_snippet(element);
+                    warn!(
+                        "Skipping Hugging Face item at index {index} while enumerating {context}: {err}. Item snippet: {snippet}"
+                    );
+                }
+            }
+        }
+
+        match link_header.as_deref().and_then(parse_next_link) {
+            Some(next_url) => current_url = next_url,
+            None => break,
+        }
+    }
+    Ok(items)
+}
+
+/// Lossily decode `body` as UTF-8 and shorten it to `BODY_SNIPPET_LIMIT` characters.
+fn body_snippet(body: &[u8]) -> String {
+    let decoded = String::from_utf8_lossy(body);
+    truncate_for_display(decoded.as_ref(), BODY_SNIPPET_LIMIT)
+}
+
+/// Serialize `value` to its compact JSON text and shorten it to `BODY_SNIPPET_LIMIT` characters.
+fn value_snippet(value: &Value) -> String {
+    truncate_for_display(value.to_string().as_str(), BODY_SNIPPET_LIMIT)
+}
+
+/// Keep at most `limit` characters of `text`, appending `…` when anything was cut off.
+///
+/// Counts `char`s rather than bytes, so multi-byte characters are never split.
+fn truncate_for_display(text: &str, limit: usize) -> String {
+    let mut remaining = text.chars();
+    let mut shortened: String = remaining.by_ref().take(limit).collect();
+    // Anything still left in the iterator is exactly what did not fit.
+    if remaining.next().is_some() {
+        shortened.push('…');
+    }
+    shortened
+}
+
+/// Enumerate every model, dataset, and Space published by `owner`.
+///
+/// `label` is the human-readable description of the owner used in progress and
+/// log messages. A failure while listing one resource kind is logged as a
+/// warning and the remaining kinds are still queried, so the result may be
+/// partial.
+///
+/// # Errors
+///
+/// Only fails when the per-kind endpoint URL cannot be built from `base_url`.
+async fn fetch_resources_for_owner(
+    client: &reqwest::Client,
+    base_url: &Url,
+    owner: &str,
+    label: &str,
+    auth: &AuthConfig,
+    progress: Option<&ProgressBar>,
+) -> Result<Vec<ResourceRef>> {
+    let mut resources = Vec::new();
+    for kind in [ResourceKind::Model, ResourceKind::Dataset, ResourceKind::Space] {
+        if let Some(pb) = progress {
+            pb.set_message(format!(
+                "Enumerating Hugging Face {label} {}",
+                kind.display_name_plural()
+            ));
+        }
+        let mut url = base_url.join(kind.api_path())?;
+        {
+            // Scoped so the mutable borrow of `url` ends before it is moved below.
+            let mut pairs = url.query_pairs_mut();
+            pairs.append_pair("author", owner);
+            pairs.append_pair("limit", "100");
+        }
+        let context = format!("{} for {label}", kind.display_name_plural());
+        match fetch_paginated(client, url, auth, &context).await {
+            Ok(items) => {
+                for item in items {
+                    let identifier = item.into_identifier();
+                    if let Some(slug) = parse_slug_for_kind(kind, &identifier) {
+                        resources.push(ResourceRef::new(kind, slug));
+                    } else {
+                        warn!(
+                            "Skipping Hugging Face {} with unexpected identifier '{}'",
+                            kind.display_name_singular(),
+                            identifier
+                        );
+                    }
+                }
+            }
+            Err(err) => {
+                warn!(
+                    "Failed to enumerate Hugging Face {} for {label}: {err}",
+                    kind.display_name_plural()
+                );
+            }
+        }
+    }
+    Ok(resources)
+}
+
+/// Append the explicitly requested models, datasets, and Spaces to `resources`.
+///
+/// Identifiers are processed in that order; one that cannot be parsed into an
+/// `owner/name` slug is skipped with a warning.
+fn append_explicit_resources(specifiers: &RepoSpecifiers, resources: &mut Vec<ResourceRef>) {
+    let explicit = [
+        (ResourceKind::Model, &specifiers.model),
+        (ResourceKind::Dataset, &specifiers.dataset),
+        (ResourceKind::Space, &specifiers.space),
+    ];
+    for &(kind, identifiers) in explicit.iter() {
+        for identifier in identifiers {
+            match parse_slug_for_kind(kind, identifier) {
+                Some(slug) => resources.push(ResourceRef::new(kind, slug)),
+                None => warn!(
+                    "Ignoring invalid Hugging Face {} identifier '{}'",
+                    kind.display_name_singular(),
+                    identifier
+                ),
+            }
+        }
+    }
+}
+
+/// Resolve `specifiers` into a sorted list of unique Hugging Face git clone URLs.
+///
+/// Users are enumerated first, then organizations, then the explicitly named
+/// models, datasets, and Spaces. Excluded repositories are dropped and
+/// duplicates are removed case-insensitively. An owner that cannot be
+/// enumerated is logged as a warning and does not abort the run.
+///
+/// `ignore_certs` disables TLS certificate verification for API requests, and
+/// `progress`, when given, receives status messages.
+///
+/// # Errors
+///
+/// Fails only when the HTTP client or the API base URL cannot be constructed.
+pub async fn enumerate_repo_urls(
+    specifiers: &RepoSpecifiers,
+    auth: &AuthConfig,
+    ignore_certs: bool,
+    progress: Option<&mut ProgressBar>,
+) -> Result<Vec<String>> {
+    let client = reqwest::Client::builder()
+        .timeout(Duration::from_secs(30))
+        .danger_accept_invalid_certs(ignore_certs)
+        .build()?;
+    let base_url = Url::parse("https://huggingface.co/api/")?;
+    let excludes = ExcludeSet::from_list(&specifiers.exclude);
+    let mut collected = Vec::new();
+
+    // Only shared access is needed below; a shared reference can be copied freely.
+    let progress: Option<&ProgressBar> = progress.as_deref();
+
+    // Users and organizations are listed through the same `author` filter and
+    // differ only in how they are described in messages.
+    let owners = specifiers
+        .user
+        .iter()
+        .map(|name| ("user", name))
+        .chain(specifiers.organization.iter().map(|name| ("organization", name)));
+    for (role, owner) in owners {
+        let label = format!("{role} {owner}");
+        if let Some(pb) = progress {
+            pb.set_message(format!("Enumerating Hugging Face {label}"));
+        }
+        match fetch_resources_for_owner(&client, &base_url, owner, &label, auth, progress).await
+        {
+            Ok(mut resources) => collected.append(&mut resources),
+            Err(err) => warn!("Failed to enumerate Hugging Face {label}: {err}"),
+        }
+    }
+
+    append_explicit_resources(specifiers, &mut collected);
+
+    let mut seen = HashSet::new();
+    let mut urls = Vec::new();
+    for resource in collected {
+        if excludes.should_exclude(resource.kind, &resource.slug) {
+            debug!(
+                "Skipping Hugging Face {} {} due to exclusion",
+                resource.kind.display_name_singular(),
+                resource.slug
+            );
+            continue;
+        }
+        if seen.insert(resource.canonical_key()) {
+            urls.push(resource.git_url());
+        }
+    }
+    // `seen` already guarantees uniqueness (distinct keys always yield distinct
+    // URLs), so sorting is all that is left to make the output deterministic.
+    urls.sort();
+    Ok(urls)
+}
+
+/// Print the git clone URL of every repository selected by `specifiers`, one per line.
+///
+/// A spinner is shown during enumeration when `progress_enabled` is set. It is
+/// always cleared before any URL is printed and before an error is returned.
+///
+/// # Errors
+///
+/// Propagates any error from `enumerate_repo_urls`.
+pub async fn list_repositories(
+    specifiers: &RepoSpecifiers,
+    auth: &AuthConfig,
+    ignore_certs: bool,
+    progress_enabled: bool,
+) -> Result<()> {
+    let mut progress = if progress_enabled {
+        let style = ProgressStyle::with_template("{spinner} {msg} [{elapsed_precise}]")
+            .expect("progress bar style template should compile");
+        let pb = ProgressBar::new_spinner()
+            .with_style(style)
+            .with_message("Enumerating Hugging Face repositories");
+        pb.enable_steady_tick(Duration::from_millis(500));
+        pb
+    } else {
+        ProgressBar::hidden()
+    };
+
+    let outcome = enumerate_repo_urls(specifiers, auth, ignore_certs, Some(&mut progress)).await;
+    // Stop the steadily ticking spinner first, so that its redraws cannot
+    // interleave with the listing and it does not linger when enumeration failed.
+    progress.finish_and_clear();
+    for url in outcome? {
+        println!("{url}");
+    }
+    Ok(())
+}
+
+/// Wiki lookup for a Hugging Face repository: always returns `None`; the
+/// argument is ignored.
+pub fn wiki_url(_repo_url: &GitUrl) -> Option {
+    None
+}
+
+/// Unit tests for the slug, exclusion, pagination, and truncation helpers.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_model_slug_from_plain() {
+        assert_eq!(
+            parse_slug_for_kind(ResourceKind::Model, "user/model"),
+            Some("user/model".to_string())
+        );
+    }
+
+    #[test]
+    fn parse_dataset_slug_with_prefix() {
+        assert_eq!(
+            parse_slug_for_kind(ResourceKind::Dataset, "datasets/user/data.git"),
+            Some("user/data".to_string())
+        );
+    }
+
+    #[test]
+    fn parse_space_slug_from_url() {
+        assert_eq!(
+            parse_slug_for_kind(ResourceKind::Space, "https://huggingface.co/spaces/user/demo"),
+            Some("user/demo".to_string())
+        );
+    }
+
+    #[test]
+    fn exclude_set_matches_typed_and_untyped() {
+        let excludes = ExcludeSet::from_list(&[
+            "model:user/model".to_string(),
+            "datasets/user/data".to_string(),
+        ]);
+        assert!(excludes.should_exclude(ResourceKind::Model, "user/model"));
+        assert!(excludes.should_exclude(ResourceKind::Dataset, "user/data"));
+        assert!(!excludes.should_exclude(ResourceKind::Space, "user/space"));
+    }
+
+    #[test]
+    fn parse_link_header() {
+        // The target must be wrapped in angle brackets, as in a real `Link` header.
+        let header = "<https://huggingface.co/api/models?cursor=abc>; rel=\"next\"";
+        let url = parse_next_link(header).expect("next link");
+        assert_eq!(url.as_str(), "https://huggingface.co/api/models?cursor=abc");
+    }
+
+    #[test]
+    fn truncate_for_display_adds_ellipsis() {
+        assert_eq!(truncate_for_display("abcdef", 3), "abc…");
+        assert_eq!(truncate_for_display("abc", 5), "abc");
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 9fc8106..1736bd1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -13,6 +13,7 @@ pub mod defaults;
pub mod entropy;
pub mod finding_data;
pub mod findings_store;
+pub mod gcs;
pub mod git_binary;
pub mod git_commit_metadata;
pub mod git_metadata_graph;
@@ -21,6 +22,7 @@ pub mod git_url;
pub mod gitea;
pub mod github;
pub mod gitlab;
+pub mod huggingface;
pub mod inline_ignore;
pub mod jira;
pub mod liquid_filters;
diff --git a/src/main.rs b/src/main.rs
index c0f6f9a..d6c36c7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -52,7 +52,7 @@ use kingfisher::{
},
findings_store,
findings_store::FindingsStore,
- gitea, github,
+ gitea, github, huggingface,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::{load_and_record_rules, run_scan},
@@ -75,6 +75,7 @@ use crate::cli::commands::{
bitbucket::{BitbucketAuthArgs, BitbucketCommand, BitbucketRepoType, BitbucketReposCommand},
gitea::{GiteaCommand, GiteaRepoType, GiteaReposCommand},
gitlab::{GitLabCommand, GitLabRepoType, GitLabReposCommand},
+ huggingface::{HuggingFaceCommand, HuggingFaceReposCommand},
};
fn main() -> anyhow::Result<()> {
@@ -93,6 +94,7 @@ fn main() -> anyhow::Result<()> {
Command::Bitbucket(_) => num_cpus::get(), // Default for Bitbucket commands
Command::Gitea(_) => num_cpus::get(), // Default for Gitea commands
Command::Azure(_) => num_cpus::get(), // Default for Azure commands
+ Command::HuggingFace(_) => num_cpus::get(), // Default for Hugging Face commands
Command::Rules(_) => num_cpus::get(), // Default for Rules commands
};
@@ -327,6 +329,28 @@ async fn async_main(args: CommandLineArgs) -> Result<()> {
}
},
},
+ Command::HuggingFace(hf_args) => match hf_args.command {
+ HuggingFaceCommand::Repos(repos_command) => match repos_command {
+ HuggingFaceReposCommand::List(list_args) => {
+ let specifiers = huggingface::RepoSpecifiers {
+ user: list_args.repo_specifiers.user.clone(),
+ organization: list_args.repo_specifiers.organization.clone(),
+ model: list_args.repo_specifiers.model.clone(),
+ dataset: list_args.repo_specifiers.dataset.clone(),
+ space: list_args.repo_specifiers.space.clone(),
+ exclude: list_args.repo_specifiers.exclude.clone(),
+ };
+ let auth = huggingface::AuthConfig::from_env();
+ huggingface::list_repositories(
+ &specifiers,
+ &auth,
+ global_args.ignore_certs,
+ global_args.use_progress(),
+ )
+ .await?;
+ }
+ },
+ },
Command::SelfUpdate => {
anyhow::bail!("SelfUpdate command should not reach this branch")
}
@@ -367,6 +391,13 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -400,6 +431,9 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Slack query
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
@@ -436,6 +470,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_base64: false,
no_inline_ignore: false,
diff --git a/src/reporter.rs b/src/reporter.rs
index 52a491c..127a9ad 100644
--- a/src/reporter.rs
+++ b/src/reporter.rs
@@ -703,7 +703,7 @@ mod tests {
git_commit_metadata::CommitMetadata,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{SerializableCapture, SerializableCaptures},
- origin::OriginSet,
+ origin::{Origin, OriginSet},
rules::rule::{Confidence, Rule, RuleSyntax},
};
use gix::{date::Time, ObjectId};
@@ -711,79 +711,8 @@ mod tests {
use std::path::PathBuf;
use tempfile::tempdir;
- #[test]
- fn build_finding_record_uses_git_blob_path() {
- let temp = tempdir().unwrap();
- let datastore =
- Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
- let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
-
- let repo_path = Arc::new(PathBuf::from("/tmp/repo"));
- let commit_metadata = Arc::new(CommitMetadata {
- commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
- committer_name: "Alice".into(),
- committer_email: "alice@example.com".into(),
- committer_timestamp: Time::new(0, 0),
- });
- let blob_path = "path/in/history.txt".to_string();
- let origin = OriginSet::new(
- Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()),
- vec![],
- );
-
- let rule = Arc::new(Rule::new(RuleSyntax {
- name: "Test Rule".into(),
- id: "test.rule".into(),
- pattern: ".*".into(),
- min_entropy: 0.0,
- confidence: Confidence::Medium,
- visible: true,
- examples: vec![],
- negative_examples: vec![],
- references: vec![],
- validation: None,
- depends_on_rule: vec![],
- }));
-
- let blob_id = BlobId::new(b"blob-data");
- let report_match = ReportMatch {
- origin,
- blob_metadata: BlobMetadata {
- id: blob_id,
- num_bytes: 42,
- mime_essence: None,
- language: Some("Unknown".into()),
- },
- m: Match {
- location: Location {
- offset_span: OffsetSpan { start: 0, end: 10 },
- source_span: SourceSpan {
- start: SourcePoint { line: 19, column: 0 },
- end: SourcePoint { line: 19, column: 10 },
- },
- },
- groups: SerializableCaptures {
- captures: SmallVec::<[SerializableCapture; 2]>::new(),
- },
- blob_id,
- finding_fingerprint: 123,
- rule: Arc::clone(&rule),
- validation_response_body: "Bad credentials".into(),
- validation_response_status: 401,
- validation_success: false,
- calculated_entropy: 5.29,
- visible: true,
- is_base64: false,
- },
- comment: None,
- match_confidence: Confidence::Medium,
- visible: true,
- validation_response_body: "Bad credentials".into(),
- validation_response_status: 401,
- validation_success: false,
- };
-
- let scan_args = ScanArgs {
+ fn sample_scan_args() -> ScanArgs {
+ ScanArgs {
num_jobs: 1,
rules: RuleSpecifierArgs::default(),
input_specifier_args: InputSpecifierArgs {
@@ -802,6 +731,12 @@ mod tests {
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -833,6 +768,9 @@ mod tests {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@@ -864,9 +802,98 @@ mod tests {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_inline_ignore: false,
+ }
+ }
+
+    /// Builds a `ReportMatch` fixture whose origin is a git blob at `path/in/history.txt`.
+    ///
+    /// The supplied validation body, status and success flag are written to both the
+    /// inner `Match` and the outer `ReportMatch`. Returns the fixture together with the
+    /// blob path recorded in its origin.
+    fn sample_report_match(
+        validation_body: &str,
+        validation_status: u16,
+        validation_success: bool,
+    ) -> (ReportMatch, String) {
+        let blob_path = String::from("path/in/history.txt");
+        let blob_id = BlobId::new(b"blob-data");
+        let response_body = validation_body.to_string();
+
+        // Origin: a blob first seen in a single commit of the repository at /tmp/repo.
+        let first_commit = Arc::new(CommitMetadata {
+            commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
+            committer_name: "Alice".into(),
+            committer_email: "alice@example.com".into(),
+            committer_timestamp: Time::new(0, 0),
+        });
+        let git_origin = Origin::from_git_repo_with_first_commit(
+            Arc::new(PathBuf::from("/tmp/repo")),
+            first_commit,
+            blob_path.clone(),
+        );
+
+        // A rule without a validation section.
+        let rule = Arc::new(Rule::new(RuleSyntax {
+            name: "Test Rule".into(),
+            id: "test.rule".into(),
+            pattern: ".*".into(),
+            min_entropy: 0.0,
+            confidence: Confidence::Medium,
+            visible: true,
+            examples: vec![],
+            negative_examples: vec![],
+            references: vec![],
+            validation: None,
+            depends_on_rule: vec![],
+        }));
+
+        let source_span = SourceSpan {
+            start: SourcePoint { line: 19, column: 0 },
+            end: SourcePoint { line: 19, column: 10 },
+        };
+        let inner_match = Match {
+            location: Location { offset_span: OffsetSpan { start: 0, end: 10 }, source_span },
+            groups: SerializableCaptures {
+                captures: SmallVec::<[SerializableCapture; 2]>::new(),
+            },
+            blob_id,
+            finding_fingerprint: 123,
+            rule,
+            validation_response_body: response_body.clone(),
+            validation_response_status: validation_status,
+            validation_success,
+            calculated_entropy: 5.29,
+            visible: true,
+            is_base64: false,
+        };
+
+        let fixture = ReportMatch {
+            origin: OriginSet::new(git_origin, vec![]),
+            blob_metadata: BlobMetadata {
+                id: blob_id,
+                num_bytes: 42,
+                mime_essence: None,
+                language: Some("Unknown".into()),
+            },
+            m: inner_match,
+            comment: None,
+            match_confidence: Confidence::Medium,
+            visible: true,
+            validation_response_body: response_body,
+            validation_response_status: validation_status,
+            validation_success,
+        };
+        (fixture, blob_path)
+    }
+
+ #[test]
+ fn build_finding_record_uses_git_blob_path() {
+ let temp = tempdir().unwrap();
+ let datastore =
+ Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
+ let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
+
+ let (report_match, blob_path) =
+ sample_report_match("Bad credentials", StatusCode::UNAUTHORIZED.as_u16(), false);
+
+ let scan_args = sample_scan_args();
+
let record = reporter.build_finding_record(&report_match, &scan_args);
assert_eq!(record.finding.path, blob_path);
let git_file_path = record
@@ -880,6 +907,28 @@ mod tests {
assert_eq!(git_file_path, "path/in/history.txt");
}
+    /// A skip-list hit must be reported as "Not Attempted" while keeping the skip
+    /// reason as the validation response.
+    #[test]
+    fn skip_list_matches_surface_skip_reason() {
+        let skip_reason = "(skip list entry) AWS validation not attempted for account 111122223333.";
+
+        let store_dir = tempdir().unwrap();
+        let store = findings_store::FindingsStore::new(store_dir.path().to_path_buf());
+        let reporter = DetailsReporter {
+            datastore: Arc::new(Mutex::new(store)),
+            styles: Styles::new(false),
+            only_valid: false,
+        };
+
+        let (report_match, _blob_path) =
+            sample_report_match(skip_reason, StatusCode::CONTINUE.as_u16(), false);
+        let scan_args = sample_scan_args();
+        let record = reporter.build_finding_record(&report_match, &scan_args);
+
+        let validation = &record.finding.validation;
+        assert_eq!(validation.status, "Not Attempted");
+        assert_eq!(validation.response, skip_reason);
+    }
+
+
use super::build_git_urls;
#[test]
diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs
index 02dc20c..6435c71 100644
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@@ -93,6 +93,14 @@ mod tests {
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
+ // Hugging Face
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
// Gitea
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
@@ -132,6 +140,9 @@ mod tests {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
docker_image: Vec::new(),
// clone / history options
@@ -163,6 +174,8 @@ mod tests {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
}
diff --git a/src/reporter/pretty_format.rs b/src/reporter/pretty_format.rs
index 086648c..ea31fc5 100644
--- a/src/reporter/pretty_format.rs
+++ b/src/reporter/pretty_format.rs
@@ -115,7 +115,7 @@ impl<'a> Display for PrettyFindingRecord<'a> {
} else {
writeln!(f, " |Validation....: {}", finding.validation.status)?;
}
- if finding.validation.status != "Not Attempted" {
+ if !finding.validation.response.is_empty() {
writeln!(f, " |__Response....: {}", style_fn(&finding.validation.response))?;
}
writeln!(f, " |Language......: {}", finding.language)?;
diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs
index a6e0b6a..6d98b34 100644
--- a/src/scanner/mod.rs
+++ b/src/scanner/mod.rs
@@ -3,7 +3,7 @@ pub(crate) use docker::save_docker_images;
pub(crate) use enumerate::enumerate_filesystem_inputs;
pub(crate) use repos::{
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
- enumerate_github_repos,
+ enumerate_github_repos, enumerate_huggingface_repos,
};
pub use runner::{load_and_record_rules, run_async_scan, run_scan};
pub(crate) use validation::run_secret_validation;
diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs
index fb6577a..da00dcc 100644
--- a/src/scanner/repos.rs
+++ b/src/scanner/repos.rs
@@ -17,10 +17,10 @@ use crate::{
commands::{github::GitCloneMode, github::GitHistoryMode, scan},
global,
},
- confluence, findings_store,
+ confluence, findings_store, gcs,
git_binary::{CloneMode, Git},
git_url::GitUrl,
- gitea, github, gitlab, jira,
+ gitea, github, gitlab, huggingface, jira,
matcher::{Match, Matcher, MatcherStats},
origin::{Origin, OriginSet},
rules_database::RulesDatabase,
@@ -305,6 +305,69 @@ pub async fn enumerate_gitea_repos(
Ok(repo_urls)
}
+/// Collects the Git URLs to scan for the Hugging Face inputs given in `args`.
+///
+/// The result starts from the explicitly supplied `git_url` values. When any Hugging
+/// Face specifier is set, every URL returned by `huggingface::enumerate_repo_urls`
+/// that parses as a `GitUrl` is appended; URLs that fail to parse are logged and
+/// skipped. The returned list is sorted and deduplicated.
+///
+/// # Errors
+/// Returns an error when the Hugging Face enumeration itself fails.
+pub async fn enumerate_huggingface_repos(
+    args: &scan::ScanArgs,
+    global_args: &global::GlobalArgs,
+) -> Result<Vec<GitUrl>> {
+    let repo_specifiers = huggingface::RepoSpecifiers {
+        user: args.input_specifier_args.huggingface_user.clone(),
+        organization: args.input_specifier_args.huggingface_organization.clone(),
+        model: args.input_specifier_args.huggingface_model.clone(),
+        dataset: args.input_specifier_args.huggingface_dataset.clone(),
+        space: args.input_specifier_args.huggingface_space.clone(),
+        exclude: args.input_specifier_args.huggingface_exclude.clone(),
+    };
+
+    let mut repo_urls = args.input_specifier_args.git_url.clone();
+    if !repo_specifiers.is_empty() {
+        // Show a spinner only when progress output is enabled; otherwise use a hidden
+        // bar so the calls below stay unconditional.
+        let mut progress = if global_args.use_progress() {
+            let style =
+                ProgressStyle::with_template("{spinner} {msg} {human_len} [{elapsed_precise}]")
+                    .expect("progress bar style template should compile");
+            let pb = ProgressBar::new_spinner()
+                .with_style(style)
+                .with_message("Enumerating Hugging Face repositories...");
+            pb.enable_steady_tick(Duration::from_millis(500));
+            pb
+        } else {
+            ProgressBar::hidden()
+        };
+
+        let mut num_found: u64 = 0;
+        let auth = huggingface::AuthConfig::from_env();
+        let repo_strings = huggingface::enumerate_repo_urls(
+            &repo_specifiers,
+            &auth,
+            global_args.ignore_certs,
+            Some(&mut progress),
+        )
+        .await
+        .context("Failed to enumerate Hugging Face repositories")?;
+
+        for repo_string in repo_strings {
+            match GitUrl::from_str(&repo_string) {
+                Ok(repo_url) => {
+                    repo_urls.push(repo_url);
+                    num_found += 1;
+                }
+                Err(e) => {
+                    // Suspend the spinner so the log line is not overwritten by a redraw.
+                    progress.suspend(|| {
+                        error!("Failed to parse repo URL from {repo_string}: {e}");
+                    });
+                }
+            }
+        }
+
+        progress.finish_with_message(format!(
+            "Found {} repositories from Hugging Face",
+            HumanCount(num_found)
+        ));
+    }
+    repo_urls.sort();
+    repo_urls.dedup();
+    Ok(repo_urls)
+}
+
+
pub async fn enumerate_bitbucket_repos(
args: &scan::ScanArgs,
global_args: &global::GlobalArgs,
@@ -678,3 +741,81 @@ pub async fn fetch_s3_objects(
Ok(())
}
+
+/// Scans the objects of the GCS bucket named by `--gcs-bucket` and records any matches.
+///
+/// Returns `Ok(())` immediately when no bucket is configured. Otherwise every object
+/// delivered by `gcs::visit_bucket_objects` is run through a `BlobProcessor`, and the
+/// resulting matches are written to `datastore` with a `gs://<bucket>/<key>` origin path.
+///
+/// Errors from building the matcher, from processing a blob, or from the bucket visit
+/// are propagated. Panics if the `datastore` mutex is poisoned (`lock().unwrap()`).
+// NOTE(review): the generic arguments in this signature (`&Arc>`, `&Mutex`, `Arc`)
+// look stripped in this copy of the patch; confirm against the real source.
+pub async fn fetch_gcs_objects(
+ args: &scan::ScanArgs,
+ datastore: &Arc>,
+ rules_db: &RulesDatabase,
+ matcher_stats: &Mutex,
+ enable_profiling: bool,
+ shared_profiler: Arc,
+ progress_enabled: bool,
+) -> Result<()> {
+ // Nothing to do unless a bucket was requested.
+ let Some(bucket) = args.input_specifier_args.gcs_bucket.as_deref() else {
+ return Ok(());
+ };
+ let prefix = args.input_specifier_args.gcs_prefix.as_deref();
+ let service_account = args.input_specifier_args.gcs_service_account.as_deref();
+
+ // A fresh scanner pool and seen-blob map are created for this bucket scan.
+ let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+ let seen_blobs = BlobIdMap::new();
+ let matcher = Matcher::new(
+ rules_db,
+ scanner_pool,
+ &seen_blobs,
+ Some(matcher_stats),
+ enable_profiling,
+ Some(shared_profiler.clone()),
+ &args.extra_ignore_comments,
+ args.no_inline_ignore,
+ )?;
+ let mut processor = BlobProcessor { matcher };
+
+ // Spinner that counts visited objects; hidden when progress output is disabled.
+ let progress = if progress_enabled {
+ let style =
+ ProgressStyle::with_template("{spinner} {msg} ({pos} objects) [{elapsed_precise}]")
+ .expect("progress bar style template should compile");
+ let pb = ProgressBar::new_spinner().with_style(style).with_message("Fetching GCS objects");
+ pb.enable_steady_tick(Duration::from_millis(500));
+ pb
+ } else {
+ ProgressBar::hidden()
+ };
+
+ // The callback is a `move` closure, so it gets its own handle to the progress bar;
+ // `progress` itself is still needed after the visit to read the final position.
+ let pb = progress.clone();
+
+ // Owned copy of the bucket name for the `move` closure below.
+ let bucket_name = bucket.to_string();
+
+ gcs::visit_bucket_objects(bucket, prefix, service_account, move |key, bytes| {
+ let origin = OriginSet::new(
+ Origin::from_extended(serde_json::json!({
+ "path": format!("gs://{}/{}", bucket_name, key)
+ })),
+ Vec::new(),
+ );
+ let blob = crate::blob::Blob::from_bytes(bytes);
+
+ // `run` yields `None` when there is nothing to record for this blob.
+ if let Some((origin, blob_md, scored_matches)) =
+ processor.run(origin, blob, args.no_dedup, args.redact, args.no_base64)?
+ {
+ let origin_arc = Arc::new(origin);
+ let blob_arc = Arc::new(blob_md);
+
+ // All matches of one object share the same origin and blob metadata.
+ let mut batch = Vec::with_capacity(scored_matches.len());
+ for (_score, m) in scored_matches {
+ batch.push((origin_arc.clone(), blob_arc.clone(), m));
+ }
+
+ let added = datastore.lock().unwrap().record(batch, !args.no_dedup);
+ debug!("Added {} new GCS blobs", added);
+ }
+ // Count the object whether or not it produced matches.
+ pb.inc(1);
+ Ok(())
+ })
+ .await?;
+
+ let total = progress.position();
+ progress.finish_with_message(format!("Fetched {} GCS objects", total));
+
+ Ok(())
+}
diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs
index 9de4a00..25dae47 100644
--- a/src/scanner/runner.rs
+++ b/src/scanner/runner.rs
@@ -1,4 +1,7 @@
-use std::sync::{Arc, Mutex};
+use std::{
+ fs,
+ sync::{Arc, Mutex},
+};
use anyhow::{bail, Context, Result};
use crossbeam_skiplist::SkipMap;
@@ -21,10 +24,11 @@ use crate::{
safe_list,
scanner::{
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
- enumerate_filesystem_inputs, enumerate_github_repos,
+ enumerate_filesystem_inputs, enumerate_github_repos, enumerate_huggingface_repos,
repos::{
enumerate_gitea_repos, enumerate_gitlab_repos, fetch_confluence_pages,
- fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects, fetch_slack_messages,
+ fetch_gcs_objects, fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects,
+ fetch_slack_messages,
},
run_secret_validation, save_docker_images,
summary::print_scan_summary,
@@ -74,12 +78,14 @@ pub async fn run_async_scan(
let mut repo_urls = enumerate_github_repos(args, global_args).await?;
let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
+ let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
// Combine repository URLs
repo_urls.extend(gitlab_repo_urls);
repo_urls.extend(gitea_repo_urls);
+ repo_urls.extend(huggingface_repo_urls);
repo_urls.extend(bitbucket_repo_urls);
repo_urls.extend(azure_repo_urls);
repo_urls.sort();
@@ -182,8 +188,20 @@ pub async fn run_async_scan(
)
.await?;
- let has_s3 = args.input_specifier_args.s3_bucket.is_some();
- if input_roots.is_empty() && !has_s3 {
+ fetch_gcs_objects(
+ args,
+ &datastore,
+ rules_db,
+ &matcher_stats,
+ enable_profiling,
+ Arc::clone(&shared_profiler),
+ progress_enabled,
+ )
+ .await?;
+
+ let has_remote_objects = args.input_specifier_args.s3_bucket.is_some()
+ || args.input_specifier_args.gcs_bucket.is_some();
+ if input_roots.is_empty() && !has_remote_objects {
bail!("No inputs to scan");
}
@@ -231,6 +249,26 @@ pub async fn run_async_scan(
crate::baseline::apply_baseline(&mut ds, &path, args.manage_baseline, &input_roots)?;
}
+ let mut skip_aws_accounts = args.skip_aws_account.clone();
+
+ if let Some(path) = args.skip_aws_account_file.as_ref() {
+ let contents = fs::read_to_string(path).with_context(|| {
+ format!("Failed to read --skip-aws-account-file {}", path.display())
+ })?;
+
+ for line in contents.lines() {
+ let content = line.split('#').next().unwrap_or("");
+ for value in content.split(|c: char| c.is_ascii_whitespace() || c == ',' || c == ';') {
+ let trimmed = value.trim();
+ if !trimmed.is_empty() {
+ skip_aws_accounts.push(trimmed.to_string());
+ }
+ }
+ }
+ }
+
+ crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
+
// If validation is enabled, run it as a second phase
if !args.no_validate {
info!("Starting secret validation phase...");
diff --git a/src/validation.rs b/src/validation.rs
index 81f97dd..b371bf0 100644
--- a/src/validation.rs
+++ b/src/validation.rs
@@ -95,6 +95,14 @@ pub fn init_validation_caches() {
aws::set_aws_validation_concurrency(15);
}
+/// Installs the user-supplied AWS account IDs whose keys must not be validated live.
+///
+/// Thin public wrapper that forwards `ids` unchanged to
+/// `aws::set_aws_skip_account_ids`, which rebuilds the skip list from them.
+pub fn set_skip_aws_account_ids<I, S>(ids: I)
+where
+    I: IntoIterator<Item = S>,
+    S: Into<String>,
+{
+    aws::set_aws_skip_account_ids(ids);
+}
+
#[derive(Clone)]
pub struct CachedResponse {
pub body: String,
@@ -781,6 +789,26 @@ async fn timed_validate_single_match<'a>(
}
}
+ if let Some(account_id) = aws::should_skip_aws_validation(&akid) {
+ m.validation_success = false;
+ m.validation_response_body = format!(
+ "(skip list entry) AWS validation not attempted for account {}.",
+ account_id
+ );
+ m.validation_response_status = StatusCode::CONTINUE;
+ cache.insert(
+ cache_key,
+ CachedResponse {
+ body: m.validation_response_body.clone(),
+ status: m.validation_response_status,
+ is_valid: m.validation_success,
+ timestamp: Instant::now(),
+ },
+ );
+ commit_and_return(m);
+ return;
+ }
+
if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
m.validation_success = false;
m.validation_response_body = format!("Invalid AWS credentials ({}): {}", akid, e);
diff --git a/src/validation/aws.rs b/src/validation/aws.rs
index 7e1073d..156b32c 100644
--- a/src/validation/aws.rs
+++ b/src/validation/aws.rs
@@ -1,7 +1,7 @@
-use std::time::Duration;
+use std::{collections::HashSet, sync::RwLock, time::Duration};
use anyhow::{anyhow, Result};
-use aws_config::{retry::RetryConfig, BehaviorVersion};
+use aws_config::{retry::RetryConfig, BehaviorVersion, SdkConfig};
use aws_credential_types::Credentials;
use aws_sdk_sts::{
config::Builder as StsConfigBuilder, error::SdkError,
@@ -26,8 +26,9 @@ use http::{
header::{HeaderValue, USER_AGENT},
StatusCode,
};
-use once_cell::sync::OnceCell;
+use once_cell::sync::{Lazy, OnceCell};
use rand::{rng, Rng};
+use regex::Regex;
use tokio::{
sync::Semaphore,
time::{sleep, timeout},
@@ -36,6 +37,57 @@ use tokio::{
use crate::validation::GLOBAL_USER_AGENT;
static AWS_VALIDATION_SEMAPHORE: OnceCell = OnceCell::new();
+/// Account IDs that are always part of the skip list: they seed `AWS_SKIP_ACCOUNT_IDS`
+/// and are re-added whenever `set_aws_skip_account_ids` rebuilds it. Per the commit
+/// message these are known canary (honey-token) AWS accounts.
+const BUILTIN_SKIP_ACCOUNT_IDS: &[&str] = &[
+ "052310077262",
+ "171436882533",
+ "528757803018",
+ "534261010715",
+ "595918472158",
+ "729780141977",
+ "893192397702",
+ "992382622183",
+];
+
+/// Process-wide set of AWS account IDs for which live validation is skipped.
+///
+/// Lazily initialised with the built-in IDs; `set_aws_skip_account_ids` replaces the
+/// contents and `should_skip_aws_validation` reads them.
+static AWS_SKIP_ACCOUNT_IDS: Lazy<RwLock<HashSet<String>>> = Lazy::new(|| {
+    let builtin: HashSet<String> =
+        BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string()).collect();
+    RwLock::new(builtin)
+});
+
+/// Builds the HTTP client handed to the AWS SDK configuration.
+///
+/// The connector uses rustls with the aws-lc crypto mode, adopts the connector settings
+/// and (when available) the sleep implementation supplied by the SDK, and takes its
+/// proxy configuration from the environment.
+fn build_http_client() -> SharedHttpClient {
+    HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
+        let tls_provider = tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc);
+        let mut connector = ConnectorBuilder::default().tls_provider(tls_provider);
+
+        connector.set_connector_settings(settings.cloned());
+        if let Some(components) = runtime_components {
+            connector.set_sleep_impl(components.sleep_impl());
+        }
+        // Honor proxy settings from the environment.
+        connector.set_proxy_config(Some(ProxyConfig::from_env()));
+        connector.build()
+    })
+}
+
+/// Loads the SDK configuration shared by the AWS validation calls.
+///
+/// The configuration is pinned to `us-east-1`, authenticates with the given
+/// `credentials`, uses the client from `build_http_client`, and retries adaptively
+/// with at most three attempts.
+async fn build_base_config(credentials: Credentials) -> SdkConfig {
+    let loader = aws_config::defaults(BehaviorVersion::latest())
+        .region(Region::new("us-east-1"))
+        .credentials_provider(credentials)
+        .http_client(build_http_client())
+        .retry_config(RetryConfig::adaptive().with_max_attempts(3));
+    loader.load().await
+}
+
+/// Extracts a 12-digit AWS account ID from `input`.
+///
+/// Accepts either a bare ID (surrounding whitespace is ignored) or text that embeds
+/// one, such as an IAM ARN. Returns the first run of exactly twelve ASCII digits, or
+/// `None` when there is none.
+fn extract_account_id(input: &str) -> Option<String> {
+    let trimmed = input.trim();
+    // Fast path: the value already is a bare account ID.
+    if trimmed.len() == 12 && trimmed.chars().all(|c| c.is_ascii_digit()) {
+        return Some(trimmed.to_string());
+    }
+
+    // The ID must be delimited by non-digits (or the string ends). An unanchored
+    // `\d{12}` would accept the first twelve digits of a longer number and silently
+    // put an unrelated account on the skip list. `[0-9]` is used instead of `\d`
+    // because `\d` also matches non-ASCII Unicode digits.
+    static ACCOUNT_ID_RE: Lazy<Regex> = Lazy::new(|| {
+        Regex::new(r"(?:^|[^0-9])([0-9]{12})(?:[^0-9]|$)").expect("valid regex")
+    });
+    ACCOUNT_ID_RE.captures(trimmed).and_then(|caps| caps.get(1)).map(|m| m.as_str().to_string())
+}
/// Set the maximum number of concurrent AWS validations. Call before first use.
pub fn set_aws_validation_concurrency(max: usize) {
@@ -46,6 +98,46 @@ fn aws_validation_semaphore() -> &'static Semaphore {
AWS_VALIDATION_SEMAPHORE.get_or_init(|| Semaphore::new(15))
}
+/// Rebuilds the AWS skip list from the built-in account IDs plus `ids`.
+///
+/// Each entry is normalised with `extract_account_id`, so bare IDs and values that
+/// embed an ID (for example ARNs) are both accepted. Blank entries are ignored
+/// silently; entries without a usable account ID are ignored with a warning.
+/// Any previous user-supplied entries are discarded.
+pub fn set_aws_skip_account_ids<I, S>(ids: I)
+where
+    I: IntoIterator<Item = S>,
+    S: Into<String>,
+{
+    // A poisoned lock is recovered rather than propagated: the set is rebuilt from
+    // scratch below, so whatever state the panicking writer left behind is discarded.
+    let mut guard = match AWS_SKIP_ACCOUNT_IDS.write() {
+        Ok(g) => g,
+        Err(poisoned) => poisoned.into_inner(),
+    };
+    guard.clear();
+
+    guard.extend(BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string()));
+
+    for raw in ids {
+        let value: String = raw.into();
+        if value.trim().is_empty() {
+            continue;
+        }
+        match extract_account_id(&value) {
+            Some(normalized) => {
+                guard.insert(normalized);
+            }
+            None => tracing::warn!("Ignoring invalid AWS account ID in skip list: {value}"),
+        }
+    }
+}
+
+
+/// Decides whether live validation must be skipped for `access_key_id`.
+///
+/// Returns the account ID derived from the key when that account is on the skip
+/// list. Returns `None` when the list is empty, when no account ID can be derived
+/// from the key, or when the account is not listed.
+pub fn should_skip_aws_validation(access_key_id: &str) -> Option<String> {
+    // Recover from a poisoned lock, as `set_aws_skip_account_ids` does. Giving up
+    // here would report "do not skip" and trigger live validation of canary keys,
+    // which is exactly what the skip list exists to prevent.
+    let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap_or_else(|poisoned| poisoned.into_inner());
+    if guard.is_empty() {
+        return None;
+    }
+
+    let account = aws_key_to_account_number(access_key_id).ok()?;
+    if guard.contains(&account) {
+        Some(account)
+    } else {
+        None
+    }
+}
+
#[derive(Debug)]
struct UaInterceptor;
@@ -132,29 +224,7 @@ pub async fn validate_aws_credentials(
None, // expiry
"static", // provider name
);
- // Create HTTP client that respects proxy settings from the environment
- let http_client: SharedHttpClient =
- HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
- let mut conn_builder = ConnectorBuilder::default()
- .tls_provider(tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc));
-
- conn_builder.set_connector_settings(settings.cloned());
- if let Some(components) = runtime_components {
- conn_builder.set_sleep_impl(components.sleep_impl());
- }
- conn_builder.set_proxy_config(Some(ProxyConfig::from_env()));
- conn_builder.build()
- });
-
- // Create AWS config with adaptive retries
- let retry_config = RetryConfig::adaptive().with_max_attempts(3);
- let config = aws_config::defaults(BehaviorVersion::latest())
- .region(Region::new("us-east-1"))
- .credentials_provider(credentials)
- .http_client(http_client)
- .retry_config(retry_config)
- .load()
- .await;
+ let config = build_base_config(credentials).await;
// Create STS client
let sts_config = StsConfigBuilder::from(&config).interceptor(UaInterceptor).build();
@@ -230,3 +300,74 @@ pub fn aws_key_to_account_number(aws_key_id: &str) -> Result> = Lazy::new(|| Mutex::new(()));
+
+    /// Entries are trimmed, account IDs are pulled out of ARNs, and entries without
+    /// an account ID are dropped.
+    #[test]
+    fn skip_account_list_normalizes_inputs() {
+        let _lock = TEST_GUARD.lock().unwrap();
+
+        set_aws_skip_account_ids([
+            " 052310077262 ",
+            "arn:aws:iam::171436882533:role/demo",
+            "invalid",
+        ]);
+
+        let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
+        assert!(guard.contains("052310077262"));
+        assert!(guard.contains("171436882533"));
+        // Both valid inputs are built-in IDs, so the set must not have grown.
+        assert_eq!(guard.len(), BUILTIN_SKIP_ACCOUNT_IDS.len());
+        drop(guard);
+
+        // Reset to the built-in list so other tests start from a known state.
+        set_aws_skip_account_ids(Vec::<String>::new());
+    }
+
+    /// A key whose derived account is on the skip list is reported with that account.
+    // NOTE(review): 534261010715 is also a built-in ID, so this does not prove that
+    // user-supplied entries are honoured; consider a key from a non-built-in account.
+    #[test]
+    fn should_skip_when_account_matches() {
+        let _lock = TEST_GUARD.lock().unwrap();
+
+        set_aws_skip_account_ids(["534261010715"]);
+        assert_eq!(
+            should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
+            Some("534261010715".to_string())
+        );
+
+        // Reset to the built-in list so other tests start from a known state.
+        set_aws_skip_account_ids(Vec::<String>::new());
+    }
+
+    /// With no user-supplied entries the built-in account IDs are still skipped.
+    #[test]
+    fn builtin_canary_accounts_are_preseeded() {
+        let _lock = TEST_GUARD.lock().unwrap();
+
+        set_aws_skip_account_ids(Vec::<String>::new());
+        assert_eq!(
+            should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
+            Some("534261010715".to_string())
+        );
+
+        // Reset to the built-in list so other tests start from a known state.
+        set_aws_skip_account_ids(Vec::<String>::new());
+    }
+
+    /// The same account given as a bare ID, inside an ARN, and with padding is stored once.
+    #[test]
+    fn duplicate_accounts_are_deduplicated() {
+        let _lock = TEST_GUARD.lock().unwrap();
+
+        set_aws_skip_account_ids([
+            "534261010715",
+            "arn:aws:iam::534261010715:user/canarytokens",
+            " 534261010715 ",
+        ]);
+
+        let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
+        assert_eq!(guard.iter().filter(|id| id.as_str() == "534261010715").count(), 1);
+        drop(guard);
+
+        // Reset to the built-in list so other tests start from a known state.
+        set_aws_skip_account_ids(Vec::<String>::new());
+    }
+}
diff --git a/src/validation/utils.rs b/src/validation/utils.rs
index ea0b020..9e8422d 100644
--- a/src/validation/utils.rs
+++ b/src/validation/utils.rs
@@ -33,31 +33,77 @@ pub fn find_closest_variable(
target_variable_name: &str,
search_variable_name: &str,
) -> Option {
- // Find positions of the target variable with the target value
+ // Collect the positions of the target variable for the provided value so we can
+ // compare relative offsets with candidate variables.
let mut target_positions = Vec::new();
for (name, value, start, end) in captures {
if name == target_variable_name && value == target_value {
target_positions.push((*start, *end));
}
}
+
if target_positions.is_empty() {
return None;
}
- // For each target position, find the closest search variable
- let mut closest_distance = usize::MAX;
- let mut closest_value: Option = None;
- for (_target_start, target_end) in target_positions {
- for (name, value, start, _) in captures {
- if name == search_variable_name {
- let distance = (*start as isize - target_end as isize).abs() as usize;
- if distance < closest_distance {
- closest_distance = distance;
- closest_value = Some(value.clone());
+
+ // Prefer candidates that appear before the target value (same logical block), but
+ // fall back to overlapping values and then to those that appear after the target
+ // value when no better match exists. This avoids pairing with the next block when
+ // multiple credentials are close together in the same file.
+ let mut best_before: Option<(usize, String)> = None;
+ let mut best_overlap: Option<(usize, String)> = None;
+ let mut best_after: Option<(usize, String)> = None;
+
+ for (target_start, target_end) in target_positions.iter().copied() {
+ for (name, value, start, end) in captures {
+ if name != search_variable_name {
+ continue;
+ }
+
+ if *end <= target_start {
+ // Candidate is before the target; choose the one closest to the target start.
+ let distance = target_start - *end;
+ match &mut best_before {
+ Some((best_distance, best_value)) if distance < *best_distance => {
+ *best_distance = distance;
+ *best_value = value.clone();
+ }
+ None => {
+ best_before = Some((distance, value.clone()));
+ }
+ _ => {}
+ }
+ } else if *start >= target_end {
+ // Candidate is after the target; choose the one closest to the target end.
+ let distance = *start - target_end;
+ match &mut best_after {
+ Some((best_distance, best_value)) if distance < *best_distance => {
+ *best_distance = distance;
+ *best_value = value.clone();
+ }
+ None => {
+ best_after = Some((distance, value.clone()));
+ }
+ _ => {}
+ }
+ } else {
+ // Candidate overlaps the target – treat as an exact match.
+ let distance = 0usize;
+ match &mut best_overlap {
+ Some((best_distance, best_value)) if distance < *best_distance => {
+ *best_distance = distance;
+ *best_value = value.clone();
+ }
+ None => {
+ best_overlap = Some((distance, value.clone()));
+ }
+ _ => {}
}
}
}
}
- closest_value
+
+ best_before.or(best_overlap).or(best_after).map(|(_, value)| value)
}
pub async fn check_url_resolvable(url: &Url) -> Result<(), Box> {
@@ -147,4 +193,31 @@ mod tests {
]
);
}
+
+    /// A candidate located before the target wins over one located after it.
+    #[test]
+    fn prefers_closest_preceding_variable() {
+        let token = ("TOKEN".to_string(), "secret".to_string(), 75usize, 115usize);
+        let akid_before = ("AKID".to_string(), "preceding".to_string(), 30usize, 50usize);
+        let akid_after = ("AKID".to_string(), "following".to_string(), 180usize, 200usize);
+        let captures = vec![token, akid_before, akid_after];
+
+        let target_value = "secret".to_string();
+        let closest =
+            find_closest_variable(&captures, &target_value, "TOKEN", "AKID").unwrap();
+
+        assert_eq!(closest, "preceding".to_string());
+    }
+
+    /// With no candidate before the target, the one after it is returned.
+    #[test]
+    fn falls_back_to_following_when_no_preceding() {
+        let token = ("TOKEN".to_string(), "secret".to_string(), 10usize, 50usize);
+        let akid_after = ("AKID".to_string(), "after".to_string(), 60usize, 80usize);
+        let captures = vec![token, akid_after];
+
+        let target_value = "secret".to_string();
+        let closest =
+            find_closest_variable(&captures, &target_value, "TOKEN", "AKID").unwrap();
+
+        assert_eq!(closest, "after".to_string());
+    }
}
diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs
index 3947548..54379a3 100644
--- a/tests/int_allowlist.rs
+++ b/tests/int_allowlist.rs
@@ -72,6 +72,12 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result, skip_skipword: Vec) -> Result, skip_skipword: Vec) -> Result Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -102,6 +109,9 @@ fn test_bitbucket_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@@ -131,6 +141,8 @@ fn test_bitbucket_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs
index e86c589..2f1998c 100644
--- a/tests/int_dedup.rs
+++ b/tests/int_dedup.rs
@@ -85,6 +85,13 @@ rules:
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -120,6 +127,9 @@ rules:
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@@ -151,6 +161,8 @@ rules:
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_github.rs b/tests/int_github.rs
index dcf936a..82a0f78 100644
--- a/tests/int_github.rs
+++ b/tests/int_github.rs
@@ -72,6 +72,13 @@ fn test_github_remote_scan() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -107,6 +114,9 @@ fn test_github_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@@ -138,6 +148,8 @@ fn test_github_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs
index 7c2e0b9..745f323 100644
--- a/tests/int_gitlab.rs
+++ b/tests/int_gitlab.rs
@@ -71,6 +71,13 @@ fn test_gitlab_remote_scan() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -106,6 +113,9 @@ fn test_gitlab_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
@@ -137,6 +147,8 @@ fn test_gitlab_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};
@@ -210,6 +222,13 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -252,6 +271,9 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
scan_nested_repos: true,
since_commit: None,
branch: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
@@ -273,6 +295,8 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_redact.rs b/tests/int_redact.rs
index 7d7accb..a570cfe 100644
--- a/tests/int_redact.rs
+++ b/tests/int_redact.rs
@@ -55,6 +55,12 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -86,6 +92,9 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@@ -115,6 +124,8 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_slack.rs b/tests/int_slack.rs
index 5f2d01a..dd9df68 100644
--- a/tests/int_slack.rs
+++ b/tests/int_slack.rs
@@ -62,6 +62,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -92,6 +99,9 @@ impl TestContext {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
max_results: 10,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
@@ -123,6 +133,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};
@@ -186,6 +198,13 @@ async fn test_scan_slack_messages() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -218,6 +237,9 @@ async fn test_scan_slack_messages() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@@ -247,6 +269,8 @@ async fn test_scan_slack_messages() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs
index 74cf6e4..ba81576 100644
--- a/tests/int_validation_cache.rs
+++ b/tests/int_validation_cache.rs
@@ -128,6 +128,13 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -163,6 +170,9 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@@ -194,6 +204,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs
index 464377a..5dec6b5 100644
--- a/tests/int_vulnerable_files.rs
+++ b/tests/int_vulnerable_files.rs
@@ -71,6 +71,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -106,6 +113,9 @@ impl TestContext {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@@ -137,6 +147,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
@@ -183,6 +195,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -228,6 +247,10 @@ impl TestContext {
scan_nested_repos: true,
since_commit: None,
branch: None,
+
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
},
extra_ignore_comments: Vec::new(),
content_filtering_args: ContentFilteringArgs {
@@ -250,6 +273,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};
From a6824aa753095610c0cef7a26344beb473b41ad2 Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Wed, 15 Oct 2025 23:01:28 -0700
Subject: [PATCH 2/8] updated ci to use Rust 1.90
---
.github/workflows/ci.yml | 7 +++++--
.github/workflows/release.yml | 12 +++++++-----
2 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 27e1f98..beeec7f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -5,6 +5,9 @@ on:
branches:
- main
+env:
+ RUST_TOOLCHAIN: "1.90"
+
# This workflow runs on pull requests to the main branch
# It builds the project for 2 platforms, Linux arm64 and macOS arm64,
# and runs tests for each platform. All platforms tested on merge to main
@@ -16,7 +19,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
- uses: swatinem/rust-cache@v2
@@ -34,7 +37,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
- uses: swatinem/rust-cache@v2
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index abe38f9..4debe1d 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -4,6 +4,8 @@ on:
push:
branches:
- main
+env:
+ RUST_TOOLCHAIN: "1.90"
jobs:
# ──────────────── Linux (via Makefile) ────────────────
@@ -15,7 +17,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@@ -69,7 +71,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@@ -123,7 +125,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@@ -154,7 +156,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@@ -185,7 +187,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
From a04741c75619374203ee0ddf1c31258ec4f9a09b Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Thu, 16 Oct 2025 10:09:06 -0700
Subject: [PATCH 3/8] updated dependencies
---
Cargo.toml | 6 ++++--
Makefile | 4 ++--
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index b7144c8..1eb11b5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -122,7 +122,6 @@ reqwest-middleware = "0.4.2"
tracing-subscriber = {version = "0.3.19", features = ["env-filter"] }
tracing-core = "0.1.34"
tree-sitter = "0.25.8"
-aws-smithy-runtime = "1.9.1"
aws-smithy-http-client = "1.1.1"
aws-smithy-runtime-api = "1.9.0"
aws-smithy-types = "1.3.2"
@@ -192,7 +191,10 @@ walkdir = "2.5.0"
p256 = "0.13.2"
ed25519-dalek = { version = "2.2", features = ["pkcs8"] }
aws-sdk-s3 = "1.100.0"
-gcloud-storage = "1.1.1"
+gcloud-storage = { version = "1.1.1", default-features = false, features = [
+ "rustls-tls",
+ "auth",
+] }
tokei = "12.1.2"
[target.'cfg(not(windows))'.dependencies]
diff --git a/Makefile b/Makefile
index 10b6f43..0325d64 100644
--- a/Makefile
+++ b/Makefile
@@ -256,7 +256,7 @@ linux-x64: check-docker create-dockerignore
patch perl ragel && \
git openssl-dev curl && \
\
- cargo test --workspace --all-targets --release ; \
+ cargo test --workspace --all-targets ; \
\
rustup target add x86_64-unknown-linux-musl && \
\
@@ -287,7 +287,7 @@ linux-arm64: check-docker create-dockerignore
\
rustup target add aarch64-unknown-linux-musl && \
\
- cargo test --workspace --all-targets --release ; \
+ cargo test --workspace --all-targets ; \
\
export PKG_CONFIG_ALLOW_CROSS=1 ; \
export RUSTFLAGS="-C target-feature=+crt-static" ; \
From f65b0b2c226ce8e1488381cc4830c19f89e9cf74 Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Thu, 16 Oct 2025 10:20:18 -0700
Subject: [PATCH 4/8] change in response to code review
---
src/gcs.rs | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/src/gcs.rs b/src/gcs.rs
index b93caf3..dc346fd 100644
--- a/src/gcs.rs
+++ b/src/gcs.rs
@@ -2,7 +2,7 @@ use std::path::Path;
use anyhow::{Context, Result};
use gcloud_storage::{
- client::{Client, ClientConfig},
+ client::{google_cloud_auth::credentials::CredentialsFile, Client, ClientConfig},
http::objects::{
download::Range,
get::GetObjectRequest,
@@ -25,15 +25,24 @@ pub async fn visit_bucket_objects(
where
F: FnMut(String, Vec) -> Result<()>,
{
- if let Some(path) = service_account_path {
- std::env::set_var("GOOGLE_APPLICATION_CREDENTIALS", path);
- }
+ let config_result = if let Some(path) = service_account_path {
+ let credentials = CredentialsFile::new_from_file(path.to_string_lossy().into_owned())
+ .await
+ .with_context(|| {
+ format!("Failed to read GCS service account credentials from {}", path.display())
+ })?;
- let config = match ClientConfig::default().with_auth().await {
+ ClientConfig::default().with_credentials(credentials).await
+ } else {
+ ClientConfig::default().with_auth().await
+ };
+
+ let config = match config_result {
Ok(config) => config,
Err(err) => {
if service_account_path.is_some()
|| std::env::var("GOOGLE_APPLICATION_CREDENTIALS").is_ok()
+ || std::env::var("GOOGLE_APPLICATION_CREDENTIALS_JSON").is_ok()
{
return Err(err)
.context("Failed to authenticate with GCS using provided credentials");
From 6525c48e300b2bfd5fabdfc1f9d903667f6af4d3 Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Thu, 16 Oct 2025 10:50:07 -0700
Subject: [PATCH 5/8] change in response to code review
---
README.md | 50 +++++++++++++++++++++++++++++++++++---------------
1 file changed, 35 insertions(+), 15 deletions(-)
diff --git a/README.md b/README.md
index 56b85be..4eac839 100644
--- a/README.md
+++ b/README.md
@@ -135,7 +135,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Notable Scan Options](#notable-scan-options)
- [Understanding `--confidence`](#understanding---confidence)
- [Ignore known false positives](#ignore-known-false-positives)
- - [Skip Canary Tokens](#skip-canary-tokens)
+ - [Skip Canary Tokens (AWS)](#skip-canary-tokens-aws)
- [Common CLI flows](#common-cli-flows)
- [Inline ignore directives](#inline-ignore-directives)
- [Finding Fingerprint](#finding-fingerprint)
@@ -1110,42 +1110,62 @@ kingfisher scan \
If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early.
-### Skip Canary Tokens
+### Skip Canary Tokens (AWS)
-Canary tokens are intentionally-leaked credentials defenders sprinkle across infrastructure to catch adversaries. Research has highlighted that static identification of AWS canary tokens is now table stakes for both attackers avoiding noisy traps and responders running hygiene scans. Kingfisher ships with the same awareness so you can clean environments without detonating the tripwires that many canary tokens deploy.
+Canary/honey tokens are intentionally leaked credentials used to catch misuse. Kingfisher can **recognize and skip** known AWS canary accounts so hygiene scans don’t set off alerts.
-To avoid triggering AWS honey tokens, provide `--skip-aws-account` with the 12-digit account numbers associated with your canaries (you can pass a comma-separated list or use `--skip-aws-account-file` to read from disk—blank lines and `#` comments are ignored). Kingfisher pre-seeds its skip list with a number of (but not every) Thinkst Canary AWS account IDs that back canarytokens.org, so routine scans bypass them automatically.
+**How to skip**
+Pass the 12-digit AWS account IDs for your canaries via `--skip-aws-account` (comma-separated) or `--skip-aws-account-file` (one ID per line; blank lines and `#` comments allowed). Kingfisher also ships with a **pre-seeded (but not exhaustive)** list of Thinkst Canary account IDs used by canarytokens.org, so many are skipped automatically.
```bash
-kingfisher scan /path/to/file \
+kingfisher scan /path/to/code \
--skip-aws-account "171436882533,534261010715"
+# or combine preloaded canary IDs with a just-created decoy account
+printf '999900001111\n534261010715\n' > /tmp/canary_accounts.txt
+
+kingfisher scan /path/to/repo \
+ --skip-aws-account-file /tmp/canary_accounts.txt
+
```
-By default, any finding tied to a skip-listed account is marked as **Not Attempted** and the validation response explains that AWS verification was not performed. This keeps routine hygiene scans from triggering alerts in production telemetry while making it clear that the credential's status still needs manual verification if required.
+**What you’ll see**
+Findings tied to a skip-listed account report Validation: Not Attempted and note that the entry came from the skip list:
+
+```bash
+AWS SECRET ACCESS KEY => [KINGFISHER.AWS.2]
+ |Finding.......:
+ |Fingerprint...: 2141074333616819500
+ |Confidence....: medium
+ |Entropy.......: 5.00
+ |Validation....: Not Attempted
+ |__Response....: (skip list entry) AWS validation not attempted for account 171436882533.
+ |Language......: Unknown
+ |Line Num......: 21
+ |Path..........: /Users/mickg/dev/ghcanary/new-canaries.log
+```
+
+**Why this matters**
+Skipping keeps hygiene scans from tripping canary alerts in production telemetry while keeping the status explicit—“Not Attempted” isn’t a pass. If needed, verify these credentials out-of-band or with a safe, non-triggering method.
+
#### Common CLI flows
```bash
# Skip a few in-house canaries during a filesystem scan
kingfisher scan repo/ \
- --skip-aws-account 111122223333,444455556666
+ --skip-aws-account "111122223333,444455556666"
-# Read a longer list from disk while also ignoring test fixtures
+# Read a longer list from disk
kingfisher scan repo/ \
- --skip-aws-account-file /tmp/scripts/canary_accounts.txt \
- --skip-word fixture
+ --skip-aws-account-file /tmp/scripts/canary_accounts.txt
# Combine preloaded canary IDs with a just-created decoy account
-printf '999900001111\n' > /tmp/new_canary.txt
+printf '999900001111\n534261010715\n' > /tmp/new_canary.txt
kingfisher scan /path/to/repo \
--skip-aws-account-file /tmp/new_canary.txt
-# Validate only newly introduced secrets in CI without firing alerts
-kingfisher scan --git-diff origin/main \
- --skip-aws-account-file .ci/canaries.txt \
- --exit-on-validation
```
Tip: if you manage multiple canary fleets (Thinkst, self-hosted alternatives, or bespoke decoys), checkpoint the account IDs alongside your infrastructure-as-code so security teams can rotate or expand the skip list without editing pipelines.
From be14f632b2867a54c5921d81f387486cec30f0ef Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Thu, 16 Oct 2025 10:50:37 -0700
Subject: [PATCH 6/8] change in response to code review
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 4eac839..f034c69 100644
--- a/README.md
+++ b/README.md
@@ -1130,7 +1130,7 @@ kingfisher scan /path/to/repo \
```
**What you’ll see**
-Findings tied to a skip-listed account report Validation: Not Attempted and note that the entry came from the skip list:
+Findings tied to a skip-listed account report `Validation: Not Attempted`, and the `Response` field notes that the entry came from the skip list:
```bash
AWS SECRET ACCESS KEY => [KINGFISHER.AWS.2]
From 63fa008992fd1a99f92828bea3396cc12386ee89 Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Thu, 16 Oct 2025 10:52:33 -0700
Subject: [PATCH 7/8] change in response to code review
---
docs/assets/icons/gcs.svg | 38 ++++++++++++++++++++++++++++++++++++++
1 file changed, 38 insertions(+)
create mode 100644 docs/assets/icons/gcs.svg
diff --git a/docs/assets/icons/gcs.svg b/docs/assets/icons/gcs.svg
new file mode 100644
index 0000000..842c121
--- /dev/null
+++ b/docs/assets/icons/gcs.svg
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
From 03e218d7f9bc12dc88c8a5451e1293711cd8a4cb Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Thu, 16 Oct 2025 10:54:48 -0700
Subject: [PATCH 8/8] change in response to code review
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index f034c69..bd07bc7 100644
--- a/README.md
+++ b/README.md
@@ -1142,7 +1142,7 @@ AWS SECRET ACCESS KEY => [KINGFISHER.AWS.2]
|__Response....: (skip list entry) AWS validation not attempted for account 171436882533.
|Language......: Unknown
|Line Num......: 21
- |Path..........: /Users/mickg/dev/ghcanary/new-canaries.log
+ |Path..........: /tmp/test_canary_accounts.log
```
**Why this matters**