diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 27e1f98..beeec7f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -5,6 +5,9 @@ on:
branches:
- main
+env:
+ RUST_TOOLCHAIN: "1.90"
+
# This workflow runs on pull requests to the main branch
# It builds the project for 2 platforms, Linux arm64 and macOS arm64,
# and runs tests for each platform. All platforms tested on merge to main
@@ -16,7 +19,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
- uses: swatinem/rust-cache@v2
@@ -34,7 +37,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
- uses: swatinem/rust-cache@v2
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index abe38f9..4debe1d 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -4,6 +4,8 @@ on:
push:
branches:
- main
+env:
+ RUST_TOOLCHAIN: "1.90"
jobs:
# ──────────────── Linux (via Makefile) ────────────────
@@ -15,7 +17,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@@ -69,7 +71,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@@ -123,7 +125,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@@ -154,7 +156,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@@ -185,7 +187,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rs/toolchain@v1
with:
- toolchain: 1.88.0
+ toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1be8150..0f7df0d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
All notable changes to this project will be documented in this file.
+## [v1.58.0]
+- Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans.
+- Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs.
+- Added support for scanning Google Cloud Storage buckets via `--gcs-bucket`, including optional prefixes and service-account authentication.
+- Added `--skip-aws-account` (now accepting comma-separated values) and `--skip-aws-account-file` to bypass live AWS validation for known canary/honey-token account IDs without triggering alerts. Kingfisher now ships with several canary AWS account IDs pre-seeded in the skip list and now reports matching findings as "Not Attempted" with the "Response" containing "(skip list entry)" so it's clear that validation was intentionally skipped and why.
+
## [v1.57.0]
- Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling
- Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported
diff --git a/Cargo.toml b/Cargo.toml
index 852194d..1eb11b5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[workspace.package]
edition = "2021"
-rust-version = "1.88"
+rust-version = "1.90"
license = "Apache-2.0"
authors = ["Mick Grove "]
homepage = "https://github.com/mongodb/kingfisher"
@@ -10,7 +10,7 @@ publish = false
[package]
name = "kingfisher"
-version = "1.57.0"
+version = "1.58.0"
description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
edition.workspace = true
rust-version.workspace = true
@@ -122,7 +122,6 @@ reqwest-middleware = "0.4.2"
tracing-subscriber = {version = "0.3.19", features = ["env-filter"] }
tracing-core = "0.1.34"
tree-sitter = "0.25.8"
-aws-smithy-runtime = "1.9.1"
aws-smithy-http-client = "1.1.1"
aws-smithy-runtime-api = "1.9.0"
aws-smithy-types = "1.3.2"
@@ -192,6 +191,10 @@ walkdir = "2.5.0"
p256 = "0.13.2"
ed25519-dalek = { version = "2.2", features = ["pkcs8"] }
aws-sdk-s3 = "1.100.0"
+gcloud-storage = { version = "1.1.1", default-features = false, features = [
+ "rustls-tls",
+ "auth",
+] }
tokei = "12.1.2"
[target.'cfg(not(windows))'.dependencies]
diff --git a/Makefile b/Makefile
index 14a4c6a..0325d64 100644
--- a/Makefile
+++ b/Makefile
@@ -110,11 +110,11 @@ setup-zig:
ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist
@echo "Checking Rust toolchain…"
@$(MAKE) check-rust || { \
- echo "🦀 Installing Rust 1.88.0 …"; \
+ echo "🦀 Installing Rust 1.90.0 …"; \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
. $$HOME/.cargo/env; \
- rustup toolchain install 1.88.0; \
- rustup default 1.88.0; \
+ rustup toolchain install 1.90.0; \
+ rustup default 1.90.0; \
}
@echo "📦 Installing build dependencies (musl, cmake, etc.)…"
@@ -150,11 +150,11 @@ ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist
ubuntu-arm64: setup-zig # ensures Zig & cargo-zigbuild exist
@echo "Checking Rust toolchain…"
@$(MAKE) check-rust || { \
- echo "🦀 Installing Rust 1.88.0 …"; \
+ echo "🦀 Installing Rust 1.90.0 …"; \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
. $$HOME/.cargo/env; \
- rustup toolchain install 1.88.0; \
- rustup default 1.88.0; \
+ rustup toolchain install 1.90.0; \
+ rustup default 1.90.0; \
}
@echo "📦 Installing build dependencies (musl, cmake, etc.)…"
@@ -245,7 +245,7 @@ endif
linux-x64: check-docker create-dockerignore
@mkdir -p target/release
docker run --platform linux/amd64 --rm \
- -v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\
+ -v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\
apk add --no-cache \
musl-dev \
gcc g++ make cmake pkgconfig \
@@ -256,7 +256,7 @@ linux-x64: check-docker create-dockerignore
patch perl ragel && \
git openssl-dev curl && \
\
- cargo test --workspace --all-targets --release ; \
+ cargo test --workspace --all-targets ; \
\
rustup target add x86_64-unknown-linux-musl && \
\
@@ -274,7 +274,7 @@ linux-x64: check-docker create-dockerignore
linux-arm64: check-docker create-dockerignore
@mkdir -p target/release
docker run --platform linux/arm64 --rm \
- -v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\
+ -v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\
apk add --no-cache \
musl-dev \
gcc g++ make cmake pkgconfig \
@@ -287,7 +287,7 @@ linux-arm64: check-docker create-dockerignore
\
rustup target add aarch64-unknown-linux-musl && \
\
- cargo test --workspace --all-targets --release ; \
+ cargo test --workspace --all-targets ; \
\
export PKG_CONFIG_ALLOW_CROSS=1 ; \
export RUSTFLAGS="-C target-feature=+crt-static" ; \
@@ -385,7 +385,7 @@ check-rust:
echo "Rust not found."; \
exit 1; \
fi; \
- required=1.88.0; \
+ required=1.90.0; \
if [ $$(printf '%s\n' "$$required" "$$version" | sort -V | head -n1) != "$$required" ]; then \
echo "Rust version $$version is older than required $$required."; \
exit 1; \
diff --git a/README.md b/README.md
index d77d01e..bd07bc7 100644
--- a/README.md
+++ b/README.md
@@ -5,22 +5,23 @@
[](https://opensource.org/licenses/Apache-2.0)
-Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production
+Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s SIMD accelerated regex engine (Hyperscan) with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production
-Originally forked from Praetorian’s Nosey Parker, Kingfisher **adds** live cloud-API validation; many more targets (GitLab, BitBucket, Gitea, S3, Docker, Jira, Confluence, Slack); compressed-file extraction and scanning; baseline and allowlist controls; language-aware detection (~20 languages); and a native Windows binary. See [Origins and Divergence](#origins-and-divergence) for details.
+Originally forked from Praetorian’s Nosey Parker, Kingfisher has since significantly expanded and diverged, adding live validation, 10+ new scan targets, and major architectural enhancements. See [Origins and Divergence](#origins-and-divergence) for details.
+
## Key Features
### Multiple Scan Targets
-| Files / Dirs | Local Git | GitHub | GitLab | Azure DevOps | Bitbucket | Gitea |
-|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|
-|
Files / Dirs |
Local Git |
GitHub |
GitLab |
Azure DevOps |
Bitbucket |
Gitea |
+| Files / Dirs | Local Git | GitHub | GitLab | Azure Repos | Bitbucket | Gitea | Hugging Face |
+|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|:-------------:|
+|
Files / Dirs |
Local Git |
GitHub |
GitLab |
Azure Repos |
Bitbucket |
Gitea |
Hugging Face |
-| Docker | Jira | Confluence | Slack | AWS S3 |
-|:------:|:----:|:-----------:|:-----:|:------:|
-|
Docker |
Jira |
Confluence |
Slack |
AWS S3 |
+| Docker | Jira | Confluence | Slack | AWS S3 | Google Cloud |
+|:------:|:----:|:-----------:|:-----:|:------:|:---:|
+|
Docker |
Jira |
Confluence |
Slack |
AWS S3 |
Cloud Storage |
@@ -41,17 +42,27 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
+# Table of Contents
+
+
+
- [Kingfisher](#kingfisher)
- [Key Features](#key-features)
- [Multiple Scan Targets](#multiple-scan-targets)
- [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules)
- [Benchmark Results](#benchmark-results)
+- [Table of Contents](#table-of-contents)
- [Getting Started](#getting-started)
- [Installation](#installation)
- - [Run Kingfisher in Docker](#run-kingfisher-in-docker)
+ - [Pre-built Releases](#pre-built-releases)
+ - [Homebrew](#homebrew)
+ - [Linux and macOS](#linux-and-macos)
+ - [Windows](#windows)
+ - [Compile](#compile)
+ - [ Run Kingfisher in Docker](#-run-kingfisher-in-docker)
- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance)
- - [Write Custom Rules!](#write-custom-rules)
-- [Usage](#usage)
+ - [📝 Write Custom Rules!](#-write-custom-rules)
+- [🎉 Usage](#-usage)
- [Basic Examples](#basic-examples)
- [Scan with secret validation](#scan-with-secret-validation)
- [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories)
@@ -67,6 +78,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Exclude specific paths](#exclude-specific-paths)
- [Scan changes in CI pipelines](#scan-changes-in-ci-pipelines)
- [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket)
+ - [ Scanning a Google Cloud Storage Bucket](#-scanning-a-google-cloud-storage-bucket)
- [ Scanning Docker Images](#-scanning-docker-images)
- [ Scanning GitHub](#-scanning-github)
- [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token)
@@ -79,8 +91,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url)
- [List GitLab repositories](#list-gitlab-repositories)
- [ Scanning Azure Repos](#-scanning-azure-repos)
- - [Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-devops-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat)
- - [Scan specific Azure DevOps projects](#scan-specific-azure-devops-projects)
+ - [Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-repos-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat)
+ - [Scan specific Azure Repos projects](#scan-specific-azure-repos-projects)
- [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration)
- [List Azure repositories](#list-azure-repositories)
- [ Scanning Gitea](#-scanning-gitea)
@@ -97,6 +109,12 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [List Bitbucket repositories](#list-bitbucket-repositories)
- [Authenticate to Bitbucket](#authenticate-to-bitbucket)
- [Self-hosted Bitbucket Server](#self-hosted-bitbucket-server)
+ - [ Scanning Hugging Face](#-scanning-hugging-face)
+ - [Scan Hugging Face user](#scan-hugging-face-user)
+ - [Scan Hugging Face organization](#scan-hugging-face-organization)
+ - [Scan specific Hugging Face resources](#scan-specific-hugging-face-resources)
+ - [List Hugging Face repositories](#list-hugging-face-repositories)
+ - [Authenticate to Hugging Face](#authenticate-to-hugging-face)
- [ Scanning Jira](#-scanning-jira)
- [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query)
- [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues)
@@ -107,7 +125,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Environment Variables for Tokens](#environment-variables-for-tokens)
- [Exit Codes](#exit-codes)
- [Update Checks](#update-checks)
-- [Advanced Options](#advanced-options)
+- [🤓 Advanced Options](#-advanced-options)
- [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets)
- [List Builtin Rules](#list-builtin-rules)
- [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run)
@@ -117,6 +135,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Notable Scan Options](#notable-scan-options)
- [Understanding `--confidence`](#understanding---confidence)
- [Ignore known false positives](#ignore-known-false-positives)
+ - [Skip Canary Tokens (AWS)](#skip-canary-tokens-aws)
+ - [Common CLI flows](#common-cli-flows)
- [Inline ignore directives](#inline-ignore-directives)
- [Finding Fingerprint](#finding-fingerprint)
- [Rule Performance Profiling](#rule-performance-profiling)
@@ -125,36 +145,58 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Roadmap](#roadmap)
- [License](#license)
+
+
+
# Getting Started
## Installation
+### Pre-built Releases
+Pre-built binaries are available from the [Releases](https://github.com/mongodb/kingfisher/releases) section.
+
+### Homebrew
-On macOS, you can simply
```bash
brew install kingfisher
```
-Pre-built binaries are also available on the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page.
+### Linux and macOS
-You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform:
+
+
+You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform.
```bash
# Linux, macOS
curl --silent --location \
https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \
sh && \
- ubi --project mongodb/kingfisher --in "$HOME/bin"
+ ubi --project mongodb/kingfisher --in "$HOME/.local/bin"
```
+This installs and runs `ubi` and then places the `kingfisher` executable in `~/.local/bin` on Unix-like systems.
+
+
+
+### Windows
+
+
+
+You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform.
+
```powershell
# Windows
powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in .
```
-This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like
-systems (or the current directory on Windows).
+This installs and runs `ubi` and then places the `kingfisher` executable in the current directory on Windows.
+
-Or you may compile for your platform via `make`:
+
+### Compile
+You may compile for your platform via `make`
+
+
```bash
# NOTE: Requires Docker
@@ -174,9 +216,14 @@ make darwin-all # builds both x64 and arm64
make all # builds for every OS and architecture supported
```
-### Run Kingfisher in Docker
+
+
+### Run Kingfisher in Docker
+
+Run the dockerized Kingfisher container
+
+
-Run the dockerized Kingfisher container:
```bash
# GitHub Container Registry
docker run --rm ghcr.io/mongodb/kingfisher:latest --version
@@ -233,6 +280,8 @@ docker run --rm \
```
+
+
# 🔐 Detection Rules at a Glance
Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview:
@@ -248,7 +297,7 @@ Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything fr
| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more |
| **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more |
-## Write Custom Rules!
+## 📝 Write Custom Rules!
Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential.
@@ -258,7 +307,7 @@ First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom King
Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required!
-# Usage
+# 🎉 Usage
## Basic Examples
@@ -400,7 +449,7 @@ kingfisher scan ./my-project \
-v
```
-## Scanning an AWS S3 Bucket
+## Scanning an AWS S3 Bucket
You can scan S3 objects directly:
```bash
@@ -415,7 +464,7 @@ Credential resolution happens in this order:
If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role.
-Examples:
+Examples
```bash
# using explicit keys
@@ -452,7 +501,30 @@ docker run --rm \
scan --s3-bucket bucket-name
```
-## Scanning Docker Images
+## Scanning a Google Cloud Storage Bucket
+
+The `--gcs-bucket` flag streams objects directly from Google Cloud Storage. Authentication uses
+Application Default Credentials, so you can provide a service-account JSON file via the
+`GOOGLE_APPLICATION_CREDENTIALS` environment variable or by passing `--gcs-service-account`. Public
+buckets work without credentials.
+
+```bash
+kingfisher scan --gcs-bucket bucket-name
+
+# scan a sub-tree inside the bucket
+kingfisher scan --gcs-bucket bucket-name --gcs-prefix path/to/data/
+
+# supply a service-account key explicitly
+kingfisher scan --gcs-bucket bucket-name --gcs-service-account /path/to/key.json
+```
+
+Functional example:
+```bash
+kingfisher scan --gcs-bucket cloud-samples-data --gcs-prefix "storage/"
+```
+
+
+## Scanning Docker Images
Kingfisher will first try to use any locally available image, then fall back to pulling via OCI.
@@ -482,7 +554,7 @@ kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws
kingfisher scan --docker-image private.registry.example.com/my-image:tag
```
-## Scanning GitHub
+## Scanning GitHub
### Scan GitHub organization (requires `KF_GITHUB_TOKEN`)
@@ -524,7 +596,7 @@ KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/priva
---
-## Scanning GitLab
+## Scanning GitLab
### Scan GitLab group (requires `KF_GITLAB_TOKEN`)
@@ -580,18 +652,18 @@ kingfisher gitlab repos list --group my-group --include-subgroups
# skip specific projects when listing or scanning (supports glob patterns)
kingfisher gitlab repos list --group my-group --gitlab-exclude my-group/**/legacy-*
```
-## Scanning Azure Repos
+## Scanning Azure Repos
-### Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)
+### Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)
```bash
kingfisher scan --azure-organization my-org
-# Azure DevOps Server example
+# Azure Repos Server example
KF_AZURE_PAT="pat" kingfisher scan --azure-organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/
```
-### Scan specific Azure DevOps projects
+### Scan specific Azure Repos projects
Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects.
@@ -621,7 +693,7 @@ kingfisher azure repos list --project my-org/app --project my-org/api
# skip specific repositories while listing (supports glob patterns)
kingfisher azure repos list --organization my-org --azure-exclude my-org/**/experimental-*
```
-## Scanning Gitea
+## Scanning Gitea
### Scan Gitea organization (requires `KF_GITEA_TOKEN`)
@@ -673,7 +745,7 @@ KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --all-gitea-organizations
# self-hosted example
KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/
```
-## Scanning Bitbucket
+## Scanning Bitbucket
### Scan Bitbucket workspace
```bash
@@ -745,7 +817,45 @@ Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, fo
`https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with
`--bitbucket-username` and `--bitbucket-token`, and pass `--ignore-certs` when
connecting to HTTP or otherwise insecure instances.
-## Scanning Jira
+## Scanning Hugging Face
+
+Hugging Face hosts git repositories for models, datasets, and Spaces. Kingfisher can enumerate and scan all three resource types.
+
+### Scan Hugging Face user
+
+```bash
+kingfisher scan --huggingface-user
+```
+
+### Scan Hugging Face organization
+
+```bash
+kingfisher scan --huggingface-organization
+```
+
+### Scan specific Hugging Face resources
+
+Scan individual repositories by ID (owner/name) or by passing the full HTTPS URL:
+
+```bash
+kingfisher scan --huggingface-model
+kingfisher scan --huggingface-dataset https://huggingface.co/datasets//
+kingfisher scan --huggingface-space
+```
+
+Use `--huggingface-exclude` to omit results returned by user or organization enumeration. Prefix values with `model:`, `dataset:`, or `space:` when you only want to skip a specific resource type.
+
+### List Hugging Face repositories
+
+```bash
+kingfisher huggingface repos list --huggingface-user
+```
+
+### Authenticate to Hugging Face
+
+Private repositories require an access token provided through the `KF_HUGGINGFACE_TOKEN` environment variable. For git authentication the helper also honours `KF_HUGGINGFACE_USERNAME` (default `hf_user`).
+
+## Scanning Jira
### Scan Jira issues matching a JQL query
@@ -764,7 +874,7 @@ KF_JIRA_TOKEN="token" kingfisher scan \
--max-results 1000
```
-## Scanning Confluence
+## Scanning Confluence
### Scan Confluence pages matching a CQL query
```bash
@@ -789,7 +899,7 @@ Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environ
To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions.
-## Scanning Slack
+## Scanning Slack
### Scan Slack messages matching a search query
```bash
@@ -811,11 +921,13 @@ KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \
| `KF_GITLAB_TOKEN` | GitLab Personal Access Token |
| `KF_GITEA_TOKEN` | Gitea Personal Access Token |
| `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) |
-| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure DevOps Personal Access Token |
-| `KF_AZURE_USERNAME` | Username to use with Azure DevOps PATs (defaults to `pat` when unset) |
+| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure Repos Personal Access Token |
+| `KF_AZURE_USERNAME` | Username to use with Azure Repos PATs (defaults to `pat` when unset) |
| `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication |
| `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token |
| `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token |
+| `KF_HUGGINGFACE_TOKEN` | Hugging Face access token for API enumeration and git cloning |
+| `KF_HUGGINGFACE_USERNAME` | Optional username for Hugging Face git operations (defaults to `hf_user`) |
| `KF_JIRA_TOKEN` | Jira API token |
| `KF_CONFLUENCE_TOKEN` | Confluence API token |
| `KF_SLACK_TOKEN` | Slack API token |
@@ -869,7 +981,7 @@ Kingfisher automatically queries GitHub for a newer release when it starts and t
- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely
-# Advanced Options
+# 🤓 Advanced Options
## Build a Baseline / Detect New Secrets
@@ -963,6 +1075,8 @@ leaves the default unchanged.
- `--manage-baseline`: Create or update the baseline file with current findings
- `--skip-regex `: Ignore findings whose text matches this regex (repeatable)
- `--skip-word `: Ignore findings containing this case-insensitive word (repeatable)
+- `--skip-aws-account `: Skip live AWS validation for findings tied to the specified AWS account number (repeatable, accepts comma-separated lists)
+- `--skip-aws-account-file `: Load AWS account numbers to skip from a file (one account per line; `#` comments allowed)
- `--ignore-comment `: Honor additional inline directives from other scanners (repeatable; e.g. `--ignore-comment "gitleaks:allow"`)
- `--no-ignore`: Disable inline directives entirely so every match is reported
## Understanding `--confidence`
@@ -996,6 +1110,66 @@ kingfisher scan \
If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early.
+### Skip Canary Tokens (AWS)
+
+Canary/honey tokens are intentionally leaked credentials used to catch misuse. Kingfisher can **recognize and skip** known AWS canary accounts so hygiene scans don’t set off alerts.
+
+**How to skip**
+Pass the 12-digit AWS account IDs for your canaries via `--skip-aws-account` (comma-separated) or `--skip-aws-account-file` (one ID per line; blank lines and `#` comments allowed). Kingfisher also ships with a **pre-seeded (but not exhaustive)** list of Thinkst Canary account IDs used by canarytokens.org, so many are skipped automatically.
+
+```bash
+kingfisher scan /path/to/code \
+ --skip-aws-account "171436882533,534261010715"
+
+# or combine preloaded canary IDs with a just-created decoy account
+printf '999900001111 \n534261010715' > /tmp/canary_accounts.txt
+
+kingfisher scan /path/to/repo \
+ --skip-aws-account-file /tmp/canary_accounts.txt
+
+```
+
+**What you’ll see**
+Findings tied to a skip-listed account report `Validation: Not Attempted` and note in the `Response:` that the entry came from the skip list:
+
+```bash
+AWS SECRET ACCESS KEY => [KINGFISHER.AWS.2]
+ |Finding.......:
+ |Fingerprint...: 2141074333616819500
+ |Confidence....: medium
+ |Entropy.......: 5.00
+ |Validation....: Not Attempted
+ |__Response....: (skip list entry) AWS validation not attempted for account 171436882533.
+ |Language......: Unknown
+ |Line Num......: 21
+ |Path..........: /tmp/test_canary_accounts.log
+```
+
+**Why this matters**
+Skipping prevents noisy tripwires in prod telemetry while keeping the status explicit—“Not Attempted” isn’t a pass. If needed, verify these credentials out-of-band or with a safe, non-triggering method.
+
+
+#### Common CLI flows
+
+```bash
+# Skip a few in-house canaries during a filesystem scan
+kingfisher scan repo/ \
+ --skip-aws-account "111122223333,444455556666"
+
+# Read a longer list from disk
+kingfisher scan repo/ \
+ --skip-aws-account-file /tmp/scripts/canary_accounts.txt
+
+# Combine preloaded canary IDs with a just-created decoy account
+printf '999900001111\n534261010715\n' > /tmp/new_canary.txt
+
+kingfisher scan /path/to/repo \
+ --skip-aws-account-file /tmp/new_canary.txt
+
+```
+
+Tip: if you manage multiple canary fleets (Thinkst, self-hosted alternatives, or bespoke decoys), checkpoint the account IDs alongside your infrastructure-as-code so security teams can rotate or expand the skip list without editing pipelines.
+
### Inline ignore directives
Add `kingfisher:ignore` anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value:
diff --git a/docs/assets/icons/gcs.svg b/docs/assets/icons/gcs.svg
new file mode 100644
index 0000000..842c121
--- /dev/null
+++ b/docs/assets/icons/gcs.svg
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/assets/icons/huggingface.svg b/docs/assets/icons/huggingface.svg
new file mode 100644
index 0000000..43711df
--- /dev/null
+++ b/docs/assets/icons/huggingface.svg
@@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/cli/commands/huggingface.rs b/src/cli/commands/huggingface.rs
new file mode 100644
index 0000000..f0879ab
--- /dev/null
+++ b/src/cli/commands/huggingface.rs
@@ -0,0 +1,73 @@
+use clap::{Args, Subcommand};
+
+use crate::cli::commands::output::OutputArgs;
+
+use super::github::GitHubOutputFormat;
+
+/// Top-level Hugging Face command group
+#[derive(Args, Debug)]
+pub struct HuggingFaceArgs {
+ #[command(subcommand)]
+ pub command: HuggingFaceCommand,
+}
+
+#[derive(Subcommand, Debug)]
+pub enum HuggingFaceCommand {
+ /// Interact with Hugging Face repositories
+ #[command(subcommand)]
+ Repos(HuggingFaceReposCommand),
+}
+
+#[derive(Subcommand, Debug)]
+pub enum HuggingFaceReposCommand {
+ /// List Hugging Face repositories
+ List(HuggingFaceReposListArgs),
+}
+
+#[derive(Args, Debug, Clone)]
+pub struct HuggingFaceReposListArgs {
+ #[command(flatten)]
+ pub repo_specifiers: HuggingFaceRepoSpecifiers,
+
+ #[command(flatten)]
+ pub output_args: OutputArgs,
+}
+
+#[derive(Args, Debug, Clone, Default)]
+pub struct HuggingFaceRepoSpecifiers {
+ /// Models, datasets, and Spaces owned by these users
+ #[arg(long = "huggingface-user")]
+ pub user: Vec,
+
+ /// Models, datasets, and Spaces owned by these organizations
+ #[arg(long = "huggingface-organization", alias = "huggingface-org")]
+ pub organization: Vec,
+
+ /// Specific models to scan (format: owner/name or full URL)
+ #[arg(long = "huggingface-model")]
+ pub model: Vec,
+
+ /// Specific datasets to scan (format: owner/name or full URL)
+ #[arg(long = "huggingface-dataset")]
+ pub dataset: Vec,
+
+ /// Specific Spaces to scan (format: owner/name or full URL)
+ #[arg(long = "huggingface-space")]
+ pub space: Vec,
+
+ /// Skip specific repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
+ #[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
+ pub exclude: Vec,
+}
+
+impl HuggingFaceRepoSpecifiers {
+ pub fn is_empty(&self) -> bool {
+ self.user.is_empty()
+ && self.organization.is_empty()
+ && self.model.is_empty()
+ && self.dataset.is_empty()
+ && self.space.is_empty()
+ }
+}
+
+pub type HuggingFaceOutputFormat = GitHubOutputFormat;
diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs
index 4bab9d1..a41cf82 100644
--- a/src/cli/commands/inputs.rs
+++ b/src/cli/commands/inputs.rs
@@ -28,6 +28,11 @@ pub struct InputSpecifierArgs {
"gitlab_group",
"gitea_user",
"gitea_organization",
+ "huggingface_user",
+ "huggingface_organization",
+ "huggingface_model",
+ "huggingface_dataset",
+ "huggingface_space",
"bitbucket_user",
"bitbucket_workspace",
"bitbucket_project",
@@ -43,7 +48,8 @@ pub struct InputSpecifierArgs {
"confluence_url",
"docker_image",
"slack_query",
- "s3_bucket"
+ "s3_bucket",
+ "gcs_bucket"
]),
num_args = 0..,
value_hint = ValueHint::AnyPath
@@ -120,6 +126,30 @@ pub struct InputSpecifierArgs {
#[arg(long, alias = "include-subgroups")]
pub gitlab_include_subgroups: bool,
+ /// Scan models, datasets, and Spaces belonging to the specified Hugging Face users
+ #[arg(long = "huggingface-user")]
+ pub huggingface_user: Vec,
+
+ /// Scan models, datasets, and Spaces belonging to the specified Hugging Face organizations
+ #[arg(long = "huggingface-organization", alias = "huggingface-org")]
+ pub huggingface_organization: Vec,
+
+ /// Scan a specific Hugging Face model (format: owner/name or full URL)
+ #[arg(long = "huggingface-model")]
+ pub huggingface_model: Vec,
+
+ /// Scan a specific Hugging Face dataset (format: owner/name or full URL)
+ #[arg(long = "huggingface-dataset")]
+ pub huggingface_dataset: Vec,
+
+ /// Scan a specific Hugging Face Space (format: owner/name or full URL)
+ #[arg(long = "huggingface-space")]
+ pub huggingface_space: Vec,
+
+ /// Skip specific Hugging Face repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
+ #[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
+ pub huggingface_exclude: Vec,
+
// Gitea Options
/// Scan repositories belonging to the specified Gitea user
#[arg(long)]
@@ -256,6 +286,18 @@ pub struct InputSpecifierArgs {
#[arg(long, requires = "s3_bucket")]
pub aws_local_profile: Option,
+ /// Scan the specified Google Cloud Storage bucket
+ #[arg(long)]
+ pub gcs_bucket: Option,
+
+ /// Optional prefix within the GCS bucket
+ #[arg(long, requires = "gcs_bucket")]
+ pub gcs_prefix: Option,
+
+ /// Path to a service account JSON file for GCS authentication
+ #[arg(long, value_hint = ValueHint::FilePath, requires = "gcs_bucket")]
+ pub gcs_service_account: Option,
+
/// Docker/OCI images to scan (no local Docker required)
#[arg(long = "docker-image")]
pub docker_image: Vec,
@@ -299,7 +341,6 @@ pub struct ContentFilteringArgs {
#[arg(
long = "max-file-size",
visible_alias = "max-filesize", // also show in --help
- // alias = "max-filesize", // use this instead if you DON’T want it shown in --help
default_value_t = 256.0,
value_name = "MB"
)]
diff --git a/src/cli/commands/mod.rs b/src/cli/commands/mod.rs
index 0434af9..4b25b89 100644
--- a/src/cli/commands/mod.rs
+++ b/src/cli/commands/mod.rs
@@ -3,6 +3,7 @@ pub mod bitbucket;
pub mod gitea;
pub mod github;
pub mod gitlab;
+pub mod huggingface;
pub mod inputs;
pub mod output;
pub mod rules;
diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs
index 78312d8..f4d7921 100644
--- a/src/cli/commands/scan.rs
+++ b/src/cli/commands/scan.rs
@@ -1,4 +1,5 @@
use clap::{Args, ValueEnum};
+use std::path::PathBuf;
use strum::Display;
use tracing::debug;
@@ -119,6 +120,14 @@ pub struct ScanArgs {
#[arg(long = "skip-word", value_name = "WORD")]
pub skip_word: Vec,
+ /// AWS account IDs whose findings should skip live credential validation (repeatable)
+ #[arg(long = "skip-aws-account", value_name = "ACCOUNT_ID", value_delimiter = ',')]
+ pub skip_aws_account: Vec,
+
+ /// File containing AWS account IDs to skip (one per line, `#` comments ignored)
+ #[arg(long = "skip-aws-account-file", value_name = "FILE")]
+ pub skip_aws_account_file: Option,
+
/// Additional inline ignore directives to recognise (repeatable)
#[arg(long = "ignore-comment", value_name = "DIRECTIVE")]
pub extra_ignore_comments: Vec,
diff --git a/src/cli/global.rs b/src/cli/global.rs
index a03d3d4..3c2fe3f 100644
--- a/src/cli/global.rs
+++ b/src/cli/global.rs
@@ -8,7 +8,7 @@ use tracing::Level;
use crate::cli::commands::{
azure::AzureArgs, bitbucket::BitbucketArgs, gitea::GiteaArgs, github::GitHubArgs,
- gitlab::GitLabArgs, rules::RulesArgs, scan::ScanArgs,
+ gitlab::GitLabArgs, huggingface::HuggingFaceArgs, rules::RulesArgs, scan::ScanArgs,
};
#[deny(missing_docs)]
@@ -81,6 +81,10 @@ pub enum Command {
#[command(name = "azure")]
Azure(AzureArgs),
+ /// Interact with the Hugging Face Hub
+ #[command(name = "huggingface")]
+ HuggingFace(HuggingFaceArgs),
+
/// Manage rules
#[command(alias = "rule")]
Rules(RulesArgs),
diff --git a/src/gcs.rs b/src/gcs.rs
new file mode 100644
index 0000000..dc346fd
--- /dev/null
+++ b/src/gcs.rs
@@ -0,0 +1,96 @@
+use std::path::Path;
+
+use anyhow::{Context, Result};
+use gcloud_storage::{
+ client::{google_cloud_auth::credentials::CredentialsFile, Client, ClientConfig},
+ http::objects::{
+ download::Range,
+ get::GetObjectRequest,
+ list::{ListObjectsRequest, ListObjectsResponse},
+ },
+};
+use tracing::debug;
+
+/// Visit every object in the given GCS bucket, optionally filtered by prefix.
+///
+/// Authentication is attempted via Application Default Credentials. When that
+/// fails and no explicit service account path was provided, the client falls
+/// back to anonymous access so public buckets can still be scanned.
+pub async fn visit_bucket_objects(
+ bucket: &str,
+ prefix: Option<&str>,
+ service_account_path: Option<&Path>,
+ mut visitor: F,
+) -> Result<()>
+where
+ F: FnMut(String, Vec) -> Result<()>,
+{
+ let config_result = if let Some(path) = service_account_path {
+ let credentials = CredentialsFile::new_from_file(path.to_string_lossy().into_owned())
+ .await
+ .with_context(|| {
+ format!("Failed to read GCS service account credentials from {}", path.display())
+ })?;
+
+ ClientConfig::default().with_credentials(credentials).await
+ } else {
+ ClientConfig::default().with_auth().await
+ };
+
+ let config = match config_result {
+ Ok(config) => config,
+ Err(err) => {
+ if service_account_path.is_some()
+ || std::env::var("GOOGLE_APPLICATION_CREDENTIALS").is_ok()
+ || std::env::var("GOOGLE_APPLICATION_CREDENTIALS_JSON").is_ok()
+ {
+ return Err(err)
+ .context("Failed to authenticate with GCS using provided credentials");
+ }
+ debug!("Falling back to anonymous GCS access: {err}");
+ ClientConfig::default().anonymous()
+ }
+ };
+
+ let client = Client::new(config);
+ let mut page_token: Option = None;
+
+ loop {
+ let request = ListObjectsRequest {
+ bucket: bucket.to_string(),
+ prefix: prefix.map(|p| p.to_string()),
+ page_token: page_token.clone(),
+ ..ListObjectsRequest::default()
+ };
+
+ let mut response: ListObjectsResponse = client
+ .list_objects(&request)
+ .await
+ .with_context(|| format!("Failed to list objects in bucket {bucket}"))?;
+
+ if let Some(items) = response.items.take() {
+ for object in items.into_iter().filter(|o| !o.name.is_empty()) {
+ let data = client
+ .download_object(
+ &GetObjectRequest {
+ bucket: bucket.to_string(),
+ object: object.name.clone(),
+ ..GetObjectRequest::default()
+ },
+ &Range::default(),
+ )
+ .await
+ .with_context(|| format!("Failed to fetch object {}", object.name))?;
+
+ visitor(object.name, data)?;
+ }
+ }
+
+ match response.next_page_token {
+ Some(token) if !token.is_empty() => page_token = Some(token),
+ _ => break,
+ }
+ }
+
+ Ok(())
+}
diff --git a/src/git_binary.rs b/src/git_binary.rs
index 82fd990..a629373 100644
--- a/src/git_binary.rs
+++ b/src/git_binary.rs
@@ -40,6 +40,15 @@ const AZURE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_azcreds() {
fi
}; _azcreds"#;
+const HUGGINGFACE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_hfcreds() {
+ token="$KF_HUGGINGFACE_TOKEN";
+ if [ -n "$token" ]; then
+ user="${KF_HUGGINGFACE_USERNAME:-hf_user}";
+ echo username="$user";
+ echo password="$token";
+ fi
+}; _hfcreds"#;
+
/// Represents errors that can occur when interacting with the `git` CLI.
#[derive(Debug, thiserror::Error)]
pub enum GitError {
@@ -47,14 +56,37 @@ pub enum GitError {
IOError(#[from] std::io::Error),
#[error(
- "git execution failed\ncode={}\nstdout=```\n{}```\nstderr=```\n{}```",
- .status,
- String::from_utf8_lossy(.stdout),
- String::from_utf8_lossy(.stderr)
+ "git execution failed (status: {status}){summary}",
+ status = format_exit_status(.status),
+ summary = format_git_error_summary(.stdout.as_slice(), .stderr.as_slice())
)]
GitError { stdout: Vec, stderr: Vec, status: ExitStatus },
}
+fn format_exit_status(status: &ExitStatus) -> String {
+ status.code().map(|code| code.to_string()).unwrap_or_else(|| status.to_string())
+}
+
+fn format_git_error_summary(stdout: &[u8], stderr: &[u8]) -> String {
+ let mut messages = Vec::new();
+ if let Some(line) = summarize_output(stderr) {
+ messages.push(line);
+ }
+ if let Some(line) = summarize_output(stdout) {
+ messages.push(line);
+ }
+ if messages.is_empty() {
+ String::new()
+ } else {
+ format!(": {}", messages.join(" | "))
+ }
+}
+
+fn summarize_output(output: &[u8]) -> Option {
+ let text = String::from_utf8_lossy(output);
+ text.lines().map(str::trim).find(|line| !line.is_empty()).map(|line| line.to_owned())
+}
+
/// A helper struct for running `git` commands.
///
/// It supports optional GitHub, GitLab, Gitea, and Bitbucket credentials passed via
@@ -91,6 +123,8 @@ impl Git {
let has_azure_token = ["KF_AZURE_TOKEN", "KF_AZURE_PAT"]
.iter()
.any(|key| matches!(std::env::var(key), Ok(value) if !value.is_empty()));
+ let has_huggingface_token =
+ matches!(std::env::var("KF_HUGGINGFACE_TOKEN"), Ok(value) if !value.is_empty());
// If credentials are provided via environment variables, clear existing helpers first.
if has_github_token
@@ -98,6 +132,7 @@ impl Git {
|| has_gitea_token
|| has_bitbucket_credentials
|| has_azure_token
+ || has_huggingface_token
{
credentials.push("-c".into());
credentials.push(r#"credential.helper="#.into());
@@ -136,6 +171,11 @@ impl Git {
credentials.push(AZURE_CREDENTIAL_HELPER.into());
}
+ if has_huggingface_token {
+ credentials.push("-c".into());
+ credentials.push(HUGGINGFACE_CREDENTIAL_HELPER.into());
+ }
+
Self { credentials, ignore_certs }
}
diff --git a/src/huggingface.rs b/src/huggingface.rs
new file mode 100644
index 0000000..42bbf03
--- /dev/null
+++ b/src/huggingface.rs
@@ -0,0 +1,636 @@
+use std::{collections::HashSet, env, time::Duration};
+
+use anyhow::{anyhow, Result};
+use indicatif::{ProgressBar, ProgressStyle};
+use reqwest::{header::LINK, StatusCode, Url};
+use serde::Deserialize;
+use serde_json::Value;
+use tracing::{debug, warn};
+
+use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT};
+
+#[derive(Debug, Clone, Default)]
+pub struct RepoSpecifiers {
+ pub user: Vec,
+ pub organization: Vec,
+ pub model: Vec,
+ pub dataset: Vec,
+ pub space: Vec,
+ pub exclude: Vec,
+}
+
+impl RepoSpecifiers {
+ pub fn is_empty(&self) -> bool {
+ self.user.is_empty()
+ && self.organization.is_empty()
+ && self.model.is_empty()
+ && self.dataset.is_empty()
+ && self.space.is_empty()
+ }
+}
+
+#[derive(Clone, Default)]
+pub struct AuthConfig {
+ token: Option,
+}
+
+impl std::fmt::Debug for AuthConfig {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ f.debug_struct("AuthConfig")
+ .field(
+ "token",
+ &self
+ .token
+ .as_ref()
+ .map(|token| format!("{}…", token.chars().take(4).collect::())),
+ )
+ .finish()
+ }
+}
+
+impl AuthConfig {
+ pub fn from_env() -> Self {
+ let token = env::var("KF_HUGGINGFACE_TOKEN").ok().filter(|t| !t.trim().is_empty());
+ Self { token }
+ }
+
+ fn apply(&self, request: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
+ if let Some(token) = &self.token {
+ request.bearer_auth(token)
+ } else {
+ request
+ }
+ }
+
+ fn has_token(&self) -> bool {
+ self.token.is_some()
+ }
+}
+
+#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
+enum ResourceKind {
+ Model,
+ Dataset,
+ Space,
+}
+
+impl ResourceKind {
+ fn api_path(self) -> &'static str {
+ match self {
+ ResourceKind::Model => "models",
+ ResourceKind::Dataset => "datasets",
+ ResourceKind::Space => "spaces",
+ }
+ }
+
+ fn git_url(self, slug: &str) -> String {
+ match self {
+ ResourceKind::Model => format!("https://huggingface.co/{slug}.git"),
+ ResourceKind::Dataset => format!("https://huggingface.co/datasets/{slug}.git"),
+ ResourceKind::Space => format!("https://huggingface.co/spaces/{slug}.git"),
+ }
+ }
+
+ fn canonical_prefix(self) -> &'static str {
+ match self {
+ ResourceKind::Model => "model",
+ ResourceKind::Dataset => "dataset",
+ ResourceKind::Space => "space",
+ }
+ }
+
+ fn display_name_singular(self) -> &'static str {
+ match self {
+ ResourceKind::Model => "model",
+ ResourceKind::Dataset => "dataset",
+ ResourceKind::Space => "space",
+ }
+ }
+
+ fn display_name_plural(self) -> &'static str {
+ match self {
+ ResourceKind::Model => "models",
+ ResourceKind::Dataset => "datasets",
+ ResourceKind::Space => "spaces",
+ }
+ }
+}
+
+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
+struct ResourceRef {
+ kind: ResourceKind,
+ slug: String,
+}
+
+impl ResourceRef {
+ fn new(kind: ResourceKind, slug: String) -> Self {
+ Self { kind, slug }
+ }
+
+ fn canonical_key(&self) -> String {
+ format!("{}:{}", self.kind.canonical_prefix(), self.slug.to_lowercase())
+ }
+
+ fn git_url(&self) -> String {
+ self.kind.git_url(&self.slug)
+ }
+}
+
+#[derive(Debug, Deserialize)]
+#[serde(untagged)]
+enum HuggingFaceItem {
+ Id {
+ id: String,
+ },
+ ModelId {
+ #[serde(rename = "modelId")]
+ model_id: String,
+ },
+}
+
+impl HuggingFaceItem {
+ fn into_identifier(self) -> String {
+ match self {
+ HuggingFaceItem::Id { id } => id,
+ HuggingFaceItem::ModelId { model_id } => model_id,
+ }
+ }
+}
+
+#[derive(Default)]
+struct ExcludeSet {
+ typed: HashSet,
+ untyped: HashSet,
+}
+
+impl ExcludeSet {
+ fn from_list(values: &[String]) -> Self {
+ let mut typed = HashSet::new();
+ let mut untyped = HashSet::new();
+ for raw in values {
+ let trimmed = raw.trim();
+ if trimmed.is_empty() {
+ continue;
+ }
+ if let Some((prefix, rest)) = trimmed.split_once(':') {
+ match normalize_kind(prefix) {
+ Some(kind) => {
+ if let Some(slug) = parse_slug_for_kind(kind, rest) {
+ typed.insert(format!(
+ "{}:{}",
+ kind.canonical_prefix(),
+ slug.to_lowercase()
+ ));
+ } else {
+ warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
+ }
+ }
+ None => warn!("Ignoring invalid Hugging Face exclusion '{raw}' (unknown type)"),
+ }
+ } else if let Some(slug) = normalize_untyped_slug(trimmed) {
+ untyped.insert(slug);
+ } else {
+ warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
+ }
+ }
+ Self { typed, untyped }
+ }
+
+ fn should_exclude(&self, kind: ResourceKind, slug: &str) -> bool {
+ let typed_key = format!("{}:{}", kind.canonical_prefix(), slug.to_lowercase());
+ if self.typed.contains(&typed_key) {
+ return true;
+ }
+ self.untyped.contains(&slug.to_lowercase())
+ }
+}
+
+fn normalize_kind(raw: &str) -> Option {
+ match raw.trim().to_ascii_lowercase().as_str() {
+ "model" | "models" => Some(ResourceKind::Model),
+ "dataset" | "datasets" => Some(ResourceKind::Dataset),
+ "space" | "spaces" => Some(ResourceKind::Space),
+ _ => None,
+ }
+}
+
+fn normalize_untyped_slug(raw: &str) -> Option {
+ let trimmed = raw.trim();
+ if trimmed.is_empty() {
+ return None;
+ }
+ let segments: Vec<&str> = trimmed.split('/').filter(|segment| !segment.is_empty()).collect();
+ normalize_untyped_segments(&segments)
+}
+
+fn normalize_untyped_segments(segments: &[&str]) -> Option {
+ if segments.is_empty() {
+ return None;
+ }
+ let mut parts: Vec<&str> = segments.to_vec();
+ if let Some(first) = parts.first() {
+ let lowered = first.trim().to_ascii_lowercase();
+ if matches!(
+ lowered.as_str(),
+ "models" | "model" | "datasets" | "dataset" | "spaces" | "space"
+ ) {
+ parts.remove(0);
+ }
+ }
+ if parts.len() < 2 {
+ return None;
+ }
+ let owner = parts[0].trim();
+ let binding = parts[1..].join("/");
+ let name = binding.trim_end_matches(".git").trim();
+
+ if owner.is_empty() || name.is_empty() {
+ return None;
+ }
+ Some(format!("{}/{}", owner, name).to_lowercase())
+}
+
+fn parse_slug_for_kind(kind: ResourceKind, raw: &str) -> Option {
+ let trimmed = raw.trim();
+ if trimmed.is_empty() {
+ return None;
+ }
+ if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
+ let url = Url::parse(trimmed).ok()?;
+ let segments: Vec<&str> = url
+ .path_segments()
+ .map(|segments| segments.filter(|s| !s.is_empty()).collect())
+ .unwrap_or_default();
+ return parse_slug_segments(kind, &segments);
+ }
+ let segments: Vec<&str> = trimmed.split('/').filter(|s| !s.is_empty()).collect();
+ parse_slug_segments(kind, &segments)
+}
+
+fn parse_slug_segments(kind: ResourceKind, segments: &[&str]) -> Option {
+ if segments.is_empty() {
+ return None;
+ }
+ let mut parts: Vec<&str> = segments.to_vec();
+ if let Some(first) = parts.first() {
+ let lowered = first.trim().to_ascii_lowercase();
+ let should_trim = match kind {
+ ResourceKind::Model => matches!(lowered.as_str(), "models" | "model"),
+ ResourceKind::Dataset => matches!(lowered.as_str(), "datasets" | "dataset"),
+ ResourceKind::Space => matches!(lowered.as_str(), "spaces" | "space"),
+ };
+ if should_trim {
+ parts.remove(0);
+ }
+ }
+ if parts.len() < 2 {
+ return None;
+ }
+ let owner = parts[0].trim();
+ let binding = parts[1..].join("/");
+ let name = binding.trim_end_matches(".git").trim();
+
+ if owner.is_empty() || name.is_empty() {
+ return None;
+ }
+ Some(format!("{owner}/{name}"))
+}
+
+fn parse_next_link(value: &str) -> Option {
+ value.split(',').find_map(|part| {
+ let part = part.trim();
+ let (url_part, params) = part.split_once('>')?;
+ if params.contains("rel=\"next\"") {
+ let url = url_part.trim_start_matches('<').trim();
+ Url::parse(url).ok()
+ } else {
+ None
+ }
+ })
+}
+
+const BODY_SNIPPET_LIMIT: usize = 200;
+
+async fn fetch_paginated(
+ client: &reqwest::Client,
+ mut current_url: Url,
+ auth: &AuthConfig,
+ context: &str,
+) -> Result> {
+ let mut items = Vec::new();
+ loop {
+ let mut request =
+ client.get(current_url.clone()).header("User-Agent", GLOBAL_USER_AGENT.as_str());
+ request = auth.apply(request);
+ let response = request.send().await?;
+ let status = response.status();
+ let link_header = response
+ .headers()
+ .get(LINK)
+ .and_then(|value| value.to_str().ok())
+ .map(|value| value.to_string());
+ if !status.is_success() {
+ let body = response.text().await.unwrap_or_default();
+ let mut message = format!(
+ "Hugging Face API request failed while enumerating {context} ({status}): {body}"
+ );
+ if matches!(status, StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN)
+ && !auth.has_token()
+ {
+ message.push_str(
+ "\nProvide a Hugging Face access token via the KF_HUGGINGFACE_TOKEN environment variable.",
+ );
+ }
+ return Err(anyhow!(message));
+ }
+ let body = response.bytes().await?;
+ let value: Value = serde_json::from_slice(&body).map_err(|err| {
+ let snippet = body_snippet(&body);
+ anyhow!(
+ "Failed to parse Hugging Face response while enumerating {context}: {err}. Body snippet: {snippet}",
+ context = context,
+ err = err,
+ snippet = snippet
+ )
+ })?;
+
+ let array = value.as_array().ok_or_else(|| {
+ let snippet = body_snippet(&body);
+ anyhow!(
+ "Unexpected Hugging Face response format while enumerating {context} (expected array). Body snippet: {snippet}",
+ context = context,
+ snippet = snippet
+ )
+ })?;
+
+ let mut page = Vec::new();
+ for (index, element) in array.iter().enumerate() {
+ match serde_json::from_value::(element.clone()) {
+ Ok(item) => page.push(item),
+ Err(err) => {
+ let snippet = value_snippet(element);
+ warn!(
+ "Skipping Hugging Face item at index {index} while enumerating {context}: {err}. Item snippet: {snippet}"
+ );
+ }
+ }
+ }
+ items.append(&mut page);
+ if let Some(link_value) = link_header {
+ if let Some(next_url) = parse_next_link(&link_value) {
+ current_url = next_url;
+ continue;
+ }
+ }
+ break;
+ }
+ Ok(items)
+}
+
+fn body_snippet(body: &[u8]) -> String {
+ truncate_for_display(&String::from_utf8_lossy(body), BODY_SNIPPET_LIMIT)
+}
+
+fn value_snippet(value: &Value) -> String {
+ let text = value.to_string();
+ truncate_for_display(&text, BODY_SNIPPET_LIMIT)
+}
+
+fn truncate_for_display(text: &str, limit: usize) -> String {
+ let mut snippet: String = text.chars().take(limit).collect();
+ if text.chars().count() > limit {
+ snippet.push('…');
+ }
+ snippet
+}
+
+async fn fetch_resources_for_owner(
+ client: &reqwest::Client,
+ base_url: &Url,
+ owner: &str,
+ label: &str,
+ auth: &AuthConfig,
+ progress: Option<&ProgressBar>,
+) -> Result> {
+ let mut resources = Vec::new();
+ for kind in [ResourceKind::Model, ResourceKind::Dataset, ResourceKind::Space] {
+ if let Some(pb) = progress {
+ pb.set_message(format!(
+ "Enumerating Hugging Face {label} {}",
+ kind.display_name_plural()
+ ));
+ }
+ let mut url = base_url.join(kind.api_path())?;
+ {
+ let mut pairs = url.query_pairs_mut();
+ pairs.append_pair("author", owner);
+ pairs.append_pair("limit", "100");
+ }
+ let context = format!("{} for {label}", kind.display_name_plural());
+ match fetch_paginated(client, url, auth, &context).await {
+ Ok(items) => {
+ for item in items {
+ let identifier = item.into_identifier();
+ if let Some(slug) = parse_slug_for_kind(kind, &identifier) {
+ resources.push(ResourceRef::new(kind, slug));
+ } else {
+ warn!(
+ "Skipping Hugging Face {} with unexpected identifier '{}'",
+ kind.display_name_singular(),
+ identifier
+ );
+ }
+ }
+ }
+ Err(err) => {
+ warn!(
+ "Failed to enumerate Hugging Face {} for {label}: {err}",
+ kind.display_name_plural()
+ );
+ }
+ }
+ }
+ Ok(resources)
+}
+
+fn append_explicit_resources(specifiers: &RepoSpecifiers, resources: &mut Vec) {
+ for model in &specifiers.model {
+ if let Some(slug) = parse_slug_for_kind(ResourceKind::Model, model) {
+ resources.push(ResourceRef::new(ResourceKind::Model, slug));
+ } else {
+ warn!("Ignoring invalid Hugging Face model identifier '{model}'");
+ }
+ }
+ for dataset in &specifiers.dataset {
+ if let Some(slug) = parse_slug_for_kind(ResourceKind::Dataset, dataset) {
+ resources.push(ResourceRef::new(ResourceKind::Dataset, slug));
+ } else {
+ warn!("Ignoring invalid Hugging Face dataset identifier '{dataset}'");
+ }
+ }
+ for space in &specifiers.space {
+ if let Some(slug) = parse_slug_for_kind(ResourceKind::Space, space) {
+ resources.push(ResourceRef::new(ResourceKind::Space, slug));
+ } else {
+ warn!("Ignoring invalid Hugging Face space identifier '{space}'");
+ }
+ }
+}
+
+pub async fn enumerate_repo_urls(
+ specifiers: &RepoSpecifiers,
+ auth: &AuthConfig,
+ ignore_certs: bool,
+ progress: Option<&mut ProgressBar>,
+) -> Result> {
+ let client = reqwest::Client::builder()
+ .timeout(Duration::from_secs(30))
+ .danger_accept_invalid_certs(ignore_certs)
+ .build()?;
+ let base_url = Url::parse("https://huggingface.co/api/")?;
+ let excludes = ExcludeSet::from_list(&specifiers.exclude);
+ let mut collected = Vec::new();
+
+ for user in &specifiers.user {
+ let label = format!("user {user}");
+ if let Some(pb) = progress.as_ref() {
+ pb.set_message(format!("Enumerating Hugging Face {label}"));
+ }
+ match fetch_resources_for_owner(
+ &client,
+ &base_url,
+ user,
+ &label,
+ auth,
+ progress.as_ref().map(|pb| &**pb),
+ )
+ .await
+ {
+ Ok(mut resources) => collected.append(&mut resources),
+ Err(err) => warn!("Failed to enumerate Hugging Face user {user}: {err}"),
+ }
+ }
+
+ for org in &specifiers.organization {
+ let label = format!("organization {org}");
+ if let Some(pb) = progress.as_ref() {
+ pb.set_message(format!("Enumerating Hugging Face {label}"));
+ }
+ match fetch_resources_for_owner(
+ &client,
+ &base_url,
+ org,
+ &label,
+ auth,
+ progress.as_ref().map(|pb| &**pb),
+ )
+ .await
+ {
+ Ok(mut resources) => collected.append(&mut resources),
+ Err(err) => warn!("Failed to enumerate Hugging Face organization {org}: {err}"),
+ }
+ }
+
+ append_explicit_resources(specifiers, &mut collected);
+
+ let mut seen = HashSet::new();
+ let mut urls = Vec::new();
+ for resource in collected {
+ if excludes.should_exclude(resource.kind, &resource.slug) {
+ debug!(
+ "Skipping Hugging Face {} {} due to exclusion",
+ resource.kind.display_name_singular(),
+ resource.slug
+ );
+ continue;
+ }
+ let key = resource.canonical_key();
+ if seen.insert(key) {
+ urls.push(resource.git_url());
+ }
+ }
+ urls.sort();
+ urls.dedup();
+ Ok(urls)
+}
+
+pub async fn list_repositories(
+ specifiers: &RepoSpecifiers,
+ auth: &AuthConfig,
+ ignore_certs: bool,
+ progress_enabled: bool,
+) -> Result<()> {
+ let mut progress = if progress_enabled {
+ let style = ProgressStyle::with_template("{spinner} {msg} [{elapsed_precise}]")
+ .expect("progress bar style template should compile");
+ let pb = ProgressBar::new_spinner()
+ .with_style(style)
+ .with_message("Enumerating Hugging Face repositories");
+ pb.enable_steady_tick(Duration::from_millis(500));
+ pb
+ } else {
+ ProgressBar::hidden()
+ };
+
+ let urls = enumerate_repo_urls(specifiers, auth, ignore_certs, Some(&mut progress)).await?;
+ for url in urls {
+ println!("{url}");
+ }
+ progress.finish_and_clear();
+ Ok(())
+}
+
+pub fn wiki_url(_repo_url: &GitUrl) -> Option {
+ None
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn parse_model_slug_from_plain() {
+ assert_eq!(
+ parse_slug_for_kind(ResourceKind::Model, "user/model"),
+ Some("user/model".to_string())
+ );
+ }
+
+ #[test]
+ fn parse_dataset_slug_with_prefix() {
+ assert_eq!(
+ parse_slug_for_kind(ResourceKind::Dataset, "datasets/user/data.git"),
+ Some("user/data".to_string())
+ );
+ }
+
+ #[test]
+ fn parse_space_slug_from_url() {
+ assert_eq!(
+ parse_slug_for_kind(ResourceKind::Space, "https://huggingface.co/spaces/user/demo"),
+ Some("user/demo".to_string())
+ );
+ }
+
+ #[test]
+ fn exclude_set_matches_typed_and_untyped() {
+ let excludes =
+ ExcludeSet::from_list(&vec!["model:user/model".into(), "datasets/user/data".into()]);
+ assert!(excludes.should_exclude(ResourceKind::Model, "user/model"));
+ assert!(excludes.should_exclude(ResourceKind::Dataset, "user/data"));
+ assert!(!excludes.should_exclude(ResourceKind::Space, "user/space"));
+ }
+
+ #[test]
+ fn parse_link_header() {
+ let header = "; rel=\"next\"";
+ let url = parse_next_link(header).expect("next link");
+ assert_eq!(url.as_str(), "https://huggingface.co/api/models?cursor=abc");
+ }
+
+ #[test]
+ fn truncate_for_display_adds_ellipsis() {
+ assert_eq!(truncate_for_display("abcdef", 3), "abc…");
+ assert_eq!(truncate_for_display("abc", 5), "abc");
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 9fc8106..1736bd1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -13,6 +13,7 @@ pub mod defaults;
pub mod entropy;
pub mod finding_data;
pub mod findings_store;
+pub mod gcs;
pub mod git_binary;
pub mod git_commit_metadata;
pub mod git_metadata_graph;
@@ -21,6 +22,7 @@ pub mod git_url;
pub mod gitea;
pub mod github;
pub mod gitlab;
+pub mod huggingface;
pub mod inline_ignore;
pub mod jira;
pub mod liquid_filters;
diff --git a/src/main.rs b/src/main.rs
index c0f6f9a..d6c36c7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -52,7 +52,7 @@ use kingfisher::{
},
findings_store,
findings_store::FindingsStore,
- gitea, github,
+ gitea, github, huggingface,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::{load_and_record_rules, run_scan},
@@ -75,6 +75,7 @@ use crate::cli::commands::{
bitbucket::{BitbucketAuthArgs, BitbucketCommand, BitbucketRepoType, BitbucketReposCommand},
gitea::{GiteaCommand, GiteaRepoType, GiteaReposCommand},
gitlab::{GitLabCommand, GitLabRepoType, GitLabReposCommand},
+ huggingface::{HuggingFaceCommand, HuggingFaceReposCommand},
};
fn main() -> anyhow::Result<()> {
@@ -93,6 +94,7 @@ fn main() -> anyhow::Result<()> {
Command::Bitbucket(_) => num_cpus::get(), // Default for Bitbucket commands
Command::Gitea(_) => num_cpus::get(), // Default for Gitea commands
Command::Azure(_) => num_cpus::get(), // Default for Azure commands
+ Command::HuggingFace(_) => num_cpus::get(), // Default for Hugging Face commands
Command::Rules(_) => num_cpus::get(), // Default for Rules commands
};
@@ -327,6 +329,28 @@ async fn async_main(args: CommandLineArgs) -> Result<()> {
}
},
},
+ Command::HuggingFace(hf_args) => match hf_args.command {
+ HuggingFaceCommand::Repos(repos_command) => match repos_command {
+ HuggingFaceReposCommand::List(list_args) => {
+ let specifiers = huggingface::RepoSpecifiers {
+ user: list_args.repo_specifiers.user.clone(),
+ organization: list_args.repo_specifiers.organization.clone(),
+ model: list_args.repo_specifiers.model.clone(),
+ dataset: list_args.repo_specifiers.dataset.clone(),
+ space: list_args.repo_specifiers.space.clone(),
+ exclude: list_args.repo_specifiers.exclude.clone(),
+ };
+ let auth = huggingface::AuthConfig::from_env();
+ huggingface::list_repositories(
+ &specifiers,
+ &auth,
+ global_args.ignore_certs,
+ global_args.use_progress(),
+ )
+ .await?;
+ }
+ },
+ },
Command::SelfUpdate => {
anyhow::bail!("SelfUpdate command should not reach this branch")
}
@@ -367,6 +391,13 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -400,6 +431,9 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Slack query
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
@@ -436,6 +470,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_base64: false,
no_inline_ignore: false,
diff --git a/src/reporter.rs b/src/reporter.rs
index 52a491c..127a9ad 100644
--- a/src/reporter.rs
+++ b/src/reporter.rs
@@ -703,7 +703,7 @@ mod tests {
git_commit_metadata::CommitMetadata,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{SerializableCapture, SerializableCaptures},
- origin::OriginSet,
+ origin::{Origin, OriginSet},
rules::rule::{Confidence, Rule, RuleSyntax},
};
use gix::{date::Time, ObjectId};
@@ -711,79 +711,8 @@ mod tests {
use std::path::PathBuf;
use tempfile::tempdir;
- #[test]
- fn build_finding_record_uses_git_blob_path() {
- let temp = tempdir().unwrap();
- let datastore =
- Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
- let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
-
- let repo_path = Arc::new(PathBuf::from("/tmp/repo"));
- let commit_metadata = Arc::new(CommitMetadata {
- commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
- committer_name: "Alice".into(),
- committer_email: "alice@example.com".into(),
- committer_timestamp: Time::new(0, 0),
- });
- let blob_path = "path/in/history.txt".to_string();
- let origin = OriginSet::new(
- Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()),
- vec![],
- );
-
- let rule = Arc::new(Rule::new(RuleSyntax {
- name: "Test Rule".into(),
- id: "test.rule".into(),
- pattern: ".*".into(),
- min_entropy: 0.0,
- confidence: Confidence::Medium,
- visible: true,
- examples: vec![],
- negative_examples: vec![],
- references: vec![],
- validation: None,
- depends_on_rule: vec![],
- }));
-
- let blob_id = BlobId::new(b"blob-data");
- let report_match = ReportMatch {
- origin,
- blob_metadata: BlobMetadata {
- id: blob_id,
- num_bytes: 42,
- mime_essence: None,
- language: Some("Unknown".into()),
- },
- m: Match {
- location: Location {
- offset_span: OffsetSpan { start: 0, end: 10 },
- source_span: SourceSpan {
- start: SourcePoint { line: 19, column: 0 },
- end: SourcePoint { line: 19, column: 10 },
- },
- },
- groups: SerializableCaptures {
- captures: SmallVec::<[SerializableCapture; 2]>::new(),
- },
- blob_id,
- finding_fingerprint: 123,
- rule: Arc::clone(&rule),
- validation_response_body: "Bad credentials".into(),
- validation_response_status: 401,
- validation_success: false,
- calculated_entropy: 5.29,
- visible: true,
- is_base64: false,
- },
- comment: None,
- match_confidence: Confidence::Medium,
- visible: true,
- validation_response_body: "Bad credentials".into(),
- validation_response_status: 401,
- validation_success: false,
- };
-
- let scan_args = ScanArgs {
+ fn sample_scan_args() -> ScanArgs {
+ ScanArgs {
num_jobs: 1,
rules: RuleSpecifierArgs::default(),
input_specifier_args: InputSpecifierArgs {
@@ -802,6 +731,12 @@ mod tests {
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -833,6 +768,9 @@ mod tests {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@@ -864,9 +802,98 @@ mod tests {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_inline_ignore: false,
+ }
+ }
+
+ fn sample_report_match(
+ validation_body: &str,
+ validation_status: u16,
+ validation_success: bool,
+ ) -> (ReportMatch, String) {
+ let repo_path = Arc::new(PathBuf::from("/tmp/repo"));
+ let commit_metadata = Arc::new(CommitMetadata {
+ commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
+ committer_name: "Alice".into(),
+ committer_email: "alice@example.com".into(),
+ committer_timestamp: Time::new(0, 0),
+ });
+ let blob_path = "path/in/history.txt".to_string();
+ let origin = OriginSet::new(
+ Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()),
+ vec![],
+ );
+
+ let rule = Arc::new(Rule::new(RuleSyntax {
+ name: "Test Rule".into(),
+ id: "test.rule".into(),
+ pattern: ".*".into(),
+ min_entropy: 0.0,
+ confidence: Confidence::Medium,
+ visible: true,
+ examples: vec![],
+ negative_examples: vec![],
+ references: vec![],
+ validation: None,
+ depends_on_rule: vec![],
+ }));
+
+ let blob_id = BlobId::new(b"blob-data");
+ let validation_body_owned = validation_body.to_string();
+ let report_match = ReportMatch {
+ origin,
+ blob_metadata: BlobMetadata {
+ id: blob_id,
+ num_bytes: 42,
+ mime_essence: None,
+ language: Some("Unknown".into()),
+ },
+ m: Match {
+ location: Location {
+ offset_span: OffsetSpan { start: 0, end: 10 },
+ source_span: SourceSpan {
+ start: SourcePoint { line: 19, column: 0 },
+ end: SourcePoint { line: 19, column: 10 },
+ },
+ },
+ groups: SerializableCaptures {
+ captures: SmallVec::<[SerializableCapture; 2]>::new(),
+ },
+ blob_id,
+ finding_fingerprint: 123,
+ rule: Arc::clone(&rule),
+ validation_response_body: validation_body_owned.clone(),
+ validation_response_status: validation_status,
+ validation_success,
+ calculated_entropy: 5.29,
+ visible: true,
+ is_base64: false,
+ },
+ comment: None,
+ match_confidence: Confidence::Medium,
+ visible: true,
+ validation_response_body: validation_body_owned,
+ validation_response_status: validation_status,
+ validation_success,
};
+ (report_match, blob_path)
+ }
+
+ #[test]
+ fn build_finding_record_uses_git_blob_path() {
+ let temp = tempdir().unwrap();
+ let datastore =
+ Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
+ let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
+
+ let (report_match, blob_path) =
+ sample_report_match("Bad credentials", StatusCode::UNAUTHORIZED.as_u16(), false);
+
+ let scan_args = sample_scan_args();
+
let record = reporter.build_finding_record(&report_match, &scan_args);
assert_eq!(record.finding.path, blob_path);
let git_file_path = record
@@ -880,6 +907,28 @@ mod tests {
assert_eq!(git_file_path, "path/in/history.txt");
}
+ #[test]
+ fn skip_list_matches_surface_skip_reason() {
+ let temp = tempdir().unwrap();
+ let datastore =
+ Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
+ let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
+
+ let (report_match, _) = sample_report_match(
+ "(skip list entry) AWS validation not attempted for account 111122223333.",
+ StatusCode::CONTINUE.as_u16(),
+ false,
+ );
+ let scan_args = sample_scan_args();
+
+ let record = reporter.build_finding_record(&report_match, &scan_args);
+ assert_eq!(record.finding.validation.status, "Not Attempted");
+ assert_eq!(
+ record.finding.validation.response,
+ "(skip list entry) AWS validation not attempted for account 111122223333."
+ );
+ }
+
use super::build_git_urls;
#[test]
diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs
index 02dc20c..6435c71 100644
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@@ -93,6 +93,14 @@ mod tests {
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
+ // Hugging Face
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
// Gitea
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
@@ -132,6 +140,9 @@ mod tests {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
docker_image: Vec::new(),
// clone / history options
@@ -163,6 +174,8 @@ mod tests {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
}
diff --git a/src/reporter/pretty_format.rs b/src/reporter/pretty_format.rs
index 086648c..ea31fc5 100644
--- a/src/reporter/pretty_format.rs
+++ b/src/reporter/pretty_format.rs
@@ -115,7 +115,7 @@ impl<'a> Display for PrettyFindingRecord<'a> {
} else {
writeln!(f, " |Validation....: {}", finding.validation.status)?;
}
- if finding.validation.status != "Not Attempted" {
+ if !finding.validation.response.is_empty() {
writeln!(f, " |__Response....: {}", style_fn(&finding.validation.response))?;
}
writeln!(f, " |Language......: {}", finding.language)?;
diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs
index a6e0b6a..6d98b34 100644
--- a/src/scanner/mod.rs
+++ b/src/scanner/mod.rs
@@ -3,7 +3,7 @@ pub(crate) use docker::save_docker_images;
pub(crate) use enumerate::enumerate_filesystem_inputs;
pub(crate) use repos::{
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
- enumerate_github_repos,
+ enumerate_github_repos, enumerate_huggingface_repos,
};
pub use runner::{load_and_record_rules, run_async_scan, run_scan};
pub(crate) use validation::run_secret_validation;
diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs
index fb6577a..da00dcc 100644
--- a/src/scanner/repos.rs
+++ b/src/scanner/repos.rs
@@ -17,10 +17,10 @@ use crate::{
commands::{github::GitCloneMode, github::GitHistoryMode, scan},
global,
},
- confluence, findings_store,
+ confluence, findings_store, gcs,
git_binary::{CloneMode, Git},
git_url::GitUrl,
- gitea, github, gitlab, jira,
+ gitea, github, gitlab, huggingface, jira,
matcher::{Match, Matcher, MatcherStats},
origin::{Origin, OriginSet},
rules_database::RulesDatabase,
@@ -305,6 +305,69 @@ pub async fn enumerate_gitea_repos(
Ok(repo_urls)
}
+pub async fn enumerate_huggingface_repos(
+ args: &scan::ScanArgs,
+ global_args: &global::GlobalArgs,
+) -> Result> {
+ let repo_specifiers = huggingface::RepoSpecifiers {
+ user: args.input_specifier_args.huggingface_user.clone(),
+ organization: args.input_specifier_args.huggingface_organization.clone(),
+ model: args.input_specifier_args.huggingface_model.clone(),
+ dataset: args.input_specifier_args.huggingface_dataset.clone(),
+ space: args.input_specifier_args.huggingface_space.clone(),
+ exclude: args.input_specifier_args.huggingface_exclude.clone(),
+ };
+
+ let mut repo_urls = args.input_specifier_args.git_url.clone();
+ if !repo_specifiers.is_empty() {
+ let mut progress = if global_args.use_progress() {
+ let style =
+ ProgressStyle::with_template("{spinner} {msg} {human_len} [{elapsed_precise}]")
+ .expect("progress bar style template should compile");
+ let pb = ProgressBar::new_spinner()
+ .with_style(style)
+ .with_message("Enumerating Hugging Face repositories...");
+ pb.enable_steady_tick(Duration::from_millis(500));
+ pb
+ } else {
+ ProgressBar::hidden()
+ };
+
+ let mut num_found: u64 = 0;
+ let auth = huggingface::AuthConfig::from_env();
+ let repo_strings = huggingface::enumerate_repo_urls(
+ &repo_specifiers,
+ &auth,
+ global_args.ignore_certs,
+ Some(&mut progress),
+ )
+ .await
+ .context("Failed to enumerate Hugging Face repositories")?;
+
+ for repo_string in repo_strings {
+ match GitUrl::from_str(&repo_string) {
+ Ok(repo_url) => {
+ repo_urls.push(repo_url);
+ num_found += 1;
+ }
+ Err(e) => {
+ progress.suspend(|| {
+ error!("Failed to parse repo URL from {repo_string}: {e}");
+ });
+ }
+ }
+ }
+
+ progress.finish_with_message(format!(
+ "Found {} repositories from Hugging Face",
+ HumanCount(num_found)
+ ));
+ }
+ repo_urls.sort();
+ repo_urls.dedup();
+ Ok(repo_urls)
+}
+
pub async fn enumerate_bitbucket_repos(
args: &scan::ScanArgs,
global_args: &global::GlobalArgs,
@@ -678,3 +741,81 @@ pub async fn fetch_s3_objects(
Ok(())
}
+
+pub async fn fetch_gcs_objects(
+ args: &scan::ScanArgs,
+ datastore: &Arc>,
+ rules_db: &RulesDatabase,
+ matcher_stats: &Mutex,
+ enable_profiling: bool,
+ shared_profiler: Arc,
+ progress_enabled: bool,
+) -> Result<()> {
+ let Some(bucket) = args.input_specifier_args.gcs_bucket.as_deref() else {
+ return Ok(());
+ };
+ let prefix = args.input_specifier_args.gcs_prefix.as_deref();
+ let service_account = args.input_specifier_args.gcs_service_account.as_deref();
+
+ let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
+ let seen_blobs = BlobIdMap::new();
+ let matcher = Matcher::new(
+ rules_db,
+ scanner_pool,
+ &seen_blobs,
+ Some(matcher_stats),
+ enable_profiling,
+ Some(shared_profiler.clone()),
+ &args.extra_ignore_comments,
+ args.no_inline_ignore,
+ )?;
+ let mut processor = BlobProcessor { matcher };
+
+ let progress = if progress_enabled {
+ let style =
+ ProgressStyle::with_template("{spinner} {msg} ({pos} objects) [{elapsed_precise}]")
+ .expect("progress bar style template should compile");
+ let pb = ProgressBar::new_spinner().with_style(style).with_message("Fetching GCS objects");
+ pb.enable_steady_tick(Duration::from_millis(500));
+ pb
+ } else {
+ ProgressBar::hidden()
+ };
+
+ let pb = progress.clone();
+
+ let bucket_name = bucket.to_string();
+
+ gcs::visit_bucket_objects(bucket, prefix, service_account, move |key, bytes| {
+ let origin = OriginSet::new(
+ Origin::from_extended(serde_json::json!({
+ "path": format!("gs://{}/{}", bucket_name, key)
+ })),
+ Vec::new(),
+ );
+ let blob = crate::blob::Blob::from_bytes(bytes);
+
+ if let Some((origin, blob_md, scored_matches)) =
+ processor.run(origin, blob, args.no_dedup, args.redact, args.no_base64)?
+ {
+ let origin_arc = Arc::new(origin);
+ let blob_arc = Arc::new(blob_md);
+
+ let mut batch = Vec::with_capacity(scored_matches.len());
+ for (_score, m) in scored_matches {
+ batch.push((origin_arc.clone(), blob_arc.clone(), m));
+ }
+
+ let added = datastore.lock().unwrap().record(batch, !args.no_dedup);
+ debug!("Added {} new GCS blobs", added);
+ }
+ pb.inc(1);
+ Ok(())
+ })
+ .await?;
+
+ let total = progress.position();
+ progress.finish_with_message(format!("Fetched {} GCS objects", total));
+
+ Ok(())
+}
diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs
index 9de4a00..25dae47 100644
--- a/src/scanner/runner.rs
+++ b/src/scanner/runner.rs
@@ -1,4 +1,7 @@
-use std::sync::{Arc, Mutex};
+use std::{
+ fs,
+ sync::{Arc, Mutex},
+};
use anyhow::{bail, Context, Result};
use crossbeam_skiplist::SkipMap;
@@ -21,10 +24,11 @@ use crate::{
safe_list,
scanner::{
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
- enumerate_filesystem_inputs, enumerate_github_repos,
+ enumerate_filesystem_inputs, enumerate_github_repos, enumerate_huggingface_repos,
repos::{
enumerate_gitea_repos, enumerate_gitlab_repos, fetch_confluence_pages,
- fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects, fetch_slack_messages,
+ fetch_gcs_objects, fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects,
+ fetch_slack_messages,
},
run_secret_validation, save_docker_images,
summary::print_scan_summary,
@@ -74,12 +78,14 @@ pub async fn run_async_scan(
let mut repo_urls = enumerate_github_repos(args, global_args).await?;
let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
+ let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
// Combine repository URLs
repo_urls.extend(gitlab_repo_urls);
repo_urls.extend(gitea_repo_urls);
+ repo_urls.extend(huggingface_repo_urls);
repo_urls.extend(bitbucket_repo_urls);
repo_urls.extend(azure_repo_urls);
repo_urls.sort();
@@ -182,8 +188,20 @@ pub async fn run_async_scan(
)
.await?;
- let has_s3 = args.input_specifier_args.s3_bucket.is_some();
- if input_roots.is_empty() && !has_s3 {
+ fetch_gcs_objects(
+ args,
+ &datastore,
+ rules_db,
+ &matcher_stats,
+ enable_profiling,
+ Arc::clone(&shared_profiler),
+ progress_enabled,
+ )
+ .await?;
+
+ let has_remote_objects = args.input_specifier_args.s3_bucket.is_some()
+ || args.input_specifier_args.gcs_bucket.is_some();
+ if input_roots.is_empty() && !has_remote_objects {
bail!("No inputs to scan");
}
@@ -231,6 +249,26 @@ pub async fn run_async_scan(
crate::baseline::apply_baseline(&mut ds, &path, args.manage_baseline, &input_roots)?;
}
+ let mut skip_aws_accounts = args.skip_aws_account.clone();
+
+ if let Some(path) = args.skip_aws_account_file.as_ref() {
+ let contents = fs::read_to_string(path).with_context(|| {
+ format!("Failed to read --skip-aws-account-file {}", path.display())
+ })?;
+
+ for line in contents.lines() {
+ let content = line.split('#').next().unwrap_or("");
+ for value in content.split(|c: char| c.is_ascii_whitespace() || c == ',' || c == ';') {
+ let trimmed = value.trim();
+ if !trimmed.is_empty() {
+ skip_aws_accounts.push(trimmed.to_string());
+ }
+ }
+ }
+ }
+
+ crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
+
// If validation is enabled, run it as a second phase
if !args.no_validate {
info!("Starting secret validation phase...");
diff --git a/src/validation.rs b/src/validation.rs
index 81f97dd..b371bf0 100644
--- a/src/validation.rs
+++ b/src/validation.rs
@@ -95,6 +95,14 @@ pub fn init_validation_caches() {
aws::set_aws_validation_concurrency(15);
}
+pub fn set_skip_aws_account_ids(ids: I)
+where
+ I: IntoIterator- ,
+ S: Into
,
+{
+ aws::set_aws_skip_account_ids(ids);
+}
+
#[derive(Clone)]
pub struct CachedResponse {
pub body: String,
@@ -781,6 +789,26 @@ async fn timed_validate_single_match<'a>(
}
}
+ if let Some(account_id) = aws::should_skip_aws_validation(&akid) {
+ m.validation_success = false;
+ m.validation_response_body = format!(
+ "(skip list entry) AWS validation not attempted for account {}.",
+ account_id
+ );
+ m.validation_response_status = StatusCode::CONTINUE;
+ cache.insert(
+ cache_key,
+ CachedResponse {
+ body: m.validation_response_body.clone(),
+ status: m.validation_response_status,
+ is_valid: m.validation_success,
+ timestamp: Instant::now(),
+ },
+ );
+ commit_and_return(m);
+ return;
+ }
+
if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
m.validation_success = false;
m.validation_response_body = format!("Invalid AWS credentials ({}): {}", akid, e);
diff --git a/src/validation/aws.rs b/src/validation/aws.rs
index 7e1073d..156b32c 100644
--- a/src/validation/aws.rs
+++ b/src/validation/aws.rs
@@ -1,7 +1,7 @@
-use std::time::Duration;
+use std::{collections::HashSet, sync::RwLock, time::Duration};
use anyhow::{anyhow, Result};
-use aws_config::{retry::RetryConfig, BehaviorVersion};
+use aws_config::{retry::RetryConfig, BehaviorVersion, SdkConfig};
use aws_credential_types::Credentials;
use aws_sdk_sts::{
config::Builder as StsConfigBuilder, error::SdkError,
@@ -26,8 +26,9 @@ use http::{
header::{HeaderValue, USER_AGENT},
StatusCode,
};
-use once_cell::sync::OnceCell;
+use once_cell::sync::{Lazy, OnceCell};
use rand::{rng, Rng};
+use regex::Regex;
use tokio::{
sync::Semaphore,
time::{sleep, timeout},
@@ -36,6 +37,57 @@ use tokio::{
use crate::validation::GLOBAL_USER_AGENT;
static AWS_VALIDATION_SEMAPHORE: OnceCell = OnceCell::new();
+const BUILTIN_SKIP_ACCOUNT_IDS: &[&str] = &[
+ "052310077262",
+ "171436882533",
+ "528757803018",
+ "534261010715",
+ "595918472158",
+ "729780141977",
+ "893192397702",
+ "992382622183",
+];
+
+static AWS_SKIP_ACCOUNT_IDS: Lazy>> = Lazy::new(|| {
+ let mut set = HashSet::new();
+ set.extend(BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string()));
+ RwLock::new(set)
+});
+
+fn build_http_client() -> SharedHttpClient {
+ HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
+ let mut conn_builder = ConnectorBuilder::default()
+ .tls_provider(tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc));
+
+ conn_builder.set_connector_settings(settings.cloned());
+ if let Some(components) = runtime_components {
+ conn_builder.set_sleep_impl(components.sleep_impl());
+ }
+ conn_builder.set_proxy_config(Some(ProxyConfig::from_env()));
+ conn_builder.build()
+ })
+}
+
+async fn build_base_config(credentials: Credentials) -> SdkConfig {
+ let retry_config = RetryConfig::adaptive().with_max_attempts(3);
+ aws_config::defaults(BehaviorVersion::latest())
+ .region(Region::new("us-east-1"))
+ .credentials_provider(credentials)
+ .http_client(build_http_client())
+ .retry_config(retry_config)
+ .load()
+ .await
+}
+
+fn extract_account_id(input: &str) -> Option {
+ let trimmed = input.trim();
+ if trimmed.len() == 12 && trimmed.chars().all(|c| c.is_ascii_digit()) {
+ return Some(trimmed.to_string());
+ }
+
+ static ACCOUNT_ID_RE: Lazy = Lazy::new(|| Regex::new(r"(\d{12})").expect("valid regex"));
+ ACCOUNT_ID_RE.captures(trimmed).and_then(|caps| caps.get(1)).map(|m| m.as_str().to_string())
+}
/// Set the maximum number of concurrent AWS validations. Call before first use.
pub fn set_aws_validation_concurrency(max: usize) {
@@ -46,6 +98,46 @@ fn aws_validation_semaphore() -> &'static Semaphore {
AWS_VALIDATION_SEMAPHORE.get_or_init(|| Semaphore::new(15))
}
+pub fn set_aws_skip_account_ids(ids: I)
+where
+ I: IntoIterator- ,
+ S: Into
,
+{
+ let mut guard = match AWS_SKIP_ACCOUNT_IDS.write() {
+ Ok(g) => g,
+ Err(poisoned) => poisoned.into_inner(),
+ };
+ guard.clear();
+
+ guard.extend(BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string()));
+
+ for raw in ids.into_iter() {
+ let value = raw.into();
+ if value.trim().is_empty() {
+ continue;
+ }
+ if let Some(normalized) = extract_account_id(&value) {
+ guard.insert(normalized);
+ } else {
+ tracing::warn!("Ignoring invalid AWS account ID in skip list: {value}");
+ }
+ }
+}
+
+pub fn should_skip_aws_validation(access_key_id: &str) -> Option {
+ let guard = AWS_SKIP_ACCOUNT_IDS.read().ok()?;
+ if guard.is_empty() {
+ return None;
+ }
+
+ let account = aws_key_to_account_number(access_key_id).ok()?;
+ if guard.contains(&account) {
+ Some(account)
+ } else {
+ None
+ }
+}
+
#[derive(Debug)]
struct UaInterceptor;
@@ -132,29 +224,7 @@ pub async fn validate_aws_credentials(
None, // expiry
"static", // provider name
);
- // Create HTTP client that respects proxy settings from the environment
- let http_client: SharedHttpClient =
- HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
- let mut conn_builder = ConnectorBuilder::default()
- .tls_provider(tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc));
-
- conn_builder.set_connector_settings(settings.cloned());
- if let Some(components) = runtime_components {
- conn_builder.set_sleep_impl(components.sleep_impl());
- }
- conn_builder.set_proxy_config(Some(ProxyConfig::from_env()));
- conn_builder.build()
- });
-
- // Create AWS config with adaptive retries
- let retry_config = RetryConfig::adaptive().with_max_attempts(3);
- let config = aws_config::defaults(BehaviorVersion::latest())
- .region(Region::new("us-east-1"))
- .credentials_provider(credentials)
- .http_client(http_client)
- .retry_config(retry_config)
- .load()
- .await;
+ let config = build_base_config(credentials).await;
// Create STS client
let sts_config = StsConfigBuilder::from(&config).interceptor(UaInterceptor).build();
@@ -230,3 +300,74 @@ pub fn aws_key_to_account_number(aws_key_id: &str) -> Result> = Lazy::new(|| Mutex::new(()));
+
+ #[test]
+ fn skip_account_list_normalizes_inputs() {
+ let _lock = TEST_GUARD.lock().unwrap();
+
+ set_aws_skip_account_ids([
+ " 052310077262 ",
+ "arn:aws:iam::171436882533:role/demo",
+ "invalid",
+ ]);
+
+ let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
+ assert!(guard.contains("052310077262"));
+ assert!(guard.contains("171436882533"));
+ assert_eq!(guard.len(), BUILTIN_SKIP_ACCOUNT_IDS.len());
+ drop(guard);
+
+ set_aws_skip_account_ids(Vec::::new());
+ }
+
+ #[test]
+ fn should_skip_when_account_matches() {
+ let _lock = TEST_GUARD.lock().unwrap();
+
+ set_aws_skip_account_ids(["534261010715"]);
+ assert_eq!(
+ should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
+ Some("534261010715".to_string())
+ );
+
+ set_aws_skip_account_ids(Vec::::new());
+ }
+
+ #[test]
+ fn builtin_canary_accounts_are_preseeded() {
+ let _lock = TEST_GUARD.lock().unwrap();
+
+ set_aws_skip_account_ids(Vec::::new());
+ assert_eq!(
+ should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
+ Some("534261010715".to_string())
+ );
+
+ set_aws_skip_account_ids(Vec::::new());
+ }
+
+ #[test]
+ fn duplicate_accounts_are_deduplicated() {
+ let _lock = TEST_GUARD.lock().unwrap();
+
+ set_aws_skip_account_ids([
+ "534261010715",
+ "arn:aws:iam::534261010715:user/canarytokens",
+ " 534261010715 ",
+ ]);
+
+ let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
+ assert_eq!(guard.iter().filter(|id| id.as_str() == "534261010715").count(), 1);
+ drop(guard);
+
+ set_aws_skip_account_ids(Vec::::new());
+ }
+}
diff --git a/src/validation/utils.rs b/src/validation/utils.rs
index ea0b020..9e8422d 100644
--- a/src/validation/utils.rs
+++ b/src/validation/utils.rs
@@ -33,31 +33,77 @@ pub fn find_closest_variable(
target_variable_name: &str,
search_variable_name: &str,
) -> Option {
- // Find positions of the target variable with the target value
+ // Collect the positions of the target variable for the provided value so we can
+ // compare relative offsets with candidate variables.
let mut target_positions = Vec::new();
for (name, value, start, end) in captures {
if name == target_variable_name && value == target_value {
target_positions.push((*start, *end));
}
}
+
if target_positions.is_empty() {
return None;
}
- // For each target position, find the closest search variable
- let mut closest_distance = usize::MAX;
- let mut closest_value: Option = None;
- for (_target_start, target_end) in target_positions {
- for (name, value, start, _) in captures {
- if name == search_variable_name {
- let distance = (*start as isize - target_end as isize).abs() as usize;
- if distance < closest_distance {
- closest_distance = distance;
- closest_value = Some(value.clone());
+
+ // Prefer candidates that appear before the target value (same logical block), but
+ // fall back to overlapping values and then to those that appear after the target
+ // value when no better match exists. This avoids pairing with the next block when
+ // multiple credentials are close together in the same file.
+ let mut best_before: Option<(usize, String)> = None;
+ let mut best_overlap: Option<(usize, String)> = None;
+ let mut best_after: Option<(usize, String)> = None;
+
+ for (target_start, target_end) in target_positions.iter().copied() {
+ for (name, value, start, end) in captures {
+ if name != search_variable_name {
+ continue;
+ }
+
+ if *end <= target_start {
+ // Candidate is before the target; choose the one closest to the target start.
+ let distance = target_start - *end;
+ match &mut best_before {
+ Some((best_distance, best_value)) if distance < *best_distance => {
+ *best_distance = distance;
+ *best_value = value.clone();
+ }
+ None => {
+ best_before = Some((distance, value.clone()));
+ }
+ _ => {}
+ }
+ } else if *start >= target_end {
+ // Candidate is after the target; choose the one closest to the target end.
+ let distance = *start - target_end;
+ match &mut best_after {
+ Some((best_distance, best_value)) if distance < *best_distance => {
+ *best_distance = distance;
+ *best_value = value.clone();
+ }
+ None => {
+ best_after = Some((distance, value.clone()));
+ }
+ _ => {}
+ }
+ } else {
+ // Candidate overlaps the target – treat as an exact match.
+ let distance = 0usize;
+ match &mut best_overlap {
+ Some((best_distance, best_value)) if distance < *best_distance => {
+ *best_distance = distance;
+ *best_value = value.clone();
+ }
+ None => {
+ best_overlap = Some((distance, value.clone()));
+ }
+ _ => {}
}
}
}
}
- closest_value
+
+ best_before.or(best_overlap).or(best_after).map(|(_, value)| value)
}
pub async fn check_url_resolvable(url: &Url) -> Result<(), Box> {
@@ -147,4 +193,31 @@ mod tests {
]
);
}
+
+ #[test]
+ fn prefers_closest_preceding_variable() {
+ let captures = vec![
+ ("TOKEN".to_string(), "secret".to_string(), 75usize, 115usize),
+ ("AKID".to_string(), "preceding".to_string(), 30usize, 50usize),
+ ("AKID".to_string(), "following".to_string(), 180usize, 200usize),
+ ];
+
+ let result =
+ find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap();
+
+ assert_eq!(result, "preceding".to_string());
+ }
+
+ #[test]
+ fn falls_back_to_following_when_no_preceding() {
+ let captures = vec![
+ ("TOKEN".to_string(), "secret".to_string(), 10usize, 50usize),
+ ("AKID".to_string(), "after".to_string(), 60usize, 80usize),
+ ];
+
+ let result =
+ find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap();
+
+ assert_eq!(result, "after".to_string());
+ }
}
diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs
index 3947548..54379a3 100644
--- a/tests/int_allowlist.rs
+++ b/tests/int_allowlist.rs
@@ -72,6 +72,12 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result, skip_skipword: Vec) -> Result, skip_skipword: Vec) -> Result Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -102,6 +109,9 @@ fn test_bitbucket_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@@ -131,6 +141,8 @@ fn test_bitbucket_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs
index e86c589..2f1998c 100644
--- a/tests/int_dedup.rs
+++ b/tests/int_dedup.rs
@@ -85,6 +85,13 @@ rules:
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -120,6 +127,9 @@ rules:
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@@ -151,6 +161,8 @@ rules:
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_github.rs b/tests/int_github.rs
index dcf936a..82a0f78 100644
--- a/tests/int_github.rs
+++ b/tests/int_github.rs
@@ -72,6 +72,13 @@ fn test_github_remote_scan() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -107,6 +114,9 @@ fn test_github_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@@ -138,6 +148,8 @@ fn test_github_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs
index 7c2e0b9..745f323 100644
--- a/tests/int_gitlab.rs
+++ b/tests/int_gitlab.rs
@@ -71,6 +71,13 @@ fn test_gitlab_remote_scan() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -106,6 +113,9 @@ fn test_gitlab_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
@@ -137,6 +147,8 @@ fn test_gitlab_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};
@@ -210,6 +222,13 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -252,6 +271,9 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
scan_nested_repos: true,
since_commit: None,
branch: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
@@ -273,6 +295,8 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_redact.rs b/tests/int_redact.rs
index 7d7accb..a570cfe 100644
--- a/tests/int_redact.rs
+++ b/tests/int_redact.rs
@@ -55,6 +55,12 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -86,6 +92,9 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@@ -115,6 +124,8 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_slack.rs b/tests/int_slack.rs
index 5f2d01a..dd9df68 100644
--- a/tests/int_slack.rs
+++ b/tests/int_slack.rs
@@ -62,6 +62,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -92,6 +99,9 @@ impl TestContext {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
max_results: 10,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
@@ -123,6 +133,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};
@@ -186,6 +198,13 @@ async fn test_scan_slack_messages() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -218,6 +237,9 @@ async fn test_scan_slack_messages() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@@ -247,6 +269,8 @@ async fn test_scan_slack_messages() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs
index 74cf6e4..ba81576 100644
--- a/tests/int_validation_cache.rs
+++ b/tests/int_validation_cache.rs
@@ -128,6 +128,13 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -163,6 +170,9 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@@ -194,6 +204,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs
index 464377a..5dec6b5 100644
--- a/tests/int_vulnerable_files.rs
+++ b/tests/int_vulnerable_files.rs
@@ -71,6 +71,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -106,6 +113,9 @@ impl TestContext {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@@ -137,6 +147,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
@@ -183,6 +195,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
+ huggingface_user: Vec::new(),
+ huggingface_organization: Vec::new(),
+ huggingface_model: Vec::new(),
+ huggingface_dataset: Vec::new(),
+ huggingface_space: Vec::new(),
+ huggingface_exclude: Vec::new(),
+
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@@ -228,6 +247,10 @@ impl TestContext {
scan_nested_repos: true,
since_commit: None,
branch: None,
+
+ gcs_bucket: None,
+ gcs_prefix: None,
+ gcs_service_account: None,
},
extra_ignore_comments: Vec::new(),
content_filtering_args: ContentFilteringArgs {
@@ -250,6 +273,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
+ skip_aws_account: Vec::new(),
+ skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};