- Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans.

- Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs.
- Added support for scanning Google Cloud Storage buckets via --gcs-bucket, including optional prefixes and service-account authentication.
- Added --skip-aws-account (now accepting comma-separated values) and --skip-aws-account-file to bypass live AWS validation for known canary/honey-token account IDs without triggering alerts. Kingfisher now ships with several canary AWS account IDs pre-seeded in the skip list and now reports matching findings as "Not Attempted" with the "Response" containing "(skip list entry)" so it's clear that validation was intentionally skipped and why.
This commit is contained in:
Mick Grove 2025-10-15 22:47:40 -07:00
commit 03d7364888
33 changed files with 1939 additions and 182 deletions

View file

@ -2,6 +2,12 @@
All notable changes to this project will be documented in this file.
## [v1.58.0]
- Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans.
- Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs.
- Added support for scanning Google Cloud Storage buckets via `--gcs-bucket`, including optional prefixes and service-account authentication.
- Added `--skip-aws-account` (now accepting comma-separated values) and `--skip-aws-account-file` to bypass live AWS validation for known canary/honey-token account IDs without triggering alerts. Kingfisher now ships with several canary AWS account IDs pre-seeded in the skip list and now reports matching findings as "Not Attempted" with the "Response" containing "(skip list entry)" so it's clear that validation was intentionally skipped and why.
## [v1.57.0]
- Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling
- Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported

View file

@ -1,6 +1,6 @@
[workspace.package]
edition = "2021"
rust-version = "1.88"
rust-version = "1.90"
license = "Apache-2.0"
authors = ["Mick Grove <mick.grove@mongodb.com>"]
homepage = "https://github.com/mongodb/kingfisher"
@ -10,7 +10,7 @@ publish = false
[package]
name = "kingfisher"
version = "1.57.0"
version = "1.58.0"
description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
edition.workspace = true
rust-version.workspace = true
@ -192,6 +192,7 @@ walkdir = "2.5.0"
p256 = "0.13.2"
ed25519-dalek = { version = "2.2", features = ["pkcs8"] }
aws-sdk-s3 = "1.100.0"
gcloud-storage = "1.1.1"
tokei = "12.1.2"
[target.'cfg(not(windows))'.dependencies]

View file

@ -110,11 +110,11 @@ setup-zig:
ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist
@echo "Checking Rust toolchain…"
@$(MAKE) check-rust || { \
echo "🦀 Installing Rust 1.88.0 …"; \
echo "🦀 Installing Rust 1.90.0 …"; \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
. $$HOME/.cargo/env; \
rustup toolchain install 1.88.0; \
rustup default 1.88.0; \
rustup toolchain install 1.90.0; \
rustup default 1.90.0; \
}
@echo "📦 Installing build dependencies (musl, cmake, etc.)…"
@ -150,11 +150,11 @@ ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist
ubuntu-arm64: setup-zig # ensures Zig & cargo-zigbuild exist
@echo "Checking Rust toolchain…"
@$(MAKE) check-rust || { \
echo "🦀 Installing Rust 1.88.0 …"; \
echo "🦀 Installing Rust 1.90.0 …"; \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
. $$HOME/.cargo/env; \
rustup toolchain install 1.88.0; \
rustup default 1.88.0; \
rustup toolchain install 1.90.0; \
rustup default 1.90.0; \
}
@echo "📦 Installing build dependencies (musl, cmake, etc.)…"
@ -245,7 +245,7 @@ endif
linux-x64: check-docker create-dockerignore
@mkdir -p target/release
docker run --platform linux/amd64 --rm \
-v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\
-v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\
apk add --no-cache \
musl-dev \
gcc g++ make cmake pkgconfig \
@ -274,7 +274,7 @@ linux-x64: check-docker create-dockerignore
linux-arm64: check-docker create-dockerignore
@mkdir -p target/release
docker run --platform linux/arm64 --rm \
-v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\
-v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\
apk add --no-cache \
musl-dev \
gcc g++ make cmake pkgconfig \
@ -385,7 +385,7 @@ check-rust:
echo "Rust not found."; \
exit 1; \
fi; \
required=1.88.0; \
required=1.90.0; \
if [ $$(printf '%s\n' "$$required" "$$version" | sort -V | head -n1) != "$$required" ]; then \
echo "Rust version $$version is older than required $$required."; \
exit 1; \

238
README.md
View file

@ -5,22 +5,23 @@
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
Kingfisher is a blazingly fast secret-scanning and live validation tool built in Rust. It combines Intel's hardware-accelerated Hyperscan regex engine with language-aware source code parsing, and **ships with hundreds of built-in rules** to detect, validate, and triage secrets before they ever reach production.
Kingfisher is a blazingly fast secret-scanning and live validation tool built in Rust. It combines Intel's SIMD-accelerated regex engine (Hyperscan) with language-aware source code parsing, and **ships with hundreds of built-in rules** to detect, validate, and triage secrets before they ever reach production.
</p>
Originally forked from Praetorian's Nosey Parker, Kingfisher **adds** live cloud-API validation; many more targets (GitLab, BitBucket, Gitea, S3, Docker, Jira, Confluence, Slack); compressed-file extraction and scanning; baseline and allowlist controls; language-aware detection (~20 languages); and a native Windows binary. See [Origins and Divergence](#origins-and-divergence) for details.
Originally forked from Praetorian's Nosey Parker, Kingfisher has since significantly expanded and diverged, adding live validation, 10+ new scan targets, and major architectural enhancements. See [Origins and Divergence](#origins-and-divergence) for details.
## Key Features
### Multiple Scan Targets
<div align="center">
| Files / Dirs | Local Git | GitHub | GitLab | Azure DevOps | Bitbucket | Gitea |
|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|
| <img src="./docs/assets/icons/files.svg" height="40" alt="Files / Dirs"/><br/><sub>Files / Dirs</sub> | <img src="./docs/assets/icons/local-git.svg" height="40" alt="Local Git"/><br/><sub>Local Git</sub> | <img src="./docs/assets/icons/github.svg" height="40" alt="GitHub"/><br/><sub>GitHub</sub> | <img src="./docs/assets/icons/gitlab.svg" height="40" alt="GitLab"/><br/><sub>GitLab</sub> | <img src="./docs/assets/icons/azure-devops.svg" height="40" alt="Azure DevOps"/><br/><sub>Azure DevOps</sub> | <img src="./docs/assets/icons/bitbucket.svg" height="40" alt="Bitbucket"/><br/><sub>Bitbucket</sub> | <img src="./docs/assets/icons/gitea.svg" height="40" alt="Gitea"/><br/><sub>Gitea</sub> |
| Files / Dirs | Local Git | GitHub | GitLab | Azure Repos | Bitbucket | Gitea | Hugging Face |
|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|:-------------:|
| <img src="./docs/assets/icons/files.svg" height="40" alt="Files / Dirs"/><br/><sub>Files / Dirs</sub> | <img src="./docs/assets/icons/local-git.svg" height="40" alt="Local Git"/><br/><sub>Local Git</sub> | <img src="./docs/assets/icons/github.svg" height="40" alt="GitHub"/><br/><sub>GitHub</sub> | <img src="./docs/assets/icons/gitlab.svg" height="40" alt="GitLab"/><br/><sub>GitLab</sub> | <img src="./docs/assets/icons/azure-devops.svg" height="40" alt="Azure Repos"/><br/><sub>Azure Repos</sub> | <img src="./docs/assets/icons/bitbucket.svg" height="40" alt="Bitbucket"/><br/><sub>Bitbucket</sub> | <img src="./docs/assets/icons/gitea.svg" height="40" alt="Gitea"/><br/><sub>Gitea</sub> |<img src="./docs/assets/icons/huggingface.svg" height="40" width="40" alt="Hugging Face"/><br/><sub>Hugging Face</sub> |
| Docker | Jira | Confluence | Slack | AWS S3 |
|:------:|:----:|:-----------:|:-----:|:------:|
| <img src="./docs/assets/icons/docker.svg" height="40" alt="Docker"/><br/><sub>Docker</sub> | <img src="./docs/assets/icons/jira.svg" height="40" alt="Jira"/><br/><sub>Jira</sub> | <img src="./docs/assets/icons/confluence.svg" height="40" alt="Confluence"/><br/><sub>Confluence</sub> | <img src="./docs/assets/icons/slack.svg" height="40" alt="Slack"/><br/><sub>Slack</sub> | <img src="./docs/assets/icons/aws-s3.svg" height="40" alt="AWS S3"/><br/><sub>AWS&nbsp;S3</sub> |
| Docker | Jira | Confluence | Slack | AWS S3 | Google Cloud |
|:------:|:----:|:-----------:|:-----:|:------:|:---:|
| <img src="./docs/assets/icons/docker.svg" height="40" alt="Docker"/><br/><sub>Docker</sub> | <img src="./docs/assets/icons/jira.svg" height="40" alt="Jira"/><br/><sub>Jira</sub> | <img src="./docs/assets/icons/confluence.svg" height="40" alt="Confluence"/><br/><sub>Confluence</sub> | <img src="./docs/assets/icons/slack.svg" height="40" alt="Slack"/><br/><sub>Slack</sub> | <img src="./docs/assets/icons/aws-s3.svg" height="40" alt="AWS S3"/><br/><sub>AWS&nbsp;S3</sub> | <img src="./docs/assets/icons/gcs.svg" height="40" alt="Google Cloud Storage"/><br/><sub>Cloud Storage</sub> |
</div>
@ -41,17 +42,27 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
<img src="docs/runtime-comparison.png" alt="Kingfisher Runtime Comparison" style="vertical-align: center;" />
</p>
# Table of Contents
<details>
- [Kingfisher](#kingfisher)
- [Key Features](#key-features)
- [Multiple Scan Targets](#multiple-scan-targets)
- [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules)
- [Benchmark Results](#benchmark-results)
- [Table of Contents](#table-of-contents)
- [Getting Started](#getting-started)
- [Installation](#installation)
- [Run Kingfisher in Docker](#run-kingfisher-in-docker)
- [Pre-built Releases](#pre-built-releases)
- [Homebrew](#homebrew)
- [Linux and macOS](#linux-and-macos)
- [Windows](#windows)
- [Compile](#compile)
- [ Run Kingfisher in Docker](#-run-kingfisher-in-docker)
- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance)
- [Write Custom Rules!](#write-custom-rules)
- [Usage](#usage)
- [📝 Write Custom Rules!](#-write-custom-rules)
- [🎉 Usage](#-usage)
- [Basic Examples](#basic-examples)
- [Scan with secret validation](#scan-with-secret-validation)
- [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories)
@ -67,6 +78,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Exclude specific paths](#exclude-specific-paths)
- [Scan changes in CI pipelines](#scan-changes-in-ci-pipelines)
- [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket)
- [ Scanning a Google Cloud Storage Bucket](#-scanning-a-google-cloud-storage-bucket)
- [ Scanning Docker Images](#-scanning-docker-images)
- [ Scanning GitHub](#-scanning-github)
- [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token)
@ -79,8 +91,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url)
- [List GitLab repositories](#list-gitlab-repositories)
- [ Scanning Azure Repos](#-scanning-azure-repos)
- [Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-devops-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat)
- [Scan specific Azure DevOps projects](#scan-specific-azure-devops-projects)
- [Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-repos-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat)
- [Scan specific Azure Repos projects](#scan-specific-azure-repos-projects)
- [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration)
- [List Azure repositories](#list-azure-repositories)
- [ Scanning Gitea](#-scanning-gitea)
@ -97,6 +109,12 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [List Bitbucket repositories](#list-bitbucket-repositories)
- [Authenticate to Bitbucket](#authenticate-to-bitbucket)
- [Self-hosted Bitbucket Server](#self-hosted-bitbucket-server)
- [ Scanning Hugging Face](#-scanning-hugging-face)
- [Scan Hugging Face user](#scan-hugging-face-user)
- [Scan Hugging Face organization](#scan-hugging-face-organization)
- [Scan specific Hugging Face resources](#scan-specific-hugging-face-resources)
- [List Hugging Face repositories](#list-hugging-face-repositories)
- [Authenticate to Hugging Face](#authenticate-to-hugging-face)
- [ Scanning Jira](#-scanning-jira)
- [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query)
- [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues)
@ -107,7 +125,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Environment Variables for Tokens](#environment-variables-for-tokens)
- [Exit Codes](#exit-codes)
- [Update Checks](#update-checks)
- [Advanced Options](#advanced-options)
- [🤓 Advanced Options](#-advanced-options)
- [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets)
- [List Builtin Rules](#list-builtin-rules)
- [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run)
@ -117,6 +135,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Notable Scan Options](#notable-scan-options)
- [Understanding `--confidence`](#understanding---confidence)
- [Ignore known false positives](#ignore-known-false-positives)
- [Skip Canary Tokens](#skip-canary-tokens)
- [Common CLI flows](#common-cli-flows)
- [Inline ignore directives](#inline-ignore-directives)
- [Finding Fingerprint](#finding-fingerprint)
- [Rule Performance Profiling](#rule-performance-profiling)
@ -125,36 +145,58 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Roadmap](#roadmap)
- [License](#license)
</details>
# Getting Started
## Installation
### Pre-built Releases
Pre-built binaries are available from the [Releases](https://github.com/mongodb/kingfisher/releases) section.
### Homebrew
On macOS, you can simply
```bash
brew install kingfisher
```
Pre-built binaries are also available in the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page.
### Linux and macOS
You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform:
<details>
You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform.
```bash
# Linux, macOS
curl --silent --location \
https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \
sh && \
ubi --project mongodb/kingfisher --in "$HOME/bin"
ubi --project mongodb/kingfisher --in "$HOME/.local/bin"
```
This installs and runs `ubi` and then places the `kingfisher` executable in `~/.local/bin` on Unix-like systems.
</details>
### Windows
<details>
You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform.
```powershell
# Windows
powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in .
```
This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like
systems (or the current directory on Windows).
This installs and runs `ubi` and then places the `kingfisher` executable in the current directory on Windows.
</details>
Or you may compile for your platform via `make`:
### Compile
You may compile for your platform via `make`
<details>
```bash
# NOTE: Requires Docker
@ -174,9 +216,14 @@ make darwin-all # builds both x64 and arm64
make all # builds for every OS and architecture supported
```
### Run Kingfisher in Docker
</details>
### <img src="./docs/assets/icons/docker.svg" height="40" style="vertical-align:text-bottom;" alt="Docker"/> Run Kingfisher in Docker
Run the dockerized Kingfisher container
<details>
Run the dockerized Kingfisher container:
```bash
# GitHub Container Registry
docker run --rm ghcr.io/mongodb/kingfisher:latest --version
@ -233,6 +280,8 @@ docker run --rm \
```
</details>
# 🔐 Detection Rules at a Glance
Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview:
@ -248,7 +297,7 @@ Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything fr
| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more |
| **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more |
## Write Custom Rules!
## 📝 Write Custom Rules!
Kingfisher ships with hundreds of rules with HTTP and service-specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential.
@ -258,7 +307,7 @@ First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom King
Once you've done that, you can supply your custom rules (defined in a YAML file) to Kingfisher at runtime --- no recompiling required!
# Usage
# 🎉 Usage
## Basic Examples
@ -400,7 +449,7 @@ kingfisher scan ./my-project \
-v
```
## <img alt="GitHub" src="./docs/assets/icons/aws-s3.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning an AWS S3 Bucket
## <img alt="AWS S3" src="./docs/assets/icons/aws-s3.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning an AWS S3 Bucket
You can scan S3 objects directly:
```bash
@ -415,7 +464,7 @@ Credential resolution happens in this order:
If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role.
Examples:
Examples
```bash
# using explicit keys
@ -452,7 +501,30 @@ docker run --rm \
scan --s3-bucket bucket-name
```
## <img alt="Docker" src="./docs/assets/icons/docker.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Docker Images
## <img src="./docs/assets/icons/gcs.svg" height="40" width="40" alt="Google Cloud Storage"/> Scanning a Google Cloud Storage Bucket
The `--gcs-bucket` flag streams objects directly from Google Cloud Storage. Authentication uses
Application Default Credentials, so you can provide a service-account JSON file via the
`GOOGLE_APPLICATION_CREDENTIALS` environment variable or by passing `--gcs-service-account`. Public
buckets work without credentials.
```bash
kingfisher scan --gcs-bucket bucket-name
# scan a sub-tree inside the bucket
kingfisher scan --gcs-bucket bucket-name --gcs-prefix path/to/data/
# supply a service-account key explicitly
kingfisher scan --gcs-bucket bucket-name --gcs-service-account /path/to/key.json
```
Functional example:
```bash
kingfisher scan --gcs-bucket cloud-samples-data --gcs-prefix "storage/"
```
## <img alt="Docker" src="./docs/assets/icons/docker.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Docker Images
Kingfisher will first try to use any locally available image, then fall back to pulling via OCI.
@ -482,7 +554,7 @@ kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws
kingfisher scan --docker-image private.registry.example.com/my-image:tag
```
## <img alt="GitHub" src="./docs/assets/icons/github.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning GitHub
## <img alt="GitHub" src="./docs/assets/icons/github.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning GitHub
### Scan GitHub organization (requires `KF_GITHUB_TOKEN`)
@ -524,7 +596,7 @@ KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/priva
---
## <img alt="GitLab" src="./docs/assets/icons/gitlab.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning GitLab
## <img alt="GitLab" src="./docs/assets/icons/gitlab.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning GitLab
### Scan GitLab group (requires `KF_GITLAB_TOKEN`)
@ -580,18 +652,18 @@ kingfisher gitlab repos list --group my-group --include-subgroups
# skip specific projects when listing or scanning (supports glob patterns)
kingfisher gitlab repos list --group my-group --gitlab-exclude my-group/**/legacy-*
```
## <img alt="Azure Repos" src="./docs/assets/icons/azure-devops.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Azure Repos
## <img alt="Azure Repos" src="./docs/assets/icons/azure-devops.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Azure Repos
### Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)
### Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)
```bash
kingfisher scan --azure-organization my-org
# Azure DevOps Server example
# Azure Repos Server example
KF_AZURE_PAT="pat" kingfisher scan --azure-organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/
```
### Scan specific Azure DevOps projects
### Scan specific Azure Repos projects
Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects.
@ -621,7 +693,7 @@ kingfisher azure repos list --project my-org/app --project my-org/api
# skip specific repositories while listing (supports glob patterns)
kingfisher azure repos list --organization my-org --azure-exclude my-org/**/experimental-*
```
## <img alt="Gitea" src="./docs/assets/icons/gitea.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Gitea
## <img alt="Gitea" src="./docs/assets/icons/gitea.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Gitea
### Scan Gitea organization (requires `KF_GITEA_TOKEN`)
@ -673,7 +745,7 @@ KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --all-gitea-organizations
# self-hosted example
KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/
```
## <img alt="Bitbucket" src="./docs/assets/icons/bitbucket.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Bitbucket
## <img alt="Bitbucket" src="./docs/assets/icons/bitbucket.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Bitbucket
### Scan Bitbucket workspace
```bash
@ -745,7 +817,45 @@ Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, fo
`https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with
`--bitbucket-username` and `--bitbucket-token`, and pass `--ignore-certs` when
connecting to HTTP or otherwise insecure instances.
## <img alt="Jira" src="./docs/assets/icons/jira.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Jira
## <img src="./docs/assets/icons/huggingface.svg" height="40" width="40" alt="Hugging Face"/> Scanning Hugging Face
Hugging Face hosts git repositories for models, datasets, and Spaces. Kingfisher can enumerate and scan all three resource types.
### Scan Hugging Face user
```bash
kingfisher scan --huggingface-user <username>
```
### Scan Hugging Face organization
```bash
kingfisher scan --huggingface-organization <orgname>
```
### Scan specific Hugging Face resources
Scan individual repositories by ID (owner/name) or by passing the full HTTPS URL:
```bash
kingfisher scan --huggingface-model <owner/model>
kingfisher scan --huggingface-dataset https://huggingface.co/datasets/<owner>/<dataset>
kingfisher scan --huggingface-space <owner/space>
```
Use `--huggingface-exclude` to omit results returned by user or organization enumeration. Prefix values with `model:`, `dataset:`, or `space:` when you only want to skip a specific resource type.
### List Hugging Face repositories
```bash
kingfisher huggingface repos list --huggingface-user <username>
```
### Authenticate to Hugging Face
Private repositories require an access token provided through the `KF_HUGGINGFACE_TOKEN` environment variable. For git authentication the helper also honours `KF_HUGGINGFACE_USERNAME` (default `hf_user`).
## <img alt="Jira" src="./docs/assets/icons/jira.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Jira
### Scan Jira issues matching a JQL query
@ -764,7 +874,7 @@ KF_JIRA_TOKEN="token" kingfisher scan \
--max-results 1000
```
## <img alt="Confluence" src="./docs/assets/icons/confluence.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Confluence
## <img alt="Confluence" src="./docs/assets/icons/confluence.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Confluence
### Scan Confluence pages matching a CQL query
```bash
@ -789,7 +899,7 @@ Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environ
To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions.
## <img alt="Slack" src="./docs/assets/icons/slack.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Slack
## <img alt="Slack" src="./docs/assets/icons/slack.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Slack
### Scan Slack messages matching a search query
```bash
@ -811,11 +921,13 @@ KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \
| `KF_GITLAB_TOKEN` | GitLab Personal Access Token |
| `KF_GITEA_TOKEN` | Gitea Personal Access Token |
| `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) |
| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure DevOps Personal Access Token |
| `KF_AZURE_USERNAME` | Username to use with Azure DevOps PATs (defaults to `pat` when unset) |
| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure Repos Personal Access Token |
| `KF_AZURE_USERNAME` | Username to use with Azure Repos PATs (defaults to `pat` when unset) |
| `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication |
| `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token |
| `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token |
| `KF_HUGGINGFACE_TOKEN` | Hugging Face access token for API enumeration and git cloning |
| `KF_HUGGINGFACE_USERNAME` | Optional username for Hugging Face git operations (defaults to `hf_user`) |
| `KF_JIRA_TOKEN` | Jira API token |
| `KF_CONFLUENCE_TOKEN` | Confluence API token |
| `KF_SLACK_TOKEN` | Slack API token |
@ -869,7 +981,7 @@ Kingfisher automatically queries GitHub for a newer release when it starts and t
- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely
# Advanced Options
# 🤓 Advanced Options
## Build a Baseline / Detect New Secrets
@ -963,6 +1075,8 @@ leaves the default unchanged.
- `--manage-baseline`: Create or update the baseline file with current findings
- `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable)
- `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable)
- `--skip-aws-account <ACCOUNT_ID>`: Skip live AWS validation for findings tied to the specified AWS account number (repeatable, accepts comma-separated lists)
- `--skip-aws-account-file <FILE>`: Load AWS account numbers to skip from a file (one account per line; `#` comments allowed)
- `--ignore-comment <DIRECTIVE>`: Honor additional inline directives from other scanners (repeatable; e.g. `--ignore-comment "gitleaks:allow"`)
- `--no-ignore`: Disable inline directives entirely so every match is reported
## Understanding `--confidence`
@ -996,6 +1110,46 @@ kingfisher scan \
If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early.
### Skip Canary Tokens
Canary tokens are intentionally-leaked credentials defenders sprinkle across infrastructure to catch adversaries. Research has highlighted that static identification of AWS canary tokens is now table stakes for both attackers avoiding noisy traps and responders running hygiene scans. Kingfisher ships with the same awareness so you can clean environments without detonating the tripwires that many canary tokens deploy.
To avoid triggering AWS honey tokens, provide `--skip-aws-account` with the 12-digit account numbers associated with your canaries (you can pass a comma-separated list or use `--skip-aws-account-file` to read from disk—blank lines and `#` comments are ignored). Kingfisher pre-seeds its skip list with a number of (but not all) Thinkst Canary AWS account IDs that back canarytokens.org, so routine scans bypass them automatically.
```bash
kingfisher scan /path/to/file \
--skip-aws-account "171436882533,534261010715"
```
By default, any finding tied to a skip-listed account is marked as **Not Attempted** and the validation response explains that AWS verification was not performed. This keeps routine hygiene scans from triggering alerts in production telemetry while making it clear that the credential's status still needs manual verification if required.
#### Common CLI flows
```bash
# Skip a few in-house canaries during a filesystem scan
kingfisher scan repo/ \
--skip-aws-account 111122223333,444455556666
# Read a longer list from disk while also ignoring test fixtures
kingfisher scan repo/ \
--skip-aws-account-file /tmp/scripts/canary_accounts.txt \
--skip-word fixture
# Combine preloaded canary IDs with a just-created decoy account
printf '999900001111\n' > /tmp/new_canary.txt
kingfisher scan /path/to/repo \
--skip-aws-account-file /tmp/new_canary.txt
# Validate only newly introduced secrets in CI without firing alerts
kingfisher scan --git-diff origin/main \
--skip-aws-account-file .ci/canaries.txt \
--exit-on-validation
```
Tip: if you manage multiple canary fleets (Thinkst, self-hosted alternatives, or bespoke decoys), checkpoint the account IDs alongside your infrastructure-as-code so security teams can rotate or expand the skip list without editing pipelines.
### Inline ignore directives
Add `kingfisher:ignore` anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value:

View file

@ -0,0 +1,41 @@
<svg xmlns="http://www.w3.org/2000/svg" width="95" height="88" fill="none">
<path
fill="#fff"
d="M94.25 70.08a8.28 8.28 0 0 1-.43 6.46 10.57 10.57 0 0 1-3 3.6 25.18 25.18 0 0 1-5.7 3.2 65.74 65.74 0 0 1-7.56 2.65 46.67 46.67 0 0 1-11.42 1.68c-5.42.05-10.09-1.23-13.4-4.5a40.4 40.4 0 0 1-10.14.03c-3.34 3.25-7.99 4.52-13.39 4.47a46.82 46.82 0 0 1-11.43-1.68 66.37 66.37 0 0 1-7.55-2.65c-2.28-.98-4.17-2-5.68-3.2a10.5 10.5 0 0 1-3.02-3.6c-.99-2-1.18-4.3-.42-6.46a8.54 8.54 0 0 1-.33-5.63c.25-.95.66-1.83 1.18-2.61a8.67 8.67 0 0 1 2.1-8.47 8.23 8.23 0 0 1 2.82-2.07 41.75 41.75 0 1 1 81.3-.12 8.27 8.27 0 0 1 3.11 2.19 8.7 8.7 0 0 1 2.1 8.47c.52.78.93 1.66 1.18 2.61a8.61 8.61 0 0 1-.32 5.63Z"
/>
<path fill="#FFD21E" d="M47.21 76.5a34.75 34.75 0 1 0 0-69.5 34.75 34.75 0 0 0 0 69.5Z" />
<path
fill="#FF9D0B"
d="M81.96 41.75a34.75 34.75 0 1 0-69.5 0 34.75 34.75 0 0 0 69.5 0Zm-73.5 0a38.75 38.75 0 1 1 77.5 0 38.75 38.75 0 0 1-77.5 0Z"
/>
<path
fill="#3A3B45"
d="M58.5 32.3c1.28.44 1.78 3.06 3.07 2.38a5 5 0 1 0-6.76-2.07c.61 1.15 2.55-.72 3.7-.32ZM34.95 32.3c-1.28.44-1.79 3.06-3.07 2.38a5 5 0 1 1 6.76-2.07c-.61 1.15-2.56-.72-3.7-.32Z"
/>
<path
fill="#FF323D"
d="M46.96 56.29c9.83 0 13-8.76 13-13.26 0-2.34-1.57-1.6-4.09-.36-2.33 1.15-5.46 2.74-8.9 2.74-7.19 0-13-6.88-13-2.38s3.16 13.26 13 13.26Z"
/>
<path
fill="#3A3B45"
fill-rule="evenodd"
d="M39.43 54a8.7 8.7 0 0 1 5.3-4.49c.4-.12.81.57 1.24 1.28.4.68.82 1.37 1.24 1.37.45 0 .9-.68 1.33-1.35.45-.7.89-1.38 1.32-1.25a8.61 8.61 0 0 1 5 4.17c3.73-2.94 5.1-7.74 5.1-10.7 0-2.34-1.57-1.6-4.09-.36l-.14.07c-2.31 1.15-5.39 2.67-8.77 2.67s-6.45-1.52-8.77-2.67c-2.6-1.29-4.23-2.1-4.23.29 0 3.05 1.46 8.06 5.47 10.97Z"
clip-rule="evenodd"
/>
<path
fill="#FF9D0B"
d="M70.71 37a3.25 3.25 0 1 0 0-6.5 3.25 3.25 0 0 0 0 6.5ZM24.21 37a3.25 3.25 0 1 0 0-6.5 3.25 3.25 0 0 0 0 6.5ZM17.52 48c-1.62 0-3.06.66-4.07 1.87a5.97 5.97 0 0 0-1.33 3.76 7.1 7.1 0 0 0-1.94-.3c-1.55 0-2.95.59-3.94 1.66a5.8 5.8 0 0 0-.8 7 5.3 5.3 0 0 0-1.79 2.82c-.24.9-.48 2.8.8 4.74a5.22 5.22 0 0 0-.37 5.02c1.02 2.32 3.57 4.14 8.52 6.1 3.07 1.22 5.89 2 5.91 2.01a44.33 44.33 0 0 0 10.93 1.6c5.86 0 10.05-1.8 12.46-5.34 3.88-5.69 3.33-10.9-1.7-15.92-2.77-2.78-4.62-6.87-5-7.77-.78-2.66-2.84-5.62-6.25-5.62a5.7 5.7 0 0 0-4.6 2.46c-1-1.26-1.98-2.25-2.86-2.82A7.4 7.4 0 0 0 17.52 48Zm0 4c.51 0 1.14.22 1.82.65 2.14 1.36 6.25 8.43 7.76 11.18.5.92 1.37 1.31 2.14 1.31 1.55 0 2.75-1.53.15-3.48-3.92-2.93-2.55-7.72-.68-8.01.08-.02.17-.02.24-.02 1.7 0 2.45 2.93 2.45 2.93s2.2 5.52 5.98 9.3c3.77 3.77 3.97 6.8 1.22 10.83-1.88 2.75-5.47 3.58-9.16 3.58-3.81 0-7.73-.9-9.92-1.46-.11-.03-13.45-3.8-11.76-7 .28-.54.75-.76 1.34-.76 2.38 0 6.7 3.54 8.57 3.54.41 0 .7-.17.83-.6.79-2.85-12.06-4.05-10.98-8.17.2-.73.71-1.02 1.44-1.02 3.14 0 10.2 5.53 11.68 5.53.11 0 .2-.03.24-.1.74-1.2.33-2.04-4.9-5.2-5.21-3.16-8.88-5.06-6.8-7.33.24-.26.58-.38 1-.38 3.17 0 10.66 6.82 10.66 6.82s2.02 2.1 3.25 2.1c.28 0 .52-.1.68-.38.86-1.46-8.06-8.22-8.56-11.01-.34-1.9.24-2.85 1.31-2.85Z"
/>
<path
fill="#FFD21E"
d="M38.6 76.69c2.75-4.04 2.55-7.07-1.22-10.84-3.78-3.77-5.98-9.3-5.98-9.3s-.82-3.2-2.69-2.9c-1.87.3-3.24 5.08.68 8.01 3.91 2.93-.78 4.92-2.29 2.17-1.5-2.75-5.62-9.82-7.76-11.18-2.13-1.35-3.63-.6-3.13 2.2.5 2.79 9.43 9.55 8.56 11-.87 1.47-3.93-1.71-3.93-1.71s-9.57-8.71-11.66-6.44c-2.08 2.27 1.59 4.17 6.8 7.33 5.23 3.16 5.64 4 4.9 5.2-.75 1.2-12.28-8.53-13.36-4.4-1.08 4.11 11.77 5.3 10.98 8.15-.8 2.85-9.06-5.38-10.74-2.18-1.7 3.21 11.65 6.98 11.76 7.01 4.3 1.12 15.25 3.49 19.08-2.12Z"
/>
<path
fill="#FF9D0B"
d="M77.4 48c1.62 0 3.07.66 4.07 1.87a5.97 5.97 0 0 1 1.33 3.76 7.1 7.1 0 0 1 1.95-.3c1.55 0 2.95.59 3.94 1.66a5.8 5.8 0 0 1 .8 7 5.3 5.3 0 0 1 1.78 2.82c.24.9.48 2.8-.8 4.74a5.22 5.22 0 0 1 .37 5.02c-1.02 2.32-3.57 4.14-8.51 6.1-3.08 1.22-5.9 2-5.92 2.01a44.33 44.33 0 0 1-10.93 1.6c-5.86 0-10.05-1.8-12.46-5.34-3.88-5.69-3.33-10.9 1.7-15.92 2.78-2.78 4.63-6.87 5.01-7.77.78-2.66 2.83-5.62 6.24-5.62a5.7 5.7 0 0 1 4.6 2.46c1-1.26 1.98-2.25 2.87-2.82A7.4 7.4 0 0 1 77.4 48Zm0 4c-.51 0-1.13.22-1.82.65-2.13 1.36-6.25 8.43-7.76 11.18a2.43 2.43 0 0 1-2.14 1.31c-1.54 0-2.75-1.53-.14-3.48 3.91-2.93 2.54-7.72.67-8.01a1.54 1.54 0 0 0-.24-.02c-1.7 0-2.45 2.93-2.45 2.93s-2.2 5.52-5.97 9.3c-3.78 3.77-3.98 6.8-1.22 10.83 1.87 2.75 5.47 3.58 9.15 3.58 3.82 0 7.73-.9 9.93-1.46.1-.03 13.45-3.8 11.76-7-.29-.54-.75-.76-1.34-.76-2.38 0-6.71 3.54-8.57 3.54-.42 0-.71-.17-.83-.6-.8-2.85 12.05-4.05 10.97-8.17-.19-.73-.7-1.02-1.44-1.02-3.14 0-10.2 5.53-11.68 5.53-.1 0-.19-.03-.23-.1-.74-1.2-.34-2.04 4.88-5.2 5.23-3.16 8.9-5.06 6.8-7.33-.23-.26-.57-.38-.98-.38-3.18 0-10.67 6.82-10.67 6.82s-2.02 2.1-3.24 2.1a.74.74 0 0 1-.68-.38c-.87-1.46 8.05-8.22 8.55-11.01.34-1.9-.24-2.85-1.31-2.85Z"
/>
<path
fill="#FFD21E"
d="M56.33 76.69c-2.75-4.04-2.56-7.07 1.22-10.84 3.77-3.77 5.97-9.3 5.97-9.3s.82-3.2 2.7-2.9c1.86.3 3.23 5.08-.68 8.01-3.92 2.93.78 4.92 2.28 2.17 1.51-2.75 5.63-9.82 7.76-11.18 2.13-1.35 3.64-.6 3.13 2.2-.5 2.79-9.42 9.55-8.55 11 .86 1.47 3.92-1.71 3.92-1.71s9.58-8.71 11.66-6.44c2.08 2.27-1.58 4.17-6.8 7.33-5.23 3.16-5.63 4-4.9 5.2.75 1.2 12.28-8.53 13.36-4.4 1.08 4.11-11.76 5.3-10.97 8.15.8 2.85 9.05-5.38 10.74-2.18 1.69 3.21-11.65 6.98-11.76 7.01-4.31 1.12-15.26 3.49-19.08-2.12Z"
/>
</svg>

After

Width:  |  Height:  |  Size: 5.1 KiB

View file

@ -0,0 +1,73 @@
use clap::{Args, Subcommand};
use crate::cli::commands::output::OutputArgs;
use super::github::GitHubOutputFormat;
/// Top-level Hugging Face command group
// NOTE: clap's derive macros surface `///` comments as help text, so
// maintainer-only notes on these CLI types are written as plain `//` comments.
#[derive(Args, Debug)]
pub struct HuggingFaceArgs {
// The subcommand chosen under this command group.
#[command(subcommand)]
pub command: HuggingFaceCommand,
}
// Subcommands available directly under the Hugging Face command group.
// Only repository operations exist; they are nested one level further down.
#[derive(Subcommand, Debug)]
pub enum HuggingFaceCommand {
/// Interact with Hugging Face repositories
#[command(subcommand)]
Repos(HuggingFaceReposCommand),
}
// Repository-level operations; `List` carries the selectors and output options.
#[derive(Subcommand, Debug)]
pub enum HuggingFaceReposCommand {
/// List Hugging Face repositories
List(HuggingFaceReposListArgs),
}
// Arguments of the repository listing command. Both fields are flattened, so
// their options appear directly on the command rather than under a sub-group.
#[derive(Args, Debug, Clone)]
pub struct HuggingFaceReposListArgs {
// Which users, organizations, or individual repositories to enumerate.
#[command(flatten)]
pub repo_specifiers: HuggingFaceRepoSpecifiers,
// Output options, parameterised by the Hugging Face output format type.
#[command(flatten)]
pub output_args: OutputArgs<HuggingFaceOutputFormat>,
}
// Selectors describing which Hugging Face repositories to enumerate.
// Every option is a `Vec`, so each flag may be given more than once.
// `exclude` only filters results; `is_empty` below deliberately ignores it.
#[derive(Args, Debug, Clone, Default)]
pub struct HuggingFaceRepoSpecifiers {
/// Models, datasets, and Spaces owned by these users
#[arg(long = "huggingface-user")]
pub user: Vec<String>,
/// Models, datasets, and Spaces owned by these organizations
// `--huggingface-org` is accepted as a hidden alias.
#[arg(long = "huggingface-organization", alias = "huggingface-org")]
pub organization: Vec<String>,
/// Specific models to scan (format: owner/name or full URL)
#[arg(long = "huggingface-model")]
pub model: Vec<String>,
/// Specific datasets to scan (format: owner/name or full URL)
#[arg(long = "huggingface-dataset")]
pub dataset: Vec<String>,
/// Specific Spaces to scan (format: owner/name or full URL)
#[arg(long = "huggingface-space")]
pub space: Vec<String>,
/// Skip specific repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
#[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
pub exclude: Vec<String>,
}
impl HuggingFaceRepoSpecifiers {
    /// Returns `true` when no user, organization, model, dataset, or Space
    /// selector was supplied.
    ///
    /// The `exclude` list is not consulted: exclusions alone select nothing.
    pub fn is_empty(&self) -> bool {
        let selectors =
            [&self.user, &self.organization, &self.model, &self.dataset, &self.space];
        selectors.iter().all(|list| list.is_empty())
    }
}
// Hugging Face listings reuse the GitHub output-format type through this alias.
pub type HuggingFaceOutputFormat = GitHubOutputFormat;

View file

@ -28,6 +28,11 @@ pub struct InputSpecifierArgs {
"gitlab_group",
"gitea_user",
"gitea_organization",
"huggingface_user",
"huggingface_organization",
"huggingface_model",
"huggingface_dataset",
"huggingface_space",
"bitbucket_user",
"bitbucket_workspace",
"bitbucket_project",
@ -43,7 +48,8 @@ pub struct InputSpecifierArgs {
"confluence_url",
"docker_image",
"slack_query",
"s3_bucket"
"s3_bucket",
"gcs_bucket"
]),
num_args = 0..,
value_hint = ValueHint::AnyPath
@ -120,6 +126,30 @@ pub struct InputSpecifierArgs {
#[arg(long, alias = "include-subgroups")]
pub gitlab_include_subgroups: bool,
/// Scan models, datasets, and Spaces belonging to the specified Hugging Face users
#[arg(long = "huggingface-user")]
pub huggingface_user: Vec<String>,
/// Scan models, datasets, and Spaces belonging to the specified Hugging Face organizations
#[arg(long = "huggingface-organization", alias = "huggingface-org")]
pub huggingface_organization: Vec<String>,
/// Scan a specific Hugging Face model (format: owner/name or full URL)
#[arg(long = "huggingface-model")]
pub huggingface_model: Vec<String>,
/// Scan a specific Hugging Face dataset (format: owner/name or full URL)
#[arg(long = "huggingface-dataset")]
pub huggingface_dataset: Vec<String>,
/// Scan a specific Hugging Face Space (format: owner/name or full URL)
#[arg(long = "huggingface-space")]
pub huggingface_space: Vec<String>,
/// Skip specific Hugging Face repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
#[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
pub huggingface_exclude: Vec<String>,
// Gitea Options
/// Scan repositories belonging to the specified Gitea user
#[arg(long)]
@ -256,6 +286,18 @@ pub struct InputSpecifierArgs {
#[arg(long, requires = "s3_bucket")]
pub aws_local_profile: Option<String>,
/// Scan the specified Google Cloud Storage bucket
#[arg(long)]
pub gcs_bucket: Option<String>,
/// Optional prefix within the GCS bucket
#[arg(long, requires = "gcs_bucket")]
pub gcs_prefix: Option<String>,
/// Path to a service account JSON file for GCS authentication
#[arg(long, value_hint = ValueHint::FilePath, requires = "gcs_bucket")]
pub gcs_service_account: Option<PathBuf>,
/// Docker/OCI images to scan (no local Docker required)
#[arg(long = "docker-image")]
pub docker_image: Vec<String>,
@ -299,7 +341,6 @@ pub struct ContentFilteringArgs {
#[arg(
long = "max-file-size",
visible_alias = "max-filesize", // also show in --help
// alias = "max-filesize", // use this instead if you DONT want it shown in --help
default_value_t = 256.0,
value_name = "MB"
)]

View file

@ -3,6 +3,7 @@ pub mod bitbucket;
pub mod gitea;
pub mod github;
pub mod gitlab;
pub mod huggingface;
pub mod inputs;
pub mod output;
pub mod rules;

View file

@ -1,4 +1,5 @@
use clap::{Args, ValueEnum};
use std::path::PathBuf;
use strum::Display;
use tracing::debug;
@ -119,6 +120,14 @@ pub struct ScanArgs {
#[arg(long = "skip-word", value_name = "WORD")]
pub skip_word: Vec<String>,
/// AWS account IDs whose findings should skip live credential validation (repeatable)
#[arg(long = "skip-aws-account", value_name = "ACCOUNT_ID", value_delimiter = ',')]
pub skip_aws_account: Vec<String>,
/// File containing AWS account IDs to skip (one per line, `#` comments ignored)
#[arg(long = "skip-aws-account-file", value_name = "FILE")]
pub skip_aws_account_file: Option<PathBuf>,
/// Additional inline ignore directives to recognise (repeatable)
#[arg(long = "ignore-comment", value_name = "DIRECTIVE")]
pub extra_ignore_comments: Vec<String>,

View file

@ -8,7 +8,7 @@ use tracing::Level;
use crate::cli::commands::{
azure::AzureArgs, bitbucket::BitbucketArgs, gitea::GiteaArgs, github::GitHubArgs,
gitlab::GitLabArgs, rules::RulesArgs, scan::ScanArgs,
gitlab::GitLabArgs, huggingface::HuggingFaceArgs, rules::RulesArgs, scan::ScanArgs,
};
#[deny(missing_docs)]
@ -81,6 +81,10 @@ pub enum Command {
#[command(name = "azure")]
Azure(AzureArgs),
/// Interact with the Hugging Face Hub
#[command(name = "huggingface")]
HuggingFace(HuggingFaceArgs),
/// Manage rules
#[command(alias = "rule")]
Rules(RulesArgs),

87
src/gcs.rs Normal file
View file

@ -0,0 +1,87 @@
use std::path::Path;
use anyhow::{Context, Result};
use gcloud_storage::{
client::{Client, ClientConfig},
http::objects::{
download::Range,
get::GetObjectRequest,
list::{ListObjectsRequest, ListObjectsResponse},
},
};
use tracing::debug;
/// Visit every object in the given GCS bucket, optionally filtered by prefix.
///
/// Each object is downloaded fully into memory and handed to `visitor` as
/// `(object_name, bytes)`. Listing is paginated and continues until the
/// service stops returning a non-empty next-page token.
///
/// # Authentication
///
/// When `service_account_path` is given, it is exported through the
/// process-wide `GOOGLE_APPLICATION_CREDENTIALS` environment variable before
/// the client is configured. If authenticated configuration fails, the
/// function falls back to anonymous access (so public buckets can still be
/// scanned) only when no service account path was passed *and*
/// `GOOGLE_APPLICATION_CREDENTIALS` is not set; otherwise the failure is
/// returned.
///
/// # Errors
///
/// Returns an error if authentication with explicit credentials fails, if a
/// listing or download request fails, or if `visitor` returns an error. The
/// first error aborts the traversal.
pub async fn visit_bucket_objects<F>(
bucket: &str,
prefix: Option<&str>,
service_account_path: Option<&Path>,
mut visitor: F,
) -> Result<()>
where
F: FnMut(String, Vec<u8>) -> Result<()>,
{
// NOTE(review): this mutates the environment of the whole process, not just
// this call; it stays set after the function returns.
if let Some(path) = service_account_path {
std::env::set_var("GOOGLE_APPLICATION_CREDENTIALS", path);
}
// Presumably resolves Application Default Credentials (including the variable
// set above) -- confirm against the client crate's documentation.
let config = match ClientConfig::default().with_auth().await {
Ok(config) => config,
Err(err) => {
// Credentials were explicitly requested (argument or environment), so a
// silent downgrade to anonymous access would hide a misconfiguration.
if service_account_path.is_some()
|| std::env::var("GOOGLE_APPLICATION_CREDENTIALS").is_ok()
{
return Err(err)
.context("Failed to authenticate with GCS using provided credentials");
}
debug!("Falling back to anonymous GCS access: {err}");
ClientConfig::default().anonymous()
}
};
let client = Client::new(config);
// `None` requests the first page; later iterations carry the token returned
// by the previous listing response.
let mut page_token: Option<String> = None;
loop {
let request = ListObjectsRequest {
bucket: bucket.to_string(),
prefix: prefix.map(|p| p.to_string()),
page_token: page_token.clone(),
..ListObjectsRequest::default()
};
let mut response: ListObjectsResponse = client
.list_objects(&request)
.await
.with_context(|| format!("Failed to list objects in bucket {bucket}"))?;
if let Some(items) = response.items.take() {
// Entries with an empty name are skipped.
for object in items.into_iter().filter(|o| !o.name.is_empty()) {
// `Range::default()` presumably requests the entire object -- TODO confirm.
let data = client
.download_object(
&GetObjectRequest {
bucket: bucket.to_string(),
object: object.name.clone(),
..GetObjectRequest::default()
},
&Range::default(),
)
.await
.with_context(|| format!("Failed to fetch object {}", object.name))?;
visitor(object.name, data)?;
}
}
// A missing or empty token ends the listing.
match response.next_page_token {
Some(token) if !token.is_empty() => page_token = Some(token),
_ => break,
}
}
Ok(())
}

View file

@ -40,6 +40,15 @@ const AZURE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_azcreds() {
fi
}; _azcreds"#;
// Inline git `credential.helper` definition for Hugging Face remotes. The
// embedded shell function prints `username=`/`password=` lines only when
// `KF_HUGGINGFACE_TOKEN` is non-empty; the username comes from
// `KF_HUGGINGFACE_USERNAME` and defaults to `hf_user`.
const HUGGINGFACE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_hfcreds() {
token="$KF_HUGGINGFACE_TOKEN";
if [ -n "$token" ]; then
user="${KF_HUGGINGFACE_USERNAME:-hf_user}";
echo username="$user";
echo password="$token";
fi
}; _hfcreds"#;
/// Represents errors that can occur when interacting with the `git` CLI.
#[derive(Debug, thiserror::Error)]
pub enum GitError {
@ -47,14 +56,37 @@ pub enum GitError {
IOError(#[from] std::io::Error),
#[error(
"git execution failed\ncode={}\nstdout=```\n{}```\nstderr=```\n{}```",
.status,
String::from_utf8_lossy(.stdout),
String::from_utf8_lossy(.stderr)
"git execution failed (status: {status}){summary}",
status = format_exit_status(.status),
summary = format_git_error_summary(.stdout.as_slice(), .stderr.as_slice())
)]
GitError { stdout: Vec<u8>, stderr: Vec<u8>, status: ExitStatus },
}
/// Renders an exit status for error messages.
///
/// Yields the bare numeric exit code when the process reported one; otherwise
/// falls back to the status's own `Display` text.
fn format_exit_status(status: &ExitStatus) -> String {
    match status.code() {
        Some(code) => code.to_string(),
        None => status.to_string(),
    }
}
/// Builds the optional suffix appended to a git failure message.
///
/// The first non-blank line of `stderr` and then of `stdout` are joined with
/// `" | "` and prefixed with `": "`. When neither stream has a non-blank line
/// the result is an empty string.
fn format_git_error_summary(stdout: &[u8], stderr: &[u8]) -> String {
    // stderr is listed first so that it leads the summary.
    let highlights: Vec<String> =
        [stderr, stdout].iter().filter_map(|stream| summarize_output(stream)).collect();
    if highlights.is_empty() {
        return String::new();
    }
    format!(": {}", highlights.join(" | "))
}

/// Returns the first line of `output` that is non-empty after trimming
/// whitespace, decoding the bytes lossily as UTF-8.
fn summarize_output(output: &[u8]) -> Option<String> {
    let decoded = String::from_utf8_lossy(output);
    for line in decoded.lines() {
        let line = line.trim();
        if !line.is_empty() {
            return Some(line.to_owned());
        }
    }
    None
}
/// A helper struct for running `git` commands.
///
/// It supports optional GitHub, GitLab, Gitea, and Bitbucket credentials passed via
@ -91,6 +123,8 @@ impl Git {
let has_azure_token = ["KF_AZURE_TOKEN", "KF_AZURE_PAT"]
.iter()
.any(|key| matches!(std::env::var(key), Ok(value) if !value.is_empty()));
let has_huggingface_token =
matches!(std::env::var("KF_HUGGINGFACE_TOKEN"), Ok(value) if !value.is_empty());
// If credentials are provided via environment variables, clear existing helpers first.
if has_github_token
@ -98,6 +132,7 @@ impl Git {
|| has_gitea_token
|| has_bitbucket_credentials
|| has_azure_token
|| has_huggingface_token
{
credentials.push("-c".into());
credentials.push(r#"credential.helper="#.into());
@ -136,6 +171,11 @@ impl Git {
credentials.push(AZURE_CREDENTIAL_HELPER.into());
}
if has_huggingface_token {
credentials.push("-c".into());
credentials.push(HUGGINGFACE_CREDENTIAL_HELPER.into());
}
Self { credentials, ignore_certs }
}

636
src/huggingface.rs Normal file
View file

@ -0,0 +1,636 @@
use std::{collections::HashSet, env, time::Duration};
use anyhow::{anyhow, Result};
use indicatif::{ProgressBar, ProgressStyle};
use reqwest::{header::LINK, StatusCode, Url};
use serde::Deserialize;
use serde_json::Value;
use tracing::{debug, warn};
use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT};
/// Selectors describing which Hugging Face repositories should be enumerated.
#[derive(Debug, Clone, Default)]
pub struct RepoSpecifiers {
    /// Owners treated as users.
    pub user: Vec<String>,
    /// Owners treated as organizations.
    pub organization: Vec<String>,
    /// Explicit model identifiers.
    pub model: Vec<String>,
    /// Explicit dataset identifiers.
    pub dataset: Vec<String>,
    /// Explicit Space identifiers.
    pub space: Vec<String>,
    /// Identifiers to drop from the enumerated result.
    pub exclude: Vec<String>,
}

impl RepoSpecifiers {
    /// Returns `true` when no user, organization, model, dataset, or Space
    /// selector is present. The `exclude` list is not taken into account.
    pub fn is_empty(&self) -> bool {
        let selectors =
            [&self.user, &self.organization, &self.model, &self.dataset, &self.space];
        selectors.iter().all(|list| list.is_empty())
    }
}
/// Hugging Face authentication settings.
#[derive(Clone, Default)]
pub struct AuthConfig {
    /// Access token, if one is configured.
    token: Option<String>,
}

/// Hand-written `Debug` so the full token never reaches logs: only its first
/// four characters are shown.
impl std::fmt::Debug for AuthConfig {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Collect straight into a `String`; wrapping the result in `format!`
        // would only add an extra allocation.
        let token_prefix: Option<String> =
            self.token.as_deref().map(|token| token.chars().take(4).collect());
        f.debug_struct("AuthConfig").field("token", &token_prefix).finish()
    }
}
impl AuthConfig {
    /// Reads the token from the `KF_HUGGINGFACE_TOKEN` environment variable.
    ///
    /// A missing, unreadable, or whitespace-only value yields a configuration
    /// without a token.
    pub fn from_env() -> Self {
        let token = match env::var("KF_HUGGINGFACE_TOKEN") {
            Ok(value) if !value.trim().is_empty() => Some(value),
            _ => None,
        };
        Self { token }
    }

    /// Adds bearer authentication to `request` when a token is configured;
    /// otherwise returns the request untouched.
    fn apply(&self, request: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
        match &self.token {
            Some(token) => request.bearer_auth(token),
            None => request,
        }
    }

    /// Reports whether a token is configured.
    fn has_token(&self) -> bool {
        self.token.is_some()
    }
}
/// The three repository flavours hosted on the Hugging Face Hub.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
enum ResourceKind {
    Model,
    Dataset,
    Space,
}

impl ResourceKind {
    /// Path component appended to the API base URL when listing this kind.
    fn api_path(self) -> &'static str {
        match self {
            Self::Model => "models",
            Self::Dataset => "datasets",
            Self::Space => "spaces",
        }
    }

    /// Git clone URL for `slug`. Models live at the site root; datasets and
    /// Spaces are namespaced under `datasets/` and `spaces/`.
    fn git_url(self, slug: &str) -> String {
        let namespace = match self {
            Self::Model => "",
            Self::Dataset => "datasets/",
            Self::Space => "spaces/",
        };
        format!("https://huggingface.co/{namespace}{slug}.git")
    }

    /// Prefix used when building canonical `kind:slug` keys.
    fn canonical_prefix(self) -> &'static str {
        match self {
            Self::Model => "model",
            Self::Dataset => "dataset",
            Self::Space => "space",
        }
    }

    /// Singular name used in log messages.
    fn display_name_singular(self) -> &'static str {
        match self {
            Self::Model => "model",
            Self::Dataset => "dataset",
            Self::Space => "space",
        }
    }

    /// Plural name used in log and progress messages.
    fn display_name_plural(self) -> &'static str {
        match self {
            Self::Model => "models",
            Self::Dataset => "datasets",
            Self::Space => "spaces",
        }
    }
}
/// A single Hugging Face repository: its kind plus an `owner/name` slug.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
struct ResourceRef {
    kind: ResourceKind,
    slug: String,
}

impl ResourceRef {
    /// Pairs a kind with a slug.
    fn new(kind: ResourceKind, slug: String) -> Self {
        ResourceRef { kind, slug }
    }

    /// Case-insensitive identity of the form `kind:owner/name`, used to
    /// de-duplicate resources.
    fn canonical_key(&self) -> String {
        let prefix = self.kind.canonical_prefix();
        let slug = self.slug.to_lowercase();
        format!("{prefix}:{slug}")
    }

    /// Git clone URL for this repository.
    fn git_url(&self) -> String {
        self.kind.git_url(self.slug.as_str())
    }
}
/// One entry of a Hugging Face listing response.
///
/// Untagged: an entry is accepted if it carries an `id` field, or failing
/// that a `modelId` field. Variants are tried in declaration order.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum HuggingFaceItem {
    Id {
        id: String,
    },
    ModelId {
        #[serde(rename = "modelId")]
        model_id: String,
    },
}

impl HuggingFaceItem {
    /// Consumes the item and returns whichever identifier it carried.
    fn into_identifier(self) -> String {
        let (HuggingFaceItem::Id { id: identifier }
        | HuggingFaceItem::ModelId { model_id: identifier }) = self;
        identifier
    }
}
#[derive(Default)]
struct ExcludeSet {
typed: HashSet<String>,
untyped: HashSet<String>,
}
impl ExcludeSet {
fn from_list(values: &[String]) -> Self {
let mut typed = HashSet::new();
let mut untyped = HashSet::new();
for raw in values {
let trimmed = raw.trim();
if trimmed.is_empty() {
continue;
}
if let Some((prefix, rest)) = trimmed.split_once(':') {
match normalize_kind(prefix) {
Some(kind) => {
if let Some(slug) = parse_slug_for_kind(kind, rest) {
typed.insert(format!(
"{}:{}",
kind.canonical_prefix(),
slug.to_lowercase()
));
} else {
warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
}
}
None => warn!("Ignoring invalid Hugging Face exclusion '{raw}' (unknown type)"),
}
} else if let Some(slug) = normalize_untyped_slug(trimmed) {
untyped.insert(slug);
} else {
warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
}
}
Self { typed, untyped }
}
fn should_exclude(&self, kind: ResourceKind, slug: &str) -> bool {
let typed_key = format!("{}:{}", kind.canonical_prefix(), slug.to_lowercase());
if self.typed.contains(&typed_key) {
return true;
}
self.untyped.contains(&slug.to_lowercase())
}
}
/// Maps a user-supplied kind name to a [`ResourceKind`].
///
/// Singular and plural spellings are accepted, ignoring ASCII case and
/// surrounding whitespace. Anything else yields `None`.
fn normalize_kind(raw: &str) -> Option<ResourceKind> {
    let candidate = raw.trim();
    let is_one_of = |names: [&str; 2]| names.iter().any(|name| candidate.eq_ignore_ascii_case(name));
    if is_one_of(["model", "models"]) {
        Some(ResourceKind::Model)
    } else if is_one_of(["dataset", "datasets"]) {
        Some(ResourceKind::Dataset)
    } else if is_one_of(["space", "spaces"]) {
        Some(ResourceKind::Space)
    } else {
        None
    }
}
/// Normalises a kind-agnostic repository identifier to a lower-cased
/// `owner/name` slug, or `None` when it does not name both parts.
fn normalize_untyped_slug(raw: &str) -> Option<String> {
    // Blank input produces no segments, which the helper rejects.
    let segments: Vec<&str> =
        raw.trim().split('/').filter(|segment| !segment.is_empty()).collect();
    normalize_untyped_segments(&segments)
}

/// Builds a lower-cased `owner/name` slug from path segments.
///
/// A leading kind segment (`model(s)`, `dataset(s)`, `space(s)`, any ASCII
/// case) is dropped. The first remaining segment is the owner; all later
/// segments are joined with `/` to form the name, from which trailing `.git`
/// is removed. Returns `None` when fewer than two segments remain or when the
/// owner or name is empty after trimming.
fn normalize_untyped_segments(segments: &[&str]) -> Option<String> {
    const KIND_SEGMENTS: [&str; 6] =
        ["models", "model", "datasets", "dataset", "spaces", "space"];

    let remaining = match segments.split_first() {
        Some((first, tail))
            if KIND_SEGMENTS.iter().any(|kind| first.trim().eq_ignore_ascii_case(kind)) =>
        {
            tail
        }
        _ => segments,
    };

    let (owner, name_segments) = remaining.split_first()?;
    if name_segments.is_empty() {
        return None;
    }

    let owner = owner.trim();
    let joined = name_segments.join("/");
    let name = joined.trim_end_matches(".git").trim();
    if owner.is_empty() || name.is_empty() {
        return None;
    }
    Some(format!("{}/{}", owner, name).to_lowercase())
}
/// Extracts an `owner/name` slug for `kind` from a plain identifier or from
/// an `http://` / `https://` URL.
///
/// Returns `None` for blank input, an unparsable URL, or an identifier that
/// does not contain both an owner and a name.
fn parse_slug_for_kind(kind: ResourceKind, raw: &str) -> Option<String> {
    let candidate = raw.trim();
    if candidate.is_empty() {
        return None;
    }

    let is_url = candidate.starts_with("http://") || candidate.starts_with("https://");
    if !is_url {
        let segments: Vec<&str> = candidate.split('/').filter(|s| !s.is_empty()).collect();
        return parse_slug_segments(kind, &segments);
    }

    // For URLs only the path matters; empty path segments are dropped.
    let url = Url::parse(candidate).ok()?;
    let segments: Vec<&str> = match url.path_segments() {
        Some(parts) => parts.filter(|s| !s.is_empty()).collect(),
        None => Vec::new(),
    };
    parse_slug_segments(kind, &segments)
}
/// Builds an `owner/name` slug from path segments for a specific `kind`.
///
/// A leading segment naming that same kind (singular or plural, any ASCII
/// case) is dropped. The first remaining segment is the owner; the rest are
/// joined with `/` into the name, with trailing `.git` removed. The original
/// letter case is preserved. Returns `None` when fewer than two segments
/// remain or when the owner or name is empty after trimming.
fn parse_slug_segments(kind: ResourceKind, segments: &[&str]) -> Option<String> {
    let kind_aliases: [&str; 2] = match kind {
        ResourceKind::Model => ["models", "model"],
        ResourceKind::Dataset => ["datasets", "dataset"],
        ResourceKind::Space => ["spaces", "space"],
    };

    let remaining = match segments.split_first() {
        Some((first, tail))
            if kind_aliases.iter().any(|alias| first.trim().eq_ignore_ascii_case(alias)) =>
        {
            tail
        }
        _ => segments,
    };

    let (owner, name_segments) = remaining.split_first()?;
    if name_segments.is_empty() {
        return None;
    }

    let owner = owner.trim();
    let joined = name_segments.join("/");
    let name = joined.trim_end_matches(".git").trim();
    if owner.is_empty() || name.is_empty() {
        return None;
    }
    Some(format!("{owner}/{name}"))
}
/// Finds the `rel="next"` target in an HTTP `Link` header value.
///
/// The header is split on commas; the first entry whose parameters contain
/// `rel="next"` and whose target parses as a URL is returned. Entries without
/// a closing `>` or with an unparsable target are skipped.
fn parse_next_link(value: &str) -> Option<Url> {
    for entry in value.split(',') {
        if let Some((target, params)) = entry.trim().split_once('>') {
            if !params.contains("rel=\"next\"") {
                continue;
            }
            let target = target.trim_start_matches('<').trim();
            if let Ok(url) = Url::parse(target) {
                return Some(url);
            }
        }
    }
    None
}
const BODY_SNIPPET_LIMIT: usize = 200;
async fn fetch_paginated(
client: &reqwest::Client,
mut current_url: Url,
auth: &AuthConfig,
context: &str,
) -> Result<Vec<HuggingFaceItem>> {
let mut items = Vec::new();
loop {
let mut request =
client.get(current_url.clone()).header("User-Agent", GLOBAL_USER_AGENT.as_str());
request = auth.apply(request);
let response = request.send().await?;
let status = response.status();
let link_header = response
.headers()
.get(LINK)
.and_then(|value| value.to_str().ok())
.map(|value| value.to_string());
if !status.is_success() {
let body = response.text().await.unwrap_or_default();
let mut message = format!(
"Hugging Face API request failed while enumerating {context} ({status}): {body}"
);
if matches!(status, StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN)
&& !auth.has_token()
{
message.push_str(
"\nProvide a Hugging Face access token via the KF_HUGGINGFACE_TOKEN environment variable.",
);
}
return Err(anyhow!(message));
}
let body = response.bytes().await?;
let value: Value = serde_json::from_slice(&body).map_err(|err| {
let snippet = body_snippet(&body);
anyhow!(
"Failed to parse Hugging Face response while enumerating {context}: {err}. Body snippet: {snippet}",
context = context,
err = err,
snippet = snippet
)
})?;
let array = value.as_array().ok_or_else(|| {
let snippet = body_snippet(&body);
anyhow!(
"Unexpected Hugging Face response format while enumerating {context} (expected array). Body snippet: {snippet}",
context = context,
snippet = snippet
)
})?;
let mut page = Vec::new();
for (index, element) in array.iter().enumerate() {
match serde_json::from_value::<HuggingFaceItem>(element.clone()) {
Ok(item) => page.push(item),
Err(err) => {
let snippet = value_snippet(element);
warn!(
"Skipping Hugging Face item at index {index} while enumerating {context}: {err}. Item snippet: {snippet}"
);
}
}
}
items.append(&mut page);
if let Some(link_value) = link_header {
if let Some(next_url) = parse_next_link(&link_value) {
current_url = next_url;
continue;
}
}
break;
}
Ok(items)
}
/// Lossily decodes `body` as UTF-8 and truncates it for use in messages.
fn body_snippet(body: &[u8]) -> String {
    let decoded = String::from_utf8_lossy(body);
    truncate_for_display(decoded.as_ref(), BODY_SNIPPET_LIMIT)
}

/// Serialises a JSON value and truncates it for use in messages.
fn value_snippet(value: &Value) -> String {
    truncate_for_display(value.to_string().as_str(), BODY_SNIPPET_LIMIT)
}
/// Returns at most `limit` characters of `text`, appending `…` when anything
/// was cut off.
///
/// Counting is by `char`, so multi-byte characters are never split.
fn truncate_for_display(text: &str, limit: usize) -> String {
    let mut chars = text.chars();
    let mut snippet: String = chars.by_ref().take(limit).collect();
    // One more character means the text was longer than `limit`; peeking at it
    // avoids walking the rest of a potentially large body just to count it.
    if chars.next().is_some() {
        snippet.push('…');
    }
    snippet
}
async fn fetch_resources_for_owner(
client: &reqwest::Client,
base_url: &Url,
owner: &str,
label: &str,
auth: &AuthConfig,
progress: Option<&ProgressBar>,
) -> Result<Vec<ResourceRef>> {
let mut resources = Vec::new();
for kind in [ResourceKind::Model, ResourceKind::Dataset, ResourceKind::Space] {
if let Some(pb) = progress {
pb.set_message(format!(
"Enumerating Hugging Face {label} {}",
kind.display_name_plural()
));
}
let mut url = base_url.join(kind.api_path())?;
{
let mut pairs = url.query_pairs_mut();
pairs.append_pair("author", owner);
pairs.append_pair("limit", "100");
}
let context = format!("{} for {label}", kind.display_name_plural());
match fetch_paginated(client, url, auth, &context).await {
Ok(items) => {
for item in items {
let identifier = item.into_identifier();
if let Some(slug) = parse_slug_for_kind(kind, &identifier) {
resources.push(ResourceRef::new(kind, slug));
} else {
warn!(
"Skipping Hugging Face {} with unexpected identifier '{}'",
kind.display_name_singular(),
identifier
);
}
}
}
Err(err) => {
warn!(
"Failed to enumerate Hugging Face {} for {label}: {err}",
kind.display_name_plural()
);
}
}
}
Ok(resources)
}
/// Appends the explicitly named models, datasets, and Spaces from
/// `specifiers` to `resources`, in that order.
///
/// Identifiers that do not parse to an `owner/name` slug are skipped with a
/// warning.
fn append_explicit_resources(specifiers: &RepoSpecifiers, resources: &mut Vec<ResourceRef>) {
    // One table instead of three copy-pasted loops; the order fixes the order
    // in which resources are appended.
    let explicit = [
        (ResourceKind::Model, &specifiers.model),
        (ResourceKind::Dataset, &specifiers.dataset),
        (ResourceKind::Space, &specifiers.space),
    ];

    for (kind, identifiers) in explicit {
        for raw in identifiers {
            match parse_slug_for_kind(kind, raw) {
                Some(slug) => resources.push(ResourceRef::new(kind, slug)),
                None => warn!(
                    "Ignoring invalid Hugging Face {} identifier '{raw}'",
                    kind.display_name_singular()
                ),
            }
        }
    }
}
/// Resolves `specifiers` to a sorted, de-duplicated list of git clone URLs.
///
/// Repositories of every listed user and organization are enumerated through
/// the Hugging Face API, explicitly named models, datasets, and Spaces are
/// added, and anything matching an exclusion is dropped. Failure to enumerate
/// one owner is logged and does not abort the others.
///
/// `ignore_certs` disables TLS certificate validation for API requests. When
/// `progress` is provided, its message is updated as enumeration proceeds.
///
/// # Errors
///
/// Fails when the HTTP client cannot be built or the API base URL cannot be
/// parsed.
pub async fn enumerate_repo_urls(
    specifiers: &RepoSpecifiers,
    auth: &AuthConfig,
    ignore_certs: bool,
    progress: Option<&mut ProgressBar>,
) -> Result<Vec<String>> {
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(30))
        .danger_accept_invalid_certs(ignore_certs)
        .build()?;
    let base_url = Url::parse("https://huggingface.co/api/")?;
    let excludes = ExcludeSet::from_list(&specifiers.exclude);
    // Only shared access is needed from here on.
    let spinner: Option<&ProgressBar> = progress.as_deref();

    // Users first, then organizations; both go through the same listing code
    // and differ only in the label used in messages.
    let owners = specifiers
        .user
        .iter()
        .map(|name| (format!("user {name}"), name))
        .chain(specifiers.organization.iter().map(|name| (format!("organization {name}"), name)));

    let mut collected = Vec::new();
    for (label, owner) in owners {
        if let Some(pb) = spinner {
            pb.set_message(format!("Enumerating Hugging Face {label}"));
        }
        match fetch_resources_for_owner(&client, &base_url, owner, &label, auth, spinner).await {
            Ok(mut resources) => collected.append(&mut resources),
            Err(err) => warn!("Failed to enumerate Hugging Face {label}: {err}"),
        }
    }

    append_explicit_resources(specifiers, &mut collected);

    // De-duplicate on the case-insensitive canonical key, keeping the first
    // spelling encountered.
    let mut seen = HashSet::new();
    let mut urls = Vec::new();
    for resource in collected {
        if excludes.should_exclude(resource.kind, &resource.slug) {
            debug!(
                "Skipping Hugging Face {} {} due to exclusion",
                resource.kind.display_name_singular(),
                resource.slug
            );
            continue;
        }
        let key = resource.canonical_key();
        if seen.insert(key) {
            urls.push(resource.git_url());
        }
    }

    urls.sort();
    urls.dedup();
    Ok(urls)
}
/// Enumerates the repositories selected by `specifiers` and prints one git
/// URL per line to standard output.
///
/// When `progress_enabled` is set, a spinner is shown during enumeration;
/// otherwise a hidden progress bar is used.
///
/// # Errors
///
/// Propagates any error from [`enumerate_repo_urls`].
pub async fn list_repositories(
    specifiers: &RepoSpecifiers,
    auth: &AuthConfig,
    ignore_certs: bool,
    progress_enabled: bool,
) -> Result<()> {
    let mut progress = if progress_enabled {
        let style = ProgressStyle::with_template("{spinner} {msg} [{elapsed_precise}]")
            .expect("progress bar style template should compile");
        let pb = ProgressBar::new_spinner()
            .with_style(style)
            .with_message("Enumerating Hugging Face repositories");
        pb.enable_steady_tick(Duration::from_millis(500));
        pb
    } else {
        ProgressBar::hidden()
    };

    let result = enumerate_repo_urls(specifiers, auth, ignore_certs, Some(&mut progress)).await;
    // Clear the spinner before anything else is written: this keeps it from
    // interleaving with the printed URLs and removes it on the error path too.
    progress.finish_and_clear();

    for url in result? {
        println!("{url}");
    }
    Ok(())
}
/// Wiki lookup for a Hugging Face repository.
///
/// Always returns `None`; the repository URL argument is ignored.
pub fn wiki_url(_repo_url: &GitUrl) -> Option<GitUrl> {
None
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A bare `owner/name` identifier is returned unchanged.
    #[test]
    fn parse_model_slug_from_plain() {
        assert_eq!(
            parse_slug_for_kind(ResourceKind::Model, "user/model"),
            Some("user/model".to_string())
        );
    }

    /// The matching kind segment and a trailing `.git` are both removed.
    #[test]
    fn parse_dataset_slug_with_prefix() {
        assert_eq!(
            parse_slug_for_kind(ResourceKind::Dataset, "datasets/user/data.git"),
            Some("user/data".to_string())
        );
    }

    /// Full URLs are reduced to the slug found in their path.
    #[test]
    fn parse_space_slug_from_url() {
        assert_eq!(
            parse_slug_for_kind(ResourceKind::Space, "https://huggingface.co/spaces/user/demo"),
            Some("user/demo".to_string())
        );
    }

    /// Typed entries match their own kind; untyped entries match by slug.
    #[test]
    fn exclude_set_matches_typed_and_untyped() {
        // A borrowed array is enough here; no `Vec` allocation is needed.
        let excludes = ExcludeSet::from_list(&[
            String::from("model:user/model"),
            String::from("datasets/user/data"),
        ]);
        assert!(excludes.should_exclude(ResourceKind::Model, "user/model"));
        assert!(excludes.should_exclude(ResourceKind::Dataset, "user/data"));
        assert!(!excludes.should_exclude(ResourceKind::Space, "user/space"));
    }

    /// The `rel="next"` target is extracted from a `Link` header.
    #[test]
    fn parse_link_header() {
        let header = "<https://huggingface.co/api/models?cursor=abc>; rel=\"next\"";
        let url = parse_next_link(header).expect("next link");
        assert_eq!(url.as_str(), "https://huggingface.co/api/models?cursor=abc");
    }

    /// An ellipsis is appended only when text was actually cut.
    #[test]
    fn truncate_for_display_adds_ellipsis() {
        assert_eq!(truncate_for_display("abcdef", 3), "abc…");
        assert_eq!(truncate_for_display("abc", 5), "abc");
    }
}

View file

@ -13,6 +13,7 @@ pub mod defaults;
pub mod entropy;
pub mod finding_data;
pub mod findings_store;
pub mod gcs;
pub mod git_binary;
pub mod git_commit_metadata;
pub mod git_metadata_graph;
@ -21,6 +22,7 @@ pub mod git_url;
pub mod gitea;
pub mod github;
pub mod gitlab;
pub mod huggingface;
pub mod inline_ignore;
pub mod jira;
pub mod liquid_filters;

View file

@ -52,7 +52,7 @@ use kingfisher::{
},
findings_store,
findings_store::FindingsStore,
gitea, github,
gitea, github, huggingface,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::{load_and_record_rules, run_scan},
@ -75,6 +75,7 @@ use crate::cli::commands::{
bitbucket::{BitbucketAuthArgs, BitbucketCommand, BitbucketRepoType, BitbucketReposCommand},
gitea::{GiteaCommand, GiteaRepoType, GiteaReposCommand},
gitlab::{GitLabCommand, GitLabRepoType, GitLabReposCommand},
huggingface::{HuggingFaceCommand, HuggingFaceReposCommand},
};
fn main() -> anyhow::Result<()> {
@ -93,6 +94,7 @@ fn main() -> anyhow::Result<()> {
Command::Bitbucket(_) => num_cpus::get(), // Default for Bitbucket commands
Command::Gitea(_) => num_cpus::get(), // Default for Gitea commands
Command::Azure(_) => num_cpus::get(), // Default for Azure commands
Command::HuggingFace(_) => num_cpus::get(), // Default for Hugging Face commands
Command::Rules(_) => num_cpus::get(), // Default for Rules commands
};
@ -327,6 +329,28 @@ async fn async_main(args: CommandLineArgs) -> Result<()> {
}
},
},
Command::HuggingFace(hf_args) => match hf_args.command {
HuggingFaceCommand::Repos(repos_command) => match repos_command {
HuggingFaceReposCommand::List(list_args) => {
let specifiers = huggingface::RepoSpecifiers {
user: list_args.repo_specifiers.user.clone(),
organization: list_args.repo_specifiers.organization.clone(),
model: list_args.repo_specifiers.model.clone(),
dataset: list_args.repo_specifiers.dataset.clone(),
space: list_args.repo_specifiers.space.clone(),
exclude: list_args.repo_specifiers.exclude.clone(),
};
let auth = huggingface::AuthConfig::from_env();
huggingface::list_repositories(
&specifiers,
&auth,
global_args.ignore_certs,
global_args.use_progress(),
)
.await?;
}
},
},
Command::SelfUpdate => {
anyhow::bail!("SelfUpdate command should not reach this branch")
}
@ -367,6 +391,13 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -400,6 +431,9 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Slack query
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
@ -436,6 +470,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_base64: false,
no_inline_ignore: false,

View file

@ -703,7 +703,7 @@ mod tests {
git_commit_metadata::CommitMetadata,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{SerializableCapture, SerializableCaptures},
origin::OriginSet,
origin::{Origin, OriginSet},
rules::rule::{Confidence, Rule, RuleSyntax},
};
use gix::{date::Time, ObjectId};
@ -711,79 +711,8 @@ mod tests {
use std::path::PathBuf;
use tempfile::tempdir;
#[test]
fn build_finding_record_uses_git_blob_path() {
let temp = tempdir().unwrap();
let datastore =
Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
let repo_path = Arc::new(PathBuf::from("/tmp/repo"));
let commit_metadata = Arc::new(CommitMetadata {
commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
committer_name: "Alice".into(),
committer_email: "alice@example.com".into(),
committer_timestamp: Time::new(0, 0),
});
let blob_path = "path/in/history.txt".to_string();
let origin = OriginSet::new(
Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()),
vec![],
);
let rule = Arc::new(Rule::new(RuleSyntax {
name: "Test Rule".into(),
id: "test.rule".into(),
pattern: ".*".into(),
min_entropy: 0.0,
confidence: Confidence::Medium,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
depends_on_rule: vec![],
}));
let blob_id = BlobId::new(b"blob-data");
let report_match = ReportMatch {
origin,
blob_metadata: BlobMetadata {
id: blob_id,
num_bytes: 42,
mime_essence: None,
language: Some("Unknown".into()),
},
m: Match {
location: Location {
offset_span: OffsetSpan { start: 0, end: 10 },
source_span: SourceSpan {
start: SourcePoint { line: 19, column: 0 },
end: SourcePoint { line: 19, column: 10 },
},
},
groups: SerializableCaptures {
captures: SmallVec::<[SerializableCapture; 2]>::new(),
},
blob_id,
finding_fingerprint: 123,
rule: Arc::clone(&rule),
validation_response_body: "Bad credentials".into(),
validation_response_status: 401,
validation_success: false,
calculated_entropy: 5.29,
visible: true,
is_base64: false,
},
comment: None,
match_confidence: Confidence::Medium,
visible: true,
validation_response_body: "Bad credentials".into(),
validation_response_status: 401,
validation_success: false,
};
let scan_args = ScanArgs {
fn sample_scan_args() -> ScanArgs {
ScanArgs {
num_jobs: 1,
rules: RuleSpecifierArgs::default(),
input_specifier_args: InputSpecifierArgs {
@ -802,6 +731,12 @@ mod tests {
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -833,6 +768,9 @@ mod tests {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@ -864,9 +802,98 @@ mod tests {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_inline_ignore: false,
}
}
/// Builds a `ReportMatch` fixture backed by a fake git origin.
///
/// The three arguments are copied into the validation fields of both the inner
/// `Match` and the outer `ReportMatch`. Returns the fixture together with the
/// blob path that was recorded in its origin, so tests can compare against it.
fn sample_report_match(
    validation_body: &str,
    validation_status: u16,
    validation_success: bool,
) -> (ReportMatch, String) {
    let blob_path = String::from("path/in/history.txt");
    let body = validation_body.to_owned();

    // Origin: a repository at /tmp/repo whose first commit introduced the blob.
    let first_commit = Arc::new(CommitMetadata {
        commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
        committer_name: "Alice".into(),
        committer_email: "alice@example.com".into(),
        committer_timestamp: Time::new(0, 0),
    });
    let origin = OriginSet::new(
        Origin::from_git_repo_with_first_commit(
            Arc::new(PathBuf::from("/tmp/repo")),
            first_commit,
            blob_path.clone(),
        ),
        Vec::new(),
    );

    // A catch-all rule with no validation attached.
    let rule = Arc::new(Rule::new(RuleSyntax {
        name: "Test Rule".into(),
        id: "test.rule".into(),
        pattern: ".*".into(),
        min_entropy: 0.0,
        confidence: Confidence::Medium,
        visible: true,
        examples: Vec::new(),
        negative_examples: Vec::new(),
        references: Vec::new(),
        validation: None,
        depends_on_rule: Vec::new(),
    }));

    let blob_id = BlobId::new(b"blob-data");
    let blob_metadata = BlobMetadata {
        id: blob_id,
        num_bytes: 42,
        mime_essence: None,
        language: Some("Unknown".into()),
    };

    let location = Location {
        offset_span: OffsetSpan { start: 0, end: 10 },
        source_span: SourceSpan {
            start: SourcePoint { line: 19, column: 0 },
            end: SourcePoint { line: 19, column: 10 },
        },
    };
    let matched = Match {
        location,
        groups: SerializableCaptures { captures: SmallVec::<[SerializableCapture; 2]>::new() },
        blob_id,
        finding_fingerprint: 123,
        rule,
        validation_response_body: body.clone(),
        validation_response_status: validation_status,
        validation_success,
        calculated_entropy: 5.29,
        visible: true,
        is_base64: false,
    };

    let report_match = ReportMatch {
        origin,
        blob_metadata,
        m: matched,
        comment: None,
        match_confidence: Confidence::Medium,
        visible: true,
        validation_response_body: body,
        validation_response_status: validation_status,
        validation_success,
    };
    (report_match, blob_path)
}
#[test]
fn build_finding_record_uses_git_blob_path() {
let temp = tempdir().unwrap();
let datastore =
Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
let (report_match, blob_path) =
sample_report_match("Bad credentials", StatusCode::UNAUTHORIZED.as_u16(), false);
let scan_args = sample_scan_args();
let record = reporter.build_finding_record(&report_match, &scan_args);
assert_eq!(record.finding.path, blob_path);
let git_file_path = record
@ -880,6 +907,28 @@ mod tests {
assert_eq!(git_file_path, "path/in/history.txt");
}
/// A finding whose validation was skipped via the AWS skip list is reported as
/// "Not Attempted" and still carries the skip explanation as its response.
#[test]
fn skip_list_matches_surface_skip_reason() {
    let skip_message = "(skip list entry) AWS validation not attempted for account 111122223333.";

    let scratch = tempdir().unwrap();
    let store = findings_store::FindingsStore::new(scratch.path().to_path_buf());
    let reporter = DetailsReporter {
        datastore: Arc::new(Mutex::new(store)),
        styles: Styles::new(false),
        only_valid: false,
    };

    let (report_match, _) =
        sample_report_match(skip_message, StatusCode::CONTINUE.as_u16(), false);
    let scan_args = sample_scan_args();
    let record = reporter.build_finding_record(&report_match, &scan_args);

    assert_eq!(record.finding.validation.status, "Not Attempted");
    assert_eq!(record.finding.validation.response, skip_message);
}
use super::build_git_urls;
#[test]

View file

@ -93,6 +93,14 @@ mod tests {
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
// Hugging Face
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
// Gitea
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
@ -132,6 +140,9 @@ mod tests {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
// clone / history options
@ -163,6 +174,8 @@ mod tests {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
}

View file

@ -115,7 +115,7 @@ impl<'a> Display for PrettyFindingRecord<'a> {
} else {
writeln!(f, " |Validation....: {}", finding.validation.status)?;
}
if finding.validation.status != "Not Attempted" {
if !finding.validation.response.is_empty() {
writeln!(f, " |__Response....: {}", style_fn(&finding.validation.response))?;
}
writeln!(f, " |Language......: {}", finding.language)?;

View file

@ -3,7 +3,7 @@ pub(crate) use docker::save_docker_images;
pub(crate) use enumerate::enumerate_filesystem_inputs;
pub(crate) use repos::{
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
enumerate_github_repos,
enumerate_github_repos, enumerate_huggingface_repos,
};
pub use runner::{load_and_record_rules, run_async_scan, run_scan};
pub(crate) use validation::run_secret_validation;

View file

@ -17,10 +17,10 @@ use crate::{
commands::{github::GitCloneMode, github::GitHistoryMode, scan},
global,
},
confluence, findings_store,
confluence, findings_store, gcs,
git_binary::{CloneMode, Git},
git_url::GitUrl,
gitea, github, gitlab, jira,
gitea, github, gitlab, huggingface, jira,
matcher::{Match, Matcher, MatcherStats},
origin::{Origin, OriginSet},
rules_database::RulesDatabase,
@ -305,6 +305,69 @@ pub async fn enumerate_gitea_repos(
Ok(repo_urls)
}
/// Resolves the Hugging Face selectors in `args` into clonable repository URLs.
///
/// The result starts from the explicitly supplied `git_url` inputs. When at least
/// one Hugging Face selector is set, the matching repositories are enumerated
/// (authenticated via `huggingface::AuthConfig::from_env`) and appended. The final
/// list is sorted and de-duplicated.
///
/// # Errors
/// Returns an error when the Hugging Face enumeration itself fails. Individual
/// URLs that cannot be parsed into a `GitUrl` are logged and skipped instead.
pub async fn enumerate_huggingface_repos(
    args: &scan::ScanArgs,
    global_args: &global::GlobalArgs,
) -> Result<Vec<GitUrl>> {
    let inputs = &args.input_specifier_args;
    let repo_specifiers = huggingface::RepoSpecifiers {
        user: inputs.huggingface_user.clone(),
        organization: inputs.huggingface_organization.clone(),
        model: inputs.huggingface_model.clone(),
        dataset: inputs.huggingface_dataset.clone(),
        space: inputs.huggingface_space.clone(),
        exclude: inputs.huggingface_exclude.clone(),
    };

    let mut repo_urls = inputs.git_url.clone();

    if !repo_specifiers.is_empty() {
        // Visible spinner only when progress output is enabled.
        let mut progress = if !global_args.use_progress() {
            ProgressBar::hidden()
        } else {
            let style =
                ProgressStyle::with_template("{spinner} {msg} {human_len} [{elapsed_precise}]")
                    .expect("progress bar style template should compile");
            let spinner = ProgressBar::new_spinner()
                .with_style(style)
                .with_message("Enumerating Hugging Face repositories...");
            spinner.enable_steady_tick(Duration::from_millis(500));
            spinner
        };

        let auth = huggingface::AuthConfig::from_env();
        let repo_strings = huggingface::enumerate_repo_urls(
            &repo_specifiers,
            &auth,
            global_args.ignore_certs,
            Some(&mut progress),
        )
        .await
        .context("Failed to enumerate Hugging Face repositories")?;

        // Only URLs that parse successfully are counted and kept.
        let mut num_found: u64 = 0;
        for repo_string in repo_strings {
            match GitUrl::from_str(&repo_string) {
                Err(e) => {
                    // Suspend the spinner so the log line is not overwritten.
                    progress.suspend(|| {
                        error!("Failed to parse repo URL from {repo_string}: {e}");
                    });
                }
                Ok(repo_url) => {
                    num_found += 1;
                    repo_urls.push(repo_url);
                }
            }
        }

        progress.finish_with_message(format!(
            "Found {} repositories from Hugging Face",
            HumanCount(num_found)
        ));
    }

    repo_urls.sort();
    repo_urls.dedup();
    Ok(repo_urls)
}
pub async fn enumerate_bitbucket_repos(
args: &scan::ScanArgs,
global_args: &global::GlobalArgs,
@ -678,3 +741,81 @@ pub async fn fetch_s3_objects(
Ok(())
}
/// Scans the objects of the Google Cloud Storage bucket selected in `args`.
///
/// Returns `Ok(())` immediately when `args.input_specifier_args.gcs_bucket` is
/// unset. Otherwise every object handed to the callback by
/// `gcs::visit_bucket_objects` is wrapped in a blob, run through a
/// `BlobProcessor`, and any resulting matches are recorded in `datastore` under
/// an origin whose `path` is `gs://<bucket>/<key>`.
///
/// # Errors
/// Propagates failures from `Matcher::new`, from `processor.run` inside the
/// callback, and from `gcs::visit_bucket_objects`.
///
/// # Panics
/// Panics if the `datastore` mutex is poisoned (`lock().unwrap()`).
pub async fn fetch_gcs_objects(
args: &scan::ScanArgs,
datastore: &Arc<Mutex<findings_store::FindingsStore>>,
rules_db: &RulesDatabase,
matcher_stats: &Mutex<MatcherStats>,
enable_profiling: bool,
shared_profiler: Arc<crate::rule_profiling::ConcurrentRuleProfiler>,
progress_enabled: bool,
) -> Result<()> {
// Nothing to do unless a bucket was requested.
let Some(bucket) = args.input_specifier_args.gcs_bucket.as_deref() else {
return Ok(());
};
// Optional prefix and service-account value; both are handed to the GCS
// visitor unchanged.
let prefix = args.input_specifier_args.gcs_prefix.as_deref();
let service_account = args.input_specifier_args.gcs_service_account.as_deref();
// This fetch gets its own scanner pool, seen-blob map and matcher.
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
let seen_blobs = BlobIdMap::new();
let matcher = Matcher::new(
rules_db,
scanner_pool,
&seen_blobs,
Some(matcher_stats),
enable_profiling,
Some(shared_profiler.clone()),
&args.extra_ignore_comments,
args.no_inline_ignore,
)?;
let mut processor = BlobProcessor { matcher };
// Spinner that shows the number of visited objects; hidden when progress
// output is disabled.
let progress = if progress_enabled {
let style =
ProgressStyle::with_template("{spinner} {msg} ({pos} objects) [{elapsed_precise}]")
.expect("progress bar style template should compile");
let pb = ProgressBar::new_spinner().with_style(style).with_message("Fetching GCS objects");
pb.enable_steady_tick(Duration::from_millis(500));
pb
} else {
ProgressBar::hidden()
};
// The callback is a `move` closure, so it receives its own progress-bar handle
// and an owned copy of the bucket name.
// NOTE(review): the clone presumably shares its counter with `progress`, since
// the final total is read from `progress` below — confirm.
let pb = progress.clone();
let bucket_name = bucket.to_string();
gcs::visit_bucket_objects(bucket, prefix, service_account, move |key, bytes| {
// Findings are attributed to the object's gs:// URL.
let origin = OriginSet::new(
Origin::from_extended(serde_json::json!({
"path": format!("gs://{}/{}", bucket_name, key)
})),
Vec::new(),
);
let blob = crate::blob::Blob::from_bytes(bytes);
// `run` yields `Some` only when there is something to record for this blob.
if let Some((origin, blob_md, scored_matches)) =
processor.run(origin, blob, args.no_dedup, args.redact, args.no_base64)?
{
// Origin and blob metadata are shared by every match in the batch.
let origin_arc = Arc::new(origin);
let blob_arc = Arc::new(blob_md);
let mut batch = Vec::with_capacity(scored_matches.len());
for (_score, m) in scored_matches {
batch.push((origin_arc.clone(), blob_arc.clone(), m));
}
// The second argument turns de-duplication on unless `no_dedup` is set.
let added = datastore.lock().unwrap().record(batch, !args.no_dedup);
// NOTE(review): `added` is whatever `record` returns; the message labels it
// as blobs — confirm the unit.
debug!("Added {} new GCS blobs", added);
}
// Counted for every visited object, whether or not it produced matches.
pb.inc(1);
Ok(())
})
.await?;
let total = progress.position();
progress.finish_with_message(format!("Fetched {} GCS objects", total));
Ok(())
}

View file

@ -1,4 +1,7 @@
use std::sync::{Arc, Mutex};
use std::{
fs,
sync::{Arc, Mutex},
};
use anyhow::{bail, Context, Result};
use crossbeam_skiplist::SkipMap;
@ -21,10 +24,11 @@ use crate::{
safe_list,
scanner::{
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
enumerate_filesystem_inputs, enumerate_github_repos,
enumerate_filesystem_inputs, enumerate_github_repos, enumerate_huggingface_repos,
repos::{
enumerate_gitea_repos, enumerate_gitlab_repos, fetch_confluence_pages,
fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects, fetch_slack_messages,
fetch_gcs_objects, fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects,
fetch_slack_messages,
},
run_secret_validation, save_docker_images,
summary::print_scan_summary,
@ -74,12 +78,14 @@ pub async fn run_async_scan(
let mut repo_urls = enumerate_github_repos(args, global_args).await?;
let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
// Combine repository URLs
repo_urls.extend(gitlab_repo_urls);
repo_urls.extend(gitea_repo_urls);
repo_urls.extend(huggingface_repo_urls);
repo_urls.extend(bitbucket_repo_urls);
repo_urls.extend(azure_repo_urls);
repo_urls.sort();
@ -182,8 +188,20 @@ pub async fn run_async_scan(
)
.await?;
let has_s3 = args.input_specifier_args.s3_bucket.is_some();
if input_roots.is_empty() && !has_s3 {
fetch_gcs_objects(
args,
&datastore,
rules_db,
&matcher_stats,
enable_profiling,
Arc::clone(&shared_profiler),
progress_enabled,
)
.await?;
let has_remote_objects = args.input_specifier_args.s3_bucket.is_some()
|| args.input_specifier_args.gcs_bucket.is_some();
if input_roots.is_empty() && !has_remote_objects {
bail!("No inputs to scan");
}
@ -231,6 +249,26 @@ pub async fn run_async_scan(
crate::baseline::apply_baseline(&mut ds, &path, args.manage_baseline, &input_roots)?;
}
let mut skip_aws_accounts = args.skip_aws_account.clone();
if let Some(path) = args.skip_aws_account_file.as_ref() {
let contents = fs::read_to_string(path).with_context(|| {
format!("Failed to read --skip-aws-account-file {}", path.display())
})?;
for line in contents.lines() {
let content = line.split('#').next().unwrap_or("");
for value in content.split(|c: char| c.is_ascii_whitespace() || c == ',' || c == ';') {
let trimmed = value.trim();
if !trimmed.is_empty() {
skip_aws_accounts.push(trimmed.to_string());
}
}
}
}
crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
// If validation is enabled, run it as a second phase
if !args.no_validate {
info!("Starting secret validation phase...");

View file

@ -95,6 +95,14 @@ pub fn init_validation_caches() {
aws::set_aws_validation_concurrency(15);
}
/// Installs the AWS account IDs for which live validation must be skipped.
///
/// Thin forwarding wrapper: the values are passed unchanged to
/// `aws::set_aws_skip_account_ids`.
pub fn set_skip_aws_account_ids<I: IntoIterator<Item = S>, S: Into<String>>(ids: I) {
    aws::set_aws_skip_account_ids(ids)
}
#[derive(Clone)]
pub struct CachedResponse {
pub body: String,
@ -781,6 +789,26 @@ async fn timed_validate_single_match<'a>(
}
}
if let Some(account_id) = aws::should_skip_aws_validation(&akid) {
m.validation_success = false;
m.validation_response_body = format!(
"(skip list entry) AWS validation not attempted for account {}.",
account_id
);
m.validation_response_status = StatusCode::CONTINUE;
cache.insert(
cache_key,
CachedResponse {
body: m.validation_response_body.clone(),
status: m.validation_response_status,
is_valid: m.validation_success,
timestamp: Instant::now(),
},
);
commit_and_return(m);
return;
}
if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
m.validation_success = false;
m.validation_response_body = format!("Invalid AWS credentials ({}): {}", akid, e);

View file

@ -1,7 +1,7 @@
use std::time::Duration;
use std::{collections::HashSet, sync::RwLock, time::Duration};
use anyhow::{anyhow, Result};
use aws_config::{retry::RetryConfig, BehaviorVersion};
use aws_config::{retry::RetryConfig, BehaviorVersion, SdkConfig};
use aws_credential_types::Credentials;
use aws_sdk_sts::{
config::Builder as StsConfigBuilder, error::SdkError,
@ -26,8 +26,9 @@ use http::{
header::{HeaderValue, USER_AGENT},
StatusCode,
};
use once_cell::sync::OnceCell;
use once_cell::sync::{Lazy, OnceCell};
use rand::{rng, Rng};
use regex::Regex;
use tokio::{
sync::Semaphore,
time::{sleep, timeout},
@ -36,6 +37,57 @@ use tokio::{
use crate::validation::GLOBAL_USER_AGENT;
/// Semaphore used to bound concurrent AWS validations. It is created on first
/// use; `aws_validation_semaphore` falls back to 15 permits when nothing was
/// configured earlier.
static AWS_VALIDATION_SEMAPHORE: OnceCell<Semaphore> = OnceCell::new();

/// Canary AWS account IDs that ship with the tool. They are always part of the
/// skip list, including after the list is replaced by the caller.
const BUILTIN_SKIP_ACCOUNT_IDS: &[&str] = &[
    "052310077262",
    "171436882533",
    "528757803018",
    "534261010715",
    "595918472158",
    "729780141977",
    "893192397702",
    "992382622183",
];

/// Process-wide set of account IDs whose keys must not be validated live.
/// Starts out holding exactly the built-in IDs.
static AWS_SKIP_ACCOUNT_IDS: Lazy<RwLock<HashSet<String>>> = Lazy::new(|| {
    let builtin: HashSet<String> =
        BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string()).collect();
    RwLock::new(builtin)
});
/// Builds the HTTP client used for AWS SDK calls.
///
/// The connector uses Rustls with the AWS-LC crypto provider and takes its
/// proxy configuration from the environment.
fn build_http_client() -> SharedHttpClient {
    HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
        let tls_provider = tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc);
        let mut connector = ConnectorBuilder::default().tls_provider(tls_provider);
        connector.set_connector_settings(settings.cloned());
        // The sleep implementation is only available when runtime components
        // were supplied.
        if let Some(components) = runtime_components {
            connector.set_sleep_impl(components.sleep_impl());
        }
        connector.set_proxy_config(Some(ProxyConfig::from_env()));
        connector.build()
    })
}
/// Loads an AWS SDK configuration for the given static credentials.
///
/// The configuration is pinned to `us-east-1`, uses the client from
/// `build_http_client`, and retries adaptively with at most three attempts.
async fn build_base_config(credentials: Credentials) -> SdkConfig {
    let loader = aws_config::defaults(BehaviorVersion::latest())
        .region(Region::new("us-east-1"))
        .credentials_provider(credentials)
        .http_client(build_http_client())
        .retry_config(RetryConfig::adaptive().with_max_attempts(3));
    loader.load().await
}
/// Extracts a 12-digit AWS account ID from a skip-list entry.
///
/// Accepts either a bare account ID (surrounding whitespace is ignored) or a
/// longer string that embeds one, such as an ARN
/// (`arn:aws:iam::171436882533:role/demo`). Returns `None` when the input holds
/// no run of exactly twelve ASCII digits.
fn extract_account_id(input: &str) -> Option<String> {
    let trimmed = input.trim();
    if trimmed.len() == 12 && trimmed.bytes().all(|b| b.is_ascii_digit()) {
        return Some(trimmed.to_string());
    }
    // The candidate must be delimited by a non-digit (or the string boundary) on
    // both sides; otherwise the first twelve digits of a longer number would be
    // mistaken for an account ID. `[0-9]` is used instead of `\d` so that only
    // ASCII digits qualify. The regex crate has no look-around, hence the
    // non-capturing delimiter groups around capture group 1.
    static ACCOUNT_ID_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?:^|[^0-9])([0-9]{12})(?:[^0-9]|$)").expect("valid regex")
    });
    ACCOUNT_ID_RE.captures(trimmed).and_then(|caps| caps.get(1)).map(|m| m.as_str().to_string())
}
/// Set the maximum number of concurrent AWS validations. Call before first use.
pub fn set_aws_validation_concurrency(max: usize) {
@ -46,6 +98,46 @@ fn aws_validation_semaphore() -> &'static Semaphore {
AWS_VALIDATION_SEMAPHORE.get_or_init(|| Semaphore::new(15))
}
/// Replaces the skip list with the built-in account IDs plus `ids`.
///
/// Each entry is normalized through `extract_account_id`. Blank entries are
/// ignored silently; entries without a recognizable account ID are dropped with
/// a warning. A poisoned lock is recovered rather than treated as an error.
pub fn set_aws_skip_account_ids<I, S>(ids: I)
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    let mut skip_set =
        AWS_SKIP_ACCOUNT_IDS.write().unwrap_or_else(|poisoned| poisoned.into_inner());

    // Start over from the built-in entries so that earlier calls leave no trace.
    skip_set.clear();
    skip_set.extend(BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string()));

    for raw in ids {
        let value: String = raw.into();
        if !value.trim().is_empty() {
            match extract_account_id(&value) {
                Some(account_id) => {
                    skip_set.insert(account_id);
                }
                None => {
                    tracing::warn!("Ignoring invalid AWS account ID in skip list: {value}");
                }
            }
        }
    }
}
/// Decides whether live validation must be skipped for `access_key_id`.
///
/// Returns `Some(account_id)` when the account number derived from the key is
/// on the skip list. Returns `None` when the list is empty, when no account
/// number can be derived from the key, or when the account is not listed.
pub fn should_skip_aws_validation(access_key_id: &str) -> Option<String> {
    // Recover from a poisoned lock exactly like `set_aws_skip_account_ids` does.
    // Bailing out with `None` here would quietly disable the skip list and let
    // canary keys be validated live, which is what the list exists to prevent.
    let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap_or_else(|poisoned| poisoned.into_inner());
    if guard.is_empty() {
        return None;
    }

    let account = aws_key_to_account_number(access_key_id).ok()?;
    guard.contains(&account).then_some(account)
}
#[derive(Debug)]
struct UaInterceptor;
@ -132,29 +224,7 @@ pub async fn validate_aws_credentials(
None, // expiry
"static", // provider name
);
// Create HTTP client that respects proxy settings from the environment
let http_client: SharedHttpClient =
HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
let mut conn_builder = ConnectorBuilder::default()
.tls_provider(tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc));
conn_builder.set_connector_settings(settings.cloned());
if let Some(components) = runtime_components {
conn_builder.set_sleep_impl(components.sleep_impl());
}
conn_builder.set_proxy_config(Some(ProxyConfig::from_env()));
conn_builder.build()
});
// Create AWS config with adaptive retries
let retry_config = RetryConfig::adaptive().with_max_attempts(3);
let config = aws_config::defaults(BehaviorVersion::latest())
.region(Region::new("us-east-1"))
.credentials_provider(credentials)
.http_client(http_client)
.retry_config(retry_config)
.load()
.await;
let config = build_base_config(credentials).await;
// Create STS client
let sts_config = StsConfigBuilder::from(&config).interceptor(UaInterceptor).build();
@ -230,3 +300,74 @@ pub fn aws_key_to_account_number(aws_key_id: &str) -> Result<String, Box<dyn std
// Return the account number formatted as a 12-digit string
Ok(format!("{:012}", account_num))
}
#[cfg(test)]
mod tests {
    use super::*;
    use once_cell::sync::Lazy;
    use std::sync::{Mutex, MutexGuard};

    /// Serializes the tests in this module, which all mutate the process-wide
    /// skip list.
    static TEST_GUARD: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));

    /// Takes `TEST_GUARD`, tolerating poisoning. The mutex protects no data, so a
    /// panic in one test must not make every later test fail on `lock()` and
    /// bury the real failure.
    fn serialize_test() -> MutexGuard<'static, ()> {
        TEST_GUARD.lock().unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Padded IDs and ARNs are normalized, and invalid entries are dropped. Both
    /// valid inputs are built-in IDs, so the set size must not change.
    #[test]
    fn skip_account_list_normalizes_inputs() {
        let _lock = serialize_test();
        set_aws_skip_account_ids([
            " 052310077262 ",
            "arn:aws:iam::171436882533:role/demo",
            "invalid",
        ]);

        let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
        assert!(guard.contains("052310077262"));
        assert!(guard.contains("171436882533"));
        assert_eq!(guard.len(), BUILTIN_SKIP_ACCOUNT_IDS.len());
        drop(guard);

        set_aws_skip_account_ids(Vec::<String>::new());
    }

    /// A key that decodes to a listed account is reported for skipping.
    #[test]
    fn should_skip_when_account_matches() {
        let _lock = serialize_test();
        set_aws_skip_account_ids(["534261010715"]);

        assert_eq!(
            should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
            Some("534261010715".to_string())
        );

        set_aws_skip_account_ids(Vec::<String>::new());
    }

    /// The built-in canary accounts are active even when no IDs are supplied.
    #[test]
    fn builtin_canary_accounts_are_preseeded() {
        let _lock = serialize_test();
        set_aws_skip_account_ids(Vec::<String>::new());

        assert_eq!(
            should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
            Some("534261010715".to_string())
        );

        set_aws_skip_account_ids(Vec::<String>::new());
    }

    /// Different spellings of the same account collapse into a single entry.
    #[test]
    fn duplicate_accounts_are_deduplicated() {
        let _lock = serialize_test();
        set_aws_skip_account_ids([
            "534261010715",
            "arn:aws:iam::534261010715:user/canarytokens",
            " 534261010715 ",
        ]);

        let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
        assert_eq!(guard.iter().filter(|id| id.as_str() == "534261010715").count(), 1);
        drop(guard);

        set_aws_skip_account_ids(Vec::<String>::new());
    }
}

View file

@ -33,31 +33,77 @@ pub fn find_closest_variable(
target_variable_name: &str,
search_variable_name: &str,
) -> Option<String> {
// Find positions of the target variable with the target value
// Collect the positions of the target variable for the provided value so we can
// compare relative offsets with candidate variables.
let mut target_positions = Vec::new();
for (name, value, start, end) in captures {
if name == target_variable_name && value == target_value {
target_positions.push((*start, *end));
}
}
if target_positions.is_empty() {
return None;
}
// For each target position, find the closest search variable
let mut closest_distance = usize::MAX;
let mut closest_value: Option<String> = None;
for (_target_start, target_end) in target_positions {
for (name, value, start, _) in captures {
if name == search_variable_name {
let distance = (*start as isize - target_end as isize).abs() as usize;
if distance < closest_distance {
closest_distance = distance;
closest_value = Some(value.clone());
// Prefer candidates that appear before the target value (same logical block), but
// fall back to overlapping values and then to those that appear after the target
// value when no better match exists. This avoids pairing with the next block when
// multiple credentials are close together in the same file.
let mut best_before: Option<(usize, String)> = None;
let mut best_overlap: Option<(usize, String)> = None;
let mut best_after: Option<(usize, String)> = None;
for (target_start, target_end) in target_positions.iter().copied() {
for (name, value, start, end) in captures {
if name != search_variable_name {
continue;
}
if *end <= target_start {
// Candidate is before the target; choose the one closest to the target start.
let distance = target_start - *end;
match &mut best_before {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_before = Some((distance, value.clone()));
}
_ => {}
}
} else if *start >= target_end {
// Candidate is after the target; choose the one closest to the target end.
let distance = *start - target_end;
match &mut best_after {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_after = Some((distance, value.clone()));
}
_ => {}
}
} else {
// Candidate overlaps the target; treat it as an exact match.
let distance = 0usize;
match &mut best_overlap {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_overlap = Some((distance, value.clone()));
}
_ => {}
}
}
}
}
closest_value
best_before.or(best_overlap).or(best_after).map(|(_, value)| value)
}
pub async fn check_url_resolvable(url: &Url) -> Result<(), Box<dyn std::error::Error>> {
@ -147,4 +193,31 @@ mod tests {
]
);
}
/// With one candidate before the target and one after it, the preceding
/// candidate is chosen.
#[test]
fn prefers_closest_preceding_variable() {
    let target = "secret".to_string();
    let captures = vec![
        ("TOKEN".to_string(), target.clone(), 75usize, 115usize),
        ("AKID".to_string(), "preceding".to_string(), 30usize, 50usize),
        ("AKID".to_string(), "following".to_string(), 180usize, 200usize),
    ];

    let closest = find_closest_variable(&captures, &target, "TOKEN", "AKID");

    assert_eq!(closest.as_deref(), Some("preceding"));
}
/// When nothing precedes the target, the candidate that follows it is used.
#[test]
fn falls_back_to_following_when_no_preceding() {
    let target = "secret".to_string();
    let captures = vec![
        ("TOKEN".to_string(), target.clone(), 10usize, 50usize),
        ("AKID".to_string(), "after".to_string(), 60usize, 80usize),
    ];

    let closest = find_closest_variable(&captures, &target, "TOKEN", "AKID");

    assert_eq!(closest.as_deref(), Some("after"));
}
}

View file

@ -72,6 +72,12 @@ fn run_skiplist(skip_regex: Vec<String>, skip_skipword: Vec<String>) -> Result<u
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -103,6 +109,9 @@ fn run_skiplist(skip_regex: Vec<String>, skip_skipword: Vec<String>) -> Result<u
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@ -133,6 +142,8 @@ fn run_skiplist(skip_regex: Vec<String>, skip_skipword: Vec<String>) -> Result<u
manage_baseline: false,
skip_regex: skip_regex,
skip_word: skip_skipword,
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};

View file

@ -68,6 +68,13 @@ fn test_bitbucket_remote_scan() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -102,6 +109,9 @@ fn test_bitbucket_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@ -131,6 +141,8 @@ fn test_bitbucket_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -85,6 +85,13 @@ rules:
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -120,6 +127,9 @@ rules:
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@ -151,6 +161,8 @@ rules:
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -72,6 +72,13 @@ fn test_github_remote_scan() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -107,6 +114,9 @@ fn test_github_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@ -138,6 +148,8 @@ fn test_github_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -71,6 +71,13 @@ fn test_gitlab_remote_scan() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -106,6 +113,9 @@ fn test_gitlab_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
@ -137,6 +147,8 @@ fn test_gitlab_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};
@ -210,6 +222,13 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -252,6 +271,9 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
scan_nested_repos: true,
since_commit: None,
branch: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
@ -273,6 +295,8 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -55,6 +55,12 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -86,6 +92,9 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@ -115,6 +124,8 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -62,6 +62,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -92,6 +99,9 @@ impl TestContext {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
max_results: 10,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
@ -123,6 +133,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};
@ -186,6 +198,13 @@ async fn test_scan_slack_messages() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -218,6 +237,9 @@ async fn test_scan_slack_messages() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@ -247,6 +269,8 @@ async fn test_scan_slack_messages() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -128,6 +128,13 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -163,6 +170,9 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@ -194,6 +204,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -71,6 +71,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -106,6 +113,9 @@ impl TestContext {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@ -137,6 +147,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
@ -183,6 +195,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -228,6 +247,10 @@ impl TestContext {
scan_nested_repos: true,
since_commit: None,
branch: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
},
extra_ignore_comments: Vec::new(),
content_filtering_args: ContentFilteringArgs {
@ -250,6 +273,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};