Merge pull request #125 from mongodb/development

v1.58.0
This commit is contained in:
Mick Grove 2025-10-16 13:29:38 -07:00 committed by GitHub
commit fecf858bfd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
36 changed files with 2023 additions and 192 deletions

View file

@ -5,6 +5,9 @@ on:
branches:
- main
env:
RUST_TOOLCHAIN: "1.90"
# This workflow runs on pull requests to the main branch
# It builds the project for 2 platforms, Linux arm64 and macOS arm64,
# and runs tests for each platform. All platforms tested on merge to main
@ -16,7 +19,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.88.0
toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
- uses: swatinem/rust-cache@v2
@ -34,7 +37,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.88.0
toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
- uses: swatinem/rust-cache@v2

View file

@ -4,6 +4,8 @@ on:
push:
branches:
- main
env:
RUST_TOOLCHAIN: "1.90"
jobs:
# ──────────────── Linux (via Makefile) ────────────────
@ -15,7 +17,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.88.0
toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@ -69,7 +71,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.88.0
toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@ -123,7 +125,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.88.0
toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@ -154,7 +156,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.88.0
toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true
@ -185,7 +187,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.88.0
toolchain: ${{ env.RUST_TOOLCHAIN }}
profile: minimal
override: true

View file

@ -2,6 +2,12 @@
All notable changes to this project will be documented in this file.
## [v1.58.0]
- Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans.
- Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs.
- Added support for scanning Google Cloud Storage buckets via `--gcs-bucket`, including optional prefixes and service-account authentication.
- Added `--skip-aws-account` (now accepting comma-separated values) and `--skip-aws-account-file` to bypass live AWS validation for known canary/honey-token account IDs without triggering alerts. Kingfisher now ships with several canary AWS account IDs pre-seeded in the skip list and now reports matching findings as "Not Attempted" with the "Response" containing "(skip list entry)" so it's clear that validation was intentionally skipped and why.
## [v1.57.0]
- Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling
- Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported

View file

@ -1,6 +1,6 @@
[workspace.package]
edition = "2021"
rust-version = "1.88"
rust-version = "1.90"
license = "Apache-2.0"
authors = ["Mick Grove <mick.grove@mongodb.com>"]
homepage = "https://github.com/mongodb/kingfisher"
@ -10,7 +10,7 @@ publish = false
[package]
name = "kingfisher"
version = "1.57.0"
version = "1.58.0"
description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
edition.workspace = true
rust-version.workspace = true
@ -122,7 +122,6 @@ reqwest-middleware = "0.4.2"
tracing-subscriber = {version = "0.3.19", features = ["env-filter"] }
tracing-core = "0.1.34"
tree-sitter = "0.25.8"
aws-smithy-runtime = "1.9.1"
aws-smithy-http-client = "1.1.1"
aws-smithy-runtime-api = "1.9.0"
aws-smithy-types = "1.3.2"
@ -192,6 +191,10 @@ walkdir = "2.5.0"
p256 = "0.13.2"
ed25519-dalek = { version = "2.2", features = ["pkcs8"] }
aws-sdk-s3 = "1.100.0"
gcloud-storage = { version = "1.1.1", default-features = false, features = [
"rustls-tls",
"auth",
] }
tokei = "12.1.2"
[target.'cfg(not(windows))'.dependencies]

View file

@ -110,11 +110,11 @@ setup-zig:
ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist
@echo "Checking Rust toolchain…"
@$(MAKE) check-rust || { \
echo "🦀 Installing Rust 1.88.0 …"; \
echo "🦀 Installing Rust 1.90.0 …"; \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
. $$HOME/.cargo/env; \
rustup toolchain install 1.88.0; \
rustup default 1.88.0; \
rustup toolchain install 1.90.0; \
rustup default 1.90.0; \
}
@echo "📦 Installing build dependencies (musl, cmake, etc.)…"
@ -150,11 +150,11 @@ ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist
ubuntu-arm64: setup-zig # ensures Zig & cargo-zigbuild exist
@echo "Checking Rust toolchain…"
@$(MAKE) check-rust || { \
echo "🦀 Installing Rust 1.88.0 …"; \
echo "🦀 Installing Rust 1.90.0 …"; \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
. $$HOME/.cargo/env; \
rustup toolchain install 1.88.0; \
rustup default 1.88.0; \
rustup toolchain install 1.90.0; \
rustup default 1.90.0; \
}
@echo "📦 Installing build dependencies (musl, cmake, etc.)…"
@ -245,7 +245,7 @@ endif
linux-x64: check-docker create-dockerignore
@mkdir -p target/release
docker run --platform linux/amd64 --rm \
-v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\
-v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\
apk add --no-cache \
musl-dev \
gcc g++ make cmake pkgconfig \
@ -256,7 +256,7 @@ linux-x64: check-docker create-dockerignore
patch perl ragel && \
git openssl-dev curl && \
\
cargo test --workspace --all-targets --release ; \
cargo test --workspace --all-targets ; \
\
rustup target add x86_64-unknown-linux-musl && \
\
@ -274,7 +274,7 @@ linux-x64: check-docker create-dockerignore
linux-arm64: check-docker create-dockerignore
@mkdir -p target/release
docker run --platform linux/arm64 --rm \
-v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\
-v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\
apk add --no-cache \
musl-dev \
gcc g++ make cmake pkgconfig \
@ -287,7 +287,7 @@ linux-arm64: check-docker create-dockerignore
\
rustup target add aarch64-unknown-linux-musl && \
\
cargo test --workspace --all-targets --release ; \
cargo test --workspace --all-targets ; \
\
export PKG_CONFIG_ALLOW_CROSS=1 ; \
export RUSTFLAGS="-C target-feature=+crt-static" ; \
@ -385,7 +385,7 @@ check-rust:
echo "Rust not found."; \
exit 1; \
fi; \
required=1.88.0; \
required=1.90.0; \
if [ $$(printf '%s\n' "$$required" "$$version" | sort -V | head -n1) != "$$required" ]; then \
echo "Rust version $$version is older than required $$required."; \
exit 1; \

258
README.md
View file

@ -5,22 +5,23 @@
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
Kingfisher is a blazingly fast secret-scanning and live validation tool built in Rust. It combines Intel's hardware-accelerated Hyperscan regex engine with language-aware source code parsing, and **ships with hundreds of built-in rules** to detect, validate, and triage secrets before they ever reach production
Kingfisher is a blazingly fast secret-scanning and live validation tool built in Rust. It combines Intel's SIMD-accelerated regex engine (Hyperscan) with language-aware source code parsing, and **ships with hundreds of built-in rules** to detect, validate, and triage secrets before they ever reach production
</p>
Originally forked from Praetorian's Nosey Parker, Kingfisher **adds** live cloud-API validation; many more targets (GitLab, BitBucket, Gitea, S3, Docker, Jira, Confluence, Slack); compressed-file extraction and scanning; baseline and allowlist controls; language-aware detection (~20 languages); and a native Windows binary. See [Origins and Divergence](#origins-and-divergence) for details.
Originally forked from Praetorian's Nosey Parker, Kingfisher has since significantly expanded and diverged, adding live validation, 10+ new scan targets, and major architectural enhancements. See [Origins and Divergence](#origins-and-divergence) for details.
## Key Features
### Multiple Scan Targets
<div align="center">
| Files / Dirs | Local Git | GitHub | GitLab | Azure DevOps | Bitbucket | Gitea |
|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|
| <img src="./docs/assets/icons/files.svg" height="40" alt="Files / Dirs"/><br/><sub>Files / Dirs</sub> | <img src="./docs/assets/icons/local-git.svg" height="40" alt="Local Git"/><br/><sub>Local Git</sub> | <img src="./docs/assets/icons/github.svg" height="40" alt="GitHub"/><br/><sub>GitHub</sub> | <img src="./docs/assets/icons/gitlab.svg" height="40" alt="GitLab"/><br/><sub>GitLab</sub> | <img src="./docs/assets/icons/azure-devops.svg" height="40" alt="Azure DevOps"/><br/><sub>Azure DevOps</sub> | <img src="./docs/assets/icons/bitbucket.svg" height="40" alt="Bitbucket"/><br/><sub>Bitbucket</sub> | <img src="./docs/assets/icons/gitea.svg" height="40" alt="Gitea"/><br/><sub>Gitea</sub> |
| Files / Dirs | Local Git | GitHub | GitLab | Azure Repos | Bitbucket | Gitea | Hugging Face |
|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|:-------------:|
| <img src="./docs/assets/icons/files.svg" height="40" alt="Files / Dirs"/><br/><sub>Files / Dirs</sub> | <img src="./docs/assets/icons/local-git.svg" height="40" alt="Local Git"/><br/><sub>Local Git</sub> | <img src="./docs/assets/icons/github.svg" height="40" alt="GitHub"/><br/><sub>GitHub</sub> | <img src="./docs/assets/icons/gitlab.svg" height="40" alt="GitLab"/><br/><sub>GitLab</sub> | <img src="./docs/assets/icons/azure-devops.svg" height="40" alt="Azure Repos"/><br/><sub>Azure Repos</sub> | <img src="./docs/assets/icons/bitbucket.svg" height="40" alt="Bitbucket"/><br/><sub>Bitbucket</sub> | <img src="./docs/assets/icons/gitea.svg" height="40" alt="Gitea"/><br/><sub>Gitea</sub> |<img src="./docs/assets/icons/huggingface.svg" height="40" width="40" alt="Hugging Face"/><br/><sub>Hugging Face</sub> |
| Docker | Jira | Confluence | Slack | AWS S3 |
|:------:|:----:|:-----------:|:-----:|:------:|
| <img src="./docs/assets/icons/docker.svg" height="40" alt="Docker"/><br/><sub>Docker</sub> | <img src="./docs/assets/icons/jira.svg" height="40" alt="Jira"/><br/><sub>Jira</sub> | <img src="./docs/assets/icons/confluence.svg" height="40" alt="Confluence"/><br/><sub>Confluence</sub> | <img src="./docs/assets/icons/slack.svg" height="40" alt="Slack"/><br/><sub>Slack</sub> | <img src="./docs/assets/icons/aws-s3.svg" height="40" alt="AWS S3"/><br/><sub>AWS&nbsp;S3</sub> |
| Docker | Jira | Confluence | Slack | AWS S3 | Google Cloud |
|:------:|:----:|:-----------:|:-----:|:------:|:---:|
| <img src="./docs/assets/icons/docker.svg" height="40" alt="Docker"/><br/><sub>Docker</sub> | <img src="./docs/assets/icons/jira.svg" height="40" alt="Jira"/><br/><sub>Jira</sub> | <img src="./docs/assets/icons/confluence.svg" height="40" alt="Confluence"/><br/><sub>Confluence</sub> | <img src="./docs/assets/icons/slack.svg" height="40" alt="Slack"/><br/><sub>Slack</sub> | <img src="./docs/assets/icons/aws-s3.svg" height="40" alt="AWS S3"/><br/><sub>AWS&nbsp;S3</sub> | <img src="./docs/assets/icons/gcs.svg" height="40" alt="Google Cloud Storage"/><br/><sub>Cloud Storage</sub> |
</div>
@ -41,17 +42,27 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
<img src="docs/runtime-comparison.png" alt="Kingfisher Runtime Comparison" style="vertical-align: center;" />
</p>
# Table of Contents
<details>
- [Kingfisher](#kingfisher)
- [Key Features](#key-features)
- [Multiple Scan Targets](#multiple-scan-targets)
- [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules)
- [Benchmark Results](#benchmark-results)
- [Table of Contents](#table-of-contents)
- [Getting Started](#getting-started)
- [Installation](#installation)
- [Run Kingfisher in Docker](#run-kingfisher-in-docker)
- [Pre-built Releases](#pre-built-releases)
- [Homebrew](#homebrew)
- [Linux and macOS](#linux-and-macos)
- [Windows](#windows)
- [Compile](#compile)
- [ Run Kingfisher in Docker](#-run-kingfisher-in-docker)
- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance)
- [Write Custom Rules!](#write-custom-rules)
- [Usage](#usage)
- [📝 Write Custom Rules!](#-write-custom-rules)
- [🎉 Usage](#-usage)
- [Basic Examples](#basic-examples)
- [Scan with secret validation](#scan-with-secret-validation)
- [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories)
@ -67,6 +78,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Exclude specific paths](#exclude-specific-paths)
- [Scan changes in CI pipelines](#scan-changes-in-ci-pipelines)
- [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket)
- [ Scanning a Google Cloud Storage Bucket](#-scanning-a-google-cloud-storage-bucket)
- [ Scanning Docker Images](#-scanning-docker-images)
- [ Scanning GitHub](#-scanning-github)
- [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token)
@ -79,8 +91,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url)
- [List GitLab repositories](#list-gitlab-repositories)
- [ Scanning Azure Repos](#-scanning-azure-repos)
- [Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-devops-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat)
- [Scan specific Azure DevOps projects](#scan-specific-azure-devops-projects)
- [Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-repos-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat)
- [Scan specific Azure Repos projects](#scan-specific-azure-repos-projects)
- [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration)
- [List Azure repositories](#list-azure-repositories)
- [ Scanning Gitea](#-scanning-gitea)
@ -97,6 +109,12 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [List Bitbucket repositories](#list-bitbucket-repositories)
- [Authenticate to Bitbucket](#authenticate-to-bitbucket)
- [Self-hosted Bitbucket Server](#self-hosted-bitbucket-server)
- [ Scanning Hugging Face](#-scanning-hugging-face)
- [Scan Hugging Face user](#scan-hugging-face-user)
- [Scan Hugging Face organization](#scan-hugging-face-organization)
- [Scan specific Hugging Face resources](#scan-specific-hugging-face-resources)
- [List Hugging Face repositories](#list-hugging-face-repositories)
- [Authenticate to Hugging Face](#authenticate-to-hugging-face)
- [ Scanning Jira](#-scanning-jira)
- [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query)
- [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues)
@ -107,7 +125,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Environment Variables for Tokens](#environment-variables-for-tokens)
- [Exit Codes](#exit-codes)
- [Update Checks](#update-checks)
- [Advanced Options](#advanced-options)
- [🤓 Advanced Options](#-advanced-options)
- [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets)
- [List Builtin Rules](#list-builtin-rules)
- [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run)
@ -117,6 +135,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Notable Scan Options](#notable-scan-options)
- [Understanding `--confidence`](#understanding---confidence)
- [Ignore known false positives](#ignore-known-false-positives)
- [Skip Canary Tokens (AWS)](#skip-canary-tokens-aws)
- [Common CLI flows](#common-cli-flows)
- [Inline ignore directives](#inline-ignore-directives)
- [Finding Fingerprint](#finding-fingerprint)
- [Rule Performance Profiling](#rule-performance-profiling)
@ -125,36 +145,58 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
- [Roadmap](#roadmap)
- [License](#license)
</details>
# Getting Started
## Installation
### Pre-built Releases
Pre-built binaries are available from the [Releases](https://github.com/mongodb/kingfisher/releases) section.
### Homebrew
On macOS, you can simply
```bash
brew install kingfisher
```
Pre-built binaries are also available on the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page.
### Linux and macOS
You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform:
<details>
You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform.
```bash
# Linux, macOS
curl --silent --location \
https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \
sh && \
ubi --project mongodb/kingfisher --in "$HOME/bin"
ubi --project mongodb/kingfisher --in "$HOME/.local/bin"
```
This installs and runs `ubi` and then places the `kingfisher` executable in `~/.local/bin` on Unix-like systems.
</details>
### Windows
<details>
You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform.
```powershell
# Windows
powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in .
```
This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like
systems (or the current directory on Windows).
This installs and runs `ubi` and then places the `kingfisher` executable in the current directory on Windows.
</details>
Or you may compile for your platform via `make`:
### Compile
You may compile for your platform via `make`
<details>
```bash
# NOTE: Requires Docker
@ -174,9 +216,14 @@ make darwin-all # builds both x64 and arm64
make all # builds for every OS and architecture supported
```
### Run Kingfisher in Docker
</details>
### <img src="./docs/assets/icons/docker.svg" height="40" style="vertical-align:text-bottom;" alt="Docker"/> Run Kingfisher in Docker
Run the dockerized Kingfisher container
<details>
Run the dockerized Kingfisher container:
```bash
# GitHub Container Registry
docker run --rm ghcr.io/mongodb/kingfisher:latest --version
@ -233,6 +280,8 @@ docker run --rm \
```
</details>
# 🔐 Detection Rules at a Glance
Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview:
@ -248,7 +297,7 @@ Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything fr
| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more |
| **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more |
## Write Custom Rules!
## 📝 Write Custom Rules!
Kingfisher ships with hundreds of rules with HTTP and service-specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential.
@ -258,7 +307,7 @@ First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom King
Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required!
# Usage
# 🎉 Usage
## Basic Examples
@ -400,7 +449,7 @@ kingfisher scan ./my-project \
-v
```
## <img alt="GitHub" src="./docs/assets/icons/aws-s3.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning an AWS S3 Bucket
## <img alt="AWS S3" src="./docs/assets/icons/aws-s3.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning an AWS S3 Bucket
You can scan S3 objects directly:
```bash
@ -415,7 +464,7 @@ Credential resolution happens in this order:
If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role.
Examples:
Examples
```bash
# using explicit keys
@ -452,7 +501,30 @@ docker run --rm \
scan --s3-bucket bucket-name
```
## <img alt="Docker" src="./docs/assets/icons/docker.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Docker Images
## <img src="./docs/assets/icons/gcs.svg" height="40" width="40" alt="Google Cloud Storage"/> Scanning a Google Cloud Storage Bucket
The `--gcs-bucket` flag streams objects directly from Google Cloud Storage. Authentication uses
Application Default Credentials, so you can provide a service-account JSON file via the
`GOOGLE_APPLICATION_CREDENTIALS` environment variable or by passing `--gcs-service-account`. Public
buckets work without credentials.
```bash
kingfisher scan --gcs-bucket bucket-name
# scan a sub-tree inside the bucket
kingfisher scan --gcs-bucket bucket-name --gcs-prefix path/to/data/
# supply a service-account key explicitly
kingfisher scan --gcs-bucket bucket-name --gcs-service-account /path/to/key.json
```
Functional example:
```bash
kingfisher scan --gcs-bucket cloud-samples-data --gcs-prefix "storage/"
```
## <img alt="Docker" src="./docs/assets/icons/docker.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Docker Images
Kingfisher will first try to use any locally available image, then fall back to pulling via OCI.
@ -482,7 +554,7 @@ kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws
kingfisher scan --docker-image private.registry.example.com/my-image:tag
```
## <img alt="GitHub" src="./docs/assets/icons/github.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning GitHub
## <img alt="GitHub" src="./docs/assets/icons/github.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning GitHub
### Scan GitHub organization (requires `KF_GITHUB_TOKEN`)
@ -524,7 +596,7 @@ KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/priva
---
## <img alt="GitLab" src="./docs/assets/icons/gitlab.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning GitLab
## <img alt="GitLab" src="./docs/assets/icons/gitlab.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning GitLab
### Scan GitLab group (requires `KF_GITLAB_TOKEN`)
@ -580,18 +652,18 @@ kingfisher gitlab repos list --group my-group --include-subgroups
# skip specific projects when listing or scanning (supports glob patterns)
kingfisher gitlab repos list --group my-group --gitlab-exclude my-group/**/legacy-*
```
## <img alt="Azure Repos" src="./docs/assets/icons/azure-devops.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Azure Repos
## <img alt="Azure Repos" src="./docs/assets/icons/azure-devops.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Azure Repos
### Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)
### Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)
```bash
kingfisher scan --azure-organization my-org
# Azure DevOps Server example
# Azure Repos Server example
KF_AZURE_PAT="pat" kingfisher scan --azure-organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/
```
### Scan specific Azure DevOps projects
### Scan specific Azure Repos projects
Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects.
@ -621,7 +693,7 @@ kingfisher azure repos list --project my-org/app --project my-org/api
# skip specific repositories while listing (supports glob patterns)
kingfisher azure repos list --organization my-org --azure-exclude my-org/**/experimental-*
```
## <img alt="Gitea" src="./docs/assets/icons/gitea.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Gitea
## <img alt="Gitea" src="./docs/assets/icons/gitea.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Gitea
### Scan Gitea organization (requires `KF_GITEA_TOKEN`)
@ -673,7 +745,7 @@ KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --all-gitea-organizations
# self-hosted example
KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/
```
## <img alt="Bitbucket" src="./docs/assets/icons/bitbucket.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Bitbucket
## <img alt="Bitbucket" src="./docs/assets/icons/bitbucket.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Bitbucket
### Scan Bitbucket workspace
```bash
@ -745,7 +817,45 @@ Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, fo
`https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with
`--bitbucket-username` and `--bitbucket-token`, and pass `--ignore-certs` when
connecting to HTTP or otherwise insecure instances.
## <img alt="Jira" src="./docs/assets/icons/jira.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Jira
## <img src="./docs/assets/icons/huggingface.svg" height="40" width="40" alt="Hugging Face"/> Scanning Hugging Face
Hugging Face hosts git repositories for models, datasets, and Spaces. Kingfisher can enumerate and scan all three resource types.
### Scan Hugging Face user
```bash
kingfisher scan --huggingface-user <username>
```
### Scan Hugging Face organization
```bash
kingfisher scan --huggingface-organization <orgname>
```
### Scan specific Hugging Face resources
Scan individual repositories by ID (owner/name) or by passing the full HTTPS URL:
```bash
kingfisher scan --huggingface-model <owner/model>
kingfisher scan --huggingface-dataset https://huggingface.co/datasets/<owner>/<dataset>
kingfisher scan --huggingface-space <owner/space>
```
Use `--huggingface-exclude` to omit results returned by user or organization enumeration. Prefix values with `model:`, `dataset:`, or `space:` when you only want to skip a specific resource type.
### List Hugging Face repositories
```bash
kingfisher huggingface repos list --huggingface-user <username>
```
### Authenticate to Hugging Face
Private repositories require an access token provided through the `KF_HUGGINGFACE_TOKEN` environment variable. For git authentication the helper also honours `KF_HUGGINGFACE_USERNAME` (default `hf_user`).
## <img alt="Jira" src="./docs/assets/icons/jira.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Jira
### Scan Jira issues matching a JQL query
@ -764,7 +874,7 @@ KF_JIRA_TOKEN="token" kingfisher scan \
--max-results 1000
```
## <img alt="Confluence" src="./docs/assets/icons/confluence.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Confluence
## <img alt="Confluence" src="./docs/assets/icons/confluence.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Confluence
### Scan Confluence pages matching a CQL query
```bash
@ -789,7 +899,7 @@ Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environ
To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions.
## <img alt="Slack" src="./docs/assets/icons/slack.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Slack
## <img alt="Slack" src="./docs/assets/icons/slack.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Slack
### Scan Slack messages matching a search query
```bash
@ -811,11 +921,13 @@ KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \
| `KF_GITLAB_TOKEN` | GitLab Personal Access Token |
| `KF_GITEA_TOKEN` | Gitea Personal Access Token |
| `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) |
| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure DevOps Personal Access Token |
| `KF_AZURE_USERNAME` | Username to use with Azure DevOps PATs (defaults to `pat` when unset) |
| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure Repos Personal Access Token |
| `KF_AZURE_USERNAME` | Username to use with Azure Repos PATs (defaults to `pat` when unset) |
| `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication |
| `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token |
| `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token |
| `KF_HUGGINGFACE_TOKEN` | Hugging Face access token for API enumeration and git cloning |
| `KF_HUGGINGFACE_USERNAME` | Optional username for Hugging Face git operations (defaults to `hf_user`) |
| `KF_JIRA_TOKEN` | Jira API token |
| `KF_CONFLUENCE_TOKEN` | Confluence API token |
| `KF_SLACK_TOKEN` | Slack API token |
@ -869,7 +981,7 @@ Kingfisher automatically queries GitHub for a newer release when it starts and t
- **Disable version checks** Pass `--no-update-check` to skip both the startup and shutdown checks entirely
# Advanced Options
# 🤓 Advanced Options
## Build a Baseline / Detect New Secrets
@ -963,6 +1075,8 @@ leaves the default unchanged.
- `--manage-baseline`: Create or update the baseline file with current findings
- `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable)
- `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable)
- `--skip-aws-account <ACCOUNT_ID>`: Skip live AWS validation for findings tied to the specified AWS account number (repeatable, accepts comma-separated lists)
- `--skip-aws-account-file <FILE>`: Load AWS account numbers to skip from a file (one account per line; `#` comments allowed)
- `--ignore-comment <DIRECTIVE>`: Honor additional inline directives from other scanners (repeatable; e.g. `--ignore-comment "gitleaks:allow"`)
- `--no-ignore`: Disable inline directives entirely so every match is reported
## Understanding `--confidence`
@ -996,6 +1110,66 @@ kingfisher scan \
If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early.
### Skip Canary Tokens (AWS)
Canary/honey tokens are intentionally leaked credentials used to catch misuse. Kingfisher can **recognize and skip** known AWS canary accounts so hygiene scans don't set off alerts.
**How to skip**
Pass the 12-digit AWS account IDs for your canaries via `--skip-aws-account` (comma-separated) or `--skip-aws-account-file` (one ID per line; blank lines and `#` comments allowed). Kingfisher also ships with a **pre-seeded (but not exhaustive)** list of Thinkst Canary account IDs used by canarytokens.org, so many are skipped automatically.
```bash
kingfisher scan /path/to/code \
--skip-aws-account "171436882533,534261010715"
# or combine preloaded canary IDs with a just-created decoy account
printf '999900001111 \n534261010715' > /tmp/canary_accounts.txt
kingfisher scan /path/to/repo \
--skip-aws-account-file /tmp/canary_accounts.txt
```
**What you'll see**
Findings tied to a skip-listed account report `Validation: Not Attempted` and note in the `Response:` that the entry came from the skip list:
```bash
AWS SECRET ACCESS KEY => [KINGFISHER.AWS.2]
|Finding.......: <REDACTED>
|Fingerprint...: 2141074333616819500
|Confidence....: medium
|Entropy.......: 5.00
|Validation....: Not Attempted
|__Response....: (skip list entry) AWS validation not attempted for account 171436882533.
|Language......: Unknown
|Line Num......: 21
|Path..........: /tmp/test_canary_accounts.log
```
**Why this matters**
Skipping prevents noisy tripwires in prod telemetry while keeping the status explicit—“Not Attempted” isn't a pass. If needed, verify these credentials out-of-band or with a safe, non-triggering method.
#### Common CLI flows
```bash
# Skip a few in-house canaries during a filesystem scan
kingfisher scan repo/ \
--skip-aws-account "111122223333,444455556666"
# Read a longer list from disk
kingfisher scan repo/ \
--skip-aws-account-file /tmp/scripts/canary_accounts.txt
# Combine preloaded canary IDs with a just-created decoy account
printf '999900001111\n534261010715\n' > /tmp/new_canary.txt
kingfisher scan /path/to/repo \
--skip-aws-account-file /tmp/new_canary.txt
```
Tip: if you manage multiple canary fleets (Thinkst, self-hosted alternatives, or bespoke decoys), checkpoint the account IDs alongside your infrastructure-as-code so security teams can rotate or expand the skip list without editing pipelines.
### Inline ignore directives
Add `kingfisher:ignore` anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value:

38
docs/assets/icons/gcs.svg Normal file
View file

@ -0,0 +1,38 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg id="standard_product_icon" xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 512 512">
<!-- Generator: Adobe Illustrator 29.1.0, SVG Export Plug-In . SVG Version: 2.1.0 Build 142) -->
<defs>
<style>
.st0 {
fill: none;
}
.st1 {
fill: #4285f4;
}
.st2 {
fill: #34a853;
}
.st3 {
fill: #fbbc04;
}
.st4 {
fill: #ea4335;
}
</style>
</defs>
<g id="bounding_box">
<rect class="st0" width="512" height="512"/>
</g>
<g id="art">
<path class="st2" d="M442,277.9H70c-8.8,0-16,7.2-16,16v148.1c0,8.8,7.2,16,16,16h107.2c1.5.5,3.1.7,4.8.7s3.3-.3,4.8-.7h255.2c8.8,0,16-7.2,16-16v-148.1c0-8.8-7.2-16-16-16ZM86,309.9h80v116.1h-80v-116.1ZM426,425.9h-228v-116.1h228v116.1Z"/>
<path class="st3" d="M442,54H70c-8.8,0-16,7.2-16,16v148.8c0,8.8,7.2,16,16,16h372c8.8,0,16-7.2,16-16V70c0-8.8-7.2-16-16-16ZM86,86h80v116.8h-80v-116.8ZM426,202.8h-228v-116.8h228v116.8Z"/>
<path class="st4" d="M442,234.8h-16V86H54v-16c0-8.8,7.2-16,16-16h372c8.8,0,16,7.2,16,16v148.8c0,8.8-7.2,16-16,16Z"/>
<path class="st1" d="M442,457.9h-16v-148.1H54v-16c0-8.8,7.2-16,16-16h372c8.8,0,16,7.2,16,16v148.1c0,8.8-7.2,16-16,16Z"/>
<circle class="st4" cx="349" cy="144.4" r="37"/>
<circle class="st1" cx="349" cy="367.9" r="37"/>
</g>
</svg>

After

Width:  |  Height:  |  Size: 1.4 KiB

View file

@ -0,0 +1,41 @@
<svg xmlns="http://www.w3.org/2000/svg" width="95" height="88" fill="none">
<path
fill="#fff"
d="M94.25 70.08a8.28 8.28 0 0 1-.43 6.46 10.57 10.57 0 0 1-3 3.6 25.18 25.18 0 0 1-5.7 3.2 65.74 65.74 0 0 1-7.56 2.65 46.67 46.67 0 0 1-11.42 1.68c-5.42.05-10.09-1.23-13.4-4.5a40.4 40.4 0 0 1-10.14.03c-3.34 3.25-7.99 4.52-13.39 4.47a46.82 46.82 0 0 1-11.43-1.68 66.37 66.37 0 0 1-7.55-2.65c-2.28-.98-4.17-2-5.68-3.2a10.5 10.5 0 0 1-3.02-3.6c-.99-2-1.18-4.3-.42-6.46a8.54 8.54 0 0 1-.33-5.63c.25-.95.66-1.83 1.18-2.61a8.67 8.67 0 0 1 2.1-8.47 8.23 8.23 0 0 1 2.82-2.07 41.75 41.75 0 1 1 81.3-.12 8.27 8.27 0 0 1 3.11 2.19 8.7 8.7 0 0 1 2.1 8.47c.52.78.93 1.66 1.18 2.61a8.61 8.61 0 0 1-.32 5.63Z"
/>
<path fill="#FFD21E" d="M47.21 76.5a34.75 34.75 0 1 0 0-69.5 34.75 34.75 0 0 0 0 69.5Z" />
<path
fill="#FF9D0B"
d="M81.96 41.75a34.75 34.75 0 1 0-69.5 0 34.75 34.75 0 0 0 69.5 0Zm-73.5 0a38.75 38.75 0 1 1 77.5 0 38.75 38.75 0 0 1-77.5 0Z"
/>
<path
fill="#3A3B45"
d="M58.5 32.3c1.28.44 1.78 3.06 3.07 2.38a5 5 0 1 0-6.76-2.07c.61 1.15 2.55-.72 3.7-.32ZM34.95 32.3c-1.28.44-1.79 3.06-3.07 2.38a5 5 0 1 1 6.76-2.07c-.61 1.15-2.56-.72-3.7-.32Z"
/>
<path
fill="#FF323D"
d="M46.96 56.29c9.83 0 13-8.76 13-13.26 0-2.34-1.57-1.6-4.09-.36-2.33 1.15-5.46 2.74-8.9 2.74-7.19 0-13-6.88-13-2.38s3.16 13.26 13 13.26Z"
/>
<path
fill="#3A3B45"
fill-rule="evenodd"
d="M39.43 54a8.7 8.7 0 0 1 5.3-4.49c.4-.12.81.57 1.24 1.28.4.68.82 1.37 1.24 1.37.45 0 .9-.68 1.33-1.35.45-.7.89-1.38 1.32-1.25a8.61 8.61 0 0 1 5 4.17c3.73-2.94 5.1-7.74 5.1-10.7 0-2.34-1.57-1.6-4.09-.36l-.14.07c-2.31 1.15-5.39 2.67-8.77 2.67s-6.45-1.52-8.77-2.67c-2.6-1.29-4.23-2.1-4.23.29 0 3.05 1.46 8.06 5.47 10.97Z"
clip-rule="evenodd"
/>
<path
fill="#FF9D0B"
d="M70.71 37a3.25 3.25 0 1 0 0-6.5 3.25 3.25 0 0 0 0 6.5ZM24.21 37a3.25 3.25 0 1 0 0-6.5 3.25 3.25 0 0 0 0 6.5ZM17.52 48c-1.62 0-3.06.66-4.07 1.87a5.97 5.97 0 0 0-1.33 3.76 7.1 7.1 0 0 0-1.94-.3c-1.55 0-2.95.59-3.94 1.66a5.8 5.8 0 0 0-.8 7 5.3 5.3 0 0 0-1.79 2.82c-.24.9-.48 2.8.8 4.74a5.22 5.22 0 0 0-.37 5.02c1.02 2.32 3.57 4.14 8.52 6.1 3.07 1.22 5.89 2 5.91 2.01a44.33 44.33 0 0 0 10.93 1.6c5.86 0 10.05-1.8 12.46-5.34 3.88-5.69 3.33-10.9-1.7-15.92-2.77-2.78-4.62-6.87-5-7.77-.78-2.66-2.84-5.62-6.25-5.62a5.7 5.7 0 0 0-4.6 2.46c-1-1.26-1.98-2.25-2.86-2.82A7.4 7.4 0 0 0 17.52 48Zm0 4c.51 0 1.14.22 1.82.65 2.14 1.36 6.25 8.43 7.76 11.18.5.92 1.37 1.31 2.14 1.31 1.55 0 2.75-1.53.15-3.48-3.92-2.93-2.55-7.72-.68-8.01.08-.02.17-.02.24-.02 1.7 0 2.45 2.93 2.45 2.93s2.2 5.52 5.98 9.3c3.77 3.77 3.97 6.8 1.22 10.83-1.88 2.75-5.47 3.58-9.16 3.58-3.81 0-7.73-.9-9.92-1.46-.11-.03-13.45-3.8-11.76-7 .28-.54.75-.76 1.34-.76 2.38 0 6.7 3.54 8.57 3.54.41 0 .7-.17.83-.6.79-2.85-12.06-4.05-10.98-8.17.2-.73.71-1.02 1.44-1.02 3.14 0 10.2 5.53 11.68 5.53.11 0 .2-.03.24-.1.74-1.2.33-2.04-4.9-5.2-5.21-3.16-8.88-5.06-6.8-7.33.24-.26.58-.38 1-.38 3.17 0 10.66 6.82 10.66 6.82s2.02 2.1 3.25 2.1c.28 0 .52-.1.68-.38.86-1.46-8.06-8.22-8.56-11.01-.34-1.9.24-2.85 1.31-2.85Z"
/>
<path
fill="#FFD21E"
d="M38.6 76.69c2.75-4.04 2.55-7.07-1.22-10.84-3.78-3.77-5.98-9.3-5.98-9.3s-.82-3.2-2.69-2.9c-1.87.3-3.24 5.08.68 8.01 3.91 2.93-.78 4.92-2.29 2.17-1.5-2.75-5.62-9.82-7.76-11.18-2.13-1.35-3.63-.6-3.13 2.2.5 2.79 9.43 9.55 8.56 11-.87 1.47-3.93-1.71-3.93-1.71s-9.57-8.71-11.66-6.44c-2.08 2.27 1.59 4.17 6.8 7.33 5.23 3.16 5.64 4 4.9 5.2-.75 1.2-12.28-8.53-13.36-4.4-1.08 4.11 11.77 5.3 10.98 8.15-.8 2.85-9.06-5.38-10.74-2.18-1.7 3.21 11.65 6.98 11.76 7.01 4.3 1.12 15.25 3.49 19.08-2.12Z"
/>
<path
fill="#FF9D0B"
d="M77.4 48c1.62 0 3.07.66 4.07 1.87a5.97 5.97 0 0 1 1.33 3.76 7.1 7.1 0 0 1 1.95-.3c1.55 0 2.95.59 3.94 1.66a5.8 5.8 0 0 1 .8 7 5.3 5.3 0 0 1 1.78 2.82c.24.9.48 2.8-.8 4.74a5.22 5.22 0 0 1 .37 5.02c-1.02 2.32-3.57 4.14-8.51 6.1-3.08 1.22-5.9 2-5.92 2.01a44.33 44.33 0 0 1-10.93 1.6c-5.86 0-10.05-1.8-12.46-5.34-3.88-5.69-3.33-10.9 1.7-15.92 2.78-2.78 4.63-6.87 5.01-7.77.78-2.66 2.83-5.62 6.24-5.62a5.7 5.7 0 0 1 4.6 2.46c1-1.26 1.98-2.25 2.87-2.82A7.4 7.4 0 0 1 77.4 48Zm0 4c-.51 0-1.13.22-1.82.65-2.13 1.36-6.25 8.43-7.76 11.18a2.43 2.43 0 0 1-2.14 1.31c-1.54 0-2.75-1.53-.14-3.48 3.91-2.93 2.54-7.72.67-8.01a1.54 1.54 0 0 0-.24-.02c-1.7 0-2.45 2.93-2.45 2.93s-2.2 5.52-5.97 9.3c-3.78 3.77-3.98 6.8-1.22 10.83 1.87 2.75 5.47 3.58 9.15 3.58 3.82 0 7.73-.9 9.93-1.46.1-.03 13.45-3.8 11.76-7-.29-.54-.75-.76-1.34-.76-2.38 0-6.71 3.54-8.57 3.54-.42 0-.71-.17-.83-.6-.8-2.85 12.05-4.05 10.97-8.17-.19-.73-.7-1.02-1.44-1.02-3.14 0-10.2 5.53-11.68 5.53-.1 0-.19-.03-.23-.1-.74-1.2-.34-2.04 4.88-5.2 5.23-3.16 8.9-5.06 6.8-7.33-.23-.26-.57-.38-.98-.38-3.18 0-10.67 6.82-10.67 6.82s-2.02 2.1-3.24 2.1a.74.74 0 0 1-.68-.38c-.87-1.46 8.05-8.22 8.55-11.01.34-1.9-.24-2.85-1.31-2.85Z"
/>
<path
fill="#FFD21E"
d="M56.33 76.69c-2.75-4.04-2.56-7.07 1.22-10.84 3.77-3.77 5.97-9.3 5.97-9.3s.82-3.2 2.7-2.9c1.86.3 3.23 5.08-.68 8.01-3.92 2.93.78 4.92 2.28 2.17 1.51-2.75 5.63-9.82 7.76-11.18 2.13-1.35 3.64-.6 3.13 2.2-.5 2.79-9.42 9.55-8.55 11 .86 1.47 3.92-1.71 3.92-1.71s9.58-8.71 11.66-6.44c2.08 2.27-1.58 4.17-6.8 7.33-5.23 3.16-5.63 4-4.9 5.2.75 1.2 12.28-8.53 13.36-4.4 1.08 4.11-11.76 5.3-10.97 8.15.8 2.85 9.05-5.38 10.74-2.18 1.69 3.21-11.65 6.98-11.76 7.01-4.31 1.12-15.26 3.49-19.08-2.12Z"
/>
</svg>

After

Width:  |  Height:  |  Size: 5.1 KiB

View file

@ -0,0 +1,73 @@
use clap::{Args, Subcommand};
use crate::cli::commands::output::OutputArgs;
use super::github::GitHubOutputFormat;
/// Top-level Hugging Face command group
// Argument container for the `huggingface` subcommand; it carries nothing but
// the nested subcommand selected by the user.
#[derive(Args, Debug)]
pub struct HuggingFaceArgs {
// The nested `HuggingFaceCommand` parsed by clap.
#[command(subcommand)]
pub command: HuggingFaceCommand,
}
// First-level subcommands available under the Hugging Face command group.
// The `///` comment on the variant is left untouched because clap derives
// its help text from it.
#[derive(Subcommand, Debug)]
pub enum HuggingFaceCommand {
/// Interact with Hugging Face repositories
#[command(subcommand)]
Repos(HuggingFaceReposCommand),
}
// Actions available under the `Repos` subcommand; currently only `List`.
#[derive(Subcommand, Debug)]
pub enum HuggingFaceReposCommand {
/// List Hugging Face repositories
List(HuggingFaceReposListArgs),
}
// Arguments for the `List` action: which repositories to enumerate plus the
// shared output options. Both groups are flattened into the same flag namespace.
#[derive(Args, Debug, Clone)]
pub struct HuggingFaceReposListArgs {
// Repository selection flags (`--huggingface-user`, `--huggingface-model`, ...).
#[command(flatten)]
pub repo_specifiers: HuggingFaceRepoSpecifiers,
// Output options, parameterized by the Hugging Face output format alias.
#[command(flatten)]
pub output_args: OutputArgs<HuggingFaceOutputFormat>,
}
// Flags that select which Hugging Face repositories are enumerated.
// Every flag is repeatable (each field is a `Vec<String>`). The `///` comments
// are kept verbatim because clap derives the help text from them.
// Note: `exclude` only filters; `is_empty()` on this type does not look at it.
#[derive(Args, Debug, Clone, Default)]
pub struct HuggingFaceRepoSpecifiers {
/// Models, datasets, and Spaces owned by these users
#[arg(long = "huggingface-user")]
pub user: Vec<String>,
/// Models, datasets, and Spaces owned by these organizations
#[arg(long = "huggingface-organization", alias = "huggingface-org")]
pub organization: Vec<String>,
/// Specific models to scan (format: owner/name or full URL)
#[arg(long = "huggingface-model")]
pub model: Vec<String>,
/// Specific datasets to scan (format: owner/name or full URL)
#[arg(long = "huggingface-dataset")]
pub dataset: Vec<String>,
/// Specific Spaces to scan (format: owner/name or full URL)
#[arg(long = "huggingface-space")]
pub space: Vec<String>,
/// Skip specific repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
#[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
pub exclude: Vec<String>,
}
impl HuggingFaceRepoSpecifiers {
    /// Returns `true` when no user, organization, model, dataset, or Space
    /// was requested.
    ///
    /// The `exclude` list is not consulted, so a value that only carries
    /// exclusions is still reported as empty.
    pub fn is_empty(&self) -> bool {
        let selectors =
            [&self.user, &self.organization, &self.model, &self.dataset, &self.space];
        selectors.iter().all(|values| values.is_empty())
    }
}
// The Hugging Face listing reuses the GitHub output-format type rather than
// declaring its own.
pub type HuggingFaceOutputFormat = GitHubOutputFormat;

View file

@ -28,6 +28,11 @@ pub struct InputSpecifierArgs {
"gitlab_group",
"gitea_user",
"gitea_organization",
"huggingface_user",
"huggingface_organization",
"huggingface_model",
"huggingface_dataset",
"huggingface_space",
"bitbucket_user",
"bitbucket_workspace",
"bitbucket_project",
@ -43,7 +48,8 @@ pub struct InputSpecifierArgs {
"confluence_url",
"docker_image",
"slack_query",
"s3_bucket"
"s3_bucket",
"gcs_bucket"
]),
num_args = 0..,
value_hint = ValueHint::AnyPath
@ -120,6 +126,30 @@ pub struct InputSpecifierArgs {
#[arg(long, alias = "include-subgroups")]
pub gitlab_include_subgroups: bool,
/// Scan models, datasets, and Spaces belonging to the specified Hugging Face users
#[arg(long = "huggingface-user")]
pub huggingface_user: Vec<String>,
/// Scan models, datasets, and Spaces belonging to the specified Hugging Face organizations
#[arg(long = "huggingface-organization", alias = "huggingface-org")]
pub huggingface_organization: Vec<String>,
/// Scan a specific Hugging Face model (format: owner/name or full URL)
#[arg(long = "huggingface-model")]
pub huggingface_model: Vec<String>,
/// Scan a specific Hugging Face dataset (format: owner/name or full URL)
#[arg(long = "huggingface-dataset")]
pub huggingface_dataset: Vec<String>,
/// Scan a specific Hugging Face Space (format: owner/name or full URL)
#[arg(long = "huggingface-space")]
pub huggingface_space: Vec<String>,
/// Skip specific Hugging Face repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
#[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
pub huggingface_exclude: Vec<String>,
// Gitea Options
/// Scan repositories belonging to the specified Gitea user
#[arg(long)]
@ -256,6 +286,18 @@ pub struct InputSpecifierArgs {
#[arg(long, requires = "s3_bucket")]
pub aws_local_profile: Option<String>,
/// Scan the specified Google Cloud Storage bucket
#[arg(long)]
pub gcs_bucket: Option<String>,
/// Optional prefix within the GCS bucket
#[arg(long, requires = "gcs_bucket")]
pub gcs_prefix: Option<String>,
/// Path to a service account JSON file for GCS authentication
#[arg(long, value_hint = ValueHint::FilePath, requires = "gcs_bucket")]
pub gcs_service_account: Option<PathBuf>,
/// Docker/OCI images to scan (no local Docker required)
#[arg(long = "docker-image")]
pub docker_image: Vec<String>,
@ -299,7 +341,6 @@ pub struct ContentFilteringArgs {
#[arg(
long = "max-file-size",
visible_alias = "max-filesize", // also show in --help
// alias = "max-filesize", // use this instead if you DONT want it shown in --help
default_value_t = 256.0,
value_name = "MB"
)]

View file

@ -3,6 +3,7 @@ pub mod bitbucket;
pub mod gitea;
pub mod github;
pub mod gitlab;
pub mod huggingface;
pub mod inputs;
pub mod output;
pub mod rules;

View file

@ -1,4 +1,5 @@
use clap::{Args, ValueEnum};
use std::path::PathBuf;
use strum::Display;
use tracing::debug;
@ -119,6 +120,14 @@ pub struct ScanArgs {
#[arg(long = "skip-word", value_name = "WORD")]
pub skip_word: Vec<String>,
/// AWS account IDs whose findings should skip live credential validation (repeatable)
#[arg(long = "skip-aws-account", value_name = "ACCOUNT_ID", value_delimiter = ',')]
pub skip_aws_account: Vec<String>,
/// File containing AWS account IDs to skip (one per line, `#` comments ignored)
#[arg(long = "skip-aws-account-file", value_name = "FILE")]
pub skip_aws_account_file: Option<PathBuf>,
/// Additional inline ignore directives to recognise (repeatable)
#[arg(long = "ignore-comment", value_name = "DIRECTIVE")]
pub extra_ignore_comments: Vec<String>,

View file

@ -8,7 +8,7 @@ use tracing::Level;
use crate::cli::commands::{
azure::AzureArgs, bitbucket::BitbucketArgs, gitea::GiteaArgs, github::GitHubArgs,
gitlab::GitLabArgs, rules::RulesArgs, scan::ScanArgs,
gitlab::GitLabArgs, huggingface::HuggingFaceArgs, rules::RulesArgs, scan::ScanArgs,
};
#[deny(missing_docs)]
@ -81,6 +81,10 @@ pub enum Command {
#[command(name = "azure")]
Azure(AzureArgs),
/// Interact with the Hugging Face Hub
#[command(name = "huggingface")]
HuggingFace(HuggingFaceArgs),
/// Manage rules
#[command(alias = "rule")]
Rules(RulesArgs),

96
src/gcs.rs Normal file
View file

@ -0,0 +1,96 @@
use std::path::Path;
use anyhow::{Context, Result};
use gcloud_storage::{
client::{google_cloud_auth::credentials::CredentialsFile, Client, ClientConfig},
http::objects::{
download::Range,
get::GetObjectRequest,
list::{ListObjectsRequest, ListObjectsResponse},
},
};
use tracing::debug;
/// Visit every object in the given GCS bucket, optionally filtered by prefix.
///
/// Authentication is attempted via Application Default Credentials. When that
/// fails and no explicit service account path was provided, the client falls
/// back to anonymous access so public buckets can still be scanned.
pub async fn visit_bucket_objects<F>(
bucket: &str,
prefix: Option<&str>,
service_account_path: Option<&Path>,
mut visitor: F,
) -> Result<()>
where
F: FnMut(String, Vec<u8>) -> Result<()>,
{
let config_result = if let Some(path) = service_account_path {
let credentials = CredentialsFile::new_from_file(path.to_string_lossy().into_owned())
.await
.with_context(|| {
format!("Failed to read GCS service account credentials from {}", path.display())
})?;
ClientConfig::default().with_credentials(credentials).await
} else {
ClientConfig::default().with_auth().await
};
let config = match config_result {
Ok(config) => config,
Err(err) => {
if service_account_path.is_some()
|| std::env::var("GOOGLE_APPLICATION_CREDENTIALS").is_ok()
|| std::env::var("GOOGLE_APPLICATION_CREDENTIALS_JSON").is_ok()
{
return Err(err)
.context("Failed to authenticate with GCS using provided credentials");
}
debug!("Falling back to anonymous GCS access: {err}");
ClientConfig::default().anonymous()
}
};
let client = Client::new(config);
let mut page_token: Option<String> = None;
loop {
let request = ListObjectsRequest {
bucket: bucket.to_string(),
prefix: prefix.map(|p| p.to_string()),
page_token: page_token.clone(),
..ListObjectsRequest::default()
};
let mut response: ListObjectsResponse = client
.list_objects(&request)
.await
.with_context(|| format!("Failed to list objects in bucket {bucket}"))?;
if let Some(items) = response.items.take() {
for object in items.into_iter().filter(|o| !o.name.is_empty()) {
let data = client
.download_object(
&GetObjectRequest {
bucket: bucket.to_string(),
object: object.name.clone(),
..GetObjectRequest::default()
},
&Range::default(),
)
.await
.with_context(|| format!("Failed to fetch object {}", object.name))?;
visitor(object.name, data)?;
}
}
match response.next_page_token {
Some(token) if !token.is_empty() => page_token = Some(token),
_ => break,
}
}
Ok(())
}

View file

@ -40,6 +40,15 @@ const AZURE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_azcreds() {
fi
}; _azcreds"#;
// Inline git credential helper, passed to git as a `-c` configuration value.
// When `KF_HUGGINGFACE_TOKEN` is non-empty it prints the token as the password
// and `KF_HUGGINGFACE_USERNAME` (defaulting to `hf_user`) as the username;
// with an empty token it prints nothing.
const HUGGINGFACE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_hfcreds() {
token="$KF_HUGGINGFACE_TOKEN";
if [ -n "$token" ]; then
user="${KF_HUGGINGFACE_USERNAME:-hf_user}";
echo username="$user";
echo password="$token";
fi
}; _hfcreds"#;
/// Represents errors that can occur when interacting with the `git` CLI.
#[derive(Debug, thiserror::Error)]
pub enum GitError {
@ -47,14 +56,37 @@ pub enum GitError {
IOError(#[from] std::io::Error),
#[error(
"git execution failed\ncode={}\nstdout=```\n{}```\nstderr=```\n{}```",
.status,
String::from_utf8_lossy(.stdout),
String::from_utf8_lossy(.stderr)
"git execution failed (status: {status}){summary}",
status = format_exit_status(.status),
summary = format_git_error_summary(.stdout.as_slice(), .stderr.as_slice())
)]
GitError { stdout: Vec<u8>, stderr: Vec<u8>, status: ExitStatus },
}
/// Render an exit status for error messages: the numeric exit code when one
/// exists, otherwise the status' own `Display` text.
fn format_exit_status(status: &ExitStatus) -> String {
    match status.code() {
        Some(code) => code.to_string(),
        None => status.to_string(),
    }
}
fn format_git_error_summary(stdout: &[u8], stderr: &[u8]) -> String {
let mut messages = Vec::new();
if let Some(line) = summarize_output(stderr) {
messages.push(line);
}
if let Some(line) = summarize_output(stdout) {
messages.push(line);
}
if messages.is_empty() {
String::new()
} else {
format!(": {}", messages.join(" | "))
}
}
/// Return the first non-blank line of `output`, trimmed, or `None` when the
/// output has no such line. Invalid UTF-8 is decoded lossily.
fn summarize_output(output: &[u8]) -> Option<String> {
    let decoded = String::from_utf8_lossy(output);
    for raw_line in decoded.lines() {
        let line = raw_line.trim();
        if !line.is_empty() {
            return Some(line.to_owned());
        }
    }
    None
}
/// A helper struct for running `git` commands.
///
/// It supports optional GitHub, GitLab, Gitea, and Bitbucket credentials passed via
@ -91,6 +123,8 @@ impl Git {
let has_azure_token = ["KF_AZURE_TOKEN", "KF_AZURE_PAT"]
.iter()
.any(|key| matches!(std::env::var(key), Ok(value) if !value.is_empty()));
let has_huggingface_token =
matches!(std::env::var("KF_HUGGINGFACE_TOKEN"), Ok(value) if !value.is_empty());
// If credentials are provided via environment variables, clear existing helpers first.
if has_github_token
@ -98,6 +132,7 @@ impl Git {
|| has_gitea_token
|| has_bitbucket_credentials
|| has_azure_token
|| has_huggingface_token
{
credentials.push("-c".into());
credentials.push(r#"credential.helper="#.into());
@ -136,6 +171,11 @@ impl Git {
credentials.push(AZURE_CREDENTIAL_HELPER.into());
}
if has_huggingface_token {
credentials.push("-c".into());
credentials.push(HUGGINGFACE_CREDENTIAL_HELPER.into());
}
Self { credentials, ignore_certs }
}

636
src/huggingface.rs Normal file
View file

@ -0,0 +1,636 @@
use std::{collections::HashSet, env, time::Duration};
use anyhow::{anyhow, Result};
use indicatif::{ProgressBar, ProgressStyle};
use reqwest::{header::LINK, StatusCode, Url};
use serde::Deserialize;
use serde_json::Value;
use tracing::{debug, warn};
use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT};
/// Which Hugging Face repositories should be enumerated.
#[derive(Debug, Clone, Default)]
pub struct RepoSpecifiers {
    /// Users whose models, datasets, and Spaces are enumerated.
    pub user: Vec<String>,
    /// Organizations whose models, datasets, and Spaces are enumerated.
    pub organization: Vec<String>,
    /// Explicit model identifiers.
    pub model: Vec<String>,
    /// Explicit dataset identifiers.
    pub dataset: Vec<String>,
    /// Explicit Space identifiers.
    pub space: Vec<String>,
    /// Identifiers to drop from the enumeration result.
    pub exclude: Vec<String>,
}

impl RepoSpecifiers {
    /// Returns `true` when nothing was selected for enumeration.
    ///
    /// Only the five selector lists are checked; `exclude` is not, so a
    /// value holding nothing but exclusions is still empty.
    pub fn is_empty(&self) -> bool {
        let selectors =
            [&self.user, &self.organization, &self.model, &self.dataset, &self.space];
        selectors.iter().all(|list| list.is_empty())
    }
}
/// Authentication settings for Hugging Face API requests.
#[derive(Clone, Default)]
pub struct AuthConfig {
    /// Optional access token; `None` means requests are sent unauthenticated.
    token: Option<String>,
}

// `Debug` is written by hand so the complete token never appears in debug
// output: only its first four characters are shown.
impl std::fmt::Debug for AuthConfig {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Collect the prefix directly; wrapping the collected `String` in
        // `format!("{}", ..)` only added a needless second allocation.
        let token_prefix: Option<String> =
            self.token.as_ref().map(|token| token.chars().take(4).collect());
        f.debug_struct("AuthConfig").field("token", &token_prefix).finish()
    }
}
impl AuthConfig {
    /// Read the token from `KF_HUGGINGFACE_TOKEN`.
    ///
    /// An unset, unreadable, or whitespace-only variable yields a
    /// configuration without a token. The value itself is stored untrimmed.
    pub fn from_env() -> Self {
        let token = match env::var("KF_HUGGINGFACE_TOKEN") {
            Ok(value) if !value.trim().is_empty() => Some(value),
            _ => None,
        };
        Self { token }
    }

    /// Attach the token as bearer authentication when one is configured;
    /// otherwise return the request unchanged.
    fn apply(&self, request: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
        match &self.token {
            Some(token) => request.bearer_auth(token),
            None => request,
        }
    }

    /// Whether a token is configured.
    fn has_token(&self) -> bool {
        matches!(self.token, Some(_))
    }
}
/// The three repository flavours hosted on the Hugging Face Hub.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
enum ResourceKind {
    Model,
    Dataset,
    Space,
}

impl ResourceKind {
    /// Path segment appended to the API base URL when listing this kind.
    fn api_path(self) -> &'static str {
        match self {
            Self::Model => "models",
            Self::Dataset => "datasets",
            Self::Space => "spaces",
        }
    }

    /// Clone URL for `slug` (`owner/name`). Models live at the site root,
    /// datasets and Spaces under their own path prefix.
    fn git_url(self, slug: &str) -> String {
        let repo_path = match self {
            Self::Model => slug.to_string(),
            Self::Dataset => format!("datasets/{slug}"),
            Self::Space => format!("spaces/{slug}"),
        };
        format!("https://huggingface.co/{repo_path}.git")
    }

    /// Prefix used when building typed keys such as `model:owner/name`.
    fn canonical_prefix(self) -> &'static str {
        match self {
            Self::Model => "model",
            Self::Dataset => "dataset",
            Self::Space => "space",
        }
    }

    /// Singular noun for log messages.
    fn display_name_singular(self) -> &'static str {
        match self {
            Self::Model => "model",
            Self::Dataset => "dataset",
            Self::Space => "space",
        }
    }

    /// Plural noun for log and progress messages.
    fn display_name_plural(self) -> &'static str {
        match self {
            Self::Model => "models",
            Self::Dataset => "datasets",
            Self::Space => "spaces",
        }
    }
}
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
struct ResourceRef {
kind: ResourceKind,
slug: String,
}
impl ResourceRef {
fn new(kind: ResourceKind, slug: String) -> Self {
Self { kind, slug }
}
fn canonical_key(&self) -> String {
format!("{}:{}", self.kind.canonical_prefix(), self.slug.to_lowercase())
}
fn git_url(&self) -> String {
self.kind.git_url(&self.slug)
}
}
/// One element of a Hugging Face listing response.
///
/// The enum is untagged: an element is accepted when it carries an `id`
/// field, or failing that a `modelId` field.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum HuggingFaceItem {
    Id {
        id: String,
    },
    ModelId {
        #[serde(rename = "modelId")]
        model_id: String,
    },
}

impl HuggingFaceItem {
    /// Consume the item and return whichever identifier it carried.
    fn into_identifier(self) -> String {
        match self {
            Self::Id { id } | Self::ModelId { model_id: id } => id,
        }
    }
}
#[derive(Default)]
struct ExcludeSet {
typed: HashSet<String>,
untyped: HashSet<String>,
}
impl ExcludeSet {
fn from_list(values: &[String]) -> Self {
let mut typed = HashSet::new();
let mut untyped = HashSet::new();
for raw in values {
let trimmed = raw.trim();
if trimmed.is_empty() {
continue;
}
if let Some((prefix, rest)) = trimmed.split_once(':') {
match normalize_kind(prefix) {
Some(kind) => {
if let Some(slug) = parse_slug_for_kind(kind, rest) {
typed.insert(format!(
"{}:{}",
kind.canonical_prefix(),
slug.to_lowercase()
));
} else {
warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
}
}
None => warn!("Ignoring invalid Hugging Face exclusion '{raw}' (unknown type)"),
}
} else if let Some(slug) = normalize_untyped_slug(trimmed) {
untyped.insert(slug);
} else {
warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
}
}
Self { typed, untyped }
}
fn should_exclude(&self, kind: ResourceKind, slug: &str) -> bool {
let typed_key = format!("{}:{}", kind.canonical_prefix(), slug.to_lowercase());
if self.typed.contains(&typed_key) {
return true;
}
self.untyped.contains(&slug.to_lowercase())
}
}
/// Map a kind label (`model`, `datasets`, `Space`, ...) to its
/// `ResourceKind`. Surrounding whitespace and ASCII letter case are
/// ignored; unknown labels give `None`.
fn normalize_kind(raw: &str) -> Option<ResourceKind> {
    let label = raw.trim();
    let is_any_of = |names: [&str; 2]| names.iter().any(|name| label.eq_ignore_ascii_case(name));

    if is_any_of(["model", "models"]) {
        Some(ResourceKind::Model)
    } else if is_any_of(["dataset", "datasets"]) {
        Some(ResourceKind::Dataset)
    } else if is_any_of(["space", "spaces"]) {
        Some(ResourceKind::Space)
    } else {
        None
    }
}
/// Turn a slash-separated identifier into a lowercased `owner/name` slug.
///
/// The input is trimmed and split on `/` (empty segments dropped); the
/// segment list is then handed to `normalize_untyped_segments`.
fn normalize_untyped_slug(raw: &str) -> Option<String> {
    let candidate = raw.trim();
    if candidate.is_empty() {
        return None;
    }

    let mut segments: Vec<&str> = Vec::new();
    for segment in candidate.split('/') {
        if !segment.is_empty() {
            segments.push(segment);
        }
    }
    normalize_untyped_segments(&segments)
}
/// Build a lowercased `owner/name` slug from path segments.
///
/// A leading kind segment (`model(s)`, `dataset(s)`, `space(s)`, any ASCII
/// case) is dropped. At least an owner and one name segment must remain;
/// further segments are joined into the name with `/`. Trailing `.git` is
/// removed from the name. Returns `None` when owner or name end up empty.
fn normalize_untyped_segments(segments: &[&str]) -> Option<String> {
    const KIND_SEGMENTS: [&str; 6] =
        ["models", "model", "datasets", "dataset", "spaces", "space"];

    let (head, tail) = segments.split_first()?;
    let head_lowered = head.trim().to_ascii_lowercase();
    let has_kind_segment = KIND_SEGMENTS.iter().any(|kind| *kind == head_lowered.as_str());
    let remaining = if has_kind_segment { tail } else { segments };

    match remaining {
        [owner, name_segments @ ..] if !name_segments.is_empty() => {
            let owner = owner.trim();
            let joined = name_segments.join("/");
            let name = joined.trim_end_matches(".git").trim();
            if owner.is_empty() || name.is_empty() {
                None
            } else {
                Some(format!("{}/{}", owner, name).to_lowercase())
            }
        }
        _ => None,
    }
}
/// Extract the `owner/name` slug for a repository of the given `kind`.
///
/// `raw` may be a slash-separated identifier or a full `http(s)://` URL, in
/// which case the URL path is used. Returns `None` for blank input, an
/// unparsable URL, or a path that `parse_slug_segments` rejects.
fn parse_slug_for_kind(kind: ResourceKind, raw: &str) -> Option<String> {
    let input = raw.trim();
    if input.is_empty() {
        return None;
    }

    let looks_like_url = input.starts_with("http://") || input.starts_with("https://");
    if !looks_like_url {
        let segments: Vec<&str> = input.split('/').filter(|s| !s.is_empty()).collect();
        return parse_slug_segments(kind, &segments);
    }

    let url = Url::parse(input).ok()?;
    let segments: Vec<&str> = match url.path_segments() {
        Some(parts) => parts.filter(|s| !s.is_empty()).collect(),
        None => Vec::new(),
    };
    parse_slug_segments(kind, &segments)
}
/// Build an `owner/name` slug from path segments for a known `kind`.
///
/// A leading segment naming that same kind (singular or plural, any ASCII
/// case) is dropped; a segment naming a different kind is kept and treated
/// as the owner. At least an owner and one name segment must remain;
/// extra segments are joined into the name with `/` and trailing `.git` is
/// removed. Letter case is preserved.
fn parse_slug_segments(kind: ResourceKind, segments: &[&str]) -> Option<String> {
    let kind_aliases: [&str; 2] = match kind {
        ResourceKind::Model => ["models", "model"],
        ResourceKind::Dataset => ["datasets", "dataset"],
        ResourceKind::Space => ["spaces", "space"],
    };

    let (head, tail) = segments.split_first()?;
    let head_lowered = head.trim().to_ascii_lowercase();
    let has_kind_segment = kind_aliases.iter().any(|alias| *alias == head_lowered.as_str());
    let remaining = if has_kind_segment { tail } else { segments };

    match remaining {
        [owner, name_segments @ ..] if !name_segments.is_empty() => {
            let owner = owner.trim();
            let joined = name_segments.join("/");
            let name = joined.trim_end_matches(".git").trim();
            if owner.is_empty() || name.is_empty() {
                None
            } else {
                Some(format!("{owner}/{name}"))
            }
        }
        _ => None,
    }
}
/// Find the `rel="next"` target in an HTTP `Link` header value.
///
/// The header is split on commas; the first entry whose parameters contain
/// `rel="next"` and whose `<...>` target parses as a URL is returned.
fn parse_next_link(value: &str) -> Option<Url> {
    for entry in value.split(',') {
        let Some((target, params)) = entry.trim().split_once('>') else {
            continue;
        };
        if !params.contains("rel=\"next\"") {
            continue;
        }
        let candidate = target.trim_start_matches('<').trim();
        if let Ok(url) = Url::parse(candidate) {
            return Some(url);
        }
    }
    None
}
/// Maximum number of characters of a response body or JSON item that is
/// echoed back in error and warning messages.
const BODY_SNIPPET_LIMIT: usize = 200;
/// Fetch every page of a Hugging Face listing endpoint.
///
/// Starts at `current_url` and keeps following the `rel="next"` target of the
/// response's `Link` header until none is present. `auth` is applied to every
/// request; `context` is only used to label error and warning messages.
///
/// Returns the items of all pages. A non-success status, an unparsable body,
/// or a body that is not a JSON array aborts with an error; individual array
/// elements that do not deserialize are skipped with a warning.
async fn fetch_paginated(
client: &reqwest::Client,
mut current_url: Url,
auth: &AuthConfig,
context: &str,
) -> Result<Vec<HuggingFaceItem>> {
let mut items = Vec::new();
loop {
let mut request =
client.get(current_url.clone()).header("User-Agent", GLOBAL_USER_AGENT.as_str());
request = auth.apply(request);
let response = request.send().await?;
let status = response.status();
// Copy the Link header out now: reading the body below consumes `response`.
let link_header = response
.headers()
.get(LINK)
.and_then(|value| value.to_str().ok())
.map(|value| value.to_string());
if !status.is_success() {
// The response body is included verbatim in the error message.
let body = response.text().await.unwrap_or_default();
let mut message = format!(
"Hugging Face API request failed while enumerating {context} ({status}): {body}"
);
// Only hint at the token variable when no token was configured.
if matches!(status, StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN)
&& !auth.has_token()
{
message.push_str(
"\nProvide a Hugging Face access token via the KF_HUGGINGFACE_TOKEN environment variable.",
);
}
return Err(anyhow!(message));
}
let body = response.bytes().await?;
// Parse into a generic JSON value first so that each array element can be
// deserialized (and, if malformed, skipped) on its own.
let value: Value = serde_json::from_slice(&body).map_err(|err| {
let snippet = body_snippet(&body);
anyhow!(
"Failed to parse Hugging Face response while enumerating {context}: {err}. Body snippet: {snippet}",
context = context,
err = err,
snippet = snippet
)
})?;
let array = value.as_array().ok_or_else(|| {
let snippet = body_snippet(&body);
anyhow!(
"Unexpected Hugging Face response format while enumerating {context} (expected array). Body snippet: {snippet}",
context = context,
snippet = snippet
)
})?;
let mut page = Vec::new();
for (index, element) in array.iter().enumerate() {
match serde_json::from_value::<HuggingFaceItem>(element.clone()) {
Ok(item) => page.push(item),
Err(err) => {
let snippet = value_snippet(element);
warn!(
"Skipping Hugging Face item at index {index} while enumerating {context}: {err}. Item snippet: {snippet}"
);
}
}
}
items.append(&mut page);
// Continue with the next page when the Link header names one; otherwise stop.
if let Some(link_value) = link_header {
if let Some(next_url) = parse_next_link(&link_value) {
current_url = next_url;
continue;
}
}
break;
}
Ok(items)
}
/// Lossily decode a response body and shorten it to `BODY_SNIPPET_LIMIT`
/// characters for use in messages.
fn body_snippet(body: &[u8]) -> String {
    let decoded = String::from_utf8_lossy(body);
    truncate_for_display(&decoded, BODY_SNIPPET_LIMIT)
}
/// Serialize a JSON value and shorten it to `BODY_SNIPPET_LIMIT` characters
/// for use in messages.
fn value_snippet(value: &Value) -> String {
    truncate_for_display(&value.to_string(), BODY_SNIPPET_LIMIT)
}
/// Keep at most `limit` characters of `text`, appending `…` when anything
/// was cut off. Counts Unicode scalar values, not bytes.
fn truncate_for_display(text: &str, limit: usize) -> String {
    let mut remaining = text.chars();
    let mut snippet = String::new();
    snippet.extend(remaining.by_ref().take(limit));
    // Anything still left in the iterator is what got truncated.
    if remaining.next().is_some() {
        snippet.push('…');
    }
    snippet
}
async fn fetch_resources_for_owner(
client: &reqwest::Client,
base_url: &Url,
owner: &str,
label: &str,
auth: &AuthConfig,
progress: Option<&ProgressBar>,
) -> Result<Vec<ResourceRef>> {
let mut resources = Vec::new();
for kind in [ResourceKind::Model, ResourceKind::Dataset, ResourceKind::Space] {
if let Some(pb) = progress {
pb.set_message(format!(
"Enumerating Hugging Face {label} {}",
kind.display_name_plural()
));
}
let mut url = base_url.join(kind.api_path())?;
{
let mut pairs = url.query_pairs_mut();
pairs.append_pair("author", owner);
pairs.append_pair("limit", "100");
}
let context = format!("{} for {label}", kind.display_name_plural());
match fetch_paginated(client, url, auth, &context).await {
Ok(items) => {
for item in items {
let identifier = item.into_identifier();
if let Some(slug) = parse_slug_for_kind(kind, &identifier) {
resources.push(ResourceRef::new(kind, slug));
} else {
warn!(
"Skipping Hugging Face {} with unexpected identifier '{}'",
kind.display_name_singular(),
identifier
);
}
}
}
Err(err) => {
warn!(
"Failed to enumerate Hugging Face {} for {label}: {err}",
kind.display_name_plural()
);
}
}
}
Ok(resources)
}
/// Append the explicitly requested models, datasets, and Spaces (in that
/// order) to `resources`. Identifiers that cannot be parsed are skipped with
/// a warning.
fn append_explicit_resources(specifiers: &RepoSpecifiers, resources: &mut Vec<ResourceRef>) {
    let requested = [
        (ResourceKind::Model, &specifiers.model),
        (ResourceKind::Dataset, &specifiers.dataset),
        (ResourceKind::Space, &specifiers.space),
    ];

    for (kind, identifiers) in requested {
        for raw in identifiers {
            match parse_slug_for_kind(kind, raw) {
                Some(slug) => resources.push(ResourceRef::new(kind, slug)),
                None => warn!(
                    "Ignoring invalid Hugging Face {} identifier '{}'",
                    kind.display_name_singular(),
                    raw
                ),
            }
        }
    }
}
/// Resolve `specifiers` into a sorted, de-duplicated list of git clone URLs.
///
/// Repositories of every listed user and organization are enumerated
/// through the Hugging Face API (30-second request timeout; certificate
/// validation is disabled when `ignore_certs` is set), then the explicitly
/// named models, datasets, and Spaces are added. Exclusions are applied and
/// duplicates that differ only in letter case are collapsed, keeping the
/// first occurrence. A failure for one owner is logged as a warning and
/// does not abort the enumeration. `progress`, when given, receives status
/// messages.
pub async fn enumerate_repo_urls(
    specifiers: &RepoSpecifiers,
    auth: &AuthConfig,
    ignore_certs: bool,
    progress: Option<&mut ProgressBar>,
) -> Result<Vec<String>> {
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(30))
        .danger_accept_invalid_certs(ignore_certs)
        .build()?;
    let base_url = Url::parse("https://huggingface.co/api/")?;
    let excludes = ExcludeSet::from_list(&specifiers.exclude);

    // Only shared access to the bar is needed from here on.
    let bar: Option<&ProgressBar> = progress.as_deref();

    // Users first, then organizations; both go through the same listing code.
    let owners = specifiers
        .user
        .iter()
        .map(|name| ("user", name))
        .chain(specifiers.organization.iter().map(|name| ("organization", name)));

    let mut collected = Vec::new();
    for (role, owner) in owners {
        let label = format!("{role} {owner}");
        if let Some(pb) = bar {
            pb.set_message(format!("Enumerating Hugging Face {label}"));
        }
        match fetch_resources_for_owner(&client, &base_url, owner, &label, auth, bar).await {
            Ok(mut resources) => collected.append(&mut resources),
            Err(err) => warn!("Failed to enumerate Hugging Face {label}: {err}"),
        }
    }

    append_explicit_resources(specifiers, &mut collected);

    let mut seen = HashSet::new();
    let mut urls: Vec<String> = Vec::new();
    for resource in collected {
        if excludes.should_exclude(resource.kind, &resource.slug) {
            debug!(
                "Skipping Hugging Face {} {} due to exclusion",
                resource.kind.display_name_singular(),
                resource.slug
            );
            continue;
        }
        if seen.insert(resource.canonical_key()) {
            urls.push(resource.git_url());
        }
    }

    urls.sort();
    urls.dedup();
    Ok(urls)
}
/// Enumerate the repositories selected by `specifiers` and print one clone
/// URL per line to standard output.
///
/// When `progress_enabled` is set a spinner with a steady 500 ms tick is
/// shown during enumeration; otherwise a hidden progress bar is used. The
/// spinner is cleared as soon as enumeration ends — before any URL is
/// printed and before an enumeration error is returned — so the ticking
/// spinner cannot redraw in the middle of the output or stay behind after
/// a failure.
pub async fn list_repositories(
    specifiers: &RepoSpecifiers,
    auth: &AuthConfig,
    ignore_certs: bool,
    progress_enabled: bool,
) -> Result<()> {
    let mut progress = if progress_enabled {
        let style = ProgressStyle::with_template("{spinner} {msg} [{elapsed_precise}]")
            .expect("progress bar style template should compile");
        let pb = ProgressBar::new_spinner()
            .with_style(style)
            .with_message("Enumerating Hugging Face repositories");
        pb.enable_steady_tick(Duration::from_millis(500));
        pb
    } else {
        ProgressBar::hidden()
    };

    let outcome = enumerate_repo_urls(specifiers, auth, ignore_certs, Some(&mut progress)).await;

    // Clear first, on both the success and the error path.
    progress.finish_and_clear();

    for url in outcome? {
        println!("{url}");
    }
    Ok(())
}
/// Wiki URL for a Hugging Face repository.
///
/// Always returns `None`; the argument is unused.
pub fn wiki_url(_repo_url: &GitUrl) -> Option<GitUrl> {
None
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A bare `owner/name` model identifier comes back unchanged.
    #[test]
    fn parse_model_slug_from_plain() {
        let parsed = parse_slug_for_kind(ResourceKind::Model, "user/model");
        assert_eq!(parsed.as_deref(), Some("user/model"));
    }

    /// A leading `datasets/` segment and a trailing `.git` are both stripped.
    #[test]
    fn parse_dataset_slug_with_prefix() {
        let parsed = parse_slug_for_kind(ResourceKind::Dataset, "datasets/user/data.git");
        assert_eq!(parsed.as_deref(), Some("user/data"));
    }

    /// A full Space URL is reduced to its `owner/name` slug.
    #[test]
    fn parse_space_slug_from_url() {
        let parsed =
            parse_slug_for_kind(ResourceKind::Space, "https://huggingface.co/spaces/user/demo");
        assert_eq!(parsed.as_deref(), Some("user/demo"));
    }

    /// Typed (`model:`) and untyped (path-style) exclusions both match, and
    /// unrelated repositories are left alone.
    #[test]
    fn exclude_set_matches_typed_and_untyped() {
        let raw = ["model:user/model".to_string(), "datasets/user/data".to_string()];
        let excludes = ExcludeSet::from_list(&raw);

        assert!(excludes.should_exclude(ResourceKind::Model, "user/model"));
        assert!(excludes.should_exclude(ResourceKind::Dataset, "user/data"));
        assert!(!excludes.should_exclude(ResourceKind::Space, "user/space"));
    }

    /// The `rel="next"` target is extracted from a `Link` header.
    #[test]
    fn parse_link_header() {
        let header = "<https://huggingface.co/api/models?cursor=abc>; rel=\"next\"";
        let next = parse_next_link(header).expect("next link");
        assert_eq!(next.as_str(), "https://huggingface.co/api/models?cursor=abc");
    }

    /// An ellipsis is appended only when text was actually cut.
    #[test]
    fn truncate_for_display_adds_ellipsis() {
        let cut = truncate_for_display("abcdef", 3);
        let untouched = truncate_for_display("abc", 5);
        assert_eq!(cut, "abc…");
        assert_eq!(untouched, "abc");
    }
}

View file

@ -13,6 +13,7 @@ pub mod defaults;
pub mod entropy;
pub mod finding_data;
pub mod findings_store;
pub mod gcs;
pub mod git_binary;
pub mod git_commit_metadata;
pub mod git_metadata_graph;
@ -21,6 +22,7 @@ pub mod git_url;
pub mod gitea;
pub mod github;
pub mod gitlab;
pub mod huggingface;
pub mod inline_ignore;
pub mod jira;
pub mod liquid_filters;

View file

@ -52,7 +52,7 @@ use kingfisher::{
},
findings_store,
findings_store::FindingsStore,
gitea, github,
gitea, github, huggingface,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::{load_and_record_rules, run_scan},
@ -75,6 +75,7 @@ use crate::cli::commands::{
bitbucket::{BitbucketAuthArgs, BitbucketCommand, BitbucketRepoType, BitbucketReposCommand},
gitea::{GiteaCommand, GiteaRepoType, GiteaReposCommand},
gitlab::{GitLabCommand, GitLabRepoType, GitLabReposCommand},
huggingface::{HuggingFaceCommand, HuggingFaceReposCommand},
};
fn main() -> anyhow::Result<()> {
@ -93,6 +94,7 @@ fn main() -> anyhow::Result<()> {
Command::Bitbucket(_) => num_cpus::get(), // Default for Bitbucket commands
Command::Gitea(_) => num_cpus::get(), // Default for Gitea commands
Command::Azure(_) => num_cpus::get(), // Default for Azure commands
Command::HuggingFace(_) => num_cpus::get(), // Default for Hugging Face commands
Command::Rules(_) => num_cpus::get(), // Default for Rules commands
};
@ -327,6 +329,28 @@ async fn async_main(args: CommandLineArgs) -> Result<()> {
}
},
},
Command::HuggingFace(hf_args) => match hf_args.command {
HuggingFaceCommand::Repos(repos_command) => match repos_command {
HuggingFaceReposCommand::List(list_args) => {
let specifiers = huggingface::RepoSpecifiers {
user: list_args.repo_specifiers.user.clone(),
organization: list_args.repo_specifiers.organization.clone(),
model: list_args.repo_specifiers.model.clone(),
dataset: list_args.repo_specifiers.dataset.clone(),
space: list_args.repo_specifiers.space.clone(),
exclude: list_args.repo_specifiers.exclude.clone(),
};
let auth = huggingface::AuthConfig::from_env();
huggingface::list_repositories(
&specifiers,
&auth,
global_args.ignore_certs,
global_args.use_progress(),
)
.await?;
}
},
},
Command::SelfUpdate => {
anyhow::bail!("SelfUpdate command should not reach this branch")
}
@ -367,6 +391,13 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -400,6 +431,9 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Slack query
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
@ -436,6 +470,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_base64: false,
no_inline_ignore: false,

View file

@ -703,7 +703,7 @@ mod tests {
git_commit_metadata::CommitMetadata,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{SerializableCapture, SerializableCaptures},
origin::OriginSet,
origin::{Origin, OriginSet},
rules::rule::{Confidence, Rule, RuleSyntax},
};
use gix::{date::Time, ObjectId};
@ -711,79 +711,8 @@ mod tests {
use std::path::PathBuf;
use tempfile::tempdir;
#[test]
fn build_finding_record_uses_git_blob_path() {
let temp = tempdir().unwrap();
let datastore =
Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
let repo_path = Arc::new(PathBuf::from("/tmp/repo"));
let commit_metadata = Arc::new(CommitMetadata {
commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
committer_name: "Alice".into(),
committer_email: "alice@example.com".into(),
committer_timestamp: Time::new(0, 0),
});
let blob_path = "path/in/history.txt".to_string();
let origin = OriginSet::new(
Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()),
vec![],
);
let rule = Arc::new(Rule::new(RuleSyntax {
name: "Test Rule".into(),
id: "test.rule".into(),
pattern: ".*".into(),
min_entropy: 0.0,
confidence: Confidence::Medium,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
depends_on_rule: vec![],
}));
let blob_id = BlobId::new(b"blob-data");
let report_match = ReportMatch {
origin,
blob_metadata: BlobMetadata {
id: blob_id,
num_bytes: 42,
mime_essence: None,
language: Some("Unknown".into()),
},
m: Match {
location: Location {
offset_span: OffsetSpan { start: 0, end: 10 },
source_span: SourceSpan {
start: SourcePoint { line: 19, column: 0 },
end: SourcePoint { line: 19, column: 10 },
},
},
groups: SerializableCaptures {
captures: SmallVec::<[SerializableCapture; 2]>::new(),
},
blob_id,
finding_fingerprint: 123,
rule: Arc::clone(&rule),
validation_response_body: "Bad credentials".into(),
validation_response_status: 401,
validation_success: false,
calculated_entropy: 5.29,
visible: true,
is_base64: false,
},
comment: None,
match_confidence: Confidence::Medium,
visible: true,
validation_response_body: "Bad credentials".into(),
validation_response_status: 401,
validation_success: false,
};
let scan_args = ScanArgs {
fn sample_scan_args() -> ScanArgs {
ScanArgs {
num_jobs: 1,
rules: RuleSpecifierArgs::default(),
input_specifier_args: InputSpecifierArgs {
@ -802,6 +731,12 @@ mod tests {
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -833,6 +768,9 @@ mod tests {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@ -864,9 +802,98 @@ mod tests {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_inline_ignore: false,
}
}
// Test fixture: builds a `ReportMatch` whose origin is a git blob at
// `path/in/history.txt` in `/tmp/repo`, with the validation body, status and
// success flag supplied by the caller. The same three validation values are
// written both to the inner `Match` and to the outer `ReportMatch`.
// Returns the match together with the blob path used for the origin.
fn sample_report_match(
validation_body: &str,
validation_status: u16,
validation_success: bool,
) -> (ReportMatch, String) {
let repo_path = Arc::new(PathBuf::from("/tmp/repo"));
let commit_metadata = Arc::new(CommitMetadata {
commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
committer_name: "Alice".into(),
committer_email: "alice@example.com".into(),
committer_timestamp: Time::new(0, 0),
});
let blob_path = "path/in/history.txt".to_string();
let origin = OriginSet::new(
Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()),
vec![],
);
// Catch-all rule with no validation block.
let rule = Arc::new(Rule::new(RuleSyntax {
name: "Test Rule".into(),
id: "test.rule".into(),
pattern: ".*".into(),
min_entropy: 0.0,
confidence: Confidence::Medium,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
depends_on_rule: vec![],
}));
let blob_id = BlobId::new(b"blob-data");
// Owned copy so the body can be stored in both the inner and outer records.
let validation_body_owned = validation_body.to_string();
let report_match = ReportMatch {
origin,
blob_metadata: BlobMetadata {
id: blob_id,
num_bytes: 42,
mime_essence: None,
language: Some("Unknown".into()),
},
m: Match {
location: Location {
offset_span: OffsetSpan { start: 0, end: 10 },
source_span: SourceSpan {
start: SourcePoint { line: 19, column: 0 },
end: SourcePoint { line: 19, column: 10 },
},
},
groups: SerializableCaptures {
captures: SmallVec::<[SerializableCapture; 2]>::new(),
},
blob_id,
finding_fingerprint: 123,
rule: Arc::clone(&rule),
validation_response_body: validation_body_owned.clone(),
validation_response_status: validation_status,
validation_success,
calculated_entropy: 5.29,
visible: true,
is_base64: false,
},
comment: None,
match_confidence: Confidence::Medium,
visible: true,
validation_response_body: validation_body_owned,
validation_response_status: validation_status,
validation_success,
};
(report_match, blob_path)
}
#[test]
fn build_finding_record_uses_git_blob_path() {
let temp = tempdir().unwrap();
let datastore =
Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
let (report_match, blob_path) =
sample_report_match("Bad credentials", StatusCode::UNAUTHORIZED.as_u16(), false);
let scan_args = sample_scan_args();
let record = reporter.build_finding_record(&report_match, &scan_args);
assert_eq!(record.finding.path, blob_path);
let git_file_path = record
@ -880,6 +907,28 @@ mod tests {
assert_eq!(git_file_path, "path/in/history.txt");
}
// A match carrying the skip-list message and a 100 Continue status must be
// reported with status "Not Attempted" while still exposing the message as the
// validation response.
#[test]
fn skip_list_matches_surface_skip_reason() {
let temp = tempdir().unwrap();
let datastore =
Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
let (report_match, _) = sample_report_match(
"(skip list entry) AWS validation not attempted for account 111122223333.",
StatusCode::CONTINUE.as_u16(),
false,
);
let scan_args = sample_scan_args();
let record = reporter.build_finding_record(&report_match, &scan_args);
assert_eq!(record.finding.validation.status, "Not Attempted");
assert_eq!(
record.finding.validation.response,
"(skip list entry) AWS validation not attempted for account 111122223333."
);
}
use super::build_git_urls;
#[test]

View file

@ -93,6 +93,14 @@ mod tests {
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
// Hugging Face
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
// Gitea
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
@ -132,6 +140,9 @@ mod tests {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
// clone / history options
@ -163,6 +174,8 @@ mod tests {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
}

View file

@ -115,7 +115,7 @@ impl<'a> Display for PrettyFindingRecord<'a> {
} else {
writeln!(f, " |Validation....: {}", finding.validation.status)?;
}
if finding.validation.status != "Not Attempted" {
if !finding.validation.response.is_empty() {
writeln!(f, " |__Response....: {}", style_fn(&finding.validation.response))?;
}
writeln!(f, " |Language......: {}", finding.language)?;

View file

@ -3,7 +3,7 @@ pub(crate) use docker::save_docker_images;
pub(crate) use enumerate::enumerate_filesystem_inputs;
pub(crate) use repos::{
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
enumerate_github_repos,
enumerate_github_repos, enumerate_huggingface_repos,
};
pub use runner::{load_and_record_rules, run_async_scan, run_scan};
pub(crate) use validation::run_secret_validation;

View file

@ -17,10 +17,10 @@ use crate::{
commands::{github::GitCloneMode, github::GitHistoryMode, scan},
global,
},
confluence, findings_store,
confluence, findings_store, gcs,
git_binary::{CloneMode, Git},
git_url::GitUrl,
gitea, github, gitlab, jira,
gitea, github, gitlab, huggingface, jira,
matcher::{Match, Matcher, MatcherStats},
origin::{Origin, OriginSet},
rules_database::RulesDatabase,
@ -305,6 +305,69 @@ pub async fn enumerate_gitea_repos(
Ok(repo_urls)
}
/// Build the list of Git URLs to scan from the explicit `git_url` inputs plus
/// any Hugging Face users, organizations, models, datasets and spaces named in
/// the scan arguments.
///
/// Enumeration is only attempted when at least one Hugging Face specifier is
/// set. Enumerated strings that fail to parse as a `GitUrl` are logged and
/// dropped. The returned list is sorted and de-duplicated.
///
/// # Errors
/// Fails when `huggingface::enumerate_repo_urls` fails.
pub async fn enumerate_huggingface_repos(
    args: &scan::ScanArgs,
    global_args: &global::GlobalArgs,
) -> Result<Vec<GitUrl>> {
    let inputs = &args.input_specifier_args;
    let repo_specifiers = huggingface::RepoSpecifiers {
        user: inputs.huggingface_user.clone(),
        organization: inputs.huggingface_organization.clone(),
        model: inputs.huggingface_model.clone(),
        dataset: inputs.huggingface_dataset.clone(),
        space: inputs.huggingface_space.clone(),
        exclude: inputs.huggingface_exclude.clone(),
    };

    // Explicitly supplied Git URLs are always part of the result.
    let mut repo_urls = inputs.git_url.clone();

    if !repo_specifiers.is_empty() {
        let mut progress = if global_args.use_progress() {
            let spinner = ProgressBar::new_spinner()
                .with_style(
                    ProgressStyle::with_template("{spinner} {msg} {human_len} [{elapsed_precise}]")
                        .expect("progress bar style template should compile"),
                )
                .with_message("Enumerating Hugging Face repositories...");
            spinner.enable_steady_tick(Duration::from_millis(500));
            spinner
        } else {
            ProgressBar::hidden()
        };

        let auth = huggingface::AuthConfig::from_env();
        let repo_strings = huggingface::enumerate_repo_urls(
            &repo_specifiers,
            &auth,
            global_args.ignore_certs,
            Some(&mut progress),
        )
        .await
        .context("Failed to enumerate Hugging Face repositories")?;

        // Only successfully parsed URLs are pushed, so the growth of the list
        // is the number of repositories found.
        let already_present = repo_urls.len();
        for repo_string in repo_strings {
            match GitUrl::from_str(&repo_string) {
                Ok(repo_url) => repo_urls.push(repo_url),
                Err(e) => progress.suspend(|| {
                    error!("Failed to parse repo URL from {repo_string}: {e}");
                }),
            }
        }
        let num_found = (repo_urls.len() - already_present) as u64;

        progress.finish_with_message(format!(
            "Found {} repositories from Hugging Face",
            HumanCount(num_found)
        ));
    }

    repo_urls.sort();
    repo_urls.dedup();
    Ok(repo_urls)
}
pub async fn enumerate_bitbucket_repos(
args: &scan::ScanArgs,
global_args: &global::GlobalArgs,
@ -678,3 +741,81 @@ pub async fn fetch_s3_objects(
Ok(())
}
/// Scan the objects of the GCS bucket named in the scan arguments and record
/// any matches in `datastore`.
///
/// Returns `Ok(())` immediately when no `gcs_bucket` is configured. Otherwise a
/// `Matcher` is built from `rules_db`, and `gcs::visit_bucket_objects` is
/// called with a callback that scans each object's bytes and records the
/// resulting matches under a `gs://<bucket>/<key>` origin path.
///
/// # Errors
/// Fails when the matcher cannot be constructed, when the per-object callback
/// fails, or when `gcs::visit_bucket_objects` itself fails.
pub async fn fetch_gcs_objects(
args: &scan::ScanArgs,
datastore: &Arc<Mutex<findings_store::FindingsStore>>,
rules_db: &RulesDatabase,
matcher_stats: &Mutex<MatcherStats>,
enable_profiling: bool,
shared_profiler: Arc<crate::rule_profiling::ConcurrentRuleProfiler>,
progress_enabled: bool,
) -> Result<()> {
// Nothing to do unless a bucket was requested.
let Some(bucket) = args.input_specifier_args.gcs_bucket.as_deref() else {
return Ok(());
};
let prefix = args.input_specifier_args.gcs_prefix.as_deref();
let service_account = args.input_specifier_args.gcs_service_account.as_deref();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
let seen_blobs = BlobIdMap::new();
let matcher = Matcher::new(
rules_db,
scanner_pool,
&seen_blobs,
Some(matcher_stats),
enable_profiling,
Some(shared_profiler.clone()),
&args.extra_ignore_comments,
args.no_inline_ignore,
)?;
let mut processor = BlobProcessor { matcher };
// Spinner showing the number of objects processed; hidden when progress is disabled.
let progress = if progress_enabled {
let style =
ProgressStyle::with_template("{spinner} {msg} ({pos} objects) [{elapsed_precise}]")
.expect("progress bar style template should compile");
let pb = ProgressBar::new_spinner().with_style(style).with_message("Fetching GCS objects");
pb.enable_steady_tick(Duration::from_millis(500));
pb
} else {
ProgressBar::hidden()
};
// The callback is a `move` closure, so it gets its own progress handle and an
// owned copy of the bucket name.
let pb = progress.clone();
let bucket_name = bucket.to_string();
gcs::visit_bucket_objects(bucket, prefix, service_account, move |key, bytes| {
let origin = OriginSet::new(
Origin::from_extended(serde_json::json!({
"path": format!("gs://{}/{}", bucket_name, key)
})),
Vec::new(),
);
let blob = crate::blob::Blob::from_bytes(bytes);
// `processor.run` yields `None` when there is nothing to record for this object.
if let Some((origin, blob_md, scored_matches)) =
processor.run(origin, blob, args.no_dedup, args.redact, args.no_base64)?
{
// Origin and blob metadata are shared across all matches of this object.
let origin_arc = Arc::new(origin);
let blob_arc = Arc::new(blob_md);
let mut batch = Vec::with_capacity(scored_matches.len());
for (_score, m) in scored_matches {
batch.push((origin_arc.clone(), blob_arc.clone(), m));
}
let added = datastore.lock().unwrap().record(batch, !args.no_dedup);
debug!("Added {} new GCS blobs", added);
}
// Count every visited object, whether or not it produced matches.
pb.inc(1);
Ok(())
})
.await?;
let total = progress.position();
progress.finish_with_message(format!("Fetched {} GCS objects", total));
Ok(())
}

View file

@ -1,4 +1,7 @@
use std::sync::{Arc, Mutex};
use std::{
fs,
sync::{Arc, Mutex},
};
use anyhow::{bail, Context, Result};
use crossbeam_skiplist::SkipMap;
@ -21,10 +24,11 @@ use crate::{
safe_list,
scanner::{
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
enumerate_filesystem_inputs, enumerate_github_repos,
enumerate_filesystem_inputs, enumerate_github_repos, enumerate_huggingface_repos,
repos::{
enumerate_gitea_repos, enumerate_gitlab_repos, fetch_confluence_pages,
fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects, fetch_slack_messages,
fetch_gcs_objects, fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects,
fetch_slack_messages,
},
run_secret_validation, save_docker_images,
summary::print_scan_summary,
@ -74,12 +78,14 @@ pub async fn run_async_scan(
let mut repo_urls = enumerate_github_repos(args, global_args).await?;
let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
// Combine repository URLs
repo_urls.extend(gitlab_repo_urls);
repo_urls.extend(gitea_repo_urls);
repo_urls.extend(huggingface_repo_urls);
repo_urls.extend(bitbucket_repo_urls);
repo_urls.extend(azure_repo_urls);
repo_urls.sort();
@ -182,8 +188,20 @@ pub async fn run_async_scan(
)
.await?;
let has_s3 = args.input_specifier_args.s3_bucket.is_some();
if input_roots.is_empty() && !has_s3 {
fetch_gcs_objects(
args,
&datastore,
rules_db,
&matcher_stats,
enable_profiling,
Arc::clone(&shared_profiler),
progress_enabled,
)
.await?;
let has_remote_objects = args.input_specifier_args.s3_bucket.is_some()
|| args.input_specifier_args.gcs_bucket.is_some();
if input_roots.is_empty() && !has_remote_objects {
bail!("No inputs to scan");
}
@ -231,6 +249,26 @@ pub async fn run_async_scan(
crate::baseline::apply_baseline(&mut ds, &path, args.manage_baseline, &input_roots)?;
}
let mut skip_aws_accounts = args.skip_aws_account.clone();
if let Some(path) = args.skip_aws_account_file.as_ref() {
let contents = fs::read_to_string(path).with_context(|| {
format!("Failed to read --skip-aws-account-file {}", path.display())
})?;
for line in contents.lines() {
let content = line.split('#').next().unwrap_or("");
for value in content.split(|c: char| c.is_ascii_whitespace() || c == ',' || c == ';') {
let trimmed = value.trim();
if !trimmed.is_empty() {
skip_aws_accounts.push(trimmed.to_string());
}
}
}
}
crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
// If validation is enabled, run it as a second phase
if !args.no_validate {
info!("Starting secret validation phase...");

View file

@ -95,6 +95,14 @@ pub fn init_validation_caches() {
aws::set_aws_validation_concurrency(15);
}
/// Configure the AWS account IDs for which validation is skipped.
///
/// Forwards `ids` unchanged to `aws::set_aws_skip_account_ids`.
pub fn set_skip_aws_account_ids<I, S>(ids: I)
where
I: IntoIterator<Item = S>,
S: Into<String>,
{
aws::set_aws_skip_account_ids(ids);
}
#[derive(Clone)]
pub struct CachedResponse {
pub body: String,
@ -781,6 +789,26 @@ async fn timed_validate_single_match<'a>(
}
}
if let Some(account_id) = aws::should_skip_aws_validation(&akid) {
m.validation_success = false;
m.validation_response_body = format!(
"(skip list entry) AWS validation not attempted for account {}.",
account_id
);
m.validation_response_status = StatusCode::CONTINUE;
cache.insert(
cache_key,
CachedResponse {
body: m.validation_response_body.clone(),
status: m.validation_response_status,
is_valid: m.validation_success,
timestamp: Instant::now(),
},
);
commit_and_return(m);
return;
}
if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
m.validation_success = false;
m.validation_response_body = format!("Invalid AWS credentials ({}): {}", akid, e);

View file

@ -1,7 +1,7 @@
use std::time::Duration;
use std::{collections::HashSet, sync::RwLock, time::Duration};
use anyhow::{anyhow, Result};
use aws_config::{retry::RetryConfig, BehaviorVersion};
use aws_config::{retry::RetryConfig, BehaviorVersion, SdkConfig};
use aws_credential_types::Credentials;
use aws_sdk_sts::{
config::Builder as StsConfigBuilder, error::SdkError,
@ -26,8 +26,9 @@ use http::{
header::{HeaderValue, USER_AGENT},
StatusCode,
};
use once_cell::sync::OnceCell;
use once_cell::sync::{Lazy, OnceCell};
use rand::{rng, Rng};
use regex::Regex;
use tokio::{
sync::Semaphore,
time::{sleep, timeout},
@ -36,6 +37,57 @@ use tokio::{
use crate::validation::GLOBAL_USER_AGENT;
static AWS_VALIDATION_SEMAPHORE: OnceCell<Semaphore> = OnceCell::new();
/// AWS account IDs that are always part of the validation skip list.
/// NOTE(review): the unit tests in this file call these "canary" accounts —
/// confirm the provenance of the list before editing it.
const BUILTIN_SKIP_ACCOUNT_IDS: &[&str] = &[
"052310077262",
"171436882533",
"528757803018",
"534261010715",
"595918472158",
"729780141977",
"893192397702",
"992382622183",
];
/// Skip list shared through a `RwLock`, seeded with `BUILTIN_SKIP_ACCOUNT_IDS`
/// when first accessed.
static AWS_SKIP_ACCOUNT_IDS: Lazy<RwLock<HashSet<String>>> = Lazy::new(|| {
let mut set = HashSet::new();
set.extend(BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string()));
RwLock::new(set)
});
/// Build the HTTP client used for AWS SDK calls.
///
/// Each connector uses the Rustls TLS provider in `AwsLc` crypto mode, inherits
/// the connector settings and sleep implementation handed in by the SDK, and
/// takes its proxy configuration from the environment.
fn build_http_client() -> SharedHttpClient {
    HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
        let tls_provider = tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc);
        let mut connector = ConnectorBuilder::default().tls_provider(tls_provider);

        connector.set_connector_settings(settings.cloned());
        if let Some(components) = runtime_components {
            connector.set_sleep_impl(components.sleep_impl());
        }
        // Respect proxy settings from the environment.
        connector.set_proxy_config(Some(ProxyConfig::from_env()));

        connector.build()
    })
}
/// Load an `SdkConfig` for the given static credentials.
///
/// The configuration is pinned to region `us-east-1`, uses the client from
/// `build_http_client`, and retries adaptively with at most three attempts.
async fn build_base_config(credentials: Credentials) -> SdkConfig {
    let loader = aws_config::defaults(BehaviorVersion::latest())
        .region(Region::new("us-east-1"))
        .credentials_provider(credentials)
        .http_client(build_http_client())
        .retry_config(RetryConfig::adaptive().with_max_attempts(3));

    loader.load().await
}
/// Extract a 12-digit AWS account ID from `input`.
///
/// Accepts a bare account ID (surrounding whitespace is ignored) or any string
/// that embeds one, such as an ARN. The ID must be a run of *exactly* twelve
/// ASCII digits: longer digit runs are not truncated to their first twelve
/// digits, and non-ASCII digits are never treated as part of an ID.
///
/// Returns the first such run, or `None` when there is none.
fn extract_account_id(input: &str) -> Option<String> {
    input
        .trim()
        // Splitting on every non-digit leaves the maximal ASCII-digit runs.
        .split(|c: char| !c.is_ascii_digit())
        .find(|run| run.len() == 12)
        .map(str::to_string)
}
/// Set the maximum number of concurrent AWS validations. Call before first use.
pub fn set_aws_validation_concurrency(max: usize) {
@ -46,6 +98,46 @@ fn aws_validation_semaphore() -> &'static Semaphore {
AWS_VALIDATION_SEMAPHORE.get_or_init(|| Semaphore::new(15))
}
/// Replace the AWS validation skip list with the built-in account IDs plus `ids`.
///
/// Each entry is normalized through `extract_account_id`. Blank entries are
/// ignored silently; entries without a recognizable account ID are ignored
/// with a warning. A poisoned lock is recovered rather than propagated.
pub fn set_aws_skip_account_ids<I, S>(ids: I)
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    let mut skip_set =
        AWS_SKIP_ACCOUNT_IDS.write().unwrap_or_else(|poisoned| poisoned.into_inner());

    // Start again from the built-in entries so earlier calls do not accumulate.
    skip_set.clear();
    skip_set.extend(BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string()));

    let candidates = ids
        .into_iter()
        .map(|raw| -> String { raw.into() })
        .filter(|candidate| !candidate.trim().is_empty());

    for value in candidates {
        match extract_account_id(&value) {
            Some(normalized) => {
                skip_set.insert(normalized);
            }
            None => tracing::warn!("Ignoring invalid AWS account ID in skip list: {value}"),
        }
    }
}
/// Decide whether validation should be skipped for `access_key_id`.
///
/// The account number is derived from the key with `aws_key_to_account_number`
/// and looked up in the skip list.
///
/// Returns `Some(account_id)` when the account is on the skip list, and `None`
/// when it is not, when the list is empty, or when no account number can be
/// derived from the key.
pub fn should_skip_aws_validation(access_key_id: &str) -> Option<String> {
    // Recover from a poisoned lock the same way `set_aws_skip_account_ids`
    // does; treating poison as "nothing to skip" would silently disable the
    // skip list.
    let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap_or_else(|poisoned| poisoned.into_inner());
    if guard.is_empty() {
        return None;
    }

    let account = aws_key_to_account_number(access_key_id).ok()?;
    guard.contains(&account).then_some(account)
}
#[derive(Debug)]
struct UaInterceptor;
@ -132,29 +224,7 @@ pub async fn validate_aws_credentials(
None, // expiry
"static", // provider name
);
// Create HTTP client that respects proxy settings from the environment
let http_client: SharedHttpClient =
HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
let mut conn_builder = ConnectorBuilder::default()
.tls_provider(tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc));
conn_builder.set_connector_settings(settings.cloned());
if let Some(components) = runtime_components {
conn_builder.set_sleep_impl(components.sleep_impl());
}
conn_builder.set_proxy_config(Some(ProxyConfig::from_env()));
conn_builder.build()
});
// Create AWS config with adaptive retries
let retry_config = RetryConfig::adaptive().with_max_attempts(3);
let config = aws_config::defaults(BehaviorVersion::latest())
.region(Region::new("us-east-1"))
.credentials_provider(credentials)
.http_client(http_client)
.retry_config(retry_config)
.load()
.await;
let config = build_base_config(credentials).await;
// Create STS client
let sts_config = StsConfigBuilder::from(&config).interceptor(UaInterceptor).build();
@ -230,3 +300,74 @@ pub fn aws_key_to_account_number(aws_key_id: &str) -> Result<String, Box<dyn std
// Return the account number formatted as a 12-digit string
Ok(format!("{:012}", account_num))
}
// Unit tests for the AWS validation skip list.
#[cfg(test)]
mod tests {
use super::*;
use once_cell::sync::Lazy;
use std::sync::Mutex;
// The skip list is a process-wide static, so tests that mutate it take this
// guard to avoid running concurrently with each other.
static TEST_GUARD: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
// Whitespace-padded IDs and ARNs are normalized; "invalid" is dropped. Both
// supplied IDs are already built in, so the set size stays at the built-in count.
#[test]
fn skip_account_list_normalizes_inputs() {
let _lock = TEST_GUARD.lock().unwrap();
set_aws_skip_account_ids([
" 052310077262 ",
"arn:aws:iam::171436882533:role/demo",
"invalid",
]);
let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
assert!(guard.contains("052310077262"));
assert!(guard.contains("171436882533"));
assert_eq!(guard.len(), BUILTIN_SKIP_ACCOUNT_IDS.len());
drop(guard);
// Reset to the built-in list for the next test.
set_aws_skip_account_ids(Vec::<String>::new());
}
// An access key that decodes to a listed account is reported as skippable.
#[test]
fn should_skip_when_account_matches() {
let _lock = TEST_GUARD.lock().unwrap();
set_aws_skip_account_ids(["534261010715"]);
assert_eq!(
should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
Some("534261010715".to_string())
);
set_aws_skip_account_ids(Vec::<String>::new());
}
// With no user-supplied IDs, the built-in entries alone still match.
#[test]
fn builtin_canary_accounts_are_preseeded() {
let _lock = TEST_GUARD.lock().unwrap();
set_aws_skip_account_ids(Vec::<String>::new());
assert_eq!(
should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
Some("534261010715".to_string())
);
set_aws_skip_account_ids(Vec::<String>::new());
}
// The same account given in three spellings is stored exactly once.
#[test]
fn duplicate_accounts_are_deduplicated() {
let _lock = TEST_GUARD.lock().unwrap();
set_aws_skip_account_ids([
"534261010715",
"arn:aws:iam::534261010715:user/canarytokens",
" 534261010715 ",
]);
let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
assert_eq!(guard.iter().filter(|id| id.as_str() == "534261010715").count(), 1);
drop(guard);
set_aws_skip_account_ids(Vec::<String>::new());
}
}

View file

@ -33,31 +33,77 @@ pub fn find_closest_variable(
target_variable_name: &str,
search_variable_name: &str,
) -> Option<String> {
// Find positions of the target variable with the target value
// Collect the positions of the target variable for the provided value so we can
// compare relative offsets with candidate variables.
let mut target_positions = Vec::new();
for (name, value, start, end) in captures {
if name == target_variable_name && value == target_value {
target_positions.push((*start, *end));
}
}
if target_positions.is_empty() {
return None;
}
// For each target position, find the closest search variable
let mut closest_distance = usize::MAX;
let mut closest_value: Option<String> = None;
for (_target_start, target_end) in target_positions {
for (name, value, start, _) in captures {
if name == search_variable_name {
let distance = (*start as isize - target_end as isize).abs() as usize;
if distance < closest_distance {
closest_distance = distance;
closest_value = Some(value.clone());
// Prefer candidates that appear before the target value (same logical block), but
// fall back to overlapping values and then to those that appear after the target
// value when no better match exists. This avoids pairing with the next block when
// multiple credentials are close together in the same file.
let mut best_before: Option<(usize, String)> = None;
let mut best_overlap: Option<(usize, String)> = None;
let mut best_after: Option<(usize, String)> = None;
for (target_start, target_end) in target_positions.iter().copied() {
for (name, value, start, end) in captures {
if name != search_variable_name {
continue;
}
if *end <= target_start {
// Candidate is before the target; choose the one closest to the target start.
let distance = target_start - *end;
match &mut best_before {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_before = Some((distance, value.clone()));
}
_ => {}
}
} else if *start >= target_end {
// Candidate is after the target; choose the one closest to the target end.
let distance = *start - target_end;
match &mut best_after {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_after = Some((distance, value.clone()));
}
_ => {}
}
} else {
// Candidate overlaps the target; treat it as an exact match.
let distance = 0usize;
match &mut best_overlap {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_overlap = Some((distance, value.clone()));
}
_ => {}
}
}
}
}
closest_value
best_before.or(best_overlap).or(best_after).map(|(_, value)| value)
}
pub async fn check_url_resolvable(url: &Url) -> Result<(), Box<dyn std::error::Error>> {
@ -147,4 +193,31 @@ mod tests {
]
);
}
// With one AKID capture ending before the TOKEN capture and one starting after
// it, the preceding AKID value is the one returned.
#[test]
fn prefers_closest_preceding_variable() {
let captures = vec![
("TOKEN".to_string(), "secret".to_string(), 75usize, 115usize),
("AKID".to_string(), "preceding".to_string(), 30usize, 50usize),
("AKID".to_string(), "following".to_string(), 180usize, 200usize),
];
let result =
find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap();
assert_eq!(result, "preceding".to_string());
}
// When the only AKID capture starts after the TOKEN capture, that following
// value is returned.
#[test]
fn falls_back_to_following_when_no_preceding() {
let captures = vec![
("TOKEN".to_string(), "secret".to_string(), 10usize, 50usize),
("AKID".to_string(), "after".to_string(), 60usize, 80usize),
];
let result =
find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap();
assert_eq!(result, "after".to_string());
}
}

View file

@ -72,6 +72,12 @@ fn run_skiplist(skip_regex: Vec<String>, skip_skipword: Vec<String>) -> Result<u
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -103,6 +109,9 @@ fn run_skiplist(skip_regex: Vec<String>, skip_skipword: Vec<String>) -> Result<u
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@ -133,6 +142,8 @@ fn run_skiplist(skip_regex: Vec<String>, skip_skipword: Vec<String>) -> Result<u
manage_baseline: false,
skip_regex: skip_regex,
skip_word: skip_skipword,
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};

View file

@ -68,6 +68,13 @@ fn test_bitbucket_remote_scan() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -102,6 +109,9 @@ fn test_bitbucket_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@ -131,6 +141,8 @@ fn test_bitbucket_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -85,6 +85,13 @@ rules:
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -120,6 +127,9 @@ rules:
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@ -151,6 +161,8 @@ rules:
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -72,6 +72,13 @@ fn test_github_remote_scan() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -107,6 +114,9 @@ fn test_github_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@ -138,6 +148,8 @@ fn test_github_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -71,6 +71,13 @@ fn test_gitlab_remote_scan() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -106,6 +113,9 @@ fn test_gitlab_remote_scan() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
@ -137,6 +147,8 @@ fn test_gitlab_remote_scan() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};
@ -210,6 +222,13 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -252,6 +271,9 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
scan_nested_repos: true,
since_commit: None,
branch: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
@ -273,6 +295,8 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -55,6 +55,12 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -86,6 +92,9 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@ -115,6 +124,8 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -62,6 +62,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -92,6 +99,9 @@ impl TestContext {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
max_results: 10,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
@ -123,6 +133,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};
@ -186,6 +198,13 @@ async fn test_scan_slack_messages() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -218,6 +237,9 @@ async fn test_scan_slack_messages() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@ -247,6 +269,8 @@ async fn test_scan_slack_messages() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -128,6 +128,13 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -163,6 +170,9 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@ -194,6 +204,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,

View file

@ -71,6 +71,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -106,6 +113,9 @@ impl TestContext {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
@ -137,6 +147,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
extra_ignore_comments: Vec::new(),
no_inline_ignore: false,
@ -183,6 +195,13 @@ impl TestContext {
gitlab_repo_type: GitLabRepoType::Owner,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -228,6 +247,10 @@ impl TestContext {
scan_nested_repos: true,
since_commit: None,
branch: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
},
extra_ignore_comments: Vec::new(),
content_filtering_args: ContentFilteringArgs {
@ -250,6 +273,8 @@ impl TestContext {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
};