forked from mirrors/kingfisher
commit
fecf858bfd
36 changed files with 2023 additions and 192 deletions
7
.github/workflows/ci.yml
vendored
7
.github/workflows/ci.yml
vendored
|
|
@ -5,6 +5,9 @@ on:
|
|||
branches:
|
||||
- main
|
||||
|
||||
env:
|
||||
RUST_TOOLCHAIN: "1.90"
|
||||
|
||||
# This workflow runs on pull requests to the main branch
|
||||
# It builds the project for 2 platforms, Linux arm64 and macOS arm64,
|
||||
# and runs tests for each platform. All platforms tested on merge to main
|
||||
|
|
@ -16,7 +19,7 @@ jobs:
|
|||
- uses: actions/checkout@v4
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: 1.88.0
|
||||
toolchain: ${{ env.RUST_TOOLCHAIN }}
|
||||
profile: minimal
|
||||
override: true
|
||||
- uses: swatinem/rust-cache@v2
|
||||
|
|
@ -34,7 +37,7 @@ jobs:
|
|||
- uses: actions/checkout@v4
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: 1.88.0
|
||||
toolchain: ${{ env.RUST_TOOLCHAIN }}
|
||||
profile: minimal
|
||||
override: true
|
||||
- uses: swatinem/rust-cache@v2
|
||||
|
|
|
|||
12
.github/workflows/release.yml
vendored
12
.github/workflows/release.yml
vendored
|
|
@ -4,6 +4,8 @@ on:
|
|||
push:
|
||||
branches:
|
||||
- main
|
||||
env:
|
||||
RUST_TOOLCHAIN: "1.90"
|
||||
|
||||
jobs:
|
||||
# ──────────────── Linux (via Makefile) ────────────────
|
||||
|
|
@ -15,7 +17,7 @@ jobs:
|
|||
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: 1.88.0
|
||||
toolchain: ${{ env.RUST_TOOLCHAIN }}
|
||||
profile: minimal
|
||||
override: true
|
||||
|
||||
|
|
@ -69,7 +71,7 @@ jobs:
|
|||
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: 1.88.0
|
||||
toolchain: ${{ env.RUST_TOOLCHAIN }}
|
||||
profile: minimal
|
||||
override: true
|
||||
|
||||
|
|
@ -123,7 +125,7 @@ jobs:
|
|||
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: 1.88.0
|
||||
toolchain: ${{ env.RUST_TOOLCHAIN }}
|
||||
profile: minimal
|
||||
override: true
|
||||
|
||||
|
|
@ -154,7 +156,7 @@ jobs:
|
|||
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: 1.88.0
|
||||
toolchain: ${{ env.RUST_TOOLCHAIN }}
|
||||
profile: minimal
|
||||
override: true
|
||||
|
||||
|
|
@ -185,7 +187,7 @@ jobs:
|
|||
- uses: actions/checkout@v4
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: 1.88.0
|
||||
toolchain: ${{ env.RUST_TOOLCHAIN }}
|
||||
profile: minimal
|
||||
override: true
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,12 @@
|
|||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
## [v1.58.0]
|
||||
- Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans.
|
||||
- Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs.
|
||||
- Added support for scanning Google Cloud Storage buckets via `--gcs-bucket`, including optional prefixes and service-account authentication.
|
||||
- Added `--skip-aws-account` (now accepting comma-separated values) and `--skip-aws-account-file` to bypass live AWS validation for known canary/honey-token account IDs without triggering alerts. Kingfisher now ships with several canary AWS account IDs pre-seeded in the skip list and reports matching findings as "Not Attempted", with the "Response" containing "(skip list entry)", so it's clear that validation was intentionally skipped and why.
|
||||
|
||||
## [v1.57.0]
|
||||
- Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling
|
||||
- Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
[workspace.package]
|
||||
edition = "2021"
|
||||
rust-version = "1.88"
|
||||
rust-version = "1.90"
|
||||
license = "Apache-2.0"
|
||||
authors = ["Mick Grove <mick.grove@mongodb.com>"]
|
||||
homepage = "https://github.com/mongodb/kingfisher"
|
||||
|
|
@ -10,7 +10,7 @@ publish = false
|
|||
|
||||
[package]
|
||||
name = "kingfisher"
|
||||
version = "1.57.0"
|
||||
version = "1.58.0"
|
||||
description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
|
|
@ -122,7 +122,6 @@ reqwest-middleware = "0.4.2"
|
|||
tracing-subscriber = {version = "0.3.19", features = ["env-filter"] }
|
||||
tracing-core = "0.1.34"
|
||||
tree-sitter = "0.25.8"
|
||||
aws-smithy-runtime = "1.9.1"
|
||||
aws-smithy-http-client = "1.1.1"
|
||||
aws-smithy-runtime-api = "1.9.0"
|
||||
aws-smithy-types = "1.3.2"
|
||||
|
|
@ -192,6 +191,10 @@ walkdir = "2.5.0"
|
|||
p256 = "0.13.2"
|
||||
ed25519-dalek = { version = "2.2", features = ["pkcs8"] }
|
||||
aws-sdk-s3 = "1.100.0"
|
||||
gcloud-storage = { version = "1.1.1", default-features = false, features = [
|
||||
"rustls-tls",
|
||||
"auth",
|
||||
] }
|
||||
tokei = "12.1.2"
|
||||
|
||||
[target.'cfg(not(windows))'.dependencies]
|
||||
|
|
|
|||
22
Makefile
22
Makefile
|
|
@ -110,11 +110,11 @@ setup-zig:
|
|||
ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist
|
||||
@echo "Checking Rust toolchain…"
|
||||
@$(MAKE) check-rust || { \
|
||||
echo "🦀 Installing Rust 1.88.0 …"; \
|
||||
echo "🦀 Installing Rust 1.90.0 …"; \
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
|
||||
. $$HOME/.cargo/env; \
|
||||
rustup toolchain install 1.88.0; \
|
||||
rustup default 1.88.0; \
|
||||
rustup toolchain install 1.90.0; \
|
||||
rustup default 1.90.0; \
|
||||
}
|
||||
|
||||
@echo "📦 Installing build dependencies (musl, cmake, etc.)…"
|
||||
|
|
@ -150,11 +150,11 @@ ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist
|
|||
ubuntu-arm64: setup-zig # ensures Zig & cargo-zigbuild exist
|
||||
@echo "Checking Rust toolchain…"
|
||||
@$(MAKE) check-rust || { \
|
||||
echo "🦀 Installing Rust 1.88.0 …"; \
|
||||
echo "🦀 Installing Rust 1.90.0 …"; \
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
|
||||
. $$HOME/.cargo/env; \
|
||||
rustup toolchain install 1.88.0; \
|
||||
rustup default 1.88.0; \
|
||||
rustup toolchain install 1.90.0; \
|
||||
rustup default 1.90.0; \
|
||||
}
|
||||
|
||||
@echo "📦 Installing build dependencies (musl, cmake, etc.)…"
|
||||
|
|
@ -245,7 +245,7 @@ endif
|
|||
linux-x64: check-docker create-dockerignore
|
||||
@mkdir -p target/release
|
||||
docker run --platform linux/amd64 --rm \
|
||||
-v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\
|
||||
-v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\
|
||||
apk add --no-cache \
|
||||
musl-dev \
|
||||
gcc g++ make cmake pkgconfig \
|
||||
|
|
@ -256,7 +256,7 @@ linux-x64: check-docker create-dockerignore
|
|||
patch perl ragel && \
|
||||
git openssl-dev curl && \
|
||||
\
|
||||
cargo test --workspace --all-targets --release ; \
|
||||
cargo test --workspace --all-targets ; \
|
||||
\
|
||||
rustup target add x86_64-unknown-linux-musl && \
|
||||
\
|
||||
|
|
@ -274,7 +274,7 @@ linux-x64: check-docker create-dockerignore
|
|||
linux-arm64: check-docker create-dockerignore
|
||||
@mkdir -p target/release
|
||||
docker run --platform linux/arm64 --rm \
|
||||
-v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\
|
||||
-v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\
|
||||
apk add --no-cache \
|
||||
musl-dev \
|
||||
gcc g++ make cmake pkgconfig \
|
||||
|
|
@ -287,7 +287,7 @@ linux-arm64: check-docker create-dockerignore
|
|||
\
|
||||
rustup target add aarch64-unknown-linux-musl && \
|
||||
\
|
||||
cargo test --workspace --all-targets --release ; \
|
||||
cargo test --workspace --all-targets ; \
|
||||
\
|
||||
export PKG_CONFIG_ALLOW_CROSS=1 ; \
|
||||
export RUSTFLAGS="-C target-feature=+crt-static" ; \
|
||||
|
|
@ -385,7 +385,7 @@ check-rust:
|
|||
echo "Rust not found."; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
required=1.88.0; \
|
||||
required=1.90.0; \
|
||||
if [ $$(printf '%s\n' "$$required" "$$version" | sort -V | head -n1) != "$$required" ]; then \
|
||||
echo "Rust version $$version is older than required $$required."; \
|
||||
exit 1; \
|
||||
|
|
|
|||
258
README.md
258
README.md
|
|
@ -5,22 +5,23 @@
|
|||
|
||||
[](https://opensource.org/licenses/Apache-2.0)
|
||||
|
||||
Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production
|
||||
Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s SIMD-accelerated regex engine (Hyperscan) with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production.
|
||||
</p>
|
||||
|
||||
Originally forked from Praetorian’s Nosey Parker, Kingfisher **adds** live cloud-API validation; many more targets (GitLab, BitBucket, Gitea, S3, Docker, Jira, Confluence, Slack); compressed-file extraction and scanning; baseline and allowlist controls; language-aware detection (~20 languages); and a native Windows binary. See [Origins and Divergence](#origins-and-divergence) for details.
|
||||
Originally forked from Praetorian’s Nosey Parker, Kingfisher has since significantly expanded and diverged, adding live validation, 10+ new scan targets, and major architectural enhancements. See [Origins and Divergence](#origins-and-divergence) for details.
|
||||
|
||||
## Key Features
|
||||
|
||||
### Multiple Scan Targets
|
||||
<div align="center">
|
||||
|
||||
| Files / Dirs | Local Git | GitHub | GitLab | Azure DevOps | Bitbucket | Gitea |
|
||||
|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|
|
||||
| <img src="./docs/assets/icons/files.svg" height="40" alt="Files / Dirs"/><br/><sub>Files / Dirs</sub> | <img src="./docs/assets/icons/local-git.svg" height="40" alt="Local Git"/><br/><sub>Local Git</sub> | <img src="./docs/assets/icons/github.svg" height="40" alt="GitHub"/><br/><sub>GitHub</sub> | <img src="./docs/assets/icons/gitlab.svg" height="40" alt="GitLab"/><br/><sub>GitLab</sub> | <img src="./docs/assets/icons/azure-devops.svg" height="40" alt="Azure DevOps"/><br/><sub>Azure DevOps</sub> | <img src="./docs/assets/icons/bitbucket.svg" height="40" alt="Bitbucket"/><br/><sub>Bitbucket</sub> | <img src="./docs/assets/icons/gitea.svg" height="40" alt="Gitea"/><br/><sub>Gitea</sub> |
|
||||
| Files / Dirs | Local Git | GitHub | GitLab | Azure Repos | Bitbucket | Gitea | Hugging Face |
|
||||
|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|:-------------:|
|
||||
| <img src="./docs/assets/icons/files.svg" height="40" alt="Files / Dirs"/><br/><sub>Files / Dirs</sub> | <img src="./docs/assets/icons/local-git.svg" height="40" alt="Local Git"/><br/><sub>Local Git</sub> | <img src="./docs/assets/icons/github.svg" height="40" alt="GitHub"/><br/><sub>GitHub</sub> | <img src="./docs/assets/icons/gitlab.svg" height="40" alt="GitLab"/><br/><sub>GitLab</sub> | <img src="./docs/assets/icons/azure-devops.svg" height="40" alt="Azure Repos"/><br/><sub>Azure Repos</sub> | <img src="./docs/assets/icons/bitbucket.svg" height="40" alt="Bitbucket"/><br/><sub>Bitbucket</sub> | <img src="./docs/assets/icons/gitea.svg" height="40" alt="Gitea"/><br/><sub>Gitea</sub> |<img src="./docs/assets/icons/huggingface.svg" height="40" width="40" alt="Hugging Face"/><br/><sub>Hugging Face</sub> |
|
||||
|
||||
| Docker | Jira | Confluence | Slack | AWS S3 |
|
||||
|:------:|:----:|:-----------:|:-----:|:------:|
|
||||
| <img src="./docs/assets/icons/docker.svg" height="40" alt="Docker"/><br/><sub>Docker</sub> | <img src="./docs/assets/icons/jira.svg" height="40" alt="Jira"/><br/><sub>Jira</sub> | <img src="./docs/assets/icons/confluence.svg" height="40" alt="Confluence"/><br/><sub>Confluence</sub> | <img src="./docs/assets/icons/slack.svg" height="40" alt="Slack"/><br/><sub>Slack</sub> | <img src="./docs/assets/icons/aws-s3.svg" height="40" alt="AWS S3"/><br/><sub>AWS S3</sub> |
|
||||
| Docker | Jira | Confluence | Slack | AWS S3 | Google Cloud |
|
||||
|:------:|:----:|:-----------:|:-----:|:------:|:---:|
|
||||
| <img src="./docs/assets/icons/docker.svg" height="40" alt="Docker"/><br/><sub>Docker</sub> | <img src="./docs/assets/icons/jira.svg" height="40" alt="Jira"/><br/><sub>Jira</sub> | <img src="./docs/assets/icons/confluence.svg" height="40" alt="Confluence"/><br/><sub>Confluence</sub> | <img src="./docs/assets/icons/slack.svg" height="40" alt="Slack"/><br/><sub>Slack</sub> | <img src="./docs/assets/icons/aws-s3.svg" height="40" alt="AWS S3"/><br/><sub>AWS S3</sub> | <img src="./docs/assets/icons/gcs.svg" height="40" alt="Google Cloud Storage"/><br/><sub>Cloud Storage</sub> |
|
||||
|
||||
</div>
|
||||
|
||||
|
|
@ -41,17 +42,27 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
|
|||
<img src="docs/runtime-comparison.png" alt="Kingfisher Runtime Comparison" style="vertical-align: center;" />
|
||||
</p>
|
||||
|
||||
# Table of Contents
|
||||
|
||||
<details>
|
||||
|
||||
- [Kingfisher](#kingfisher)
|
||||
- [Key Features](#key-features)
|
||||
- [Multiple Scan Targets](#multiple-scan-targets)
|
||||
- [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules)
|
||||
- [Benchmark Results](#benchmark-results)
|
||||
- [Table of Contents](#table-of-contents)
|
||||
- [Getting Started](#getting-started)
|
||||
- [Installation](#installation)
|
||||
- [Run Kingfisher in Docker](#run-kingfisher-in-docker)
|
||||
- [Pre-built Releases](#pre-built-releases)
|
||||
- [Homebrew](#homebrew)
|
||||
- [Linux and macOS](#linux-and-macos)
|
||||
- [Windows](#windows)
|
||||
- [Compile](#compile)
|
||||
- [ Run Kingfisher in Docker](#-run-kingfisher-in-docker)
|
||||
- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance)
|
||||
- [Write Custom Rules!](#write-custom-rules)
|
||||
- [Usage](#usage)
|
||||
- [📝 Write Custom Rules!](#-write-custom-rules)
|
||||
- [🎉 Usage](#-usage)
|
||||
- [Basic Examples](#basic-examples)
|
||||
- [Scan with secret validation](#scan-with-secret-validation)
|
||||
- [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories)
|
||||
|
|
@ -67,6 +78,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
|
|||
- [Exclude specific paths](#exclude-specific-paths)
|
||||
- [Scan changes in CI pipelines](#scan-changes-in-ci-pipelines)
|
||||
- [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket)
|
||||
- [ Scanning a Google Cloud Storage Bucket](#-scanning-a-google-cloud-storage-bucket)
|
||||
- [ Scanning Docker Images](#-scanning-docker-images)
|
||||
- [ Scanning GitHub](#-scanning-github)
|
||||
- [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token)
|
||||
|
|
@ -79,8 +91,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
|
|||
- [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url)
|
||||
- [List GitLab repositories](#list-gitlab-repositories)
|
||||
- [ Scanning Azure Repos](#-scanning-azure-repos)
|
||||
- [Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-devops-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat)
|
||||
- [Scan specific Azure DevOps projects](#scan-specific-azure-devops-projects)
|
||||
- [Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-repos-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat)
|
||||
- [Scan specific Azure Repos projects](#scan-specific-azure-repos-projects)
|
||||
- [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration)
|
||||
- [List Azure repositories](#list-azure-repositories)
|
||||
- [ Scanning Gitea](#-scanning-gitea)
|
||||
|
|
@ -97,6 +109,12 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
|
|||
- [List Bitbucket repositories](#list-bitbucket-repositories)
|
||||
- [Authenticate to Bitbucket](#authenticate-to-bitbucket)
|
||||
- [Self-hosted Bitbucket Server](#self-hosted-bitbucket-server)
|
||||
- [ Scanning Hugging Face](#-scanning-hugging-face)
|
||||
- [Scan Hugging Face user](#scan-hugging-face-user)
|
||||
- [Scan Hugging Face organization](#scan-hugging-face-organization)
|
||||
- [Scan specific Hugging Face resources](#scan-specific-hugging-face-resources)
|
||||
- [List Hugging Face repositories](#list-hugging-face-repositories)
|
||||
- [Authenticate to Hugging Face](#authenticate-to-hugging-face)
|
||||
- [ Scanning Jira](#-scanning-jira)
|
||||
- [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query)
|
||||
- [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues)
|
||||
|
|
@ -107,7 +125,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
|
|||
- [Environment Variables for Tokens](#environment-variables-for-tokens)
|
||||
- [Exit Codes](#exit-codes)
|
||||
- [Update Checks](#update-checks)
|
||||
- [Advanced Options](#advanced-options)
|
||||
- [🤓 Advanced Options](#-advanced-options)
|
||||
- [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets)
|
||||
- [List Builtin Rules](#list-builtin-rules)
|
||||
- [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run)
|
||||
|
|
@ -117,6 +135,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
|
|||
- [Notable Scan Options](#notable-scan-options)
|
||||
- [Understanding `--confidence`](#understanding---confidence)
|
||||
- [Ignore known false positives](#ignore-known-false-positives)
|
||||
- [Skip Canary Tokens (AWS)](#skip-canary-tokens-aws)
|
||||
- [Common CLI flows](#common-cli-flows)
|
||||
- [Inline ignore directives](#inline-ignore-directives)
|
||||
- [Finding Fingerprint](#finding-fingerprint)
|
||||
- [Rule Performance Profiling](#rule-performance-profiling)
|
||||
|
|
@ -125,36 +145,58 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md))
|
|||
- [Roadmap](#roadmap)
|
||||
- [License](#license)
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
# Getting Started
|
||||
## Installation
|
||||
### Pre-built Releases
|
||||
Pre-built binaries are available from the [Releases](https://github.com/mongodb/kingfisher/releases) section.
|
||||
|
||||
### Homebrew
|
||||
|
||||
On macOS, you can install Kingfisher with Homebrew:
|
||||
|
||||
```bash
|
||||
brew install kingfisher
|
||||
```
|
||||
|
||||
Pre-built binaries are also available in the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page.
|
||||
### Linux and macOS
|
||||
|
||||
You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform:
|
||||
<details>
|
||||
|
||||
You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform.
|
||||
|
||||
```bash
|
||||
# Linux, macOS
|
||||
curl --silent --location \
|
||||
https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \
|
||||
sh && \
|
||||
ubi --project mongodb/kingfisher --in "$HOME/bin"
|
||||
ubi --project mongodb/kingfisher --in "$HOME/.local/bin"
|
||||
```
|
||||
|
||||
This installs and runs `ubi` and then places the `kingfisher` executable in `~/.local/bin` on Unix-like systems.
|
||||
|
||||
</details>
|
||||
|
||||
### Windows
|
||||
|
||||
<details>
|
||||
|
||||
You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform.
|
||||
|
||||
```powershell
|
||||
# Windows
|
||||
powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in .
|
||||
```
|
||||
|
||||
This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like
|
||||
systems (or the current directory on Windows).
|
||||
This installs and runs `ubi` and then places the `kingfisher` executable in the current directory on Windows.
|
||||
</details>
|
||||
|
||||
Or you may compile for your platform via `make`:
|
||||
|
||||
### Compile
|
||||
You may compile for your platform via `make`:
|
||||
|
||||
<details>
|
||||
|
||||
```bash
|
||||
# NOTE: Requires Docker
|
||||
|
|
@ -174,9 +216,14 @@ make darwin-all # builds both x64 and arm64
|
|||
make all # builds for every OS and architecture supported
|
||||
```
|
||||
|
||||
### Run Kingfisher in Docker
|
||||
</details>
|
||||
|
||||
### <img src="./docs/assets/icons/docker.svg" height="40" style="vertical-align:text-bottom;" alt="Docker"/> Run Kingfisher in Docker
|
||||
|
||||
Run the dockerized Kingfisher container
|
||||
|
||||
<details>
|
||||
|
||||
Run the dockerized Kingfisher container:
|
||||
```bash
|
||||
# GitHub Container Registry
|
||||
docker run --rm ghcr.io/mongodb/kingfisher:latest --version
|
||||
|
|
@ -233,6 +280,8 @@ docker run --rm \
|
|||
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
# 🔐 Detection Rules at a Glance
|
||||
|
||||
Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview:
|
||||
|
|
@ -248,7 +297,7 @@ Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything fr
|
|||
| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more |
|
||||
| **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more |
|
||||
|
||||
## Write Custom Rules!
|
||||
## 📝 Write Custom Rules!
|
||||
|
||||
Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential.
|
||||
|
||||
|
|
@ -258,7 +307,7 @@ First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom King
|
|||
|
||||
Once you've done that, you can supply your custom rules (defined in a YAML file) to Kingfisher at runtime --- no recompiling required!
|
||||
|
||||
# Usage
|
||||
# 🎉 Usage
|
||||
|
||||
## Basic Examples
|
||||
|
||||
|
|
@ -400,7 +449,7 @@ kingfisher scan ./my-project \
|
|||
-v
|
||||
```
|
||||
|
||||
## <img alt="GitHub" src="./docs/assets/icons/aws-s3.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning an AWS S3 Bucket
|
||||
## <img alt="AWS S3" src="./docs/assets/icons/aws-s3.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning an AWS S3 Bucket
|
||||
You can scan S3 objects directly:
|
||||
|
||||
```bash
|
||||
|
|
@ -415,7 +464,7 @@ Credential resolution happens in this order:
|
|||
|
||||
If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role.
|
||||
|
||||
Examples:
|
||||
Examples
|
||||
|
||||
```bash
|
||||
# using explicit keys
|
||||
|
|
@ -452,7 +501,30 @@ docker run --rm \
|
|||
scan --s3-bucket bucket-name
|
||||
```
|
||||
|
||||
## <img alt="Docker" src="./docs/assets/icons/docker.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Docker Images
|
||||
## <img src="./docs/assets/icons/gcs.svg" height="40" width="40" alt="Google Cloud Storage"/> Scanning a Google Cloud Storage Bucket
|
||||
|
||||
The `--gcs-bucket` flag streams objects directly from Google Cloud Storage. Authentication uses
|
||||
Application Default Credentials, so you can provide a service-account JSON file via the
|
||||
`GOOGLE_APPLICATION_CREDENTIALS` environment variable or by passing `--gcs-service-account`. Public
|
||||
buckets work without credentials.
|
||||
|
||||
```bash
|
||||
kingfisher scan --gcs-bucket bucket-name
|
||||
|
||||
# scan a sub-tree inside the bucket
|
||||
kingfisher scan --gcs-bucket bucket-name --gcs-prefix path/to/data/
|
||||
|
||||
# supply a service-account key explicitly
|
||||
kingfisher scan --gcs-bucket bucket-name --gcs-service-account /path/to/key.json
|
||||
```
|
||||
|
||||
Functional example:
|
||||
```bash
|
||||
kingfisher scan --gcs-bucket cloud-samples-data --gcs-prefix "storage/"
|
||||
```
|
||||
|
||||
|
||||
## <img alt="Docker" src="./docs/assets/icons/docker.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Docker Images
|
||||
|
||||
Kingfisher will first try to use any locally available image, then fall back to pulling via OCI.
|
||||
|
||||
|
|
@ -482,7 +554,7 @@ kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws
|
|||
kingfisher scan --docker-image private.registry.example.com/my-image:tag
|
||||
```
|
||||
|
||||
## <img alt="GitHub" src="./docs/assets/icons/github.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning GitHub
|
||||
## <img alt="GitHub" src="./docs/assets/icons/github.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning GitHub
|
||||
|
||||
### Scan GitHub organization (requires `KF_GITHUB_TOKEN`)
|
||||
|
||||
|
|
@ -524,7 +596,7 @@ KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/priva
|
|||
|
||||
---
|
||||
|
||||
## <img alt="GitLab" src="./docs/assets/icons/gitlab.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning GitLab
|
||||
## <img alt="GitLab" src="./docs/assets/icons/gitlab.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning GitLab
|
||||
|
||||
### Scan GitLab group (requires `KF_GITLAB_TOKEN`)
|
||||
|
||||
|
|
@ -580,18 +652,18 @@ kingfisher gitlab repos list --group my-group --include-subgroups
|
|||
# skip specific projects when listing or scanning (supports glob patterns)
|
||||
kingfisher gitlab repos list --group my-group --gitlab-exclude my-group/**/legacy-*
|
||||
```
|
||||
## <img alt="Azure Repos" src="./docs/assets/icons/azure-devops.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Azure Repos
|
||||
## <img alt="Azure Repos" src="./docs/assets/icons/azure-devops.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Azure Repos
|
||||
|
||||
### Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)
|
||||
### Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)
|
||||
|
||||
```bash
|
||||
kingfisher scan --azure-organization my-org
|
||||
|
||||
# Azure DevOps Server example
|
||||
# Azure Repos Server example
|
||||
KF_AZURE_PAT="pat" kingfisher scan --azure-organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/
|
||||
```
|
||||
|
||||
### Scan specific Azure DevOps projects
|
||||
### Scan specific Azure Repos projects
|
||||
|
||||
Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects.
|
||||
|
||||
|
|
@ -621,7 +693,7 @@ kingfisher azure repos list --project my-org/app --project my-org/api
|
|||
# skip specific repositories while listing (supports glob patterns)
|
||||
kingfisher azure repos list --organization my-org --azure-exclude my-org/**/experimental-*
|
||||
```
|
||||
## <img alt="Gitea" src="./docs/assets/icons/gitea.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Gitea
|
||||
## <img alt="Gitea" src="./docs/assets/icons/gitea.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Gitea
|
||||
|
||||
### Scan Gitea organization (requires `KF_GITEA_TOKEN`)
|
||||
|
||||
|
|
@ -673,7 +745,7 @@ KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --all-gitea-organizations
|
|||
# self-hosted example
|
||||
KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/
|
||||
```
|
||||
## <img alt="Bitbucket" src="./docs/assets/icons/bitbucket.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Bitbucket
|
||||
## <img alt="Bitbucket" src="./docs/assets/icons/bitbucket.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Bitbucket
|
||||
### Scan Bitbucket workspace
|
||||
|
||||
```bash
|
||||
|
|
@ -745,7 +817,45 @@ Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, fo
|
|||
`https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with
|
||||
`--bitbucket-username` and `--bitbucket-token`, and pass `--ignore-certs` when
|
||||
connecting to HTTP or otherwise insecure instances.
|
||||
## <img alt="Jira" src="./docs/assets/icons/jira.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Jira
|
||||
## <img src="./docs/assets/icons/huggingface.svg" height="40" width="40" alt="Hugging Face"/> Scanning Hugging Face
|
||||
|
||||
Hugging Face hosts git repositories for models, datasets, and Spaces. Kingfisher can enumerate and scan all three resource types.
|
||||
|
||||
### Scan Hugging Face user
|
||||
|
||||
```bash
|
||||
kingfisher scan --huggingface-user <username>
|
||||
```
|
||||
|
||||
### Scan Hugging Face organization
|
||||
|
||||
```bash
|
||||
kingfisher scan --huggingface-organization <orgname>
|
||||
```
|
||||
|
||||
### Scan specific Hugging Face resources
|
||||
|
||||
Scan individual repositories by ID (owner/name) or by passing the full HTTPS URL:
|
||||
|
||||
```bash
|
||||
kingfisher scan --huggingface-model <owner/model>
|
||||
kingfisher scan --huggingface-dataset https://huggingface.co/datasets/<owner>/<dataset>
|
||||
kingfisher scan --huggingface-space <owner/space>
|
||||
```
|
||||
|
||||
Use `--huggingface-exclude` to omit results returned by user or organization enumeration. Prefix values with `model:`, `dataset:`, or `space:` when you only want to skip a specific resource type.
|
||||
|
||||
### List Hugging Face repositories
|
||||
|
||||
```bash
|
||||
kingfisher huggingface repos list --huggingface-user <username>
|
||||
```
|
||||
|
||||
### Authenticate to Hugging Face
|
||||
|
||||
Private repositories require an access token provided through the `KF_HUGGINGFACE_TOKEN` environment variable. For git authentication the helper also honours `KF_HUGGINGFACE_USERNAME` (default `hf_user`).
|
||||
|
||||
## <img alt="Jira" src="./docs/assets/icons/jira.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Jira
|
||||
|
||||
### Scan Jira issues matching a JQL query
|
||||
|
||||
|
|
@ -764,7 +874,7 @@ KF_JIRA_TOKEN="token" kingfisher scan \
|
|||
--max-results 1000
|
||||
```
|
||||
|
||||
## <img alt="Confluence" src="./docs/assets/icons/confluence.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Confluence
|
||||
## <img alt="Confluence" src="./docs/assets/icons/confluence.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Confluence
|
||||
### Scan Confluence pages matching a CQL query
|
||||
|
||||
```bash
|
||||
|
|
@ -789,7 +899,7 @@ Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environ
|
|||
|
||||
To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions.
|
||||
|
||||
## <img alt="Slack" src="./docs/assets/icons/slack.svg" width="20" height="20" style="vertical-align:text-bottom;"> Scanning Slack
|
||||
## <img alt="Slack" src="./docs/assets/icons/slack.svg" width="40" height="40" style="vertical-align:text-bottom;"> Scanning Slack
|
||||
### Scan Slack messages matching a search query
|
||||
|
||||
```bash
|
||||
|
|
@ -811,11 +921,13 @@ KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan \
|
|||
| `KF_GITLAB_TOKEN` | GitLab Personal Access Token |
|
||||
| `KF_GITEA_TOKEN` | Gitea Personal Access Token |
|
||||
| `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) |
|
||||
| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure DevOps Personal Access Token |
|
||||
| `KF_AZURE_USERNAME` | Username to use with Azure DevOps PATs (defaults to `pat` when unset) |
|
||||
| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure Repos Personal Access Token |
|
||||
| `KF_AZURE_USERNAME` | Username to use with Azure Repos PATs (defaults to `pat` when unset) |
|
||||
| `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication |
|
||||
| `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token |
|
||||
| `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token |
|
||||
| `KF_HUGGINGFACE_TOKEN` | Hugging Face access token for API enumeration and git cloning |
|
||||
| `KF_HUGGINGFACE_USERNAME` | Optional username for Hugging Face git operations (defaults to `hf_user`) |
|
||||
| `KF_JIRA_TOKEN` | Jira API token |
|
||||
| `KF_CONFLUENCE_TOKEN` | Confluence API token |
|
||||
| `KF_SLACK_TOKEN` | Slack API token |
|
||||
|
|
@ -869,7 +981,7 @@ Kingfisher automatically queries GitHub for a newer release when it starts and t
|
|||
|
||||
- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely
|
||||
|
||||
# Advanced Options
|
||||
# 🤓 Advanced Options
|
||||
|
||||
## Build a Baseline / Detect New Secrets
|
||||
|
||||
|
|
@ -963,6 +1075,8 @@ leaves the default unchanged.
|
|||
- `--manage-baseline`: Create or update the baseline file with current findings
|
||||
- `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable)
|
||||
- `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable)
|
||||
- `--skip-aws-account <ACCOUNT_ID>`: Skip live AWS validation for findings tied to the specified AWS account number (repeatable, accepts comma-separated lists)
|
||||
- `--skip-aws-account-file <FILE>`: Load AWS account numbers to skip from a file (one account per line; `#` comments allowed)
|
||||
- `--ignore-comment <DIRECTIVE>`: Honor additional inline directives from other scanners (repeatable; e.g. `--ignore-comment "gitleaks:allow"`)
|
||||
- `--no-ignore`: Disable inline directives entirely so every match is reported
|
||||
## Understanding `--confidence`
|
||||
|
|
@ -996,6 +1110,66 @@ kingfisher scan \
|
|||
|
||||
If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early.
|
||||
|
||||
### Skip Canary Tokens (AWS)
|
||||
|
||||
Canary/honey tokens are intentionally leaked credentials used to catch misuse. Kingfisher can **recognize and skip** known AWS canary accounts so hygiene scans don’t set off alerts.
|
||||
|
||||
**How to skip**
|
||||
Pass the 12-digit AWS account IDs for your canaries via `--skip-aws-account` (comma-separated) or `--skip-aws-account-file` (one ID per line; blank lines and `#` comments allowed). Kingfisher also ships with a **pre-seeded (but not exhaustive)** list of Thinkst Canary account IDs used by canarytokens.org, so many are skipped automatically.
|
||||
|
||||
```bash
|
||||
kingfisher scan /path/to/code \
|
||||
--skip-aws-account "171436882533,534261010715"
|
||||
|
||||
# or combine preloaded canary IDs with a just-created decoy account
|
||||
printf '999900001111\n534261010715\n' > /tmp/canary_accounts.txt
|
||||
|
||||
kingfisher scan /path/to/repo \
|
||||
--skip-aws-account-file /tmp/canary_accounts.txt
|
||||
|
||||
```
|
||||
|
||||
**What you’ll see**
|
||||
Findings tied to a skip-listed account report `Validation: Not Attempted` and note in the `Response:` that the entry came from the skip list:
|
||||
|
||||
```bash
|
||||
AWS SECRET ACCESS KEY => [KINGFISHER.AWS.2]
|
||||
|Finding.......: <REDACTED>
|
||||
|Fingerprint...: 2141074333616819500
|
||||
|Confidence....: medium
|
||||
|Entropy.......: 5.00
|
||||
|Validation....: Not Attempted
|
||||
|__Response....: (skip list entry) AWS validation not attempted for account 171436882533.
|
||||
|Language......: Unknown
|
||||
|Line Num......: 21
|
||||
|Path..........: /tmp/test_canary_accounts.log
|
||||
```
|
||||
|
||||
**Why this matters**
|
||||
Skipping prevents noisy tripwires in prod telemetry while keeping the status explicit—“Not Attempted” isn’t a pass. If needed, verify these credentials out-of-band or with a safe, non-triggering method.
|
||||
|
||||
|
||||
#### Common CLI flows
|
||||
|
||||
```bash
|
||||
# Skip a few in-house canaries during a filesystem scan
|
||||
kingfisher scan repo/ \
|
||||
--skip-aws-account "111122223333,444455556666"
|
||||
|
||||
# Read a longer list from disk
|
||||
kingfisher scan repo/ \
|
||||
--skip-aws-account-file /tmp/scripts/canary_accounts.txt
|
||||
|
||||
# Combine preloaded canary IDs with a just-created decoy account
|
||||
printf '999900001111\n534261010715\n' > /tmp/new_canary.txt
|
||||
|
||||
kingfisher scan /path/to/repo \
|
||||
--skip-aws-account-file /tmp/new_canary.txt
|
||||
|
||||
```
|
||||
|
||||
Tip: if you manage multiple canary fleets (Thinkst, self-hosted alternatives, or bespoke decoys), keep the account IDs under version control alongside your infrastructure-as-code so security teams can rotate or expand the skip list without editing pipelines.
|
||||
|
||||
### Inline ignore directives
|
||||
|
||||
Add `kingfisher:ignore` anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value:
|
||||
|
|
|
|||
38
docs/assets/icons/gcs.svg
Normal file
38
docs/assets/icons/gcs.svg
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<svg id="standard_product_icon" xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 512 512">
|
||||
<!-- Generator: Adobe Illustrator 29.1.0, SVG Export Plug-In . SVG Version: 2.1.0 Build 142) -->
|
||||
<defs>
|
||||
<style>
|
||||
.st0 {
|
||||
fill: none;
|
||||
}
|
||||
|
||||
.st1 {
|
||||
fill: #4285f4;
|
||||
}
|
||||
|
||||
.st2 {
|
||||
fill: #34a853;
|
||||
}
|
||||
|
||||
.st3 {
|
||||
fill: #fbbc04;
|
||||
}
|
||||
|
||||
.st4 {
|
||||
fill: #ea4335;
|
||||
}
|
||||
</style>
|
||||
</defs>
|
||||
<g id="bounding_box">
|
||||
<rect class="st0" width="512" height="512"/>
|
||||
</g>
|
||||
<g id="art">
|
||||
<path class="st2" d="M442,277.9H70c-8.8,0-16,7.2-16,16v148.1c0,8.8,7.2,16,16,16h107.2c1.5.5,3.1.7,4.8.7s3.3-.3,4.8-.7h255.2c8.8,0,16-7.2,16-16v-148.1c0-8.8-7.2-16-16-16ZM86,309.9h80v116.1h-80v-116.1ZM426,425.9h-228v-116.1h228v116.1Z"/>
|
||||
<path class="st3" d="M442,54H70c-8.8,0-16,7.2-16,16v148.8c0,8.8,7.2,16,16,16h372c8.8,0,16-7.2,16-16V70c0-8.8-7.2-16-16-16ZM86,86h80v116.8h-80v-116.8ZM426,202.8h-228v-116.8h228v116.8Z"/>
|
||||
<path class="st4" d="M442,234.8h-16V86H54v-16c0-8.8,7.2-16,16-16h372c8.8,0,16,7.2,16,16v148.8c0,8.8-7.2,16-16,16Z"/>
|
||||
<path class="st1" d="M442,457.9h-16v-148.1H54v-16c0-8.8,7.2-16,16-16h372c8.8,0,16,7.2,16,16v148.1c0,8.8-7.2,16-16,16Z"/>
|
||||
<circle class="st4" cx="349" cy="144.4" r="37"/>
|
||||
<circle class="st1" cx="349" cy="367.9" r="37"/>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 1.4 KiB |
41
docs/assets/icons/huggingface.svg
Normal file
41
docs/assets/icons/huggingface.svg
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
<svg xmlns="http://www.w3.org/2000/svg" width="95" height="88" fill="none">
|
||||
<path
|
||||
fill="#fff"
|
||||
d="M94.25 70.08a8.28 8.28 0 0 1-.43 6.46 10.57 10.57 0 0 1-3 3.6 25.18 25.18 0 0 1-5.7 3.2 65.74 65.74 0 0 1-7.56 2.65 46.67 46.67 0 0 1-11.42 1.68c-5.42.05-10.09-1.23-13.4-4.5a40.4 40.4 0 0 1-10.14.03c-3.34 3.25-7.99 4.52-13.39 4.47a46.82 46.82 0 0 1-11.43-1.68 66.37 66.37 0 0 1-7.55-2.65c-2.28-.98-4.17-2-5.68-3.2a10.5 10.5 0 0 1-3.02-3.6c-.99-2-1.18-4.3-.42-6.46a8.54 8.54 0 0 1-.33-5.63c.25-.95.66-1.83 1.18-2.61a8.67 8.67 0 0 1 2.1-8.47 8.23 8.23 0 0 1 2.82-2.07 41.75 41.75 0 1 1 81.3-.12 8.27 8.27 0 0 1 3.11 2.19 8.7 8.7 0 0 1 2.1 8.47c.52.78.93 1.66 1.18 2.61a8.61 8.61 0 0 1-.32 5.63Z"
|
||||
/>
|
||||
<path fill="#FFD21E" d="M47.21 76.5a34.75 34.75 0 1 0 0-69.5 34.75 34.75 0 0 0 0 69.5Z" />
|
||||
<path
|
||||
fill="#FF9D0B"
|
||||
d="M81.96 41.75a34.75 34.75 0 1 0-69.5 0 34.75 34.75 0 0 0 69.5 0Zm-73.5 0a38.75 38.75 0 1 1 77.5 0 38.75 38.75 0 0 1-77.5 0Z"
|
||||
/>
|
||||
<path
|
||||
fill="#3A3B45"
|
||||
d="M58.5 32.3c1.28.44 1.78 3.06 3.07 2.38a5 5 0 1 0-6.76-2.07c.61 1.15 2.55-.72 3.7-.32ZM34.95 32.3c-1.28.44-1.79 3.06-3.07 2.38a5 5 0 1 1 6.76-2.07c-.61 1.15-2.56-.72-3.7-.32Z"
|
||||
/>
|
||||
<path
|
||||
fill="#FF323D"
|
||||
d="M46.96 56.29c9.83 0 13-8.76 13-13.26 0-2.34-1.57-1.6-4.09-.36-2.33 1.15-5.46 2.74-8.9 2.74-7.19 0-13-6.88-13-2.38s3.16 13.26 13 13.26Z"
|
||||
/>
|
||||
<path
|
||||
fill="#3A3B45"
|
||||
fill-rule="evenodd"
|
||||
d="M39.43 54a8.7 8.7 0 0 1 5.3-4.49c.4-.12.81.57 1.24 1.28.4.68.82 1.37 1.24 1.37.45 0 .9-.68 1.33-1.35.45-.7.89-1.38 1.32-1.25a8.61 8.61 0 0 1 5 4.17c3.73-2.94 5.1-7.74 5.1-10.7 0-2.34-1.57-1.6-4.09-.36l-.14.07c-2.31 1.15-5.39 2.67-8.77 2.67s-6.45-1.52-8.77-2.67c-2.6-1.29-4.23-2.1-4.23.29 0 3.05 1.46 8.06 5.47 10.97Z"
|
||||
clip-rule="evenodd"
|
||||
/>
|
||||
<path
|
||||
fill="#FF9D0B"
|
||||
d="M70.71 37a3.25 3.25 0 1 0 0-6.5 3.25 3.25 0 0 0 0 6.5ZM24.21 37a3.25 3.25 0 1 0 0-6.5 3.25 3.25 0 0 0 0 6.5ZM17.52 48c-1.62 0-3.06.66-4.07 1.87a5.97 5.97 0 0 0-1.33 3.76 7.1 7.1 0 0 0-1.94-.3c-1.55 0-2.95.59-3.94 1.66a5.8 5.8 0 0 0-.8 7 5.3 5.3 0 0 0-1.79 2.82c-.24.9-.48 2.8.8 4.74a5.22 5.22 0 0 0-.37 5.02c1.02 2.32 3.57 4.14 8.52 6.1 3.07 1.22 5.89 2 5.91 2.01a44.33 44.33 0 0 0 10.93 1.6c5.86 0 10.05-1.8 12.46-5.34 3.88-5.69 3.33-10.9-1.7-15.92-2.77-2.78-4.62-6.87-5-7.77-.78-2.66-2.84-5.62-6.25-5.62a5.7 5.7 0 0 0-4.6 2.46c-1-1.26-1.98-2.25-2.86-2.82A7.4 7.4 0 0 0 17.52 48Zm0 4c.51 0 1.14.22 1.82.65 2.14 1.36 6.25 8.43 7.76 11.18.5.92 1.37 1.31 2.14 1.31 1.55 0 2.75-1.53.15-3.48-3.92-2.93-2.55-7.72-.68-8.01.08-.02.17-.02.24-.02 1.7 0 2.45 2.93 2.45 2.93s2.2 5.52 5.98 9.3c3.77 3.77 3.97 6.8 1.22 10.83-1.88 2.75-5.47 3.58-9.16 3.58-3.81 0-7.73-.9-9.92-1.46-.11-.03-13.45-3.8-11.76-7 .28-.54.75-.76 1.34-.76 2.38 0 6.7 3.54 8.57 3.54.41 0 .7-.17.83-.6.79-2.85-12.06-4.05-10.98-8.17.2-.73.71-1.02 1.44-1.02 3.14 0 10.2 5.53 11.68 5.53.11 0 .2-.03.24-.1.74-1.2.33-2.04-4.9-5.2-5.21-3.16-8.88-5.06-6.8-7.33.24-.26.58-.38 1-.38 3.17 0 10.66 6.82 10.66 6.82s2.02 2.1 3.25 2.1c.28 0 .52-.1.68-.38.86-1.46-8.06-8.22-8.56-11.01-.34-1.9.24-2.85 1.31-2.85Z"
|
||||
/>
|
||||
<path
|
||||
fill="#FFD21E"
|
||||
d="M38.6 76.69c2.75-4.04 2.55-7.07-1.22-10.84-3.78-3.77-5.98-9.3-5.98-9.3s-.82-3.2-2.69-2.9c-1.87.3-3.24 5.08.68 8.01 3.91 2.93-.78 4.92-2.29 2.17-1.5-2.75-5.62-9.82-7.76-11.18-2.13-1.35-3.63-.6-3.13 2.2.5 2.79 9.43 9.55 8.56 11-.87 1.47-3.93-1.71-3.93-1.71s-9.57-8.71-11.66-6.44c-2.08 2.27 1.59 4.17 6.8 7.33 5.23 3.16 5.64 4 4.9 5.2-.75 1.2-12.28-8.53-13.36-4.4-1.08 4.11 11.77 5.3 10.98 8.15-.8 2.85-9.06-5.38-10.74-2.18-1.7 3.21 11.65 6.98 11.76 7.01 4.3 1.12 15.25 3.49 19.08-2.12Z"
|
||||
/>
|
||||
<path
|
||||
fill="#FF9D0B"
|
||||
d="M77.4 48c1.62 0 3.07.66 4.07 1.87a5.97 5.97 0 0 1 1.33 3.76 7.1 7.1 0 0 1 1.95-.3c1.55 0 2.95.59 3.94 1.66a5.8 5.8 0 0 1 .8 7 5.3 5.3 0 0 1 1.78 2.82c.24.9.48 2.8-.8 4.74a5.22 5.22 0 0 1 .37 5.02c-1.02 2.32-3.57 4.14-8.51 6.1-3.08 1.22-5.9 2-5.92 2.01a44.33 44.33 0 0 1-10.93 1.6c-5.86 0-10.05-1.8-12.46-5.34-3.88-5.69-3.33-10.9 1.7-15.92 2.78-2.78 4.63-6.87 5.01-7.77.78-2.66 2.83-5.62 6.24-5.62a5.7 5.7 0 0 1 4.6 2.46c1-1.26 1.98-2.25 2.87-2.82A7.4 7.4 0 0 1 77.4 48Zm0 4c-.51 0-1.13.22-1.82.65-2.13 1.36-6.25 8.43-7.76 11.18a2.43 2.43 0 0 1-2.14 1.31c-1.54 0-2.75-1.53-.14-3.48 3.91-2.93 2.54-7.72.67-8.01a1.54 1.54 0 0 0-.24-.02c-1.7 0-2.45 2.93-2.45 2.93s-2.2 5.52-5.97 9.3c-3.78 3.77-3.98 6.8-1.22 10.83 1.87 2.75 5.47 3.58 9.15 3.58 3.82 0 7.73-.9 9.93-1.46.1-.03 13.45-3.8 11.76-7-.29-.54-.75-.76-1.34-.76-2.38 0-6.71 3.54-8.57 3.54-.42 0-.71-.17-.83-.6-.8-2.85 12.05-4.05 10.97-8.17-.19-.73-.7-1.02-1.44-1.02-3.14 0-10.2 5.53-11.68 5.53-.1 0-.19-.03-.23-.1-.74-1.2-.34-2.04 4.88-5.2 5.23-3.16 8.9-5.06 6.8-7.33-.23-.26-.57-.38-.98-.38-3.18 0-10.67 6.82-10.67 6.82s-2.02 2.1-3.24 2.1a.74.74 0 0 1-.68-.38c-.87-1.46 8.05-8.22 8.55-11.01.34-1.9-.24-2.85-1.31-2.85Z"
|
||||
/>
|
||||
<path
|
||||
fill="#FFD21E"
|
||||
d="M56.33 76.69c-2.75-4.04-2.56-7.07 1.22-10.84 3.77-3.77 5.97-9.3 5.97-9.3s.82-3.2 2.7-2.9c1.86.3 3.23 5.08-.68 8.01-3.92 2.93.78 4.92 2.28 2.17 1.51-2.75 5.63-9.82 7.76-11.18 2.13-1.35 3.64-.6 3.13 2.2-.5 2.79-9.42 9.55-8.55 11 .86 1.47 3.92-1.71 3.92-1.71s9.58-8.71 11.66-6.44c2.08 2.27-1.58 4.17-6.8 7.33-5.23 3.16-5.63 4-4.9 5.2.75 1.2 12.28-8.53 13.36-4.4 1.08 4.11-11.76 5.3-10.97 8.15.8 2.85 9.05-5.38 10.74-2.18 1.69 3.21-11.65 6.98-11.76 7.01-4.31 1.12-15.26 3.49-19.08-2.12Z"
|
||||
/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 5.1 KiB |
73
src/cli/commands/huggingface.rs
Normal file
73
src/cli/commands/huggingface.rs
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
use clap::{Args, Subcommand};
|
||||
|
||||
use crate::cli::commands::output::OutputArgs;
|
||||
|
||||
use super::github::GitHubOutputFormat;
|
||||
|
||||
/// Top-level Hugging Face command group
// Argument container for the `huggingface` subcommand. It carries no options
// of its own, only the nested subcommand chosen on the command line.
#[derive(Args, Debug)]
pub struct HuggingFaceArgs {
    // The selected `huggingface <...>` subcommand.
    #[command(subcommand)]
    pub command: HuggingFaceCommand,
}
|
||||
|
||||
// Subcommands available directly beneath `huggingface`.
#[derive(Subcommand, Debug)]
pub enum HuggingFaceCommand {
    /// Interact with Hugging Face repositories
    // `#[command(subcommand)]` nests the repository actions one level deeper,
    // giving invocations of the form `huggingface repos <action>`.
    #[command(subcommand)]
    Repos(HuggingFaceReposCommand),
}
|
||||
|
||||
// Actions available beneath `huggingface repos`.
#[derive(Subcommand, Debug)]
pub enum HuggingFaceReposCommand {
    /// List Hugging Face repositories
    // Carries the repository selectors and output options for the listing.
    List(HuggingFaceReposListArgs),
}
|
||||
|
||||
// Arguments accepted by `huggingface repos list`. Both fields are flattened,
// so their options appear directly on the `list` command rather than under a
// nested group.
#[derive(Args, Debug, Clone)]
pub struct HuggingFaceReposListArgs {
    // Which users, organizations, or individual repositories to enumerate.
    #[command(flatten)]
    pub repo_specifiers: HuggingFaceRepoSpecifiers,

    // Output destination/format options, parameterized by the Hugging Face
    // output format type.
    #[command(flatten)]
    pub output_args: OutputArgs<HuggingFaceOutputFormat>,
}
|
||||
|
||||
// Repository selection flags for the `huggingface repos` commands. Every
// option is a `Vec<String>`, so each flag can be supplied more than once.
#[derive(Args, Debug, Clone, Default)]
pub struct HuggingFaceRepoSpecifiers {
    /// Models, datasets, and Spaces owned by these users
    #[arg(long = "huggingface-user")]
    pub user: Vec<String>,

    /// Models, datasets, and Spaces owned by these organizations
    // `--huggingface-org` is accepted as an alias for the long flag.
    #[arg(long = "huggingface-organization", alias = "huggingface-org")]
    pub organization: Vec<String>,

    /// Specific models to scan (format: owner/name or full URL)
    #[arg(long = "huggingface-model")]
    pub model: Vec<String>,

    /// Specific datasets to scan (format: owner/name or full URL)
    #[arg(long = "huggingface-dataset")]
    pub dataset: Vec<String>,

    /// Specific Spaces to scan (format: owner/name or full URL)
    #[arg(long = "huggingface-space")]
    pub space: Vec<String>,

    /// Skip specific repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
    // Exclusions only filter results; on their own they do not select
    // anything (see `is_empty`, which ignores this field).
    #[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
    pub exclude: Vec<String>,
}
|
||||
|
||||
impl HuggingFaceRepoSpecifiers {
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.user.is_empty()
|
||||
&& self.organization.is_empty()
|
||||
&& self.model.is_empty()
|
||||
&& self.dataset.is_empty()
|
||||
&& self.space.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
pub type HuggingFaceOutputFormat = GitHubOutputFormat;
|
||||
|
|
@ -28,6 +28,11 @@ pub struct InputSpecifierArgs {
|
|||
"gitlab_group",
|
||||
"gitea_user",
|
||||
"gitea_organization",
|
||||
"huggingface_user",
|
||||
"huggingface_organization",
|
||||
"huggingface_model",
|
||||
"huggingface_dataset",
|
||||
"huggingface_space",
|
||||
"bitbucket_user",
|
||||
"bitbucket_workspace",
|
||||
"bitbucket_project",
|
||||
|
|
@ -43,7 +48,8 @@ pub struct InputSpecifierArgs {
|
|||
"confluence_url",
|
||||
"docker_image",
|
||||
"slack_query",
|
||||
"s3_bucket"
|
||||
"s3_bucket",
|
||||
"gcs_bucket"
|
||||
]),
|
||||
num_args = 0..,
|
||||
value_hint = ValueHint::AnyPath
|
||||
|
|
@ -120,6 +126,30 @@ pub struct InputSpecifierArgs {
|
|||
#[arg(long, alias = "include-subgroups")]
|
||||
pub gitlab_include_subgroups: bool,
|
||||
|
||||
/// Scan models, datasets, and Spaces belonging to the specified Hugging Face users
|
||||
#[arg(long = "huggingface-user")]
|
||||
pub huggingface_user: Vec<String>,
|
||||
|
||||
/// Scan models, datasets, and Spaces belonging to the specified Hugging Face organizations
|
||||
#[arg(long = "huggingface-organization", alias = "huggingface-org")]
|
||||
pub huggingface_organization: Vec<String>,
|
||||
|
||||
/// Scan a specific Hugging Face model (format: owner/name or full URL)
|
||||
#[arg(long = "huggingface-model")]
|
||||
pub huggingface_model: Vec<String>,
|
||||
|
||||
/// Scan a specific Hugging Face dataset (format: owner/name or full URL)
|
||||
#[arg(long = "huggingface-dataset")]
|
||||
pub huggingface_dataset: Vec<String>,
|
||||
|
||||
/// Scan a specific Hugging Face Space (format: owner/name or full URL)
|
||||
#[arg(long = "huggingface-space")]
|
||||
pub huggingface_space: Vec<String>,
|
||||
|
||||
/// Skip specific Hugging Face repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
|
||||
#[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
|
||||
pub huggingface_exclude: Vec<String>,
|
||||
|
||||
// Gitea Options
|
||||
/// Scan repositories belonging to the specified Gitea user
|
||||
#[arg(long)]
|
||||
|
|
@ -256,6 +286,18 @@ pub struct InputSpecifierArgs {
|
|||
#[arg(long, requires = "s3_bucket")]
|
||||
pub aws_local_profile: Option<String>,
|
||||
|
||||
/// Scan the specified Google Cloud Storage bucket
|
||||
#[arg(long)]
|
||||
pub gcs_bucket: Option<String>,
|
||||
|
||||
/// Optional prefix within the GCS bucket
|
||||
#[arg(long, requires = "gcs_bucket")]
|
||||
pub gcs_prefix: Option<String>,
|
||||
|
||||
/// Path to a service account JSON file for GCS authentication
|
||||
#[arg(long, value_hint = ValueHint::FilePath, requires = "gcs_bucket")]
|
||||
pub gcs_service_account: Option<PathBuf>,
|
||||
|
||||
/// Docker/OCI images to scan (no local Docker required)
|
||||
#[arg(long = "docker-image")]
|
||||
pub docker_image: Vec<String>,
|
||||
|
|
@ -299,7 +341,6 @@ pub struct ContentFilteringArgs {
|
|||
#[arg(
|
||||
long = "max-file-size",
|
||||
visible_alias = "max-filesize", // also show in --help
|
||||
// alias = "max-filesize", // use this instead if you DON’T want it shown in --help
|
||||
default_value_t = 256.0,
|
||||
value_name = "MB"
|
||||
)]
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ pub mod bitbucket;
|
|||
pub mod gitea;
|
||||
pub mod github;
|
||||
pub mod gitlab;
|
||||
pub mod huggingface;
|
||||
pub mod inputs;
|
||||
pub mod output;
|
||||
pub mod rules;
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
use clap::{Args, ValueEnum};
|
||||
use std::path::PathBuf;
|
||||
use strum::Display;
|
||||
use tracing::debug;
|
||||
|
||||
|
|
@ -119,6 +120,14 @@ pub struct ScanArgs {
|
|||
#[arg(long = "skip-word", value_name = "WORD")]
|
||||
pub skip_word: Vec<String>,
|
||||
|
||||
/// AWS account IDs whose findings should skip live credential validation (repeatable)
|
||||
#[arg(long = "skip-aws-account", value_name = "ACCOUNT_ID", value_delimiter = ',')]
|
||||
pub skip_aws_account: Vec<String>,
|
||||
|
||||
/// File containing AWS account IDs to skip (one per line, `#` comments ignored)
|
||||
#[arg(long = "skip-aws-account-file", value_name = "FILE")]
|
||||
pub skip_aws_account_file: Option<PathBuf>,
|
||||
|
||||
/// Additional inline ignore directives to recognise (repeatable)
|
||||
#[arg(long = "ignore-comment", value_name = "DIRECTIVE")]
|
||||
pub extra_ignore_comments: Vec<String>,
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ use tracing::Level;
|
|||
|
||||
use crate::cli::commands::{
|
||||
azure::AzureArgs, bitbucket::BitbucketArgs, gitea::GiteaArgs, github::GitHubArgs,
|
||||
gitlab::GitLabArgs, rules::RulesArgs, scan::ScanArgs,
|
||||
gitlab::GitLabArgs, huggingface::HuggingFaceArgs, rules::RulesArgs, scan::ScanArgs,
|
||||
};
|
||||
|
||||
#[deny(missing_docs)]
|
||||
|
|
@ -81,6 +81,10 @@ pub enum Command {
|
|||
#[command(name = "azure")]
|
||||
Azure(AzureArgs),
|
||||
|
||||
/// Interact with the Hugging Face Hub
|
||||
#[command(name = "huggingface")]
|
||||
HuggingFace(HuggingFaceArgs),
|
||||
|
||||
/// Manage rules
|
||||
#[command(alias = "rule")]
|
||||
Rules(RulesArgs),
|
||||
|
|
|
|||
96
src/gcs.rs
Normal file
96
src/gcs.rs
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
use std::path::Path;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use gcloud_storage::{
|
||||
client::{google_cloud_auth::credentials::CredentialsFile, Client, ClientConfig},
|
||||
http::objects::{
|
||||
download::Range,
|
||||
get::GetObjectRequest,
|
||||
list::{ListObjectsRequest, ListObjectsResponse},
|
||||
},
|
||||
};
|
||||
use tracing::debug;
|
||||
|
||||
/// Visit every object in the given GCS bucket, optionally filtered by prefix.
|
||||
///
|
||||
/// Authentication is attempted via Application Default Credentials. When that
|
||||
/// fails and no explicit service account path was provided, the client falls
|
||||
/// back to anonymous access so public buckets can still be scanned.
|
||||
pub async fn visit_bucket_objects<F>(
|
||||
bucket: &str,
|
||||
prefix: Option<&str>,
|
||||
service_account_path: Option<&Path>,
|
||||
mut visitor: F,
|
||||
) -> Result<()>
|
||||
where
|
||||
F: FnMut(String, Vec<u8>) -> Result<()>,
|
||||
{
|
||||
let config_result = if let Some(path) = service_account_path {
|
||||
let credentials = CredentialsFile::new_from_file(path.to_string_lossy().into_owned())
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to read GCS service account credentials from {}", path.display())
|
||||
})?;
|
||||
|
||||
ClientConfig::default().with_credentials(credentials).await
|
||||
} else {
|
||||
ClientConfig::default().with_auth().await
|
||||
};
|
||||
|
||||
let config = match config_result {
|
||||
Ok(config) => config,
|
||||
Err(err) => {
|
||||
if service_account_path.is_some()
|
||||
|| std::env::var("GOOGLE_APPLICATION_CREDENTIALS").is_ok()
|
||||
|| std::env::var("GOOGLE_APPLICATION_CREDENTIALS_JSON").is_ok()
|
||||
{
|
||||
return Err(err)
|
||||
.context("Failed to authenticate with GCS using provided credentials");
|
||||
}
|
||||
debug!("Falling back to anonymous GCS access: {err}");
|
||||
ClientConfig::default().anonymous()
|
||||
}
|
||||
};
|
||||
|
||||
let client = Client::new(config);
|
||||
let mut page_token: Option<String> = None;
|
||||
|
||||
loop {
|
||||
let request = ListObjectsRequest {
|
||||
bucket: bucket.to_string(),
|
||||
prefix: prefix.map(|p| p.to_string()),
|
||||
page_token: page_token.clone(),
|
||||
..ListObjectsRequest::default()
|
||||
};
|
||||
|
||||
let mut response: ListObjectsResponse = client
|
||||
.list_objects(&request)
|
||||
.await
|
||||
.with_context(|| format!("Failed to list objects in bucket {bucket}"))?;
|
||||
|
||||
if let Some(items) = response.items.take() {
|
||||
for object in items.into_iter().filter(|o| !o.name.is_empty()) {
|
||||
let data = client
|
||||
.download_object(
|
||||
&GetObjectRequest {
|
||||
bucket: bucket.to_string(),
|
||||
object: object.name.clone(),
|
||||
..GetObjectRequest::default()
|
||||
},
|
||||
&Range::default(),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to fetch object {}", object.name))?;
|
||||
|
||||
visitor(object.name, data)?;
|
||||
}
|
||||
}
|
||||
|
||||
match response.next_page_token {
|
||||
Some(token) if !token.is_empty() => page_token = Some(token),
|
||||
_ => break,
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -40,6 +40,15 @@ const AZURE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_azcreds() {
|
|||
fi
|
||||
}; _azcreds"#;
|
||||
|
||||
const HUGGINGFACE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_hfcreds() {
|
||||
token="$KF_HUGGINGFACE_TOKEN";
|
||||
if [ -n "$token" ]; then
|
||||
user="${KF_HUGGINGFACE_USERNAME:-hf_user}";
|
||||
echo username="$user";
|
||||
echo password="$token";
|
||||
fi
|
||||
}; _hfcreds"#;
|
||||
|
||||
/// Represents errors that can occur when interacting with the `git` CLI.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum GitError {
|
||||
|
|
@ -47,14 +56,37 @@ pub enum GitError {
|
|||
IOError(#[from] std::io::Error),
|
||||
|
||||
#[error(
|
||||
"git execution failed\ncode={}\nstdout=```\n{}```\nstderr=```\n{}```",
|
||||
.status,
|
||||
String::from_utf8_lossy(.stdout),
|
||||
String::from_utf8_lossy(.stderr)
|
||||
"git execution failed (status: {status}){summary}",
|
||||
status = format_exit_status(.status),
|
||||
summary = format_git_error_summary(.stdout.as_slice(), .stderr.as_slice())
|
||||
)]
|
||||
GitError { stdout: Vec<u8>, stderr: Vec<u8>, status: ExitStatus },
|
||||
}
|
||||
|
||||
/// Renders an exit status for inclusion in an error message.
///
/// Uses the bare numeric exit code when the process reported one; otherwise
/// falls back to the status' own `Display` text.
fn format_exit_status(status: &ExitStatus) -> String {
    match status.code() {
        Some(code) => code.to_string(),
        None => status.to_string(),
    }
}
|
||||
|
||||
fn format_git_error_summary(stdout: &[u8], stderr: &[u8]) -> String {
|
||||
let mut messages = Vec::new();
|
||||
if let Some(line) = summarize_output(stderr) {
|
||||
messages.push(line);
|
||||
}
|
||||
if let Some(line) = summarize_output(stdout) {
|
||||
messages.push(line);
|
||||
}
|
||||
if messages.is_empty() {
|
||||
String::new()
|
||||
} else {
|
||||
format!(": {}", messages.join(" | "))
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the first non-blank line of `output`, trimmed of surrounding
/// whitespace, or `None` when every line is blank.
///
/// The bytes are decoded lossily, so invalid UTF-8 never causes a failure.
fn summarize_output(output: &[u8]) -> Option<String> {
    let decoded = String::from_utf8_lossy(output);
    for candidate in decoded.lines() {
        let candidate = candidate.trim();
        if !candidate.is_empty() {
            return Some(candidate.to_owned());
        }
    }
    None
}
|
||||
|
||||
/// A helper struct for running `git` commands.
|
||||
///
|
||||
/// It supports optional GitHub, GitLab, Gitea, and Bitbucket credentials passed via
|
||||
|
|
@ -91,6 +123,8 @@ impl Git {
|
|||
let has_azure_token = ["KF_AZURE_TOKEN", "KF_AZURE_PAT"]
|
||||
.iter()
|
||||
.any(|key| matches!(std::env::var(key), Ok(value) if !value.is_empty()));
|
||||
let has_huggingface_token =
|
||||
matches!(std::env::var("KF_HUGGINGFACE_TOKEN"), Ok(value) if !value.is_empty());
|
||||
|
||||
// If credentials are provided via environment variables, clear existing helpers first.
|
||||
if has_github_token
|
||||
|
|
@ -98,6 +132,7 @@ impl Git {
|
|||
|| has_gitea_token
|
||||
|| has_bitbucket_credentials
|
||||
|| has_azure_token
|
||||
|| has_huggingface_token
|
||||
{
|
||||
credentials.push("-c".into());
|
||||
credentials.push(r#"credential.helper="#.into());
|
||||
|
|
@ -136,6 +171,11 @@ impl Git {
|
|||
credentials.push(AZURE_CREDENTIAL_HELPER.into());
|
||||
}
|
||||
|
||||
if has_huggingface_token {
|
||||
credentials.push("-c".into());
|
||||
credentials.push(HUGGINGFACE_CREDENTIAL_HELPER.into());
|
||||
}
|
||||
|
||||
Self { credentials, ignore_certs }
|
||||
}
|
||||
|
||||
|
|
|
|||
636
src/huggingface.rs
Normal file
636
src/huggingface.rs
Normal file
|
|
@ -0,0 +1,636 @@
|
|||
use std::{collections::HashSet, env, time::Duration};
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use reqwest::{header::LINK, StatusCode, Url};
|
||||
use serde::Deserialize;
|
||||
use serde_json::Value;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT};
|
||||
|
||||
/// Hugging Face repositories requested for enumeration.
///
/// `user` and `organization` name owners, `model` / `dataset` / `space` name
/// individual repositories, and `exclude` lists entries to drop.
#[derive(Debug, Clone, Default)]
pub struct RepoSpecifiers {
    pub user: Vec<String>,
    pub organization: Vec<String>,
    pub model: Vec<String>,
    pub dataset: Vec<String>,
    pub space: Vec<String>,
    pub exclude: Vec<String>,
}

impl RepoSpecifiers {
    /// Returns `true` when nothing has been selected.
    ///
    /// Only the five selector lists are inspected; `exclude` is ignored
    /// because an exclusion by itself selects nothing.
    pub fn is_empty(&self) -> bool {
        let selectors = [&self.user, &self.organization, &self.model, &self.dataset, &self.space];
        selectors.iter().all(|values| values.is_empty())
    }
}
|
||||
|
||||
/// Optional Hugging Face access token used to authenticate API requests.
#[derive(Clone, Default)]
pub struct AuthConfig {
    token: Option<String>,
}

/// Hand-written so the token is never printed in full: only its first four
/// characters, followed by an ellipsis, appear in debug output.
impl std::fmt::Debug for AuthConfig {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let preview: Option<String> = self.token.as_deref().map(|secret| {
            let shown: String = secret.chars().take(4).collect();
            format!("{}…", shown)
        });
        f.debug_struct("AuthConfig").field("token", &preview).finish()
    }
}
|
||||
|
||||
impl AuthConfig {
|
||||
pub fn from_env() -> Self {
|
||||
let token = env::var("KF_HUGGINGFACE_TOKEN").ok().filter(|t| !t.trim().is_empty());
|
||||
Self { token }
|
||||
}
|
||||
|
||||
fn apply(&self, request: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
|
||||
if let Some(token) = &self.token {
|
||||
request.bearer_auth(token)
|
||||
} else {
|
||||
request
|
||||
}
|
||||
}
|
||||
|
||||
fn has_token(&self) -> bool {
|
||||
self.token.is_some()
|
||||
}
|
||||
}
|
||||
|
||||
/// The three repository flavours hosted on the Hugging Face Hub.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
enum ResourceKind {
    Model,
    Dataset,
    Space,
}

impl ResourceKind {
    /// Path segment naming this kind in API URLs.
    fn api_path(self) -> &'static str {
        match self {
            Self::Model => "models",
            Self::Dataset => "datasets",
            Self::Space => "spaces",
        }
    }

    /// HTTPS clone URL for `slug`. Models live at the site root; datasets
    /// and Spaces sit under their own path prefix.
    fn git_url(self, slug: &str) -> String {
        match self {
            Self::Model => format!("https://huggingface.co/{slug}.git"),
            Self::Dataset => format!("https://huggingface.co/datasets/{slug}.git"),
            Self::Space => format!("https://huggingface.co/spaces/{slug}.git"),
        }
    }

    /// Lower-case tag used as the `kind:` part of canonical keys.
    fn canonical_prefix(self) -> &'static str {
        match self {
            Self::Model => "model",
            Self::Dataset => "dataset",
            Self::Space => "space",
        }
    }

    /// Singular noun for messages.
    fn display_name_singular(self) -> &'static str {
        match self {
            Self::Model => "model",
            Self::Dataset => "dataset",
            Self::Space => "space",
        }
    }

    /// Plural noun for messages.
    fn display_name_plural(self) -> &'static str {
        match self {
            Self::Model => "models",
            Self::Dataset => "datasets",
            Self::Space => "spaces",
        }
    }
}
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
|
||||
struct ResourceRef {
|
||||
kind: ResourceKind,
|
||||
slug: String,
|
||||
}
|
||||
|
||||
impl ResourceRef {
|
||||
fn new(kind: ResourceKind, slug: String) -> Self {
|
||||
Self { kind, slug }
|
||||
}
|
||||
|
||||
fn canonical_key(&self) -> String {
|
||||
format!("{}:{}", self.kind.canonical_prefix(), self.slug.to_lowercase())
|
||||
}
|
||||
|
||||
fn git_url(&self) -> String {
|
||||
self.kind.git_url(&self.slug)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum HuggingFaceItem {
|
||||
Id {
|
||||
id: String,
|
||||
},
|
||||
ModelId {
|
||||
#[serde(rename = "modelId")]
|
||||
model_id: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl HuggingFaceItem {
|
||||
fn into_identifier(self) -> String {
|
||||
match self {
|
||||
HuggingFaceItem::Id { id } => id,
|
||||
HuggingFaceItem::ModelId { model_id } => model_id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct ExcludeSet {
|
||||
typed: HashSet<String>,
|
||||
untyped: HashSet<String>,
|
||||
}
|
||||
|
||||
impl ExcludeSet {
|
||||
fn from_list(values: &[String]) -> Self {
|
||||
let mut typed = HashSet::new();
|
||||
let mut untyped = HashSet::new();
|
||||
for raw in values {
|
||||
let trimmed = raw.trim();
|
||||
if trimmed.is_empty() {
|
||||
continue;
|
||||
}
|
||||
if let Some((prefix, rest)) = trimmed.split_once(':') {
|
||||
match normalize_kind(prefix) {
|
||||
Some(kind) => {
|
||||
if let Some(slug) = parse_slug_for_kind(kind, rest) {
|
||||
typed.insert(format!(
|
||||
"{}:{}",
|
||||
kind.canonical_prefix(),
|
||||
slug.to_lowercase()
|
||||
));
|
||||
} else {
|
||||
warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
|
||||
}
|
||||
}
|
||||
None => warn!("Ignoring invalid Hugging Face exclusion '{raw}' (unknown type)"),
|
||||
}
|
||||
} else if let Some(slug) = normalize_untyped_slug(trimmed) {
|
||||
untyped.insert(slug);
|
||||
} else {
|
||||
warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
|
||||
}
|
||||
}
|
||||
Self { typed, untyped }
|
||||
}
|
||||
|
||||
fn should_exclude(&self, kind: ResourceKind, slug: &str) -> bool {
|
||||
let typed_key = format!("{}:{}", kind.canonical_prefix(), slug.to_lowercase());
|
||||
if self.typed.contains(&typed_key) {
|
||||
return true;
|
||||
}
|
||||
self.untyped.contains(&slug.to_lowercase())
|
||||
}
|
||||
}
|
||||
|
||||
fn normalize_kind(raw: &str) -> Option<ResourceKind> {
|
||||
match raw.trim().to_ascii_lowercase().as_str() {
|
||||
"model" | "models" => Some(ResourceKind::Model),
|
||||
"dataset" | "datasets" => Some(ResourceKind::Dataset),
|
||||
"space" | "spaces" => Some(ResourceKind::Space),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn normalize_untyped_slug(raw: &str) -> Option<String> {
|
||||
let trimmed = raw.trim();
|
||||
if trimmed.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let segments: Vec<&str> = trimmed.split('/').filter(|segment| !segment.is_empty()).collect();
|
||||
normalize_untyped_segments(&segments)
|
||||
}
|
||||
|
||||
/// Builds a lower-cased `owner/name` slug from path segments.
///
/// A leading kind segment (`model(s)`, `dataset(s)`, `space(s)`, any ASCII
/// case) is dropped if present. The next segment is the owner; all remaining
/// segments are joined with `/` to form the name, with trailing `.git`
/// removed.
///
/// Returns `None` when fewer than two segments remain after the optional
/// kind segment, or when the owner or the name is empty after trimming.
fn normalize_untyped_segments(segments: &[&str]) -> Option<String> {
    const KIND_SEGMENTS: [&str; 6] = ["models", "model", "datasets", "dataset", "spaces", "space"];

    // Borrow a sub-slice instead of copying the segments and shifting them.
    let parts = match segments.split_first() {
        Some((first, rest))
            if KIND_SEGMENTS.iter().any(|kind| first.trim().eq_ignore_ascii_case(kind)) =>
        {
            rest
        }
        _ => segments,
    };

    let (owner, name_parts) = parts.split_first()?;
    if name_parts.is_empty() {
        return None;
    }

    let owner = owner.trim();
    let joined = name_parts.join("/");
    let name = joined.trim_end_matches(".git").trim();
    if owner.is_empty() || name.is_empty() {
        return None;
    }

    Some(format!("{owner}/{name}").to_lowercase())
}
|
||||
|
||||
fn parse_slug_for_kind(kind: ResourceKind, raw: &str) -> Option<String> {
|
||||
let trimmed = raw.trim();
|
||||
if trimmed.is_empty() {
|
||||
return None;
|
||||
}
|
||||
if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
|
||||
let url = Url::parse(trimmed).ok()?;
|
||||
let segments: Vec<&str> = url
|
||||
.path_segments()
|
||||
.map(|segments| segments.filter(|s| !s.is_empty()).collect())
|
||||
.unwrap_or_default();
|
||||
return parse_slug_segments(kind, &segments);
|
||||
}
|
||||
let segments: Vec<&str> = trimmed.split('/').filter(|s| !s.is_empty()).collect();
|
||||
parse_slug_segments(kind, &segments)
|
||||
}
|
||||
|
||||
fn parse_slug_segments(kind: ResourceKind, segments: &[&str]) -> Option<String> {
|
||||
if segments.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let mut parts: Vec<&str> = segments.to_vec();
|
||||
if let Some(first) = parts.first() {
|
||||
let lowered = first.trim().to_ascii_lowercase();
|
||||
let should_trim = match kind {
|
||||
ResourceKind::Model => matches!(lowered.as_str(), "models" | "model"),
|
||||
ResourceKind::Dataset => matches!(lowered.as_str(), "datasets" | "dataset"),
|
||||
ResourceKind::Space => matches!(lowered.as_str(), "spaces" | "space"),
|
||||
};
|
||||
if should_trim {
|
||||
parts.remove(0);
|
||||
}
|
||||
}
|
||||
if parts.len() < 2 {
|
||||
return None;
|
||||
}
|
||||
let owner = parts[0].trim();
|
||||
let binding = parts[1..].join("/");
|
||||
let name = binding.trim_end_matches(".git").trim();
|
||||
|
||||
if owner.is_empty() || name.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some(format!("{owner}/{name}"))
|
||||
}
|
||||
|
||||
fn parse_next_link(value: &str) -> Option<Url> {
|
||||
value.split(',').find_map(|part| {
|
||||
let part = part.trim();
|
||||
let (url_part, params) = part.split_once('>')?;
|
||||
if params.contains("rel=\"next\"") {
|
||||
let url = url_part.trim_start_matches('<').trim();
|
||||
Url::parse(url).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Maximum number of characters of a response body or JSON item that is
/// echoed into an error or warning message before it is cut off.
const BODY_SNIPPET_LIMIT: usize = 200;
|
||||
|
||||
async fn fetch_paginated(
|
||||
client: &reqwest::Client,
|
||||
mut current_url: Url,
|
||||
auth: &AuthConfig,
|
||||
context: &str,
|
||||
) -> Result<Vec<HuggingFaceItem>> {
|
||||
let mut items = Vec::new();
|
||||
loop {
|
||||
let mut request =
|
||||
client.get(current_url.clone()).header("User-Agent", GLOBAL_USER_AGENT.as_str());
|
||||
request = auth.apply(request);
|
||||
let response = request.send().await?;
|
||||
let status = response.status();
|
||||
let link_header = response
|
||||
.headers()
|
||||
.get(LINK)
|
||||
.and_then(|value| value.to_str().ok())
|
||||
.map(|value| value.to_string());
|
||||
if !status.is_success() {
|
||||
let body = response.text().await.unwrap_or_default();
|
||||
let mut message = format!(
|
||||
"Hugging Face API request failed while enumerating {context} ({status}): {body}"
|
||||
);
|
||||
if matches!(status, StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN)
|
||||
&& !auth.has_token()
|
||||
{
|
||||
message.push_str(
|
||||
"\nProvide a Hugging Face access token via the KF_HUGGINGFACE_TOKEN environment variable.",
|
||||
);
|
||||
}
|
||||
return Err(anyhow!(message));
|
||||
}
|
||||
let body = response.bytes().await?;
|
||||
let value: Value = serde_json::from_slice(&body).map_err(|err| {
|
||||
let snippet = body_snippet(&body);
|
||||
anyhow!(
|
||||
"Failed to parse Hugging Face response while enumerating {context}: {err}. Body snippet: {snippet}",
|
||||
context = context,
|
||||
err = err,
|
||||
snippet = snippet
|
||||
)
|
||||
})?;
|
||||
|
||||
let array = value.as_array().ok_or_else(|| {
|
||||
let snippet = body_snippet(&body);
|
||||
anyhow!(
|
||||
"Unexpected Hugging Face response format while enumerating {context} (expected array). Body snippet: {snippet}",
|
||||
context = context,
|
||||
snippet = snippet
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut page = Vec::new();
|
||||
for (index, element) in array.iter().enumerate() {
|
||||
match serde_json::from_value::<HuggingFaceItem>(element.clone()) {
|
||||
Ok(item) => page.push(item),
|
||||
Err(err) => {
|
||||
let snippet = value_snippet(element);
|
||||
warn!(
|
||||
"Skipping Hugging Face item at index {index} while enumerating {context}: {err}. Item snippet: {snippet}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
items.append(&mut page);
|
||||
if let Some(link_value) = link_header {
|
||||
if let Some(next_url) = parse_next_link(&link_value) {
|
||||
current_url = next_url;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
Ok(items)
|
||||
}
|
||||
|
||||
fn body_snippet(body: &[u8]) -> String {
|
||||
truncate_for_display(&String::from_utf8_lossy(body), BODY_SNIPPET_LIMIT)
|
||||
}
|
||||
|
||||
fn value_snippet(value: &Value) -> String {
|
||||
let text = value.to_string();
|
||||
truncate_for_display(&text, BODY_SNIPPET_LIMIT)
|
||||
}
|
||||
|
||||
/// Returns at most `limit` characters of `text`, followed by `…` when
/// anything was cut off.
///
/// The limit counts Unicode scalar values, not bytes, so the cut never
/// falls inside a multi-byte character.
fn truncate_for_display(text: &str, limit: usize) -> String {
    // `nth(limit)` is the first character beyond the limit; it exists only
    // when the text is longer than `limit`. Its byte offset is the cut point,
    // so scanning stops there instead of walking the whole text.
    match text.char_indices().nth(limit) {
        Some((cut, _)) => {
            let mut snippet = String::with_capacity(cut + '…'.len_utf8());
            snippet.push_str(&text[..cut]);
            snippet.push('…');
            snippet
        }
        None => text.to_owned(),
    }
}
|
||||
|
||||
async fn fetch_resources_for_owner(
|
||||
client: &reqwest::Client,
|
||||
base_url: &Url,
|
||||
owner: &str,
|
||||
label: &str,
|
||||
auth: &AuthConfig,
|
||||
progress: Option<&ProgressBar>,
|
||||
) -> Result<Vec<ResourceRef>> {
|
||||
let mut resources = Vec::new();
|
||||
for kind in [ResourceKind::Model, ResourceKind::Dataset, ResourceKind::Space] {
|
||||
if let Some(pb) = progress {
|
||||
pb.set_message(format!(
|
||||
"Enumerating Hugging Face {label} {}",
|
||||
kind.display_name_plural()
|
||||
));
|
||||
}
|
||||
let mut url = base_url.join(kind.api_path())?;
|
||||
{
|
||||
let mut pairs = url.query_pairs_mut();
|
||||
pairs.append_pair("author", owner);
|
||||
pairs.append_pair("limit", "100");
|
||||
}
|
||||
let context = format!("{} for {label}", kind.display_name_plural());
|
||||
match fetch_paginated(client, url, auth, &context).await {
|
||||
Ok(items) => {
|
||||
for item in items {
|
||||
let identifier = item.into_identifier();
|
||||
if let Some(slug) = parse_slug_for_kind(kind, &identifier) {
|
||||
resources.push(ResourceRef::new(kind, slug));
|
||||
} else {
|
||||
warn!(
|
||||
"Skipping Hugging Face {} with unexpected identifier '{}'",
|
||||
kind.display_name_singular(),
|
||||
identifier
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
warn!(
|
||||
"Failed to enumerate Hugging Face {} for {label}: {err}",
|
||||
kind.display_name_plural()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(resources)
|
||||
}
|
||||
|
||||
fn append_explicit_resources(specifiers: &RepoSpecifiers, resources: &mut Vec<ResourceRef>) {
|
||||
for model in &specifiers.model {
|
||||
if let Some(slug) = parse_slug_for_kind(ResourceKind::Model, model) {
|
||||
resources.push(ResourceRef::new(ResourceKind::Model, slug));
|
||||
} else {
|
||||
warn!("Ignoring invalid Hugging Face model identifier '{model}'");
|
||||
}
|
||||
}
|
||||
for dataset in &specifiers.dataset {
|
||||
if let Some(slug) = parse_slug_for_kind(ResourceKind::Dataset, dataset) {
|
||||
resources.push(ResourceRef::new(ResourceKind::Dataset, slug));
|
||||
} else {
|
||||
warn!("Ignoring invalid Hugging Face dataset identifier '{dataset}'");
|
||||
}
|
||||
}
|
||||
for space in &specifiers.space {
|
||||
if let Some(slug) = parse_slug_for_kind(ResourceKind::Space, space) {
|
||||
resources.push(ResourceRef::new(ResourceKind::Space, slug));
|
||||
} else {
|
||||
warn!("Ignoring invalid Hugging Face space identifier '{space}'");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn enumerate_repo_urls(
|
||||
specifiers: &RepoSpecifiers,
|
||||
auth: &AuthConfig,
|
||||
ignore_certs: bool,
|
||||
progress: Option<&mut ProgressBar>,
|
||||
) -> Result<Vec<String>> {
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.danger_accept_invalid_certs(ignore_certs)
|
||||
.build()?;
|
||||
let base_url = Url::parse("https://huggingface.co/api/")?;
|
||||
let excludes = ExcludeSet::from_list(&specifiers.exclude);
|
||||
let mut collected = Vec::new();
|
||||
|
||||
for user in &specifiers.user {
|
||||
let label = format!("user {user}");
|
||||
if let Some(pb) = progress.as_ref() {
|
||||
pb.set_message(format!("Enumerating Hugging Face {label}"));
|
||||
}
|
||||
match fetch_resources_for_owner(
|
||||
&client,
|
||||
&base_url,
|
||||
user,
|
||||
&label,
|
||||
auth,
|
||||
progress.as_ref().map(|pb| &**pb),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(mut resources) => collected.append(&mut resources),
|
||||
Err(err) => warn!("Failed to enumerate Hugging Face user {user}: {err}"),
|
||||
}
|
||||
}
|
||||
|
||||
for org in &specifiers.organization {
|
||||
let label = format!("organization {org}");
|
||||
if let Some(pb) = progress.as_ref() {
|
||||
pb.set_message(format!("Enumerating Hugging Face {label}"));
|
||||
}
|
||||
match fetch_resources_for_owner(
|
||||
&client,
|
||||
&base_url,
|
||||
org,
|
||||
&label,
|
||||
auth,
|
||||
progress.as_ref().map(|pb| &**pb),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(mut resources) => collected.append(&mut resources),
|
||||
Err(err) => warn!("Failed to enumerate Hugging Face organization {org}: {err}"),
|
||||
}
|
||||
}
|
||||
|
||||
append_explicit_resources(specifiers, &mut collected);
|
||||
|
||||
let mut seen = HashSet::new();
|
||||
let mut urls = Vec::new();
|
||||
for resource in collected {
|
||||
if excludes.should_exclude(resource.kind, &resource.slug) {
|
||||
debug!(
|
||||
"Skipping Hugging Face {} {} due to exclusion",
|
||||
resource.kind.display_name_singular(),
|
||||
resource.slug
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let key = resource.canonical_key();
|
||||
if seen.insert(key) {
|
||||
urls.push(resource.git_url());
|
||||
}
|
||||
}
|
||||
urls.sort();
|
||||
urls.dedup();
|
||||
Ok(urls)
|
||||
}
|
||||
|
||||
pub async fn list_repositories(
|
||||
specifiers: &RepoSpecifiers,
|
||||
auth: &AuthConfig,
|
||||
ignore_certs: bool,
|
||||
progress_enabled: bool,
|
||||
) -> Result<()> {
|
||||
let mut progress = if progress_enabled {
|
||||
let style = ProgressStyle::with_template("{spinner} {msg} [{elapsed_precise}]")
|
||||
.expect("progress bar style template should compile");
|
||||
let pb = ProgressBar::new_spinner()
|
||||
.with_style(style)
|
||||
.with_message("Enumerating Hugging Face repositories");
|
||||
pb.enable_steady_tick(Duration::from_millis(500));
|
||||
pb
|
||||
} else {
|
||||
ProgressBar::hidden()
|
||||
};
|
||||
|
||||
let urls = enumerate_repo_urls(specifiers, auth, ignore_certs, Some(&mut progress)).await?;
|
||||
for url in urls {
|
||||
println!("{url}");
|
||||
}
|
||||
progress.finish_and_clear();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the wiki repository URL for `_repo_url`.
///
/// This implementation always returns `None`; the argument is unused.
pub fn wiki_url(_repo_url: &GitUrl) -> Option<GitUrl> {
    None
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A bare `owner/name` is accepted unchanged for models.
    #[test]
    fn parse_model_slug_from_plain() {
        let slug = parse_slug_for_kind(ResourceKind::Model, "user/model");
        assert_eq!(slug, Some("user/model".to_string()));
    }

    /// The `datasets/` prefix and the `.git` suffix are both stripped.
    #[test]
    fn parse_dataset_slug_with_prefix() {
        let slug = parse_slug_for_kind(ResourceKind::Dataset, "datasets/user/data.git");
        assert_eq!(slug, Some("user/data".to_string()));
    }

    /// A full web URL is reduced to its `owner/name` path.
    #[test]
    fn parse_space_slug_from_url() {
        let slug =
            parse_slug_for_kind(ResourceKind::Space, "https://huggingface.co/spaces/user/demo");
        assert_eq!(slug, Some("user/demo".to_string()));
    }

    /// Typed and untyped exclusion entries both match; unrelated slugs do not.
    #[test]
    fn exclude_set_matches_typed_and_untyped() {
        let entries = ["model:user/model".to_string(), "datasets/user/data".to_string()];
        let excludes = ExcludeSet::from_list(&entries);

        assert!(excludes.should_exclude(ResourceKind::Model, "user/model"));
        assert!(excludes.should_exclude(ResourceKind::Dataset, "user/data"));
        assert!(!excludes.should_exclude(ResourceKind::Space, "user/space"));
    }

    /// The `rel="next"` target is extracted from a `Link` header.
    #[test]
    fn parse_link_header() {
        let header = "<https://huggingface.co/api/models?cursor=abc>; rel=\"next\"";
        let url = parse_next_link(header).expect("next link");
        assert_eq!(url.as_str(), "https://huggingface.co/api/models?cursor=abc");
    }

    /// Over-long text gets an ellipsis; short text is returned as is.
    #[test]
    fn truncate_for_display_adds_ellipsis() {
        assert_eq!(truncate_for_display("abcdef", 3), "abc…");
        assert_eq!(truncate_for_display("abc", 5), "abc");
    }
}
|
||||
|
|
@ -13,6 +13,7 @@ pub mod defaults;
|
|||
pub mod entropy;
|
||||
pub mod finding_data;
|
||||
pub mod findings_store;
|
||||
pub mod gcs;
|
||||
pub mod git_binary;
|
||||
pub mod git_commit_metadata;
|
||||
pub mod git_metadata_graph;
|
||||
|
|
@ -21,6 +22,7 @@ pub mod git_url;
|
|||
pub mod gitea;
|
||||
pub mod github;
|
||||
pub mod gitlab;
|
||||
pub mod huggingface;
|
||||
pub mod inline_ignore;
|
||||
pub mod jira;
|
||||
pub mod liquid_filters;
|
||||
|
|
|
|||
38
src/main.rs
38
src/main.rs
|
|
@ -52,7 +52,7 @@ use kingfisher::{
|
|||
},
|
||||
findings_store,
|
||||
findings_store::FindingsStore,
|
||||
gitea, github,
|
||||
gitea, github, huggingface,
|
||||
rule_loader::RuleLoader,
|
||||
rules_database::RulesDatabase,
|
||||
scanner::{load_and_record_rules, run_scan},
|
||||
|
|
@ -75,6 +75,7 @@ use crate::cli::commands::{
|
|||
bitbucket::{BitbucketAuthArgs, BitbucketCommand, BitbucketRepoType, BitbucketReposCommand},
|
||||
gitea::{GiteaCommand, GiteaRepoType, GiteaReposCommand},
|
||||
gitlab::{GitLabCommand, GitLabRepoType, GitLabReposCommand},
|
||||
huggingface::{HuggingFaceCommand, HuggingFaceReposCommand},
|
||||
};
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
|
|
@ -93,6 +94,7 @@ fn main() -> anyhow::Result<()> {
|
|||
Command::Bitbucket(_) => num_cpus::get(), // Default for Bitbucket commands
|
||||
Command::Gitea(_) => num_cpus::get(), // Default for Gitea commands
|
||||
Command::Azure(_) => num_cpus::get(), // Default for Azure commands
|
||||
Command::HuggingFace(_) => num_cpus::get(), // Default for Hugging Face commands
|
||||
Command::Rules(_) => num_cpus::get(), // Default for Rules commands
|
||||
};
|
||||
|
||||
|
|
@ -327,6 +329,28 @@ async fn async_main(args: CommandLineArgs) -> Result<()> {
|
|||
}
|
||||
},
|
||||
},
|
||||
Command::HuggingFace(hf_args) => match hf_args.command {
|
||||
HuggingFaceCommand::Repos(repos_command) => match repos_command {
|
||||
HuggingFaceReposCommand::List(list_args) => {
|
||||
let specifiers = huggingface::RepoSpecifiers {
|
||||
user: list_args.repo_specifiers.user.clone(),
|
||||
organization: list_args.repo_specifiers.organization.clone(),
|
||||
model: list_args.repo_specifiers.model.clone(),
|
||||
dataset: list_args.repo_specifiers.dataset.clone(),
|
||||
space: list_args.repo_specifiers.space.clone(),
|
||||
exclude: list_args.repo_specifiers.exclude.clone(),
|
||||
};
|
||||
let auth = huggingface::AuthConfig::from_env();
|
||||
huggingface::list_repositories(
|
||||
&specifiers,
|
||||
&auth,
|
||||
global_args.ignore_certs,
|
||||
global_args.use_progress(),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
},
|
||||
},
|
||||
Command::SelfUpdate => {
|
||||
anyhow::bail!("SelfUpdate command should not reach this branch")
|
||||
}
|
||||
|
|
@ -367,6 +391,13 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
|
|||
gitlab_repo_type: GitLabRepoType::All,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -400,6 +431,9 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
// Slack query
|
||||
slack_query: None,
|
||||
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
|
||||
|
|
@ -436,6 +470,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
|
||||
no_base64: false,
|
||||
no_inline_ignore: false,
|
||||
|
|
|
|||
197
src/reporter.rs
197
src/reporter.rs
|
|
@ -703,7 +703,7 @@ mod tests {
|
|||
git_commit_metadata::CommitMetadata,
|
||||
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
|
||||
matcher::{SerializableCapture, SerializableCaptures},
|
||||
origin::OriginSet,
|
||||
origin::{Origin, OriginSet},
|
||||
rules::rule::{Confidence, Rule, RuleSyntax},
|
||||
};
|
||||
use gix::{date::Time, ObjectId};
|
||||
|
|
@ -711,79 +711,8 @@ mod tests {
|
|||
use std::path::PathBuf;
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[test]
|
||||
fn build_finding_record_uses_git_blob_path() {
|
||||
let temp = tempdir().unwrap();
|
||||
let datastore =
|
||||
Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
|
||||
let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
|
||||
|
||||
let repo_path = Arc::new(PathBuf::from("/tmp/repo"));
|
||||
let commit_metadata = Arc::new(CommitMetadata {
|
||||
commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
|
||||
committer_name: "Alice".into(),
|
||||
committer_email: "alice@example.com".into(),
|
||||
committer_timestamp: Time::new(0, 0),
|
||||
});
|
||||
let blob_path = "path/in/history.txt".to_string();
|
||||
let origin = OriginSet::new(
|
||||
Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()),
|
||||
vec![],
|
||||
);
|
||||
|
||||
let rule = Arc::new(Rule::new(RuleSyntax {
|
||||
name: "Test Rule".into(),
|
||||
id: "test.rule".into(),
|
||||
pattern: ".*".into(),
|
||||
min_entropy: 0.0,
|
||||
confidence: Confidence::Medium,
|
||||
visible: true,
|
||||
examples: vec![],
|
||||
negative_examples: vec![],
|
||||
references: vec![],
|
||||
validation: None,
|
||||
depends_on_rule: vec![],
|
||||
}));
|
||||
|
||||
let blob_id = BlobId::new(b"blob-data");
|
||||
let report_match = ReportMatch {
|
||||
origin,
|
||||
blob_metadata: BlobMetadata {
|
||||
id: blob_id,
|
||||
num_bytes: 42,
|
||||
mime_essence: None,
|
||||
language: Some("Unknown".into()),
|
||||
},
|
||||
m: Match {
|
||||
location: Location {
|
||||
offset_span: OffsetSpan { start: 0, end: 10 },
|
||||
source_span: SourceSpan {
|
||||
start: SourcePoint { line: 19, column: 0 },
|
||||
end: SourcePoint { line: 19, column: 10 },
|
||||
},
|
||||
},
|
||||
groups: SerializableCaptures {
|
||||
captures: SmallVec::<[SerializableCapture; 2]>::new(),
|
||||
},
|
||||
blob_id,
|
||||
finding_fingerprint: 123,
|
||||
rule: Arc::clone(&rule),
|
||||
validation_response_body: "Bad credentials".into(),
|
||||
validation_response_status: 401,
|
||||
validation_success: false,
|
||||
calculated_entropy: 5.29,
|
||||
visible: true,
|
||||
is_base64: false,
|
||||
},
|
||||
comment: None,
|
||||
match_confidence: Confidence::Medium,
|
||||
visible: true,
|
||||
validation_response_body: "Bad credentials".into(),
|
||||
validation_response_status: 401,
|
||||
validation_success: false,
|
||||
};
|
||||
|
||||
let scan_args = ScanArgs {
|
||||
fn sample_scan_args() -> ScanArgs {
|
||||
ScanArgs {
|
||||
num_jobs: 1,
|
||||
rules: RuleSpecifierArgs::default(),
|
||||
input_specifier_args: InputSpecifierArgs {
|
||||
|
|
@ -802,6 +731,12 @@ mod tests {
|
|||
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
|
||||
gitlab_repo_type: GitLabRepoType::All,
|
||||
gitlab_include_subgroups: false,
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -833,6 +768,9 @@ mod tests {
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
docker_image: Vec::new(),
|
||||
git_clone: GitCloneMode::Bare,
|
||||
git_history: GitHistoryMode::Full,
|
||||
|
|
@ -864,9 +802,98 @@ mod tests {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_inline_ignore: false,
|
||||
}
|
||||
}
|
||||
|
||||
fn sample_report_match(
|
||||
validation_body: &str,
|
||||
validation_status: u16,
|
||||
validation_success: bool,
|
||||
) -> (ReportMatch, String) {
|
||||
let repo_path = Arc::new(PathBuf::from("/tmp/repo"));
|
||||
let commit_metadata = Arc::new(CommitMetadata {
|
||||
commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
|
||||
committer_name: "Alice".into(),
|
||||
committer_email: "alice@example.com".into(),
|
||||
committer_timestamp: Time::new(0, 0),
|
||||
});
|
||||
let blob_path = "path/in/history.txt".to_string();
|
||||
let origin = OriginSet::new(
|
||||
Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()),
|
||||
vec![],
|
||||
);
|
||||
|
||||
let rule = Arc::new(Rule::new(RuleSyntax {
|
||||
name: "Test Rule".into(),
|
||||
id: "test.rule".into(),
|
||||
pattern: ".*".into(),
|
||||
min_entropy: 0.0,
|
||||
confidence: Confidence::Medium,
|
||||
visible: true,
|
||||
examples: vec![],
|
||||
negative_examples: vec![],
|
||||
references: vec![],
|
||||
validation: None,
|
||||
depends_on_rule: vec![],
|
||||
}));
|
||||
|
||||
let blob_id = BlobId::new(b"blob-data");
|
||||
let validation_body_owned = validation_body.to_string();
|
||||
let report_match = ReportMatch {
|
||||
origin,
|
||||
blob_metadata: BlobMetadata {
|
||||
id: blob_id,
|
||||
num_bytes: 42,
|
||||
mime_essence: None,
|
||||
language: Some("Unknown".into()),
|
||||
},
|
||||
m: Match {
|
||||
location: Location {
|
||||
offset_span: OffsetSpan { start: 0, end: 10 },
|
||||
source_span: SourceSpan {
|
||||
start: SourcePoint { line: 19, column: 0 },
|
||||
end: SourcePoint { line: 19, column: 10 },
|
||||
},
|
||||
},
|
||||
groups: SerializableCaptures {
|
||||
captures: SmallVec::<[SerializableCapture; 2]>::new(),
|
||||
},
|
||||
blob_id,
|
||||
finding_fingerprint: 123,
|
||||
rule: Arc::clone(&rule),
|
||||
validation_response_body: validation_body_owned.clone(),
|
||||
validation_response_status: validation_status,
|
||||
validation_success,
|
||||
calculated_entropy: 5.29,
|
||||
visible: true,
|
||||
is_base64: false,
|
||||
},
|
||||
comment: None,
|
||||
match_confidence: Confidence::Medium,
|
||||
visible: true,
|
||||
validation_response_body: validation_body_owned,
|
||||
validation_response_status: validation_status,
|
||||
validation_success,
|
||||
};
|
||||
|
||||
(report_match, blob_path)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_finding_record_uses_git_blob_path() {
|
||||
let temp = tempdir().unwrap();
|
||||
let datastore =
|
||||
Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
|
||||
let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
|
||||
|
||||
let (report_match, blob_path) =
|
||||
sample_report_match("Bad credentials", StatusCode::UNAUTHORIZED.as_u16(), false);
|
||||
|
||||
let scan_args = sample_scan_args();
|
||||
|
||||
let record = reporter.build_finding_record(&report_match, &scan_args);
|
||||
assert_eq!(record.finding.path, blob_path);
|
||||
let git_file_path = record
|
||||
|
|
@ -880,6 +907,28 @@ mod tests {
|
|||
assert_eq!(git_file_path, "path/in/history.txt");
|
||||
}
|
||||
|
||||
/// A match whose validation was skipped via the skip list is reported as
/// "Not Attempted" and keeps the skip reason as its response text.
#[test]
fn skip_list_matches_surface_skip_reason() {
    const SKIP_REASON: &str =
        "(skip list entry) AWS validation not attempted for account 111122223333.";

    let temp = tempdir().unwrap();
    let store = findings_store::FindingsStore::new(temp.path().to_path_buf());
    let reporter = DetailsReporter {
        datastore: Arc::new(Mutex::new(store)),
        styles: Styles::new(false),
        only_valid: false,
    };

    let (report_match, _) =
        sample_report_match(SKIP_REASON, StatusCode::CONTINUE.as_u16(), false);
    let scan_args = sample_scan_args();

    let record = reporter.build_finding_record(&report_match, &scan_args);
    let validation = &record.finding.validation;
    assert_eq!(validation.status, "Not Attempted");
    assert_eq!(validation.response, SKIP_REASON);
}
|
||||
|
||||
use super::build_git_urls;
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -93,6 +93,14 @@ mod tests {
|
|||
gitlab_repo_type: GitLabRepoType::All,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
// Hugging Face
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
// Gitea
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
|
|
@ -132,6 +140,9 @@ mod tests {
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
|
||||
docker_image: Vec::new(),
|
||||
// clone / history options
|
||||
|
|
@ -163,6 +174,8 @@ mod tests {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
no_inline_ignore: false,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -115,7 +115,7 @@ impl<'a> Display for PrettyFindingRecord<'a> {
|
|||
} else {
|
||||
writeln!(f, " |Validation....: {}", finding.validation.status)?;
|
||||
}
|
||||
if finding.validation.status != "Not Attempted" {
|
||||
if !finding.validation.response.is_empty() {
|
||||
writeln!(f, " |__Response....: {}", style_fn(&finding.validation.response))?;
|
||||
}
|
||||
writeln!(f, " |Language......: {}", finding.language)?;
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ pub(crate) use docker::save_docker_images;
|
|||
pub(crate) use enumerate::enumerate_filesystem_inputs;
|
||||
pub(crate) use repos::{
|
||||
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
|
||||
enumerate_github_repos,
|
||||
enumerate_github_repos, enumerate_huggingface_repos,
|
||||
};
|
||||
pub use runner::{load_and_record_rules, run_async_scan, run_scan};
|
||||
pub(crate) use validation::run_secret_validation;
|
||||
|
|
|
|||
|
|
@ -17,10 +17,10 @@ use crate::{
|
|||
commands::{github::GitCloneMode, github::GitHistoryMode, scan},
|
||||
global,
|
||||
},
|
||||
confluence, findings_store,
|
||||
confluence, findings_store, gcs,
|
||||
git_binary::{CloneMode, Git},
|
||||
git_url::GitUrl,
|
||||
gitea, github, gitlab, jira,
|
||||
gitea, github, gitlab, huggingface, jira,
|
||||
matcher::{Match, Matcher, MatcherStats},
|
||||
origin::{Origin, OriginSet},
|
||||
rules_database::RulesDatabase,
|
||||
|
|
@ -305,6 +305,69 @@ pub async fn enumerate_gitea_repos(
|
|||
Ok(repo_urls)
|
||||
}
|
||||
|
||||
pub async fn enumerate_huggingface_repos(
|
||||
args: &scan::ScanArgs,
|
||||
global_args: &global::GlobalArgs,
|
||||
) -> Result<Vec<GitUrl>> {
|
||||
let repo_specifiers = huggingface::RepoSpecifiers {
|
||||
user: args.input_specifier_args.huggingface_user.clone(),
|
||||
organization: args.input_specifier_args.huggingface_organization.clone(),
|
||||
model: args.input_specifier_args.huggingface_model.clone(),
|
||||
dataset: args.input_specifier_args.huggingface_dataset.clone(),
|
||||
space: args.input_specifier_args.huggingface_space.clone(),
|
||||
exclude: args.input_specifier_args.huggingface_exclude.clone(),
|
||||
};
|
||||
|
||||
let mut repo_urls = args.input_specifier_args.git_url.clone();
|
||||
if !repo_specifiers.is_empty() {
|
||||
let mut progress = if global_args.use_progress() {
|
||||
let style =
|
||||
ProgressStyle::with_template("{spinner} {msg} {human_len} [{elapsed_precise}]")
|
||||
.expect("progress bar style template should compile");
|
||||
let pb = ProgressBar::new_spinner()
|
||||
.with_style(style)
|
||||
.with_message("Enumerating Hugging Face repositories...");
|
||||
pb.enable_steady_tick(Duration::from_millis(500));
|
||||
pb
|
||||
} else {
|
||||
ProgressBar::hidden()
|
||||
};
|
||||
|
||||
let mut num_found: u64 = 0;
|
||||
let auth = huggingface::AuthConfig::from_env();
|
||||
let repo_strings = huggingface::enumerate_repo_urls(
|
||||
&repo_specifiers,
|
||||
&auth,
|
||||
global_args.ignore_certs,
|
||||
Some(&mut progress),
|
||||
)
|
||||
.await
|
||||
.context("Failed to enumerate Hugging Face repositories")?;
|
||||
|
||||
for repo_string in repo_strings {
|
||||
match GitUrl::from_str(&repo_string) {
|
||||
Ok(repo_url) => {
|
||||
repo_urls.push(repo_url);
|
||||
num_found += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
progress.suspend(|| {
|
||||
error!("Failed to parse repo URL from {repo_string}: {e}");
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
progress.finish_with_message(format!(
|
||||
"Found {} repositories from Hugging Face",
|
||||
HumanCount(num_found)
|
||||
));
|
||||
}
|
||||
repo_urls.sort();
|
||||
repo_urls.dedup();
|
||||
Ok(repo_urls)
|
||||
}
|
||||
|
||||
pub async fn enumerate_bitbucket_repos(
|
||||
args: &scan::ScanArgs,
|
||||
global_args: &global::GlobalArgs,
|
||||
|
|
@ -678,3 +741,81 @@ pub async fn fetch_s3_objects(
|
|||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn fetch_gcs_objects(
|
||||
args: &scan::ScanArgs,
|
||||
datastore: &Arc<Mutex<findings_store::FindingsStore>>,
|
||||
rules_db: &RulesDatabase,
|
||||
matcher_stats: &Mutex<MatcherStats>,
|
||||
enable_profiling: bool,
|
||||
shared_profiler: Arc<crate::rule_profiling::ConcurrentRuleProfiler>,
|
||||
progress_enabled: bool,
|
||||
) -> Result<()> {
|
||||
let Some(bucket) = args.input_specifier_args.gcs_bucket.as_deref() else {
|
||||
return Ok(());
|
||||
};
|
||||
let prefix = args.input_specifier_args.gcs_prefix.as_deref();
|
||||
let service_account = args.input_specifier_args.gcs_service_account.as_deref();
|
||||
|
||||
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
|
||||
let seen_blobs = BlobIdMap::new();
|
||||
let matcher = Matcher::new(
|
||||
rules_db,
|
||||
scanner_pool,
|
||||
&seen_blobs,
|
||||
Some(matcher_stats),
|
||||
enable_profiling,
|
||||
Some(shared_profiler.clone()),
|
||||
&args.extra_ignore_comments,
|
||||
args.no_inline_ignore,
|
||||
)?;
|
||||
let mut processor = BlobProcessor { matcher };
|
||||
|
||||
let progress = if progress_enabled {
|
||||
let style =
|
||||
ProgressStyle::with_template("{spinner} {msg} ({pos} objects) [{elapsed_precise}]")
|
||||
.expect("progress bar style template should compile");
|
||||
let pb = ProgressBar::new_spinner().with_style(style).with_message("Fetching GCS objects");
|
||||
pb.enable_steady_tick(Duration::from_millis(500));
|
||||
pb
|
||||
} else {
|
||||
ProgressBar::hidden()
|
||||
};
|
||||
|
||||
let pb = progress.clone();
|
||||
|
||||
let bucket_name = bucket.to_string();
|
||||
|
||||
gcs::visit_bucket_objects(bucket, prefix, service_account, move |key, bytes| {
|
||||
let origin = OriginSet::new(
|
||||
Origin::from_extended(serde_json::json!({
|
||||
"path": format!("gs://{}/{}", bucket_name, key)
|
||||
})),
|
||||
Vec::new(),
|
||||
);
|
||||
let blob = crate::blob::Blob::from_bytes(bytes);
|
||||
|
||||
if let Some((origin, blob_md, scored_matches)) =
|
||||
processor.run(origin, blob, args.no_dedup, args.redact, args.no_base64)?
|
||||
{
|
||||
let origin_arc = Arc::new(origin);
|
||||
let blob_arc = Arc::new(blob_md);
|
||||
|
||||
let mut batch = Vec::with_capacity(scored_matches.len());
|
||||
for (_score, m) in scored_matches {
|
||||
batch.push((origin_arc.clone(), blob_arc.clone(), m));
|
||||
}
|
||||
|
||||
let added = datastore.lock().unwrap().record(batch, !args.no_dedup);
|
||||
debug!("Added {} new GCS blobs", added);
|
||||
}
|
||||
pb.inc(1);
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
|
||||
let total = progress.position();
|
||||
progress.finish_with_message(format!("Fetched {} GCS objects", total));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,7 @@
|
|||
use std::sync::{Arc, Mutex};
|
||||
use std::{
|
||||
fs,
|
||||
sync::{Arc, Mutex},
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use crossbeam_skiplist::SkipMap;
|
||||
|
|
@ -21,10 +24,11 @@ use crate::{
|
|||
safe_list,
|
||||
scanner::{
|
||||
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
|
||||
enumerate_filesystem_inputs, enumerate_github_repos,
|
||||
enumerate_filesystem_inputs, enumerate_github_repos, enumerate_huggingface_repos,
|
||||
repos::{
|
||||
enumerate_gitea_repos, enumerate_gitlab_repos, fetch_confluence_pages,
|
||||
fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects, fetch_slack_messages,
|
||||
fetch_gcs_objects, fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects,
|
||||
fetch_slack_messages,
|
||||
},
|
||||
run_secret_validation, save_docker_images,
|
||||
summary::print_scan_summary,
|
||||
|
|
@ -74,12 +78,14 @@ pub async fn run_async_scan(
|
|||
let mut repo_urls = enumerate_github_repos(args, global_args).await?;
|
||||
let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
|
||||
let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
|
||||
let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
|
||||
let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
|
||||
let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
|
||||
|
||||
// Combine repository URLs
|
||||
repo_urls.extend(gitlab_repo_urls);
|
||||
repo_urls.extend(gitea_repo_urls);
|
||||
repo_urls.extend(huggingface_repo_urls);
|
||||
repo_urls.extend(bitbucket_repo_urls);
|
||||
repo_urls.extend(azure_repo_urls);
|
||||
repo_urls.sort();
|
||||
|
|
@ -182,8 +188,20 @@ pub async fn run_async_scan(
|
|||
)
|
||||
.await?;
|
||||
|
||||
let has_s3 = args.input_specifier_args.s3_bucket.is_some();
|
||||
if input_roots.is_empty() && !has_s3 {
|
||||
fetch_gcs_objects(
|
||||
args,
|
||||
&datastore,
|
||||
rules_db,
|
||||
&matcher_stats,
|
||||
enable_profiling,
|
||||
Arc::clone(&shared_profiler),
|
||||
progress_enabled,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let has_remote_objects = args.input_specifier_args.s3_bucket.is_some()
|
||||
|| args.input_specifier_args.gcs_bucket.is_some();
|
||||
if input_roots.is_empty() && !has_remote_objects {
|
||||
bail!("No inputs to scan");
|
||||
}
|
||||
|
||||
|
|
@ -231,6 +249,26 @@ pub async fn run_async_scan(
|
|||
crate::baseline::apply_baseline(&mut ds, &path, args.manage_baseline, &input_roots)?;
|
||||
}
|
||||
|
||||
let mut skip_aws_accounts = args.skip_aws_account.clone();
|
||||
|
||||
if let Some(path) = args.skip_aws_account_file.as_ref() {
|
||||
let contents = fs::read_to_string(path).with_context(|| {
|
||||
format!("Failed to read --skip-aws-account-file {}", path.display())
|
||||
})?;
|
||||
|
||||
for line in contents.lines() {
|
||||
let content = line.split('#').next().unwrap_or("");
|
||||
for value in content.split(|c: char| c.is_ascii_whitespace() || c == ',' || c == ';') {
|
||||
let trimmed = value.trim();
|
||||
if !trimmed.is_empty() {
|
||||
skip_aws_accounts.push(trimmed.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
|
||||
|
||||
// If validation is enabled, run it as a second phase
|
||||
if !args.no_validate {
|
||||
info!("Starting secret validation phase...");
|
||||
|
|
|
|||
|
|
@ -95,6 +95,14 @@ pub fn init_validation_caches() {
|
|||
aws::set_aws_validation_concurrency(15);
|
||||
}
|
||||
|
||||
pub fn set_skip_aws_account_ids<I, S>(ids: I)
|
||||
where
|
||||
I: IntoIterator<Item = S>,
|
||||
S: Into<String>,
|
||||
{
|
||||
aws::set_aws_skip_account_ids(ids);
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct CachedResponse {
|
||||
pub body: String,
|
||||
|
|
@ -781,6 +789,26 @@ async fn timed_validate_single_match<'a>(
|
|||
}
|
||||
}
|
||||
|
||||
if let Some(account_id) = aws::should_skip_aws_validation(&akid) {
|
||||
m.validation_success = false;
|
||||
m.validation_response_body = format!(
|
||||
"(skip list entry) AWS validation not attempted for account {}.",
|
||||
account_id
|
||||
);
|
||||
m.validation_response_status = StatusCode::CONTINUE;
|
||||
cache.insert(
|
||||
cache_key,
|
||||
CachedResponse {
|
||||
body: m.validation_response_body.clone(),
|
||||
status: m.validation_response_status,
|
||||
is_valid: m.validation_success,
|
||||
timestamp: Instant::now(),
|
||||
},
|
||||
);
|
||||
commit_and_return(m);
|
||||
return;
|
||||
}
|
||||
|
||||
if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
|
||||
m.validation_success = false;
|
||||
m.validation_response_body = format!("Invalid AWS credentials ({}): {}", akid, e);
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
use std::time::Duration;
|
||||
use std::{collections::HashSet, sync::RwLock, time::Duration};
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use aws_config::{retry::RetryConfig, BehaviorVersion};
|
||||
use aws_config::{retry::RetryConfig, BehaviorVersion, SdkConfig};
|
||||
use aws_credential_types::Credentials;
|
||||
use aws_sdk_sts::{
|
||||
config::Builder as StsConfigBuilder, error::SdkError,
|
||||
|
|
@ -26,8 +26,9 @@ use http::{
|
|||
header::{HeaderValue, USER_AGENT},
|
||||
StatusCode,
|
||||
};
|
||||
use once_cell::sync::OnceCell;
|
||||
use once_cell::sync::{Lazy, OnceCell};
|
||||
use rand::{rng, Rng};
|
||||
use regex::Regex;
|
||||
use tokio::{
|
||||
sync::Semaphore,
|
||||
time::{sleep, timeout},
|
||||
|
|
@ -36,6 +37,57 @@ use tokio::{
|
|||
use crate::validation::GLOBAL_USER_AGENT;
|
||||
|
||||
static AWS_VALIDATION_SEMAPHORE: OnceCell<Semaphore> = OnceCell::new();
|
||||
const BUILTIN_SKIP_ACCOUNT_IDS: &[&str] = &[
|
||||
"052310077262",
|
||||
"171436882533",
|
||||
"528757803018",
|
||||
"534261010715",
|
||||
"595918472158",
|
||||
"729780141977",
|
||||
"893192397702",
|
||||
"992382622183",
|
||||
];
|
||||
|
||||
static AWS_SKIP_ACCOUNT_IDS: Lazy<RwLock<HashSet<String>>> = Lazy::new(|| {
|
||||
let mut set = HashSet::new();
|
||||
set.extend(BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string()));
|
||||
RwLock::new(set)
|
||||
});
|
||||
|
||||
fn build_http_client() -> SharedHttpClient {
|
||||
HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
|
||||
let mut conn_builder = ConnectorBuilder::default()
|
||||
.tls_provider(tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc));
|
||||
|
||||
conn_builder.set_connector_settings(settings.cloned());
|
||||
if let Some(components) = runtime_components {
|
||||
conn_builder.set_sleep_impl(components.sleep_impl());
|
||||
}
|
||||
conn_builder.set_proxy_config(Some(ProxyConfig::from_env()));
|
||||
conn_builder.build()
|
||||
})
|
||||
}
|
||||
|
||||
async fn build_base_config(credentials: Credentials) -> SdkConfig {
|
||||
let retry_config = RetryConfig::adaptive().with_max_attempts(3);
|
||||
aws_config::defaults(BehaviorVersion::latest())
|
||||
.region(Region::new("us-east-1"))
|
||||
.credentials_provider(credentials)
|
||||
.http_client(build_http_client())
|
||||
.retry_config(retry_config)
|
||||
.load()
|
||||
.await
|
||||
}
|
||||
|
||||
/// Extracts an AWS account ID from `input`.
///
/// An account ID is a run of exactly twelve ASCII digits that is not part of a longer
/// digit run. The input may be the bare ID (optionally surrounded by whitespace) or a
/// longer string that embeds it, such as an ARN. The first qualifying run is returned.
///
/// Returns `None` when `input` contains no such run. In particular, digit runs longer
/// than twelve characters are rejected rather than truncated, and non-ASCII digits are
/// never treated as part of an ID.
fn extract_account_id(input: &str) -> Option<String> {
    const ACCOUNT_ID_LEN: usize = 12;

    // Splitting on every non-digit yields the maximal digit runs, so a run of exactly
    // twelve characters cannot be a prefix of some longer number.
    input
        .split(|c: char| !c.is_ascii_digit())
        .find(|run| run.len() == ACCOUNT_ID_LEN)
        .map(str::to_owned)
}
|
||||
|
||||
/// Set the maximum number of concurrent AWS validations. Call before first use.
|
||||
pub fn set_aws_validation_concurrency(max: usize) {
|
||||
|
|
@ -46,6 +98,46 @@ fn aws_validation_semaphore() -> &'static Semaphore {
|
|||
AWS_VALIDATION_SEMAPHORE.get_or_init(|| Semaphore::new(15))
|
||||
}
|
||||
|
||||
pub fn set_aws_skip_account_ids<I, S>(ids: I)
|
||||
where
|
||||
I: IntoIterator<Item = S>,
|
||||
S: Into<String>,
|
||||
{
|
||||
let mut guard = match AWS_SKIP_ACCOUNT_IDS.write() {
|
||||
Ok(g) => g,
|
||||
Err(poisoned) => poisoned.into_inner(),
|
||||
};
|
||||
guard.clear();
|
||||
|
||||
guard.extend(BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string()));
|
||||
|
||||
for raw in ids.into_iter() {
|
||||
let value = raw.into();
|
||||
if value.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
if let Some(normalized) = extract_account_id(&value) {
|
||||
guard.insert(normalized);
|
||||
} else {
|
||||
tracing::warn!("Ignoring invalid AWS account ID in skip list: {value}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn should_skip_aws_validation(access_key_id: &str) -> Option<String> {
|
||||
let guard = AWS_SKIP_ACCOUNT_IDS.read().ok()?;
|
||||
if guard.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let account = aws_key_to_account_number(access_key_id).ok()?;
|
||||
if guard.contains(&account) {
|
||||
Some(account)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct UaInterceptor;
|
||||
|
||||
|
|
@ -132,29 +224,7 @@ pub async fn validate_aws_credentials(
|
|||
None, // expiry
|
||||
"static", // provider name
|
||||
);
|
||||
// Create HTTP client that respects proxy settings from the environment
|
||||
let http_client: SharedHttpClient =
|
||||
HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
|
||||
let mut conn_builder = ConnectorBuilder::default()
|
||||
.tls_provider(tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc));
|
||||
|
||||
conn_builder.set_connector_settings(settings.cloned());
|
||||
if let Some(components) = runtime_components {
|
||||
conn_builder.set_sleep_impl(components.sleep_impl());
|
||||
}
|
||||
conn_builder.set_proxy_config(Some(ProxyConfig::from_env()));
|
||||
conn_builder.build()
|
||||
});
|
||||
|
||||
// Create AWS config with adaptive retries
|
||||
let retry_config = RetryConfig::adaptive().with_max_attempts(3);
|
||||
let config = aws_config::defaults(BehaviorVersion::latest())
|
||||
.region(Region::new("us-east-1"))
|
||||
.credentials_provider(credentials)
|
||||
.http_client(http_client)
|
||||
.retry_config(retry_config)
|
||||
.load()
|
||||
.await;
|
||||
let config = build_base_config(credentials).await;
|
||||
|
||||
// Create STS client
|
||||
let sts_config = StsConfigBuilder::from(&config).interceptor(UaInterceptor).build();
|
||||
|
|
@ -230,3 +300,74 @@ pub fn aws_key_to_account_number(aws_key_id: &str) -> Result<String, Box<dyn std
|
|||
// Return the account number formatted as a 12-digit string
|
||||
Ok(format!("{:012}", account_num))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use once_cell::sync::Lazy;
    use std::sync::{Mutex, MutexGuard};

    /// Serializes the tests below, which all mutate the process-wide skip list.
    static TEST_GUARD: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));

    /// Acquires `TEST_GUARD`, recovering it if an earlier test panicked while holding it.
    ///
    /// Without the recovery a single failing test would poison the mutex and make every
    /// later test fail with an unrelated `PoisonError`.
    fn lock_skip_list_tests() -> MutexGuard<'static, ()> {
        TEST_GUARD.lock().unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Restores the skip list to the built-in entries only.
    fn reset_skip_list() {
        set_aws_skip_account_ids(Vec::<String>::new());
    }

    #[test]
    fn skip_account_list_normalizes_inputs() {
        let _lock = lock_skip_list_tests();

        set_aws_skip_account_ids([
            " 052310077262 ",
            "arn:aws:iam::171436882533:role/demo",
            "invalid",
        ]);

        let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
        assert!(guard.contains("052310077262"));
        assert!(guard.contains("171436882533"));
        // Both valid inputs are already built in, so the set must not have grown.
        assert_eq!(guard.len(), BUILTIN_SKIP_ACCOUNT_IDS.len());
        drop(guard);

        reset_skip_list();
    }

    #[test]
    fn should_skip_when_account_matches() {
        let _lock = lock_skip_list_tests();

        set_aws_skip_account_ids(["534261010715"]);
        assert_eq!(
            should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
            Some("534261010715".to_string())
        );

        reset_skip_list();
    }

    #[test]
    fn builtin_canary_accounts_are_preseeded() {
        let _lock = lock_skip_list_tests();

        reset_skip_list();
        assert_eq!(
            should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
            Some("534261010715".to_string())
        );

        reset_skip_list();
    }

    #[test]
    fn duplicate_accounts_are_deduplicated() {
        let _lock = lock_skip_list_tests();

        set_aws_skip_account_ids([
            "534261010715",
            "arn:aws:iam::534261010715:user/canarytokens",
            " 534261010715 ",
        ]);

        let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
        assert_eq!(guard.iter().filter(|id| id.as_str() == "534261010715").count(), 1);
        drop(guard);

        reset_skip_list();
    }
}
|
||||
|
|
|
|||
|
|
@ -33,31 +33,77 @@ pub fn find_closest_variable(
|
|||
target_variable_name: &str,
|
||||
search_variable_name: &str,
|
||||
) -> Option<String> {
|
||||
// Find positions of the target variable with the target value
|
||||
// Collect the positions of the target variable for the provided value so we can
|
||||
// compare relative offsets with candidate variables.
|
||||
let mut target_positions = Vec::new();
|
||||
for (name, value, start, end) in captures {
|
||||
if name == target_variable_name && value == target_value {
|
||||
target_positions.push((*start, *end));
|
||||
}
|
||||
}
|
||||
|
||||
if target_positions.is_empty() {
|
||||
return None;
|
||||
}
|
||||
// For each target position, find the closest search variable
|
||||
let mut closest_distance = usize::MAX;
|
||||
let mut closest_value: Option<String> = None;
|
||||
for (_target_start, target_end) in target_positions {
|
||||
for (name, value, start, _) in captures {
|
||||
if name == search_variable_name {
|
||||
let distance = (*start as isize - target_end as isize).abs() as usize;
|
||||
if distance < closest_distance {
|
||||
closest_distance = distance;
|
||||
closest_value = Some(value.clone());
|
||||
|
||||
// Prefer candidates that appear before the target value (same logical block), but
|
||||
// fall back to overlapping values and then to those that appear after the target
|
||||
// value when no better match exists. This avoids pairing with the next block when
|
||||
// multiple credentials are close together in the same file.
|
||||
let mut best_before: Option<(usize, String)> = None;
|
||||
let mut best_overlap: Option<(usize, String)> = None;
|
||||
let mut best_after: Option<(usize, String)> = None;
|
||||
|
||||
for (target_start, target_end) in target_positions.iter().copied() {
|
||||
for (name, value, start, end) in captures {
|
||||
if name != search_variable_name {
|
||||
continue;
|
||||
}
|
||||
|
||||
if *end <= target_start {
|
||||
// Candidate is before the target; choose the one closest to the target start.
|
||||
let distance = target_start - *end;
|
||||
match &mut best_before {
|
||||
Some((best_distance, best_value)) if distance < *best_distance => {
|
||||
*best_distance = distance;
|
||||
*best_value = value.clone();
|
||||
}
|
||||
None => {
|
||||
best_before = Some((distance, value.clone()));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
} else if *start >= target_end {
|
||||
// Candidate is after the target; choose the one closest to the target end.
|
||||
let distance = *start - target_end;
|
||||
match &mut best_after {
|
||||
Some((best_distance, best_value)) if distance < *best_distance => {
|
||||
*best_distance = distance;
|
||||
*best_value = value.clone();
|
||||
}
|
||||
None => {
|
||||
best_after = Some((distance, value.clone()));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
} else {
|
||||
// Candidate overlaps the target – treat as an exact match.
|
||||
let distance = 0usize;
|
||||
match &mut best_overlap {
|
||||
Some((best_distance, best_value)) if distance < *best_distance => {
|
||||
*best_distance = distance;
|
||||
*best_value = value.clone();
|
||||
}
|
||||
None => {
|
||||
best_overlap = Some((distance, value.clone()));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
closest_value
|
||||
|
||||
best_before.or(best_overlap).or(best_after).map(|(_, value)| value)
|
||||
}
|
||||
|
||||
pub async fn check_url_resolvable(url: &Url) -> Result<(), Box<dyn std::error::Error>> {
|
||||
|
|
@ -147,4 +193,31 @@ mod tests {
|
|||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn prefers_closest_preceding_variable() {
    // One AKID capture lies before the TOKEN capture and one lies after it; the
    // preceding one is the expected pairing.
    let target_value = "secret".to_string();
    let captures = vec![
        ("TOKEN".to_string(), target_value.clone(), 75usize, 115usize),
        ("AKID".to_string(), "preceding".to_string(), 30usize, 50usize),
        ("AKID".to_string(), "following".to_string(), 180usize, 200usize),
    ];

    let result = find_closest_variable(&captures, &target_value, "TOKEN", "AKID");

    assert_eq!(result.as_deref(), Some("preceding"));
}
|
||||
|
||||
#[test]
fn falls_back_to_following_when_no_preceding() {
    // The only AKID capture starts after the TOKEN capture ends, so it must be chosen.
    let target_value = "secret".to_string();
    let captures = vec![
        ("TOKEN".to_string(), target_value.clone(), 10usize, 50usize),
        ("AKID".to_string(), "after".to_string(), 60usize, 80usize),
    ];

    let result = find_closest_variable(&captures, &target_value, "TOKEN", "AKID");

    assert_eq!(result.as_deref(), Some("after"));
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -72,6 +72,12 @@ fn run_skiplist(skip_regex: Vec<String>, skip_skipword: Vec<String>) -> Result<u
|
|||
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
|
||||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -103,6 +109,9 @@ fn run_skiplist(skip_regex: Vec<String>, skip_skipword: Vec<String>) -> Result<u
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
docker_image: Vec::new(),
|
||||
git_clone: GitCloneMode::Bare,
|
||||
git_history: GitHistoryMode::Full,
|
||||
|
|
@ -133,6 +142,8 @@ fn run_skiplist(skip_regex: Vec<String>, skip_skipword: Vec<String>) -> Result<u
|
|||
manage_baseline: false,
|
||||
skip_regex: skip_regex,
|
||||
skip_word: skip_skipword,
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
no_inline_ignore: false,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -68,6 +68,13 @@ fn test_bitbucket_remote_scan() -> Result<()> {
|
|||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -102,6 +109,9 @@ fn test_bitbucket_remote_scan() -> Result<()> {
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
docker_image: Vec::new(),
|
||||
git_clone: GitCloneMode::Bare,
|
||||
git_history: GitHistoryMode::Full,
|
||||
|
|
@ -131,6 +141,8 @@ fn test_bitbucket_remote_scan() -> Result<()> {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
extra_ignore_comments: Vec::new(),
|
||||
no_inline_ignore: false,
|
||||
|
|
|
|||
|
|
@ -85,6 +85,13 @@ rules:
|
|||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -120,6 +127,9 @@ rules:
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
// Docker image scanning
|
||||
docker_image: Vec::new(),
|
||||
// git clone / history options
|
||||
|
|
@ -151,6 +161,8 @@ rules:
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
extra_ignore_comments: Vec::new(),
|
||||
no_inline_ignore: false,
|
||||
|
|
|
|||
|
|
@ -72,6 +72,13 @@ fn test_github_remote_scan() -> Result<()> {
|
|||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -107,6 +114,9 @@ fn test_github_remote_scan() -> Result<()> {
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
// Docker image scanning
|
||||
docker_image: Vec::new(),
|
||||
// git clone / history options
|
||||
|
|
@ -138,6 +148,8 @@ fn test_github_remote_scan() -> Result<()> {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
extra_ignore_comments: Vec::new(),
|
||||
no_inline_ignore: false,
|
||||
|
|
|
|||
|
|
@ -71,6 +71,13 @@ fn test_gitlab_remote_scan() -> Result<()> {
|
|||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -106,6 +113,9 @@ fn test_gitlab_remote_scan() -> Result<()> {
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
// Docker image scanning
|
||||
docker_image: Vec::new(),
|
||||
git_clone: GitCloneMode::Bare,
|
||||
|
|
@ -137,6 +147,8 @@ fn test_gitlab_remote_scan() -> Result<()> {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
no_inline_ignore: false,
|
||||
};
|
||||
|
|
@ -210,6 +222,13 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
|
|||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -252,6 +271,9 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
|
|||
scan_nested_repos: true,
|
||||
since_commit: None,
|
||||
branch: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
},
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
max_file_size_mb: 25.0,
|
||||
|
|
@ -273,6 +295,8 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
extra_ignore_comments: Vec::new(),
|
||||
no_inline_ignore: false,
|
||||
|
|
|
|||
|
|
@ -55,6 +55,12 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
|
|||
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
|
||||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -86,6 +92,9 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
docker_image: Vec::new(),
|
||||
git_clone: GitCloneMode::Bare,
|
||||
git_history: GitHistoryMode::Full,
|
||||
|
|
@ -115,6 +124,8 @@ async fn test_redact_hashes_finding_values() -> Result<()> {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
extra_ignore_comments: Vec::new(),
|
||||
no_inline_ignore: false,
|
||||
|
|
|
|||
|
|
@ -62,6 +62,13 @@ impl TestContext {
|
|||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -92,6 +99,9 @@ impl TestContext {
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
max_results: 10,
|
||||
docker_image: Vec::new(),
|
||||
git_clone: GitCloneMode::Bare,
|
||||
|
|
@ -123,6 +133,8 @@ impl TestContext {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
no_inline_ignore: false,
|
||||
};
|
||||
|
|
@ -186,6 +198,13 @@ async fn test_scan_slack_messages() -> Result<()> {
|
|||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -218,6 +237,9 @@ async fn test_scan_slack_messages() -> Result<()> {
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
docker_image: Vec::new(),
|
||||
git_clone: GitCloneMode::Bare,
|
||||
git_history: GitHistoryMode::Full,
|
||||
|
|
@ -247,6 +269,8 @@ async fn test_scan_slack_messages() -> Result<()> {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
extra_ignore_comments: Vec::new(),
|
||||
no_inline_ignore: false,
|
||||
|
|
|
|||
|
|
@ -128,6 +128,13 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
|
|||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -163,6 +170,9 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
// Docker image scanning
|
||||
docker_image: Vec::new(),
|
||||
// git clone / history options
|
||||
|
|
@ -194,6 +204,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
extra_ignore_comments: Vec::new(),
|
||||
no_inline_ignore: false,
|
||||
|
|
|
|||
|
|
@ -71,6 +71,13 @@ impl TestContext {
|
|||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -106,6 +113,9 @@ impl TestContext {
|
|||
s3_prefix: None,
|
||||
role_arn: None,
|
||||
aws_local_profile: None,
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
// Docker image scanning
|
||||
docker_image: Vec::new(),
|
||||
// git clone / history options
|
||||
|
|
@ -137,6 +147,8 @@ impl TestContext {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
extra_ignore_comments: Vec::new(),
|
||||
no_inline_ignore: false,
|
||||
|
|
@ -183,6 +195,13 @@ impl TestContext {
|
|||
gitlab_repo_type: GitLabRepoType::Owner,
|
||||
gitlab_include_subgroups: false,
|
||||
|
||||
huggingface_user: Vec::new(),
|
||||
huggingface_organization: Vec::new(),
|
||||
huggingface_model: Vec::new(),
|
||||
huggingface_dataset: Vec::new(),
|
||||
huggingface_space: Vec::new(),
|
||||
huggingface_exclude: Vec::new(),
|
||||
|
||||
gitea_user: Vec::new(),
|
||||
gitea_organization: Vec::new(),
|
||||
gitea_exclude: Vec::new(),
|
||||
|
|
@ -228,6 +247,10 @@ impl TestContext {
|
|||
scan_nested_repos: true,
|
||||
since_commit: None,
|
||||
branch: None,
|
||||
|
||||
gcs_bucket: None,
|
||||
gcs_prefix: None,
|
||||
gcs_service_account: None,
|
||||
},
|
||||
extra_ignore_comments: Vec::new(),
|
||||
content_filtering_args: ContentFilteringArgs {
|
||||
|
|
@ -250,6 +273,8 @@ impl TestContext {
|
|||
manage_baseline: false,
|
||||
skip_regex: Vec::new(),
|
||||
skip_word: Vec::new(),
|
||||
skip_aws_account: Vec::new(),
|
||||
skip_aws_account_file: None,
|
||||
no_base64: false,
|
||||
no_inline_ignore: false,
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue