diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 27e1f98..beeec7f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,6 +5,9 @@ on: branches: - main +env: + RUST_TOOLCHAIN: "1.90" + # This workflow runs on pull requests to the main branch # It builds the project for 2 platforms, Linux arm64 and macOS arm64, # and runs tests for each platform. All platforms tested on merge to main @@ -16,7 +19,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions-rs/toolchain@v1 with: - toolchain: 1.88.0 + toolchain: ${{ env.RUST_TOOLCHAIN }} profile: minimal override: true - uses: swatinem/rust-cache@v2 @@ -34,7 +37,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions-rs/toolchain@v1 with: - toolchain: 1.88.0 + toolchain: ${{ env.RUST_TOOLCHAIN }} profile: minimal override: true - uses: swatinem/rust-cache@v2 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index abe38f9..4debe1d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -4,6 +4,8 @@ on: push: branches: - main +env: + RUST_TOOLCHAIN: "1.90" jobs: # ──────────────── Linux (via Makefile) ──────────────── @@ -15,7 +17,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: - toolchain: 1.88.0 + toolchain: ${{ env.RUST_TOOLCHAIN }} profile: minimal override: true @@ -69,7 +71,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: - toolchain: 1.88.0 + toolchain: ${{ env.RUST_TOOLCHAIN }} profile: minimal override: true @@ -123,7 +125,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: - toolchain: 1.88.0 + toolchain: ${{ env.RUST_TOOLCHAIN }} profile: minimal override: true @@ -154,7 +156,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: - toolchain: 1.88.0 + toolchain: ${{ env.RUST_TOOLCHAIN }} profile: minimal override: true @@ -185,7 +187,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions-rs/toolchain@v1 with: - toolchain: 1.88.0 + toolchain: ${{ env.RUST_TOOLCHAIN }} profile: minimal override: true diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 1be8150..0f7df0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ All notable changes to this project will be documented in this file. +## [v1.58.0] +- Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans. +- Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs. +- Added support for scanning Google Cloud Storage buckets via `--gcs-bucket`, including optional prefixes and service-account authentication. +- Added `--skip-aws-account` (now accepting comma-separated values) and `--skip-aws-account-file` to bypass live AWS validation for known canary/honey-token account IDs without triggering alerts. Kingfisher now ships with several canary AWS account IDs pre-seeded in the skip list and now reports matching findings as "Not Attempted" with the "Response" containing "(skip list entry)" so it's clear that validation was intentionally skipped and why. 
+ ## [v1.57.0] - Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling - Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported diff --git a/Cargo.toml b/Cargo.toml index 852194d..1eb11b5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace.package] edition = "2021" -rust-version = "1.88" +rust-version = "1.90" license = "Apache-2.0" authors = ["Mick Grove "] homepage = "https://github.com/mongodb/kingfisher" @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.57.0" +version = "1.58.0" description = "MongoDB's blazingly fast and accurate secret scanning and validation tool" edition.workspace = true rust-version.workspace = true @@ -122,7 +122,6 @@ reqwest-middleware = "0.4.2" tracing-subscriber = {version = "0.3.19", features = ["env-filter"] } tracing-core = "0.1.34" tree-sitter = "0.25.8" -aws-smithy-runtime = "1.9.1" aws-smithy-http-client = "1.1.1" aws-smithy-runtime-api = "1.9.0" aws-smithy-types = "1.3.2" @@ -192,6 +191,10 @@ walkdir = "2.5.0" p256 = "0.13.2" ed25519-dalek = { version = "2.2", features = ["pkcs8"] } aws-sdk-s3 = "1.100.0" +gcloud-storage = { version = "1.1.1", default-features = false, features = [ + "rustls-tls", + "auth", +] } tokei = "12.1.2" [target.'cfg(not(windows))'.dependencies] diff --git a/Makefile b/Makefile index 14a4c6a..0325d64 100644 --- a/Makefile +++ b/Makefile @@ -110,11 +110,11 @@ setup-zig: ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist @echo "Checking Rust toolchain…" @$(MAKE) check-rust || { \ - echo "🦀 Installing Rust 1.88.0 …"; \ + echo "🦀 Installing Rust 1.90.0 …"; \ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \ . 
$$HOME/.cargo/env; \ - rustup toolchain install 1.88.0; \ - rustup default 1.88.0; \ + rustup toolchain install 1.90.0; \ + rustup default 1.90.0; \ } @echo "📦 Installing build dependencies (musl, cmake, etc.)…" @@ -150,11 +150,11 @@ ubuntu-x64: setup-zig # ensures Zig & cargo-zigbuild exist ubuntu-arm64: setup-zig # ensures Zig & cargo-zigbuild exist @echo "Checking Rust toolchain…" @$(MAKE) check-rust || { \ - echo "🦀 Installing Rust 1.88.0 …"; \ + echo "🦀 Installing Rust 1.90.0 …"; \ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \ . $$HOME/.cargo/env; \ - rustup toolchain install 1.88.0; \ - rustup default 1.88.0; \ + rustup toolchain install 1.90.0; \ + rustup default 1.90.0; \ } @echo "📦 Installing build dependencies (musl, cmake, etc.)…" @@ -245,7 +245,7 @@ endif linux-x64: check-docker create-dockerignore @mkdir -p target/release docker run --platform linux/amd64 --rm \ - -v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\ + -v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\ apk add --no-cache \ musl-dev \ gcc g++ make cmake pkgconfig \ @@ -256,7 +256,7 @@ linux-x64: check-docker create-dockerignore patch perl ragel && \ git openssl-dev curl && \ \ - cargo test --workspace --all-targets --release ; \ + cargo test --workspace --all-targets ; \ \ rustup target add x86_64-unknown-linux-musl && \ \ @@ -274,7 +274,7 @@ linux-x64: check-docker create-dockerignore linux-arm64: check-docker create-dockerignore @mkdir -p target/release docker run --platform linux/arm64 --rm \ - -v "$$(pwd):/src" -w /src rust:1.88-alpine sh -eu -c '\ + -v "$$(pwd):/src" -w /src rust:1.90-alpine sh -eu -c '\ apk add --no-cache \ musl-dev \ gcc g++ make cmake pkgconfig \ @@ -287,7 +287,7 @@ linux-arm64: check-docker create-dockerignore \ rustup target add aarch64-unknown-linux-musl && \ \ - cargo test --workspace --all-targets --release ; \ + cargo test --workspace --all-targets ; \ \ export PKG_CONFIG_ALLOW_CROSS=1 ; \ export RUSTFLAGS="-C 
target-feature=+crt-static" ; \ @@ -385,7 +385,7 @@ check-rust: echo "Rust not found."; \ exit 1; \ fi; \ - required=1.88.0; \ + required=1.90.0; \ if [ $$(printf '%s\n' "$$required" "$$version" | sort -V | head -n1) != "$$required" ]; then \ echo "Rust version $$version is older than required $$required."; \ exit 1; \ diff --git a/README.md b/README.md index d77d01e..bd07bc7 100644 --- a/README.md +++ b/README.md @@ -5,22 +5,23 @@ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s hardware‑accelerated Hyperscan regex engine with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production +Kingfisher is a blazingly fast secret‑scanning and live validation tool built in Rust. It combines Intel’s SIMD accelerated regex engine (Hyperscan) with language‑aware source code parsing, and **ships with hundreds of built‑in rules** to detect, validate, and triage secrets before they ever reach production

-Originally forked from Praetorian’s Nosey Parker, Kingfisher **adds** live cloud-API validation; many more targets (GitLab, BitBucket, Gitea, S3, Docker, Jira, Confluence, Slack); compressed-file extraction and scanning; baseline and allowlist controls; language-aware detection (~20 languages); and a native Windows binary. See [Origins and Divergence](#origins-and-divergence) for details. +Originally forked from Praetorian’s Nosey Parker, Kingfisher has since significantly expanded and diverged, adding live validation, 10+ new scan targets, and major architectural enhancements. See [Origins and Divergence](#origins-and-divergence) for details. + ## Key Features ### Multiple Scan Targets
-| Files / Dirs | Local Git | GitHub | GitLab | Azure DevOps | Bitbucket | Gitea | -|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:| -| Files / Dirs
Files / Dirs | Local Git
Local Git | GitHub
GitHub | GitLab
GitLab | Azure DevOps
Azure DevOps | Bitbucket
Bitbucket | Gitea
Gitea | +| Files / Dirs | Local Git | GitHub | GitLab | Azure Repos | Bitbucket | Gitea | Hugging Face | +|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|:-------------:| +| Files / Dirs
Files / Dirs | Local Git
Local Git | GitHub
GitHub | GitLab
GitLab | Azure Repos
Azure Repos | Bitbucket
Bitbucket | Gitea
Gitea | Hugging Face
Hugging Face | -| Docker | Jira | Confluence | Slack | AWS S3 | -|:------:|:----:|:-----------:|:-----:|:------:| -| Docker
Docker | Jira
Jira | Confluence
Confluence | Slack
Slack | AWS S3
AWS S3 | +| Docker | Jira | Confluence | Slack | AWS S3 | Google Cloud | +|:------:|:----:|:-----------:|:-----:|:------:|:---:| +| Docker
Docker | Jira
Jira | Confluence
Confluence | Slack
Slack | AWS S3
AWS S3 | Google Cloud Storage
Cloud Storage |
@@ -41,17 +42,27 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) Kingfisher Runtime Comparison

+# Table of Contents + +
+ - [Kingfisher](#kingfisher) - [Key Features](#key-features) - [Multiple Scan Targets](#multiple-scan-targets) - [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules) - [Benchmark Results](#benchmark-results) +- [Table of Contents](#table-of-contents) - [Getting Started](#getting-started) - [Installation](#installation) - - [Run Kingfisher in Docker](#run-kingfisher-in-docker) + - [Pre-built Releases](#pre-built-releases) + - [Homebrew](#homebrew) + - [Linux and macOS](#linux-and-macos) + - [Windows](#windows) + - [Compile](#compile) + - [ Run Kingfisher in Docker](#-run-kingfisher-in-docker) - [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance) - - [Write Custom Rules!](#write-custom-rules) -- [Usage](#usage) + - [📝 Write Custom Rules!](#-write-custom-rules) +- [🎉 Usage](#-usage) - [Basic Examples](#basic-examples) - [Scan with secret validation](#scan-with-secret-validation) - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories) @@ -67,6 +78,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [Exclude specific paths](#exclude-specific-paths) - [Scan changes in CI pipelines](#scan-changes-in-ci-pipelines) - [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket) + - [ Scanning a Google Cloud Storage Bucket](#-scanning-a-google-cloud-storage-bucket) - [ Scanning Docker Images](#-scanning-docker-images) - [ Scanning GitHub](#-scanning-github) - [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token) @@ -79,8 +91,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url) - [List GitLab repositories](#list-gitlab-repositories) - [ Scanning Azure Repos](#-scanning-azure-repos) - - [Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or 
`KF_AZURE_PAT`)](#scan-azure-devops-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat) - - [Scan specific Azure DevOps projects](#scan-specific-azure-devops-projects) + - [Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-repos-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat) + - [Scan specific Azure Repos projects](#scan-specific-azure-repos-projects) - [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration) - [List Azure repositories](#list-azure-repositories) - [ Scanning Gitea](#-scanning-gitea) @@ -97,6 +109,12 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [List Bitbucket repositories](#list-bitbucket-repositories) - [Authenticate to Bitbucket](#authenticate-to-bitbucket) - [Self-hosted Bitbucket Server](#self-hosted-bitbucket-server) + - [ Scanning Hugging Face](#-scanning-hugging-face) + - [Scan Hugging Face user](#scan-hugging-face-user) + - [Scan Hugging Face organization](#scan-hugging-face-organization) + - [Scan specific Hugging Face resources](#scan-specific-hugging-face-resources) + - [List Hugging Face repositories](#list-hugging-face-repositories) + - [Authenticate to Hugging Face](#authenticate-to-hugging-face) - [ Scanning Jira](#-scanning-jira) - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query) - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues) @@ -107,7 +125,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [Environment Variables for Tokens](#environment-variables-for-tokens) - [Exit Codes](#exit-codes) - [Update Checks](#update-checks) -- [Advanced Options](#advanced-options) +- [🤓 Advanced Options](#-advanced-options) - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets) - [List Builtin Rules](#list-builtin-rules) - [To scan using **only** your own `my_rules.yaml` you could 
run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run) @@ -117,6 +135,8 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [Notable Scan Options](#notable-scan-options) - [Understanding `--confidence`](#understanding---confidence) - [Ignore known false positives](#ignore-known-false-positives) + - [Skip Canary Tokens (AWS)](#skip-canary-tokens-aws) + - [Common CLI flows](#common-cli-flows) - [Inline ignore directives](#inline-ignore-directives) - [Finding Fingerprint](#finding-fingerprint) - [Rule Performance Profiling](#rule-performance-profiling) @@ -125,36 +145,58 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [Roadmap](#roadmap) - [License](#license) +
+ + # Getting Started ## Installation +### Pre-built Releases +Pre-built binaries are available from the [Releases](https://github.com/mongodb/kingfisher/releases) section. + +### Homebrew -On macOS, you can simply ```bash brew install kingfisher ``` -Pre-built binaries are also available on the [Releases](https://github.com/mongodb/kingfisher/releases) section of this page. +### Linux and macOS -You can also install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform: +
+ +You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform. ```bash # Linux, macOS curl --silent --location \ https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \ sh && \ - ubi --project mongodb/kingfisher --in "$HOME/bin" + ubi --project mongodb/kingfisher --in "$HOME/.local/bin" ``` +This installs and runs `ubi` and then places the `kingfisher` executable in `~/.local/bin` on Unix-like systems. + +
+ +### Windows + +
+ +You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform. + ```powershell # Windows powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in . ``` -This installs `ubi` and then places the `kingfisher` executable in `~/bin` on Unix-like -systems (or the current directory on Windows). +This installs and runs `ubi` and then places the `kingfisher` executable in the current directory on Windows. +
-Or you may compile for your platform via `make`: + +### Compile +You may compile for your platform via `make` + +
```bash # NOTE: Requires Docker @@ -174,9 +216,14 @@ make darwin-all # builds both x64 and arm64 make all # builds for every OS and architecture supported ``` -### Run Kingfisher in Docker +
+ +### Docker Run Kingfisher in Docker + +Run the dockerized Kingfisher container + +
-Run the dockerized Kingfisher container: ```bash # GitHub Container Registry docker run --rm ghcr.io/mongodb/kingfisher:latest --version @@ -233,6 +280,8 @@ docker run --rm \ ``` +
+ # 🔐 Detection Rules at a Glance Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview: @@ -248,7 +297,7 @@ Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything fr | **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more | | **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more | -## Write Custom Rules! +## 📝 Write Custom Rules! Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential. @@ -258,7 +307,7 @@ First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom King Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! -# Usage +# 🎉 Usage ## Basic Examples @@ -400,7 +449,7 @@ kingfisher scan ./my-project \ -v ``` -## GitHub Scanning an AWS S3 Bucket +## GitHub Scanning an AWS S3 Bucket You can scan S3 objects directly: ```bash @@ -415,7 +464,7 @@ Credential resolution happens in this order: If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role. -Examples: +Examples ```bash # using explicit keys @@ -452,7 +501,30 @@ docker run --rm \ scan --s3-bucket bucket-name ``` -## Docker Scanning Docker Images +## Google Cloud Storage Scanning a Google Cloud Storage Bucket + +The `--gcs-bucket` flag streams objects directly from Google Cloud Storage. Authentication uses +Application Default Credentials, so you can provide a service-account JSON file via the +`GOOGLE_APPLICATION_CREDENTIALS` environment variable or by passing `--gcs-service-account`. Public +buckets work without credentials. 
+ +```bash +kingfisher scan --gcs-bucket bucket-name + +# scan a sub-tree inside the bucket +kingfisher scan --gcs-bucket bucket-name --gcs-prefix path/to/data/ + +# supply a service-account key explicitly +kingfisher scan --gcs-bucket bucket-name --gcs-service-account /path/to/key.json +``` + +Functional example: +```bash +kingfisher scan --gcs-bucket cloud-samples-data --gcs-prefix "storage/" +``` + + +## Docker Scanning Docker Images Kingfisher will first try to use any locally available image, then fall back to pulling via OCI. @@ -482,7 +554,7 @@ kingfisher scan --docker-image some-private-registry.dkr.ecr.us-east-1.amazonaws kingfisher scan --docker-image private.registry.example.com/my-image:tag ``` -## GitHub Scanning GitHub +## GitHub Scanning GitHub ### Scan GitHub organization (requires `KF_GITHUB_TOKEN`) @@ -524,7 +596,7 @@ KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/priva --- -## GitLab Scanning GitLab +## GitLab Scanning GitLab ### Scan GitLab group (requires `KF_GITLAB_TOKEN`) @@ -580,18 +652,18 @@ kingfisher gitlab repos list --group my-group --include-subgroups # skip specific projects when listing or scanning (supports glob patterns) kingfisher gitlab repos list --group my-group --gitlab-exclude my-group/**/legacy-* ``` -## Azure Repos Scanning Azure Repos +## Azure Repos Scanning Azure Repos -### Scan Azure DevOps organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`) +### Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`) ```bash kingfisher scan --azure-organization my-org -# Azure DevOps Server example +# Azure Repos Server example KF_AZURE_PAT="pat" kingfisher scan --azure-organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/ ``` -### Scan specific Azure DevOps projects +### Scan specific Azure Repos projects Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects. 
@@ -621,7 +693,7 @@ kingfisher azure repos list --project my-org/app --project my-org/api # skip specific repositories while listing (supports glob patterns) kingfisher azure repos list --organization my-org --azure-exclude my-org/**/experimental-* ``` -## Gitea Scanning Gitea +## Gitea Scanning Gitea ### Scan Gitea organization (requires `KF_GITEA_TOKEN`) @@ -673,7 +745,7 @@ KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --all-gitea-organizations # self-hosted example KF_GITEA_TOKEN="gtoken" kingfisher gitea repos list --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/ ``` -## Bitbucket Scanning Bitbucket +## Bitbucket Scanning Bitbucket ### Scan Bitbucket workspace ```bash @@ -745,7 +817,45 @@ Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, fo `https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with `--bitbucket-username` and `--bitbucket-token`, and pass `--ignore-certs` when connecting to HTTP or otherwise insecure instances. -## Jira Scanning Jira +## Hugging Face Scanning Hugging Face + +Hugging Face hosts git repositories for models, datasets, and Spaces. Kingfisher can enumerate and scan all three resource types. + +### Scan Hugging Face user + +```bash +kingfisher scan --huggingface-user +``` + +### Scan Hugging Face organization + +```bash +kingfisher scan --huggingface-organization +``` + +### Scan specific Hugging Face resources + +Scan individual repositories by ID (owner/name) or by passing the full HTTPS URL: + +```bash +kingfisher scan --huggingface-model +kingfisher scan --huggingface-dataset https://huggingface.co/datasets// +kingfisher scan --huggingface-space +``` + +Use `--huggingface-exclude` to omit results returned by user or organization enumeration. Prefix values with `model:`, `dataset:`, or `space:` when you only want to skip a specific resource type. 
+ +### List Hugging Face repositories + +```bash +kingfisher huggingface repos list --huggingface-user +``` + +### Authenticate to Hugging Face + +Private repositories require an access token provided through the `KF_HUGGINGFACE_TOKEN` environment variable. For git authentication the helper also honours `KF_HUGGINGFACE_USERNAME` (default `hf_user`). + +## Jira Scanning Jira ### Scan Jira issues matching a JQL query @@ -764,7 +874,7 @@ KF_JIRA_TOKEN="token" kingfisher scan \ --max-results 1000 ``` -## Confluence Scanning Confluence +## Confluence Scanning Confluence ### Scan Confluence pages matching a CQL query ```bash @@ -789,7 +899,7 @@ Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environ To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions. -## Slack Scanning Slack +## Slack Scanning Slack ### Scan Slack messages matching a search query ```bash @@ -811,11 +921,13 @@ KF_SLACK_TOKEN="xoxp-1234..." 
kingfisher scan \ | `KF_GITLAB_TOKEN` | GitLab Personal Access Token | | `KF_GITEA_TOKEN` | Gitea Personal Access Token | | `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) | -| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure DevOps Personal Access Token | -| `KF_AZURE_USERNAME` | Username to use with Azure DevOps PATs (defaults to `pat` when unset) | +| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure Repos Personal Access Token | +| `KF_AZURE_USERNAME` | Username to use with Azure Repos PATs (defaults to `pat` when unset) | | `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication | | `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token | | `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token | +| `KF_HUGGINGFACE_TOKEN` | Hugging Face access token for API enumeration and git cloning | +| `KF_HUGGINGFACE_USERNAME` | Optional username for Hugging Face git operations (defaults to `hf_user`) | | `KF_JIRA_TOKEN` | Jira API token | | `KF_CONFLUENCE_TOKEN` | Confluence API token | | `KF_SLACK_TOKEN` | Slack API token | @@ -869,7 +981,7 @@ Kingfisher automatically queries GitHub for a newer release when it starts and t - **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely -# Advanced Options +# 🤓 Advanced Options ## Build a Baseline / Detect New Secrets @@ -963,6 +1075,8 @@ leaves the default unchanged. 
- `--manage-baseline`: Create or update the baseline file with current findings - `--skip-regex `: Ignore findings whose text matches this regex (repeatable) - `--skip-word `: Ignore findings containing this case-insensitive word (repeatable) +- `--skip-aws-account `: Skip live AWS validation for findings tied to the specified AWS account number (repeatable, accepts comma-separated lists) +- `--skip-aws-account-file `: Load AWS account numbers to skip from a file (one account per line; `#` comments allowed) - `--ignore-comment `: Honor additional inline directives from other scanners (repeatable; e.g. `--ignore-comment "gitleaks:allow"`) - `--no-ignore`: Disable inline directives entirely so every match is reported ## Understanding `--confidence` @@ -996,6 +1110,66 @@ kingfisher scan \ If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early. +### Skip Canary Tokens (AWS) + +Canary/honey tokens are intentionally leaked credentials used to catch misuse. Kingfisher can **recognize and skip** known AWS canary accounts so hygiene scans don’t set off alerts. + +**How to skip** +Pass the 12-digit AWS account IDs for your canaries via `--skip-aws-account` (comma-separated) or `--skip-aws-account-file` (one ID per line; blank lines and `#` comments allowed). Kingfisher also ships with a **pre-seeded (but not exhaustive)** list of Thinkst Canary account IDs used by canarytokens.org, so many are skipped automatically. 
+ +```bash +kingfisher scan /path/to/code \ + --skip-aws-account "171436882533,534261010715" + +# or combine preloaded canary IDs with a just-created decoy account +printf '999900001111 \n534261010715' > /tmp/canary_accounts.txt + +kingfisher scan /path/to/repo \ + --skip-aws-account-file /tmp/canary_accounts.txt + +``` + +**What you’ll see** +Findings tied to a skip-listed account report `Validation: Not Attempted` and note in the `Response:` that the entry came from the skip list: + +```bash +AWS SECRET ACCESS KEY => [KINGFISHER.AWS.2] + |Finding.......: + |Fingerprint...: 2141074333616819500 + |Confidence....: medium + |Entropy.......: 5.00 + |Validation....: Not Attempted + |__Response....: (skip list entry) AWS validation not attempted for account 171436882533. + |Language......: Unknown + |Line Num......: 21 + |Path..........: /tmp/test_canary_accounts.log +``` + +**Why this matters** +Skipping prevents noisy tripwires in prod telemetry while keeping the status explicit—“Not Attempted” isn’t a pass. If needed, verify these credentials out-of-band or with a safe, non-triggering method. + + +#### Common CLI flows + +```bash +# Skip a few in-house canaries during a filesystem scan +kingfisher scan repo/ \ + --skip-aws-account "111122223333,444455556666" + +# Read a longer list from disk +kingfisher scan repo/ \ + --skip-aws-account-file /tmp/scripts/canary_accounts.txt + +# Combine preloaded canary IDs with a just-created decoy account +printf '999900001111\n534261010715\n' > /tmp/new_canary.txt + +kingfisher scan /path/to/repo \ + --skip-aws-account-file /tmp/new_canary.txt + +``` + +Tip: if you manage multiple canary fleets (Thinkst, self-hosted alternatives, or bespoke decoys), checkpoint the account IDs alongside your infrastructure-as-code so security teams can rotate or expand the skip list without editing pipelines. + ### Inline ignore directives Add `kingfisher:ignore` anywhere on the same line as a finding to silence it. 
Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value: diff --git a/docs/assets/icons/gcs.svg b/docs/assets/icons/gcs.svg new file mode 100644 index 0000000..842c121 --- /dev/null +++ b/docs/assets/icons/gcs.svg @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/assets/icons/huggingface.svg b/docs/assets/icons/huggingface.svg new file mode 100644 index 0000000..43711df --- /dev/null +++ b/docs/assets/icons/huggingface.svg @@ -0,0 +1,41 @@ + + + + + + + + + + + + diff --git a/src/cli/commands/huggingface.rs b/src/cli/commands/huggingface.rs new file mode 100644 index 0000000..f0879ab --- /dev/null +++ b/src/cli/commands/huggingface.rs @@ -0,0 +1,73 @@ +use clap::{Args, Subcommand}; + +use crate::cli::commands::output::OutputArgs; + +use super::github::GitHubOutputFormat; + +/// Top-level Hugging Face command group +#[derive(Args, Debug)] +pub struct HuggingFaceArgs { + #[command(subcommand)] + pub command: HuggingFaceCommand, +} + +#[derive(Subcommand, Debug)] +pub enum HuggingFaceCommand { + /// Interact with Hugging Face repositories + #[command(subcommand)] + Repos(HuggingFaceReposCommand), +} + +#[derive(Subcommand, Debug)] +pub enum HuggingFaceReposCommand { + /// List Hugging Face repositories + List(HuggingFaceReposListArgs), +} + +#[derive(Args, Debug, Clone)] +pub struct HuggingFaceReposListArgs { + #[command(flatten)] + pub repo_specifiers: HuggingFaceRepoSpecifiers, + + #[command(flatten)] + pub output_args: OutputArgs, +} + +#[derive(Args, Debug, Clone, Default)] +pub struct HuggingFaceRepoSpecifiers { + /// Models, datasets, and Spaces owned by these users + #[arg(long = "huggingface-user")] + pub user: Vec, + + /// Models, datasets, and Spaces owned by these organizations + #[arg(long = "huggingface-organization", 
alias = "huggingface-org")] + pub organization: Vec, + + /// Specific models to scan (format: owner/name or full URL) + #[arg(long = "huggingface-model")] + pub model: Vec, + + /// Specific datasets to scan (format: owner/name or full URL) + #[arg(long = "huggingface-dataset")] + pub dataset: Vec, + + /// Specific Spaces to scan (format: owner/name or full URL) + #[arg(long = "huggingface-space")] + pub space: Vec, + + /// Skip specific repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:) + #[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")] + pub exclude: Vec, +} + +impl HuggingFaceRepoSpecifiers { + pub fn is_empty(&self) -> bool { + self.user.is_empty() + && self.organization.is_empty() + && self.model.is_empty() + && self.dataset.is_empty() + && self.space.is_empty() + } +} + +pub type HuggingFaceOutputFormat = GitHubOutputFormat; diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs index 4bab9d1..a41cf82 100644 --- a/src/cli/commands/inputs.rs +++ b/src/cli/commands/inputs.rs @@ -28,6 +28,11 @@ pub struct InputSpecifierArgs { "gitlab_group", "gitea_user", "gitea_organization", + "huggingface_user", + "huggingface_organization", + "huggingface_model", + "huggingface_dataset", + "huggingface_space", "bitbucket_user", "bitbucket_workspace", "bitbucket_project", @@ -43,7 +48,8 @@ pub struct InputSpecifierArgs { "confluence_url", "docker_image", "slack_query", - "s3_bucket" + "s3_bucket", + "gcs_bucket" ]), num_args = 0.., value_hint = ValueHint::AnyPath @@ -120,6 +126,30 @@ pub struct InputSpecifierArgs { #[arg(long, alias = "include-subgroups")] pub gitlab_include_subgroups: bool, + /// Scan models, datasets, and Spaces belonging to the specified Hugging Face users + #[arg(long = "huggingface-user")] + pub huggingface_user: Vec, + + /// Scan models, datasets, and Spaces belonging to the specified Hugging Face organizations + #[arg(long = "huggingface-organization", alias = "huggingface-org")] + 
pub huggingface_organization: Vec, + + /// Scan a specific Hugging Face model (format: owner/name or full URL) + #[arg(long = "huggingface-model")] + pub huggingface_model: Vec, + + /// Scan a specific Hugging Face dataset (format: owner/name or full URL) + #[arg(long = "huggingface-dataset")] + pub huggingface_dataset: Vec, + + /// Scan a specific Hugging Face Space (format: owner/name or full URL) + #[arg(long = "huggingface-space")] + pub huggingface_space: Vec, + + /// Skip specific Hugging Face repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:) + #[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")] + pub huggingface_exclude: Vec, + // Gitea Options /// Scan repositories belonging to the specified Gitea user #[arg(long)] @@ -256,6 +286,18 @@ pub struct InputSpecifierArgs { #[arg(long, requires = "s3_bucket")] pub aws_local_profile: Option, + /// Scan the specified Google Cloud Storage bucket + #[arg(long)] + pub gcs_bucket: Option, + + /// Optional prefix within the GCS bucket + #[arg(long, requires = "gcs_bucket")] + pub gcs_prefix: Option, + + /// Path to a service account JSON file for GCS authentication + #[arg(long, value_hint = ValueHint::FilePath, requires = "gcs_bucket")] + pub gcs_service_account: Option, + /// Docker/OCI images to scan (no local Docker required) #[arg(long = "docker-image")] pub docker_image: Vec, @@ -299,7 +341,6 @@ pub struct ContentFilteringArgs { #[arg( long = "max-file-size", visible_alias = "max-filesize", // also show in --help - // alias = "max-filesize", // use this instead if you DON’T want it shown in --help default_value_t = 256.0, value_name = "MB" )] diff --git a/src/cli/commands/mod.rs b/src/cli/commands/mod.rs index 0434af9..4b25b89 100644 --- a/src/cli/commands/mod.rs +++ b/src/cli/commands/mod.rs @@ -3,6 +3,7 @@ pub mod bitbucket; pub mod gitea; pub mod github; pub mod gitlab; +pub mod huggingface; pub mod inputs; pub mod output; pub mod rules; diff --git 
a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 78312d8..f4d7921 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -1,4 +1,5 @@ use clap::{Args, ValueEnum}; +use std::path::PathBuf; use strum::Display; use tracing::debug; @@ -119,6 +120,14 @@ pub struct ScanArgs { #[arg(long = "skip-word", value_name = "WORD")] pub skip_word: Vec, + /// AWS account IDs whose findings should skip live credential validation (repeatable) + #[arg(long = "skip-aws-account", value_name = "ACCOUNT_ID", value_delimiter = ',')] + pub skip_aws_account: Vec, + + /// File containing AWS account IDs to skip (one per line, `#` comments ignored) + #[arg(long = "skip-aws-account-file", value_name = "FILE")] + pub skip_aws_account_file: Option, + /// Additional inline ignore directives to recognise (repeatable) #[arg(long = "ignore-comment", value_name = "DIRECTIVE")] pub extra_ignore_comments: Vec, diff --git a/src/cli/global.rs b/src/cli/global.rs index a03d3d4..3c2fe3f 100644 --- a/src/cli/global.rs +++ b/src/cli/global.rs @@ -8,7 +8,7 @@ use tracing::Level; use crate::cli::commands::{ azure::AzureArgs, bitbucket::BitbucketArgs, gitea::GiteaArgs, github::GitHubArgs, - gitlab::GitLabArgs, rules::RulesArgs, scan::ScanArgs, + gitlab::GitLabArgs, huggingface::HuggingFaceArgs, rules::RulesArgs, scan::ScanArgs, }; #[deny(missing_docs)] @@ -81,6 +81,10 @@ pub enum Command { #[command(name = "azure")] Azure(AzureArgs), + /// Interact with the Hugging Face Hub + #[command(name = "huggingface")] + HuggingFace(HuggingFaceArgs), + /// Manage rules #[command(alias = "rule")] Rules(RulesArgs), diff --git a/src/gcs.rs b/src/gcs.rs new file mode 100644 index 0000000..dc346fd --- /dev/null +++ b/src/gcs.rs @@ -0,0 +1,96 @@ +use std::path::Path; + +use anyhow::{Context, Result}; +use gcloud_storage::{ + client::{google_cloud_auth::credentials::CredentialsFile, Client, ClientConfig}, + http::objects::{ + download::Range, + get::GetObjectRequest, + list::{ListObjectsRequest, 
ListObjectsResponse}, + }, +}; +use tracing::debug; + +/// Visit every object in the given GCS bucket, optionally filtered by prefix. +/// +/// Authentication is attempted via Application Default Credentials. When that +/// fails and no explicit service account path was provided, the client falls +/// back to anonymous access so public buckets can still be scanned. +pub async fn visit_bucket_objects( + bucket: &str, + prefix: Option<&str>, + service_account_path: Option<&Path>, + mut visitor: F, +) -> Result<()> +where + F: FnMut(String, Vec) -> Result<()>, +{ + let config_result = if let Some(path) = service_account_path { + let credentials = CredentialsFile::new_from_file(path.to_string_lossy().into_owned()) + .await + .with_context(|| { + format!("Failed to read GCS service account credentials from {}", path.display()) + })?; + + ClientConfig::default().with_credentials(credentials).await + } else { + ClientConfig::default().with_auth().await + }; + + let config = match config_result { + Ok(config) => config, + Err(err) => { + if service_account_path.is_some() + || std::env::var("GOOGLE_APPLICATION_CREDENTIALS").is_ok() + || std::env::var("GOOGLE_APPLICATION_CREDENTIALS_JSON").is_ok() + { + return Err(err) + .context("Failed to authenticate with GCS using provided credentials"); + } + debug!("Falling back to anonymous GCS access: {err}"); + ClientConfig::default().anonymous() + } + }; + + let client = Client::new(config); + let mut page_token: Option = None; + + loop { + let request = ListObjectsRequest { + bucket: bucket.to_string(), + prefix: prefix.map(|p| p.to_string()), + page_token: page_token.clone(), + ..ListObjectsRequest::default() + }; + + let mut response: ListObjectsResponse = client + .list_objects(&request) + .await + .with_context(|| format!("Failed to list objects in bucket {bucket}"))?; + + if let Some(items) = response.items.take() { + for object in items.into_iter().filter(|o| !o.name.is_empty()) { + let data = client + .download_object( + 
&GetObjectRequest { + bucket: bucket.to_string(), + object: object.name.clone(), + ..GetObjectRequest::default() + }, + &Range::default(), + ) + .await + .with_context(|| format!("Failed to fetch object {}", object.name))?; + + visitor(object.name, data)?; + } + } + + match response.next_page_token { + Some(token) if !token.is_empty() => page_token = Some(token), + _ => break, + } + } + + Ok(()) +} diff --git a/src/git_binary.rs b/src/git_binary.rs index 82fd990..a629373 100644 --- a/src/git_binary.rs +++ b/src/git_binary.rs @@ -40,6 +40,15 @@ const AZURE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_azcreds() { fi }; _azcreds"#; +const HUGGINGFACE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_hfcreds() { + token="$KF_HUGGINGFACE_TOKEN"; + if [ -n "$token" ]; then + user="${KF_HUGGINGFACE_USERNAME:-hf_user}"; + echo username="$user"; + echo password="$token"; + fi +}; _hfcreds"#; + /// Represents errors that can occur when interacting with the `git` CLI. #[derive(Debug, thiserror::Error)] pub enum GitError { @@ -47,14 +56,37 @@ pub enum GitError { IOError(#[from] std::io::Error), #[error( - "git execution failed\ncode={}\nstdout=```\n{}```\nstderr=```\n{}```", - .status, - String::from_utf8_lossy(.stdout), - String::from_utf8_lossy(.stderr) + "git execution failed (status: {status}){summary}", + status = format_exit_status(.status), + summary = format_git_error_summary(.stdout.as_slice(), .stderr.as_slice()) )] GitError { stdout: Vec, stderr: Vec, status: ExitStatus }, } +fn format_exit_status(status: &ExitStatus) -> String { + status.code().map(|code| code.to_string()).unwrap_or_else(|| status.to_string()) +} + +fn format_git_error_summary(stdout: &[u8], stderr: &[u8]) -> String { + let mut messages = Vec::new(); + if let Some(line) = summarize_output(stderr) { + messages.push(line); + } + if let Some(line) = summarize_output(stdout) { + messages.push(line); + } + if messages.is_empty() { + String::new() + } else { + format!(": {}", messages.join(" | ")) + } 
+} + +fn summarize_output(output: &[u8]) -> Option { + let text = String::from_utf8_lossy(output); + text.lines().map(str::trim).find(|line| !line.is_empty()).map(|line| line.to_owned()) +} + /// A helper struct for running `git` commands. /// /// It supports optional GitHub, GitLab, Gitea, and Bitbucket credentials passed via @@ -91,6 +123,8 @@ impl Git { let has_azure_token = ["KF_AZURE_TOKEN", "KF_AZURE_PAT"] .iter() .any(|key| matches!(std::env::var(key), Ok(value) if !value.is_empty())); + let has_huggingface_token = + matches!(std::env::var("KF_HUGGINGFACE_TOKEN"), Ok(value) if !value.is_empty()); // If credentials are provided via environment variables, clear existing helpers first. if has_github_token @@ -98,6 +132,7 @@ impl Git { || has_gitea_token || has_bitbucket_credentials || has_azure_token + || has_huggingface_token { credentials.push("-c".into()); credentials.push(r#"credential.helper="#.into()); @@ -136,6 +171,11 @@ impl Git { credentials.push(AZURE_CREDENTIAL_HELPER.into()); } + if has_huggingface_token { + credentials.push("-c".into()); + credentials.push(HUGGINGFACE_CREDENTIAL_HELPER.into()); + } + Self { credentials, ignore_certs } } diff --git a/src/huggingface.rs b/src/huggingface.rs new file mode 100644 index 0000000..42bbf03 --- /dev/null +++ b/src/huggingface.rs @@ -0,0 +1,636 @@ +use std::{collections::HashSet, env, time::Duration}; + +use anyhow::{anyhow, Result}; +use indicatif::{ProgressBar, ProgressStyle}; +use reqwest::{header::LINK, StatusCode, Url}; +use serde::Deserialize; +use serde_json::Value; +use tracing::{debug, warn}; + +use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT}; + +#[derive(Debug, Clone, Default)] +pub struct RepoSpecifiers { + pub user: Vec, + pub organization: Vec, + pub model: Vec, + pub dataset: Vec, + pub space: Vec, + pub exclude: Vec, +} + +impl RepoSpecifiers { + pub fn is_empty(&self) -> bool { + self.user.is_empty() + && self.organization.is_empty() + && self.model.is_empty() + && 
self.dataset.is_empty() + && self.space.is_empty() + } +} + +#[derive(Clone, Default)] +pub struct AuthConfig { + token: Option, +} + +impl std::fmt::Debug for AuthConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AuthConfig") + .field( + "token", + &self + .token + .as_ref() + .map(|token| format!("{}…", token.chars().take(4).collect::())), + ) + .finish() + } +} + +impl AuthConfig { + pub fn from_env() -> Self { + let token = env::var("KF_HUGGINGFACE_TOKEN").ok().filter(|t| !t.trim().is_empty()); + Self { token } + } + + fn apply(&self, request: reqwest::RequestBuilder) -> reqwest::RequestBuilder { + if let Some(token) = &self.token { + request.bearer_auth(token) + } else { + request + } + } + + fn has_token(&self) -> bool { + self.token.is_some() + } +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] +enum ResourceKind { + Model, + Dataset, + Space, +} + +impl ResourceKind { + fn api_path(self) -> &'static str { + match self { + ResourceKind::Model => "models", + ResourceKind::Dataset => "datasets", + ResourceKind::Space => "spaces", + } + } + + fn git_url(self, slug: &str) -> String { + match self { + ResourceKind::Model => format!("https://huggingface.co/{slug}.git"), + ResourceKind::Dataset => format!("https://huggingface.co/datasets/{slug}.git"), + ResourceKind::Space => format!("https://huggingface.co/spaces/{slug}.git"), + } + } + + fn canonical_prefix(self) -> &'static str { + match self { + ResourceKind::Model => "model", + ResourceKind::Dataset => "dataset", + ResourceKind::Space => "space", + } + } + + fn display_name_singular(self) -> &'static str { + match self { + ResourceKind::Model => "model", + ResourceKind::Dataset => "dataset", + ResourceKind::Space => "space", + } + } + + fn display_name_plural(self) -> &'static str { + match self { + ResourceKind::Model => "models", + ResourceKind::Dataset => "datasets", + ResourceKind::Space => "spaces", + } + } +} + +#[derive(Debug, Clone, Eq, PartialEq, 
Hash)] +struct ResourceRef { + kind: ResourceKind, + slug: String, +} + +impl ResourceRef { + fn new(kind: ResourceKind, slug: String) -> Self { + Self { kind, slug } + } + + fn canonical_key(&self) -> String { + format!("{}:{}", self.kind.canonical_prefix(), self.slug.to_lowercase()) + } + + fn git_url(&self) -> String { + self.kind.git_url(&self.slug) + } +} + +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum HuggingFaceItem { + Id { + id: String, + }, + ModelId { + #[serde(rename = "modelId")] + model_id: String, + }, +} + +impl HuggingFaceItem { + fn into_identifier(self) -> String { + match self { + HuggingFaceItem::Id { id } => id, + HuggingFaceItem::ModelId { model_id } => model_id, + } + } +} + +#[derive(Default)] +struct ExcludeSet { + typed: HashSet, + untyped: HashSet, +} + +impl ExcludeSet { + fn from_list(values: &[String]) -> Self { + let mut typed = HashSet::new(); + let mut untyped = HashSet::new(); + for raw in values { + let trimmed = raw.trim(); + if trimmed.is_empty() { + continue; + } + if let Some((prefix, rest)) = trimmed.split_once(':') { + match normalize_kind(prefix) { + Some(kind) => { + if let Some(slug) = parse_slug_for_kind(kind, rest) { + typed.insert(format!( + "{}:{}", + kind.canonical_prefix(), + slug.to_lowercase() + )); + } else { + warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)"); + } + } + None => warn!("Ignoring invalid Hugging Face exclusion '{raw}' (unknown type)"), + } + } else if let Some(slug) = normalize_untyped_slug(trimmed) { + untyped.insert(slug); + } else { + warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)"); + } + } + Self { typed, untyped } + } + + fn should_exclude(&self, kind: ResourceKind, slug: &str) -> bool { + let typed_key = format!("{}:{}", kind.canonical_prefix(), slug.to_lowercase()); + if self.typed.contains(&typed_key) { + return true; + } + self.untyped.contains(&slug.to_lowercase()) + } +} + +fn normalize_kind(raw: &str) -> Option { + 
match raw.trim().to_ascii_lowercase().as_str() { + "model" | "models" => Some(ResourceKind::Model), + "dataset" | "datasets" => Some(ResourceKind::Dataset), + "space" | "spaces" => Some(ResourceKind::Space), + _ => None, + } +} + +fn normalize_untyped_slug(raw: &str) -> Option { + let trimmed = raw.trim(); + if trimmed.is_empty() { + return None; + } + let segments: Vec<&str> = trimmed.split('/').filter(|segment| !segment.is_empty()).collect(); + normalize_untyped_segments(&segments) +} + +fn normalize_untyped_segments(segments: &[&str]) -> Option { + if segments.is_empty() { + return None; + } + let mut parts: Vec<&str> = segments.to_vec(); + if let Some(first) = parts.first() { + let lowered = first.trim().to_ascii_lowercase(); + if matches!( + lowered.as_str(), + "models" | "model" | "datasets" | "dataset" | "spaces" | "space" + ) { + parts.remove(0); + } + } + if parts.len() < 2 { + return None; + } + let owner = parts[0].trim(); + let binding = parts[1..].join("/"); + let name = binding.trim_end_matches(".git").trim(); + + if owner.is_empty() || name.is_empty() { + return None; + } + Some(format!("{}/{}", owner, name).to_lowercase()) +} + +fn parse_slug_for_kind(kind: ResourceKind, raw: &str) -> Option { + let trimmed = raw.trim(); + if trimmed.is_empty() { + return None; + } + if trimmed.starts_with("http://") || trimmed.starts_with("https://") { + let url = Url::parse(trimmed).ok()?; + let segments: Vec<&str> = url + .path_segments() + .map(|segments| segments.filter(|s| !s.is_empty()).collect()) + .unwrap_or_default(); + return parse_slug_segments(kind, &segments); + } + let segments: Vec<&str> = trimmed.split('/').filter(|s| !s.is_empty()).collect(); + parse_slug_segments(kind, &segments) +} + +fn parse_slug_segments(kind: ResourceKind, segments: &[&str]) -> Option { + if segments.is_empty() { + return None; + } + let mut parts: Vec<&str> = segments.to_vec(); + if let Some(first) = parts.first() { + let lowered = first.trim().to_ascii_lowercase(); + let 
should_trim = match kind { + ResourceKind::Model => matches!(lowered.as_str(), "models" | "model"), + ResourceKind::Dataset => matches!(lowered.as_str(), "datasets" | "dataset"), + ResourceKind::Space => matches!(lowered.as_str(), "spaces" | "space"), + }; + if should_trim { + parts.remove(0); + } + } + if parts.len() < 2 { + return None; + } + let owner = parts[0].trim(); + let binding = parts[1..].join("/"); + let name = binding.trim_end_matches(".git").trim(); + + if owner.is_empty() || name.is_empty() { + return None; + } + Some(format!("{owner}/{name}")) +} + +fn parse_next_link(value: &str) -> Option { + value.split(',').find_map(|part| { + let part = part.trim(); + let (url_part, params) = part.split_once('>')?; + if params.contains("rel=\"next\"") { + let url = url_part.trim_start_matches('<').trim(); + Url::parse(url).ok() + } else { + None + } + }) +} + +const BODY_SNIPPET_LIMIT: usize = 200; + +async fn fetch_paginated( + client: &reqwest::Client, + mut current_url: Url, + auth: &AuthConfig, + context: &str, +) -> Result> { + let mut items = Vec::new(); + loop { + let mut request = + client.get(current_url.clone()).header("User-Agent", GLOBAL_USER_AGENT.as_str()); + request = auth.apply(request); + let response = request.send().await?; + let status = response.status(); + let link_header = response + .headers() + .get(LINK) + .and_then(|value| value.to_str().ok()) + .map(|value| value.to_string()); + if !status.is_success() { + let body = response.text().await.unwrap_or_default(); + let mut message = format!( + "Hugging Face API request failed while enumerating {context} ({status}): {body}" + ); + if matches!(status, StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN) + && !auth.has_token() + { + message.push_str( + "\nProvide a Hugging Face access token via the KF_HUGGINGFACE_TOKEN environment variable.", + ); + } + return Err(anyhow!(message)); + } + let body = response.bytes().await?; + let value: Value = serde_json::from_slice(&body).map_err(|err| { + 
let snippet = body_snippet(&body); + anyhow!( + "Failed to parse Hugging Face response while enumerating {context}: {err}. Body snippet: {snippet}", + context = context, + err = err, + snippet = snippet + ) + })?; + + let array = value.as_array().ok_or_else(|| { + let snippet = body_snippet(&body); + anyhow!( + "Unexpected Hugging Face response format while enumerating {context} (expected array). Body snippet: {snippet}", + context = context, + snippet = snippet + ) + })?; + + let mut page = Vec::new(); + for (index, element) in array.iter().enumerate() { + match serde_json::from_value::(element.clone()) { + Ok(item) => page.push(item), + Err(err) => { + let snippet = value_snippet(element); + warn!( + "Skipping Hugging Face item at index {index} while enumerating {context}: {err}. Item snippet: {snippet}" + ); + } + } + } + items.append(&mut page); + if let Some(link_value) = link_header { + if let Some(next_url) = parse_next_link(&link_value) { + current_url = next_url; + continue; + } + } + break; + } + Ok(items) +} + +fn body_snippet(body: &[u8]) -> String { + truncate_for_display(&String::from_utf8_lossy(body), BODY_SNIPPET_LIMIT) +} + +fn value_snippet(value: &Value) -> String { + let text = value.to_string(); + truncate_for_display(&text, BODY_SNIPPET_LIMIT) +} + +fn truncate_for_display(text: &str, limit: usize) -> String { + let mut snippet: String = text.chars().take(limit).collect(); + if text.chars().count() > limit { + snippet.push('…'); + } + snippet +} + +async fn fetch_resources_for_owner( + client: &reqwest::Client, + base_url: &Url, + owner: &str, + label: &str, + auth: &AuthConfig, + progress: Option<&ProgressBar>, +) -> Result> { + let mut resources = Vec::new(); + for kind in [ResourceKind::Model, ResourceKind::Dataset, ResourceKind::Space] { + if let Some(pb) = progress { + pb.set_message(format!( + "Enumerating Hugging Face {label} {}", + kind.display_name_plural() + )); + } + let mut url = base_url.join(kind.api_path())?; + { + let mut pairs 
= url.query_pairs_mut(); + pairs.append_pair("author", owner); + pairs.append_pair("limit", "100"); + } + let context = format!("{} for {label}", kind.display_name_plural()); + match fetch_paginated(client, url, auth, &context).await { + Ok(items) => { + for item in items { + let identifier = item.into_identifier(); + if let Some(slug) = parse_slug_for_kind(kind, &identifier) { + resources.push(ResourceRef::new(kind, slug)); + } else { + warn!( + "Skipping Hugging Face {} with unexpected identifier '{}'", + kind.display_name_singular(), + identifier + ); + } + } + } + Err(err) => { + warn!( + "Failed to enumerate Hugging Face {} for {label}: {err}", + kind.display_name_plural() + ); + } + } + } + Ok(resources) +} + +fn append_explicit_resources(specifiers: &RepoSpecifiers, resources: &mut Vec) { + for model in &specifiers.model { + if let Some(slug) = parse_slug_for_kind(ResourceKind::Model, model) { + resources.push(ResourceRef::new(ResourceKind::Model, slug)); + } else { + warn!("Ignoring invalid Hugging Face model identifier '{model}'"); + } + } + for dataset in &specifiers.dataset { + if let Some(slug) = parse_slug_for_kind(ResourceKind::Dataset, dataset) { + resources.push(ResourceRef::new(ResourceKind::Dataset, slug)); + } else { + warn!("Ignoring invalid Hugging Face dataset identifier '{dataset}'"); + } + } + for space in &specifiers.space { + if let Some(slug) = parse_slug_for_kind(ResourceKind::Space, space) { + resources.push(ResourceRef::new(ResourceKind::Space, slug)); + } else { + warn!("Ignoring invalid Hugging Face space identifier '{space}'"); + } + } +} + +pub async fn enumerate_repo_urls( + specifiers: &RepoSpecifiers, + auth: &AuthConfig, + ignore_certs: bool, + progress: Option<&mut ProgressBar>, +) -> Result> { + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(30)) + .danger_accept_invalid_certs(ignore_certs) + .build()?; + let base_url = Url::parse("https://huggingface.co/api/")?; + let excludes = 
ExcludeSet::from_list(&specifiers.exclude); + let mut collected = Vec::new(); + + for user in &specifiers.user { + let label = format!("user {user}"); + if let Some(pb) = progress.as_ref() { + pb.set_message(format!("Enumerating Hugging Face {label}")); + } + match fetch_resources_for_owner( + &client, + &base_url, + user, + &label, + auth, + progress.as_ref().map(|pb| &**pb), + ) + .await + { + Ok(mut resources) => collected.append(&mut resources), + Err(err) => warn!("Failed to enumerate Hugging Face user {user}: {err}"), + } + } + + for org in &specifiers.organization { + let label = format!("organization {org}"); + if let Some(pb) = progress.as_ref() { + pb.set_message(format!("Enumerating Hugging Face {label}")); + } + match fetch_resources_for_owner( + &client, + &base_url, + org, + &label, + auth, + progress.as_ref().map(|pb| &**pb), + ) + .await + { + Ok(mut resources) => collected.append(&mut resources), + Err(err) => warn!("Failed to enumerate Hugging Face organization {org}: {err}"), + } + } + + append_explicit_resources(specifiers, &mut collected); + + let mut seen = HashSet::new(); + let mut urls = Vec::new(); + for resource in collected { + if excludes.should_exclude(resource.kind, &resource.slug) { + debug!( + "Skipping Hugging Face {} {} due to exclusion", + resource.kind.display_name_singular(), + resource.slug + ); + continue; + } + let key = resource.canonical_key(); + if seen.insert(key) { + urls.push(resource.git_url()); + } + } + urls.sort(); + urls.dedup(); + Ok(urls) +} + +pub async fn list_repositories( + specifiers: &RepoSpecifiers, + auth: &AuthConfig, + ignore_certs: bool, + progress_enabled: bool, +) -> Result<()> { + let mut progress = if progress_enabled { + let style = ProgressStyle::with_template("{spinner} {msg} [{elapsed_precise}]") + .expect("progress bar style template should compile"); + let pb = ProgressBar::new_spinner() + .with_style(style) + .with_message("Enumerating Hugging Face repositories"); + 
pb.enable_steady_tick(Duration::from_millis(500)); + pb + } else { + ProgressBar::hidden() + }; + + let urls = enumerate_repo_urls(specifiers, auth, ignore_certs, Some(&mut progress)).await?; + for url in urls { + println!("{url}"); + } + progress.finish_and_clear(); + Ok(()) +} + +pub fn wiki_url(_repo_url: &GitUrl) -> Option { + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_model_slug_from_plain() { + assert_eq!( + parse_slug_for_kind(ResourceKind::Model, "user/model"), + Some("user/model".to_string()) + ); + } + + #[test] + fn parse_dataset_slug_with_prefix() { + assert_eq!( + parse_slug_for_kind(ResourceKind::Dataset, "datasets/user/data.git"), + Some("user/data".to_string()) + ); + } + + #[test] + fn parse_space_slug_from_url() { + assert_eq!( + parse_slug_for_kind(ResourceKind::Space, "https://huggingface.co/spaces/user/demo"), + Some("user/demo".to_string()) + ); + } + + #[test] + fn exclude_set_matches_typed_and_untyped() { + let excludes = + ExcludeSet::from_list(&vec!["model:user/model".into(), "datasets/user/data".into()]); + assert!(excludes.should_exclude(ResourceKind::Model, "user/model")); + assert!(excludes.should_exclude(ResourceKind::Dataset, "user/data")); + assert!(!excludes.should_exclude(ResourceKind::Space, "user/space")); + } + + #[test] + fn parse_link_header() { + let header = "; rel=\"next\""; + let url = parse_next_link(header).expect("next link"); + assert_eq!(url.as_str(), "https://huggingface.co/api/models?cursor=abc"); + } + + #[test] + fn truncate_for_display_adds_ellipsis() { + assert_eq!(truncate_for_display("abcdef", 3), "abc…"); + assert_eq!(truncate_for_display("abc", 5), "abc"); + } +} diff --git a/src/lib.rs b/src/lib.rs index 9fc8106..1736bd1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,7 @@ pub mod defaults; pub mod entropy; pub mod finding_data; pub mod findings_store; +pub mod gcs; pub mod git_binary; pub mod git_commit_metadata; pub mod git_metadata_graph; @@ -21,6 +22,7 @@ pub mod 
git_url; pub mod gitea; pub mod github; pub mod gitlab; +pub mod huggingface; pub mod inline_ignore; pub mod jira; pub mod liquid_filters; diff --git a/src/main.rs b/src/main.rs index c0f6f9a..d6c36c7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -52,7 +52,7 @@ use kingfisher::{ }, findings_store, findings_store::FindingsStore, - gitea, github, + gitea, github, huggingface, rule_loader::RuleLoader, rules_database::RulesDatabase, scanner::{load_and_record_rules, run_scan}, @@ -75,6 +75,7 @@ use crate::cli::commands::{ bitbucket::{BitbucketAuthArgs, BitbucketCommand, BitbucketRepoType, BitbucketReposCommand}, gitea::{GiteaCommand, GiteaRepoType, GiteaReposCommand}, gitlab::{GitLabCommand, GitLabRepoType, GitLabReposCommand}, + huggingface::{HuggingFaceCommand, HuggingFaceReposCommand}, }; fn main() -> anyhow::Result<()> { @@ -93,6 +94,7 @@ fn main() -> anyhow::Result<()> { Command::Bitbucket(_) => num_cpus::get(), // Default for Bitbucket commands Command::Gitea(_) => num_cpus::get(), // Default for Gitea commands Command::Azure(_) => num_cpus::get(), // Default for Azure commands + Command::HuggingFace(_) => num_cpus::get(), // Default for Hugging Face commands Command::Rules(_) => num_cpus::get(), // Default for Rules commands }; @@ -327,6 +329,28 @@ async fn async_main(args: CommandLineArgs) -> Result<()> { } }, }, + Command::HuggingFace(hf_args) => match hf_args.command { + HuggingFaceCommand::Repos(repos_command) => match repos_command { + HuggingFaceReposCommand::List(list_args) => { + let specifiers = huggingface::RepoSpecifiers { + user: list_args.repo_specifiers.user.clone(), + organization: list_args.repo_specifiers.organization.clone(), + model: list_args.repo_specifiers.model.clone(), + dataset: list_args.repo_specifiers.dataset.clone(), + space: list_args.repo_specifiers.space.clone(), + exclude: list_args.repo_specifiers.exclude.clone(), + }; + let auth = huggingface::AuthConfig::from_env(); + huggingface::list_repositories( + &specifiers, + &auth, + 
global_args.ignore_certs, + global_args.use_progress(), + ) + .await?; + } + }, + }, Command::SelfUpdate => { anyhow::bail!("SelfUpdate command should not reach this branch") } @@ -367,6 +391,13 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { gitlab_repo_type: GitLabRepoType::All, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), + gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -400,6 +431,9 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, // Slack query slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), @@ -436,6 +470,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_base64: false, no_inline_ignore: false, diff --git a/src/reporter.rs b/src/reporter.rs index 52a491c..127a9ad 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -703,7 +703,7 @@ mod tests { git_commit_metadata::CommitMetadata, location::{Location, OffsetSpan, SourcePoint, SourceSpan}, matcher::{SerializableCapture, SerializableCaptures}, - origin::OriginSet, + origin::{Origin, OriginSet}, rules::rule::{Confidence, Rule, RuleSyntax}, }; use gix::{date::Time, ObjectId}; @@ -711,79 +711,8 @@ mod tests { use std::path::PathBuf; use tempfile::tempdir; - #[test] - fn build_finding_record_uses_git_blob_path() { - let temp = tempdir().unwrap(); - let datastore = - Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf()))); - let 
reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false }; - - let repo_path = Arc::new(PathBuf::from("/tmp/repo")); - let commit_metadata = Arc::new(CommitMetadata { - commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(), - committer_name: "Alice".into(), - committer_email: "alice@example.com".into(), - committer_timestamp: Time::new(0, 0), - }); - let blob_path = "path/in/history.txt".to_string(); - let origin = OriginSet::new( - Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()), - vec![], - ); - - let rule = Arc::new(Rule::new(RuleSyntax { - name: "Test Rule".into(), - id: "test.rule".into(), - pattern: ".*".into(), - min_entropy: 0.0, - confidence: Confidence::Medium, - visible: true, - examples: vec![], - negative_examples: vec![], - references: vec![], - validation: None, - depends_on_rule: vec![], - })); - - let blob_id = BlobId::new(b"blob-data"); - let report_match = ReportMatch { - origin, - blob_metadata: BlobMetadata { - id: blob_id, - num_bytes: 42, - mime_essence: None, - language: Some("Unknown".into()), - }, - m: Match { - location: Location { - offset_span: OffsetSpan { start: 0, end: 10 }, - source_span: SourceSpan { - start: SourcePoint { line: 19, column: 0 }, - end: SourcePoint { line: 19, column: 10 }, - }, - }, - groups: SerializableCaptures { - captures: SmallVec::<[SerializableCapture; 2]>::new(), - }, - blob_id, - finding_fingerprint: 123, - rule: Arc::clone(&rule), - validation_response_body: "Bad credentials".into(), - validation_response_status: 401, - validation_success: false, - calculated_entropy: 5.29, - visible: true, - is_base64: false, - }, - comment: None, - match_confidence: Confidence::Medium, - visible: true, - validation_response_body: "Bad credentials".into(), - validation_response_status: 401, - validation_success: false, - }; - - let scan_args = ScanArgs { + fn sample_scan_args() -> ScanArgs { + ScanArgs { num_jobs: 1, rules: 
RuleSpecifierArgs::default(), input_specifier_args: InputSpecifierArgs { @@ -802,6 +731,12 @@ mod tests { gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(), gitlab_repo_type: GitLabRepoType::All, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -833,6 +768,9 @@ mod tests { s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, docker_image: Vec::new(), git_clone: GitCloneMode::Bare, git_history: GitHistoryMode::Full, @@ -864,9 +802,98 @@ mod tests { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_inline_ignore: false, + } + } + + fn sample_report_match( + validation_body: &str, + validation_status: u16, + validation_success: bool, + ) -> (ReportMatch, String) { + let repo_path = Arc::new(PathBuf::from("/tmp/repo")); + let commit_metadata = Arc::new(CommitMetadata { + commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(), + committer_name: "Alice".into(), + committer_email: "alice@example.com".into(), + committer_timestamp: Time::new(0, 0), + }); + let blob_path = "path/in/history.txt".to_string(); + let origin = OriginSet::new( + Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()), + vec![], + ); + + let rule = Arc::new(Rule::new(RuleSyntax { + name: "Test Rule".into(), + id: "test.rule".into(), + pattern: ".*".into(), + min_entropy: 0.0, + confidence: Confidence::Medium, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None, + depends_on_rule: vec![], + })); + + let blob_id = BlobId::new(b"blob-data"); + let 
validation_body_owned = validation_body.to_string(); + let report_match = ReportMatch { + origin, + blob_metadata: BlobMetadata { + id: blob_id, + num_bytes: 42, + mime_essence: None, + language: Some("Unknown".into()), + }, + m: Match { + location: Location { + offset_span: OffsetSpan { start: 0, end: 10 }, + source_span: SourceSpan { + start: SourcePoint { line: 19, column: 0 }, + end: SourcePoint { line: 19, column: 10 }, + }, + }, + groups: SerializableCaptures { + captures: SmallVec::<[SerializableCapture; 2]>::new(), + }, + blob_id, + finding_fingerprint: 123, + rule: Arc::clone(&rule), + validation_response_body: validation_body_owned.clone(), + validation_response_status: validation_status, + validation_success, + calculated_entropy: 5.29, + visible: true, + is_base64: false, + }, + comment: None, + match_confidence: Confidence::Medium, + visible: true, + validation_response_body: validation_body_owned, + validation_response_status: validation_status, + validation_success, }; + (report_match, blob_path) + } + + #[test] + fn build_finding_record_uses_git_blob_path() { + let temp = tempdir().unwrap(); + let datastore = + Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf()))); + let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false }; + + let (report_match, blob_path) = + sample_report_match("Bad credentials", StatusCode::UNAUTHORIZED.as_u16(), false); + + let scan_args = sample_scan_args(); + let record = reporter.build_finding_record(&report_match, &scan_args); assert_eq!(record.finding.path, blob_path); let git_file_path = record @@ -880,6 +907,28 @@ mod tests { assert_eq!(git_file_path, "path/in/history.txt"); } + #[test] + fn skip_list_matches_surface_skip_reason() { + let temp = tempdir().unwrap(); + let datastore = + Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf()))); + let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: 
false }; + + let (report_match, _) = sample_report_match( + "(skip list entry) AWS validation not attempted for account 111122223333.", + StatusCode::CONTINUE.as_u16(), + false, + ); + let scan_args = sample_scan_args(); + + let record = reporter.build_finding_record(&report_match, &scan_args); + assert_eq!(record.finding.validation.status, "Not Attempted"); + assert_eq!( + record.finding.validation.response, + "(skip list entry) AWS validation not attempted for account 111122223333." + ); + } + use super::build_git_urls; #[test] diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 02dc20c..6435c71 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -93,6 +93,14 @@ mod tests { gitlab_repo_type: GitLabRepoType::All, gitlab_include_subgroups: false, + // Hugging Face + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), + // Gitea gitea_user: Vec::new(), gitea_organization: Vec::new(), @@ -132,6 +140,9 @@ mod tests { s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, docker_image: Vec::new(), // clone / history options @@ -163,6 +174,8 @@ mod tests { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, } diff --git a/src/reporter/pretty_format.rs b/src/reporter/pretty_format.rs index 086648c..ea31fc5 100644 --- a/src/reporter/pretty_format.rs +++ b/src/reporter/pretty_format.rs @@ -115,7 +115,7 @@ impl<'a> Display for PrettyFindingRecord<'a> { } else { writeln!(f, " |Validation....: {}", finding.validation.status)?; } - if finding.validation.status != "Not Attempted" { + if !finding.validation.response.is_empty() { writeln!(f, " |__Response....: {}", 
style_fn(&finding.validation.response))?; } writeln!(f, " |Language......: {}", finding.language)?; diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index a6e0b6a..6d98b34 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -3,7 +3,7 @@ pub(crate) use docker::save_docker_images; pub(crate) use enumerate::enumerate_filesystem_inputs; pub(crate) use repos::{ clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos, - enumerate_github_repos, + enumerate_github_repos, enumerate_huggingface_repos, }; pub use runner::{load_and_record_rules, run_async_scan, run_scan}; pub(crate) use validation::run_secret_validation; diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index fb6577a..da00dcc 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -17,10 +17,10 @@ use crate::{ commands::{github::GitCloneMode, github::GitHistoryMode, scan}, global, }, - confluence, findings_store, + confluence, findings_store, gcs, git_binary::{CloneMode, Git}, git_url::GitUrl, - gitea, github, gitlab, jira, + gitea, github, gitlab, huggingface, jira, matcher::{Match, Matcher, MatcherStats}, origin::{Origin, OriginSet}, rules_database::RulesDatabase, @@ -305,6 +305,69 @@ pub async fn enumerate_gitea_repos( Ok(repo_urls) } +pub async fn enumerate_huggingface_repos( + args: &scan::ScanArgs, + global_args: &global::GlobalArgs, +) -> Result> { + let repo_specifiers = huggingface::RepoSpecifiers { + user: args.input_specifier_args.huggingface_user.clone(), + organization: args.input_specifier_args.huggingface_organization.clone(), + model: args.input_specifier_args.huggingface_model.clone(), + dataset: args.input_specifier_args.huggingface_dataset.clone(), + space: args.input_specifier_args.huggingface_space.clone(), + exclude: args.input_specifier_args.huggingface_exclude.clone(), + }; + + let mut repo_urls = args.input_specifier_args.git_url.clone(); + if !repo_specifiers.is_empty() { + let mut progress = if global_args.use_progress() { + let style = 
+ ProgressStyle::with_template("{spinner} {msg} {human_len} [{elapsed_precise}]") + .expect("progress bar style template should compile"); + let pb = ProgressBar::new_spinner() + .with_style(style) + .with_message("Enumerating Hugging Face repositories..."); + pb.enable_steady_tick(Duration::from_millis(500)); + pb + } else { + ProgressBar::hidden() + }; + + let mut num_found: u64 = 0; + let auth = huggingface::AuthConfig::from_env(); + let repo_strings = huggingface::enumerate_repo_urls( + &repo_specifiers, + &auth, + global_args.ignore_certs, + Some(&mut progress), + ) + .await + .context("Failed to enumerate Hugging Face repositories")?; + + for repo_string in repo_strings { + match GitUrl::from_str(&repo_string) { + Ok(repo_url) => { + repo_urls.push(repo_url); + num_found += 1; + } + Err(e) => { + progress.suspend(|| { + error!("Failed to parse repo URL from {repo_string}: {e}"); + }); + } + } + } + + progress.finish_with_message(format!( + "Found {} repositories from Hugging Face", + HumanCount(num_found) + )); + } + repo_urls.sort(); + repo_urls.dedup(); + Ok(repo_urls) +} + pub async fn enumerate_bitbucket_repos( args: &scan::ScanArgs, global_args: &global::GlobalArgs, @@ -678,3 +741,81 @@ pub async fn fetch_s3_objects( Ok(()) } + +pub async fn fetch_gcs_objects( + args: &scan::ScanArgs, + datastore: &Arc>, + rules_db: &RulesDatabase, + matcher_stats: &Mutex, + enable_profiling: bool, + shared_profiler: Arc, + progress_enabled: bool, +) -> Result<()> { + let Some(bucket) = args.input_specifier_args.gcs_bucket.as_deref() else { + return Ok(()); + }; + let prefix = args.input_specifier_args.gcs_prefix.as_deref(); + let service_account = args.input_specifier_args.gcs_service_account.as_deref(); + + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let seen_blobs = BlobIdMap::new(); + let matcher = Matcher::new( + rules_db, + scanner_pool, + &seen_blobs, + Some(matcher_stats), + enable_profiling, + Some(shared_profiler.clone()), 
+ &args.extra_ignore_comments, + args.no_inline_ignore, + )?; + let mut processor = BlobProcessor { matcher }; + + let progress = if progress_enabled { + let style = + ProgressStyle::with_template("{spinner} {msg} ({pos} objects) [{elapsed_precise}]") + .expect("progress bar style template should compile"); + let pb = ProgressBar::new_spinner().with_style(style).with_message("Fetching GCS objects"); + pb.enable_steady_tick(Duration::from_millis(500)); + pb + } else { + ProgressBar::hidden() + }; + + let pb = progress.clone(); + + let bucket_name = bucket.to_string(); + + gcs::visit_bucket_objects(bucket, prefix, service_account, move |key, bytes| { + let origin = OriginSet::new( + Origin::from_extended(serde_json::json!({ + "path": format!("gs://{}/{}", bucket_name, key) + })), + Vec::new(), + ); + let blob = crate::blob::Blob::from_bytes(bytes); + + if let Some((origin, blob_md, scored_matches)) = + processor.run(origin, blob, args.no_dedup, args.redact, args.no_base64)? + { + let origin_arc = Arc::new(origin); + let blob_arc = Arc::new(blob_md); + + let mut batch = Vec::with_capacity(scored_matches.len()); + for (_score, m) in scored_matches { + batch.push((origin_arc.clone(), blob_arc.clone(), m)); + } + + let added = datastore.lock().unwrap().record(batch, !args.no_dedup); + debug!("Added {} new GCS blobs", added); + } + pb.inc(1); + Ok(()) + }) + .await?; + + let total = progress.position(); + progress.finish_with_message(format!("Fetched {} GCS objects", total)); + + Ok(()) +} diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs index 9de4a00..25dae47 100644 --- a/src/scanner/runner.rs +++ b/src/scanner/runner.rs @@ -1,4 +1,7 @@ -use std::sync::{Arc, Mutex}; +use std::{ + fs, + sync::{Arc, Mutex}, +}; use anyhow::{bail, Context, Result}; use crossbeam_skiplist::SkipMap; @@ -21,10 +24,11 @@ use crate::{ safe_list, scanner::{ clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos, - enumerate_filesystem_inputs, enumerate_github_repos, 
+ enumerate_filesystem_inputs, enumerate_github_repos, enumerate_huggingface_repos, repos::{ enumerate_gitea_repos, enumerate_gitlab_repos, fetch_confluence_pages, - fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects, fetch_slack_messages, + fetch_gcs_objects, fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects, + fetch_slack_messages, }, run_secret_validation, save_docker_images, summary::print_scan_summary, @@ -74,12 +78,14 @@ pub async fn run_async_scan( let mut repo_urls = enumerate_github_repos(args, global_args).await?; let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?; let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?; + let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?; let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?; let azure_repo_urls = enumerate_azure_repos(args, global_args).await?; // Combine repository URLs repo_urls.extend(gitlab_repo_urls); repo_urls.extend(gitea_repo_urls); + repo_urls.extend(huggingface_repo_urls); repo_urls.extend(bitbucket_repo_urls); repo_urls.extend(azure_repo_urls); repo_urls.sort(); @@ -182,8 +188,20 @@ pub async fn run_async_scan( ) .await?; - let has_s3 = args.input_specifier_args.s3_bucket.is_some(); - if input_roots.is_empty() && !has_s3 { + fetch_gcs_objects( + args, + &datastore, + rules_db, + &matcher_stats, + enable_profiling, + Arc::clone(&shared_profiler), + progress_enabled, + ) + .await?; + + let has_remote_objects = args.input_specifier_args.s3_bucket.is_some() + || args.input_specifier_args.gcs_bucket.is_some(); + if input_roots.is_empty() && !has_remote_objects { bail!("No inputs to scan"); } @@ -231,6 +249,26 @@ pub async fn run_async_scan( crate::baseline::apply_baseline(&mut ds, &path, args.manage_baseline, &input_roots)?; } + let mut skip_aws_accounts = args.skip_aws_account.clone(); + + if let Some(path) = args.skip_aws_account_file.as_ref() { + let contents = 
fs::read_to_string(path).with_context(|| { + format!("Failed to read --skip-aws-account-file {}", path.display()) + })?; + + for line in contents.lines() { + let content = line.split('#').next().unwrap_or(""); + for value in content.split(|c: char| c.is_ascii_whitespace() || c == ',' || c == ';') { + let trimmed = value.trim(); + if !trimmed.is_empty() { + skip_aws_accounts.push(trimmed.to_string()); + } + } + } + } + + crate::validation::set_skip_aws_account_ids(skip_aws_accounts); + // If validation is enabled, run it as a second phase if !args.no_validate { info!("Starting secret validation phase..."); diff --git a/src/validation.rs b/src/validation.rs index 81f97dd..b371bf0 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -95,6 +95,14 @@ pub fn init_validation_caches() { aws::set_aws_validation_concurrency(15); } +pub fn set_skip_aws_account_ids(ids: I) +where + I: IntoIterator, + S: Into, +{ + aws::set_aws_skip_account_ids(ids); +} + #[derive(Clone)] pub struct CachedResponse { pub body: String, @@ -781,6 +789,26 @@ async fn timed_validate_single_match<'a>( } } + if let Some(account_id) = aws::should_skip_aws_validation(&akid) { + m.validation_success = false; + m.validation_response_body = format!( + "(skip list entry) AWS validation not attempted for account {}.", + account_id + ); + m.validation_response_status = StatusCode::CONTINUE; + cache.insert( + cache_key, + CachedResponse { + body: m.validation_response_body.clone(), + status: m.validation_response_status, + is_valid: m.validation_success, + timestamp: Instant::now(), + }, + ); + commit_and_return(m); + return; + } + if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) { m.validation_success = false; m.validation_response_body = format!("Invalid AWS credentials ({}): {}", akid, e); diff --git a/src/validation/aws.rs b/src/validation/aws.rs index 7e1073d..156b32c 100644 --- a/src/validation/aws.rs +++ b/src/validation/aws.rs @@ -1,7 +1,7 @@ -use std::time::Duration; +use 
std::{collections::HashSet, sync::RwLock, time::Duration}; use anyhow::{anyhow, Result}; -use aws_config::{retry::RetryConfig, BehaviorVersion}; +use aws_config::{retry::RetryConfig, BehaviorVersion, SdkConfig}; use aws_credential_types::Credentials; use aws_sdk_sts::{ config::Builder as StsConfigBuilder, error::SdkError, @@ -26,8 +26,9 @@ use http::{ header::{HeaderValue, USER_AGENT}, StatusCode, }; -use once_cell::sync::OnceCell; +use once_cell::sync::{Lazy, OnceCell}; use rand::{rng, Rng}; +use regex::Regex; use tokio::{ sync::Semaphore, time::{sleep, timeout}, @@ -36,6 +37,57 @@ use tokio::{ use crate::validation::GLOBAL_USER_AGENT; static AWS_VALIDATION_SEMAPHORE: OnceCell = OnceCell::new(); +const BUILTIN_SKIP_ACCOUNT_IDS: &[&str] = &[ + "052310077262", + "171436882533", + "528757803018", + "534261010715", + "595918472158", + "729780141977", + "893192397702", + "992382622183", +]; + +static AWS_SKIP_ACCOUNT_IDS: Lazy>> = Lazy::new(|| { + let mut set = HashSet::new(); + set.extend(BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string())); + RwLock::new(set) +}); + +fn build_http_client() -> SharedHttpClient { + HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| { + let mut conn_builder = ConnectorBuilder::default() + .tls_provider(tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc)); + + conn_builder.set_connector_settings(settings.cloned()); + if let Some(components) = runtime_components { + conn_builder.set_sleep_impl(components.sleep_impl()); + } + conn_builder.set_proxy_config(Some(ProxyConfig::from_env())); + conn_builder.build() + }) +} + +async fn build_base_config(credentials: Credentials) -> SdkConfig { + let retry_config = RetryConfig::adaptive().with_max_attempts(3); + aws_config::defaults(BehaviorVersion::latest()) + .region(Region::new("us-east-1")) + .credentials_provider(credentials) + .http_client(build_http_client()) + .retry_config(retry_config) + .load() + .await +} + +fn extract_account_id(input: 
&str) -> Option { + let trimmed = input.trim(); + if trimmed.len() == 12 && trimmed.chars().all(|c| c.is_ascii_digit()) { + return Some(trimmed.to_string()); + } + + static ACCOUNT_ID_RE: Lazy = Lazy::new(|| Regex::new(r"(\d{12})").expect("valid regex")); + ACCOUNT_ID_RE.captures(trimmed).and_then(|caps| caps.get(1)).map(|m| m.as_str().to_string()) +} /// Set the maximum number of concurrent AWS validations. Call before first use. pub fn set_aws_validation_concurrency(max: usize) { @@ -46,6 +98,46 @@ fn aws_validation_semaphore() -> &'static Semaphore { AWS_VALIDATION_SEMAPHORE.get_or_init(|| Semaphore::new(15)) } +pub fn set_aws_skip_account_ids(ids: I) +where + I: IntoIterator, + S: Into, +{ + let mut guard = match AWS_SKIP_ACCOUNT_IDS.write() { + Ok(g) => g, + Err(poisoned) => poisoned.into_inner(), + }; + guard.clear(); + + guard.extend(BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string())); + + for raw in ids.into_iter() { + let value = raw.into(); + if value.trim().is_empty() { + continue; + } + if let Some(normalized) = extract_account_id(&value) { + guard.insert(normalized); + } else { + tracing::warn!("Ignoring invalid AWS account ID in skip list: {value}"); + } + } +} + +pub fn should_skip_aws_validation(access_key_id: &str) -> Option { + let guard = AWS_SKIP_ACCOUNT_IDS.read().ok()?; + if guard.is_empty() { + return None; + } + + let account = aws_key_to_account_number(access_key_id).ok()?; + if guard.contains(&account) { + Some(account) + } else { + None + } +} + #[derive(Debug)] struct UaInterceptor; @@ -132,29 +224,7 @@ pub async fn validate_aws_credentials( None, // expiry "static", // provider name ); - // Create HTTP client that respects proxy settings from the environment - let http_client: SharedHttpClient = - HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| { - let mut conn_builder = ConnectorBuilder::default() - .tls_provider(tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc)); - - 
conn_builder.set_connector_settings(settings.cloned()); - if let Some(components) = runtime_components { - conn_builder.set_sleep_impl(components.sleep_impl()); - } - conn_builder.set_proxy_config(Some(ProxyConfig::from_env())); - conn_builder.build() - }); - - // Create AWS config with adaptive retries - let retry_config = RetryConfig::adaptive().with_max_attempts(3); - let config = aws_config::defaults(BehaviorVersion::latest()) - .region(Region::new("us-east-1")) - .credentials_provider(credentials) - .http_client(http_client) - .retry_config(retry_config) - .load() - .await; + let config = build_base_config(credentials).await; // Create STS client let sts_config = StsConfigBuilder::from(&config).interceptor(UaInterceptor).build(); @@ -230,3 +300,74 @@ pub fn aws_key_to_account_number(aws_key_id: &str) -> Result> = Lazy::new(|| Mutex::new(())); + + #[test] + fn skip_account_list_normalizes_inputs() { + let _lock = TEST_GUARD.lock().unwrap(); + + set_aws_skip_account_ids([ + " 052310077262 ", + "arn:aws:iam::171436882533:role/demo", + "invalid", + ]); + + let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap(); + assert!(guard.contains("052310077262")); + assert!(guard.contains("171436882533")); + assert_eq!(guard.len(), BUILTIN_SKIP_ACCOUNT_IDS.len()); + drop(guard); + + set_aws_skip_account_ids(Vec::::new()); + } + + #[test] + fn should_skip_when_account_matches() { + let _lock = TEST_GUARD.lock().unwrap(); + + set_aws_skip_account_ids(["534261010715"]); + assert_eq!( + should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"), + Some("534261010715".to_string()) + ); + + set_aws_skip_account_ids(Vec::::new()); + } + + #[test] + fn builtin_canary_accounts_are_preseeded() { + let _lock = TEST_GUARD.lock().unwrap(); + + set_aws_skip_account_ids(Vec::::new()); + assert_eq!( + should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"), + Some("534261010715".to_string()) + ); + + set_aws_skip_account_ids(Vec::::new()); + } + + #[test] + fn duplicate_accounts_are_deduplicated() { + let 
_lock = TEST_GUARD.lock().unwrap(); + + set_aws_skip_account_ids([ + "534261010715", + "arn:aws:iam::534261010715:user/canarytokens", + " 534261010715 ", + ]); + + let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap(); + assert_eq!(guard.iter().filter(|id| id.as_str() == "534261010715").count(), 1); + drop(guard); + + set_aws_skip_account_ids(Vec::::new()); + } +} diff --git a/src/validation/utils.rs b/src/validation/utils.rs index ea0b020..9e8422d 100644 --- a/src/validation/utils.rs +++ b/src/validation/utils.rs @@ -33,31 +33,77 @@ pub fn find_closest_variable( target_variable_name: &str, search_variable_name: &str, ) -> Option { - // Find positions of the target variable with the target value + // Collect the positions of the target variable for the provided value so we can + // compare relative offsets with candidate variables. let mut target_positions = Vec::new(); for (name, value, start, end) in captures { if name == target_variable_name && value == target_value { target_positions.push((*start, *end)); } } + if target_positions.is_empty() { return None; } - // For each target position, find the closest search variable - let mut closest_distance = usize::MAX; - let mut closest_value: Option = None; - for (_target_start, target_end) in target_positions { - for (name, value, start, _) in captures { - if name == search_variable_name { - let distance = (*start as isize - target_end as isize).abs() as usize; - if distance < closest_distance { - closest_distance = distance; - closest_value = Some(value.clone()); + + // Prefer candidates that appear before the target value (same logical block), but + // fall back to overlapping values and then to those that appear after the target + // value when no better match exists. This avoids pairing with the next block when + // multiple credentials are close together in the same file. 
+ let mut best_before: Option<(usize, String)> = None; + let mut best_overlap: Option<(usize, String)> = None; + let mut best_after: Option<(usize, String)> = None; + + for (target_start, target_end) in target_positions.iter().copied() { + for (name, value, start, end) in captures { + if name != search_variable_name { + continue; + } + + if *end <= target_start { + // Candidate is before the target; choose the one closest to the target start. + let distance = target_start - *end; + match &mut best_before { + Some((best_distance, best_value)) if distance < *best_distance => { + *best_distance = distance; + *best_value = value.clone(); + } + None => { + best_before = Some((distance, value.clone())); + } + _ => {} + } + } else if *start >= target_end { + // Candidate is after the target; choose the one closest to the target end. + let distance = *start - target_end; + match &mut best_after { + Some((best_distance, best_value)) if distance < *best_distance => { + *best_distance = distance; + *best_value = value.clone(); + } + None => { + best_after = Some((distance, value.clone())); + } + _ => {} + } + } else { + // Candidate overlaps the target – treat as an exact match. 
+ let distance = 0usize; + match &mut best_overlap { + Some((best_distance, best_value)) if distance < *best_distance => { + *best_distance = distance; + *best_value = value.clone(); + } + None => { + best_overlap = Some((distance, value.clone())); + } + _ => {} } } } } - closest_value + + best_before.or(best_overlap).or(best_after).map(|(_, value)| value) } pub async fn check_url_resolvable(url: &Url) -> Result<(), Box> { @@ -147,4 +193,31 @@ mod tests { ] ); } + + #[test] + fn prefers_closest_preceding_variable() { + let captures = vec![ + ("TOKEN".to_string(), "secret".to_string(), 75usize, 115usize), + ("AKID".to_string(), "preceding".to_string(), 30usize, 50usize), + ("AKID".to_string(), "following".to_string(), 180usize, 200usize), + ]; + + let result = + find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap(); + + assert_eq!(result, "preceding".to_string()); + } + + #[test] + fn falls_back_to_following_when_no_preceding() { + let captures = vec![ + ("TOKEN".to_string(), "secret".to_string(), 10usize, 50usize), + ("AKID".to_string(), "after".to_string(), 60usize, 80usize), + ]; + + let result = + find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap(); + + assert_eq!(result, "after".to_string()); + } } diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs index 3947548..54379a3 100644 --- a/tests/int_allowlist.rs +++ b/tests/int_allowlist.rs @@ -72,6 +72,12 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result, skip_skipword: Vec) -> Result, skip_skipword: Vec) -> Result Result<()> { gitlab_repo_type: GitLabRepoType::Owner, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), + gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -102,6 +109,9 @@ fn 
test_bitbucket_remote_scan() -> Result<()> { s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, docker_image: Vec::new(), git_clone: GitCloneMode::Bare, git_history: GitHistoryMode::Full, @@ -131,6 +141,8 @@ fn test_bitbucket_remote_scan() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index e86c589..2f1998c 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -85,6 +85,13 @@ rules: gitlab_repo_type: GitLabRepoType::Owner, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), + gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -120,6 +127,9 @@ rules: s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, // Docker image scanning docker_image: Vec::new(), // git clone / history options @@ -151,6 +161,8 @@ rules: manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, diff --git a/tests/int_github.rs b/tests/int_github.rs index dcf936a..82a0f78 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -72,6 +72,13 @@ fn test_github_remote_scan() -> Result<()> { gitlab_repo_type: GitLabRepoType::Owner, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + 
huggingface_exclude: Vec::new(), + gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -107,6 +114,9 @@ fn test_github_remote_scan() -> Result<()> { s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, // Docker image scanning docker_image: Vec::new(), // git clone / history options @@ -138,6 +148,8 @@ fn test_github_remote_scan() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index 7c2e0b9..745f323 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -71,6 +71,13 @@ fn test_gitlab_remote_scan() -> Result<()> { gitlab_repo_type: GitLabRepoType::Owner, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), + gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -106,6 +113,9 @@ fn test_gitlab_remote_scan() -> Result<()> { s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, // Docker image scanning docker_image: Vec::new(), git_clone: GitCloneMode::Bare, @@ -137,6 +147,8 @@ fn test_gitlab_remote_scan() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, }; @@ -210,6 +222,13 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { gitlab_repo_type: GitLabRepoType::Owner, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: 
Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), + gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -252,6 +271,9 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { scan_nested_repos: true, since_commit: None, branch: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, @@ -273,6 +295,8 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, diff --git a/tests/int_redact.rs b/tests/int_redact.rs index 7d7accb..a570cfe 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -55,6 +55,12 @@ async fn test_redact_hashes_finding_values() -> Result<()> { gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(), gitlab_repo_type: GitLabRepoType::Owner, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -86,6 +92,9 @@ async fn test_redact_hashes_finding_values() -> Result<()> { s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, docker_image: Vec::new(), git_clone: GitCloneMode::Bare, git_history: GitHistoryMode::Full, @@ -115,6 +124,8 @@ async fn test_redact_hashes_finding_values() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, 
diff --git a/tests/int_slack.rs b/tests/int_slack.rs index 5f2d01a..dd9df68 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -62,6 +62,13 @@ impl TestContext { gitlab_repo_type: GitLabRepoType::Owner, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), + gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -92,6 +99,9 @@ impl TestContext { s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, max_results: 10, docker_image: Vec::new(), git_clone: GitCloneMode::Bare, @@ -123,6 +133,8 @@ impl TestContext { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, }; @@ -186,6 +198,13 @@ async fn test_scan_slack_messages() -> Result<()> { gitlab_repo_type: GitLabRepoType::Owner, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), + gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -218,6 +237,9 @@ async fn test_scan_slack_messages() -> Result<()> { s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, docker_image: Vec::new(), git_clone: GitCloneMode::Bare, git_history: GitHistoryMode::Full, @@ -247,6 +269,8 @@ async fn test_scan_slack_messages() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, extra_ignore_comments: Vec::new(), 
no_inline_ignore: false, diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index 74cf6e4..ba81576 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -128,6 +128,13 @@ async fn test_validation_cache_and_depvars() -> Result<()> { gitlab_repo_type: GitLabRepoType::Owner, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), + gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -163,6 +170,9 @@ async fn test_validation_cache_and_depvars() -> Result<()> { s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, // Docker image scanning docker_image: Vec::new(), // git clone / history options @@ -194,6 +204,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 464377a..5dec6b5 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -71,6 +71,13 @@ impl TestContext { gitlab_repo_type: GitLabRepoType::Owner, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), + gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -106,6 +113,9 @@ impl TestContext { s3_prefix: None, role_arn: None, aws_local_profile: None, + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, // Docker image 
scanning docker_image: Vec::new(), // git clone / history options @@ -137,6 +147,8 @@ impl TestContext { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, @@ -183,6 +195,13 @@ impl TestContext { gitlab_repo_type: GitLabRepoType::Owner, gitlab_include_subgroups: false, + huggingface_user: Vec::new(), + huggingface_organization: Vec::new(), + huggingface_model: Vec::new(), + huggingface_dataset: Vec::new(), + huggingface_space: Vec::new(), + huggingface_exclude: Vec::new(), + gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), @@ -228,6 +247,10 @@ impl TestContext { scan_nested_repos: true, since_commit: None, branch: None, + + gcs_bucket: None, + gcs_prefix: None, + gcs_service_account: None, }, extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { @@ -250,6 +273,8 @@ impl TestContext { manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), + skip_aws_account: Vec::new(), + skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, };