- Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans.

- Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs.
- Added support for scanning Google Cloud Storage buckets via --gcs-bucket, including optional prefixes and service-account authentication.
- Added --skip-aws-account (now accepting comma-separated values) and --skip-aws-account-file to bypass live AWS validation for known canary/honey-token account IDs without triggering alerts. Kingfisher now ships with several canary AWS account IDs pre-seeded in the skip list and reports matching findings as "Not Attempted" with the "Response" containing "(skip list entry)" so it's clear that validation was intentionally skipped and why.
This commit is contained in:
Mick Grove 2025-10-15 22:47:40 -07:00
commit 8aced005b8
33 changed files with 1939 additions and 182 deletions

View file

@ -0,0 +1,73 @@
use clap::{Args, Subcommand};
use crate::cli::commands::output::OutputArgs;
use super::github::GitHubOutputFormat;
// Argument container for the Hugging Face command group; it only carries the
// selected subcommand. Plain `//` comments are used here because `///` doc
// comments on clap derive items are picked up as help text.
/// Top-level Hugging Face command group
#[derive(Args, Debug)]
pub struct HuggingFaceArgs {
// The subcommand chosen beneath the Hugging Face command group.
#[command(subcommand)]
pub command: HuggingFaceCommand,
}
// Subcommands available directly beneath the Hugging Face command group.
// `Repos` is itself a nested subcommand group (see `#[command(subcommand)]`).
#[derive(Subcommand, Debug)]
pub enum HuggingFaceCommand {
/// Interact with Hugging Face repositories
#[command(subcommand)]
Repos(HuggingFaceReposCommand),
}
// Actions available beneath the `repos` subcommand group; currently only
// listing is defined.
#[derive(Subcommand, Debug)]
pub enum HuggingFaceReposCommand {
/// List Hugging Face repositories
List(HuggingFaceReposListArgs),
}
// Arguments accepted by the repository listing command: the repository
// selectors plus the shared output options, both flattened into this command.
#[derive(Args, Debug, Clone)]
pub struct HuggingFaceReposListArgs {
// Which users, organizations, and individual repositories to enumerate.
#[command(flatten)]
pub repo_specifiers: HuggingFaceRepoSpecifiers,
// Output options, parameterized with the Hugging Face output format.
#[command(flatten)]
pub output_args: OutputArgs<HuggingFaceOutputFormat>,
}
// Command-line selectors describing which Hugging Face repositories to
// enumerate. Every flag may be given multiple times (each field is a `Vec`).
// `exclude` only filters results; `is_empty` below does not consult it.
#[derive(Args, Debug, Clone, Default)]
pub struct HuggingFaceRepoSpecifiers {
/// Models, datasets, and Spaces owned by these users
#[arg(long = "huggingface-user")]
pub user: Vec<String>,
/// Models, datasets, and Spaces owned by these organizations
#[arg(long = "huggingface-organization", alias = "huggingface-org")]
pub organization: Vec<String>,
/// Specific models to scan (format: owner/name or full URL)
#[arg(long = "huggingface-model")]
pub model: Vec<String>,
/// Specific datasets to scan (format: owner/name or full URL)
#[arg(long = "huggingface-dataset")]
pub dataset: Vec<String>,
/// Specific Spaces to scan (format: owner/name or full URL)
#[arg(long = "huggingface-space")]
pub space: Vec<String>,
/// Skip specific repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
#[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
pub exclude: Vec<String>,
}
impl HuggingFaceRepoSpecifiers {
    /// Reports whether no repository selector was supplied.
    ///
    /// Only the selecting lists (users, organizations, models, datasets, and
    /// Spaces) are inspected. The `exclude` list is deliberately left out: an
    /// exclusion on its own selects nothing.
    pub fn is_empty(&self) -> bool {
        let selectors =
            [&self.user, &self.organization, &self.model, &self.dataset, &self.space];
        selectors.iter().all(|values| values.is_empty())
    }
}
/// Output format for Hugging Face listings; an alias of the GitHub output format type.
pub type HuggingFaceOutputFormat = GitHubOutputFormat;

View file

@ -28,6 +28,11 @@ pub struct InputSpecifierArgs {
"gitlab_group",
"gitea_user",
"gitea_organization",
"huggingface_user",
"huggingface_organization",
"huggingface_model",
"huggingface_dataset",
"huggingface_space",
"bitbucket_user",
"bitbucket_workspace",
"bitbucket_project",
@ -43,7 +48,8 @@ pub struct InputSpecifierArgs {
"confluence_url",
"docker_image",
"slack_query",
"s3_bucket"
"s3_bucket",
"gcs_bucket"
]),
num_args = 0..,
value_hint = ValueHint::AnyPath
@ -120,6 +126,30 @@ pub struct InputSpecifierArgs {
#[arg(long, alias = "include-subgroups")]
pub gitlab_include_subgroups: bool,
/// Scan models, datasets, and Spaces belonging to the specified Hugging Face users
#[arg(long = "huggingface-user")]
pub huggingface_user: Vec<String>,
/// Scan models, datasets, and Spaces belonging to the specified Hugging Face organizations
#[arg(long = "huggingface-organization", alias = "huggingface-org")]
pub huggingface_organization: Vec<String>,
/// Scan a specific Hugging Face model (format: owner/name or full URL)
#[arg(long = "huggingface-model")]
pub huggingface_model: Vec<String>,
/// Scan a specific Hugging Face dataset (format: owner/name or full URL)
#[arg(long = "huggingface-dataset")]
pub huggingface_dataset: Vec<String>,
/// Scan a specific Hugging Face Space (format: owner/name or full URL)
#[arg(long = "huggingface-space")]
pub huggingface_space: Vec<String>,
/// Skip specific Hugging Face repositories during enumeration (accepts optional prefixes like model:, dataset:, or space:)
#[arg(long = "huggingface-exclude", value_name = "IDENTIFIER")]
pub huggingface_exclude: Vec<String>,
// Gitea Options
/// Scan repositories belonging to the specified Gitea user
#[arg(long)]
@ -256,6 +286,18 @@ pub struct InputSpecifierArgs {
#[arg(long, requires = "s3_bucket")]
pub aws_local_profile: Option<String>,
/// Scan the specified Google Cloud Storage bucket
#[arg(long)]
pub gcs_bucket: Option<String>,
/// Optional prefix within the GCS bucket
#[arg(long, requires = "gcs_bucket")]
pub gcs_prefix: Option<String>,
/// Path to a service account JSON file for GCS authentication
#[arg(long, value_hint = ValueHint::FilePath, requires = "gcs_bucket")]
pub gcs_service_account: Option<PathBuf>,
/// Docker/OCI images to scan (no local Docker required)
#[arg(long = "docker-image")]
pub docker_image: Vec<String>,
@ -299,7 +341,6 @@ pub struct ContentFilteringArgs {
#[arg(
long = "max-file-size",
visible_alias = "max-filesize", // also show in --help
// alias = "max-filesize", // use this instead if you DONT want it shown in --help
default_value_t = 256.0,
value_name = "MB"
)]

View file

@ -3,6 +3,7 @@ pub mod bitbucket;
pub mod gitea;
pub mod github;
pub mod gitlab;
pub mod huggingface;
pub mod inputs;
pub mod output;
pub mod rules;

View file

@ -1,4 +1,5 @@
use clap::{Args, ValueEnum};
use std::path::PathBuf;
use strum::Display;
use tracing::debug;
@ -119,6 +120,14 @@ pub struct ScanArgs {
#[arg(long = "skip-word", value_name = "WORD")]
pub skip_word: Vec<String>,
/// AWS account IDs whose findings should skip live credential validation (repeatable)
#[arg(long = "skip-aws-account", value_name = "ACCOUNT_ID", value_delimiter = ',')]
pub skip_aws_account: Vec<String>,
/// File containing AWS account IDs to skip (one per line, `#` comments ignored)
#[arg(long = "skip-aws-account-file", value_name = "FILE")]
pub skip_aws_account_file: Option<PathBuf>,
/// Additional inline ignore directives to recognise (repeatable)
#[arg(long = "ignore-comment", value_name = "DIRECTIVE")]
pub extra_ignore_comments: Vec<String>,

View file

@ -8,7 +8,7 @@ use tracing::Level;
use crate::cli::commands::{
azure::AzureArgs, bitbucket::BitbucketArgs, gitea::GiteaArgs, github::GitHubArgs,
gitlab::GitLabArgs, rules::RulesArgs, scan::ScanArgs,
gitlab::GitLabArgs, huggingface::HuggingFaceArgs, rules::RulesArgs, scan::ScanArgs,
};
#[deny(missing_docs)]
@ -81,6 +81,10 @@ pub enum Command {
#[command(name = "azure")]
Azure(AzureArgs),
/// Interact with the Hugging Face Hub
#[command(name = "huggingface")]
HuggingFace(HuggingFaceArgs),
/// Manage rules
#[command(alias = "rule")]
Rules(RulesArgs),

87
src/gcs.rs Normal file
View file

@ -0,0 +1,87 @@
use std::path::Path;
use anyhow::{Context, Result};
use gcloud_storage::{
client::{Client, ClientConfig},
http::objects::{
download::Range,
get::GetObjectRequest,
list::{ListObjectsRequest, ListObjectsResponse},
},
};
use tracing::debug;
/// Walk a GCS bucket and hand every object to `visitor`.
///
/// * `bucket` - name of the bucket to list.
/// * `prefix` - when present, forwarded as the listing prefix.
/// * `service_account_path` - when present, exported through the
///   `GOOGLE_APPLICATION_CREDENTIALS` environment variable before the client
///   is configured. Note that this changes the environment of the whole process.
/// * `visitor` - called once per object with the object name and its
///   downloaded bytes; the first error it returns stops the walk.
///
/// Authentication is attempted via Application Default Credentials. If that
/// fails while credentials were explicitly supplied (either through
/// `service_account_path` or an already-set `GOOGLE_APPLICATION_CREDENTIALS`),
/// the failure is returned. Otherwise the client falls back to anonymous
/// access so public buckets can still be scanned.
///
/// # Errors
///
/// Returns an error when authentication with explicit credentials fails, when
/// a listing or download request fails, or when `visitor` fails.
pub async fn visit_bucket_objects<F>(
    bucket: &str,
    prefix: Option<&str>,
    service_account_path: Option<&Path>,
    mut visitor: F,
) -> Result<()>
where
    F: FnMut(String, Vec<u8>) -> Result<()>,
{
    if let Some(path) = service_account_path {
        std::env::set_var("GOOGLE_APPLICATION_CREDENTIALS", path);
    }

    let config = match ClientConfig::default().with_auth().await {
        Ok(authenticated) => authenticated,
        Err(err) => {
            // Only degrade to anonymous access when the caller never asked for credentials.
            let credentials_expected = service_account_path.is_some()
                || std::env::var("GOOGLE_APPLICATION_CREDENTIALS").is_ok();
            if credentials_expected {
                return Err(err)
                    .context("Failed to authenticate with GCS using provided credentials");
            }
            debug!("Falling back to anonymous GCS access: {err}");
            ClientConfig::default().anonymous()
        }
    };
    let client = Client::new(config);

    // Continuation token for the next listing page; `None` requests the first page.
    let mut next_token: Option<String> = None;
    loop {
        let listing_request = ListObjectsRequest {
            bucket: bucket.to_string(),
            prefix: prefix.map(|p| p.to_string()),
            page_token: next_token.take(),
            ..ListObjectsRequest::default()
        };
        let mut listing: ListObjectsResponse = client
            .list_objects(&listing_request)
            .await
            .with_context(|| format!("Failed to list objects in bucket {bucket}"))?;

        for object in listing.items.take().unwrap_or_default() {
            // Entries without a name cannot be fetched; skip them.
            if object.name.is_empty() {
                continue;
            }
            let download_request = GetObjectRequest {
                bucket: bucket.to_string(),
                object: object.name.clone(),
                ..GetObjectRequest::default()
            };
            let data = client
                .download_object(&download_request, &Range::default())
                .await
                .with_context(|| format!("Failed to fetch object {}", object.name))?;
            visitor(object.name, data)?;
        }

        // A missing or empty token marks the final page.
        next_token = listing.next_page_token.filter(|token| !token.is_empty());
        if next_token.is_none() {
            break;
        }
    }
    Ok(())
}

View file

@ -40,6 +40,15 @@ const AZURE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_azcreds() {
fi
}; _azcreds"#;
/// Git configuration value (`credential.helper=...`) defining an inline shell
/// credential helper for Hugging Face.
///
/// The helper reads `KF_HUGGINGFACE_TOKEN`. When the token is non-empty it
/// prints a username (`KF_HUGGINGFACE_USERNAME`, defaulting to `hf_user`) and
/// the token as the password; when the token is empty it prints nothing.
const HUGGINGFACE_CREDENTIAL_HELPER: &str = r#"credential.helper=!_hfcreds() {
token="$KF_HUGGINGFACE_TOKEN";
if [ -n "$token" ]; then
user="${KF_HUGGINGFACE_USERNAME:-hf_user}";
echo username="$user";
echo password="$token";
fi
}; _hfcreds"#;
/// Represents errors that can occur when interacting with the `git` CLI.
#[derive(Debug, thiserror::Error)]
pub enum GitError {
@ -47,14 +56,37 @@ pub enum GitError {
IOError(#[from] std::io::Error),
#[error(
"git execution failed\ncode={}\nstdout=```\n{}```\nstderr=```\n{}```",
.status,
String::from_utf8_lossy(.stdout),
String::from_utf8_lossy(.stderr)
"git execution failed (status: {status}){summary}",
status = format_exit_status(.status),
summary = format_git_error_summary(.stdout.as_slice(), .stderr.as_slice())
)]
GitError { stdout: Vec<u8>, stderr: Vec<u8>, status: ExitStatus },
}
/// Renders an exit status for error messages.
///
/// Returns the bare numeric exit code when the process reported one;
/// otherwise falls back to the status' own `Display` output.
fn format_exit_status(status: &ExitStatus) -> String {
    match status.code() {
        Some(code) => code.to_string(),
        None => status.to_string(),
    }
}
fn format_git_error_summary(stdout: &[u8], stderr: &[u8]) -> String {
let mut messages = Vec::new();
if let Some(line) = summarize_output(stderr) {
messages.push(line);
}
if let Some(line) = summarize_output(stdout) {
messages.push(line);
}
if messages.is_empty() {
String::new()
} else {
format!(": {}", messages.join(" | "))
}
}
/// Returns the first non-blank line of `output`, trimmed of surrounding
/// whitespace, or `None` when every line is blank.
///
/// Invalid UTF-8 is decoded lossily before the lines are inspected.
fn summarize_output(output: &[u8]) -> Option<String> {
    let decoded = String::from_utf8_lossy(output);
    for raw_line in decoded.lines() {
        let candidate = raw_line.trim();
        if !candidate.is_empty() {
            return Some(candidate.to_string());
        }
    }
    None
}
/// A helper struct for running `git` commands.
///
/// It supports optional GitHub, GitLab, Gitea, and Bitbucket credentials passed via
@ -91,6 +123,8 @@ impl Git {
let has_azure_token = ["KF_AZURE_TOKEN", "KF_AZURE_PAT"]
.iter()
.any(|key| matches!(std::env::var(key), Ok(value) if !value.is_empty()));
let has_huggingface_token =
matches!(std::env::var("KF_HUGGINGFACE_TOKEN"), Ok(value) if !value.is_empty());
// If credentials are provided via environment variables, clear existing helpers first.
if has_github_token
@ -98,6 +132,7 @@ impl Git {
|| has_gitea_token
|| has_bitbucket_credentials
|| has_azure_token
|| has_huggingface_token
{
credentials.push("-c".into());
credentials.push(r#"credential.helper="#.into());
@ -136,6 +171,11 @@ impl Git {
credentials.push(AZURE_CREDENTIAL_HELPER.into());
}
if has_huggingface_token {
credentials.push("-c".into());
credentials.push(HUGGINGFACE_CREDENTIAL_HELPER.into());
}
Self { credentials, ignore_certs }
}

636
src/huggingface.rs Normal file
View file

@ -0,0 +1,636 @@
use std::{collections::HashSet, env, time::Duration};
use anyhow::{anyhow, Result};
use indicatif::{ProgressBar, ProgressStyle};
use reqwest::{header::LINK, StatusCode, Url};
use serde::Deserialize;
use serde_json::Value;
use tracing::{debug, warn};
use crate::{git_url::GitUrl, validation::GLOBAL_USER_AGENT};
/// Selection of Hugging Face repositories to enumerate.
#[derive(Debug, Clone, Default)]
pub struct RepoSpecifiers {
    /// Users whose models, datasets, and Spaces are enumerated.
    pub user: Vec<String>,
    /// Organizations whose models, datasets, and Spaces are enumerated.
    pub organization: Vec<String>,
    /// Explicit model identifiers.
    pub model: Vec<String>,
    /// Explicit dataset identifiers.
    pub dataset: Vec<String>,
    /// Explicit Space identifiers.
    pub space: Vec<String>,
    /// Identifiers removed from the enumerated result.
    pub exclude: Vec<String>,
}

impl RepoSpecifiers {
    /// Reports whether nothing was selected.
    ///
    /// `exclude` is intentionally ignored: exclusions alone do not select
    /// any repository.
    pub fn is_empty(&self) -> bool {
        let selectors =
            [&self.user, &self.organization, &self.model, &self.dataset, &self.space];
        selectors.iter().all(|values| values.is_empty())
    }
}
/// Credentials used for Hugging Face API requests.
#[derive(Clone, Default)]
pub struct AuthConfig {
    /// Access token sent as a bearer credential; `None` means unauthenticated requests.
    token: Option<String>,
}

// `Debug` is written by hand so the full token never reaches logs: only its
// first four characters are shown.
impl std::fmt::Debug for AuthConfig {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let token_prefix: Option<String> =
            self.token.as_deref().map(|token| token.chars().take(4).collect());
        f.debug_struct("AuthConfig").field("token", &token_prefix).finish()
    }
}

impl AuthConfig {
    /// Builds the configuration from the `KF_HUGGINGFACE_TOKEN` environment variable.
    ///
    /// The value is trimmed before it is stored, so surrounding whitespace or a
    /// trailing newline never ends up inside the `Authorization` header. An
    /// unset, empty, or whitespace-only variable yields a configuration without
    /// a token.
    pub fn from_env() -> Self {
        let token = env::var("KF_HUGGINGFACE_TOKEN")
            .ok()
            .map(|raw| raw.trim().to_owned())
            .filter(|token| !token.is_empty());
        Self { token }
    }

    /// Attaches the bearer token to `request` when one is configured;
    /// otherwise returns the request unchanged.
    fn apply(&self, request: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
        match &self.token {
            Some(token) => request.bearer_auth(token),
            None => request,
        }
    }

    /// Reports whether a token is configured.
    fn has_token(&self) -> bool {
        self.token.is_some()
    }
}
/// The three repository flavours handled by this module.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
enum ResourceKind {
    Model,
    Dataset,
    Space,
}

impl ResourceKind {
    /// Path segment joined onto the API base URL when listing this kind.
    fn api_path(self) -> &'static str {
        match self {
            Self::Model => "models",
            Self::Dataset => "datasets",
            Self::Space => "spaces",
        }
    }

    /// Clone URL for the repository identified by `slug`.
    ///
    /// Models live at the site root, while datasets and Spaces are nested
    /// under `datasets/` and `spaces/` respectively.
    fn git_url(self, slug: &str) -> String {
        let namespace = match self {
            Self::Model => "",
            Self::Dataset => "datasets/",
            Self::Space => "spaces/",
        };
        format!("https://huggingface.co/{namespace}{slug}.git")
    }

    /// Prefix used when building canonical `kind:slug` keys.
    fn canonical_prefix(self) -> &'static str {
        match self {
            Self::Model => "model",
            Self::Dataset => "dataset",
            Self::Space => "space",
        }
    }

    /// Singular noun used in log messages.
    fn display_name_singular(self) -> &'static str {
        match self {
            Self::Model => "model",
            Self::Dataset => "dataset",
            Self::Space => "space",
        }
    }

    /// Plural noun used in log and progress messages.
    fn display_name_plural(self) -> &'static str {
        match self {
            Self::Model => "models",
            Self::Dataset => "datasets",
            Self::Space => "spaces",
        }
    }
}
/// A single repository: its kind together with its `owner/name` slug.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
struct ResourceRef {
    kind: ResourceKind,
    slug: String,
}

impl ResourceRef {
    /// Pairs a kind with an already-parsed slug.
    fn new(kind: ResourceKind, slug: String) -> Self {
        ResourceRef { kind, slug }
    }

    /// Case-insensitive identity of the form `kind:slug`, used for de-duplication.
    fn canonical_key(&self) -> String {
        let prefix = self.kind.canonical_prefix();
        let lowered_slug = self.slug.to_lowercase();
        format!("{}:{}", prefix, lowered_slug)
    }

    /// Clone URL for this repository; the slug's original casing is kept.
    fn git_url(&self) -> String {
        let ResourceRef { kind, slug } = self;
        kind.git_url(slug)
    }
}
/// One entry of a Hugging Face listing response.
///
/// The enum is untagged, so variants are tried in declaration order: an entry
/// carrying an `id` field is read as `Id`, and `modelId` is only consulted
/// when `id` did not match.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum HuggingFaceItem {
    Id {
        id: String,
    },
    ModelId {
        #[serde(rename = "modelId")]
        model_id: String,
    },
}

impl HuggingFaceItem {
    /// Consumes the entry and returns whichever identifier it carried.
    fn into_identifier(self) -> String {
        match self {
            Self::Id { id: identifier } | Self::ModelId { model_id: identifier } => identifier,
        }
    }
}
#[derive(Default)]
struct ExcludeSet {
typed: HashSet<String>,
untyped: HashSet<String>,
}
impl ExcludeSet {
fn from_list(values: &[String]) -> Self {
let mut typed = HashSet::new();
let mut untyped = HashSet::new();
for raw in values {
let trimmed = raw.trim();
if trimmed.is_empty() {
continue;
}
if let Some((prefix, rest)) = trimmed.split_once(':') {
match normalize_kind(prefix) {
Some(kind) => {
if let Some(slug) = parse_slug_for_kind(kind, rest) {
typed.insert(format!(
"{}:{}",
kind.canonical_prefix(),
slug.to_lowercase()
));
} else {
warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
}
}
None => warn!("Ignoring invalid Hugging Face exclusion '{raw}' (unknown type)"),
}
} else if let Some(slug) = normalize_untyped_slug(trimmed) {
untyped.insert(slug);
} else {
warn!("Ignoring invalid Hugging Face exclusion '{raw}' (expected owner/name)");
}
}
Self { typed, untyped }
}
fn should_exclude(&self, kind: ResourceKind, slug: &str) -> bool {
let typed_key = format!("{}:{}", kind.canonical_prefix(), slug.to_lowercase());
if self.typed.contains(&typed_key) {
return true;
}
self.untyped.contains(&slug.to_lowercase())
}
}
/// Maps a user-supplied kind label to a [`ResourceKind`].
///
/// Surrounding whitespace is ignored and matching is ASCII case-insensitive;
/// both singular and plural spellings are accepted. Unknown labels yield `None`.
fn normalize_kind(raw: &str) -> Option<ResourceKind> {
    const ALIASES: [(&str, ResourceKind); 6] = [
        ("model", ResourceKind::Model),
        ("models", ResourceKind::Model),
        ("dataset", ResourceKind::Dataset),
        ("datasets", ResourceKind::Dataset),
        ("space", ResourceKind::Space),
        ("spaces", ResourceKind::Space),
    ];

    let label = raw.trim();
    ALIASES
        .iter()
        .find(|(alias, _)| alias.eq_ignore_ascii_case(label))
        .map(|&(_, kind)| kind)
}
/// Normalizes a kind-agnostic identifier such as `owner/name` or
/// `datasets/owner/name` into a lower-cased `owner/name` slug.
///
/// The input is split on `/` with empty segments dropped, then handed to
/// `normalize_untyped_segments`. Returns `None` for blank or malformed input.
fn normalize_untyped_slug(raw: &str) -> Option<String> {
    let candidate = raw.trim();
    if candidate.is_empty() {
        return None;
    }

    let pieces = candidate.split('/').filter(|piece| !piece.is_empty()).collect::<Vec<&str>>();
    normalize_untyped_segments(&pieces)
}
/// Builds a lower-cased `owner/name` slug from path segments, regardless of kind.
///
/// A leading kind segment (`models`, `datasets`, `spaces`, or their singular
/// forms, in any ASCII case) is dropped first. The first remaining segment is
/// the owner; everything after it is joined with `/` to form the name, with
/// any trailing `.git` removed. Returns `None` when fewer than two segments
/// remain or when the owner or name is blank.
fn normalize_untyped_segments(segments: &[&str]) -> Option<String> {
    const KIND_SEGMENTS: [&str; 6] =
        ["models", "model", "datasets", "dataset", "spaces", "space"];

    let leading = segments.first()?.trim();
    let remaining = if KIND_SEGMENTS.iter().any(|kind| kind.eq_ignore_ascii_case(leading)) {
        &segments[1..]
    } else {
        segments
    };

    let (owner, name_segments) = remaining.split_first()?;
    if name_segments.is_empty() {
        return None;
    }

    let owner = owner.trim();
    let joined = name_segments.join("/");
    let name = joined.trim_end_matches(".git").trim();
    if owner.is_empty() || name.is_empty() {
        return None;
    }

    Some(format!("{owner}/{name}").to_lowercase())
}
/// Extracts the `owner/name` slug for `kind` from a plain identifier or a full URL.
///
/// Inputs starting with `http://` or `https://` are parsed as URLs and their
/// path segments are used; anything else is split on `/`. Empty segments are
/// ignored in both cases. Returns `None` for blank input, an unparsable URL,
/// or segments that `parse_slug_segments` rejects.
fn parse_slug_for_kind(kind: ResourceKind, raw: &str) -> Option<String> {
    let candidate = raw.trim();
    if candidate.is_empty() {
        return None;
    }

    let looks_like_url =
        ["http://", "https://"].iter().any(|scheme| candidate.starts_with(*scheme));
    if !looks_like_url {
        let pieces: Vec<&str> = candidate.split('/').filter(|p| !p.is_empty()).collect();
        return parse_slug_segments(kind, &pieces);
    }

    let parsed = Url::parse(candidate).ok()?;
    let pieces: Vec<&str> = match parsed.path_segments() {
        Some(path) => path.filter(|p| !p.is_empty()).collect(),
        None => Vec::new(),
    };
    parse_slug_segments(kind, &pieces)
}
/// Builds an `owner/name` slug for `kind` from path segments, keeping the
/// original casing.
///
/// A leading segment naming this same kind (singular or plural, any ASCII
/// case) is dropped; a segment naming a *different* kind is kept and treated
/// as the owner. The first remaining segment is the owner and the rest are
/// joined with `/` as the name, with any trailing `.git` removed. Returns
/// `None` when fewer than two segments remain or when owner or name is blank.
fn parse_slug_segments(kind: ResourceKind, segments: &[&str]) -> Option<String> {
    let aliases: [&str; 2] = match kind {
        ResourceKind::Model => ["models", "model"],
        ResourceKind::Dataset => ["datasets", "dataset"],
        ResourceKind::Space => ["spaces", "space"],
    };

    let leading = segments.first()?.trim();
    let remaining = if aliases.iter().any(|alias| alias.eq_ignore_ascii_case(leading)) {
        &segments[1..]
    } else {
        segments
    };

    let (owner, name_segments) = remaining.split_first()?;
    if name_segments.is_empty() {
        return None;
    }

    let owner = owner.trim();
    let joined = name_segments.join("/");
    let name = joined.trim_end_matches(".git").trim();
    if owner.is_empty() || name.is_empty() {
        return None;
    }

    Some(format!("{owner}/{name}"))
}
/// Extracts the `rel="next"` target from an HTTP `Link` header value.
///
/// The header is split on commas; the first entry whose parameters contain
/// `rel="next"` and whose target parses as a URL wins. Returns `None` when no
/// such entry exists.
fn parse_next_link(value: &str) -> Option<Url> {
    for entry in value.split(',') {
        // Each entry looks like `<target>; params`; entries without '>' are skipped.
        if let Some((target, params)) = entry.trim().split_once('>') {
            if !params.contains("rel=\"next\"") {
                continue;
            }
            let candidate = target.trim_start_matches('<').trim();
            if let Ok(url) = Url::parse(candidate) {
                return Some(url);
            }
        }
    }
    None
}
/// Maximum number of characters of a response body or JSON item echoed in diagnostics.
const BODY_SNIPPET_LIMIT: usize = 200;
async fn fetch_paginated(
client: &reqwest::Client,
mut current_url: Url,
auth: &AuthConfig,
context: &str,
) -> Result<Vec<HuggingFaceItem>> {
let mut items = Vec::new();
loop {
let mut request =
client.get(current_url.clone()).header("User-Agent", GLOBAL_USER_AGENT.as_str());
request = auth.apply(request);
let response = request.send().await?;
let status = response.status();
let link_header = response
.headers()
.get(LINK)
.and_then(|value| value.to_str().ok())
.map(|value| value.to_string());
if !status.is_success() {
let body = response.text().await.unwrap_or_default();
let mut message = format!(
"Hugging Face API request failed while enumerating {context} ({status}): {body}"
);
if matches!(status, StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN)
&& !auth.has_token()
{
message.push_str(
"\nProvide a Hugging Face access token via the KF_HUGGINGFACE_TOKEN environment variable.",
);
}
return Err(anyhow!(message));
}
let body = response.bytes().await?;
let value: Value = serde_json::from_slice(&body).map_err(|err| {
let snippet = body_snippet(&body);
anyhow!(
"Failed to parse Hugging Face response while enumerating {context}: {err}. Body snippet: {snippet}",
context = context,
err = err,
snippet = snippet
)
})?;
let array = value.as_array().ok_or_else(|| {
let snippet = body_snippet(&body);
anyhow!(
"Unexpected Hugging Face response format while enumerating {context} (expected array). Body snippet: {snippet}",
context = context,
snippet = snippet
)
})?;
let mut page = Vec::new();
for (index, element) in array.iter().enumerate() {
match serde_json::from_value::<HuggingFaceItem>(element.clone()) {
Ok(item) => page.push(item),
Err(err) => {
let snippet = value_snippet(element);
warn!(
"Skipping Hugging Face item at index {index} while enumerating {context}: {err}. Item snippet: {snippet}"
);
}
}
}
items.append(&mut page);
if let Some(link_value) = link_header {
if let Some(next_url) = parse_next_link(&link_value) {
current_url = next_url;
continue;
}
}
break;
}
Ok(items)
}
/// Decodes `body` lossily as UTF-8 and shortens it to at most
/// `BODY_SNIPPET_LIMIT` characters for use in diagnostics.
fn body_snippet(body: &[u8]) -> String {
    let decoded = String::from_utf8_lossy(body);
    truncate_for_display(decoded.as_ref(), BODY_SNIPPET_LIMIT)
}
/// Serializes a JSON value and shortens the text to at most
/// `BODY_SNIPPET_LIMIT` characters for use in diagnostics.
fn value_snippet(value: &Value) -> String {
    truncate_for_display(&value.to_string(), BODY_SNIPPET_LIMIT)
}
/// Keeps at most `limit` characters of `text`, appending `…` when anything
/// was cut off.
///
/// Counting is done in `char`s, so multi-byte characters are never split.
fn truncate_for_display(text: &str, limit: usize) -> String {
    // The byte offset of the character at position `limit` (if any) is where to cut.
    match text.char_indices().nth(limit) {
        Some((cut, _)) => {
            let mut snippet = String::from(&text[..cut]);
            snippet.push('…');
            snippet
        }
        None => text.to_owned(),
    }
}
async fn fetch_resources_for_owner(
client: &reqwest::Client,
base_url: &Url,
owner: &str,
label: &str,
auth: &AuthConfig,
progress: Option<&ProgressBar>,
) -> Result<Vec<ResourceRef>> {
let mut resources = Vec::new();
for kind in [ResourceKind::Model, ResourceKind::Dataset, ResourceKind::Space] {
if let Some(pb) = progress {
pb.set_message(format!(
"Enumerating Hugging Face {label} {}",
kind.display_name_plural()
));
}
let mut url = base_url.join(kind.api_path())?;
{
let mut pairs = url.query_pairs_mut();
pairs.append_pair("author", owner);
pairs.append_pair("limit", "100");
}
let context = format!("{} for {label}", kind.display_name_plural());
match fetch_paginated(client, url, auth, &context).await {
Ok(items) => {
for item in items {
let identifier = item.into_identifier();
if let Some(slug) = parse_slug_for_kind(kind, &identifier) {
resources.push(ResourceRef::new(kind, slug));
} else {
warn!(
"Skipping Hugging Face {} with unexpected identifier '{}'",
kind.display_name_singular(),
identifier
);
}
}
}
Err(err) => {
warn!(
"Failed to enumerate Hugging Face {} for {label}: {err}",
kind.display_name_plural()
);
}
}
}
Ok(resources)
}
/// Appends the explicitly named models, datasets, and Spaces (in that order)
/// from `specifiers` to `resources`.
///
/// Identifiers that cannot be parsed into an `owner/name` slug are skipped
/// with a warning.
fn append_explicit_resources(specifiers: &RepoSpecifiers, resources: &mut Vec<ResourceRef>) {
    resources.extend(specifiers.model.iter().filter_map(|model| {
        let slug = parse_slug_for_kind(ResourceKind::Model, model);
        if slug.is_none() {
            warn!("Ignoring invalid Hugging Face model identifier '{model}'");
        }
        slug.map(|slug| ResourceRef::new(ResourceKind::Model, slug))
    }));

    resources.extend(specifiers.dataset.iter().filter_map(|dataset| {
        let slug = parse_slug_for_kind(ResourceKind::Dataset, dataset);
        if slug.is_none() {
            warn!("Ignoring invalid Hugging Face dataset identifier '{dataset}'");
        }
        slug.map(|slug| ResourceRef::new(ResourceKind::Dataset, slug))
    }));

    resources.extend(specifiers.space.iter().filter_map(|space| {
        let slug = parse_slug_for_kind(ResourceKind::Space, space);
        if slug.is_none() {
            warn!("Ignoring invalid Hugging Face space identifier '{space}'");
        }
        slug.map(|slug| ResourceRef::new(ResourceKind::Space, slug))
    }));
}
pub async fn enumerate_repo_urls(
specifiers: &RepoSpecifiers,
auth: &AuthConfig,
ignore_certs: bool,
progress: Option<&mut ProgressBar>,
) -> Result<Vec<String>> {
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(30))
.danger_accept_invalid_certs(ignore_certs)
.build()?;
let base_url = Url::parse("https://huggingface.co/api/")?;
let excludes = ExcludeSet::from_list(&specifiers.exclude);
let mut collected = Vec::new();
for user in &specifiers.user {
let label = format!("user {user}");
if let Some(pb) = progress.as_ref() {
pb.set_message(format!("Enumerating Hugging Face {label}"));
}
match fetch_resources_for_owner(
&client,
&base_url,
user,
&label,
auth,
progress.as_ref().map(|pb| &**pb),
)
.await
{
Ok(mut resources) => collected.append(&mut resources),
Err(err) => warn!("Failed to enumerate Hugging Face user {user}: {err}"),
}
}
for org in &specifiers.organization {
let label = format!("organization {org}");
if let Some(pb) = progress.as_ref() {
pb.set_message(format!("Enumerating Hugging Face {label}"));
}
match fetch_resources_for_owner(
&client,
&base_url,
org,
&label,
auth,
progress.as_ref().map(|pb| &**pb),
)
.await
{
Ok(mut resources) => collected.append(&mut resources),
Err(err) => warn!("Failed to enumerate Hugging Face organization {org}: {err}"),
}
}
append_explicit_resources(specifiers, &mut collected);
let mut seen = HashSet::new();
let mut urls = Vec::new();
for resource in collected {
if excludes.should_exclude(resource.kind, &resource.slug) {
debug!(
"Skipping Hugging Face {} {} due to exclusion",
resource.kind.display_name_singular(),
resource.slug
);
continue;
}
let key = resource.canonical_key();
if seen.insert(key) {
urls.push(resource.git_url());
}
}
urls.sort();
urls.dedup();
Ok(urls)
}
/// Enumerates the repositories selected by `specifiers` and prints one clone
/// URL per line to standard output.
///
/// * `auth` - credentials applied to API requests.
/// * `ignore_certs` - when `true`, invalid TLS certificates are accepted.
/// * `progress_enabled` - when `true`, a spinner is shown while enumerating;
///   otherwise a hidden progress bar is used.
///
/// The spinner is cleared as soon as enumeration finishes, whether it
/// succeeded or failed, so it can neither interleave with the printed URLs
/// nor be left behind when an error is returned.
///
/// # Errors
///
/// Propagates any error from `enumerate_repo_urls`.
pub async fn list_repositories(
    specifiers: &RepoSpecifiers,
    auth: &AuthConfig,
    ignore_certs: bool,
    progress_enabled: bool,
) -> Result<()> {
    let mut progress = if progress_enabled {
        let style = ProgressStyle::with_template("{spinner} {msg} [{elapsed_precise}]")
            .expect("progress bar style template should compile");
        let pb = ProgressBar::new_spinner()
            .with_style(style)
            .with_message("Enumerating Hugging Face repositories");
        pb.enable_steady_tick(Duration::from_millis(500));
        pb
    } else {
        ProgressBar::hidden()
    };

    let outcome = enumerate_repo_urls(specifiers, auth, ignore_certs, Some(&mut progress)).await;
    // Stop the spinner before producing any output or bubbling up an error.
    progress.finish_and_clear();

    for url in outcome? {
        println!("{url}");
    }
    Ok(())
}
/// Returns the wiki repository URL for a Hugging Face repository.
///
/// Always returns `None`; the `_repo_url` argument is ignored.
// NOTE(review): presumably this exists to mirror the `wiki_url` helpers of the
// other hosting integrations — confirm against the callers.
pub fn wiki_url(_repo_url: &GitUrl) -> Option<GitUrl> {
None
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A bare `owner/name` identifier is returned unchanged.
    #[test]
    fn parse_model_slug_from_plain() {
        let parsed = parse_slug_for_kind(ResourceKind::Model, "user/model");
        assert_eq!(parsed.as_deref(), Some("user/model"));
    }

    /// A leading kind segment and a trailing `.git` are both stripped.
    #[test]
    fn parse_dataset_slug_with_prefix() {
        let parsed = parse_slug_for_kind(ResourceKind::Dataset, "datasets/user/data.git");
        assert_eq!(parsed.as_deref(), Some("user/data"));
    }

    /// Full URLs are reduced to the slug found in their path.
    #[test]
    fn parse_space_slug_from_url() {
        let parsed =
            parse_slug_for_kind(ResourceKind::Space, "https://huggingface.co/spaces/user/demo");
        assert_eq!(parsed.as_deref(), Some("user/demo"));
    }

    /// Typed (`model:`) and untyped (`datasets/...`) exclusions both match,
    /// while unrelated repositories stay included.
    #[test]
    fn exclude_set_matches_typed_and_untyped() {
        let raw: [String; 2] = ["model:user/model".into(), "datasets/user/data".into()];
        let excludes = ExcludeSet::from_list(&raw);

        assert!(excludes.should_exclude(ResourceKind::Model, "user/model"));
        assert!(excludes.should_exclude(ResourceKind::Dataset, "user/data"));
        assert!(!excludes.should_exclude(ResourceKind::Space, "user/space"));
    }

    /// The `rel="next"` target is extracted from a `Link` header.
    #[test]
    fn parse_link_header() {
        let header = "<https://huggingface.co/api/models?cursor=abc>; rel=\"next\"";
        let next = parse_next_link(header).expect("next link");
        assert_eq!(next.as_str(), "https://huggingface.co/api/models?cursor=abc");
    }

    /// An ellipsis is appended only when text was actually cut.
    #[test]
    fn truncate_for_display_adds_ellipsis() {
        let shortened = truncate_for_display("abcdef", 3);
        let untouched = truncate_for_display("abc", 5);
        assert_eq!(shortened, "abc…");
        assert_eq!(untouched, "abc");
    }
}

View file

@ -13,6 +13,7 @@ pub mod defaults;
pub mod entropy;
pub mod finding_data;
pub mod findings_store;
pub mod gcs;
pub mod git_binary;
pub mod git_commit_metadata;
pub mod git_metadata_graph;
@ -21,6 +22,7 @@ pub mod git_url;
pub mod gitea;
pub mod github;
pub mod gitlab;
pub mod huggingface;
pub mod inline_ignore;
pub mod jira;
pub mod liquid_filters;

View file

@ -52,7 +52,7 @@ use kingfisher::{
},
findings_store,
findings_store::FindingsStore,
gitea, github,
gitea, github, huggingface,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::{load_and_record_rules, run_scan},
@ -75,6 +75,7 @@ use crate::cli::commands::{
bitbucket::{BitbucketAuthArgs, BitbucketCommand, BitbucketRepoType, BitbucketReposCommand},
gitea::{GiteaCommand, GiteaRepoType, GiteaReposCommand},
gitlab::{GitLabCommand, GitLabRepoType, GitLabReposCommand},
huggingface::{HuggingFaceCommand, HuggingFaceReposCommand},
};
fn main() -> anyhow::Result<()> {
@ -93,6 +94,7 @@ fn main() -> anyhow::Result<()> {
Command::Bitbucket(_) => num_cpus::get(), // Default for Bitbucket commands
Command::Gitea(_) => num_cpus::get(), // Default for Gitea commands
Command::Azure(_) => num_cpus::get(), // Default for Azure commands
Command::HuggingFace(_) => num_cpus::get(), // Default for Hugging Face commands
Command::Rules(_) => num_cpus::get(), // Default for Rules commands
};
@ -327,6 +329,28 @@ async fn async_main(args: CommandLineArgs) -> Result<()> {
}
},
},
Command::HuggingFace(hf_args) => match hf_args.command {
HuggingFaceCommand::Repos(repos_command) => match repos_command {
HuggingFaceReposCommand::List(list_args) => {
let specifiers = huggingface::RepoSpecifiers {
user: list_args.repo_specifiers.user.clone(),
organization: list_args.repo_specifiers.organization.clone(),
model: list_args.repo_specifiers.model.clone(),
dataset: list_args.repo_specifiers.dataset.clone(),
space: list_args.repo_specifiers.space.clone(),
exclude: list_args.repo_specifiers.exclude.clone(),
};
let auth = huggingface::AuthConfig::from_env();
huggingface::list_repositories(
&specifiers,
&auth,
global_args.ignore_certs,
global_args.use_progress(),
)
.await?;
}
},
},
Command::SelfUpdate => {
anyhow::bail!("SelfUpdate command should not reach this branch")
}
@ -367,6 +391,13 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -400,6 +431,9 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
// Slack query
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
@ -436,6 +470,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_base64: false,
no_inline_ignore: false,

View file

@ -703,7 +703,7 @@ mod tests {
git_commit_metadata::CommitMetadata,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{SerializableCapture, SerializableCaptures},
origin::OriginSet,
origin::{Origin, OriginSet},
rules::rule::{Confidence, Rule, RuleSyntax},
};
use gix::{date::Time, ObjectId};
@ -711,79 +711,8 @@ mod tests {
use std::path::PathBuf;
use tempfile::tempdir;
#[test]
fn build_finding_record_uses_git_blob_path() {
let temp = tempdir().unwrap();
let datastore =
Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
let repo_path = Arc::new(PathBuf::from("/tmp/repo"));
let commit_metadata = Arc::new(CommitMetadata {
commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
committer_name: "Alice".into(),
committer_email: "alice@example.com".into(),
committer_timestamp: Time::new(0, 0),
});
let blob_path = "path/in/history.txt".to_string();
let origin = OriginSet::new(
Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()),
vec![],
);
let rule = Arc::new(Rule::new(RuleSyntax {
name: "Test Rule".into(),
id: "test.rule".into(),
pattern: ".*".into(),
min_entropy: 0.0,
confidence: Confidence::Medium,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
depends_on_rule: vec![],
}));
let blob_id = BlobId::new(b"blob-data");
let report_match = ReportMatch {
origin,
blob_metadata: BlobMetadata {
id: blob_id,
num_bytes: 42,
mime_essence: None,
language: Some("Unknown".into()),
},
m: Match {
location: Location {
offset_span: OffsetSpan { start: 0, end: 10 },
source_span: SourceSpan {
start: SourcePoint { line: 19, column: 0 },
end: SourcePoint { line: 19, column: 10 },
},
},
groups: SerializableCaptures {
captures: SmallVec::<[SerializableCapture; 2]>::new(),
},
blob_id,
finding_fingerprint: 123,
rule: Arc::clone(&rule),
validation_response_body: "Bad credentials".into(),
validation_response_status: 401,
validation_success: false,
calculated_entropy: 5.29,
visible: true,
is_base64: false,
},
comment: None,
match_confidence: Confidence::Medium,
visible: true,
validation_response_body: "Bad credentials".into(),
validation_response_status: 401,
validation_success: false,
};
let scan_args = ScanArgs {
fn sample_scan_args() -> ScanArgs {
ScanArgs {
num_jobs: 1,
rules: RuleSpecifierArgs::default(),
input_specifier_args: InputSpecifierArgs {
@ -802,6 +731,12 @@ mod tests {
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
gitea_exclude: Vec::new(),
@ -833,6 +768,9 @@ mod tests {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
@ -864,9 +802,98 @@ mod tests {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_inline_ignore: false,
}
}
/// Builds a `ReportMatch` fixture for the reporter tests and returns it together
/// with the blob path that its git origin was created with.
///
/// `validation_body`, `validation_status` and `validation_success` are copied into
/// the validation fields of both the inner `Match` and the outer `ReportMatch`, so
/// each test decides what validation outcome the reporter gets to see. Everything
/// else (origin, rule, blob metadata, location) is fixed sample data.
fn sample_report_match(
validation_body: &str,
validation_status: u16,
validation_success: bool,
) -> (ReportMatch, String) {
// Git origin: a repository path, one commit and the path of the blob in that commit.
let repo_path = Arc::new(PathBuf::from("/tmp/repo"));
let commit_metadata = Arc::new(CommitMetadata {
commit_id: ObjectId::from_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").unwrap(),
committer_name: "Alice".into(),
committer_email: "alice@example.com".into(),
committer_timestamp: Time::new(0, 0),
});
// The path is cloned into the origin so the original can be returned to the caller.
let blob_path = "path/in/history.txt".to_string();
let origin = OriginSet::new(
Origin::from_git_repo_with_first_commit(repo_path, commit_metadata, blob_path.clone()),
vec![],
);
// Rule with a catch-all pattern and no validation configured.
let rule = Arc::new(Rule::new(RuleSyntax {
name: "Test Rule".into(),
id: "test.rule".into(),
pattern: ".*".into(),
min_entropy: 0.0,
confidence: Confidence::Medium,
visible: true,
examples: vec![],
negative_examples: vec![],
references: vec![],
validation: None,
depends_on_rule: vec![],
}));
let blob_id = BlobId::new(b"blob-data");
// Owned copy of the body: cloned into the inner match, then moved into the outer record.
let validation_body_owned = validation_body.to_string();
let report_match = ReportMatch {
origin,
blob_metadata: BlobMetadata {
id: blob_id,
num_bytes: 42,
mime_essence: None,
language: Some("Unknown".into()),
},
m: Match {
location: Location {
offset_span: OffsetSpan { start: 0, end: 10 },
source_span: SourceSpan {
start: SourcePoint { line: 19, column: 0 },
end: SourcePoint { line: 19, column: 10 },
},
},
groups: SerializableCaptures {
captures: SmallVec::<[SerializableCapture; 2]>::new(),
},
blob_id,
finding_fingerprint: 123,
rule: Arc::clone(&rule),
validation_response_body: validation_body_owned.clone(),
validation_response_status: validation_status,
validation_success,
calculated_entropy: 5.29,
visible: true,
is_base64: false,
},
comment: None,
match_confidence: Confidence::Medium,
visible: true,
validation_response_body: validation_body_owned,
validation_response_status: validation_status,
validation_success,
};
(report_match, blob_path)
}
#[test]
fn build_finding_record_uses_git_blob_path() {
let temp = tempdir().unwrap();
let datastore =
Arc::new(Mutex::new(findings_store::FindingsStore::new(temp.path().to_path_buf())));
let reporter = DetailsReporter { datastore, styles: Styles::new(false), only_valid: false };
let (report_match, blob_path) =
sample_report_match("Bad credentials", StatusCode::UNAUTHORIZED.as_u16(), false);
let scan_args = sample_scan_args();
let record = reporter.build_finding_record(&report_match, &scan_args);
assert_eq!(record.finding.path, blob_path);
let git_file_path = record
@ -880,6 +907,28 @@ mod tests {
assert_eq!(git_file_path, "path/in/history.txt");
}
/// A finding whose validation was bypassed through the skip list is reported with
/// the status "Not Attempted" and keeps the explanatory response text unchanged.
#[test]
fn skip_list_matches_surface_skip_reason() {
    let skip_reason =
        "(skip list entry) AWS validation not attempted for account 111122223333.";

    // Reporter backed by a throw-away findings store.
    let scratch_dir = tempdir().unwrap();
    let store = findings_store::FindingsStore::new(scratch_dir.path().to_path_buf());
    let reporter = DetailsReporter {
        datastore: Arc::new(Mutex::new(store)),
        styles: Styles::new(false),
        only_valid: false,
    };

    // The blob path half of the fixture is irrelevant for this test.
    let (report_match, _) =
        sample_report_match(skip_reason, StatusCode::CONTINUE.as_u16(), false);
    let scan_args = sample_scan_args();

    let record = reporter.build_finding_record(&report_match, &scan_args);

    assert_eq!(record.finding.validation.status, "Not Attempted");
    assert_eq!(record.finding.validation.response, skip_reason);
}
use super::build_git_urls;
#[test]

View file

@ -93,6 +93,14 @@ mod tests {
gitlab_repo_type: GitLabRepoType::All,
gitlab_include_subgroups: false,
// Hugging Face
huggingface_user: Vec::new(),
huggingface_organization: Vec::new(),
huggingface_model: Vec::new(),
huggingface_dataset: Vec::new(),
huggingface_space: Vec::new(),
huggingface_exclude: Vec::new(),
// Gitea
gitea_user: Vec::new(),
gitea_organization: Vec::new(),
@ -132,6 +140,9 @@ mod tests {
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
gcs_bucket: None,
gcs_prefix: None,
gcs_service_account: None,
docker_image: Vec::new(),
// clone / history options
@ -163,6 +174,8 @@ mod tests {
manage_baseline: false,
skip_regex: Vec::new(),
skip_word: Vec::new(),
skip_aws_account: Vec::new(),
skip_aws_account_file: None,
no_base64: false,
no_inline_ignore: false,
}

View file

@ -115,7 +115,7 @@ impl<'a> Display for PrettyFindingRecord<'a> {
} else {
writeln!(f, " |Validation....: {}", finding.validation.status)?;
}
if finding.validation.status != "Not Attempted" {
if !finding.validation.response.is_empty() {
writeln!(f, " |__Response....: {}", style_fn(&finding.validation.response))?;
}
writeln!(f, " |Language......: {}", finding.language)?;

View file

@ -3,7 +3,7 @@ pub(crate) use docker::save_docker_images;
pub(crate) use enumerate::enumerate_filesystem_inputs;
pub(crate) use repos::{
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
enumerate_github_repos,
enumerate_github_repos, enumerate_huggingface_repos,
};
pub use runner::{load_and_record_rules, run_async_scan, run_scan};
pub(crate) use validation::run_secret_validation;

View file

@ -17,10 +17,10 @@ use crate::{
commands::{github::GitCloneMode, github::GitHistoryMode, scan},
global,
},
confluence, findings_store,
confluence, findings_store, gcs,
git_binary::{CloneMode, Git},
git_url::GitUrl,
gitea, github, gitlab, jira,
gitea, github, gitlab, huggingface, jira,
matcher::{Match, Matcher, MatcherStats},
origin::{Origin, OriginSet},
rules_database::RulesDatabase,
@ -305,6 +305,69 @@ pub async fn enumerate_gitea_repos(
Ok(repo_urls)
}
/// Resolves the Hugging Face selectors given on the command line into git URLs.
///
/// The result starts from the explicit `git_url` values in `args` and, when at
/// least one Hugging Face selector is set, adds every repository URL returned by
/// `huggingface::enumerate_repo_urls`. The final list is sorted and de-duplicated.
///
/// # Errors
///
/// Returns an error if the Hugging Face enumeration itself fails. A returned URL
/// that cannot be parsed as a `GitUrl` is logged and skipped rather than treated
/// as fatal.
pub async fn enumerate_huggingface_repos(
args: &scan::ScanArgs,
global_args: &global::GlobalArgs,
) -> Result<Vec<GitUrl>> {
// Copy the CLI selectors into the form the huggingface module expects.
let repo_specifiers = huggingface::RepoSpecifiers {
user: args.input_specifier_args.huggingface_user.clone(),
organization: args.input_specifier_args.huggingface_organization.clone(),
model: args.input_specifier_args.huggingface_model.clone(),
dataset: args.input_specifier_args.huggingface_dataset.clone(),
space: args.input_specifier_args.huggingface_space.clone(),
exclude: args.input_specifier_args.huggingface_exclude.clone(),
};
let mut repo_urls = args.input_specifier_args.git_url.clone();
// Only talk to Hugging Face when the user actually asked for something.
if !repo_specifiers.is_empty() {
// Visible spinner when progress output is enabled, otherwise a hidden bar so the
// code below does not have to branch.
let mut progress = if global_args.use_progress() {
let style =
ProgressStyle::with_template("{spinner} {msg} {human_len} [{elapsed_precise}]")
.expect("progress bar style template should compile");
let pb = ProgressBar::new_spinner()
.with_style(style)
.with_message("Enumerating Hugging Face repositories...");
pb.enable_steady_tick(Duration::from_millis(500));
pb
} else {
ProgressBar::hidden()
};
let mut num_found: u64 = 0;
let auth = huggingface::AuthConfig::from_env();
let repo_strings = huggingface::enumerate_repo_urls(
&repo_specifiers,
&auth,
global_args.ignore_certs,
Some(&mut progress),
)
.await
.context("Failed to enumerate Hugging Face repositories")?;
for repo_string in repo_strings {
match GitUrl::from_str(&repo_string) {
Ok(repo_url) => {
repo_urls.push(repo_url);
num_found += 1;
}
Err(e) => {
// Log through `suspend` so the message does not collide with the spinner.
progress.suspend(|| {
error!("Failed to parse repo URL from {repo_string}: {e}");
});
}
}
}
progress.finish_with_message(format!(
"Found {} repositories from Hugging Face",
HumanCount(num_found)
));
}
// Explicit and enumerated URLs may overlap; return each URL once, in sorted order.
repo_urls.sort();
repo_urls.dedup();
Ok(repo_urls)
}
pub async fn enumerate_bitbucket_repos(
args: &scan::ScanArgs,
global_args: &global::GlobalArgs,
@ -678,3 +741,81 @@ pub async fn fetch_s3_objects(
Ok(())
}
/// Scans the objects of the configured Google Cloud Storage bucket and records any
/// matches in `datastore`.
///
/// Does nothing and returns `Ok(())` when no `gcs_bucket` is set in `args`.
/// Otherwise every object visited by `gcs::visit_bucket_objects` (optionally limited
/// by `gcs_prefix`, optionally using `gcs_service_account`) is run through a
/// `Matcher` built from `rules_db`, with an origin path of the form
/// `gs://<bucket>/<key>`.
///
/// # Errors
///
/// Returns an error if the matcher cannot be constructed, if processing an object
/// fails, or if visiting the bucket fails.
///
/// # Panics
///
/// Panics if the `datastore` mutex is poisoned.
pub async fn fetch_gcs_objects(
args: &scan::ScanArgs,
datastore: &Arc<Mutex<findings_store::FindingsStore>>,
rules_db: &RulesDatabase,
matcher_stats: &Mutex<MatcherStats>,
enable_profiling: bool,
shared_profiler: Arc<crate::rule_profiling::ConcurrentRuleProfiler>,
progress_enabled: bool,
) -> Result<()> {
// No bucket requested: nothing to do.
let Some(bucket) = args.input_specifier_args.gcs_bucket.as_deref() else {
return Ok(());
};
let prefix = args.input_specifier_args.gcs_prefix.as_deref();
let service_account = args.input_specifier_args.gcs_service_account.as_deref();
// Matcher setup dedicated to this bucket scan.
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
let seen_blobs = BlobIdMap::new();
let matcher = Matcher::new(
rules_db,
scanner_pool,
&seen_blobs,
Some(matcher_stats),
enable_profiling,
Some(shared_profiler.clone()),
&args.extra_ignore_comments,
args.no_inline_ignore,
)?;
let mut processor = BlobProcessor { matcher };
// Visible spinner when progress output is enabled, otherwise a hidden bar.
let progress = if progress_enabled {
let style =
ProgressStyle::with_template("{spinner} {msg} ({pos} objects) [{elapsed_precise}]")
.expect("progress bar style template should compile");
let pb = ProgressBar::new_spinner().with_style(style).with_message("Fetching GCS objects");
pb.enable_steady_tick(Duration::from_millis(500));
pb
} else {
ProgressBar::hidden()
};
// The clone is moved into the callback; `progress` itself is kept so the final
// position can be read and the bar finished once the visit is over.
let pb = progress.clone();
let bucket_name = bucket.to_string();
// `move` hands the processor, the progress handle and the bucket name to the callback.
gcs::visit_bucket_objects(bucket, prefix, service_account, move |key, bytes| {
let origin = OriginSet::new(
Origin::from_extended(serde_json::json!({
"path": format!("gs://{}/{}", bucket_name, key)
})),
Vec::new(),
);
let blob = crate::blob::Blob::from_bytes(bytes);
// `run` yields `Some` only when there is something to record for this object.
if let Some((origin, blob_md, scored_matches)) =
processor.run(origin, blob, args.no_dedup, args.redact, args.no_base64)?
{
// Share one origin and one blob-metadata allocation across all matches of the object.
let origin_arc = Arc::new(origin);
let blob_arc = Arc::new(blob_md);
let mut batch = Vec::with_capacity(scored_matches.len());
for (_score, m) in scored_matches {
batch.push((origin_arc.clone(), blob_arc.clone(), m));
}
let added = datastore.lock().unwrap().record(batch, !args.no_dedup);
debug!("Added {} new GCS blobs", added);
}
// Count every visited object, whether or not it produced matches.
pb.inc(1);
Ok(())
})
.await?;
let total = progress.position();
progress.finish_with_message(format!("Fetched {} GCS objects", total));
Ok(())
}

View file

@ -1,4 +1,7 @@
use std::sync::{Arc, Mutex};
use std::{
fs,
sync::{Arc, Mutex},
};
use anyhow::{bail, Context, Result};
use crossbeam_skiplist::SkipMap;
@ -21,10 +24,11 @@ use crate::{
safe_list,
scanner::{
clone_or_update_git_repos, enumerate_azure_repos, enumerate_bitbucket_repos,
enumerate_filesystem_inputs, enumerate_github_repos,
enumerate_filesystem_inputs, enumerate_github_repos, enumerate_huggingface_repos,
repos::{
enumerate_gitea_repos, enumerate_gitlab_repos, fetch_confluence_pages,
fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects, fetch_slack_messages,
fetch_gcs_objects, fetch_git_host_artifacts, fetch_jira_issues, fetch_s3_objects,
fetch_slack_messages,
},
run_secret_validation, save_docker_images,
summary::print_scan_summary,
@ -74,12 +78,14 @@ pub async fn run_async_scan(
let mut repo_urls = enumerate_github_repos(args, global_args).await?;
let gitlab_repo_urls = enumerate_gitlab_repos(args, global_args).await?;
let gitea_repo_urls = enumerate_gitea_repos(args, global_args).await?;
let huggingface_repo_urls = enumerate_huggingface_repos(args, global_args).await?;
let bitbucket_repo_urls = enumerate_bitbucket_repos(args, global_args).await?;
let azure_repo_urls = enumerate_azure_repos(args, global_args).await?;
// Combine repository URLs
repo_urls.extend(gitlab_repo_urls);
repo_urls.extend(gitea_repo_urls);
repo_urls.extend(huggingface_repo_urls);
repo_urls.extend(bitbucket_repo_urls);
repo_urls.extend(azure_repo_urls);
repo_urls.sort();
@ -182,8 +188,20 @@ pub async fn run_async_scan(
)
.await?;
let has_s3 = args.input_specifier_args.s3_bucket.is_some();
if input_roots.is_empty() && !has_s3 {
fetch_gcs_objects(
args,
&datastore,
rules_db,
&matcher_stats,
enable_profiling,
Arc::clone(&shared_profiler),
progress_enabled,
)
.await?;
let has_remote_objects = args.input_specifier_args.s3_bucket.is_some()
|| args.input_specifier_args.gcs_bucket.is_some();
if input_roots.is_empty() && !has_remote_objects {
bail!("No inputs to scan");
}
@ -231,6 +249,26 @@ pub async fn run_async_scan(
crate::baseline::apply_baseline(&mut ds, &path, args.manage_baseline, &input_roots)?;
}
let mut skip_aws_accounts = args.skip_aws_account.clone();
if let Some(path) = args.skip_aws_account_file.as_ref() {
let contents = fs::read_to_string(path).with_context(|| {
format!("Failed to read --skip-aws-account-file {}", path.display())
})?;
for line in contents.lines() {
let content = line.split('#').next().unwrap_or("");
for value in content.split(|c: char| c.is_ascii_whitespace() || c == ',' || c == ';') {
let trimmed = value.trim();
if !trimmed.is_empty() {
skip_aws_accounts.push(trimmed.to_string());
}
}
}
}
crate::validation::set_skip_aws_account_ids(skip_aws_accounts);
// If validation is enabled, run it as a second phase
if !args.no_validate {
info!("Starting secret validation phase...");

View file

@ -95,6 +95,14 @@ pub fn init_validation_caches() {
aws::set_aws_validation_concurrency(15);
}
/// Installs the AWS account IDs for which live validation must be skipped.
///
/// This is a thin forwarding wrapper: the IDs are handed unchanged to
/// `aws::set_aws_skip_account_ids`.
pub fn set_skip_aws_account_ids<I: IntoIterator<Item = S>, S: Into<String>>(ids: I) {
    aws::set_aws_skip_account_ids(ids)
}
#[derive(Clone)]
pub struct CachedResponse {
pub body: String,
@ -781,6 +789,26 @@ async fn timed_validate_single_match<'a>(
}
}
if let Some(account_id) = aws::should_skip_aws_validation(&akid) {
m.validation_success = false;
m.validation_response_body = format!(
"(skip list entry) AWS validation not attempted for account {}.",
account_id
);
m.validation_response_status = StatusCode::CONTINUE;
cache.insert(
cache_key,
CachedResponse {
body: m.validation_response_body.clone(),
status: m.validation_response_status,
is_valid: m.validation_success,
timestamp: Instant::now(),
},
);
commit_and_return(m);
return;
}
if let Err(e) = aws::validate_aws_credentials_input(&akid, &secret) {
m.validation_success = false;
m.validation_response_body = format!("Invalid AWS credentials ({}): {}", akid, e);

View file

@ -1,7 +1,7 @@
use std::time::Duration;
use std::{collections::HashSet, sync::RwLock, time::Duration};
use anyhow::{anyhow, Result};
use aws_config::{retry::RetryConfig, BehaviorVersion};
use aws_config::{retry::RetryConfig, BehaviorVersion, SdkConfig};
use aws_credential_types::Credentials;
use aws_sdk_sts::{
config::Builder as StsConfigBuilder, error::SdkError,
@ -26,8 +26,9 @@ use http::{
header::{HeaderValue, USER_AGENT},
StatusCode,
};
use once_cell::sync::OnceCell;
use once_cell::sync::{Lazy, OnceCell};
use rand::{rng, Rng};
use regex::Regex;
use tokio::{
sync::Semaphore,
time::{sleep, timeout},
@ -36,6 +37,57 @@ use tokio::{
use crate::validation::GLOBAL_USER_AGENT;
static AWS_VALIDATION_SEMAPHORE: OnceCell<Semaphore> = OnceCell::new();
/// Canary (honey-token) AWS account IDs that ship with the tool and are always
/// part of the skip list, regardless of what the user configures.
///
/// Each entry is a 12-digit account ID; leading zeros are significant.
const BUILTIN_SKIP_ACCOUNT_IDS: &[&str] = &[
"052310077262",
"171436882533",
"528757803018",
"534261010715",
"595918472158",
"729780141977",
"893192397702",
"992382622183",
];
/// Process-wide set of AWS account IDs whose credentials are not live-validated.
///
/// Created on first access and seeded with `BUILTIN_SKIP_ACCOUNT_IDS`.
static AWS_SKIP_ACCOUNT_IDS: Lazy<RwLock<HashSet<String>>> = Lazy::new(|| {
    let builtin: HashSet<String> =
        BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| (*id).to_owned()).collect();
    RwLock::new(builtin)
});
/// Builds the HTTP client used for AWS SDK calls.
///
/// The connector uses rustls (AWS-LC crypto mode) for TLS and takes its proxy
/// configuration from the environment via `ProxyConfig::from_env()`.
fn build_http_client() -> SharedHttpClient {
HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
let mut conn_builder = ConnectorBuilder::default()
.tls_provider(tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc));
// Carry over whatever connector settings the SDK passed in.
conn_builder.set_connector_settings(settings.cloned());
// Runtime components are optional; reuse their sleep implementation when present.
if let Some(components) = runtime_components {
conn_builder.set_sleep_impl(components.sleep_impl());
}
conn_builder.set_proxy_config(Some(ProxyConfig::from_env()));
conn_builder.build()
})
}
/// Loads the shared AWS SDK configuration for the given static credentials.
///
/// The configuration is pinned to `us-east-1`, uses the client from
/// `build_http_client`, and retries adaptively with at most three attempts.
async fn build_base_config(credentials: Credentials) -> SdkConfig {
    let loader = aws_config::defaults(BehaviorVersion::latest())
        .region(Region::new("us-east-1"))
        .credentials_provider(credentials)
        .http_client(build_http_client())
        .retry_config(RetryConfig::adaptive().with_max_attempts(3));
    loader.load().await
}
/// Extracts a 12-digit AWS account ID from a skip-list entry.
///
/// Accepts either a bare account ID (surrounding whitespace is ignored) or a longer
/// string that embeds one, such as an IAM ARN. When several IDs are embedded, the
/// first one is returned. Returns `None` when the input contains no account ID.
///
/// An embedded ID must be a run of *exactly* twelve ASCII digits. Longer digit runs
/// are rejected instead of being truncated to their first twelve digits, because a
/// truncated value would put an unrelated account on the skip list.
fn extract_account_id(input: &str) -> Option<String> {
    // The `regex` crate has no look-around, so the boundaries of the digit run are
    // spelled out as non-capturing groups around the captured ID. `[0-9]` is used
    // instead of `\d`, which would also accept non-ASCII Unicode digits.
    static ACCOUNT_ID_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?:^|[^0-9])([0-9]{12})(?:[^0-9]|$)").expect("valid regex")
    });

    let trimmed = input.trim();

    // Fast path: the entry is already a bare account ID.
    if trimmed.len() == 12 && trimmed.bytes().all(|b| b.is_ascii_digit()) {
        return Some(trimmed.to_owned());
    }

    ACCOUNT_ID_RE
        .captures(trimmed)
        .and_then(|caps| caps.get(1))
        .map(|id| id.as_str().to_owned())
}
/// Set the maximum number of concurrent AWS validations. Call before first use.
pub fn set_aws_validation_concurrency(max: usize) {
@ -46,6 +98,46 @@ fn aws_validation_semaphore() -> &'static Semaphore {
AWS_VALIDATION_SEMAPHORE.get_or_init(|| Semaphore::new(15))
}
/// Rebuilds the global AWS skip list from the built-in IDs plus `ids`.
///
/// Any previous contents are discarded. Blank entries are ignored silently; entries
/// from which no account ID can be extracted are ignored with a warning. A poisoned
/// lock is recovered rather than propagated.
pub fn set_aws_skip_account_ids<I, S>(ids: I)
where
    I: IntoIterator<Item = S>,
    S: Into<String>,
{
    let mut skip_set =
        AWS_SKIP_ACCOUNT_IDS.write().unwrap_or_else(|poisoned| poisoned.into_inner());

    // Start over from the built-in entries so repeated calls do not accumulate.
    skip_set.clear();
    skip_set.extend(BUILTIN_SKIP_ACCOUNT_IDS.iter().map(|id| id.to_string()));

    for raw in ids {
        let value: String = raw.into();
        if value.trim().is_empty() {
            continue;
        }
        match extract_account_id(&value) {
            Some(account_id) => {
                skip_set.insert(account_id);
            }
            None => tracing::warn!("Ignoring invalid AWS account ID in skip list: {value}"),
        }
    }
}
/// Returns the account ID behind `access_key_id` when that account is on the skip
/// list, and `None` otherwise.
///
/// `None` is also returned when the account number cannot be derived from the key.
///
/// A poisoned lock is recovered instead of being treated as "nothing to skip":
/// failing open here would send skip-listed (canary) credentials to AWS for live
/// validation, which is exactly what the skip list exists to prevent. This matches
/// how `set_aws_skip_account_ids` handles poisoning.
pub fn should_skip_aws_validation(access_key_id: &str) -> Option<String> {
    let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap_or_else(|poisoned| poisoned.into_inner());

    // Nothing to match against, so skip decoding the key altogether.
    if guard.is_empty() {
        return None;
    }

    let account = aws_key_to_account_number(access_key_id).ok()?;
    guard.contains(&account).then_some(account)
}
#[derive(Debug)]
struct UaInterceptor;
@ -132,29 +224,7 @@ pub async fn validate_aws_credentials(
None, // expiry
"static", // provider name
);
// Create HTTP client that respects proxy settings from the environment
let http_client: SharedHttpClient =
HttpClientBuilder::new().build_with_connector_fn(|settings, runtime_components| {
let mut conn_builder = ConnectorBuilder::default()
.tls_provider(tls::Provider::Rustls(tls::rustls_provider::CryptoMode::AwsLc));
conn_builder.set_connector_settings(settings.cloned());
if let Some(components) = runtime_components {
conn_builder.set_sleep_impl(components.sleep_impl());
}
conn_builder.set_proxy_config(Some(ProxyConfig::from_env()));
conn_builder.build()
});
// Create AWS config with adaptive retries
let retry_config = RetryConfig::adaptive().with_max_attempts(3);
let config = aws_config::defaults(BehaviorVersion::latest())
.region(Region::new("us-east-1"))
.credentials_provider(credentials)
.http_client(http_client)
.retry_config(retry_config)
.load()
.await;
let config = build_base_config(credentials).await;
// Create STS client
let sts_config = StsConfigBuilder::from(&config).interceptor(UaInterceptor).build();
@ -230,3 +300,74 @@ pub fn aws_key_to_account_number(aws_key_id: &str) -> Result<String, Box<dyn std
// Return the account number formatted as a 12-digit string
Ok(format!("{:012}", account_num))
}
// Tests for the AWS skip list. They all mutate the global `AWS_SKIP_ACCOUNT_IDS`,
// so each test holds `TEST_GUARD` for its whole body and resets the list to the
// built-in entries (by installing an empty user list) before returning.
#[cfg(test)]
mod tests {
use super::*;
use once_cell::sync::Lazy;
use std::sync::Mutex;
// Serializes the tests in this module; the protected value itself is unused.
static TEST_GUARD: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
// Padded IDs and ARNs are normalized to bare account IDs; unparsable entries are dropped.
#[test]
fn skip_account_list_normalizes_inputs() {
let _lock = TEST_GUARD.lock().unwrap();
set_aws_skip_account_ids([
" 052310077262 ",
"arn:aws:iam::171436882533:role/demo",
"invalid",
]);
let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
assert!(guard.contains("052310077262"));
assert!(guard.contains("171436882533"));
// Both valid inputs are already built-in entries and "invalid" is rejected,
// so the set must not have grown.
assert_eq!(guard.len(), BUILTIN_SKIP_ACCOUNT_IDS.len());
// Release the read lock before taking the write lock in the reset below.
drop(guard);
set_aws_skip_account_ids(Vec::<String>::new());
}
// An access key whose account is on the list yields that account ID.
#[test]
fn should_skip_when_account_matches() {
let _lock = TEST_GUARD.lock().unwrap();
set_aws_skip_account_ids(["534261010715"]);
assert_eq!(
should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
Some("534261010715".to_string())
);
set_aws_skip_account_ids(Vec::<String>::new());
}
// With no user-supplied IDs at all, the built-in entries still apply.
#[test]
fn builtin_canary_accounts_are_preseeded() {
let _lock = TEST_GUARD.lock().unwrap();
set_aws_skip_account_ids(Vec::<String>::new());
assert_eq!(
should_skip_aws_validation("AKIAXYZDQCEN4B6JSJQI"),
Some("534261010715".to_string())
);
set_aws_skip_account_ids(Vec::<String>::new());
}
// Different spellings of the same account collapse into a single entry.
#[test]
fn duplicate_accounts_are_deduplicated() {
let _lock = TEST_GUARD.lock().unwrap();
set_aws_skip_account_ids([
"534261010715",
"arn:aws:iam::534261010715:user/canarytokens",
" 534261010715 ",
]);
let guard = AWS_SKIP_ACCOUNT_IDS.read().unwrap();
assert_eq!(guard.iter().filter(|id| id.as_str() == "534261010715").count(), 1);
// Release the read lock before taking the write lock in the reset below.
drop(guard);
set_aws_skip_account_ids(Vec::<String>::new());
}
}

View file

@ -33,31 +33,77 @@ pub fn find_closest_variable(
target_variable_name: &str,
search_variable_name: &str,
) -> Option<String> {
// Find positions of the target variable with the target value
// Collect the positions of the target variable for the provided value so we can
// compare relative offsets with candidate variables.
let mut target_positions = Vec::new();
for (name, value, start, end) in captures {
if name == target_variable_name && value == target_value {
target_positions.push((*start, *end));
}
}
if target_positions.is_empty() {
return None;
}
// For each target position, find the closest search variable
let mut closest_distance = usize::MAX;
let mut closest_value: Option<String> = None;
for (_target_start, target_end) in target_positions {
for (name, value, start, _) in captures {
if name == search_variable_name {
let distance = (*start as isize - target_end as isize).abs() as usize;
if distance < closest_distance {
closest_distance = distance;
closest_value = Some(value.clone());
// Prefer candidates that appear before the target value (same logical block), but
// fall back to overlapping values and then to those that appear after the target
// value when no better match exists. This avoids pairing with the next block when
// multiple credentials are close together in the same file.
let mut best_before: Option<(usize, String)> = None;
let mut best_overlap: Option<(usize, String)> = None;
let mut best_after: Option<(usize, String)> = None;
for (target_start, target_end) in target_positions.iter().copied() {
for (name, value, start, end) in captures {
if name != search_variable_name {
continue;
}
if *end <= target_start {
// Candidate is before the target; choose the one closest to the target start.
let distance = target_start - *end;
match &mut best_before {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_before = Some((distance, value.clone()));
}
_ => {}
}
} else if *start >= target_end {
// Candidate is after the target; choose the one closest to the target end.
let distance = *start - target_end;
match &mut best_after {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_after = Some((distance, value.clone()));
}
_ => {}
}
} else {
// Candidate overlaps the target treat as an exact match.
let distance = 0usize;
match &mut best_overlap {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_overlap = Some((distance, value.clone()));
}
_ => {}
}
}
}
}
closest_value
best_before.or(best_overlap).or(best_after).map(|(_, value)| value)
}
pub async fn check_url_resolvable(url: &Url) -> Result<(), Box<dyn std::error::Error>> {
@ -147,4 +193,31 @@ mod tests {
]
);
}
/// With an AKID candidate on each side of the target, the preceding one is chosen.
#[test]
fn prefers_closest_preceding_variable() {
    // (name, value, start offset, end offset)
    let capture = |name: &str, value: &str, start: usize, end: usize| {
        (name.to_string(), value.to_string(), start, end)
    };
    let captures = vec![
        capture("TOKEN", "secret", 75, 115),
        capture("AKID", "preceding", 30, 50),
        capture("AKID", "following", 180, 200),
    ];
    let target_value = "secret".to_string();

    let result = find_closest_variable(&captures, &target_value, "TOKEN", "AKID").unwrap();

    assert_eq!(result, "preceding".to_string());
}
/// When no AKID candidate precedes the target, the one that follows it is used.
#[test]
fn falls_back_to_following_when_no_preceding() {
    // (name, value, start offset, end offset)
    let capture = |name: &str, value: &str, start: usize, end: usize| {
        (name.to_string(), value.to_string(), start, end)
    };
    let captures = vec![capture("TOKEN", "secret", 10, 50), capture("AKID", "after", 60, 80)];
    let target_value = "secret".to_string();

    let result = find_closest_variable(&captures, &target_value, "TOKEN", "AKID").unwrap();

    assert_eq!(result, "after".to_string());
}
}