// ──────────────────────────────────────────────────────────── // Global allocator setup // * Default - mimalloc (no feature flags) // * Debug - jemalloc (`use-jemalloc` feature) // * Fallback - system allocator (`system-alloc` feature) // ──────────────────────────────────────────────────────────── // --- jemalloc (opt-in) --- #[cfg(feature = "use-jemalloc")] #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; // --- mimalloc (default) --- #[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))] #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; // --- system allocator (explicit opt-out) --- #[cfg(feature = "system-alloc")] use std::alloc::System; #[cfg(feature = "system-alloc")] #[global_allocator] static GLOBAL: System = System; // use std::alloc::System; // #[global_allocator] // static GLOBAL: System = System; use std::{ io::{IsTerminal, Read}, sync::{Arc, Mutex}, }; use anyhow::{Context, Result}; use kingfisher::{ azure, bitbucket, cli::{ self, commands::{ github::{GitCloneMode, GitHistoryMode, GitHubRepoType}, inputs::{ContentFilteringArgs, InputSpecifierArgs}, output::{OutputArgs, ReportOutputFormat}, rules::{ RuleSpecifierArgs, RulesCheckArgs, RulesCommand, RulesListArgs, RulesListOutputFormat, }, }, global::Command, CommandLineArgs, GlobalArgs, }, findings_store, findings_store::FindingsStore, gitea, github, huggingface, rule_loader::RuleLoader, rules_database::RulesDatabase, scanner::{load_and_record_rules, run_scan}, update::check_for_update, validation::set_user_agent_suffix, }; use serde_json::json; use tempfile::TempDir; use term_size; use tokio::runtime::Builder; use tracing::{error, info, warn}; use tracing_core::metadata::LevelFilter; use tracing_subscriber::{ self, fmt, prelude::__tracing_subscriber_SubscriberExt, registry, util::SubscriberInitExt, }; use url::Url; use crate::cli::commands::{ azure::AzureRepoType, bitbucket::{BitbucketAuthArgs, BitbucketRepoType}, gitea::GiteaRepoType, gitlab::GitLabRepoType, scan::{ListRepositoriesCommand, ScanOperation}, }; fn main() -> anyhow::Result<()> { color_backtrace::install(); // Parse command-line arguments let args = CommandLineArgs::parse_args(); set_user_agent_suffix(args.global_args.user_agent_suffix.clone()); // Determine the number of jobs, defaulting to the number of CPUs let num_jobs = match &args.command { Command::Scan(scan_args) => scan_args.scan_args.num_jobs, Command::SelfUpdate => 1, // Self-update doesn't need a thread pool Command::Rules(_) => num_cpus::get(), // Default for Rules commands }; // Set up the Tokio runtime with the specified number of threads let runtime = Builder::new_multi_thread() .worker_threads(num_jobs) .enable_all() .build() .context("Failed to create Tokio runtime")?; runtime.block_on(async_main(args)) } fn setup_logging(global_args: &GlobalArgs) { // Determine log level based on global verbosity let (level, all_targets) = if global_args.quiet { (LevelFilter::ERROR, false) } else { let level = match global_args.verbose { 0 => LevelFilter::INFO, // Default level if no `-v` is provided 1 => LevelFilter::DEBUG, // `-v` 2 => LevelFilter::TRACE, // `-vv` _ => LevelFilter::TRACE, // `-vvv` or more }; let all_targets = global_args.verbose > 2; // Enable all targets for `-vvv` or more (level, all_targets) }; // Create a filter for logging let filter = if all_targets { // Enable TRACE for all modules tracing_subscriber::filter::Targets::new().with_default(LevelFilter::TRACE) } else { // Per-target filtering, only TRACE for `kingfisher` tracing_subscriber::filter::Targets::new() .with_default(LevelFilter::ERROR) // Default for all modules .with_target("kingfisher", level) // Replace `kingfisher` with your // crate's name }; // Configure the formatter layer let fmt_layer = fmt::layer() .with_writer(std::io::stderr) // Write logs to stderr .with_target(true) // Enable target filtering .with_ansi(std::io::stderr().is_terminal()) // Emit ANSI colours when stderr is a TTY .without_time(); // Remove timestamps // Build and initialize the registry registry() .with(fmt_layer) // Attach the formatter layer .with(filter) // Attach the filter .init(); } pub fn determine_exit_code(datastore: &Arc>) -> i32 { // exit with code 200 if _any_ findings are discovered // exit with code 205 if VALIDATED findings are discovered // exit with code 0 if there are NO findings discovered let ds = datastore.lock().unwrap(); // Get all matches // let all_matches = ds.get_matches(); // Only consider visible matches when determining the exit code let all_matches = ds .get_matches() .iter() .filter(|msg| { let (_, _, match_item) = &***msg; match_item.visible }) .collect::>(); if all_matches.is_empty() { // No findings discovered 0 } else { // Check if there are any validated findings let validated_matches = all_matches .iter() .filter(|msg| { let (_, _, match_item) = &****msg; match_item.validation_success }) .count(); if validated_matches > 0 { // Validated findings discovered 205 } else { // Findings discovered, but not validated 200 } } } async fn async_main(args: CommandLineArgs) -> Result<()> { setup_logging(&args.global_args); let global_args = args.global_args.clone(); match args.command { Command::SelfUpdate => { let mut g = global_args; g.self_update = true; g.no_update_check = false; check_for_update(&g, None); Ok(()) } command => { let temp_dir = TempDir::new().context("Failed to create temporary directory")?; let clone_dir = temp_dir.path().to_path_buf(); let datastore = Arc::new(Mutex::new(FindingsStore::new(clone_dir))); let update_msg = check_for_update(&global_args, None); match command { Command::Scan(scan_command) => match scan_command.into_operation()? { ScanOperation::Scan(mut scan_args) => { info!( "Launching with {} concurrent scan jobs. Use --num-jobs to override.", &scan_args.num_jobs ); let paths = &scan_args.input_specifier_args.path_inputs; let is_dash = paths.iter().any(|p| p.as_os_str() == "-"); if (paths.is_empty() || is_dash) && !atty::is(atty::Stream::Stdin) { let mut buf = Vec::new(); std::io::stdin().read_to_end(&mut buf)?; let stdin_file = temp_dir.path().join("stdin_input"); std::fs::write(&stdin_file, buf)?; scan_args.input_specifier_args.path_inputs = vec![stdin_file.into()]; } let rules_db = Arc::new(load_and_record_rules(&scan_args, &datastore)?); run_scan(&global_args, &scan_args, &rules_db, Arc::clone(&datastore)) .await?; let exit_code = determine_exit_code(&datastore); if let Err(e) = temp_dir.close() { eprintln!("Failed to close temporary directory: {}", e); } std::process::exit(exit_code); } ScanOperation::ListRepositories(list_command) => match list_command { ListRepositoriesCommand::Github { api_url, specifiers } => { github::list_repositories( api_url, global_args.ignore_certs, global_args.use_progress(), &specifiers.user, &specifiers.organization, specifiers.all_organizations, &specifiers.exclude_repos, specifiers.repo_type.into(), ) .await?; } ListRepositoriesCommand::Gitlab { api_url, specifiers } => { kingfisher::gitlab::list_repositories( api_url, global_args.ignore_certs, global_args.use_progress(), &specifiers.user, &specifiers.group, specifiers.all_groups, specifiers.include_subgroups, &specifiers.exclude_repos, specifiers.repo_type.into(), ) .await?; } ListRepositoriesCommand::Gitea { api_url, specifiers } => { gitea::list_repositories( api_url, global_args.ignore_certs, global_args.use_progress(), &specifiers.user, &specifiers.organization, specifiers.all_organizations, &specifiers.exclude_repos, specifiers.repo_type.into(), ) .await?; } ListRepositoriesCommand::Bitbucket { api_url, specifiers } => { let auth_config = bitbucket::AuthConfig::from_env(); bitbucket::list_repositories( api_url, auth_config, global_args.ignore_certs, global_args.use_progress(), &specifiers.user, &specifiers.workspace, &specifiers.project, specifiers.all_workspaces, &specifiers.exclude_repos, specifiers.repo_type.into(), ) .await?; } ListRepositoriesCommand::Azure { base_url, specifiers } => { azure::list_repositories( base_url, global_args.ignore_certs, global_args.use_progress(), &specifiers.organization, &specifiers.project, specifiers.all_projects, &specifiers.exclude_repos, specifiers.repo_type.into(), ) .await?; } ListRepositoriesCommand::Huggingface { specifiers } => { let repo_specifiers = huggingface::RepoSpecifiers { user: specifiers.user.clone(), organization: specifiers.organization.clone(), model: specifiers.model.clone(), dataset: specifiers.dataset.clone(), space: specifiers.space.clone(), exclude: specifiers.exclude.clone(), }; let auth = huggingface::AuthConfig::from_env(); huggingface::list_repositories( &repo_specifiers, &auth, global_args.ignore_certs, global_args.use_progress(), ) .await?; } }, }, Command::Rules(ref rule_args) => match &rule_args.command { RulesCommand::Check(check_args) => { run_rules_check(&check_args)?; } RulesCommand::List(list_args) => { run_rules_list(&list_args)?; } }, Command::SelfUpdate => { anyhow::bail!("SelfUpdate command should not reach this branch") } } if let Some(msg) = update_msg { info!("{msg}"); } Ok(()) } } } /// Create a default ScanArgs instance for rule loading fn create_default_scan_args() -> cli::commands::scan::ScanArgs { use cli::commands::scan::*; ScanArgs { num_jobs: 1, rules: RuleSpecifierArgs { rules_path: Vec::new(), rule: vec!["all".into()], load_builtins: true, }, input_specifier_args: InputSpecifierArgs { path_inputs: Vec::new(), git_url: Vec::new(), github_user: Vec::new(), github_organization: Vec::new(), github_exclude: Vec::new(), all_github_organizations: false, github_api_url: url::Url::parse("https://api.github.com/").unwrap(), github_repo_type: GitHubRepoType::Source, // new GitLab defaults gitlab_user: Vec::new(), gitlab_group: Vec::new(), gitlab_exclude: Vec::new(), all_gitlab_groups: false, gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(), gitlab_repo_type: GitLabRepoType::All, gitlab_include_subgroups: false, huggingface_user: Vec::new(), huggingface_organization: Vec::new(), huggingface_model: Vec::new(), huggingface_dataset: Vec::new(), huggingface_space: Vec::new(), huggingface_exclude: Vec::new(), gitea_user: Vec::new(), gitea_organization: Vec::new(), gitea_exclude: Vec::new(), all_gitea_organizations: false, gitea_api_url: Url::parse("https://gitea.com/api/v1/").unwrap(), gitea_repo_type: GiteaRepoType::Source, bitbucket_user: Vec::new(), bitbucket_workspace: Vec::new(), bitbucket_project: Vec::new(), bitbucket_exclude: Vec::new(), all_bitbucket_workspaces: false, bitbucket_api_url: Url::parse("https://api.bitbucket.org/2.0/").unwrap(), bitbucket_repo_type: BitbucketRepoType::Source, bitbucket_auth: BitbucketAuthArgs::default(), azure_organization: Vec::new(), azure_project: Vec::new(), azure_exclude: Vec::new(), all_azure_projects: false, azure_base_url: Url::parse("https://dev.azure.com/").unwrap(), azure_repo_type: AzureRepoType::Source, jira_url: None, jql: None, confluence_url: None, cql: None, max_results: 100, s3_bucket: None, s3_prefix: None, role_arn: None, aws_local_profile: None, gcs_bucket: None, gcs_prefix: None, gcs_service_account: None, // Slack query slack_query: None, slack_api_url: Url::parse("https://slack.com/api/").unwrap(), // Docker image scanning docker_image: Vec::new(), // git clone / history options git_clone: GitCloneMode::Bare, git_history: GitHistoryMode::Full, commit_metadata: true, repo_artifacts: false, scan_nested_repos: true, since_commit: None, branch: None, branch_root: false, branch_root_commit: None, }, extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, no_extract_archives: true, extraction_depth: 2, exclude: Vec::new(), // Exclude patterns no_binary: true, }, confidence: ConfidenceLevel::Medium, no_validate: true, rule_stats: false, only_valid: false, min_entropy: None, redact: false, git_repo_timeout: 1800, no_dedup: false, baseline_file: None, manage_baseline: false, skip_regex: Vec::new(), skip_word: Vec::new(), skip_aws_account: Vec::new(), skip_aws_account_file: None, output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_base64: false, no_inline_ignore: false, no_ignore_if_contains: false, } } /// Run the rules check command pub fn run_rules_check(args: &RulesCheckArgs) -> Result<()> { let mut num_errors = 0; let mut num_warnings = 0; // Load and check rules let loader = RuleLoader::from_rule_specifiers(&args.rules); let loaded = loader.load(&create_default_scan_args())?; let resolved = loaded.resolve_enabled_rules()?; let rules_db = RulesDatabase::from_rules(resolved.into_iter().cloned().collect())?; // Check each rule for (rule_index, rule) in rules_db.rules().iter().enumerate() { let rule_syntax = rule.syntax(); // Basic rule validation checks if rule.name().len() < 3 { warn!("Rule '{}' has a very short name", rule.name()); num_warnings += 1; } if rule.syntax().pattern.len() < 5 { warn!("Rule '{}' has a very short pattern", rule.name()); num_warnings += 1; } if rule.syntax().examples.is_empty() { warn!("Rule '{}' has no examples", rule.name()); num_warnings += 1; continue; } // Check regex compilation if let Err(e) = rule.syntax().as_regex() { error!("Rule '{}' has invalid regex: {}", rule.name(), e); num_errors += 1; continue; } // Test each example against both vectorscan and regex for (example_index, example) in rule_syntax.examples.iter().enumerate() { // Create a test blob from the example // let blob = Blob::new(BlobId::new(example.as_bytes()), // example.as_bytes().to_vec()); let origin = OriginSet::new( // Origin::from_file(PathBuf::from("test_example")), // Vec::new(), // ); // // Check vectorscan match // let vectorscan_matched = match matcher.scan_blob(&blob, &origin, None)? { // ScanResult::New(matches) => !matches.is_empty(), // _ => false, // }; // Check regex match // Get the regex using the public method let re = rules_db.get_regex_by_rule_id(rule.id()).expect("Failed to get regex for rule"); let regex_matched = re.is_match(example.as_bytes()); if !regex_matched { // ||!vectorscan_matched { println!("\nTesting rule {} - {}", rule_index + 1, rule_syntax.name); println!(" Processing example {}", example_index + 1); println!(" [!] Mismatch detected for example: {}", example); // if !vectorscan_matched { // println!(" Vectorscan match: {}", vectorscan_matched); // num_errors += 1; // } if !regex_matched { println!(" Regex match: {}", regex_matched); num_errors += 1; } } // // Report any mismatches // if !vectorscan_matched || !regex_matched { // error!("Rule '{}' example {} failed validation:", // rule.name(), example_index + 1); println!(" // Example text: {}", example); // if !vectorscan_matched { // error!(" - Vectorscan pattern did not match example"); // num_errors += 1; // } // if !regex_matched { // error!(" - Regex pattern did not match example"); // num_errors += 1; // } // } } } // Print summary if num_errors > 0 || num_warnings > 0 { println!("\nCheck Summary:"); println!(" Errors: {}", num_errors); println!(" Warnings: {}", num_warnings); println!("\nError types include:"); println!(" - Invalid regex patterns"); println!(" - Examples that don't match their patterns"); println!("\nWarning types include:"); println!(" - Rules with very short names"); println!(" - Rules with very short patterns"); println!(" - Rules without examples"); } else { println!("\nAll rules passed validation successfully!"); } // Exit with error if there are errors or if warnings are treated as errors if num_errors > 0 || (args.warnings_as_errors && num_warnings > 0) { std::process::exit(1); } Ok(()) } /// Run the rules list command pub fn run_rules_list(args: &RulesListArgs) -> Result<()> { // Load rules let loader = RuleLoader::from_rule_specifiers(&args.rules); let loaded = loader.load(&create_default_scan_args())?; let resolved = loaded.resolve_enabled_rules()?; let mut writer = args.output_args.get_writer()?; match args.output_args.format { RulesListOutputFormat::Pretty => { // Determine terminal width if possible, otherwise use default let term_width = term_size::dimensions().map(|(w, _)| w).unwrap_or(120); // First pass: calculate column widths let max_name_width = resolved.iter().map(|r| r.name().len()).max().unwrap_or(0).max(4); // "Rule" header let max_id_width = resolved.iter().map(|r| r.id().len()).max().unwrap_or(0).max(2); // "ID" header let max_conf_width = resolved .iter() .map(|r| format!("{:?}", r.confidence()).len()) .max() .unwrap_or(0) .max(10); // "Confidence" header // Calculate pattern width based on terminal width let reserved_width = max_name_width + max_id_width + max_conf_width + 10; let pattern_width = term_width.saturating_sub(reserved_width); // Format pattern on a single line let format_pattern = |pattern: &str| { let single_line = pattern .replace('\n', " ") .replace('\r', " ") .split_whitespace() .collect::>() .join(" "); if single_line.len() > pattern_width { format!("{}...", &single_line[..pattern_width.saturating_sub(3)]) } else { single_line } }; // Print header writeln!( writer, "\n{:name_width$} │ {:id_width$} │ {:conf_width$} │ Pattern", "Rule", "ID", "Confidence", name_width = max_name_width, id_width = max_id_width, conf_width = max_conf_width )?; // Print separator writeln!( writer, "{0:─ { // Create JSON format let rules_json: Vec<_> = resolved .iter() .map(|rule| { json!({ "name": rule.name(), "id": rule.id(), "pattern": rule.syntax().pattern, "confidence": rule.confidence(), "examples": rule.syntax().examples, "visible": rule.visible(), }) }) .collect(); serde_json::to_writer_pretty(&mut writer, &rules_json)?; writeln!(writer)?; } } Ok(()) }