kingfisher/src/cli/commands/scan.rs

use clap::{Args, ValueEnum};
use strum::Display;
use tracing::debug;

use crate::{
    cli::{
        commands::{
            inputs::{ContentFilteringArgs, InputSpecifierArgs},
            output::{OutputArgs, ReportOutputFormat},
            rules::RuleSpecifierArgs,
        },
        global::RAM_GB,
    },
    rules::rule::Confidence,
};

/// Determine the default number of parallel scan jobs.
///
/// * Target = `num_cpus * 2`.
/// * Cap by RAM at ≈ 1 GiB per job (so 16 GiB ⇒ max 16 jobs).
/// * Always ≥ 1.
/// * When `-v/--verbose` is passed, the computed value is logged at DEBUG.
fn default_scan_jobs() -> usize {
    // How many logical CPUs do we see? (Falls back to 1 on error.)
    let cpu_count = std::thread::available_parallelism().map(usize::from).unwrap_or(1);

    // Desired parallelism is CPU * 2.
    let desired = cpu_count * 2;

    match *RAM_GB {
        // If we know how much RAM we have, cap by a 1 GiB-per-job heuristic.
        Some(ram_gb) => {
            let max_by_ram = ram_gb.ceil() as usize; // 1 GiB per job
            let jobs = desired.min(max_by_ram).max(1);

            debug!(
                "Using {jobs} parallel scan jobs \
                 (cpus = {cpu_count}, desired = {desired}, \
                 ram = {ram_gb:.1} GiB, cap_by_ram = {max_by_ram})"
            );
            jobs
        }
        // If RAM is unknown, just use the desired value.
        None => {
            debug!("Using {desired} parallel scan jobs (cpus = {cpu_count}, ram unknown)");
            desired
        }
    }
}

/// `kingfisher scan` command and flags
#[derive(Args, Debug, Clone)]
pub struct ScanArgs {
    /// Number of parallel scanning threads
    #[arg(long = "jobs", short = 'j', default_value_t = default_scan_jobs())]
    pub num_jobs: usize,

    #[command(flatten)]
    pub rules: RuleSpecifierArgs,

    #[command(flatten)]
    pub input_specifier_args: InputSpecifierArgs,

    #[command(flatten)]
    pub content_filtering_args: ContentFilteringArgs,

    /// Minimum confidence level for reporting findings
    #[arg(long, short = 'c', default_value = "medium")]
    pub confidence: ConfidenceLevel,

    /// Disable secret validation
    #[arg(long, short = 'n', default_value_t = false)]
    pub no_validate: bool,

    /// Display only validated findings
    #[arg(long, default_value_t = false)]
    pub only_valid: bool,

    /// Override the default minimum entropy threshold
    #[arg(long, short = 'e')]
    pub min_entropy: Option<f32>,

    /// Show performance statistics for each rule
    #[arg(long, default_value_t = false)]
    pub rule_stats: bool,

    /// Display every occurrence of a finding
    #[arg(long, default_value_t = false)]
    pub no_dedup: bool,

    /// Redact findings values using a secure hash
    #[arg(long, short = 'r', default_value_t = false)]
    pub redact: bool,

    /// Skip decoding Base64 blobs before scanning
    #[arg(long, default_value_t = false)]
    pub no_base64: bool,

    /// Timeout for Git repository scanning in seconds
    #[arg(long, default_value_t = 1800, value_name = "SECONDS")]
    pub git_repo_timeout: u64,

    #[command(flatten)]
    pub output_args: OutputArgs<ReportOutputFormat>,

    /// Baseline file to filter known secrets
    #[arg(long, value_name = "FILE")]
    pub baseline_file: Option<std::path::PathBuf>,

    /// Create or update the baseline file with current findings
    #[arg(long, default_value_t = false)]
    pub manage_baseline: bool,

    /// Regex patterns to allow-list secret matches (repeatable)
    #[arg(long = "skip-regex", value_name = "PATTERN")]
    pub skip_regex: Vec<String>,

    /// Skipwords to allow-list secret matches (case-insensitive, repeatable)
    #[arg(long = "skip-word", value_name = "WORD")]
    pub skip_word: Vec<String>,
}

/// Confidence levels for findings
#[derive(Copy, Clone, Debug, Display, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
#[strum(serialize_all = "kebab-case")]
pub enum ConfidenceLevel {
    Low,
    Medium,
    High,
}

impl From<ConfidenceLevel> for Confidence {
    fn from(level: ConfidenceLevel) -> Self {
        match level {
            ConfidenceLevel::Low => Confidence::Low,
            ConfidenceLevel::Medium => Confidence::Medium,
            ConfidenceLevel::High => Confidence::High,
        }
    }
}