removed unused cli argument, snippet-length

This commit is contained in:
Mick Grove 2025-08-10 17:27:36 -07:00
commit f4a1e85b26
10 changed files with 0 additions and 2283 deletions

View file

@ -1,132 +0,0 @@
use clap::{Args, ValueEnum};
use strum::Display;
use tracing::debug;
use crate::{
cli::{
commands::{
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::RuleSpecifierArgs,
},
global::RAM_GB,
},
rules::rule::Confidence,
};
/// Compute the default value for `--jobs`.
///
/// * Starts from twice the number of logical CPUs (one CPU is assumed when the
///   count cannot be queried).
/// * When the amount of RAM is known, the result is limited to one job per GiB
///   (RAM rounded up to a whole number of GiB) and never drops below 1.
/// * When the amount of RAM is unknown, the CPU-based value is used as is.
/// * The chosen value is reported through `tracing` at DEBUG level.
fn default_scan_jobs() -> usize {
    // Logical CPU count; any error is treated as "one CPU".
    let cpu_count = match std::thread::available_parallelism() {
        Ok(n) => n.get(),
        Err(_) => 1,
    };
    let desired = cpu_count * 2;

    if let Some(ram_gb) = *RAM_GB {
        // One job per (rounded-up) GiB of RAM, but always at least one job.
        let max_by_ram = ram_gb.ceil() as usize;
        let jobs = std::cmp::max(1, std::cmp::min(desired, max_by_ram));
        debug!(
            "Using {jobs} parallel scan jobs \
             (cpus = {cpu_count}, desired = {desired}, \
             ram = {ram_gb:.1} GiB, cap_by_ram = {max_by_ram})"
        );
        return jobs;
    }

    // No RAM information: fall back to the CPU-based target.
    debug!("Using {desired} parallel scan jobs (cpus = {cpu_count}, ram unknown)");
    desired
}
/// `kingfisher scan` command and flags
// NOTE: the `///` comments on the fields below are deliberately left untouched: clap's
// derive macro uses field doc comments as `--help` text, so rewording them would change
// user-visible output. Maintainer notes are therefore written as plain `//` comments.
#[derive(Args, Debug, Clone)]
pub struct ScanArgs {
/// Number of parallel scanning threads
// Exposed as `--jobs` / `-j`; the default is computed by `default_scan_jobs()`.
#[arg(long = "jobs", short = 'j', default_value_t = default_scan_jobs())]
pub num_jobs: usize,
// Rule selection flags, flattened into this command.
#[command(flatten)]
pub rules: RuleSpecifierArgs,
// Input source flags, flattened into this command.
#[command(flatten)]
pub input_specifier_args: InputSpecifierArgs,
// Content filtering flags, flattened into this command.
#[command(flatten)]
pub content_filtering_args: ContentFilteringArgs,
/// Minimum confidence level for reporting findings
// `--confidence` / `-c`, defaults to "medium".
#[arg(long, short = 'c', default_value = "medium")]
pub confidence: ConfidenceLevel,
/// Disable secret validation
// `--no-validate` / `-n`, off by default.
#[arg(long, short = 'n', default_value_t = false)]
pub no_validate: bool,
/// Display only validated findings
#[arg(long, default_value_t = false)]
pub only_valid: bool,
/// Override the default minimum entropy threshold
// `None` when the flag is not given.
#[arg(long, short = 'e')]
pub min_entropy: Option<f32>,
/// Show performance statistics for each rule
#[arg(long, default_value_t = false)]
pub rule_stats: bool,
/// Display every occurrence of a finding
#[arg(long, default_value_t = false)]
pub no_dedup: bool,
/// Redact findings values using a secure hash
#[arg(long, short = 'r', default_value_t = false)]
pub redact: bool,
/// Timeout for Git repository scanning in seconds
// Defaults to 1800 seconds (30 minutes).
#[arg(long, default_value_t = 1800, value_name = "SECONDS")]
pub git_repo_timeout: u64,
// Report destination/format flags, flattened into this command.
#[command(flatten)]
pub output_args: OutputArgs<ReportOutputFormat>,
/// Bytes of context before and after each match
// Defaults to 256 bytes.
#[arg(long, default_value_t = 256, value_name = "BYTES")]
pub snippet_length: usize,
/// Baseline file to filter known secrets
#[arg(long, value_name = "FILE")]
pub baseline_file: Option<std::path::PathBuf>,
/// Create or update the baseline file with current findings
#[arg(long, default_value_t = false)]
pub manage_baseline: bool,
}
/// Confidence levels for findings
// Accepted on the command line through clap's `ValueEnum`; `Display` is derived by strum
// and renders the variant names in kebab-case.
// The derived `PartialOrd`/`Ord` follow declaration order, so `Low < Medium < High`.
// Variants are documented with `//` only, so the generated CLI help is not affected.
#[derive(Copy, Clone, Debug, Display, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
#[strum(serialize_all = "kebab-case")]
pub enum ConfidenceLevel {
Low,
Medium,
High,
}
/// Converts the CLI-facing confidence level into the rule-side `Confidence`,
/// variant for variant.
impl From<ConfidenceLevel> for Confidence {
    fn from(value: ConfidenceLevel) -> Self {
        match value {
            ConfidenceLevel::High => Self::High,
            ConfidenceLevel::Medium => Self::Medium,
            ConfidenceLevel::Low => Self::Low,
        }
    }
}

View file

@ -1,528 +0,0 @@
// ────────────────────────────────────────────────────────────
// Global allocator setup
// * Default - mimalloc (no feature flags)
// * Debug - jemalloc (`use-jemalloc` feature)
// * Fallback - system allocator (`system-alloc` feature)
// ────────────────────────────────────────────────────────────
// Exactly one `GLOBAL` static is meant to be compiled in, selected by Cargo features.
// NOTE(review): the jemalloc and system-allocator items are gated only on their own
// feature, so enabling `use-jemalloc` and `system-alloc` together would define `GLOBAL`
// (and a global allocator) twice — confirm the two features are never combined.

// --- jemalloc (opt-in) ---
// Compiled only when the `use-jemalloc` feature is enabled.
#[cfg(feature = "use-jemalloc")]
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
// --- mimalloc (default) ---
// Compiled when neither `use-jemalloc` nor `system-alloc` is enabled.
#[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))]
#[global_allocator]
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
// --- system allocator (explicit opt-out) ---
// Compiled only when the `system-alloc` feature is enabled.
#[cfg(feature = "system-alloc")]
use std::alloc::System;
#[cfg(feature = "system-alloc")]
#[global_allocator]
static GLOBAL: System = System;
use std::{
io::Read,
sync::{Arc, Mutex},
};
use anyhow::{Context, Result};
use kingfisher::{
cli::{
self,
commands::{
github::{
GitCloneMode, GitHistoryMode, GitHubCommand, GitHubRepoType, GitHubReposCommand,
},
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::{
RuleSpecifierArgs, RulesCheckArgs, RulesCommand, RulesListArgs,
RulesListOutputFormat,
},
},
global::Command,
CommandLineArgs, GlobalArgs,
},
findings_store,
findings_store::FindingsStore,
github,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::{load_and_record_rules, run_scan},
update::check_for_update,
};
use serde_json::json;
use tempfile::TempDir;
use term_size;
use tokio::runtime::Builder;
use tracing::{error, info, warn};
use tracing_core::metadata::LevelFilter;
use tracing_subscriber::{
self, fmt, prelude::__tracing_subscriber_SubscriberExt, registry, util::SubscriberInitExt,
};
use url::Url;
use crate::cli::commands::gitlab::{GitLabCommand, GitLabRepoType, GitLabReposCommand};
/// Process entry point: parses the command line, builds a multi-threaded Tokio
/// runtime and runs `async_main` on it until completion.
///
/// # Errors
/// Returns an error when the runtime cannot be created or when `async_main` fails.
fn main() -> anyhow::Result<()> {
    color_backtrace::install();
    // Parse command-line arguments
    let args = CommandLineArgs::parse_args();
    // `scan` honours its `--jobs` value; every other command uses the CPU count.
    let requested_jobs = match args.command {
        Command::Scan(ref scan_args) => scan_args.num_jobs,
        Command::GitHub(_) | Command::GitLab(_) | Command::Rules(_) => num_cpus::get(),
    };
    // Tokio's builder panics when asked for zero worker threads (e.g. `--jobs 0`),
    // so always request at least one.
    let num_jobs = requested_jobs.max(1);
    // Set up the Tokio runtime with the chosen number of worker threads
    let runtime = Builder::new_multi_thread()
        .worker_threads(num_jobs)
        .enable_all()
        .build()
        .context("Failed to create Tokio runtime")?;
    runtime.block_on(async_main(args))
}
fn setup_logging(global_args: &GlobalArgs) {
// Determine log level based on global verbosity
let (level, all_targets) = if global_args.quiet {
(LevelFilter::ERROR, false)
} else {
let level = match global_args.verbose {
0 => LevelFilter::INFO, // Default level if no `-v` is provided
1 => LevelFilter::DEBUG, // `-v`
2 => LevelFilter::TRACE, // `-vv`
_ => LevelFilter::TRACE, // `-vvv` or more
};
let all_targets = global_args.verbose > 2; // Enable all targets for `-vvv` or more
(level, all_targets)
};
// Create a filter for logging
let filter = if all_targets {
// Enable TRACE for all modules
tracing_subscriber::filter::Targets::new().with_default(LevelFilter::TRACE)
} else {
// Per-target filtering, only TRACE for `kingfisher`
tracing_subscriber::filter::Targets::new()
.with_default(LevelFilter::ERROR) // Default for all modules
.with_target("kingfisher", level) // Replace `kingfisher` with your
// crate's name
};
// Configure the formatter layer
let fmt_layer = fmt::layer()
.with_writer(std::io::stderr) // Write logs to stderr
.with_target(true) // Enable target filtering
.with_ansi(false) // Disable colors
.without_time(); // Remove timestamps
// Build and initialize the registry
registry()
.with(fmt_layer) // Attach the formatter layer
.with(filter) // Attach the filter
.init();
}
/// Derive the process exit code from the findings held in `datastore`.
///
/// Only matches flagged as `visible` are considered:
/// * `0`   — no visible findings,
/// * `200` — visible findings exist, none with `validation_success`,
/// * `205` — at least one visible finding has `validation_success`.
///
/// # Panics
/// Panics if the datastore mutex is poisoned.
pub fn determine_exit_code(datastore: &Arc<Mutex<findings_store::FindingsStore>>) -> i32 {
    let store = datastore.lock().unwrap();
    let mut saw_visible = false;
    for entry in store.get_matches().iter() {
        let (_, _, found) = &**entry;
        if !found.visible {
            continue;
        }
        if found.validation_success {
            // A single validated, visible finding decides the outcome.
            return 205;
        }
        saw_visible = true;
    }
    if saw_visible {
        200
    } else {
        0
    }
}
/// Asynchronous body of the program: prepares shared state, then dispatches on
/// the parsed sub-command.
///
/// * `scan` — when no path (or `-`) is given and stdin is not a terminal, stdin
///   is spooled into a file inside the temporary directory and scanned instead.
///   The rules are loaded, the scan is run, and the process is terminated with
///   the code from `determine_exit_code`. Because of that `process::exit`, the
///   update notice logged at the end of this function is only reached by the
///   other commands.
/// * `rules check` / `rules list` — delegate to `run_rules_check` / `run_rules_list`.
/// * `github repos list` / `gitlab repos list` — call the respective
///   `list_repositories` function.
///
/// # Errors
/// Propagates any failure from temporary-directory creation, stdin handling,
/// rule loading, scanning, or the repository listing calls.
async fn async_main(args: CommandLineArgs) -> Result<()> {
    // Create a temporary directory
    let temp_dir = TempDir::new().context("Failed to create temporary directory")?;
    let clone_dir = temp_dir.path().to_path_buf();
    // Create the in-memory datastore
    let datastore = Arc::new(Mutex::new(FindingsStore::new(clone_dir)));
    setup_logging(&args.global_args);
    let update_msg = check_for_update(&args.global_args, None);
    match args.command {
        Command::Scan(mut scan_args) => {
            // The flag is declared as `--jobs` / `-j`, so that is what the hint names.
            info!(
                "Launching with {} concurrent scan jobs. Use --jobs to override.",
                scan_args.num_jobs
            );
            // If no paths or a single "-", slurp stdin into a temp file
            let paths = &scan_args.input_specifier_args.path_inputs;
            let is_dash = paths.iter().any(|p| p.as_os_str() == "-");
            if (paths.is_empty() || is_dash) && !atty::is(atty::Stream::Stdin) {
                // read all stdin
                let mut buf = Vec::new();
                std::io::stdin().read_to_end(&mut buf)?;
                // write into temp_dir
                let stdin_file = temp_dir.path().join("stdin_input");
                std::fs::write(&stdin_file, buf)?;
                // replace inputs
                scan_args.input_specifier_args.path_inputs = vec![stdin_file.into()];
            }
            let rules_db = Arc::new(load_and_record_rules(&scan_args, &datastore)?);
            run_scan(&args.global_args, &scan_args, &rules_db, Arc::clone(&datastore)).await?;
            let exit_code = determine_exit_code(&datastore);
            // `process::exit` runs no destructors, so remove the directory explicitly.
            if let Err(e) = temp_dir.close() {
                eprintln!("Failed to close temporary directory: {}", e);
            }
            std::process::exit(exit_code);
        }
        Command::Rules(ref rule_args) => match &rule_args.command {
            RulesCommand::Check(check_args) => {
                run_rules_check(check_args)?;
            }
            RulesCommand::List(list_args) => {
                run_rules_list(list_args)?;
            }
        },
        Command::GitHub(github_args) => match github_args.command {
            GitHubCommand::Repos(repos_command) => match repos_command {
                GitHubReposCommand::List(list_args) => {
                    github::list_repositories(
                        github_args.github_api_url,
                        args.global_args.ignore_certs,
                        args.global_args.use_progress(),
                        &list_args.repo_specifiers.user,
                        &list_args.repo_specifiers.organization,
                        list_args.repo_specifiers.all_organizations,
                        list_args.repo_specifiers.repo_type.into(),
                    )
                    .await?;
                }
            },
        },
        Command::GitLab(gitlab_args) => match gitlab_args.command {
            GitLabCommand::Repos(repos_command) => match repos_command {
                GitLabReposCommand::List(list_args) => {
                    kingfisher::gitlab::list_repositories(
                        gitlab_args.gitlab_api_url,
                        args.global_args.ignore_certs,
                        args.global_args.use_progress(),
                        &list_args.repo_specifiers.user,
                        &list_args.repo_specifiers.group,
                        list_args.repo_specifiers.all_groups,
                        list_args.repo_specifiers.repo_type.into(),
                    )
                    .await?;
                }
            },
        },
    }
    if let Some(msg) = update_msg {
        info!("{msg}");
    }
    Ok(())
}
/// Build a `ScanArgs` value with fixed settings, used by the `rules`
/// sub-commands when they call the rule loader (no inputs, validation and
/// archive extraction disabled, a single job).
fn create_default_scan_args() -> cli::commands::scan::ScanArgs {
    use cli::commands::scan::*;

    // Rule selection: every built-in rule, no extra rule paths.
    let rules = RuleSpecifierArgs {
        rules_path: Vec::new(),
        rule: vec!["all".into()],
        load_builtins: true,
    };

    // No inputs of any kind; service endpoints point at the public APIs.
    let input_specifier_args = InputSpecifierArgs {
        path_inputs: Vec::new(),
        git_url: Vec::new(),
        // GitHub
        github_user: Vec::new(),
        github_organization: Vec::new(),
        all_github_organizations: false,
        github_api_url: Url::parse("https://api.github.com/").unwrap(),
        github_repo_type: GitHubRepoType::Source,
        // GitLab
        gitlab_user: Vec::new(),
        gitlab_group: Vec::new(),
        all_gitlab_groups: false,
        gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
        gitlab_repo_type: GitLabRepoType::All,
        // Jira
        jira_url: None,
        jql: None,
        max_results: 100,
        // S3
        s3_bucket: None,
        s3_prefix: None,
        role_arn: None,
        aws_local_profile: None,
        // Slack
        slack_query: None,
        slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
        // Docker images
        docker_image: Vec::new(),
        // Git clone / history handling
        git_clone: GitCloneMode::Bare,
        git_history: GitHistoryMode::Full,
        scan_nested_repos: true,
        commit_metadata: true,
    };

    let content_filtering_args = ContentFilteringArgs {
        max_file_size_mb: 25.0,
        no_extract_archives: true,
        extraction_depth: 2,
        exclude: Vec::new(),
        no_binary: true,
    };

    let output_args = OutputArgs { output: None, format: ReportOutputFormat::Pretty };

    ScanArgs {
        num_jobs: 1,
        rules,
        input_specifier_args,
        content_filtering_args,
        confidence: ConfidenceLevel::Medium,
        no_validate: true,
        only_valid: false,
        min_entropy: None,
        rule_stats: false,
        no_dedup: false,
        redact: false,
        git_repo_timeout: 1800,
        output_args,
        snippet_length: 256,
        baseline_file: None,
        manage_baseline: false,
    }
}
/// Run the `rules check` command.
///
/// Loads the selected rules and, for each rule, reports:
/// * warnings — a name shorter than 3 bytes, a pattern shorter than 5 bytes,
///   or no examples;
/// * errors — a pattern that fails `as_regex()`, or an example that the rule's
///   compiled regex does not match.
///
/// The regex check runs for every rule, including rules without examples, so
/// an invalid pattern is never hidden behind the "no examples" warning.
///
/// A summary is printed to stdout. The process exits with status 1 when there
/// is at least one error, or at least one warning while
/// `args.warnings_as_errors` is set.
///
/// # Errors
/// Returns an error when the rules cannot be loaded, resolved, or compiled
/// into a `RulesDatabase`.
///
/// # Panics
/// Panics if the database has no compiled regex for a rule it contains.
pub fn run_rules_check(args: &RulesCheckArgs) -> Result<()> {
    let mut num_errors = 0;
    let mut num_warnings = 0;
    // Load and check rules
    let loader = RuleLoader::from_rule_specifiers(&args.rules);
    let loaded = loader.load(&create_default_scan_args())?;
    let resolved = loaded.resolve_enabled_rules()?;
    let rules_db = RulesDatabase::from_rules(resolved.into_iter().cloned().collect())?;
    // Check each rule
    for (rule_index, rule) in rules_db.rules().iter().enumerate() {
        let rule_syntax = rule.syntax();
        // Basic rule validation checks
        if rule.name().len() < 3 {
            warn!("Rule '{}' has a very short name", rule.name());
            num_warnings += 1;
        }
        if rule_syntax.pattern.len() < 5 {
            warn!("Rule '{}' has a very short pattern", rule.name());
            num_warnings += 1;
        }
        let has_examples = !rule_syntax.examples.is_empty();
        if !has_examples {
            warn!("Rule '{}' has no examples", rule.name());
            num_warnings += 1;
        }
        // Check regex compilation (also for rules that have no examples)
        if let Err(e) = rule_syntax.as_regex() {
            error!("Rule '{}' has invalid regex: {}", rule.name(), e);
            num_errors += 1;
            continue;
        }
        if !has_examples {
            continue;
        }
        // The compiled regex is the same for every example of this rule.
        let re = rules_db.get_regex_by_rule_id(rule.id()).expect("Failed to get regex for rule");
        // Test each example against the rule's regex
        for (example_index, example) in rule_syntax.examples.iter().enumerate() {
            let regex_matched = re.is_match(example.as_bytes());
            if !regex_matched {
                println!("\nTesting rule {} - {}", rule_index + 1, rule_syntax.name);
                println!(" Processing example {}", example_index + 1);
                println!(" [!] Mismatch detected for example: {}", example);
                println!(" Regex match: {}", regex_matched);
                num_errors += 1;
            }
        }
    }
    // Print summary
    if num_errors > 0 || num_warnings > 0 {
        println!("\nCheck Summary:");
        println!(" Errors: {}", num_errors);
        println!(" Warnings: {}", num_warnings);
        println!("\nError types include:");
        println!(" - Invalid regex patterns");
        println!(" - Examples that don't match their patterns");
        println!("\nWarning types include:");
        println!(" - Rules with very short names");
        println!(" - Rules with very short patterns");
        println!(" - Rules without examples");
    } else {
        println!("\nAll rules passed validation successfully!");
    }
    // Exit with error if there are errors or if warnings are treated as errors
    if num_errors > 0 || (args.warnings_as_errors && num_warnings > 0) {
        std::process::exit(1);
    }
    Ok(())
}
/// Run the `rules list` command.
///
/// Loads the selected rules and writes them to the writer obtained from
/// `args.output_args`, either as an aligned text table (`Pretty`) or as a
/// pretty-printed JSON array (`Json`).
///
/// In the table, each pattern is collapsed onto one line and shortened to the
/// width left over after the name, id and confidence columns; the terminal
/// width is used when it can be determined, 120 columns otherwise.
///
/// # Errors
/// Returns an error when the rules cannot be loaded or resolved, when the
/// output writer cannot be created, or when writing/serialising fails.
pub fn run_rules_list(args: &RulesListArgs) -> Result<()> {
    // `writeln!` needs the `io::Write` trait in scope; it is imported anonymously
    // so it cannot clash with any file-level import.
    use std::io::Write as _;

    // Load rules
    let loader = RuleLoader::from_rule_specifiers(&args.rules);
    let loaded = loader.load(&create_default_scan_args())?;
    let resolved = loaded.resolve_enabled_rules()?;
    let mut writer = args.output_args.get_writer()?;
    match args.output_args.format {
        RulesListOutputFormat::Pretty => {
            // Determine terminal width if possible, otherwise use default
            let term_width = term_size::dimensions().map(|(w, _)| w).unwrap_or(120);
            // First pass: column widths, never narrower than the column headers
            let max_name_width =
                resolved.iter().map(|r| r.name().len()).max().unwrap_or(0).max(4); // "Rule"
            let max_id_width = resolved.iter().map(|r| r.id().len()).max().unwrap_or(0).max(2); // "ID"
            let max_conf_width = resolved
                .iter()
                .map(|r| format!("{:?}", r.confidence()).len())
                .max()
                .unwrap_or(0)
                .max(10); // "Confidence"
            // Whatever is left of the terminal width goes to the pattern column
            let reserved_width = max_name_width + max_id_width + max_conf_width + 10;
            let pattern_width = term_width.saturating_sub(reserved_width);
            // Print header
            writeln!(
                writer,
                "\n{:name_width$} │ {:id_width$} │ {:conf_width$} │ Pattern",
                "Rule",
                "ID",
                "Confidence",
                name_width = max_name_width,
                id_width = max_id_width,
                conf_width = max_conf_width
            )?;
            // Print separator
            writeln!(
                writer,
                "{0:─<name_width$} ┼ {0:─<id_width$} ┼ {0:─<conf_width$} ┼ {0:─<pattern_width$}",
                "",
                name_width = max_name_width,
                id_width = max_id_width,
                conf_width = max_conf_width,
                pattern_width = pattern_width
            )?;
            // Print each rule
            for rule in resolved {
                let formatted_pattern = single_line_pattern(&rule.syntax().pattern, pattern_width);
                writeln!(
                    writer,
                    "{:name_width$} │ {:id_width$} │ {:conf_width$} │ {}",
                    rule.name(),
                    rule.id(),
                    format!("{:?}", rule.confidence()),
                    formatted_pattern,
                    name_width = max_name_width,
                    id_width = max_id_width,
                    conf_width = max_conf_width
                )?;
            }
            writeln!(writer)?;
        }
        RulesListOutputFormat::Json => {
            // Create JSON format
            let rules_json: Vec<_> = resolved
                .iter()
                .map(|rule| {
                    json!({
                        "name": rule.name(),
                        "id": rule.id(),
                        "pattern": rule.syntax().pattern,
                        "confidence": rule.confidence(),
                        "examples": rule.syntax().examples,
                        "visible": rule.visible(),
                    })
                })
                .collect();
            serde_json::to_writer_pretty(&mut writer, &rules_json)?;
            writeln!(writer)?;
        }
    }
    Ok(())
}

/// Collapse every run of whitespace in `pattern` into a single space and, when
/// the result is longer than `max_width` bytes, cut it to `max_width - 3` bytes
/// (saturating at zero) and append `...`.
///
/// The cut is moved back to the nearest UTF-8 character boundary, so patterns
/// containing multi-byte characters cannot cause a slicing panic.
fn single_line_pattern(pattern: &str, max_width: usize) -> String {
    // `split_whitespace` already treats `\n` and `\r` as separators.
    let single_line = pattern.split_whitespace().collect::<Vec<_>>().join(" ");
    if single_line.len() <= max_width {
        return single_line;
    }
    // `cut` is below `single_line.len()` here, and index 0 is always a boundary,
    // so this loop terminates.
    let mut cut = max_width.saturating_sub(3);
    while !single_line.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &single_line[..cut])
}

View file

@ -1,270 +0,0 @@
use super::*;
impl DetailsReporter {
    /// Serialise all finding records as one pretty-printed JSON document
    /// followed by a newline. Nothing is written when there are no records.
    ///
    /// # Errors
    /// Fails when the records cannot be built, serialised, or written.
    pub fn json_format<W: std::io::Write>(
        &self,
        mut writer: W,
        args: &cli::commands::scan::ScanArgs,
    ) -> Result<()> {
        let records = self.build_finding_records(args)?;
        if records.is_empty() {
            return Ok(());
        }
        serde_json::to_writer_pretty(&mut writer, &records)?;
        writeln!(writer)?;
        Ok(())
    }

    /// Serialise the finding records as JSON Lines: one compact JSON value per
    /// record, each terminated by a newline.
    ///
    /// # Errors
    /// Fails when the records cannot be built, serialised, or written.
    pub fn jsonl_format<W: std::io::Write>(
        &self,
        mut writer: W,
        args: &cli::commands::scan::ScanArgs,
    ) -> Result<()> {
        let records = self.build_finding_records(args)?;
        for entry in records.iter() {
            serde_json::to_writer(&mut writer, entry)?;
            writeln!(writer)?;
        }
        Ok(())
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::cli::commands::github::GitCloneMode;
use crate::cli::commands::github::GitHistoryMode;
use crate::cli::commands::rules::RuleSpecifierArgs;
use crate::matcher::{SerializableCapture, SerializableCaptures};
use crate::util::intern;
use crate::{
blob::BlobId,
cli::commands::github::GitHubRepoType,
cli::commands::inputs::ContentFilteringArgs,
cli::commands::inputs::InputSpecifierArgs,
cli::commands::output::{OutputArgs, ReportOutputFormat},
cli::commands::scan::ConfidenceLevel,
findings_store::FindingsStore,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::Match,
origin::Origin,
reporter::styles::Styles,
};
use std::{
io::Cursor,
path::PathBuf,
sync::{Arc, Mutex},
};
use url::Url;
/// Build the `ScanArgs` value shared by the reporter tests: no inputs, all
/// built-in rules, medium confidence, pretty output.
fn create_default_args() -> cli::commands::scan::ScanArgs {
    use crate::cli::commands::gitlab::GitLabRepoType;

    let rules = RuleSpecifierArgs {
        rules_path: Vec::new(),
        rule: vec!["all".into()],
        load_builtins: true,
    };

    let input_specifier_args = InputSpecifierArgs {
        // Local paths and plain git URLs
        path_inputs: Vec::new(),
        git_url: Vec::new(),
        // GitHub
        github_user: Vec::new(),
        github_organization: Vec::new(),
        all_github_organizations: false,
        github_api_url: Url::parse("https://api.github.com/").unwrap(),
        github_repo_type: GitHubRepoType::Source,
        // GitLab
        gitlab_user: Vec::new(),
        gitlab_group: Vec::new(),
        all_gitlab_groups: false,
        gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
        gitlab_repo_type: GitLabRepoType::All,
        // Jira
        jira_url: None,
        jql: None,
        max_results: 100,
        // Slack
        slack_query: None,
        slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
        // S3
        s3_bucket: None,
        s3_prefix: None,
        role_arn: None,
        aws_local_profile: None,
        // Docker images
        docker_image: Vec::new(),
        // Clone / history handling
        git_clone: GitCloneMode::Bare,
        git_history: GitHistoryMode::Full,
        scan_nested_repos: true,
        commit_metadata: true,
    };

    let content_filtering_args = ContentFilteringArgs {
        max_file_size_mb: 25.0,
        no_extract_archives: false,
        extraction_depth: 2,
        exclude: Vec::new(),
        no_binary: true,
    };

    let output_args = OutputArgs { output: None, format: ReportOutputFormat::Pretty };

    cli::commands::scan::ScanArgs {
        num_jobs: 1,
        rules,
        input_specifier_args,
        content_filtering_args,
        confidence: ConfidenceLevel::Medium,
        no_validate: false,
        only_valid: false,
        min_entropy: None,
        rule_stats: false,
        no_dedup: false,
        redact: false,
        git_repo_timeout: 1800, // 30 minutes
        output_args,
        snippet_length: 256,
        baseline_file: None,
        manage_baseline: false,
    }
}
/// Build a `Match` fixture with fixed location, capture and validation data.
///
/// Only the rule name, rule text id, rule finding fingerprint and the
/// validation outcome vary between callers; everything else is constant.
fn create_mock_match(
    rule_name: &str,
    rule_text_id: &str,
    rule_finding_fingerprint: &str,
    validation_success: bool,
) -> Match {
    Match {
        location: Location {
            offset_span: OffsetSpan { start: 10, end: 20 },
            source_span: SourceSpan {
                start: SourcePoint { line: 5, column: 10 },
                end: SourcePoint { line: 5, column: 20 },
            },
        },
        groups: SerializableCaptures {
            captures: vec![SerializableCapture {
                name: Some("token".to_string()),
                match_number: 1,
                start: 10,
                end: 20,
                value: "mock_token".into(),
            }],
        },
        blob_id: BlobId::new(b"mock_blob"),
        // Written without a leading zero: Rust reads `0123` as decimal 123, but the
        // zero prefix looks like an octal literal to readers.
        finding_fingerprint: 123,
        rule_finding_fingerprint: intern(rule_finding_fingerprint),
        rule_text_id: intern(rule_text_id),
        rule_name: intern(rule_name),
        rule_confidence: Confidence::Medium,
        validation_response_body: "validation response".to_string(),
        validation_response_status: 200,
        validation_success,
        calculated_entropy: 4.5,
        visible: true,
    }
}
/// Build a `DetailsReporter` backed by a fresh `FindingsStore` that holds one
/// recorded entry per element of `matches`.
///
/// Every entry is recorded with the same mock origin (`/mock/path/file.rs`) and
/// the same blob metadata; only the inner `Match` of each `ReportMatch` is used.
/// Entries are recorded one at a time with deduplication enabled.
fn setup_mock_reporter(matches: Vec<ReportMatch>) -> DetailsReporter {
    let mut datastore = FindingsStore::new(PathBuf::from("/tmp"));
    let blob_metadata = BlobMetadata {
        id: BlobId::new(b"mock_blob"),
        num_bytes: 1024,
        mime_essence: Some("text/plain".to_string()),
        charset: Some("UTF-8".to_string()),
        language: Some("Rust".to_string()),
    };
    let dedup = true;
    // Borrow the matches instead of cloning the whole vector; an empty vector
    // simply records nothing.
    for m in &matches {
        datastore.record(
            vec![(
                Arc::new(OriginSet::new(
                    Origin::from_file(PathBuf::from("/mock/path/file.rs")),
                    vec![],
                )),
                Arc::new(blob_metadata.clone()),
                m.m.clone(),
            )],
            dedup,
        );
    }
    DetailsReporter {
        datastore: Arc::new(Mutex::new(datastore)),
        styles: Styles::new(false),
        only_valid: false,
    }
}
#[test]
fn test_json_format() -> Result<()> {
    // One validated mock finding located in a mock Rust source file.
    let source_file = PathBuf::from("/mock/path/file.rs");
    let blob_metadata = BlobMetadata {
        id: BlobId::new(b"mock_blob"),
        num_bytes: 1024,
        mime_essence: Some("text/plain".to_string()),
        charset: Some("UTF-8".to_string()),
        language: Some("Rust".to_string()),
    };
    let report_match = ReportMatch {
        origin: OriginSet::new(Origin::from_file(source_file), vec![]),
        blob_metadata,
        m: create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true),
        comment: None,
        match_confidence: Confidence::Medium,
        visible: true,
        validation_response_body: "validation response".to_string(),
        validation_response_status: 200,
        validation_success: true,
    };

    // Render to an in-memory buffer and parse the result back.
    let reporter = setup_mock_reporter(vec![report_match]);
    let mut buffer = Cursor::new(Vec::new());
    reporter.json_format(&mut buffer, &create_default_args())?;
    let parsed: Vec<serde_json::Value> = serde_json::from_slice(&buffer.into_inner())?;

    assert!(!parsed.is_empty(), "JSON output should not be empty");
    let record = &parsed[0];
    assert_eq!(record["rule"]["name"], "MockRule");
    assert_eq!(record["finding"]["language"], "Rust");
    Ok(())
}
#[test]
fn test_validation_status_in_json() -> Result<()> {
    // (validation outcome, status string expected in the JSON report)
    for (validation_success, expected_status) in
        [(true, "Active Credential"), (false, "Inactive Credential")]
    {
        let source_file = PathBuf::from("/mock/path/file.rs");
        let blob_metadata = BlobMetadata {
            id: BlobId::new(b"mock_blob"),
            num_bytes: 1024,
            mime_essence: Some("text/plain".to_string()),
            charset: Some("UTF-8".to_string()),
            language: Some("Rust".to_string()),
        };
        let report_match = ReportMatch {
            origin: OriginSet::new(Origin::from_file(source_file), vec![]),
            blob_metadata,
            m: create_mock_match(
                "MockRule",
                "mock_rule_1",
                "mock_finding_fingerprint",
                validation_success,
            ),
            comment: None,
            match_confidence: Confidence::Medium,
            visible: true,
            validation_response_body: "validation response".to_string(),
            validation_response_status: 200,
            validation_success,
        };

        let reporter = setup_mock_reporter(vec![report_match]);
        let mut buffer = Cursor::new(Vec::new());
        reporter.json_format(&mut buffer, &create_default_args())?;
        let parsed: Vec<serde_json::Value> = serde_json::from_slice(&buffer.into_inner())?;

        assert!(!parsed.is_empty(), "JSON output should not be empty");
        let record = &parsed[0];
        let validation_status = record["finding"]["validation"]["status"].as_str().unwrap();
        assert_eq!(validation_status, expected_status);
    }
    Ok(())
}
}

View file

@ -1,171 +0,0 @@
//! Proves that run_async_scan collapses identical findings when
//! ── no_dedup == false ──
//! while keeping them separate when no_dedup == true.
use std::{
fs,
sync::{Arc, Mutex},
};
use anyhow::Result;
use kingfisher::{
cli::{
commands::{
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
gitlab::GitLabRepoType,
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::RuleSpecifierArgs,
scan::{ConfidenceLevel, ScanArgs},
},
global::{AdvancedArgs, Mode},
GlobalArgs,
},
findings_store::FindingsStore,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::run_async_scan,
};
use tempfile::TempDir;
use tokio::runtime::Runtime;
use url::Url;
/// Helper: run a scan with the supplied `no_dedup` flag and return how many
/// findings the `FindingsStore` ends up containing.
///
/// Each call works in its own temporary directory containing one custom rule
/// and two input files that hold the same secret. The scan itself is driven on
/// `count_rt`, the Tokio runtime supplied by the caller.
///
/// # Errors
/// Propagates any I/O, rule-loading or scanning failure.
fn run_scan(count_rt: &Runtime, no_dedup: bool) -> Result<usize> {
    // ── temp workspace ──────────────────────────────────────────────
    let work = TempDir::new()?;
    let rules_dir = work.path().join("rules");
    fs::create_dir_all(&rules_dir)?;
    let inputs_dir = work.path().join("in");
    fs::create_dir_all(&inputs_dir)?;
    // 1. Tiny custom rule that matches `secret_1234`.
    //    The keys of the rule must be nested under the list item; YAML is
    //    indentation-sensitive, so the nesting below is significant.
    //    NOTE(review): nesting reconstructed from the YAML structure — confirm
    //    against the rule loader's expected schema.
    fs::write(
        rules_dir.join("demo.yml"),
        r#"
rules:
  - id: demo.secret
    name: Demo secret
    pattern: "secret_[0-9]{4}"
    confidence: low
"#,
    )?;
    // 2. Two different blobs that both contain the SAME secret
    fs::write(inputs_dir.join("a.txt"), "secret_1234\n")?;
    fs::write(inputs_dir.join("b.txt"), "secret_1234\n")?;
    // ── build ScanArgs ──────────────────────────────────────────────
    let scan_args = ScanArgs {
        num_jobs: 2,
        rules: RuleSpecifierArgs {
            rules_path: vec![rules_dir.clone()],
            rule: vec!["all".into()],
            load_builtins: false,
        },
        input_specifier_args: InputSpecifierArgs {
            path_inputs: vec![inputs_dir.join("a.txt"), inputs_dir.join("b.txt")],
            git_url: Vec::new(),
            github_user: Vec::new(),
            github_organization: Vec::new(),
            all_github_organizations: false,
            github_api_url: Url::parse("https://api.github.com/").unwrap(),
            github_repo_type: GitHubRepoType::Source,
            // GitLab
            gitlab_user: Vec::new(),
            gitlab_group: Vec::new(),
            all_gitlab_groups: false,
            gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
            gitlab_repo_type: GitLabRepoType::Owner,
            jira_url: None,
            jql: None,
            max_results: 100,
            slack_query: None,
            slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
            // s3
            s3_bucket: None,
            s3_prefix: None,
            role_arn: None,
            aws_local_profile: None,
            // Docker image scanning
            docker_image: Vec::new(),
            // git clone / history options
            git_clone: GitCloneMode::Bare,
            git_history: GitHistoryMode::Full,
            scan_nested_repos: true,
            commit_metadata: true,
        },
        content_filtering_args: ContentFilteringArgs {
            max_file_size_mb: 5.0,
            extraction_depth: 1,
            no_binary: true,
            no_extract_archives: false,
            exclude: Vec::new(), // Exclude patterns
        },
        confidence: ConfidenceLevel::Low,
        no_validate: true,
        rule_stats: false,
        only_valid: false,
        min_entropy: Some(0.0),
        redact: false,
        git_repo_timeout: 1800, // 30 minutes
        output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
        no_dedup,
        snippet_length: 64,
        baseline_file: None,
        manage_baseline: false,
    };
    let global_args = GlobalArgs {
        verbose: 0,
        quiet: true,
        color: Mode::Never,
        progress: Mode::Never,
        no_update_check: false,
        self_update: false,
        ignore_certs: false,
        advanced: AdvancedArgs { rlimit_nofile: 8192 },
    };
    // ── load rules once ─────────────────────────────────────────────
    let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?;
    let resolved = loaded.resolve_enabled_rules()?;
    let rules_db = Arc::new(RulesDatabase::from_rules(resolved.into_iter().cloned().collect())?);
    // Fresh FindingsStore for this run
    let store_path = work.path().join("store");
    fs::create_dir_all(&store_path)?;
    let datastore = Arc::new(Mutex::new(FindingsStore::new(store_path)));
    // `run_async_scan` is async, so drive it on the supplied Tokio runtime
    count_rt.block_on(run_async_scan(
        &global_args,
        &scan_args,
        Arc::clone(&datastore),
        &rules_db,
    ))?;
    // Read the count into a local so the mutex guard is released before the
    // function's locals are dropped.
    let finding_count = datastore.lock().unwrap().get_matches().len();
    Ok(finding_count)
}
#[test]
fn test_dedup_branch() -> Result<()> {
    // Both scans share one runtime so it only has to be built once.
    let runtime = Runtime::new().unwrap();
    // `no_dedup = true` keeps duplicates, `no_dedup = false` collapses them.
    let kept = run_scan(&runtime, true)?;
    let collapsed = run_scan(&runtime, false)?;
    assert!(
        kept > collapsed,
        "expected deduplication to reduce finding count ({} -- {})",
        kept,
        collapsed
    );
    assert_eq!(collapsed, 1, "exactly one unique finding should remain after dedup");
    Ok(())
}

View file

@ -1,149 +0,0 @@
// tests/int_github.rs
use std::{
str::FromStr,
sync::{Arc, Mutex},
};
use anyhow::{Context, Result};
use kingfisher::{
cli::{
commands::{
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
gitlab::GitLabRepoType,
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::RuleSpecifierArgs,
scan::{ConfidenceLevel, ScanArgs},
},
global::{AdvancedArgs, Mode},
GlobalArgs,
},
findings_store::FindingsStore,
git_url::GitUrl,
scanner::{load_and_record_rules, run_scan},
};
use tempfile::TempDir;
use tokio::runtime::Runtime;
use url::Url;
/// Map finding counts to the expected process exit code.
///
/// * `0`   — no findings at all,
/// * `200` — findings exist but none of them validated,
/// * `205` — at least one validated finding.
fn determine_exit_code(total_findings: usize, validated_findings: usize) -> i32 {
    match (total_findings, validated_findings) {
        (0, _) => 0,
        (_, 0) => 200,
        _ => 205,
    }
}
/// Integration test: scans the `micksmix/SecretsTest` GitHub repository (given by URL) with
/// every built-in rule enabled and expects at least ten findings.
///
/// Validation stays enabled (`no_validate: false`) and de-duplication is off (`no_dedup: true`).
/// NOTE(review): the repository is addressed by its remote URL, so this presumably needs
/// network access to github.com — confirm before enabling it in a sandboxed CI job.
#[test]
fn test_github_remote_scan() -> Result<()> {
    // Scratch directory handed to the findings store.
    let scratch = TempDir::new().context("Failed to create temporary directory")?;
    let store_root = scratch.path().to_path_buf();

    let repo_url = GitUrl::from_str("https://github.com/micksmix/SecretsTest.git")
        .expect("Failed to parse Git URL");

    // Which rules to load: all built-ins, no extra rule files.
    let rules = RuleSpecifierArgs {
        rules_path: Vec::new(),
        rule: vec!["all".into()],
        load_builtins: true,
    };

    // What to scan: only the single Git URL above; every other source is left empty.
    let inputs = InputSpecifierArgs {
        path_inputs: Vec::new(),
        git_url: vec![repo_url],
        github_user: Vec::new(),
        github_organization: Vec::new(),
        all_github_organizations: false,
        github_api_url: Url::parse("https://api.github.com/").unwrap(),
        github_repo_type: GitHubRepoType::Source,
        // GitLab
        gitlab_user: Vec::new(),
        gitlab_group: Vec::new(),
        all_gitlab_groups: false,
        gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
        gitlab_repo_type: GitLabRepoType::Owner,
        // Jira
        jira_url: None,
        jql: None,
        max_results: 100,
        // Slack
        slack_query: None,
        slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
        // S3
        s3_bucket: None,
        s3_prefix: None,
        role_arn: None,
        aws_local_profile: None,
        // Docker image scanning
        docker_image: Vec::new(),
        // Git clone / history options
        git_clone: GitCloneMode::Bare,
        git_history: GitHistoryMode::Full,
        scan_nested_repos: true,
        commit_metadata: true,
    };

    let filters = ContentFilteringArgs {
        max_file_size_mb: 25.0,
        no_extract_archives: false,
        extraction_depth: 2,
        no_binary: true,
        exclude: Vec::new(), // no exclude patterns
    };

    let scan_args = ScanArgs {
        num_jobs: 2,
        rules,
        input_specifier_args: inputs,
        content_filtering_args: filters,
        confidence: ConfidenceLevel::Medium,
        no_validate: false,
        rule_stats: false,
        only_valid: false,
        min_entropy: None,
        redact: false,
        git_repo_timeout: 1800, // 30 minutes
        output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
        no_dedup: true,
        snippet_length: 256,
        baseline_file: None,
        manage_baseline: false,
    };

    let global_args = GlobalArgs {
        verbose: 0,
        quiet: false,
        color: Mode::Auto,
        progress: Mode::Auto,
        no_update_check: false,
        self_update: false,
        ignore_certs: false,
        advanced: AdvancedArgs { rlimit_nofile: 16384 },
    };

    // Findings are collected in a shared, mutex-guarded store.
    let datastore = Arc::new(Mutex::new(FindingsStore::new(store_root)));

    // The runtime is created before the rules are loaded, and dropped explicitly at the end.
    let rt = Runtime::new().expect("Failed to create Tokio runtime");
    let rules_db = Arc::new(load_and_record_rules(&scan_args, &datastore)?);

    // Drive the async scan to completion from this synchronous test.
    rt.block_on(async {
        run_scan(&global_args, &scan_args, &rules_db, Arc::clone(&datastore)).await
    })?;

    // Tally the results.
    let store = datastore.lock().unwrap();
    let found = store.get_matches();
    let total = found.len();
    let validated = found.iter().filter(|entry| entry.as_ref().2.validation_success).count();
    println!("Total findings: {}, Validated findings: {}", total, validated);

    assert!(total >= 10, "Expected at least 10 findings, but got {}", total);

    // Any non-empty result maps to an exit code of 200 or above.
    let exit_code = determine_exit_code(total, validated);
    assert!(
        exit_code >= 200,
        "Test failed: Expected to find vulnerabilities (exit code >= 200), got exit code {}",
        exit_code
    );

    // Drop the runtime explicitly here, outside of any async context.
    drop(rt);
    Ok(())
}

View file

@ -1,243 +0,0 @@
// tests/int_gitlab.rs
use std::{
str::FromStr,
sync::{Arc, Mutex},
};
use anyhow::{Context, Result};
use kingfisher::{
cli::{
commands::{
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
gitlab::GitLabRepoType,
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::RuleSpecifierArgs,
scan::{ConfidenceLevel, ScanArgs},
},
global::{AdvancedArgs, Mode},
GlobalArgs,
},
findings_store::FindingsStore,
git_url::GitUrl,
scanner::{load_and_record_rules, run_scan},
};
use tempfile::TempDir;
use tokio::runtime::Runtime;
use url::Url;
/// Derives the exit code from the finding counts: `0` when nothing was found, `205` when at
/// least one finding validated, and `200` when there are findings but none validated.
fn determine_exit_code(total: usize, validated: usize) -> i32 {
    // No findings at all wins over everything else.
    if total == 0 {
        return 0;
    }

    if validated > 0 {
        205
    } else {
        200
    }
}
#[test]
fn test_gitlab_remote_scan() -> Result<()> {
let temp_dir = TempDir::new().context("tmp dir")?;
let clone_dir = temp_dir.path().to_path_buf();
// Public GitLab repo seeded with test secrets
let test_repo_url = "https://gitlab.com/micksmix/SecretsTest.git";
let git_url = GitUrl::from_str(test_repo_url).expect("parse GitLab URL");
let scan_args = ScanArgs {
num_jobs: 2,
rules: RuleSpecifierArgs {
rules_path: Vec::new(),
rule: vec!["all".into()],
load_builtins: true,
},
input_specifier_args: InputSpecifierArgs {
path_inputs: Vec::new(),
git_url: vec![git_url],
github_user: Vec::new(),
github_organization: Vec::new(),
all_github_organizations: false,
github_api_url: Url::parse("https://api.github.com/")?,
github_repo_type: GitHubRepoType::Source,
gitlab_user: Vec::new(),
gitlab_group: Vec::new(),
all_gitlab_groups: false,
gitlab_api_url: Url::parse("https://gitlab.com/")?,
gitlab_repo_type: GitLabRepoType::Owner,
jira_url: None,
jql: None,
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
scan_nested_repos: true,
commit_metadata: true,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
no_extract_archives: false,
extraction_depth: 2,
no_binary: true,
exclude: Vec::new(), // Exclude patterns
},
confidence: ConfidenceLevel::Medium,
no_validate: false,
rule_stats: false,
only_valid: false,
min_entropy: None,
redact: false,
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true,
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
};
let global_args = GlobalArgs {
verbose: 0,
quiet: false,
color: Mode::Auto,
progress: Mode::Auto,
no_update_check: false,
self_update: false,
ignore_certs: false,
advanced: AdvancedArgs { rlimit_nofile: 16_384 },
};
let datastore = Arc::new(Mutex::new(FindingsStore::new(clone_dir)));
let rt = Runtime::new()?;
let rules_db = Arc::new(load_and_record_rules(&scan_args, &datastore)?);
rt.block_on(async {
run_scan(&global_args, &scan_args, &rules_db, Arc::clone(&datastore)).await
})?;
let ds = datastore.lock().unwrap();
let findings = ds.get_matches();
let total = findings.len();
let validated = findings.iter().filter(|m| m.as_ref().2.validation_success).count();
assert!(total >= 10, "expected at least 10 findings from GitLab repo, got {total}");
let exit_code = determine_exit_code(total, validated);
assert!(
exit_code >= 200,
"expected kingfisher to report findings (exit_code >= 200), got {exit_code}"
);
drop(rt);
Ok(())
}
#[test]
fn test_gitlab_remote_scan_no_history() -> Result<()> {
let temp_dir = TempDir::new().context("tmp dir")?;
let clone_dir = temp_dir.path().to_path_buf();
let test_repo_url = "https://gitlab.com/micksmix/SecretsTest.git";
let git_url = GitUrl::from_str(test_repo_url).expect("parse GitLab URL");
let scan_args = ScanArgs {
num_jobs: 2,
rules: RuleSpecifierArgs {
rules_path: Vec::new(),
rule: vec!["all".into()],
load_builtins: true,
},
input_specifier_args: InputSpecifierArgs {
path_inputs: Vec::new(),
git_url: vec![git_url],
github_user: Vec::new(),
github_organization: Vec::new(),
all_github_organizations: false,
github_api_url: Url::parse("https://api.github.com/")?,
github_repo_type: GitHubRepoType::Source,
gitlab_user: Vec::new(),
gitlab_group: Vec::new(),
all_gitlab_groups: false,
gitlab_api_url: Url::parse("https://gitlab.com/")?,
gitlab_repo_type: GitLabRepoType::Owner,
jira_url: None,
jql: None,
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::None,
scan_nested_repos: true,
commit_metadata: true,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
no_extract_archives: false,
extraction_depth: 2,
no_binary: true,
exclude: Vec::new(),
},
confidence: ConfidenceLevel::Medium,
no_validate: false,
rule_stats: false,
only_valid: false,
min_entropy: None,
redact: false,
git_repo_timeout: 1800,
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true,
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
};
let global_args = GlobalArgs {
verbose: 0,
quiet: false,
color: Mode::Auto,
progress: Mode::Auto,
no_update_check: false,
self_update: false,
ignore_certs: false,
advanced: AdvancedArgs { rlimit_nofile: 16_384 },
};
let datastore = Arc::new(Mutex::new(FindingsStore::new(clone_dir)));
let rt = Runtime::new()?;
let rules_db = Arc::new(load_and_record_rules(&scan_args, &datastore)?);
rt.block_on(async {
run_scan(&global_args, &scan_args, &rules_db, Arc::clone(&datastore)).await
})?;
let ds = datastore.lock().unwrap();
let findings = ds.get_matches();
let total = findings.len();
let validated = findings.iter().filter(|m| m.as_ref().2.validation_success).count();
assert!(total >= 10, "expected at least 10 findings from GitLab repo, got {total}");
let exit_code = determine_exit_code(total, validated);
assert!(
exit_code >= 200,
"expected kingfisher to report findings (exit_code >= 200), got {exit_code}"
);
drop(rt);
Ok(())
}

View file

@ -1,115 +0,0 @@
// Integration test to ensure --redact replaces secret values with hashes
use std::{
path::PathBuf,
sync::{Arc, Mutex},
};
use anyhow::Result;
use kingfisher::{
cli::{
commands::{
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
gitlab::GitLabRepoType,
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::RuleSpecifierArgs,
scan::{ConfidenceLevel, ScanArgs},
},
global::{AdvancedArgs, GlobalArgs, Mode},
},
findings_store::FindingsStore,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::run_async_scan,
};
use tempfile::TempDir;
use url::Url;
/// Scans `testdata/generic_secrets.py` with `redact: true` and checks that the scan records
/// matches and that every match has at least one capture whose value starts with `[REDACTED:`.
#[tokio::test]
async fn test_redact_hashes_finding_values() -> Result<()> {
    let workspace = TempDir::new()?;

    // All built-in rules, no extra rule files.
    let rules = RuleSpecifierArgs {
        rules_path: Vec::new(),
        rule: vec!["all".into()],
        load_builtins: true,
    };

    // The only input is the local fixture file; every remote source is left empty.
    let inputs = InputSpecifierArgs {
        path_inputs: vec![PathBuf::from("testdata/generic_secrets.py")],
        git_url: Vec::new(),
        github_user: Vec::new(),
        github_organization: Vec::new(),
        all_github_organizations: false,
        github_api_url: Url::parse("https://api.github.com/").unwrap(),
        github_repo_type: GitHubRepoType::Source,
        gitlab_user: Vec::new(),
        gitlab_group: Vec::new(),
        all_gitlab_groups: false,
        gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
        gitlab_repo_type: GitLabRepoType::Owner,
        jira_url: None,
        jql: None,
        max_results: 100,
        slack_query: None,
        slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
        s3_bucket: None,
        s3_prefix: None,
        role_arn: None,
        aws_local_profile: None,
        docker_image: Vec::new(),
        git_clone: GitCloneMode::Bare,
        git_history: GitHistoryMode::Full,
        scan_nested_repos: true,
        commit_metadata: true,
    };

    let filters = ContentFilteringArgs {
        max_file_size_mb: 25.0,
        extraction_depth: 2,
        no_binary: true,
        no_extract_archives: false,
        exclude: Vec::new(),
    };

    let scan_args = ScanArgs {
        num_jobs: 2,
        rules,
        input_specifier_args: inputs,
        content_filtering_args: filters,
        confidence: ConfidenceLevel::Low,
        no_validate: true,
        rule_stats: false,
        only_valid: false,
        min_entropy: Some(0.0),
        // The option under test.
        redact: true,
        git_repo_timeout: 1800,
        output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
        no_dedup: true,
        snippet_length: 256,
        baseline_file: None,
        manage_baseline: false,
    };

    let global_args = GlobalArgs {
        verbose: 0,
        quiet: true,
        color: Mode::Never,
        progress: Mode::Never,
        no_update_check: false,
        self_update: false,
        ignore_certs: false,
        advanced: AdvancedArgs { rlimit_nofile: 16384 },
    };

    // Load the rule set, resolve the enabled rules, and build the database from them.
    let loaded_rules = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?;
    let enabled_rules = loaded_rules.resolve_enabled_rules()?;
    let rules_db = RulesDatabase::from_rules(enabled_rules.into_iter().cloned().collect())?;

    let datastore = Arc::new(Mutex::new(FindingsStore::new(workspace.path().to_path_buf())));
    run_async_scan(&global_args, &scan_args, Arc::clone(&datastore), &rules_db).await?;

    let store = datastore.lock().unwrap();
    let recorded = store.get_matches();
    assert!(!recorded.is_empty());

    // Each recorded match must carry at least one redacted capture.
    for entry in recorded {
        let finding = &entry.2;
        assert!(finding.groups.captures.iter().any(|cap| cap.value.starts_with("[REDACTED:")));
    }

    Ok(())
}

View file

@ -1,207 +0,0 @@
use std::{
env,
sync::{Arc, Mutex},
};
use anyhow::Result;
use kingfisher::{
cli::{
commands::{
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
gitlab::GitLabRepoType,
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::RuleSpecifierArgs,
scan::{ConfidenceLevel, ScanArgs},
},
global::{AdvancedArgs, Mode},
GlobalArgs,
},
findings_store::FindingsStore,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::run_async_scan,
};
use tempfile::TempDir;
use url::Url;
use wiremock::{
matchers::{method, path},
Mock, MockServer, ResponseTemplate,
};
/// Fixture shared by the Slack scan test: holds the rule database built once in
/// `TestContext::new` so the scan itself does not have to rebuild it.
struct TestContext {
/// Rule database built from all built-in rules, shared behind an `Arc`.
rules_db: Arc<RulesDatabase>,
}
impl TestContext {
fn new() -> Result<Self> {
let scan_args = ScanArgs {
num_jobs: 2,
rules: RuleSpecifierArgs {
rules_path: Vec::new(),
rule: vec!["all".into()],
load_builtins: true,
},
input_specifier_args: InputSpecifierArgs {
path_inputs: Vec::new(),
git_url: Vec::new(),
github_user: Vec::new(),
github_organization: Vec::new(),
all_github_organizations: false,
github_api_url: Url::parse("https://api.github.com/").unwrap(),
github_repo_type: GitHubRepoType::Source,
gitlab_user: Vec::new(),
gitlab_group: Vec::new(),
all_gitlab_groups: false,
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
jira_url: None,
jql: None,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
max_results: 10,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
scan_nested_repos: true,
commit_metadata: true,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
extraction_depth: 2,
no_binary: true,
no_extract_archives: false,
exclude: Vec::new(),
},
confidence: ConfidenceLevel::Low,
no_validate: true,
rule_stats: false,
only_valid: false,
min_entropy: Some(0.0),
redact: false,
git_repo_timeout: 1800,
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true,
snippet_length: 128,
baseline_file: None,
manage_baseline: false,
};
let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?;
let resolved = loaded.resolve_enabled_rules()?;
let rules_db = RulesDatabase::from_rules(resolved.into_iter().cloned().collect())?;
Ok(Self { rules_db: Arc::new(rules_db) })
}
}
#[tokio::test]
async fn test_scan_slack_messages() -> Result<()> {
let ctx = TestContext::new()?;
let server = MockServer::start().await;
let response = serde_json::json!({
"ok": true,
"messages": {
"matches": [{
"permalink": "https://example.slack.com/archives/C123/p1234",
"text": "This contains a github token ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa",
"ts": "1234.56",
"channel": {"id": "C123", "name": "general"}
}],
"pagination": {"page": 1, "page_count": 1}
}
});
Mock::given(method("GET"))
.and(path("/search.messages"))
.respond_with(ResponseTemplate::new(200).set_body_json(response))
.mount(&server)
.await;
env::set_var("KF_SLACK_TOKEN", "xoxp-test");
let temp_dir = TempDir::new()?;
let clone_dir = temp_dir.path().to_path_buf();
let scan_args = ScanArgs {
num_jobs: 2,
rules: RuleSpecifierArgs {
rules_path: Vec::new(),
rule: vec!["all".into()],
load_builtins: true,
},
input_specifier_args: InputSpecifierArgs {
path_inputs: Vec::new(),
git_url: Vec::new(),
github_user: Vec::new(),
github_organization: Vec::new(),
all_github_organizations: false,
github_api_url: Url::parse("https://api.github.com/").unwrap(),
github_repo_type: GitHubRepoType::Source,
gitlab_user: Vec::new(),
gitlab_group: Vec::new(),
all_gitlab_groups: false,
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
jira_url: None,
jql: None,
slack_query: Some("test".into()),
slack_api_url: Url::parse(&format!("{}/", server.uri()))?,
max_results: 10,
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
docker_image: Vec::new(),
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
scan_nested_repos: true,
commit_metadata: true,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
extraction_depth: 2,
no_binary: true,
no_extract_archives: false,
exclude: Vec::new(),
},
confidence: ConfidenceLevel::Low,
no_validate: true,
rule_stats: false,
only_valid: false,
min_entropy: Some(0.0),
redact: false,
git_repo_timeout: 1800,
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true,
snippet_length: 128,
baseline_file: None,
manage_baseline: false,
};
let global_args = GlobalArgs {
verbose: 0,
quiet: true,
color: Mode::Auto,
no_update_check: false,
self_update: false,
progress: Mode::Never,
ignore_certs: false,
advanced: AdvancedArgs { rlimit_nofile: 16384 },
};
let datastore = Arc::new(Mutex::new(FindingsStore::new(clone_dir)));
run_async_scan(&global_args, &scan_args, Arc::clone(&datastore), &ctx.rules_db).await?;
let findings = {
let ds = datastore.lock().unwrap();
ds.get_matches().len()
};
assert!(findings > 0);
Ok(())
}

View file

@ -1,211 +0,0 @@
// tests/int_validation_cache.rs
use std::{
fs,
sync::{
atomic::{AtomicUsize, Ordering},
Arc, Mutex,
},
};
use anyhow::Result;
use kingfisher::{
cli::{
commands::{
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
gitlab::GitLabRepoType,
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::RuleSpecifierArgs,
scan::{ConfidenceLevel, ScanArgs},
},
global::{AdvancedArgs, Mode},
GlobalArgs,
},
findings_store::FindingsStore,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::run_async_scan,
};
use tempfile::TempDir;
use url::Url;
use wiremock::{
matchers::{method, path},
Mock, MockServer, Request, ResponseTemplate,
};
/// Scans a file that contains the same synthetic secret twice, using two custom rules where
/// the second depends on the first and carries an HTTP validation step pointed at a local mock
/// server. Asserts that the mock endpoint is hit exactly once and that four matches are
/// recorded in total.
#[tokio::test]
async fn test_validation_cache_and_depvars() -> Result<()> {
/* --------------------------------------------------------- *
 * 1. Spin up Wiremock and count incoming validation calls   *
 * --------------------------------------------------------- */
let server = MockServer::start().await;
// Shared counter: the responder closure increments it, the assertions read it.
let hit_counter = Arc::new(AtomicUsize::new(0));
let counter_clone = Arc::clone(&hit_counter);
Mock::given(method("GET"))
.and(path("/validate"))
.respond_with(move |_req: &Request| {
counter_clone.fetch_add(1, Ordering::SeqCst);
ResponseTemplate::new(200).set_body_string("ok")
})
.mount(&server)
.await;
/* --------------------------------------------------------- *
 * 2. Synthetic rules exercising depends_on_rule + HTTP val  *
 * --------------------------------------------------------- */
// Built with `format!`, so literal braces in the YAML are doubled (`{{` / `}}`) and `{base}`
// is replaced by the mock server's URI.
let rules_yaml = format!(
r#"
rules:
- name: Demo API Key
id: demo.key.1
pattern: '(demokey_[a-z0-9]{{8}})'
confidence: low
min_entropy: 0.0
- name: Demo API Key Validation
id: demo.key.validation.1
depends_on_rule:
- rule_id: demo.key.1
variable: TOKEN
pattern: '(demokey_[a-z0-9]{{8}})'
confidence: low
validation:
type: Http
content:
request:
method: GET
url: '{base}/validate?token={{ {{ TOKEN }} }}'
response_matcher:
- report_response: true
- type: WordMatch
words:
- '"error_code":"403003"'
negative: true
"#,
base = server.uri()
);
/* --------------------------------------------------------- *
 * 3. Temp workspace: rules file + input with 2 duplicates   *
 * --------------------------------------------------------- */
let work_dir = TempDir::new()?;
let rules_file = work_dir.path().join("demo.yml");
fs::write(&rules_file, rules_yaml)?;
let secret_file = work_dir.path().join("secrets.txt");
// The same secret on two lines, so each rule matches twice.
fs::write(&secret_file, "demokey_abcdefgh\ndemokey_abcdefgh")?;
/* --------------------------------------------------------- *
 * 4. Build Scan / Global args (no_dedup=true to keep dups)  *
 * --------------------------------------------------------- */
let scan_args = ScanArgs {
num_jobs: 2,
rules: RuleSpecifierArgs {
// Only the synthetic rules from the workspace are loaded; built-ins are disabled.
rules_path: vec![work_dir.path().to_path_buf()],
rule: vec!["all".into()],
load_builtins: false,
},
input_specifier_args: InputSpecifierArgs {
path_inputs: vec![secret_file.clone()],
git_url: Vec::new(),
github_user: Vec::new(),
github_organization: Vec::new(),
all_github_organizations: false,
github_api_url: Url::parse("https://api.github.com/").unwrap(),
github_repo_type: GitHubRepoType::Source,
// GitLab
gitlab_user: Vec::new(),
gitlab_group: Vec::new(),
all_gitlab_groups: false,
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
jira_url: None,
jql: None,
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
scan_nested_repos: true,
commit_metadata: true,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
extraction_depth: 2,
no_binary: true,
no_extract_archives: false,
exclude: Vec::new(), // Exclude patterns
},
confidence: ConfidenceLevel::Low,
// Validation must stay enabled: it is the behaviour under test.
no_validate: false,
rule_stats: false,
only_valid: false,
min_entropy: Some(0.0),
redact: false,
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true, // keep duplicates so the cache is stressed
snippet_length: 128,
baseline_file: None,
manage_baseline: false,
};
/* --------------------------------------------------------- *
 * 5. Load rules, record them in the datastore, run scan     *
 * --------------------------------------------------------- */
let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?;
let resolved = loaded.resolve_enabled_rules()?;
let rules_db = Arc::new(RulesDatabase::from_rules(resolved.into_iter().cloned().collect())?);
let datastore = Arc::new(Mutex::new(FindingsStore::new(work_dir.path().to_path_buf())));
// Make the datastore aware of every rule before scanning; the scoped block releases the
// lock again before the scan starts.
{
let mut ds = datastore.lock().unwrap();
ds.record_rules(rules_db.rules());
}
let global_args = GlobalArgs {
verbose: 0,
quiet: true,
color: Mode::Auto,
progress: Mode::Never,
no_update_check: false,
self_update: false,
ignore_certs: false,
advanced: AdvancedArgs { rlimit_nofile: 8192 },
};
run_async_scan(&global_args, &scan_args, Arc::clone(&datastore), &rules_db).await?;
/* --------------------------------------------------------- *
 * 6. Assertions                                             *
 * --------------------------------------------------------- */
// There are two matches for demo.key.validation.1, but the validator
// should have been called only once thanks to SkipMap caching.
assert_eq!(
hit_counter.load(Ordering::SeqCst),
1,
"validator endpoint should be hit exactly once"
);
let ds = datastore.lock().unwrap();
let total_matches = ds.get_matches().len();
assert_eq!(total_matches, 4, "expected 2 matches per rule (dup secrets)"); // 2 for each rule
Ok(())
}

View file

@ -1,257 +0,0 @@
// tests/integration_scan.rs
use std::{
path::{Path, PathBuf},
sync::{Arc, Mutex},
};
use anyhow::{Context, Result};
use kingfisher::{
cli::{
commands::{
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
gitlab::GitLabRepoType,
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::RuleSpecifierArgs,
scan::{ConfidenceLevel, ScanArgs},
},
global::{AdvancedArgs, Mode},
GlobalArgs,
},
findings_store::FindingsStore,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::run_async_scan,
};
use tempfile::TempDir;
use url::Url;
/// One row of the fixture table driven by `test_scan_vulnerable_files`.
#[derive(Debug)]
struct TestCase {
/// Fixture path relative to the crate root; the test joins it onto `root_dir()`.
file_name: &'static str,
/// Lower bound on the number of findings the scan must record for this file.
min_expected_findings: usize,
}
/// Fixture shared by all file scans in this test file: holds the rule database built once in
/// `TestContext::new` and reused by every `scan_file` call.
struct TestContext {
/// Rule database built from all built-in rules, shared behind an `Arc`.
rules_db: Arc<RulesDatabase>,
}
fn root_dir() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
}
impl TestContext {
fn new() -> Result<Self> {
let scan_args = ScanArgs {
num_jobs: 2,
rules: RuleSpecifierArgs {
rules_path: Vec::new(),
rule: vec!["all".into()],
load_builtins: true,
},
input_specifier_args: InputSpecifierArgs {
path_inputs: Vec::new(),
git_url: Vec::new(),
github_user: Vec::new(),
github_organization: Vec::new(),
all_github_organizations: false,
github_api_url: Url::parse("https://api.github.com/").unwrap(),
github_repo_type: GitHubRepoType::Source,
// new GitLab defaults
gitlab_user: Vec::new(),
gitlab_group: Vec::new(),
all_gitlab_groups: false,
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
jira_url: None,
jql: None,
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
scan_nested_repos: true,
commit_metadata: true,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
extraction_depth: 2,
no_binary: true,
no_extract_archives: false,
exclude: Vec::new(), // Exclude patterns
},
confidence: ConfidenceLevel::Low,
no_validate: true,
rule_stats: false,
only_valid: false,
min_entropy: Some(0.0),
redact: false,
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true,
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
};
let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules)
.load(&scan_args)
.context("Failed to load rules")?;
let resolved = loaded.resolve_enabled_rules().context("Failed to resolve rules")?;
let rules_db = RulesDatabase::from_rules(resolved.into_iter().cloned().collect())
.context("Failed to compile rules")?;
Ok(Self { rules_db: Arc::new(rules_db) })
}
async fn scan_file(&self, file_path: &Path) -> Result<usize> {
let temp_dir = TempDir::new().context("Failed to create temporary directory")?;
let clone_dir = temp_dir.path().to_path_buf();
let scan_args = ScanArgs {
num_jobs: 2,
rules: RuleSpecifierArgs {
rules_path: Vec::new(),
rule: vec!["all".into()],
load_builtins: true,
},
input_specifier_args: InputSpecifierArgs {
path_inputs: vec![file_path.to_path_buf()],
git_url: Vec::new(),
github_user: Vec::new(),
github_organization: Vec::new(),
all_github_organizations: false,
github_api_url: Url::parse("https://api.github.com/").unwrap(),
github_repo_type: GitHubRepoType::Source,
// new GitLab defaults
gitlab_user: Vec::new(),
gitlab_group: Vec::new(),
all_gitlab_groups: false,
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
jira_url: None,
jql: None,
max_results: 100,
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
scan_nested_repos: true,
commit_metadata: true,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
extraction_depth: 2,
no_binary: true,
no_extract_archives: false,
exclude: Vec::new(), // Exclude patterns
},
confidence: ConfidenceLevel::Low,
no_validate: true,
rule_stats: false,
only_valid: false,
min_entropy: Some(0.0),
redact: false,
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
no_dedup: true,
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
};
let global_args = GlobalArgs {
verbose: 0,
quiet: true,
color: Mode::Auto,
no_update_check: false,
self_update: false,
progress: Mode::Never,
ignore_certs: false,
advanced: AdvancedArgs { rlimit_nofile: 16384 },
};
let datastore = Arc::new(Mutex::new(FindingsStore::new(clone_dir)));
run_async_scan(&global_args, &scan_args, Arc::clone(&datastore), &self.rules_db).await?;
let findings = {
let ds = datastore.lock().unwrap();
ds.get_matches().len()
};
Ok(findings)
}
}
/// Scans every fixture under `testdata/` with the shared rule database and checks that each
/// one yields at least its expected minimum number of findings.
#[tokio::test]
async fn test_scan_vulnerable_files() -> Result<()> {
    let ctx = TestContext::new()?;

    // Small constructor so the table below reads as (file, minimum findings) pairs.
    let case = |file_name: &'static str, min_expected_findings: usize| TestCase {
        file_name,
        min_expected_findings,
    };
    let cases = [
        case("testdata/c_vulnerable.c", 3),
        case("testdata/cpp_vulnerable.cpp", 3),
        case("testdata/csharp_vulnerable.cs", 4),
        case("testdata/elixir_vulnerable.exs", 1),
        case("testdata/generic_secrets.py", 9),
        case("testdata/go_vulnerable.go", 4),
        case("testdata/java_vulnerable.java", 4),
        case("testdata/javascript_vulnerable.js", 4),
        case("testdata/json_vulnerable.json", 4),
        case("testdata/kotlin_vulnerable.kt", 7),
        case("testdata/objc_vulnerable.m", 4),
        case("testdata/php_vulnerable.php", 5),
        case("testdata/python_vulnerable.py", 10),
        case("testdata/python2_vulnerable.py", 4),
        case("testdata/ruby_vulnerable.rb", 6),
        case("testdata/rust_vulnerable.rs", 3),
        case("testdata/scala_vulnerable.scala", 3),
        case("testdata/shell_vulnerable.sh", 2),
        case("testdata/slack_tokens.properties", 17),
        case("testdata/swift_vulnerable.swift", 2),
        case("testdata/toml_vulnerable.toml", 4),
        case("testdata/tsx_vulnerable.tsx", 1),
        case("testdata/typescript_vulnerable.ts", 1),
        case("testdata/yaml_vulnerable.yaml", 4),
    ];

    // Fixture paths are relative to the crate root.
    let crate_root = root_dir();
    for TestCase { file_name, min_expected_findings } in cases {
        let fixture = crate_root.join(file_name);
        println!("Testing file: {}", file_name);

        let found = ctx.scan_file(&fixture).await?;
        assert!(
            found >= min_expected_findings,
            "File: {} - Expected >= {} findings, got {}",
            file_name,
            min_expected_findings,
            found
        );
    }

    Ok(())
}