kingfisher/tests/int_dedup.rs
Mick Grove 078fa16e6a - Reduced per-match memory usage by compacting stored source locations and interning repeated capture names.
- Stored optional validation response bodies as boxed strings to avoid allocating empty payloads and to streamline validator caches.
- Parallelized git cloning based on the configured job count and begin scanning repositories as soon as each clone finishes to reduce end-to-end scan times.
- Combined per-repository results into a single aggregate summary after scans complete.
- Added initial access-map support and report viewer html file. Currently beta features.
2025-12-04 22:02:30 -08:00

229 lines
8 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Proves that `run_async_scan` collapses identical findings when
//! `no_dedup == false`, while keeping them separate when
//! `no_dedup == true`.
use std::{
fs,
sync::{Arc, Mutex},
};
use anyhow::Result;
use kingfisher::{
cli::{
commands::{
azure::AzureRepoType,
bitbucket::{BitbucketAuthArgs, BitbucketRepoType},
gitea::GiteaRepoType,
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
gitlab::GitLabRepoType,
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::RuleSpecifierArgs,
scan::{ConfidenceLevel, ScanArgs},
},
global::Mode,
GlobalArgs,
},
findings_store::FindingsStore,
rule_loader::RuleLoader,
rules_database::RulesDatabase,
scanner::run_async_scan,
update::UpdateStatus,
};
use tempfile::TempDir;
use tokio::runtime::Runtime;
use url::Url;
/// Helper: run a scan with the supplied `no_dedup` flag and return how many
/// findings the `FindingsStore` ends up containing.
///
/// The scan runs against a throw-away workspace holding one custom rule and
/// two input files (`a.txt`, `b.txt`) that both contain the same secret.
///
/// * `count_rt` - Tokio runtime used to drive the async `run_async_scan` call.
/// * `no_dedup` - forwarded unchanged into `ScanArgs::no_dedup`.
///
/// # Errors
/// Propagates failures from creating the temporary workspace, writing the
/// fixture files, loading/resolving the rules, and from the scan itself.
///
/// # Panics
/// Panics if one of the hard-coded API URLs fails to parse or if the
/// findings-store mutex is poisoned.
fn run_scan(count_rt: &Runtime, no_dedup: bool) -> Result<usize> {
    // ── temp workspace ──────────────────────────────────────────────
    let work = TempDir::new()?;
    let rules_dir = work.path().join("rules");
    fs::create_dir_all(&rules_dir)?;
    let inputs_dir = work.path().join("in");
    fs::create_dir_all(&inputs_dir)?;

    // 1. Tiny custom rule that matches `secret_` followed by four digits.
    //    Indentation inside the raw string is significant YAML: `name`,
    //    `pattern` and `confidence` must be nested under the list item so
    //    they belong to the rule rather than becoming top-level keys.
    fs::write(
        rules_dir.join("demo.yml"),
        r#"
rules:
  - id: demo.secret
    name: Demo secret
    pattern: "secret_[0-9]{4}"
    confidence: low
"#,
    )?;

    // 2. Two different blobs that both contain the SAME secret
    fs::write(inputs_dir.join("a.txt"), "secret_1234\n")?;
    fs::write(inputs_dir.join("b.txt"), "secret_1234\n")?;

    // ── build ScanArgs ──────────────────────────────────────────────
    let scan_args = ScanArgs {
        num_jobs: 2,
        rules: RuleSpecifierArgs {
            rules_path: vec![rules_dir.clone()],
            rule: vec!["all".into()],
            // Only the demo rule written above should be loaded.
            load_builtins: false,
        },
        input_specifier_args: InputSpecifierArgs {
            path_inputs: vec![inputs_dir.join("a.txt"), inputs_dir.join("b.txt")],
            git_url: Vec::new(),
            github_user: Vec::new(),
            github_organization: Vec::new(),
            github_exclude: Vec::new(),
            all_github_organizations: false,
            github_api_url: Url::parse("https://api.github.com/").unwrap(),
            github_repo_type: GitHubRepoType::Source,
            // GitLab defaults
            gitlab_user: Vec::new(),
            gitlab_group: Vec::new(),
            gitlab_exclude: Vec::new(),
            all_gitlab_groups: false,
            gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
            gitlab_repo_type: GitLabRepoType::Owner,
            gitlab_include_subgroups: false,
            huggingface_user: Vec::new(),
            huggingface_organization: Vec::new(),
            huggingface_model: Vec::new(),
            huggingface_dataset: Vec::new(),
            huggingface_space: Vec::new(),
            huggingface_exclude: Vec::new(),
            gitea_user: Vec::new(),
            gitea_organization: Vec::new(),
            gitea_exclude: Vec::new(),
            all_gitea_organizations: false,
            gitea_api_url: Url::parse("https://gitea.com/api/v1/").unwrap(),
            gitea_repo_type: GiteaRepoType::Source,
            bitbucket_user: Vec::new(),
            bitbucket_workspace: Vec::new(),
            bitbucket_project: Vec::new(),
            bitbucket_exclude: Vec::new(),
            all_bitbucket_workspaces: false,
            bitbucket_api_url: Url::parse("https://api.bitbucket.org/2.0/").unwrap(),
            bitbucket_repo_type: BitbucketRepoType::Source,
            bitbucket_auth: BitbucketAuthArgs::default(),
            azure_organization: Vec::new(),
            azure_project: Vec::new(),
            azure_exclude: Vec::new(),
            all_azure_projects: false,
            azure_base_url: Url::parse("https://dev.azure.com/").unwrap(),
            azure_repo_type: AzureRepoType::Source,
            jira_url: None,
            jql: None,
            confluence_url: None,
            cql: None,
            max_results: 100,
            slack_query: None,
            slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
            // s3
            s3_bucket: None,
            s3_prefix: None,
            role_arn: None,
            aws_local_profile: None,
            gcs_bucket: None,
            gcs_prefix: None,
            gcs_service_account: None,
            // Docker image scanning
            docker_image: Vec::new(),
            // git clone / history options
            git_clone: GitCloneMode::Bare,
            git_history: GitHistoryMode::Full,
            commit_metadata: true,
            repo_artifacts: false,
            scan_nested_repos: true,
            since_commit: None,
            branch: None,
            branch_root: false,
            branch_root_commit: None,
        },
        content_filtering_args: ContentFilteringArgs {
            max_file_size_mb: 5.0,
            extraction_depth: 1,
            no_binary: true,
            no_extract_archives: false,
            exclude: Vec::new(), // Exclude patterns
        },
        confidence: ConfidenceLevel::Low,
        no_validate: true,
        access_map: false,
        access_map_html: None,
        rule_stats: false,
        only_valid: false,
        min_entropy: Some(0.0),
        redact: false,
        git_repo_timeout: 1800, // 30 minutes
        output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
        // The flag under test.
        no_dedup,
        baseline_file: None,
        manage_baseline: false,
        skip_regex: Vec::new(),
        skip_word: Vec::new(),
        skip_aws_account: Vec::new(),
        skip_aws_account_file: None,
        no_base64: false,
        extra_ignore_comments: Vec::new(),
        no_inline_ignore: false,
        no_ignore_if_contains: false,
    };
    let global_args = GlobalArgs {
        verbose: 0,
        quiet: true,
        color: Mode::Never,
        progress: Mode::Never,
        no_update_check: false,
        self_update: false,
        ignore_certs: false,
        user_agent_suffix: None,
    };

    // ── load rules once ─────────────────────────────────────────────
    let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?;
    let resolved = loaded.resolve_enabled_rules()?;
    let rules_db = Arc::new(RulesDatabase::from_rules(resolved.into_iter().cloned().collect())?);
    let update_status = UpdateStatus::default();

    // Fresh FindingsStore for this run
    let store_path = work.path().join("store");
    fs::create_dir_all(&store_path)?;
    let datastore = Arc::new(Mutex::new(FindingsStore::new(store_path)));

    // run_async_scan is async, so drive it to completion on the supplied
    // Tokio runtime.
    count_rt.block_on(run_async_scan(
        &global_args,
        &scan_args,
        Arc::clone(&datastore),
        &rules_db,
        &update_status,
    ))?;

    // Read the count in its own statement so the mutex guard temporary is
    // dropped at the end of that statement, before `datastore` goes out of
    // scope (a guard in the tail expression would outlive the local).
    let count = datastore.lock().unwrap().get_matches().len();
    Ok(count)
}
/// Runs the same two-file scan twice, once with `no_dedup = true` and once
/// with `no_dedup = false`, and checks that deduplication strictly lowers
/// the number of stored findings and leaves exactly one.
///
/// # Errors
/// Returns an error if the Tokio runtime cannot be created or if either
/// `run_scan` call fails.
#[test]
fn test_dedup_branch() -> Result<()> {
    // A *single* runtime reused for both scans keeps the test fast.
    // Runtime construction is fallible I/O, so propagate the error instead
    // of panicking.
    let rt = Runtime::new()?;

    let findings_with_dups = run_scan(&rt, true)?; // keep duplicates
    let findings_deduped = run_scan(&rt, false)?; // collapse duplicates

    assert!(
        findings_with_dups > findings_deduped,
        "expected deduplication to reduce finding count ({} -- {})",
        findings_with_dups,
        findings_deduped
    );
    assert_eq!(findings_deduped, 1, "exactly one unique finding should remain after dedup");
    Ok(())
}