diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c92a92..6483f58 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file. ## [1.32.0] - Added support for scanning AWS S3 buckets via `--s3-bucket` and optional `--s3-prefix` - Added `--role-arn` and `--aws-local-profile` flags for S3 authentication alongside `KF_AWS_KEY`/`KF_AWS_SECRET` +- Refactored output reporting and formatting logic + ## [1.31.0] - New rules: Telegram bot token, OpenWeatherMap, Apify, Groq - New OpenAI detectors added (@joshlarsen) diff --git a/data/rules/credentials.yml b/data/rules/credentials.yml index 38c1057..cb7e866 100644 --- a/data/rules/credentials.yml +++ b/data/rules/credentials.yml @@ -20,6 +20,6 @@ rules: [a-z0-9\/._~-]* )? min_entropy: 3.0 - confidence: low + confidence: medium examples: - https://eaRIWNkE:qyOIhJiM@j2LYY414Q5cCYD \ No newline at end of file diff --git a/data/rules/generic.yml b/data/rules/generic.yml index 9338797..3f71d4a 100644 --- a/data/rules/generic.yml +++ b/data/rules/generic.yml @@ -96,7 +96,6 @@ rules: ["'] min_entropy: 3.3 confidence: low - categories: [fuzzy, generic, secret] examples: - | password = "super$ecret" diff --git a/f1.patch b/f1.patch new file mode 100644 index 0000000..a132a31 --- /dev/null +++ b/f1.patch @@ -0,0 +1,719 @@ +diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs +index 9fcb1ecdfe8decc60278848c4a7be43cc9ebee70..b600f9f65838e52ce5dc3d7bb3bb1a5d5ff2bcaf 100644 +--- a/src/reporter/json_format.rs ++++ b/src/reporter/json_format.rs +@@ -1,436 +1,80 @@ +-use http::StatusCode; +-use serde_json::json; +- + use super::*; +-use crate::bstring_escape::Escaped; + + impl DetailsReporter { +- pub fn deduplicate_matches( +- &self, +- matches: Vec, +- no_dedup: bool, +- ) -> Vec { +- if no_dedup { +- return matches; +- } +- +- use std::collections::HashMap; +- let mut by_fp: HashMap = HashMap::new(); +- +- for rm in matches { +- let fp = rm.m.finding_fingerprint; +- if let Some(existing) = by_fp.get_mut(&fp) { +- // merge origin sets (keep first origin, append the rest) +- for o in rm.origin.iter() { +- if !existing.origin.iter().any(|e| e == o) { +- existing.origin = OriginSet::new( +- existing.origin.first().clone(), +- existing +- .origin +- .iter() +- .skip(1) +- .cloned() +- .chain(std::iter::once(o.clone())) +- .collect(), +- ); +- } +- } +- continue; +- } +- by_fp.insert(fp, rm); +- } +- by_fp.into_values().collect() +- } +- +- pub fn gather_json_findings( +- &self, +- args: &cli::commands::scan::ScanArgs, +- ) -> Result> { +- let mut matches = self.get_filtered_matches()?; +- if !args.no_dedup { +- matches = self.deduplicate_matches(matches, args.no_dedup); +- } +- +- let mut json_findings = Vec::new(); +- for rm in matches { +- let source_span = &rm.m.location.source_span; +- let line_num = source_span.start.line; +- +- let snippet = Escaped( +- rm.m.groups +- .captures +- .get(1) +- .or_else(|| rm.m.groups.captures.get(0)) +- .map(|capture| capture.value.as_bytes()) +- .unwrap_or_default(), +- ) +- .to_string(); +- +- let validation_status = if rm.validation_success { +- "Active Credential" +- } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { +- "Not Attempted" +- } else { +- "Inactive Credential" +- }; +- +- const MAX_RESPONSE_LENGTH: usize = 512; +- let truncated_body: String = +- rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); +- let ellipsis = +- if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; +- let response_body = format!("{}{}", truncated_body, ellipsis); +- +- // Call extract_git_metadata on each GitRepo origin and take the first non-null result. +- let git_metadata_val = rm +- .origin +- .iter() +- .filter_map(|origin| { +- if let Origin::GitRepo(e) = origin { +- self.extract_git_metadata(e, source_span) +- } else { +- None +- } +- }) +- .next() +- .unwrap_or(serde_json::Value::Null); +- +- // Collect a file path from an Origin::File, if available. +- let file_path = rm +- .origin +- .iter() +- .find_map(|origin| match origin { +- Origin::File(e) => { +- if let Some(url) = self.jira_issue_url(&e.path, args) { +- Some(url) +- } else if let Some(url) = self.slack_message_url(&e.path) { +- Some(url) +- } else if let Some(mapped) = self.s3_display_path(&e.path) { +- Some(mapped) +- } else if let Some(mapped) = self.docker_display_path(&e.path) { +- Some(mapped) +- } else { +- Some(e.path.display().to_string()) +- } +- } +- Origin::Extended(e) => e.path().map(|p| p.display().to_string()), +- _ => None, +- }) +- .unwrap_or_default(); +- +- let match_json = json!({ +- "rule": { +- "name": rm.m.rule_name, +- "id": rm.m.rule_text_id, +- }, +- "finding": { +- "snippet": snippet, +- "fingerprint": rm.m.finding_fingerprint.to_string(), +- "confidence": rm.match_confidence.to_string(), +- "entropy": format!("{:.2}", rm.m.calculated_entropy), +- "validation": { +- "status": validation_status, +- "response": response_body, +- }, +- "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()), +- "line": line_num, +- "column_start": source_span.start.column, +- "column_end": source_span.end.column, +- "path": file_path, +- "git_metadata": git_metadata_val +- } +- }); +- +- let finding_json = json!({ +- "id": rm.m.rule_text_id, +- "matches": [ match_json ] +- }); +- json_findings.push(finding_json); +- } +- Ok(json_findings) +- } + pub fn json_format( + &self, + mut writer: W, + args: &cli::commands::scan::ScanArgs, + ) -> Result<()> { +- let mut findings = Vec::new(); +- +- // Get filtered matches +- let mut matches = self.get_filtered_matches()?; +- +- // Apply deduplication only if requested +- if !args.no_dedup { +- matches = self.deduplicate_matches(matches, args.no_dedup); +- } +- +- // For each match, handle it based on the no_dedup flag +- for rm in matches { +- if args.no_dedup && rm.origin.len() > 1 { +- // For no_dedup and multiple origins, create separate findings for each origin +- for origin in rm.origin.iter() { +- // Create a single-origin version of this match +- let single_origin_rm = ReportMatch { +- origin: OriginSet::new(origin.clone(), Vec::new()), +- blob_metadata: rm.blob_metadata.clone(), +- m: rm.m.clone(), +- comment: rm.comment.clone(), +- visible: rm.visible, +- match_confidence: rm.match_confidence, +- validation_response_body: rm.validation_response_body.clone(), +- validation_response_status: rm.validation_response_status, +- validation_success: rm.validation_success, +- }; +- +- // Process this single-origin match into a JSON finding +- let json_finding = self.process_match_to_json(&single_origin_rm, args)?; +- findings.push(json_finding); +- } +- } else { +- // Process normally for deduped matches or matches with only one origin +- let json_finding = self.process_match_to_json(&rm, args)?; +- findings.push(json_finding); +- } +- } +- +- // Write the JSON output +- if !findings.is_empty() { +- serde_json::to_writer_pretty(&mut writer, &findings)?; ++ let records = self.build_finding_records(args)?; ++ if !records.is_empty() { ++ serde_json::to_writer_pretty(&mut writer, &records)?; + writeln!(writer)?; + } + Ok(()) + } + +- // Add a helper method to convert a ReportMatch to a JSON finding +- pub fn process_match_to_json( +- &self, +- rm: &ReportMatch, +- args: &cli::commands::scan::ScanArgs, +- ) -> Result { +- // Extract the relevant data from the match as you already do in your current implementation +- let source_span = &rm.m.location.source_span; +- let line_num = source_span.start.line; +- +- let snippet = Escaped( +- rm.m.groups +- .captures +- .get(1) +- .or_else(|| rm.m.groups.captures.get(0)) +- .map(|capture| capture.value.as_bytes()) +- .unwrap_or_default(), +- ) +- .to_string(); +- +- let validation_status = if rm.validation_success { +- "Active Credential" +- } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { +- "Not Attempted" +- } else { +- "Inactive Credential" +- }; +- +- const MAX_RESPONSE_LENGTH: usize = 512; +- let truncated_body: String = +- rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); +- let ellipsis = +- if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; +- let response_body = format!("{}{}", truncated_body, ellipsis); +- +- // Call extract_git_metadata on each GitRepo origin and take the first non-null result. +- let git_metadata_val = rm +- .origin +- .iter() +- .filter_map(|origin| { +- if let Origin::GitRepo(e) = origin { +- self.extract_git_metadata(e, source_span) +- } else { +- None +- } +- }) +- .next() +- .unwrap_or(serde_json::Value::Null); +- +- // Collect a file path from an Origin::File, if available. +- let file_path = rm +- .origin +- .iter() +- .find_map(|origin| { +- if let Origin::File(e) = origin { +- if let Some(url) = self.jira_issue_url(&e.path, args) { +- Some(url) +- } else if let Some(url) = self.slack_message_url(&e.path) { +- Some(url) +- } else if let Some(mapped) = self.s3_display_path(&e.path) { +- Some(mapped) +- } else if let Some(mapped) = self.docker_display_path(&e.path) { +- Some(mapped) +- } else { +- Some(e.path.display().to_string()) +- } +- } else if let Origin::Extended(e) = origin { +- e.path().map(|p| p.display().to_string()) +- } else { +- None +- } +- }) +- .unwrap_or_default(); +- +- let match_json = json!({ +- "rule": { +- "name": rm.m.rule_name, +- "id": rm.m.rule_text_id, +- }, +- "finding": { +- "snippet": snippet, +- "fingerprint": rm.m.finding_fingerprint.to_string(), +- "confidence": rm.match_confidence.to_string(), +- "entropy": format!("{:.2}", rm.m.calculated_entropy), +- "validation": { +- "status": validation_status, +- "response": response_body, +- }, +- "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()), +- "line": line_num, +- "column_start": source_span.start.column, +- "column_end": source_span.end.column, +- "path": file_path, +- "git_metadata": git_metadata_val +- } +- }); +- +- let finding_json = json!({ +- "id": rm.m.rule_text_id, +- "matches": [ match_json ] +- }); +- +- Ok(finding_json) +- } +- // // Modified JSON format to pass args to gather_json_findings +- // pub fn json_format( +- // &self, +- // mut writer: W, +- // args: &cli::commands::scan::ScanArgs, +- // ) -> Result<()> { +- // let findings = self.gather_json_findings(args)?; +- // if !findings.is_empty() { +- // serde_json::to_writer_pretty(&mut writer, &findings)?; +- // writeln!(writer)?; +- // } +- // Ok(()) +- // } +- + pub fn jsonl_format( + &self, + mut writer: W, + args: &cli::commands::scan::ScanArgs, + ) -> Result<()> { +- // Get filtered matches +- let mut matches = self.get_filtered_matches()?; +- +- // Apply deduplication only if requested +- if !args.no_dedup { +- matches = self.deduplicate_matches(matches, args.no_dedup); +- } +- +- // For each match, handle it based on the no_dedup flag +- for rm in matches { +- if args.no_dedup && rm.origin.len() > 1 { +- // For no_dedup and multiple origins, create separate findings for each origin +- for origin in rm.origin.iter() { +- // Create a single-origin version of this match +- let single_origin_rm = ReportMatch { +- origin: OriginSet::new(origin.clone(), Vec::new()), +- blob_metadata: rm.blob_metadata.clone(), +- m: rm.m.clone(), +- comment: rm.comment.clone(), +- visible: rm.visible, +- match_confidence: rm.match_confidence, +- validation_response_body: rm.validation_response_body.clone(), +- validation_response_status: rm.validation_response_status, +- validation_success: rm.validation_success, +- }; +- +- // Process this single-origin match into a JSON finding and write it +- let json_finding = self.process_match_to_json(&single_origin_rm, args)?; +- serde_json::to_writer(&mut writer, &json_finding)?; +- writeln!(writer)?; +- } +- } else { +- // Process normally for deduped matches or matches with only one origin +- let json_finding = self.process_match_to_json(&rm, args)?; +- serde_json::to_writer(&mut writer, &json_finding)?; +- writeln!(writer)?; +- } ++ let records = self.build_finding_records(args)?; ++ for record in records { ++ serde_json::to_writer(&mut writer, &record)?; ++ writeln!(writer)?; + } + Ok(()) + } +- // // Modified JSONL format to pass args to gather_json_findings +- // pub fn jsonl_format( +- // &self, +- // mut writer: W, +- // args: &cli::commands::scan::ScanArgs, +- // ) -> Result<()> { +- // let findings = self.gather_json_findings(args)?; +- // for finding in findings { +- // serde_json::to_writer(&mut writer, &finding)?; +- // writeln!(writer)?; +- // } +- // Ok(()) +- // } + } + + #[cfg(test)] + mod tests { +- use std::{ +- io::Cursor, +- path::PathBuf, +- sync::{Arc, Mutex}, +- }; +- +- use anyhow::Result; +- use serde_json::Value; +- use url::Url; +- + use super::*; + use crate::{ + blob::BlobId, +- cli::commands::{ +- github::{GitCloneMode, GitHistoryMode, GitHubRepoType}, +- inputs::{ContentFilteringArgs, InputSpecifierArgs}, +- output::OutputArgs, +- rules::RuleSpecifierArgs, +- scan::ConfidenceLevel, ++ cli::commands::github::GitHubRepoType, ++ cli::commands::output::{OutputArgs, ReportOutputFormat}, ++ cli::commands::scan::{ ++ ConfidenceLevel, ContentFilteringArgs, GitCloneMode, GitHistoryMode, ++ InputSpecifierArgs, RuleSpecifierArgs, + }, + findings_store::FindingsStore, + location::{Location, OffsetSpan, SourcePoint, SourceSpan}, +- matcher::{Match, SerializableCapture, SerializableCaptures}, +- origin::{Origin, OriginSet}, +- reporter::{ReportMatch, Styles}, +- rules::rule::Confidence, +- util::intern, ++ matcher::serializable::{SerializableCapture, SerializableCaptures}, ++ matcher::Match, ++ origin::Origin, ++ reporter::styles::Styles, ++ scanner::test_utils::intern, + }; ++ use std::{ ++ io::Cursor, ++ path::PathBuf, ++ sync::{Arc, Mutex}, ++ }; ++ use url::Url; + + fn create_default_args() -> cli::commands::scan::ScanArgs { + use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope + + cli::commands::scan::ScanArgs { + num_jobs: 1, + no_dedup: false, + rules: RuleSpecifierArgs { + rules_path: Vec::new(), + rule: vec!["all".into()], + load_builtins: true, + }, + input_specifier_args: InputSpecifierArgs { + // local path / git URL inputs + path_inputs: Vec::new(), + git_url: Vec::new(), + + // GitHub + github_user: Vec::new(), + github_organization: Vec::new(), + all_github_organizations: false, + github_api_url: Url::parse("https://api.github.com/").unwrap(), + github_repo_type: GitHubRepoType::Source, + + // GitLab +diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs +index 9fcb1ecdfe8decc60278848c4a7be43cc9ebee70..b600f9f65838e52ce5dc3d7bb3bb1a5d5ff2bcaf 100644 +--- a/src/reporter/json_format.rs ++++ b/src/reporter/json_format.rs +@@ -458,240 +102,168 @@ mod tests { + git_history: GitHistoryMode::Full, + scan_nested_repos: true, + commit_metadata: true, + }, + content_filtering_args: ContentFilteringArgs { + max_file_size_mb: 25.0, + no_extract_archives: false, + extraction_depth: 2, + exclude: Vec::new(), // Exclude patterns + no_binary: true, + }, + confidence: ConfidenceLevel::Medium, + no_validate: false, + rule_stats: false, + only_valid: false, + min_entropy: None, + redact: false, + git_repo_timeout: 1800, // 30 minutes + output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, + snippet_length: 256, + baseline_file: None, + manage_baseline: false, + } + } + +- // Helper function to create a mock Match + fn create_mock_match( + rule_name: &str, + rule_text_id: &str, + rule_finding_fingerprint: &str, + validation_success: bool, + ) -> Match { + Match { + location: Location { + offset_span: OffsetSpan { start: 10, end: 20 }, + source_span: SourceSpan { + start: SourcePoint { line: 5, column: 10 }, + end: SourcePoint { line: 5, column: 20 }, + }, + }, + groups: SerializableCaptures { + captures: vec![SerializableCapture { + name: Some("token".to_string()), + match_number: 1, + start: 10, + end: 20, + value: "mock_token".into(), + }], + }, + blob_id: BlobId::new(b"mock_blob"), + finding_fingerprint: 0123, + rule_finding_fingerprint: intern(rule_finding_fingerprint), + rule_text_id: intern(rule_text_id), +- rule_name: intern(rule_name), //.to_string(), ++ rule_name: intern(rule_name), + rule_confidence: Confidence::Medium, + validation_response_body: "validation response".to_string(), + validation_response_status: 200, + validation_success, + calculated_entropy: 4.5, + visible: true, + } + } + +- // Helper function to create a mock DetailsReporter + fn setup_mock_reporter(matches: Vec) -> DetailsReporter { + let mut datastore = FindingsStore::new(PathBuf::from("/tmp")); +- // Create mock origin and blob metadata for the first test match + if !matches.is_empty() { + let blob_metadata = BlobMetadata { + id: BlobId::new(b"mock_blob"), + num_bytes: 1024, + mime_essence: Some("text/plain".to_string()), + charset: Some("UTF-8".to_string()), + language: Some("Rust".to_string()), + }; + let dedup = true; +- // Add matches to datastore + for m in matches.clone() { + datastore.record( + vec![( + Arc::new(OriginSet::new( +- // OriginSet -- Arc<…> + Origin::from_file(PathBuf::from("/mock/path/file.rs")), + vec![], + )), +- Arc::new(blob_metadata.clone()), // BlobMetadata -- Arc<…> ++ Arc::new(blob_metadata.clone()), + m.m.clone(), + )], + dedup, + ); + } + } + DetailsReporter { + datastore: Arc::new(Mutex::new(datastore)), + styles: Styles::new(false), + only_valid: false, + } + } ++ + #[test] + fn test_json_format() -> Result<()> { +- // Create a mock match with successful validation + let mock_match = + create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); + let matches = vec![ReportMatch { + origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]), + blob_metadata: BlobMetadata { + id: BlobId::new(b"mock_blob"), + num_bytes: 1024, + mime_essence: Some("text/plain".to_string()), + charset: Some("UTF-8".to_string()), + language: Some("Rust".to_string()), + }, + m: mock_match, + comment: None, + match_confidence: Confidence::Medium, + visible: true, + validation_response_body: "validation response".to_string(), + validation_response_status: 200, + validation_success: true, + }]; + let reporter = setup_mock_reporter(matches); + let mut output = Cursor::new(Vec::new()); +- // Call the json_format method + reporter.json_format(&mut output, &create_default_args())?; +- // Parse and validate JSON output +- let json_output: Vec = serde_json::from_slice(&output.into_inner())?; ++ let json_output: Vec = serde_json::from_slice(&output.into_inner())?; + assert!(!json_output.is_empty(), "JSON output should not be empty"); +- let first_finding = &json_output[0]; +- assert!(first_finding.get("id").is_some(), "Finding should have an 'id'"); +- assert!(first_finding.get("matches").is_some(), "Finding should have 'matches'"); +- // Validate the structure of the first match +- let matches = first_finding.get("matches").unwrap().as_array().unwrap(); +- let first_match = &matches[0]; +- assert_eq!(first_match.get("rule").unwrap().get("name").unwrap(), "MockRule"); +- assert_eq!(first_match.get("finding").unwrap().get("language").unwrap(), "Rust"); ++ let first = &json_output[0]; ++ assert_eq!(first["rule"]["name"], "MockRule"); ++ assert_eq!(first["finding"]["language"], "Rust"); + Ok(()) + } + +- // #[test] +- // fn test_jsonl_format() -> Result<()> { +- // // Create a mock match with successful validation +- // let mock_match = +- // create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); +- // let matches = vec![ReportMatch { +- // origin: OriginSet::new( +- // Origin::from_file(PathBuf::from("/mock/path/file.rs")), +- // vec![], +- // ), +- // blob_metadata: BlobMetadata { +- // id: BlobId::new(b"mock_blob"), +- // num_bytes: 1024, +- // mime_essence: Some("text/plain".to_string()), +- // charset: Some("UTF-8".to_string()), +- // language: Some("Rust".to_string()), +- // }, +- // m: mock_match, +- // comment: None, +- // match_confidence: Confidence::Medium, +- // visible: true, +- // validation_response_body: "validation response".to_string(), +- // validation_response_status: 200, +- // validation_success: true, +- // }]; +- // let reporter = setup_mock_reporter(matches); +- // let mut output = Cursor::new(Vec::new()); +- // // Call the jsonl_format method +- // reporter.jsonl_format(&mut output, &create_default_args())?; +- // // Split output into lines and validate +- // let jsonl_output = String::from_utf8(output.into_inner())?; +- // let lines: Vec<&str> = jsonl_output.lines().collect(); +- // assert!(!lines.is_empty(), "JSONL output should not be empty"); +- // for line in &lines { +- // let json_value: serde_json::Value = serde_json::from_str(line)?; +- // assert!( +- // json_value.get("rule_name").is_some(), +- // "Each line should have a 'rule_name'" +- // ); +- // assert!( +- // json_value.get("matches").is_some(), +- // "Each line should have 'matches'" +- // ); +- // } +- // Ok(()) +- // } +- + #[test] + fn test_validation_status_in_json() -> Result<()> { +- // Test validation status in JSON output + let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")]; + for (validation_success, expected_status) in test_cases { + let mock_match = create_mock_match( + "MockRule", + "mock_rule_1", + "mock_finding_fingerprint", + validation_success, + ); + let matches = vec![ReportMatch { + origin: OriginSet::new( + Origin::from_file(PathBuf::from("/mock/path/file.rs")), + vec![], + ), + blob_metadata: BlobMetadata { + id: BlobId::new(b"mock_blob"), + num_bytes: 1024, + mime_essence: Some("text/plain".to_string()), + charset: Some("UTF-8".to_string()), + language: Some("Rust".to_string()), + }, + m: mock_match, + comment: None, + match_confidence: Confidence::Medium, + visible: true, + validation_response_body: "validation response".to_string(), + validation_response_status: 200, + validation_success, + }]; + let reporter = setup_mock_reporter(matches); + let mut output = Cursor::new(Vec::new()); +- // Call the json_format method + reporter.json_format(&mut output, &create_default_args())?; +- // Parse and validate JSON output +- let json_output: Vec = serde_json::from_slice(&output.into_inner())?; ++ let json_output: Vec = serde_json::from_slice(&output.into_inner())?; + assert!(!json_output.is_empty(), "JSON output should not be empty"); +- let first_finding = &json_output[0]; +- let matches = first_finding.get("matches").unwrap().as_array().unwrap(); +- let first_match = &matches[0]; +- let validation_status = first_match +- .get("finding") +- .unwrap() +- .get("validation") +- .unwrap() +- .get("status") +- .unwrap() +- .as_str() +- .unwrap(); ++ let first = &json_output[0]; ++ let validation_status = first["finding"]["validation"]["status"].as_str().unwrap(); + assert_eq!(validation_status, expected_status); + } + Ok(()) + } + } diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs index ea38722..13bc78b 100644 --- a/src/cli/commands/inputs.rs +++ b/src/cli/commands/inputs.rs @@ -124,7 +124,6 @@ pub struct InputSpecifierArgs { #[arg(long, requires = "s3_bucket")] pub aws_local_profile: Option, - /// Docker/OCI images to scan (no local Docker required) #[arg(long = "docker-image")] pub docker_image: Vec, diff --git a/src/lib.rs b/src/lib.rs index 90d0451..04f7303 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,8 +29,8 @@ pub mod rule_loader; pub mod rule_profiling; pub mod rules; pub mod rules_database; -pub mod safe_list; pub mod s3; +pub mod safe_list; pub mod scanner; pub mod scanner_pool; pub mod serde_utils; diff --git a/src/reporter.rs b/src/reporter.rs index ad0efe9..5159651 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -5,12 +5,12 @@ use std::{ use anyhow::Result; use http::StatusCode; -use indenter::indented; use schemars::JsonSchema; use serde::Serialize; use crate::{ blob::BlobMetadata, + bstring_escape::Escaped, cli, cli::global::GlobalArgs, finding_data, findings_store, @@ -226,38 +226,6 @@ impl DetailsReporter { .collect()) } - // fn process_matches(&self, only_valid: bool) -> Result> { - // let datastore = self.datastore.lock().unwrap(); - // Ok(datastore - // .get_matches() - // .iter() - // .filter(|msg| { - // let (_origin, _blob_metadata, match_item) = &***msg; - // if only_valid { - // match_item.validation_success - // && match_item.validation_response_status != StatusCode::CONTINUE.as_u16() - // && match_item.visible - // } else { - // match_item.visible - // } - // }) - // .map(|msg| { - // let (origin, blob_metadata, match_item) = &**msg; - // ReportMatch { - // origin: origin.clone(), - // blob_metadata: blob_metadata.clone(), - // m: match_item.clone(), - // comment: None, - // visible: match_item.visible, - // match_confidence: match_item.rule_confidence, - // validation_response_body: match_item.validation_response_body.clone(), - // validation_response_status: match_item.validation_response_status, - // validation_success: match_item.validation_success, - // } - // }) - // .collect()) - // } - pub fn get_filtered_matches(&self) -> Result> { self.process_matches(self.only_valid, true) } @@ -266,6 +234,166 @@ impl DetailsReporter { self.process_matches(only_valid.unwrap_or(self.only_valid), false) } + pub fn deduplicate_matches( + &self, + matches: Vec, + no_dedup: bool, + ) -> Vec { + if no_dedup { + return matches; + } + + use std::collections::HashMap; + let mut by_fp: HashMap = HashMap::new(); + + for rm in matches { + let fp = rm.m.finding_fingerprint; + if let Some(existing) = by_fp.get_mut(&fp) { + // merge origin sets (keep first origin, append the rest) + for o in rm.origin.iter() { + if !existing.origin.iter().any(|e| e == o) { + existing.origin = OriginSet::new( + existing.origin.first().clone(), + existing + .origin + .iter() + .skip(1) + .cloned() + .chain(std::iter::once(o.clone())) + .collect(), + ); + } + } + continue; + } + by_fp.insert(fp, rm); + } + by_fp.into_values().collect() + } + + fn matches_for_output(&self, args: &cli::commands::scan::ScanArgs) -> Result> { + let mut matches = self.get_filtered_matches()?; + if !args.no_dedup { + matches = self.deduplicate_matches(matches, args.no_dedup); + } + if args.no_dedup { + let mut expanded = Vec::new(); + for rm in matches { + if rm.origin.len() > 1 { + for origin in rm.origin.iter() { + let mut single = rm.clone(); + single.origin = OriginSet::new(origin.clone(), Vec::new()); + expanded.push(single); + } + } else { + expanded.push(rm); + } + } + matches = expanded; + } + Ok(matches) + } + + pub fn build_finding_record( + &self, + rm: &ReportMatch, + args: &cli::commands::scan::ScanArgs, + ) -> FindingReporterRecord { + let source_span = &rm.m.location.source_span; + let line_num = source_span.start.line; + + let snippet = Escaped( + rm.m.groups + .captures + .get(1) + .or_else(|| rm.m.groups.captures.get(0)) + .map(|capture| capture.value.as_bytes()) + .unwrap_or_default(), + ) + .to_string(); + + let validation_status = if rm.validation_success { + "Active Credential".to_string() + } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { + "Not Attempted".to_string() + } else { + "Inactive Credential".to_string() + }; + + const MAX_RESPONSE_LENGTH: usize = 512; + let truncated_body: String = + rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); + let ellipsis = + if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; + let response_body = format!("{}{}", truncated_body, ellipsis); + + let git_metadata_val = rm + .origin + .iter() + .filter_map(|origin| { + if let Origin::GitRepo(e) = origin { + self.extract_git_metadata(e, source_span) + } else { + None + } + }) + .next(); + + let file_path = rm + .origin + .iter() + .find_map(|origin| match origin { + Origin::File(e) => { + if let Some(url) = self.jira_issue_url(&e.path, args) { + Some(url) + } else if let Some(url) = self.slack_message_url(&e.path) { + Some(url) + } else if let Some(mapped) = self.s3_display_path(&e.path) { + Some(mapped) + } else if let Some(mapped) = self.docker_display_path(&e.path) { + Some(mapped) + } else { + Some(e.path.display().to_string()) + } + } + Origin::Extended(e) => e.path().map(|p| p.display().to_string()), + _ => None, + }) + .unwrap_or_default(); + + FindingReporterRecord { + rule: RuleMetadata { + name: rm.m.rule_name.to_string().clone(), + id: rm.m.rule_text_id.to_string().clone(), + }, + finding: FindingRecordData { + snippet, + fingerprint: rm.m.finding_fingerprint.to_string(), + confidence: rm.match_confidence.to_string(), + entropy: format!("{:.2}", rm.m.calculated_entropy), + validation: ValidationInfo { status: validation_status, response: response_body }, + language: rm + .blob_metadata + .language + .clone() + .unwrap_or_else(|| "Unknown".to_string()), + line: line_num as u32, + column_start: source_span.start.column as u32, + column_end: source_span.end.column as u32, + path: file_path, + git_metadata: git_metadata_val, + }, + } + } + + pub fn build_finding_records( + &self, + args: &cli::commands::scan::ScanArgs, + ) -> Result> { + let matches = self.matches_for_output(args)?; + Ok(matches.iter().map(|rm| self.build_finding_record(rm, args)).collect()) + } + fn get_finding_data(&self) -> Result> { let datastore = self.datastore.lock().unwrap(); Ok(datastore @@ -388,6 +516,41 @@ pub struct ReportMatch { /// Validation Success pub validation_success: bool, } + +#[derive(Serialize, JsonSchema, Clone, Debug)] +pub struct FindingReporterRecord { + pub rule: RuleMetadata, + pub finding: FindingRecordData, +} + +#[derive(Serialize, JsonSchema, Clone, Debug)] +pub struct RuleMetadata { + pub name: String, + pub id: String, +} + +#[derive(Serialize, JsonSchema, Clone, Debug)] +pub struct ValidationInfo { + pub status: String, + pub response: String, +} + +#[derive(Serialize, JsonSchema, Clone, Debug)] +pub struct FindingRecordData { + pub snippet: String, + pub fingerprint: String, + pub confidence: String, + pub entropy: String, + pub validation: ValidationInfo, + pub language: String, + pub line: u32, + pub column_start: u32, + pub column_end: u32, + pub path: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub git_metadata: Option, +} + impl From for ReportMatch { fn from(e: finding_data::FindingDataEntry) -> Self { ReportMatch { diff --git a/src/reporter/bson_format.rs b/src/reporter/bson_format.rs index c1470d0..6691c3b 100644 --- a/src/reporter/bson_format.rs +++ b/src/reporter/bson_format.rs @@ -1,90 +1,17 @@ -use bson::Document; -use serde_json::Value; - use super::*; + impl DetailsReporter { /// Formats findings as BSON and writes them to the provided writer. - /// For testing purposes, prints the full JSON for each finding before converting. pub fn bson_format( &self, mut writer: W, args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - // Get filtered matches - let mut matches = self.get_filtered_matches()?; - - // Apply deduplication only if requested - if !args.no_dedup { - matches = self.deduplicate_matches(matches, args.no_dedup); - } - - let mut bson_findings = Vec::new(); - - // For each match, handle it based on the no_dedup flag - for rm in matches { - if args.no_dedup && rm.origin.len() > 1 { - // For no_dedup and multiple origins, create separate findings for each origin - for origin in rm.origin.iter() { - // Create a single-origin version of this match - let single_origin_rm = ReportMatch { - origin: OriginSet::new(origin.clone(), Vec::new()), - blob_metadata: rm.blob_metadata.clone(), - m: rm.m.clone(), - comment: rm.comment.clone(), - visible: rm.visible, - match_confidence: rm.match_confidence, - validation_response_body: rm.validation_response_body.clone(), - validation_response_status: rm.validation_response_status, - validation_success: rm.validation_success, - }; - - // Process to JSON first, then convert to BSON - let json_finding = self.process_match_to_json(&single_origin_rm, args)?; - if let Ok(bson_doc) = json_to_bson_document(&json_finding) { - bson_findings.push(bson_doc); - } - } - } else { - // Process normally for deduped matches or matches with only one origin - let json_finding = self.process_match_to_json(&rm, args)?; - if let Ok(bson_doc) = json_to_bson_document(&json_finding) { - bson_findings.push(bson_doc); - } - } - } - - // Write each BSON document - for doc in bson_findings { + let records = self.build_finding_records(args)?; + for record in records { + let doc = bson::to_document(&record)?; doc.to_writer(&mut writer)?; } Ok(()) } - // pub fn bson_format( - // &self, - // mut writer: W, - // args: &cli::commands::scan::ScanArgs, - // ) -> Result<()> { - // let findings = self.gather_json_findings(args)?; - - // // Print the full JSON for each finding - // for finding in &findings { - // println!("Full JSON:\n{}", serde_json::to_string_pretty(finding)?); - // } - - // let bson_findings: Vec = findings - // .into_iter() - // .filter_map(|finding| json_to_bson_document(&finding).ok()) - // .collect(); - // for doc in bson_findings { - // doc.to_writer(&mut writer)?; - // } - // Ok(()) - // } -} - -fn json_to_bson_document(json: &Value) -> Result { - match bson::to_bson(json)? { - bson::Bson::Document(doc) => Ok(doc), - _ => Err(anyhow::anyhow!("Failed to convert JSON to BSON document")), - } } diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 9fcb1ec..b64a777 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -1,415 +1,61 @@ -use http::StatusCode; -use serde_json::json; - use super::*; -use crate::bstring_escape::Escaped; +use serde_json::Value; impl DetailsReporter { - pub fn deduplicate_matches( - &self, - matches: Vec, - no_dedup: bool, - ) -> Vec { - if no_dedup { - return matches; - } - - use std::collections::HashMap; - let mut by_fp: HashMap = HashMap::new(); - - for rm in matches { - let fp = rm.m.finding_fingerprint; - if let Some(existing) = by_fp.get_mut(&fp) { - // merge origin sets (keep first origin, append the rest) - for o in rm.origin.iter() { - if !existing.origin.iter().any(|e| e == o) { - existing.origin = OriginSet::new( - existing.origin.first().clone(), - existing - .origin - .iter() - .skip(1) - .cloned() - .chain(std::iter::once(o.clone())) - .collect(), - ); - } - } - continue; - } - by_fp.insert(fp, rm); - } - by_fp.into_values().collect() - } - - pub fn gather_json_findings( - &self, - args: &cli::commands::scan::ScanArgs, - ) -> Result> { - let mut matches = self.get_filtered_matches()?; - if !args.no_dedup { - matches = self.deduplicate_matches(matches, args.no_dedup); - } - - let mut json_findings = Vec::new(); - for rm in matches { - let source_span = &rm.m.location.source_span; - let line_num = source_span.start.line; - - let snippet = Escaped( - rm.m.groups - .captures - .get(1) - .or_else(|| rm.m.groups.captures.get(0)) - .map(|capture| capture.value.as_bytes()) - .unwrap_or_default(), - ) - .to_string(); - - let validation_status = if rm.validation_success { - "Active Credential" - } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { - "Not Attempted" - } else { - "Inactive Credential" - }; - - const MAX_RESPONSE_LENGTH: usize = 512; - let truncated_body: String = - rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); - let ellipsis = - if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; - let response_body = format!("{}{}", truncated_body, ellipsis); - - // Call extract_git_metadata on each GitRepo origin and take the first non-null result. - let git_metadata_val = rm - .origin - .iter() - .filter_map(|origin| { - if let Origin::GitRepo(e) = origin { - self.extract_git_metadata(e, source_span) - } else { - None - } - }) - .next() - .unwrap_or(serde_json::Value::Null); - - // Collect a file path from an Origin::File, if available. - let file_path = rm - .origin - .iter() - .find_map(|origin| match origin { - Origin::File(e) => { - if let Some(url) = self.jira_issue_url(&e.path, args) { - Some(url) - } else if let Some(url) = self.slack_message_url(&e.path) { - Some(url) - } else if let Some(mapped) = self.s3_display_path(&e.path) { - Some(mapped) - } else if let Some(mapped) = self.docker_display_path(&e.path) { - Some(mapped) - } else { - Some(e.path.display().to_string()) - } - } - Origin::Extended(e) => e.path().map(|p| p.display().to_string()), - _ => None, - }) - .unwrap_or_default(); - - let match_json = json!({ - "rule": { - "name": rm.m.rule_name, - "id": rm.m.rule_text_id, - }, - "finding": { - "snippet": snippet, - "fingerprint": rm.m.finding_fingerprint.to_string(), - "confidence": rm.match_confidence.to_string(), - "entropy": format!("{:.2}", rm.m.calculated_entropy), - "validation": { - "status": validation_status, - "response": response_body, - }, - "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()), - "line": line_num, - "column_start": source_span.start.column, - "column_end": source_span.end.column, - "path": file_path, - "git_metadata": git_metadata_val - } - }); - - let finding_json = json!({ - "id": rm.m.rule_text_id, - "matches": [ match_json ] - }); - json_findings.push(finding_json); - } - Ok(json_findings) - } pub fn json_format( &self, mut writer: W, args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - let mut findings = Vec::new(); - - // Get filtered matches - let mut matches = self.get_filtered_matches()?; - - // Apply deduplication only if requested - if !args.no_dedup { - matches = self.deduplicate_matches(matches, args.no_dedup); - } - - // For each match, handle it based on the no_dedup flag - for rm in matches { - if args.no_dedup && rm.origin.len() > 1 { - // For no_dedup and multiple origins, create separate findings for each origin - for origin in rm.origin.iter() { - // Create a single-origin version of this match - let single_origin_rm = ReportMatch { - origin: OriginSet::new(origin.clone(), Vec::new()), - blob_metadata: rm.blob_metadata.clone(), - m: rm.m.clone(), - comment: rm.comment.clone(), - visible: rm.visible, - match_confidence: rm.match_confidence, - validation_response_body: rm.validation_response_body.clone(), - validation_response_status: rm.validation_response_status, - validation_success: rm.validation_success, - }; - - // Process this single-origin match into a JSON finding - let json_finding = self.process_match_to_json(&single_origin_rm, args)?; - findings.push(json_finding); - } - } else { - // Process normally for deduped matches or matches with only one origin - let json_finding = self.process_match_to_json(&rm, args)?; - findings.push(json_finding); - } - } - - // Write the JSON output - if !findings.is_empty() { - serde_json::to_writer_pretty(&mut writer, &findings)?; + let records = self.build_finding_records(args)?; + if !records.is_empty() { + serde_json::to_writer_pretty(&mut writer, &records)?; writeln!(writer)?; } Ok(()) } - // Add a helper method to convert a ReportMatch to a JSON finding - pub fn process_match_to_json( - &self, - rm: &ReportMatch, - args: &cli::commands::scan::ScanArgs, - ) -> Result { - // Extract the relevant data from the match as you already do in your current implementation - let source_span = &rm.m.location.source_span; - let line_num = source_span.start.line; - - let snippet = Escaped( - rm.m.groups - .captures - .get(1) - .or_else(|| rm.m.groups.captures.get(0)) - .map(|capture| capture.value.as_bytes()) - .unwrap_or_default(), - ) - .to_string(); - - let validation_status = if rm.validation_success { - "Active Credential" - } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() { - "Not Attempted" - } else { - "Inactive Credential" - }; - - const MAX_RESPONSE_LENGTH: usize = 512; - let truncated_body: String = - rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); - let ellipsis = - if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; - let response_body = format!("{}{}", truncated_body, ellipsis); - - // Call extract_git_metadata on each GitRepo origin and take the first non-null result. - let git_metadata_val = rm - .origin - .iter() - .filter_map(|origin| { - if let Origin::GitRepo(e) = origin { - self.extract_git_metadata(e, source_span) - } else { - None - } - }) - .next() - .unwrap_or(serde_json::Value::Null); - - // Collect a file path from an Origin::File, if available. - let file_path = rm - .origin - .iter() - .find_map(|origin| { - if let Origin::File(e) = origin { - if let Some(url) = self.jira_issue_url(&e.path, args) { - Some(url) - } else if let Some(url) = self.slack_message_url(&e.path) { - Some(url) - } else if let Some(mapped) = self.s3_display_path(&e.path) { - Some(mapped) - } else if let Some(mapped) = self.docker_display_path(&e.path) { - Some(mapped) - } else { - Some(e.path.display().to_string()) - } - } else if let Origin::Extended(e) = origin { - e.path().map(|p| p.display().to_string()) - } else { - None - } - }) - .unwrap_or_default(); - - let match_json = json!({ - "rule": { - "name": rm.m.rule_name, - "id": rm.m.rule_text_id, - }, - "finding": { - "snippet": snippet, - "fingerprint": rm.m.finding_fingerprint.to_string(), - "confidence": rm.match_confidence.to_string(), - "entropy": format!("{:.2}", rm.m.calculated_entropy), - "validation": { - "status": validation_status, - "response": response_body, - }, - "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()), - "line": line_num, - "column_start": source_span.start.column, - "column_end": source_span.end.column, - "path": file_path, - "git_metadata": git_metadata_val - } - }); - - let finding_json = json!({ - "id": rm.m.rule_text_id, - "matches": [ match_json ] - }); - - Ok(finding_json) - } - // // Modified JSON format to pass args to gather_json_findings - // pub fn json_format( - // &self, - // mut writer: W, - // args: &cli::commands::scan::ScanArgs, - // ) -> Result<()> { - // let findings = self.gather_json_findings(args)?; - // if !findings.is_empty() { - // serde_json::to_writer_pretty(&mut writer, &findings)?; - // writeln!(writer)?; - // } - // Ok(()) - // } - pub fn jsonl_format( &self, mut writer: W, args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - // Get filtered matches - let mut matches = self.get_filtered_matches()?; - - // Apply deduplication only if requested - if !args.no_dedup { - matches = self.deduplicate_matches(matches, args.no_dedup); - } - - // For each match, handle it based on the no_dedup flag - for rm in matches { - if args.no_dedup && rm.origin.len() > 1 { - // For no_dedup and multiple origins, create separate findings for each origin - for origin in rm.origin.iter() { - // Create a single-origin version of this match - let single_origin_rm = ReportMatch { - origin: OriginSet::new(origin.clone(), Vec::new()), - blob_metadata: rm.blob_metadata.clone(), - m: rm.m.clone(), - comment: rm.comment.clone(), - visible: rm.visible, - match_confidence: rm.match_confidence, - validation_response_body: rm.validation_response_body.clone(), - validation_response_status: rm.validation_response_status, - validation_success: rm.validation_success, - }; - - // Process this single-origin match into a JSON finding and write it - let json_finding = self.process_match_to_json(&single_origin_rm, args)?; - serde_json::to_writer(&mut writer, &json_finding)?; - writeln!(writer)?; - } - } else { - // Process normally for deduped matches or matches with only one origin - let json_finding = self.process_match_to_json(&rm, args)?; - serde_json::to_writer(&mut writer, &json_finding)?; - writeln!(writer)?; - } + let records = self.build_finding_records(args)?; + for record in records { + serde_json::to_writer(&mut writer, &record)?; + writeln!(writer)?; } Ok(()) } - // // Modified JSONL format to pass args to gather_json_findings - // pub fn jsonl_format( - // &self, - // mut writer: W, - // args: &cli::commands::scan::ScanArgs, - // ) -> Result<()> { - // let findings = self.gather_json_findings(args)?; - // for finding in findings { - // serde_json::to_writer(&mut writer, &finding)?; - // writeln!(writer)?; - // } - // Ok(()) - // } } #[cfg(test)] mod tests { + use super::*; + use crate::cli::commands::github::GitCloneMode; + use crate::cli::commands::github::GitHistoryMode; + use crate::cli::commands::rules::RuleSpecifierArgs; + use crate::matcher::{SerializableCapture, SerializableCaptures}; + use crate::util::intern; + use crate::{ + blob::BlobId, + cli::commands::github::GitHubRepoType, + cli::commands::inputs::ContentFilteringArgs, + cli::commands::inputs::InputSpecifierArgs, + cli::commands::output::{OutputArgs, ReportOutputFormat}, + cli::commands::scan::ConfidenceLevel, + findings_store::FindingsStore, + location::{Location, OffsetSpan, SourcePoint, SourceSpan}, + matcher::Match, + origin::Origin, + reporter::styles::Styles, + }; use std::{ io::Cursor, path::PathBuf, sync::{Arc, Mutex}, }; - - use anyhow::Result; - use serde_json::Value; use url::Url; - - use super::*; - use crate::{ - blob::BlobId, - cli::commands::{ - github::{GitCloneMode, GitHistoryMode, GitHubRepoType}, - inputs::{ContentFilteringArgs, InputSpecifierArgs}, - output::OutputArgs, - rules::RuleSpecifierArgs, - scan::ConfidenceLevel, - }, - findings_store::FindingsStore, - location::{Location, OffsetSpan, SourcePoint, SourceSpan}, - matcher::{Match, SerializableCapture, SerializableCaptures}, - origin::{Origin, OriginSet}, - reporter::{ReportMatch, Styles}, - rules::rule::Confidence, - util::intern, - }; - fn create_default_args() -> cli::commands::scan::ScanArgs { use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope @@ -480,7 +126,6 @@ mod tests { } } - // Helper function to create a mock Match fn create_mock_match( rule_name: &str, rule_text_id: &str, @@ -508,7 +153,7 @@ mod tests { finding_fingerprint: 0123, rule_finding_fingerprint: intern(rule_finding_fingerprint), rule_text_id: intern(rule_text_id), - rule_name: intern(rule_name), //.to_string(), + rule_name: intern(rule_name), rule_confidence: Confidence::Medium, validation_response_body: "validation response".to_string(), validation_response_status: 200, @@ -518,10 +163,8 @@ mod tests { } } - // Helper function to create a mock DetailsReporter fn setup_mock_reporter(matches: Vec) -> DetailsReporter { let mut datastore = FindingsStore::new(PathBuf::from("/tmp")); - // Create mock origin and blob metadata for the first test match if !matches.is_empty() { let blob_metadata = BlobMetadata { id: BlobId::new(b"mock_blob"), @@ -531,16 +174,14 @@ mod tests { language: Some("Rust".to_string()), }; let dedup = true; - // Add matches to datastore for m in matches.clone() { datastore.record( vec![( Arc::new(OriginSet::new( - // OriginSet -- Arc<…> Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![], )), - Arc::new(blob_metadata.clone()), // BlobMetadata -- Arc<…> + Arc::new(blob_metadata.clone()), m.m.clone(), )], dedup, @@ -553,9 +194,9 @@ mod tests { only_valid: false, } } + #[test] fn test_json_format() -> Result<()> { - // Create a mock match with successful validation let mock_match = create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); let matches = vec![ReportMatch { @@ -577,72 +218,17 @@ mod tests { }]; let reporter = setup_mock_reporter(matches); let mut output = Cursor::new(Vec::new()); - // Call the json_format method reporter.json_format(&mut output, &create_default_args())?; - // Parse and validate JSON output - let json_output: Vec = serde_json::from_slice(&output.into_inner())?; + let json_output: Vec = serde_json::from_slice(&output.into_inner())?; assert!(!json_output.is_empty(), "JSON output should not be empty"); - let first_finding = &json_output[0]; - assert!(first_finding.get("id").is_some(), "Finding should have an 'id'"); - assert!(first_finding.get("matches").is_some(), "Finding should have 'matches'"); - // Validate the structure of the first match - let matches = first_finding.get("matches").unwrap().as_array().unwrap(); - let first_match = &matches[0]; - assert_eq!(first_match.get("rule").unwrap().get("name").unwrap(), "MockRule"); - assert_eq!(first_match.get("finding").unwrap().get("language").unwrap(), "Rust"); + let first = &json_output[0]; + assert_eq!(first["rule"]["name"], "MockRule"); + assert_eq!(first["finding"]["language"], "Rust"); Ok(()) } - // #[test] - // fn test_jsonl_format() -> Result<()> { - // // Create a mock match with successful validation - // let mock_match = - // create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true); - // let matches = vec![ReportMatch { - // origin: OriginSet::new( - // Origin::from_file(PathBuf::from("/mock/path/file.rs")), - // vec![], - // ), - // blob_metadata: BlobMetadata { - // id: BlobId::new(b"mock_blob"), - // num_bytes: 1024, - // mime_essence: Some("text/plain".to_string()), - // charset: Some("UTF-8".to_string()), - // language: Some("Rust".to_string()), - // }, - // m: mock_match, - // comment: None, - // match_confidence: Confidence::Medium, - // visible: true, - // validation_response_body: "validation response".to_string(), - // validation_response_status: 200, - // validation_success: true, - // }]; - // let reporter = setup_mock_reporter(matches); - // let mut output = Cursor::new(Vec::new()); - // // Call the jsonl_format method - // reporter.jsonl_format(&mut output, &create_default_args())?; - // // Split output into lines and validate - // let jsonl_output = String::from_utf8(output.into_inner())?; - // let lines: Vec<&str> = jsonl_output.lines().collect(); - // assert!(!lines.is_empty(), "JSONL output should not be empty"); - // for line in &lines { - // let json_value: serde_json::Value = serde_json::from_str(line)?; - // assert!( - // json_value.get("rule_name").is_some(), - // "Each line should have a 'rule_name'" - // ); - // assert!( - // json_value.get("matches").is_some(), - // "Each line should have 'matches'" - // ); - // } - // Ok(()) - // } - #[test] fn test_validation_status_in_json() -> Result<()> { - // Test validation status in JSON output let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")]; for (validation_success, expected_status) in test_cases { let mock_match = create_mock_match( @@ -673,23 +259,11 @@ mod tests { }]; let reporter = setup_mock_reporter(matches); let mut output = Cursor::new(Vec::new()); - // Call the json_format method reporter.json_format(&mut output, &create_default_args())?; - // Parse and validate JSON output - let json_output: Vec = serde_json::from_slice(&output.into_inner())?; + let json_output: Vec = serde_json::from_slice(&output.into_inner())?; assert!(!json_output.is_empty(), "JSON output should not be empty"); - let first_finding = &json_output[0]; - let matches = first_finding.get("matches").unwrap().as_array().unwrap(); - let first_match = &matches[0]; - let validation_status = first_match - .get("finding") - .unwrap() - .get("validation") - .unwrap() - .get("status") - .unwrap() - .as_str() - .unwrap(); + let first = &json_output[0]; + let validation_status = first["finding"]["validation"]["status"].as_str().unwrap(); assert_eq!(validation_status, expected_status); } Ok(()) diff --git a/src/reporter/pretty_format.rs b/src/reporter/pretty_format.rs index 942e7ad..6790a44 100644 --- a/src/reporter/pretty_format.rs +++ b/src/reporter/pretty_format.rs @@ -1,413 +1,126 @@ use std::fmt::{Display, Formatter, Result as FmtResult}; -use http::StatusCode; +use indenter::indented; use super::*; -use crate::{ - bstring_escape::Escaped, - origin::{get_repo_url, GitRepoOrigin}, -}; + impl DetailsReporter { - // Modified pretty format to use deduplicate_matches helper pub fn pretty_format( &self, mut writer: W, args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - let mut matches = self.get_filtered_matches()?; - let num_findings = matches.len(); - - if !args.no_dedup { - matches = self.deduplicate_matches(matches, args.no_dedup); - } - - for (index, rm) in matches.into_iter().enumerate() { - // When no_dedup is true, we'll handle each origin separately - if args.no_dedup && rm.origin.len() > 1 { - // For each origin, create a separate "finding" - for origin in rm.origin.iter() { - // Create a new ReportMatch with just this single origin - let single_origin_rm = ReportMatch { - origin: OriginSet::new(origin.clone(), Vec::new()), - blob_metadata: rm.blob_metadata.clone(), - m: rm.m.clone(), - comment: rm.comment.clone(), - visible: rm.visible, - match_confidence: rm.match_confidence, - validation_response_body: rm.validation_response_body.clone(), - validation_response_status: rm.validation_response_status, - validation_success: rm.validation_success, - }; - - self.write_finding( - &mut writer, - &single_origin_rm, - index + 1, - num_findings, - args, - )?; - } - } else { - // Normal processing for deduped matches or matches with only one origin - self.write_finding(&mut writer, &rm, index + 1, num_findings, args)?; - } + let records = self.build_finding_records(args)?; + let num_findings = records.len(); + for (index, record) in records.iter().enumerate() { + self.write_finding_record(&mut writer, record, index + 1, num_findings)?; } Ok(()) } - fn write_finding( + fn write_finding_record( &self, writer: &mut W, - rm: &ReportMatch, + record: &FindingReporterRecord, _finding_num: usize, _num_findings: usize, - args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - let lock_icon = if rm.validation_success { "🔓 " } else { "" }; + let is_active = record.finding.validation.status == "Active Credential"; + let lock_icon = if is_active { "🔓 " } else { "" }; let formatted_heading = format!( "{}{} => [{}]", lock_icon, - rm.m.rule_name.to_uppercase(), - rm.m.rule_text_id.to_uppercase() + record.rule.name.to_uppercase(), + record.rule.id.to_uppercase() ); - if rm.validation_success { + if is_active { writeln!(writer, "{}", self.style_finding_active_heading(formatted_heading))?; } else { writeln!(writer, "{}", self.style_finding_heading(formatted_heading))?; } - writeln!(writer, "{}", PrettyFinding(self, rm, args))?; + writeln!(writer, "{}", PrettyFindingRecord(self, record))?; writeln!(writer)?; Ok(()) } - fn write_git_metadata( + fn write_git_metadata_value( &self, f: &mut Formatter<'_>, - e: &GitRepoOrigin, - _args: &cli::commands::scan::ScanArgs, - line_num: usize, + git: &serde_json::Value, ) -> FmtResult { - // Check if this is a remote git scan - // let mut is_remote_git_scan = !args.input_specifier_args.git_url.is_empty(); - // let mut git_url_string = String::new(); - let repo_url = get_repo_url(&e.repo_path) - .unwrap_or_else(|_| e.repo_path.to_string_lossy().to_string().into()); - let mut git_url_string = repo_url.clone(); - if git_url_string.ends_with(".git") { - git_url_string = git_url_string.strip_suffix(".git").unwrap().to_string().into(); + let repo_url = git["repository_url"].as_str().unwrap_or(""); + writeln!(f, " |Git Repo......: {}", self.style_metadata(repo_url))?; + if let Some(commit) = git.get("commit") { + if let Some(url) = commit.get("url").and_then(|v| v.as_str()) { + writeln!(f, " |__Commit......: {}", self.style_metadata(url))?; + } + if let Some(committer) = commit.get("committer") { + let name = committer.get("name").and_then(|v| v.as_str()).unwrap_or(""); + let email = committer.get("email").and_then(|v| v.as_str()).unwrap_or(""); + writeln!(indented(f).with_str(" |__"), "Committer...: {} <{}>", name, email)?; + } + if let Some(date) = commit.get("date").and_then(|v| v.as_str()) { + writeln!(indented(f).with_str(" |__"), "Date........: {}", date)?; + } } - writeln!(f, " |Git Repo......: {}", self.style_metadata(&git_url_string),)?; - if let Some(cs) = &e.first_commit { - let cmd = &cs.commit_metadata; - - let atime = - cmd.committer_timestamp.format(gix::date::time::format::SHORT.clone()).to_string(); - - let commit_id = &cmd.commit_id; - let commit_url = format!("{}/commit/{}", &git_url_string, commit_id); - // Write Commit Information - writeln!(f, " |__Commit......: {}", self.style_metadata(&commit_url))?; - writeln!( - indented(f).with_str(" |__"), - "Committer...: {} <{}>", - cmd.committer_name, - cmd.committer_email - )?; - writeln!(indented(f).with_str(" |__"), "Date........: {}", atime)?; - // writeln!(indented(f).with_str(" |__"), "Summary.....: {}", msg)?; - writeln!(indented(f).with_str(" |__"), "Path........: {}", cs.blob_path)?; - // Construct Git Command - let git_link = - format!("{}/blob/{}/{}#L{}", &git_url_string, commit_id, cs.blob_path, line_num); - let git_command = - format!("git -C {} show {}:{}", e.repo_path.display(), commit_id, cs.blob_path); - writeln!( - indented(f).with_str(" |__"), - "Git Link....: {}", - self.style_metadata(&git_link) - )?; - writeln!( - indented(f).with_str(" |__"), - "Git Command.: {}", - self.style_metadata(&git_command) - )?; + if let Some(file) = git.get("file") { + if let Some(path) = file.get("path").and_then(|v| v.as_str()) { + writeln!(indented(f).with_str(" |__"), "Path........: {}", path)?; + } + if let Some(url) = file.get("url").and_then(|v| v.as_str()) { + writeln!( + indented(f).with_str(" |__"), + "Git Link....: {}", + self.style_metadata(url) + )?; + } + if let Some(cmd) = file.get("git_command").and_then(|v| v.as_str()) { + writeln!( + indented(f).with_str(" |__"), + "Git Command.: {}", + self.style_metadata(cmd) + )?; + } } Ok(()) } } -// pub struct PrettyFinding<'a>(&'a DetailsReporter, &'a Finding); -pub struct PrettyFinding<'a>( - &'a DetailsReporter, - &'a ReportMatch, - &'a cli::commands::scan::ScanArgs, -); -impl<'a> Display for PrettyFinding<'a> { + +pub struct PrettyFindingRecord<'a>(&'a DetailsReporter, &'a FindingReporterRecord); + +impl<'a> Display for PrettyFindingRecord<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - let PrettyFinding(reporter, rm, args) = self; - // Use Box String> to store the closure - let style_fn: Box String> = if rm.validation_success { - Box::new(|s: &str| reporter.style_active_creds(s).to_string()) // Convert StyledObject - // to String + let reporter = self.0; + let record = self.1; + let is_active = record.finding.validation.status == "Active Credential"; + let style_fn: Box String> = if is_active { + Box::new(|s| reporter.style_active_creds(s).to_string()) } else { - Box::new(|s: &str| reporter.style_match(s).to_string()) // Convert StyledObject to - // String + Box::new(|s| reporter.style_match(s).to_string()) }; - let matching_finding = - rm.m.groups - .captures - .get(1) - .or_else(|| rm.m.groups.captures.get(0)) - .map(|capture| capture.value.as_bytes()) - .unwrap_or(&[]); - writeln!(f, " |Finding.......: {}", style_fn(&Escaped(matching_finding).to_string()))?; - writeln!(f, " |Fingerprint...: {}", rm.m.finding_fingerprint)?; - writeln!(f, " |Confidence....: {}", rm.match_confidence.to_string())?; - writeln!(f, " |Entropy.......: {:.2}", rm.m.calculated_entropy)?; - let validation_status = if rm.validation_response_status == StatusCode::CONTINUE.as_u16() - || rm.validation_response_status == StatusCode::PRECONDITION_REQUIRED.as_u16() - { - "Not Attempted".to_string() - } else if rm.validation_success { - "Active Credential".to_string() - } else { - "Inactive Credential".to_string() - }; - writeln!( - f, - " |Validation....: {}", - if rm.validation_success { - reporter.style_finding_active_heading(&validation_status).to_string() - // Convert StyledObject to String - } else { - (&validation_status).to_string() - } - )?; - const MAX_RESPONSE_LENGTH: usize = 512; - if rm.validation_response_status != StatusCode::CONTINUE.as_u16() { - let truncated_body: String = - rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect(); - let ellipsis = - if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" }; + let finding = &record.finding; + writeln!(f, " |Finding.......: {}", style_fn(&finding.snippet))?; + writeln!(f, " |Fingerprint...: {}", finding.fingerprint)?; + writeln!(f, " |Confidence....: {}", finding.confidence)?; + writeln!(f, " |Entropy.......: {}", finding.entropy)?; + if is_active { writeln!( f, - " |__Response....: {}{}", - if rm.validation_success { - reporter.style_active_creds(&truncated_body).to_string() // Convert StyledObject - // to String - } else { - reporter.style_metadata(&truncated_body).to_string() // Convert StyledObject to - // String - }, - ellipsis + " |Validation....: {}", + reporter.style_finding_active_heading(&finding.validation.status).to_string() )?; + } else { + writeln!(f, " |Validation....: {}", finding.validation.status)?; } - writeln!( - f, - " |Language......: {}", - rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()) - )?; - - let source_span = &rm.m.location.source_span; - writeln!(f, " |Line Num......: {}", source_span.start.line)?; - - //print all the other areas where this was seen - for p in rm.origin.iter() { - match p { - Origin::File(e) => { - let display_path = if let Some(url) = reporter.jira_issue_url(&e.path, args) { - url - } else if let Some(url) = reporter.slack_message_url(&e.path) { - url - } else if let Some(mapped) = reporter.s3_display_path(&e.path) { - mapped - } else if let Some(mapped) = reporter.docker_display_path(&e.path) { - mapped - } else { - e.path.display().to_string() - }; - writeln!( - f, - " |Path..........: {}", - if rm.validation_success { - reporter.style_active_creds(&display_path).to_string() - } else { - display_path - } - )?; - } - Origin::Extended(e) => { - if let Some(p) = e.path() { - let display_path = p.display().to_string(); - writeln!( - f, - " |Path..........: {}", - if rm.validation_success { - reporter.style_active_creds(&display_path).to_string() - } else { - display_path - } - )?; - } - } - Origin::GitRepo(e) => { - reporter.write_git_metadata(f, e, args, source_span.start.line)?; - } - } + if finding.validation.status != "Not Attempted" { + writeln!(f, " |__Response....: {}", style_fn(&finding.validation.response))?; + } + writeln!(f, " |Language......: {}", finding.language)?; + writeln!(f, " |Line Num......: {}", finding.line)?; + writeln!(f, " |Path..........: {}", style_fn(&finding.path))?; + if let Some(git) = &finding.git_metadata { + reporter.write_git_metadata_value(f, git)?; } Ok(()) } } - -#[test] -fn test_pretty_format_with_nan_entropy_panics() { - use std::{ - io::Cursor, - sync::{Arc, Mutex}, - }; - - use http::StatusCode; - use url::Url; - - use crate::{ - blob::BlobMetadata, - cli::commands::{ - github::{GitCloneMode, GitHistoryMode, GitHubRepoType}, - gitlab::GitLabRepoType, - inputs::{ContentFilteringArgs, InputSpecifierArgs}, - output::{OutputArgs, ReportOutputFormat}, - rules::RuleSpecifierArgs, - scan::{ConfidenceLevel, ScanArgs}, - }, - location::{Location, OffsetSpan, SourcePoint, SourceSpan}, - matcher::{Match, SerializableCaptures}, - origin::{Origin, OriginSet}, - reporter::{DetailsReporter, Styles}, - }; - - // Construct a fake match with NaN entropy - let m = Match { - rule_name: "dummy_rule".into(), - rule_text_id: "dummy.id".into(), - finding_fingerprint: 123456789, - rule_finding_fingerprint: "abc".into(), - location: Location { - offset_span: OffsetSpan { start: 0, end: 1 }, - source_span: SourceSpan { - start: SourcePoint { line: 1, column: 0 }, - end: SourcePoint { line: 1, column: 10 }, - }, - }, - blob_id: crate::blob::BlobId::default(), - groups: SerializableCaptures { captures: vec![] }, - rule_confidence: crate::rules::rule::Confidence::Medium, - validation_success: true, - validation_response_status: StatusCode::OK.as_u16(), - validation_response_body: "OK".into(), - calculated_entropy: f32::NAN, // Here's the trigger - visible: true, - }; - - let _rm = crate::reporter::ReportMatch { - origin: OriginSet::new(Origin::from_file("dummy.txt".into()), vec![]), - blob_metadata: BlobMetadata { - id: m.blob_id, - num_bytes: 1, - mime_essence: None, - charset: None, - language: Some("Rust".into()), - }, - m, - comment: None, - visible: true, - match_confidence: crate::rules::rule::Confidence::Medium, - validation_response_body: "OK".into(), - validation_response_status: StatusCode::OK.as_u16(), - validation_success: true, - }; - - let store = Arc::new(Mutex::new(crate::findings_store::FindingsStore::new(".".into()))); - let reporter = - DetailsReporter { datastore: store, styles: Styles::new(false), only_valid: false }; - - let mut buf = Cursor::new(Vec::new()); - let args = ScanArgs { - // core execution / performance - num_jobs: 1, - no_dedup: false, - - // rule selection - rules: RuleSpecifierArgs { - rules_path: Vec::new(), - rule: vec!["all".into()], - load_builtins: true, - }, - - // input discovery - input_specifier_args: InputSpecifierArgs { - path_inputs: Vec::new(), - git_url: Vec::new(), - github_user: Vec::new(), - github_organization: Vec::new(), - all_github_organizations: false, - github_api_url: url::Url::parse("https://api.github.com/").unwrap(), - github_repo_type: GitHubRepoType::Source, - // new GitLab defaults - gitlab_user: Vec::new(), - gitlab_group: Vec::new(), - all_gitlab_groups: false, - gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(), - gitlab_repo_type: GitLabRepoType::Owner, - // Jira options - jira_url: None, - jql: None, - max_results: 100, - - // Slack options - slack_query: None, - slack_api_url: Url::parse("https://slack.com/api/").unwrap(), - // s3 - s3_bucket: None, - s3_prefix: None, - role_arn: None, - aws_local_profile: None, - // Docker image scanning - docker_image: Vec::new(), - // git clone / history options - git_clone: GitCloneMode::Bare, - git_history: GitHistoryMode::Full, - scan_nested_repos: true, - commit_metadata: true, - }, - - // content filtering - content_filtering_args: ContentFilteringArgs { - max_file_size_mb: 25.0, - no_extract_archives: false, - extraction_depth: 2, - exclude: Vec::new(), // Exclude patterns - no_binary: true, - }, - - // scanning behaviour - confidence: ConfidenceLevel::Medium, - no_validate: false, - rule_stats: false, - only_valid: false, - min_entropy: None, - redact: false, - git_repo_timeout: 1800, // 30 minutes - - // output - output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, - - // display - snippet_length: 256, - baseline_file: None, - manage_baseline: false, - }; - - // This will panic if the entropy isn't checked for NaN - let _result = reporter.pretty_format(&mut buf, &args); - // assert!(result.is_err() || result.is_ok(), "Should not crash"); // remove this line if panic - // is expected pre-fix -} diff --git a/src/reporter/sarif_format.rs b/src/reporter/sarif_format.rs index 033d37c..ff771dd 100644 --- a/src/reporter/sarif_format.rs +++ b/src/reporter/sarif_format.rs @@ -1,274 +1,56 @@ -use std::collections::HashMap; +use std::collections::{BTreeMap, HashSet}; use rayon::prelude::*; use serde_sarif::sarif; use super::*; -use crate::{bstring_escape::Escaped, defaults::get_builtin_rules, origin::get_repo_url}; -#[derive(Hash, Eq, PartialEq)] -struct LocationKey { - file_path: String, - line: usize, - column_start: usize, - column_end: usize, - text: String, -} +use crate::defaults::get_builtin_rules; + impl DetailsReporter { - fn make_sarif_result( - &self, - finding: &Finding, - no_dedup: bool, - args: &cli::commands::scan::ScanArgs, - ) -> Result { - // Deduplicate exactly as in the JSON reporter - // let matches = self.deduplicate_matches(finding.matches.clone(), no_dedup); - // Deduplicate exactly as in the JSON reporter - but only if no_dedup is false - let matches = if no_dedup { - finding.matches.clone() - } else { - self.deduplicate_matches(finding.matches.clone(), no_dedup) - }; + fn record_to_sarif_result(&self, record: &FindingReporterRecord) -> Result { + let finding = &record.finding; + let artifact_location = + sarif::ArtifactLocationBuilder::default().uri(finding.path.clone()).build()?; + let region = sarif::RegionBuilder::default() + .start_line(finding.line as i64) + .start_column(finding.column_start as i64) + .end_line(finding.line as i64) + .end_column(finding.column_end as i64) + .snippet( + sarif::ArtifactContentBuilder::default().text(finding.snippet.clone()).build()?, + ) + .build()?; - let metadata = &finding.metadata; - - let mut location_map: HashMap> = HashMap::new(); - for rm in &matches { - let source_span = &rm.m.location.source_span; - let snippet = - rm.m.groups - .captures - .get(1) - .or_else(|| rm.m.groups.captures.get(0)) - .map(|capture| capture.value.as_bytes()) - .unwrap_or(&[]); - let key = LocationKey { - file_path: rm - .origin - .first() - .blob_path() - .map(|p| p.to_string_lossy().into_owned()) - .unwrap_or_default(), - line: source_span.start.line, - column_start: source_span.start.column, - column_end: source_span.end.column, - text: Escaped(snippet).to_string(), - }; - location_map.entry(key).or_default().push((&rm.origin, &rm.m)); + let mut props = BTreeMap::new(); + props.insert("validation_status".to_string(), serde_json::json!(finding.validation.status)); + props.insert("entropy".to_string(), serde_json::json!(finding.entropy)); + if let Some(git) = &finding.git_metadata { + props.insert("git_metadata".to_string(), git.clone()); } + let properties = + sarif::PropertyBagBuilder::default().additional_properties(props).build()?; - let mut fpu64: u64 = 0; + let location = sarif::LocationBuilder::default() + .physical_location( + sarif::PhysicalLocationBuilder::default() + .artifact_location(artifact_location) + .region(region) + .build()?, + ) + .properties(properties) + .build()?; - let locations: Vec = location_map - .into_iter() - .filter_map(|(key, matches)| { - let (prov, m) = matches[0]; - let source_span = &m.location.source_span; - let mut artifact_locations = Vec::new(); - let mut git_metadata_list = Vec::new(); - - fpu64 = m.finding_fingerprint; - - for p in prov.iter() { - match p { - Origin::File(e) => { - let uri = if let Some(url) = self.jira_issue_url(&e.path, args) { - url - } else if let Some(url) = self.slack_message_url(&e.path) { - url - } else if let Some(mapped) = self.s3_display_path(&e.path) { - mapped - } else { - e.path.display().to_string() - }; - artifact_locations.push( - sarif::ArtifactLocationBuilder::default().uri(uri).build().ok()?, - ); - } - Origin::Extended(e) => { - if let Some(p) = e.path() { - artifact_locations.push( - sarif::ArtifactLocationBuilder::default() - .uri(p.display().to_string()) - .build() - .ok()?, - ); - } - } - Origin::GitRepo(e) => { - // Extract and store Git metadata - if let Some(git_metadata) = self.extract_git_metadata(e, source_span) { - git_metadata_list.push(git_metadata); - } - - // Build Git artifact location - if let Some(cs) = &e.first_commit { - let repo_url = get_repo_url(&e.repo_path) - .unwrap_or_else(|_| { - e.repo_path.to_string_lossy().to_string().into() - }) - .trim_end_matches(".git") - .to_string(); - let git_url = format!( - "{}/blob/{}/{}#L{}", - repo_url, - cs.commit_metadata.commit_id, - cs.blob_path, - source_span.start.line - ); - artifact_locations.push( - sarif::ArtifactLocationBuilder::default() - .uri(git_url) - .build() - .ok()?, - ); - } - } - } - } - - if artifact_locations.is_empty() { - return None; - } - - let region = sarif::RegionBuilder::default() - .start_line(key.line as i64) - .start_column(key.column_start as i64) - .end_line(key.line as i64) - .end_column(key.column_end as i64) - .snippet(sarif::ArtifactContentBuilder::default().text(key.text).build().ok()?) - .build() - .ok()?; - - let logical_location = sarif::LogicalLocationBuilder::default() - .kind("blob") - .name(m.finding_fingerprint.to_string()) - .build() - .ok()?; - - let validation_status = - if m.validation_response_status == StatusCode::CONTINUE.as_u16() { - "Not Attempted" - } else if m.validation_success { - "Active Credential" - } else { - "Inactive Credential" - }; - - // Build combined properties including Git metadata and fingerprint - let mut props = std::collections::BTreeMap::new(); - props.insert("validation_status".to_string(), serde_json::json!(validation_status)); - - props.insert( - "entropy".to_string(), - serde_json::json!(format!("{:.2}", m.calculated_entropy)), - ); - - // Add the fingerprint property from the match - props.insert("fingerprint".to_string(), serde_json::json!(m.finding_fingerprint)); - - if !git_metadata_list.is_empty() { - props.insert("git_metadata".to_string(), serde_json::json!(git_metadata_list)); - } - - let properties = sarif::PropertyBagBuilder::default() - .additional_properties(props) - .build() - .ok()?; - - // Create locations for each artifact location - let locations = artifact_locations - .into_iter() - .map(|artifact_location| { - sarif::LocationBuilder::default() - .physical_location( - sarif::PhysicalLocationBuilder::default() - .artifact_location(artifact_location) - .region(region.clone()) - .build() - .ok()?, - ) - .logical_locations(vec![logical_location.clone()]) - .properties(properties.clone()) - .build() - .ok() - }) - .collect::>>()?; - Some(locations) - }) - .flatten() - .collect(); - // let message = sarif::MessageBuilder::default() - // .text(format!( - // "Rule {} found {} unique {}.\nFirst blob id matched: {}", - // metadata.rule_name, - // locations.len(), - // if locations.len() == 1 { "match" } else { "matches" }, - // first_match_blob_id - // )) - // .build()?; - // Create detailed message from first location's information - let detailed_msg = if let Some(first_match) = matches.first() { - let mut msg = format!( - "Rule {} found {} unique {}.\n", - metadata.rule_name, - locations.len(), - if locations.len() == 1 { "match" } else { "matches" } - ); - // Add file or Git information based on origin - // Get first origin of first match - we know this exists - let p = first_match.origin.first(); - match p { - Origin::File(e) => { - let uri = if let Some(url) = self.jira_issue_url(&e.path, args) { - url - } else if let Some(url) = self.slack_message_url(&e.path) { - url - } else if let Some(mapped) = self.s3_display_path(&e.path) { - mapped - } else { - e.path.display().to_string() - }; - msg.push_str(&format!("Location: {}\n", uri)); - } - Origin::Extended(e) => { - if let Some(p) = e.path() { - msg.push_str(&format!("Location: {}\n", p.display())); - } - } - Origin::GitRepo(e) => { - if let Some(cs) = &e.first_commit { - let repo_url = get_repo_url(&e.repo_path) - .unwrap_or_else(|_| e.repo_path.to_string_lossy().to_string().into()) - .trim_end_matches(".git") - .to_string(); - // Add commit and author information - let cmd = &cs.commit_metadata; - msg.push_str(&format!("Repository: {}\n", repo_url)); - msg.push_str(&format!("Commit: {}\n", cmd.commit_id)); - msg.push_str(&format!( - "Committer: {} <{}>\n", - String::from_utf8_lossy(&cmd.committer_name), - String::from_utf8_lossy(&cmd.committer_email) - )); - msg.push_str(&format!("File: {}", cs.blob_path)); - } - } - } - msg - } else { - format!("Rule {} found {} unique matches.", metadata.rule_name, locations.len(),) - }; - let message = sarif::MessageBuilder::default().text(detailed_msg).build()?; - let fingerprint_name = "fingerprint".to_string(); - let fingerprint = fpu64.to_string(); + let message = sarif::MessageBuilder::default() + .text(format!("Rule {} matched {}", record.rule.name, finding.path)) + .build()?; let result = sarif::ResultBuilder::default() - .rule_id(&metadata.rule_name) + .rule_id(&record.rule.name) .message(message) .kind(sarif::ResultKind::Review.to_string()) - .locations(locations) + .locations(vec![location]) .level(sarif::ResultLevel::Warning.to_string()) - .partial_fingerprints([(fingerprint_name, fingerprint)]) + .partial_fingerprints([("fingerprint".to_string(), finding.fingerprint.clone())]) .build()?; Ok(result) } @@ -276,54 +58,11 @@ impl DetailsReporter { pub fn sarif_format( &self, mut writer: W, - no_dedup: bool, + _no_dedup: bool, args: &cli::commands::scan::ScanArgs, ) -> Result<()> { - // Gather findings first - let mut findings = self.gather_findings()?; - - // If no_dedup is true, expand findings with multiple origins into separate findings - if no_dedup { - let mut expanded_findings = Vec::new(); - for finding in findings { - // Check matches with multiple origins - let matches_with_multiple_origins: Vec<_> = - finding.matches.iter().filter(|rm| rm.origin.len() > 1).collect(); - - if !matches_with_multiple_origins.is_empty() { - // For each match with multiple origins, create separate findings - for rm in matches_with_multiple_origins { - for origin in rm.origin.iter() { - // Create a single-origin match - let single_origin_rm = ReportMatch { - origin: OriginSet::new(origin.clone(), Vec::new()), - blob_metadata: rm.blob_metadata.clone(), - m: rm.m.clone(), - comment: rm.comment.clone(), - visible: rm.visible, - match_confidence: rm.match_confidence, - validation_response_body: rm.validation_response_body.clone(), - validation_response_status: rm.validation_response_status, - validation_success: rm.validation_success, - }; - - // Create a new finding with just this single-origin match - let new_finding = - Finding::new(finding.metadata.clone(), vec![single_origin_rm]); - expanded_findings.push(new_finding); - } - } - } else { - // If the finding has no matches with multiple origins, keep it as is - expanded_findings.push(finding); - } - } - findings = expanded_findings; - } - - // Filter only rules relevant to the findings - let finding_rule_ids: std::collections::HashSet<_> = - findings.iter().map(|f| f.metadata.rule_name.clone()).collect(); + let records = self.build_finding_records(args)?; + let finding_rule_ids: HashSet<_> = records.iter().map(|r| r.rule.name.clone()).collect(); let rules: Vec = get_builtin_rules(None)? .iter_rules() .par_bridge() @@ -366,10 +105,9 @@ impl DetailsReporter { ) .build()?; - let sarif_results: Vec = findings - .par_iter() - .filter_map(|f| self.make_sarif_result(f, no_dedup, args).ok()) - .collect(); + let sarif_results: Vec = + records.iter().filter_map(|r| self.record_to_sarif_result(r).ok()).collect(); + let run = sarif::RunBuilder::default().tool(tool).results(sarif_results).build()?; let sarif = sarif::SarifBuilder::default() .version(sarif::Version::V2_1_0.to_string()) diff --git a/src/s3.rs b/src/s3.rs index ed18a52..0f1fcef 100644 --- a/src/s3.rs +++ b/src/s3.rs @@ -2,12 +2,12 @@ use anyhow::{Context, Result}; use aws_config::{defaults, meta::region::RegionProviderChain, BehaviorVersion}; use aws_credential_types::Credentials; use aws_sdk_s3::{ + error::ProvideErrorMetadata, // for .code() + operation::list_objects_v2::ListObjectsV2Error, // modeled service error Client, - operation::list_objects_v2::ListObjectsV2Error, // modeled service error - error::ProvideErrorMetadata, // for .code() }; use aws_types::region::Region; -use reqwest; // HTTP client for HEAD fallback +use reqwest; // HTTP client for HEAD fallback pub async fn visit_bucket_objects( bucket: &str, @@ -43,9 +43,7 @@ where .configure(&config) .build() .await; - let conf = aws_sdk_s3::config::Builder::from(&config) - .credentials_provider(assume) - .build(); + let conf = aws_sdk_s3::config::Builder::from(&config).credentials_provider(assume).build(); Client::from_conf(conf) } else { Client::new(&config) @@ -66,7 +64,7 @@ where // On error, extract the modeled service error Err(err) => { - let svc_err: ListObjectsV2Error = err.into_service_error(); // from SdkError + let svc_err: ListObjectsV2Error = err.into_service_error(); // from SdkError // If the bucket must be addressed at another region... if svc_err.code() == Some("PermanentRedirect") { diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index 735e381..59bc536 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -21,14 +21,16 @@ use crate::{ findings_store, git_binary::{CloneMode, Git}, git_url::GitUrl, - github, gitlab, jira, + github, gitlab, + guesser::Guesser, + jira, matcher::{Match, Matcher, MatcherStats}, origin::{Origin, OriginSet}, rules_database::RulesDatabase, s3, scanner::processing::BlobProcessor, scanner_pool::ScannerPool, - slack, guesser::Guesser, PathBuf, + slack, PathBuf, }; pub type DatastoreMessage = (OriginSet, BlobMetadata, Vec<(Option, Match)>); @@ -291,7 +293,6 @@ pub async fn fetch_slack_messages( Ok(vec![output_dir]) } - pub async fn fetch_s3_objects( args: &scan::ScanArgs, datastore: &Arc>, @@ -330,10 +331,12 @@ pub async fn fetch_s3_objects( ); let blob = crate::blob::Blob::from_bytes(bytes); - if let Some((origin, blob_md, scored_matches)) = processor.run(origin, blob, args.no_dedup)? { + if let Some((origin, blob_md, scored_matches)) = + processor.run(origin, blob, args.no_dedup)? + { // Wrap origin & metadata once: let origin_arc = Arc::new(origin); - let blob_arc = Arc::new(blob_md); + let blob_arc = Arc::new(blob_md); // Now build a batch of exactly one FindingsStoreMessage per Match let mut batch = Vec::with_capacity(scored_matches.len()); @@ -350,4 +353,4 @@ pub async fn fetch_s3_objects( .await?; Ok(()) -} \ No newline at end of file +} diff --git a/src/scanner/runner.rs b/src/scanner/runner.rs index 63f7bee..33dc644 100644 --- a/src/scanner/runner.rs +++ b/src/scanner/runner.rs @@ -74,7 +74,6 @@ pub async fn run_async_scan( let slack_dirs = fetch_slack_messages(args, global_args, &datastore).await?; input_roots.extend(slack_dirs); - // Save Docker images if specified if !args.input_specifier_args.docker_image.is_empty() { let clone_root = { @@ -129,7 +128,6 @@ pub async fn run_async_scan( )?; } - if !args.no_dedup { // Final deduplication step before validation (or before reporting) let reporter = crate::reporter::DetailsReporter { diff --git a/tests/int_s3.rs b/tests/int_s3.rs index c44afe8..c8d6b9c 100644 --- a/tests/int_s3.rs +++ b/tests/int_s3.rs @@ -4,25 +4,26 @@ use kingfisher::s3::visit_bucket_objects; #[tokio::test] async fn test_visit_public_bucket() -> Result<()> { let mut objects = Vec::new(); - visit_bucket_objects("awsglue-datasets", Some("examples/us-legislators/all/"), None, None, |key, data| { - objects.push((key, data)); - Ok(()) - }) + visit_bucket_objects( + "awsglue-datasets", + Some("examples/us-legislators/all/"), + None, + None, + |key, data| { + objects.push((key, data)); + Ok(()) + }, + ) .await?; assert!( objects.iter().any(|(k, _)| k.ends_with("events.json")), "events.json object not found" ); - let creds = objects - .iter() - .find(|(k, _)| k.ends_with("events.json")) - .expect("events.json object"); + let creds = + objects.iter().find(|(k, _)| k.ends_with("events.json")).expect("events.json object"); let body = std::str::from_utf8(&creds.1)?; - assert!( - body.contains("Q4450263"), - "expected events.json file" - ); + assert!(body.contains("Q4450263"), "expected events.json file"); Ok(()) -} \ No newline at end of file +}