refactored output reporting and formatting logic

This commit is contained in:
Mick Grove 2025-08-04 08:58:06 -07:00
commit bc05c3e5f2
15 changed files with 1109 additions and 1275 deletions

View file

@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file.
## [1.32.0]
- Added support for scanning AWS S3 buckets via `--s3-bucket` and optional `--s3-prefix`
- Added `--role-arn` and `--aws-local-profile` flags for S3 authentication alongside `KF_AWS_KEY`/`KF_AWS_SECRET`
- Refactored output reporting and formatting logic
## [1.31.0]
- New rules: Telegram bot token, OpenWeatherMap, Apify, Groq
- New OpenAI detectors added (@joshlarsen)

View file

@ -20,6 +20,6 @@ rules:
[a-z0-9\/._~-]*
)?
min_entropy: 3.0
confidence: low
confidence: medium
examples:
- https://eaRIWNkE:qyOIhJiM@j2LYY414Q5cCYD

View file

@ -96,7 +96,6 @@ rules:
["']
min_entropy: 3.3
confidence: low
categories: [fuzzy, generic, secret]
examples:
- |
password = "super$ecret"

719
f1.patch Normal file
View file

@ -0,0 +1,719 @@
diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs
index 9fcb1ecdfe8decc60278848c4a7be43cc9ebee70..b600f9f65838e52ce5dc3d7bb3bb1a5d5ff2bcaf 100644
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@@ -1,436 +1,80 @@
-use http::StatusCode;
-use serde_json::json;
-
use super::*;
-use crate::bstring_escape::Escaped;
impl DetailsReporter {
- pub fn deduplicate_matches(
- &self,
- matches: Vec<ReportMatch>,
- no_dedup: bool,
- ) -> Vec<ReportMatch> {
- if no_dedup {
- return matches;
- }
-
- use std::collections::HashMap;
- let mut by_fp: HashMap<u64, ReportMatch> = HashMap::new();
-
- for rm in matches {
- let fp = rm.m.finding_fingerprint;
- if let Some(existing) = by_fp.get_mut(&fp) {
- // merge origin sets (keep first origin, append the rest)
- for o in rm.origin.iter() {
- if !existing.origin.iter().any(|e| e == o) {
- existing.origin = OriginSet::new(
- existing.origin.first().clone(),
- existing
- .origin
- .iter()
- .skip(1)
- .cloned()
- .chain(std::iter::once(o.clone()))
- .collect(),
- );
- }
- }
- continue;
- }
- by_fp.insert(fp, rm);
- }
- by_fp.into_values().collect()
- }
-
- pub fn gather_json_findings(
- &self,
- args: &cli::commands::scan::ScanArgs,
- ) -> Result<Vec<serde_json::Value>> {
- let mut matches = self.get_filtered_matches()?;
- if !args.no_dedup {
- matches = self.deduplicate_matches(matches, args.no_dedup);
- }
-
- let mut json_findings = Vec::new();
- for rm in matches {
- let source_span = &rm.m.location.source_span;
- let line_num = source_span.start.line;
-
- let snippet = Escaped(
- rm.m.groups
- .captures
- .get(1)
- .or_else(|| rm.m.groups.captures.get(0))
- .map(|capture| capture.value.as_bytes())
- .unwrap_or_default(),
- )
- .to_string();
-
- let validation_status = if rm.validation_success {
- "Active Credential"
- } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() {
- "Not Attempted"
- } else {
- "Inactive Credential"
- };
-
- const MAX_RESPONSE_LENGTH: usize = 512;
- let truncated_body: String =
- rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect();
- let ellipsis =
- if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" };
- let response_body = format!("{}{}", truncated_body, ellipsis);
-
- // Call extract_git_metadata on each GitRepo origin and take the first non-null result.
- let git_metadata_val = rm
- .origin
- .iter()
- .filter_map(|origin| {
- if let Origin::GitRepo(e) = origin {
- self.extract_git_metadata(e, source_span)
- } else {
- None
- }
- })
- .next()
- .unwrap_or(serde_json::Value::Null);
-
- // Collect a file path from an Origin::File, if available.
- let file_path = rm
- .origin
- .iter()
- .find_map(|origin| match origin {
- Origin::File(e) => {
- if let Some(url) = self.jira_issue_url(&e.path, args) {
- Some(url)
- } else if let Some(url) = self.slack_message_url(&e.path) {
- Some(url)
- } else if let Some(mapped) = self.s3_display_path(&e.path) {
- Some(mapped)
- } else if let Some(mapped) = self.docker_display_path(&e.path) {
- Some(mapped)
- } else {
- Some(e.path.display().to_string())
- }
- }
- Origin::Extended(e) => e.path().map(|p| p.display().to_string()),
- _ => None,
- })
- .unwrap_or_default();
-
- let match_json = json!({
- "rule": {
- "name": rm.m.rule_name,
- "id": rm.m.rule_text_id,
- },
- "finding": {
- "snippet": snippet,
- "fingerprint": rm.m.finding_fingerprint.to_string(),
- "confidence": rm.match_confidence.to_string(),
- "entropy": format!("{:.2}", rm.m.calculated_entropy),
- "validation": {
- "status": validation_status,
- "response": response_body,
- },
- "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()),
- "line": line_num,
- "column_start": source_span.start.column,
- "column_end": source_span.end.column,
- "path": file_path,
- "git_metadata": git_metadata_val
- }
- });
-
- let finding_json = json!({
- "id": rm.m.rule_text_id,
- "matches": [ match_json ]
- });
- json_findings.push(finding_json);
- }
- Ok(json_findings)
- }
pub fn json_format<W: std::io::Write>(
&self,
mut writer: W,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
- let mut findings = Vec::new();
-
- // Get filtered matches
- let mut matches = self.get_filtered_matches()?;
-
- // Apply deduplication only if requested
- if !args.no_dedup {
- matches = self.deduplicate_matches(matches, args.no_dedup);
- }
-
- // For each match, handle it based on the no_dedup flag
- for rm in matches {
- if args.no_dedup && rm.origin.len() > 1 {
- // For no_dedup and multiple origins, create separate findings for each origin
- for origin in rm.origin.iter() {
- // Create a single-origin version of this match
- let single_origin_rm = ReportMatch {
- origin: OriginSet::new(origin.clone(), Vec::new()),
- blob_metadata: rm.blob_metadata.clone(),
- m: rm.m.clone(),
- comment: rm.comment.clone(),
- visible: rm.visible,
- match_confidence: rm.match_confidence,
- validation_response_body: rm.validation_response_body.clone(),
- validation_response_status: rm.validation_response_status,
- validation_success: rm.validation_success,
- };
-
- // Process this single-origin match into a JSON finding
- let json_finding = self.process_match_to_json(&single_origin_rm, args)?;
- findings.push(json_finding);
- }
- } else {
- // Process normally for deduped matches or matches with only one origin
- let json_finding = self.process_match_to_json(&rm, args)?;
- findings.push(json_finding);
- }
- }
-
- // Write the JSON output
- if !findings.is_empty() {
- serde_json::to_writer_pretty(&mut writer, &findings)?;
+ let records = self.build_finding_records(args)?;
+ if !records.is_empty() {
+ serde_json::to_writer_pretty(&mut writer, &records)?;
writeln!(writer)?;
}
Ok(())
}
- // Add a helper method to convert a ReportMatch to a JSON finding
- pub fn process_match_to_json(
- &self,
- rm: &ReportMatch,
- args: &cli::commands::scan::ScanArgs,
- ) -> Result<serde_json::Value> {
- // Extract the relevant data from the match as you already do in your current implementation
- let source_span = &rm.m.location.source_span;
- let line_num = source_span.start.line;
-
- let snippet = Escaped(
- rm.m.groups
- .captures
- .get(1)
- .or_else(|| rm.m.groups.captures.get(0))
- .map(|capture| capture.value.as_bytes())
- .unwrap_or_default(),
- )
- .to_string();
-
- let validation_status = if rm.validation_success {
- "Active Credential"
- } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() {
- "Not Attempted"
- } else {
- "Inactive Credential"
- };
-
- const MAX_RESPONSE_LENGTH: usize = 512;
- let truncated_body: String =
- rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect();
- let ellipsis =
- if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" };
- let response_body = format!("{}{}", truncated_body, ellipsis);
-
- // Call extract_git_metadata on each GitRepo origin and take the first non-null result.
- let git_metadata_val = rm
- .origin
- .iter()
- .filter_map(|origin| {
- if let Origin::GitRepo(e) = origin {
- self.extract_git_metadata(e, source_span)
- } else {
- None
- }
- })
- .next()
- .unwrap_or(serde_json::Value::Null);
-
- // Collect a file path from an Origin::File, if available.
- let file_path = rm
- .origin
- .iter()
- .find_map(|origin| {
- if let Origin::File(e) = origin {
- if let Some(url) = self.jira_issue_url(&e.path, args) {
- Some(url)
- } else if let Some(url) = self.slack_message_url(&e.path) {
- Some(url)
- } else if let Some(mapped) = self.s3_display_path(&e.path) {
- Some(mapped)
- } else if let Some(mapped) = self.docker_display_path(&e.path) {
- Some(mapped)
- } else {
- Some(e.path.display().to_string())
- }
- } else if let Origin::Extended(e) = origin {
- e.path().map(|p| p.display().to_string())
- } else {
- None
- }
- })
- .unwrap_or_default();
-
- let match_json = json!({
- "rule": {
- "name": rm.m.rule_name,
- "id": rm.m.rule_text_id,
- },
- "finding": {
- "snippet": snippet,
- "fingerprint": rm.m.finding_fingerprint.to_string(),
- "confidence": rm.match_confidence.to_string(),
- "entropy": format!("{:.2}", rm.m.calculated_entropy),
- "validation": {
- "status": validation_status,
- "response": response_body,
- },
- "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()),
- "line": line_num,
- "column_start": source_span.start.column,
- "column_end": source_span.end.column,
- "path": file_path,
- "git_metadata": git_metadata_val
- }
- });
-
- let finding_json = json!({
- "id": rm.m.rule_text_id,
- "matches": [ match_json ]
- });
-
- Ok(finding_json)
- }
- // // Modified JSON format to pass args to gather_json_findings
- // pub fn json_format<W: std::io::Write>(
- // &self,
- // mut writer: W,
- // args: &cli::commands::scan::ScanArgs,
- // ) -> Result<()> {
- // let findings = self.gather_json_findings(args)?;
- // if !findings.is_empty() {
- // serde_json::to_writer_pretty(&mut writer, &findings)?;
- // writeln!(writer)?;
- // }
- // Ok(())
- // }
-
pub fn jsonl_format<W: std::io::Write>(
&self,
mut writer: W,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
- // Get filtered matches
- let mut matches = self.get_filtered_matches()?;
-
- // Apply deduplication only if requested
- if !args.no_dedup {
- matches = self.deduplicate_matches(matches, args.no_dedup);
- }
-
- // For each match, handle it based on the no_dedup flag
- for rm in matches {
- if args.no_dedup && rm.origin.len() > 1 {
- // For no_dedup and multiple origins, create separate findings for each origin
- for origin in rm.origin.iter() {
- // Create a single-origin version of this match
- let single_origin_rm = ReportMatch {
- origin: OriginSet::new(origin.clone(), Vec::new()),
- blob_metadata: rm.blob_metadata.clone(),
- m: rm.m.clone(),
- comment: rm.comment.clone(),
- visible: rm.visible,
- match_confidence: rm.match_confidence,
- validation_response_body: rm.validation_response_body.clone(),
- validation_response_status: rm.validation_response_status,
- validation_success: rm.validation_success,
- };
-
- // Process this single-origin match into a JSON finding and write it
- let json_finding = self.process_match_to_json(&single_origin_rm, args)?;
- serde_json::to_writer(&mut writer, &json_finding)?;
- writeln!(writer)?;
- }
- } else {
- // Process normally for deduped matches or matches with only one origin
- let json_finding = self.process_match_to_json(&rm, args)?;
- serde_json::to_writer(&mut writer, &json_finding)?;
- writeln!(writer)?;
- }
+ let records = self.build_finding_records(args)?;
+ for record in records {
+ serde_json::to_writer(&mut writer, &record)?;
+ writeln!(writer)?;
}
Ok(())
}
- // // Modified JSONL format to pass args to gather_json_findings
- // pub fn jsonl_format<W: std::io::Write>(
- // &self,
- // mut writer: W,
- // args: &cli::commands::scan::ScanArgs,
- // ) -> Result<()> {
- // let findings = self.gather_json_findings(args)?;
- // for finding in findings {
- // serde_json::to_writer(&mut writer, &finding)?;
- // writeln!(writer)?;
- // }
- // Ok(())
- // }
}
#[cfg(test)]
mod tests {
- use std::{
- io::Cursor,
- path::PathBuf,
- sync::{Arc, Mutex},
- };
-
- use anyhow::Result;
- use serde_json::Value;
- use url::Url;
-
use super::*;
use crate::{
blob::BlobId,
- cli::commands::{
- github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
- inputs::{ContentFilteringArgs, InputSpecifierArgs},
- output::OutputArgs,
- rules::RuleSpecifierArgs,
- scan::ConfidenceLevel,
+ cli::commands::github::GitHubRepoType,
+ cli::commands::output::{OutputArgs, ReportOutputFormat},
+ cli::commands::scan::{
+ ConfidenceLevel, ContentFilteringArgs, GitCloneMode, GitHistoryMode,
+ InputSpecifierArgs, RuleSpecifierArgs,
},
findings_store::FindingsStore,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
- matcher::{Match, SerializableCapture, SerializableCaptures},
- origin::{Origin, OriginSet},
- reporter::{ReportMatch, Styles},
- rules::rule::Confidence,
- util::intern,
+ matcher::serializable::{SerializableCapture, SerializableCaptures},
+ matcher::Match,
+ origin::Origin,
+ reporter::styles::Styles,
+ scanner::test_utils::intern,
};
+ use std::{
+ io::Cursor,
+ path::PathBuf,
+ sync::{Arc, Mutex},
+ };
+ use url::Url;
fn create_default_args() -> cli::commands::scan::ScanArgs {
use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope
cli::commands::scan::ScanArgs {
num_jobs: 1,
no_dedup: false,
rules: RuleSpecifierArgs {
rules_path: Vec::new(),
rule: vec!["all".into()],
load_builtins: true,
},
input_specifier_args: InputSpecifierArgs {
// local path / git URL inputs
path_inputs: Vec::new(),
git_url: Vec::new(),
// GitHub
github_user: Vec::new(),
github_organization: Vec::new(),
all_github_organizations: false,
github_api_url: Url::parse("https://api.github.com/").unwrap(),
github_repo_type: GitHubRepoType::Source,
// GitLab
diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs
index 9fcb1ecdfe8decc60278848c4a7be43cc9ebee70..b600f9f65838e52ce5dc3d7bb3bb1a5d5ff2bcaf 100644
--- a/src/reporter/json_format.rs
+++ b/src/reporter/json_format.rs
@@ -458,240 +102,168 @@ mod tests {
git_history: GitHistoryMode::Full,
scan_nested_repos: true,
commit_metadata: true,
},
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
no_extract_archives: false,
extraction_depth: 2,
exclude: Vec::new(), // Exclude patterns
no_binary: true,
},
confidence: ConfidenceLevel::Medium,
no_validate: false,
rule_stats: false,
only_valid: false,
min_entropy: None,
redact: false,
git_repo_timeout: 1800, // 30 minutes
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
}
}
- // Helper function to create a mock Match
fn create_mock_match(
rule_name: &str,
rule_text_id: &str,
rule_finding_fingerprint: &str,
validation_success: bool,
) -> Match {
Match {
location: Location {
offset_span: OffsetSpan { start: 10, end: 20 },
source_span: SourceSpan {
start: SourcePoint { line: 5, column: 10 },
end: SourcePoint { line: 5, column: 20 },
},
},
groups: SerializableCaptures {
captures: vec![SerializableCapture {
name: Some("token".to_string()),
match_number: 1,
start: 10,
end: 20,
value: "mock_token".into(),
}],
},
blob_id: BlobId::new(b"mock_blob"),
finding_fingerprint: 0123,
rule_finding_fingerprint: intern(rule_finding_fingerprint),
rule_text_id: intern(rule_text_id),
- rule_name: intern(rule_name), //.to_string(),
+ rule_name: intern(rule_name),
rule_confidence: Confidence::Medium,
validation_response_body: "validation response".to_string(),
validation_response_status: 200,
validation_success,
calculated_entropy: 4.5,
visible: true,
}
}
- // Helper function to create a mock DetailsReporter
fn setup_mock_reporter(matches: Vec<ReportMatch>) -> DetailsReporter {
let mut datastore = FindingsStore::new(PathBuf::from("/tmp"));
- // Create mock origin and blob metadata for the first test match
if !matches.is_empty() {
let blob_metadata = BlobMetadata {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
};
let dedup = true;
- // Add matches to datastore
for m in matches.clone() {
datastore.record(
vec![(
Arc::new(OriginSet::new(
- // OriginSet -- Arc<…>
Origin::from_file(PathBuf::from("/mock/path/file.rs")),
vec![],
)),
- Arc::new(blob_metadata.clone()), // BlobMetadata -- Arc<…>
+ Arc::new(blob_metadata.clone()),
m.m.clone(),
)],
dedup,
);
}
}
DetailsReporter {
datastore: Arc::new(Mutex::new(datastore)),
styles: Styles::new(false),
only_valid: false,
}
}
+
#[test]
fn test_json_format() -> Result<()> {
- // Create a mock match with successful validation
let mock_match =
create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
let matches = vec![ReportMatch {
origin: OriginSet::new(Origin::from_file(PathBuf::from("/mock/path/file.rs")), vec![]),
blob_metadata: BlobMetadata {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
},
m: mock_match,
comment: None,
match_confidence: Confidence::Medium,
visible: true,
validation_response_body: "validation response".to_string(),
validation_response_status: 200,
validation_success: true,
}];
let reporter = setup_mock_reporter(matches);
let mut output = Cursor::new(Vec::new());
- // Call the json_format method
reporter.json_format(&mut output, &create_default_args())?;
- // Parse and validate JSON output
- let json_output: Vec<Value> = serde_json::from_slice(&output.into_inner())?;
+ let json_output: Vec<serde_json::Value> = serde_json::from_slice(&output.into_inner())?;
assert!(!json_output.is_empty(), "JSON output should not be empty");
- let first_finding = &json_output[0];
- assert!(first_finding.get("id").is_some(), "Finding should have an 'id'");
- assert!(first_finding.get("matches").is_some(), "Finding should have 'matches'");
- // Validate the structure of the first match
- let matches = first_finding.get("matches").unwrap().as_array().unwrap();
- let first_match = &matches[0];
- assert_eq!(first_match.get("rule").unwrap().get("name").unwrap(), "MockRule");
- assert_eq!(first_match.get("finding").unwrap().get("language").unwrap(), "Rust");
+ let first = &json_output[0];
+ assert_eq!(first["rule"]["name"], "MockRule");
+ assert_eq!(first["finding"]["language"], "Rust");
Ok(())
}
- // #[test]
- // fn test_jsonl_format() -> Result<()> {
- // // Create a mock match with successful validation
- // let mock_match =
- // create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
- // let matches = vec![ReportMatch {
- // origin: OriginSet::new(
- // Origin::from_file(PathBuf::from("/mock/path/file.rs")),
- // vec![],
- // ),
- // blob_metadata: BlobMetadata {
- // id: BlobId::new(b"mock_blob"),
- // num_bytes: 1024,
- // mime_essence: Some("text/plain".to_string()),
- // charset: Some("UTF-8".to_string()),
- // language: Some("Rust".to_string()),
- // },
- // m: mock_match,
- // comment: None,
- // match_confidence: Confidence::Medium,
- // visible: true,
- // validation_response_body: "validation response".to_string(),
- // validation_response_status: 200,
- // validation_success: true,
- // }];
- // let reporter = setup_mock_reporter(matches);
- // let mut output = Cursor::new(Vec::new());
- // // Call the jsonl_format method
- // reporter.jsonl_format(&mut output, &create_default_args())?;
- // // Split output into lines and validate
- // let jsonl_output = String::from_utf8(output.into_inner())?;
- // let lines: Vec<&str> = jsonl_output.lines().collect();
- // assert!(!lines.is_empty(), "JSONL output should not be empty");
- // for line in &lines {
- // let json_value: serde_json::Value = serde_json::from_str(line)?;
- // assert!(
- // json_value.get("rule_name").is_some(),
- // "Each line should have a 'rule_name'"
- // );
- // assert!(
- // json_value.get("matches").is_some(),
- // "Each line should have 'matches'"
- // );
- // }
- // Ok(())
- // }
-
#[test]
fn test_validation_status_in_json() -> Result<()> {
- // Test validation status in JSON output
let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")];
for (validation_success, expected_status) in test_cases {
let mock_match = create_mock_match(
"MockRule",
"mock_rule_1",
"mock_finding_fingerprint",
validation_success,
);
let matches = vec![ReportMatch {
origin: OriginSet::new(
Origin::from_file(PathBuf::from("/mock/path/file.rs")),
vec![],
),
blob_metadata: BlobMetadata {
id: BlobId::new(b"mock_blob"),
num_bytes: 1024,
mime_essence: Some("text/plain".to_string()),
charset: Some("UTF-8".to_string()),
language: Some("Rust".to_string()),
},
m: mock_match,
comment: None,
match_confidence: Confidence::Medium,
visible: true,
validation_response_body: "validation response".to_string(),
validation_response_status: 200,
validation_success,
}];
let reporter = setup_mock_reporter(matches);
let mut output = Cursor::new(Vec::new());
- // Call the json_format method
reporter.json_format(&mut output, &create_default_args())?;
- // Parse and validate JSON output
- let json_output: Vec<Value> = serde_json::from_slice(&output.into_inner())?;
+ let json_output: Vec<serde_json::Value> = serde_json::from_slice(&output.into_inner())?;
assert!(!json_output.is_empty(), "JSON output should not be empty");
- let first_finding = &json_output[0];
- let matches = first_finding.get("matches").unwrap().as_array().unwrap();
- let first_match = &matches[0];
- let validation_status = first_match
- .get("finding")
- .unwrap()
- .get("validation")
- .unwrap()
- .get("status")
- .unwrap()
- .as_str()
- .unwrap();
+ let first = &json_output[0];
+ let validation_status = first["finding"]["validation"]["status"].as_str().unwrap();
assert_eq!(validation_status, expected_status);
}
Ok(())
}
}

View file

@ -124,7 +124,6 @@ pub struct InputSpecifierArgs {
#[arg(long, requires = "s3_bucket")]
pub aws_local_profile: Option<String>,
/// Docker/OCI images to scan (no local Docker required)
#[arg(long = "docker-image")]
pub docker_image: Vec<String>,

View file

@ -29,8 +29,8 @@ pub mod rule_loader;
pub mod rule_profiling;
pub mod rules;
pub mod rules_database;
pub mod safe_list;
pub mod s3;
pub mod safe_list;
pub mod scanner;
pub mod scanner_pool;
pub mod serde_utils;

View file

@ -5,12 +5,12 @@ use std::{
use anyhow::Result;
use http::StatusCode;
use indenter::indented;
use schemars::JsonSchema;
use serde::Serialize;
use crate::{
blob::BlobMetadata,
bstring_escape::Escaped,
cli,
cli::global::GlobalArgs,
finding_data, findings_store,
@ -226,38 +226,6 @@ impl DetailsReporter {
.collect())
}
// fn process_matches(&self, only_valid: bool) -> Result<Vec<ReportMatch>> {
// let datastore = self.datastore.lock().unwrap();
// Ok(datastore
// .get_matches()
// .iter()
// .filter(|msg| {
// let (_origin, _blob_metadata, match_item) = &***msg;
// if only_valid {
// match_item.validation_success
// && match_item.validation_response_status != StatusCode::CONTINUE.as_u16()
// && match_item.visible
// } else {
// match_item.visible
// }
// })
// .map(|msg| {
// let (origin, blob_metadata, match_item) = &**msg;
// ReportMatch {
// origin: origin.clone(),
// blob_metadata: blob_metadata.clone(),
// m: match_item.clone(),
// comment: None,
// visible: match_item.visible,
// match_confidence: match_item.rule_confidence,
// validation_response_body: match_item.validation_response_body.clone(),
// validation_response_status: match_item.validation_response_status,
// validation_success: match_item.validation_success,
// }
// })
// .collect())
// }
/// Returns the matches produced by `process_matches` using this reporter's
/// own `only_valid` setting.
// NOTE(review): the second argument (`true`) is opaque from here; the sibling
// accessor passes `false`. Confirm its meaning against `process_matches`.
pub fn get_filtered_matches(&self) -> Result<Vec<ReportMatch>> {
self.process_matches(self.only_valid, true)
}
@ -266,6 +234,166 @@ impl DetailsReporter {
self.process_matches(only_valid.unwrap_or(self.only_valid), false)
}
/// Collapses matches that share a `finding_fingerprint` into a single entry.
///
/// The first match seen for a fingerprint is kept; origins of later matches
/// with the same fingerprint are appended to its origin set unless an equal
/// origin is already present. When `no_dedup` is `true` the input is returned
/// untouched.
///
/// The result preserves the order in which each fingerprint was first seen,
/// so repeated runs over the same input produce identically ordered reports
/// (a plain `HashMap` iteration order is arbitrary).
pub fn deduplicate_matches(
    &self,
    matches: Vec<ReportMatch>,
    no_dedup: bool,
) -> Vec<ReportMatch> {
    if no_dedup {
        return matches;
    }

    use std::collections::HashMap;

    // `unique` holds the surviving matches in first-seen order;
    // `slot_by_fp` maps a fingerprint to its index in `unique`.
    let mut unique: Vec<ReportMatch> = Vec::new();
    let mut slot_by_fp: HashMap<u64, usize> = HashMap::new();

    for rm in matches {
        let fp = rm.m.finding_fingerprint;

        if let Some(&slot) = slot_by_fp.get(&fp) {
            let existing = &mut unique[slot];
            // merge origin sets (keep first origin, append the rest)
            for o in rm.origin.iter() {
                if !existing.origin.iter().any(|e| e == o) {
                    existing.origin = OriginSet::new(
                        existing.origin.first().clone(),
                        existing
                            .origin
                            .iter()
                            .skip(1)
                            .cloned()
                            .chain(std::iter::once(o.clone()))
                            .collect(),
                    );
                }
            }
            continue;
        }

        slot_by_fp.insert(fp, unique.len());
        unique.push(rm);
    }

    unique
}
/// Selects the matches that should appear in a report.
///
/// With deduplication enabled (`args.no_dedup == false`) the filtered matches
/// are merged by fingerprint. With `args.no_dedup == true` every match that
/// carries more than one origin is expanded into one copy per origin, each
/// copy holding a single-origin `OriginSet`.
fn matches_for_output(&self, args: &cli::commands::scan::ScanArgs) -> Result<Vec<ReportMatch>> {
    let filtered = self.get_filtered_matches()?;

    if !args.no_dedup {
        return Ok(self.deduplicate_matches(filtered, args.no_dedup));
    }

    let mut per_origin = Vec::with_capacity(filtered.len());
    for found in filtered {
        if found.origin.len() <= 1 {
            per_origin.push(found);
            continue;
        }
        // One record per origin, each otherwise identical to the source match.
        per_origin.extend(found.origin.iter().map(|origin| {
            let mut copy = found.clone();
            copy.origin = OriginSet::new(origin.clone(), Vec::new());
            copy
        }));
    }
    Ok(per_origin)
}
/// Converts a single [`ReportMatch`] into the serializable record shared by
/// the structured output formats.
///
/// * `rm` - the match to describe.
/// * `args` - scan arguments, forwarded to `jira_issue_url` when resolving the
///   display path.
///
/// The snippet is taken from capture group 1, falling back to group 0, and is
/// escaped for display. The validation response is limited to
/// `MAX_RESPONSE_LENGTH` characters, with `...` appended only when characters
/// were actually dropped.
pub fn build_finding_record(
    &self,
    rm: &ReportMatch,
    args: &cli::commands::scan::ScanArgs,
) -> FindingReporterRecord {
    let source_span = &rm.m.location.source_span;
    let line_num = source_span.start.line;

    let snippet = Escaped(
        rm.m.groups
            .captures
            .get(1)
            .or_else(|| rm.m.groups.captures.get(0))
            .map(|capture| capture.value.as_bytes())
            .unwrap_or_default(),
    )
    .to_string();

    let validation_status = if rm.validation_success {
        "Active Credential"
    } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() {
        "Not Attempted"
    } else {
        "Inactive Credential"
    }
    .to_string();

    const MAX_RESPONSE_LENGTH: usize = 512;
    // Truncate and detect truncation with the same unit (chars). Comparing the
    // byte length against a char limit would add "..." to multi-byte bodies
    // that were not shortened at all.
    let mut body_chars = rm.validation_response_body.chars();
    let mut response_body: String = body_chars.by_ref().take(MAX_RESPONSE_LENGTH).collect();
    if body_chars.next().is_some() {
        response_body.push_str("...");
    }

    // First git origin that yields metadata, if any.
    let git_metadata_val = rm.origin.iter().find_map(|origin| {
        if let Origin::GitRepo(e) = origin {
            self.extract_git_metadata(e, source_span)
        } else {
            None
        }
    });

    // Display path: the first file-like origin wins. For plain files the
    // mapped forms (Jira, Slack, S3, Docker) are tried in that order before
    // falling back to the raw filesystem path.
    let file_path = rm
        .origin
        .iter()
        .find_map(|origin| match origin {
            Origin::File(e) => self
                .jira_issue_url(&e.path, args)
                .or_else(|| self.slack_message_url(&e.path))
                .or_else(|| self.s3_display_path(&e.path))
                .or_else(|| self.docker_display_path(&e.path))
                .or_else(|| Some(e.path.display().to_string())),
            Origin::Extended(e) => e.path().map(|p| p.display().to_string()),
            _ => None,
        })
        .unwrap_or_default();

    FindingReporterRecord {
        rule: RuleMetadata {
            name: rm.m.rule_name.to_string(),
            id: rm.m.rule_text_id.to_string(),
        },
        finding: FindingRecordData {
            snippet,
            fingerprint: rm.m.finding_fingerprint.to_string(),
            confidence: rm.match_confidence.to_string(),
            entropy: format!("{:.2}", rm.m.calculated_entropy),
            validation: ValidationInfo { status: validation_status, response: response_body },
            language: rm
                .blob_metadata
                .language
                .clone()
                .unwrap_or_else(|| "Unknown".to_string()),
            // NOTE(review): `as u32` silently truncates if the source span
            // fields are wider than 32 bits; confirm their type.
            line: line_num as u32,
            column_start: source_span.start.column as u32,
            column_end: source_span.end.column as u32,
            path: file_path,
            git_metadata: git_metadata_val,
        },
    }
}
/// Builds one [`FindingReporterRecord`] for every match selected for output.
///
/// Fails only if selecting the matches (`matches_for_output`) fails.
pub fn build_finding_records(
    &self,
    args: &cli::commands::scan::ScanArgs,
) -> Result<Vec<FindingReporterRecord>> {
    let selected = self.matches_for_output(args)?;
    let mut records = Vec::with_capacity(selected.len());
    for rm in &selected {
        records.push(self.build_finding_record(rm, args));
    }
    Ok(records)
}
fn get_finding_data(&self) -> Result<Vec<finding_data::FindingMetadata>> {
let datastore = self.datastore.lock().unwrap();
Ok(datastore
@ -388,6 +516,41 @@ pub struct ReportMatch {
/// Validation Success
pub validation_success: bool,
}
/// One reported finding: the rule that matched plus the details of the match.
#[derive(Serialize, JsonSchema, Clone, Debug)]
pub struct FindingReporterRecord {
/// Name and identifier of the rule that produced the finding.
pub rule: RuleMetadata,
/// Details of the match itself.
pub finding: FindingRecordData,
}
/// Identifies the rule behind a finding.
#[derive(Serialize, JsonSchema, Clone, Debug)]
pub struct RuleMetadata {
/// Rule name, taken from the match's `rule_name`.
pub name: String,
/// Rule identifier, taken from the match's `rule_text_id`.
pub id: String,
}
/// Outcome of validating a finding.
#[derive(Serialize, JsonSchema, Clone, Debug)]
pub struct ValidationInfo {
/// One of "Active Credential", "Not Attempted" or "Inactive Credential".
pub status: String,
/// Validation response body, shortened with a trailing "..." when long.
pub response: String,
}
/// Details of a single match as written to structured reports.
#[derive(Serialize, JsonSchema, Clone, Debug)]
pub struct FindingRecordData {
/// Escaped text of the matched capture group.
pub snippet: String,
/// Finding fingerprint rendered as a decimal string.
pub fingerprint: String,
/// Match confidence rendered as a string.
pub confidence: String,
/// Calculated entropy formatted with two decimal places.
pub entropy: String,
/// Validation status and response for the finding.
pub validation: ValidationInfo,
/// Language recorded for the blob, or "Unknown" when none is recorded.
pub language: String,
/// Line on which the match starts.
pub line: u32,
/// Column at which the match starts.
pub column_start: u32,
/// Column at which the match ends.
pub column_end: u32,
/// Display path or URL of the origin; empty when no file-like origin exists.
pub path: String,
/// Git metadata for the match; the field is omitted from output when absent.
#[serde(skip_serializing_if = "Option::is_none")]
pub git_metadata: Option<serde_json::Value>,
}
impl From<finding_data::FindingDataEntry> for ReportMatch {
fn from(e: finding_data::FindingDataEntry) -> Self {
ReportMatch {

View file

@ -1,90 +1,17 @@
use bson::Document;
use serde_json::Value;
use super::*;
impl DetailsReporter {
/// Formats findings as BSON and writes them to the provided writer.
/// For testing purposes, prints the full JSON for each finding before converting.
pub fn bson_format<W: std::io::Write>(
&self,
mut writer: W,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
// Get filtered matches
let mut matches = self.get_filtered_matches()?;
// Apply deduplication only if requested
if !args.no_dedup {
matches = self.deduplicate_matches(matches, args.no_dedup);
}
let mut bson_findings = Vec::new();
// For each match, handle it based on the no_dedup flag
for rm in matches {
if args.no_dedup && rm.origin.len() > 1 {
// For no_dedup and multiple origins, create separate findings for each origin
for origin in rm.origin.iter() {
// Create a single-origin version of this match
let single_origin_rm = ReportMatch {
origin: OriginSet::new(origin.clone(), Vec::new()),
blob_metadata: rm.blob_metadata.clone(),
m: rm.m.clone(),
comment: rm.comment.clone(),
visible: rm.visible,
match_confidence: rm.match_confidence,
validation_response_body: rm.validation_response_body.clone(),
validation_response_status: rm.validation_response_status,
validation_success: rm.validation_success,
};
// Process to JSON first, then convert to BSON
let json_finding = self.process_match_to_json(&single_origin_rm, args)?;
if let Ok(bson_doc) = json_to_bson_document(&json_finding) {
bson_findings.push(bson_doc);
}
}
} else {
// Process normally for deduped matches or matches with only one origin
let json_finding = self.process_match_to_json(&rm, args)?;
if let Ok(bson_doc) = json_to_bson_document(&json_finding) {
bson_findings.push(bson_doc);
}
}
}
// Write each BSON document
for doc in bson_findings {
let records = self.build_finding_records(args)?;
for record in records {
let doc = bson::to_document(&record)?;
doc.to_writer(&mut writer)?;
}
Ok(())
}
// pub fn bson_format<W: std::io::Write>(
// &self,
// mut writer: W,
// args: &cli::commands::scan::ScanArgs,
// ) -> Result<()> {
// let findings = self.gather_json_findings(args)?;
// // Print the full JSON for each finding
// for finding in &findings {
// println!("Full JSON:\n{}", serde_json::to_string_pretty(finding)?);
// }
// let bson_findings: Vec<Document> = findings
// .into_iter()
// .filter_map(|finding| json_to_bson_document(&finding).ok())
// .collect();
// for doc in bson_findings {
// doc.to_writer(&mut writer)?;
// }
// Ok(())
// }
}
fn json_to_bson_document(json: &Value) -> Result<Document> {
match bson::to_bson(json)? {
bson::Bson::Document(doc) => Ok(doc),
_ => Err(anyhow::anyhow!("Failed to convert JSON to BSON document")),
}
}

View file

@ -1,415 +1,61 @@
use http::StatusCode;
use serde_json::json;
use super::*;
use crate::bstring_escape::Escaped;
use serde_json::Value;
impl DetailsReporter {
/// Collapses matches that share a `finding_fingerprint` into a single entry.
///
/// The first match seen for a fingerprint is kept; origins of later matches with the same
/// fingerprint are appended to its origin set (skipping origins it already contains).
/// The returned vector preserves the first-seen order of fingerprints, so report output is
/// stable from run to run.
///
/// When `no_dedup` is `true` the input is returned unchanged.
pub fn deduplicate_matches(
    &self,
    matches: Vec<ReportMatch>,
    no_dedup: bool,
) -> Vec<ReportMatch> {
    if no_dedup {
        return matches;
    }
    use std::collections::HashMap;

    // fingerprint -> index into `unique`; the Vec (not the map) defines output order.
    let mut index_by_fp: HashMap<u64, usize> = HashMap::with_capacity(matches.len());
    let mut unique: Vec<ReportMatch> = Vec::with_capacity(matches.len());

    for rm in matches {
        let fp = rm.m.finding_fingerprint;
        let Some(&idx) = index_by_fp.get(&fp) else {
            index_by_fp.insert(fp, unique.len());
            unique.push(rm);
            continue;
        };

        // Merge origin sets: keep the existing origins in order, append unseen ones.
        let existing = &mut unique[idx];
        let mut merged: Vec<_> = existing.origin.iter().cloned().collect();
        let original_len = merged.len();
        for o in rm.origin.iter() {
            if !merged.iter().any(|e| e == o) {
                merged.push(o.clone());
            }
        }

        // Rebuild the set once, and only if something new was actually added.
        if merged.len() > original_len {
            let mut all = merged.into_iter();
            if let Some(first) = all.next() {
                existing.origin = OriginSet::new(first, all.collect());
            }
        }
    }
    unique
}
/// Builds the JSON representation of every filtered finding.
///
/// Matches are deduplicated by fingerprint unless `args.no_dedup` is set, then each match is
/// rendered with `process_match_to_json`, the same per-match renderer `json_format` and
/// `jsonl_format` use, so the finding schema is defined in exactly one place.
///
/// # Errors
/// Propagates failures from `get_filtered_matches` and `process_match_to_json`.
pub fn gather_json_findings(
    &self,
    args: &cli::commands::scan::ScanArgs,
) -> Result<Vec<serde_json::Value>> {
    // `deduplicate_matches` hands the input back untouched when `no_dedup` is true, so no
    // extra guard is needed here.
    let matches = self.deduplicate_matches(self.get_filtered_matches()?, args.no_dedup);
    matches.iter().map(|rm| self.process_match_to_json(rm, args)).collect()
}
pub fn json_format<W: std::io::Write>(
&self,
mut writer: W,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
let mut findings = Vec::new();
// Get filtered matches
let mut matches = self.get_filtered_matches()?;
// Apply deduplication only if requested
if !args.no_dedup {
matches = self.deduplicate_matches(matches, args.no_dedup);
}
// For each match, handle it based on the no_dedup flag
for rm in matches {
if args.no_dedup && rm.origin.len() > 1 {
// For no_dedup and multiple origins, create separate findings for each origin
for origin in rm.origin.iter() {
// Create a single-origin version of this match
let single_origin_rm = ReportMatch {
origin: OriginSet::new(origin.clone(), Vec::new()),
blob_metadata: rm.blob_metadata.clone(),
m: rm.m.clone(),
comment: rm.comment.clone(),
visible: rm.visible,
match_confidence: rm.match_confidence,
validation_response_body: rm.validation_response_body.clone(),
validation_response_status: rm.validation_response_status,
validation_success: rm.validation_success,
};
// Process this single-origin match into a JSON finding
let json_finding = self.process_match_to_json(&single_origin_rm, args)?;
findings.push(json_finding);
}
} else {
// Process normally for deduped matches or matches with only one origin
let json_finding = self.process_match_to_json(&rm, args)?;
findings.push(json_finding);
}
}
// Write the JSON output
if !findings.is_empty() {
serde_json::to_writer_pretty(&mut writer, &findings)?;
let records = self.build_finding_records(args)?;
if !records.is_empty() {
serde_json::to_writer_pretty(&mut writer, &records)?;
writeln!(writer)?;
}
Ok(())
}
// Add a helper method to convert a ReportMatch to a JSON finding
/// Renders one [`ReportMatch`] as a JSON finding object.
///
/// The result has the shape
/// `{ "id": <rule id>, "matches": [ { "rule": {..}, "finding": {..} } ] }`.
///
/// * `rm` - the match to render.
/// * `args` - scan arguments, forwarded to `jira_issue_url` when resolving a display path.
///
/// The validation response body is limited to 512 characters; `"..."` is appended only when
/// characters were actually dropped.
pub fn process_match_to_json(
    &self,
    rm: &ReportMatch,
    args: &cli::commands::scan::ScanArgs,
) -> Result<serde_json::Value> {
    // Maximum number of characters of the validation response kept in the report.
    const MAX_RESPONSE_LENGTH: usize = 512;

    let source_span = &rm.m.location.source_span;

    // Prefer capture group 1; fall back to group 0, then to an empty byte slice.
    let captures = &rm.m.groups.captures;
    let snippet = Escaped(
        captures
            .get(1)
            .or_else(|| captures.get(0))
            .map(|capture| capture.value.as_bytes())
            .unwrap_or_default(),
    )
    .to_string();

    let validation_status = if rm.validation_success {
        "Active Credential"
    } else if rm.validation_response_status == StatusCode::CONTINUE.as_u16() {
        "Not Attempted"
    } else {
        "Inactive Credential"
    };

    // Truncate by characters and decide on the ellipsis by characters too. Comparing the
    // byte length against the character limit would add "..." to multi-byte bodies that
    // were not truncated at all.
    let mut body_chars = rm.validation_response_body.chars();
    let mut response_body: String = body_chars.by_ref().take(MAX_RESPONSE_LENGTH).collect();
    if body_chars.next().is_some() {
        response_body.push_str("...");
    }

    // First GitRepo origin for which `extract_git_metadata` yields a value, else JSON null.
    let git_metadata_val = rm
        .origin
        .iter()
        .find_map(|origin| match origin {
            Origin::GitRepo(e) => self.extract_git_metadata(e, source_span),
            _ => None,
        })
        .unwrap_or(serde_json::Value::Null);

    // Display path from the first File/Extended origin that provides one. File paths are
    // tried against the Jira, Slack, S3 and Docker mappers (in that order) before falling
    // back to the raw filesystem path.
    let file_path = rm
        .origin
        .iter()
        .find_map(|origin| match origin {
            Origin::File(e) => Some(
                self.jira_issue_url(&e.path, args)
                    .or_else(|| self.slack_message_url(&e.path))
                    .or_else(|| self.s3_display_path(&e.path))
                    .or_else(|| self.docker_display_path(&e.path))
                    .unwrap_or_else(|| e.path.display().to_string()),
            ),
            Origin::Extended(e) => e.path().map(|p| p.display().to_string()),
            _ => None,
        })
        .unwrap_or_default();

    let match_json = json!({
        "rule": {
            "name": rm.m.rule_name,
            "id": rm.m.rule_text_id,
        },
        "finding": {
            "snippet": snippet,
            "fingerprint": rm.m.finding_fingerprint.to_string(),
            "confidence": rm.match_confidence.to_string(),
            "entropy": format!("{:.2}", rm.m.calculated_entropy),
            "validation": {
                "status": validation_status,
                "response": response_body,
            },
            "language": rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string()),
            "line": source_span.start.line,
            "column_start": source_span.start.column,
            "column_end": source_span.end.column,
            "path": file_path,
            "git_metadata": git_metadata_val
        }
    });

    Ok(json!({
        "id": rm.m.rule_text_id,
        "matches": [ match_json ]
    }))
}
// // Modified JSON format to pass args to gather_json_findings
// pub fn json_format<W: std::io::Write>(
// &self,
// mut writer: W,
// args: &cli::commands::scan::ScanArgs,
// ) -> Result<()> {
// let findings = self.gather_json_findings(args)?;
// if !findings.is_empty() {
// serde_json::to_writer_pretty(&mut writer, &findings)?;
// writeln!(writer)?;
// }
// Ok(())
// }
pub fn jsonl_format<W: std::io::Write>(
&self,
mut writer: W,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
// Get filtered matches
let mut matches = self.get_filtered_matches()?;
// Apply deduplication only if requested
if !args.no_dedup {
matches = self.deduplicate_matches(matches, args.no_dedup);
}
// For each match, handle it based on the no_dedup flag
for rm in matches {
if args.no_dedup && rm.origin.len() > 1 {
// For no_dedup and multiple origins, create separate findings for each origin
for origin in rm.origin.iter() {
// Create a single-origin version of this match
let single_origin_rm = ReportMatch {
origin: OriginSet::new(origin.clone(), Vec::new()),
blob_metadata: rm.blob_metadata.clone(),
m: rm.m.clone(),
comment: rm.comment.clone(),
visible: rm.visible,
match_confidence: rm.match_confidence,
validation_response_body: rm.validation_response_body.clone(),
validation_response_status: rm.validation_response_status,
validation_success: rm.validation_success,
};
// Process this single-origin match into a JSON finding and write it
let json_finding = self.process_match_to_json(&single_origin_rm, args)?;
serde_json::to_writer(&mut writer, &json_finding)?;
writeln!(writer)?;
}
} else {
// Process normally for deduped matches or matches with only one origin
let json_finding = self.process_match_to_json(&rm, args)?;
serde_json::to_writer(&mut writer, &json_finding)?;
writeln!(writer)?;
}
let records = self.build_finding_records(args)?;
for record in records {
serde_json::to_writer(&mut writer, &record)?;
writeln!(writer)?;
}
Ok(())
}
// // Modified JSONL format to pass args to gather_json_findings
// pub fn jsonl_format<W: std::io::Write>(
// &self,
// mut writer: W,
// args: &cli::commands::scan::ScanArgs,
// ) -> Result<()> {
// let findings = self.gather_json_findings(args)?;
// for finding in findings {
// serde_json::to_writer(&mut writer, &finding)?;
// writeln!(writer)?;
// }
// Ok(())
// }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::cli::commands::github::GitCloneMode;
use crate::cli::commands::github::GitHistoryMode;
use crate::cli::commands::rules::RuleSpecifierArgs;
use crate::matcher::{SerializableCapture, SerializableCaptures};
use crate::util::intern;
use crate::{
blob::BlobId,
cli::commands::github::GitHubRepoType,
cli::commands::inputs::ContentFilteringArgs,
cli::commands::inputs::InputSpecifierArgs,
cli::commands::output::{OutputArgs, ReportOutputFormat},
cli::commands::scan::ConfidenceLevel,
findings_store::FindingsStore,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::Match,
origin::Origin,
reporter::styles::Styles,
};
use std::{
io::Cursor,
path::PathBuf,
sync::{Arc, Mutex},
};
use anyhow::Result;
use serde_json::Value;
use url::Url;
use super::*;
use crate::{
blob::BlobId,
cli::commands::{
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::OutputArgs,
rules::RuleSpecifierArgs,
scan::ConfidenceLevel,
},
findings_store::FindingsStore,
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{Match, SerializableCapture, SerializableCaptures},
origin::{Origin, OriginSet},
reporter::{ReportMatch, Styles},
rules::rule::Confidence,
util::intern,
};
fn create_default_args() -> cli::commands::scan::ScanArgs {
use crate::cli::commands::gitlab::GitLabRepoType; // bring enum into scope
@ -480,7 +126,6 @@ mod tests {
}
}
// Helper function to create a mock Match
fn create_mock_match(
rule_name: &str,
rule_text_id: &str,
@ -508,7 +153,7 @@ mod tests {
finding_fingerprint: 0123,
rule_finding_fingerprint: intern(rule_finding_fingerprint),
rule_text_id: intern(rule_text_id),
rule_name: intern(rule_name), //.to_string(),
rule_name: intern(rule_name),
rule_confidence: Confidence::Medium,
validation_response_body: "validation response".to_string(),
validation_response_status: 200,
@ -518,10 +163,8 @@ mod tests {
}
}
// Helper function to create a mock DetailsReporter
fn setup_mock_reporter(matches: Vec<ReportMatch>) -> DetailsReporter {
let mut datastore = FindingsStore::new(PathBuf::from("/tmp"));
// Create mock origin and blob metadata for the first test match
if !matches.is_empty() {
let blob_metadata = BlobMetadata {
id: BlobId::new(b"mock_blob"),
@ -531,16 +174,14 @@ mod tests {
language: Some("Rust".to_string()),
};
let dedup = true;
// Add matches to datastore
for m in matches.clone() {
datastore.record(
vec![(
Arc::new(OriginSet::new(
// OriginSet -- Arc<…>
Origin::from_file(PathBuf::from("/mock/path/file.rs")),
vec![],
)),
Arc::new(blob_metadata.clone()), // BlobMetadata -- Arc<…>
Arc::new(blob_metadata.clone()),
m.m.clone(),
)],
dedup,
@ -553,9 +194,9 @@ mod tests {
only_valid: false,
}
}
#[test]
fn test_json_format() -> Result<()> {
// Create a mock match with successful validation
let mock_match =
create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
let matches = vec![ReportMatch {
@ -577,72 +218,17 @@ mod tests {
}];
let reporter = setup_mock_reporter(matches);
let mut output = Cursor::new(Vec::new());
// Call the json_format method
reporter.json_format(&mut output, &create_default_args())?;
// Parse and validate JSON output
let json_output: Vec<Value> = serde_json::from_slice(&output.into_inner())?;
let json_output: Vec<serde_json::Value> = serde_json::from_slice(&output.into_inner())?;
assert!(!json_output.is_empty(), "JSON output should not be empty");
let first_finding = &json_output[0];
assert!(first_finding.get("id").is_some(), "Finding should have an 'id'");
assert!(first_finding.get("matches").is_some(), "Finding should have 'matches'");
// Validate the structure of the first match
let matches = first_finding.get("matches").unwrap().as_array().unwrap();
let first_match = &matches[0];
assert_eq!(first_match.get("rule").unwrap().get("name").unwrap(), "MockRule");
assert_eq!(first_match.get("finding").unwrap().get("language").unwrap(), "Rust");
let first = &json_output[0];
assert_eq!(first["rule"]["name"], "MockRule");
assert_eq!(first["finding"]["language"], "Rust");
Ok(())
}
// #[test]
// fn test_jsonl_format() -> Result<()> {
// // Create a mock match with successful validation
// let mock_match =
// create_mock_match("MockRule", "mock_rule_1", "mock_finding_fingerprint", true);
// let matches = vec![ReportMatch {
// origin: OriginSet::new(
// Origin::from_file(PathBuf::from("/mock/path/file.rs")),
// vec![],
// ),
// blob_metadata: BlobMetadata {
// id: BlobId::new(b"mock_blob"),
// num_bytes: 1024,
// mime_essence: Some("text/plain".to_string()),
// charset: Some("UTF-8".to_string()),
// language: Some("Rust".to_string()),
// },
// m: mock_match,
// comment: None,
// match_confidence: Confidence::Medium,
// visible: true,
// validation_response_body: "validation response".to_string(),
// validation_response_status: 200,
// validation_success: true,
// }];
// let reporter = setup_mock_reporter(matches);
// let mut output = Cursor::new(Vec::new());
// // Call the jsonl_format method
// reporter.jsonl_format(&mut output, &create_default_args())?;
// // Split output into lines and validate
// let jsonl_output = String::from_utf8(output.into_inner())?;
// let lines: Vec<&str> = jsonl_output.lines().collect();
// assert!(!lines.is_empty(), "JSONL output should not be empty");
// for line in &lines {
// let json_value: serde_json::Value = serde_json::from_str(line)?;
// assert!(
// json_value.get("rule_name").is_some(),
// "Each line should have a 'rule_name'"
// );
// assert!(
// json_value.get("matches").is_some(),
// "Each line should have 'matches'"
// );
// }
// Ok(())
// }
#[test]
fn test_validation_status_in_json() -> Result<()> {
// Test validation status in JSON output
let test_cases = vec![(true, "Active Credential"), (false, "Inactive Credential")];
for (validation_success, expected_status) in test_cases {
let mock_match = create_mock_match(
@ -673,23 +259,11 @@ mod tests {
}];
let reporter = setup_mock_reporter(matches);
let mut output = Cursor::new(Vec::new());
// Call the json_format method
reporter.json_format(&mut output, &create_default_args())?;
// Parse and validate JSON output
let json_output: Vec<Value> = serde_json::from_slice(&output.into_inner())?;
let json_output: Vec<serde_json::Value> = serde_json::from_slice(&output.into_inner())?;
assert!(!json_output.is_empty(), "JSON output should not be empty");
let first_finding = &json_output[0];
let matches = first_finding.get("matches").unwrap().as_array().unwrap();
let first_match = &matches[0];
let validation_status = first_match
.get("finding")
.unwrap()
.get("validation")
.unwrap()
.get("status")
.unwrap()
.as_str()
.unwrap();
let first = &json_output[0];
let validation_status = first["finding"]["validation"]["status"].as_str().unwrap();
assert_eq!(validation_status, expected_status);
}
Ok(())

View file

@ -1,413 +1,126 @@
use std::fmt::{Display, Formatter, Result as FmtResult};
use http::StatusCode;
use indenter::indented;
use super::*;
use crate::{
bstring_escape::Escaped,
origin::{get_repo_url, GitRepoOrigin},
};
impl DetailsReporter {
// Modified pretty format to use deduplicate_matches helper
pub fn pretty_format<W: std::io::Write>(
&self,
mut writer: W,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
let mut matches = self.get_filtered_matches()?;
let num_findings = matches.len();
if !args.no_dedup {
matches = self.deduplicate_matches(matches, args.no_dedup);
}
for (index, rm) in matches.into_iter().enumerate() {
// When no_dedup is true, we'll handle each origin separately
if args.no_dedup && rm.origin.len() > 1 {
// For each origin, create a separate "finding"
for origin in rm.origin.iter() {
// Create a new ReportMatch with just this single origin
let single_origin_rm = ReportMatch {
origin: OriginSet::new(origin.clone(), Vec::new()),
blob_metadata: rm.blob_metadata.clone(),
m: rm.m.clone(),
comment: rm.comment.clone(),
visible: rm.visible,
match_confidence: rm.match_confidence,
validation_response_body: rm.validation_response_body.clone(),
validation_response_status: rm.validation_response_status,
validation_success: rm.validation_success,
};
self.write_finding(
&mut writer,
&single_origin_rm,
index + 1,
num_findings,
args,
)?;
}
} else {
// Normal processing for deduped matches or matches with only one origin
self.write_finding(&mut writer, &rm, index + 1, num_findings, args)?;
}
let records = self.build_finding_records(args)?;
let num_findings = records.len();
for (index, record) in records.iter().enumerate() {
self.write_finding_record(&mut writer, record, index + 1, num_findings)?;
}
Ok(())
}
fn write_finding<W: std::io::Write>(
fn write_finding_record<W: std::io::Write>(
&self,
writer: &mut W,
rm: &ReportMatch,
record: &FindingReporterRecord,
_finding_num: usize,
_num_findings: usize,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
let lock_icon = if rm.validation_success { "🔓 " } else { "" };
let is_active = record.finding.validation.status == "Active Credential";
let lock_icon = if is_active { "🔓 " } else { "" };
let formatted_heading = format!(
"{}{} => [{}]",
lock_icon,
rm.m.rule_name.to_uppercase(),
rm.m.rule_text_id.to_uppercase()
record.rule.name.to_uppercase(),
record.rule.id.to_uppercase()
);
if rm.validation_success {
if is_active {
writeln!(writer, "{}", self.style_finding_active_heading(formatted_heading))?;
} else {
writeln!(writer, "{}", self.style_finding_heading(formatted_heading))?;
}
writeln!(writer, "{}", PrettyFinding(self, rm, args))?;
writeln!(writer, "{}", PrettyFindingRecord(self, record))?;
writeln!(writer)?;
Ok(())
}
fn write_git_metadata(
fn write_git_metadata_value(
&self,
f: &mut Formatter<'_>,
e: &GitRepoOrigin,
_args: &cli::commands::scan::ScanArgs,
line_num: usize,
git: &serde_json::Value,
) -> FmtResult {
// Check if this is a remote git scan
// let mut is_remote_git_scan = !args.input_specifier_args.git_url.is_empty();
// let mut git_url_string = String::new();
let repo_url = get_repo_url(&e.repo_path)
.unwrap_or_else(|_| e.repo_path.to_string_lossy().to_string().into());
let mut git_url_string = repo_url.clone();
if git_url_string.ends_with(".git") {
git_url_string = git_url_string.strip_suffix(".git").unwrap().to_string().into();
let repo_url = git["repository_url"].as_str().unwrap_or("");
writeln!(f, " |Git Repo......: {}", self.style_metadata(repo_url))?;
if let Some(commit) = git.get("commit") {
if let Some(url) = commit.get("url").and_then(|v| v.as_str()) {
writeln!(f, " |__Commit......: {}", self.style_metadata(url))?;
}
if let Some(committer) = commit.get("committer") {
let name = committer.get("name").and_then(|v| v.as_str()).unwrap_or("");
let email = committer.get("email").and_then(|v| v.as_str()).unwrap_or("");
writeln!(indented(f).with_str(" |__"), "Committer...: {} <{}>", name, email)?;
}
if let Some(date) = commit.get("date").and_then(|v| v.as_str()) {
writeln!(indented(f).with_str(" |__"), "Date........: {}", date)?;
}
}
writeln!(f, " |Git Repo......: {}", self.style_metadata(&git_url_string),)?;
if let Some(cs) = &e.first_commit {
let cmd = &cs.commit_metadata;
let atime =
cmd.committer_timestamp.format(gix::date::time::format::SHORT.clone()).to_string();
let commit_id = &cmd.commit_id;
let commit_url = format!("{}/commit/{}", &git_url_string, commit_id);
// Write Commit Information
writeln!(f, " |__Commit......: {}", self.style_metadata(&commit_url))?;
writeln!(
indented(f).with_str(" |__"),
"Committer...: {} <{}>",
cmd.committer_name,
cmd.committer_email
)?;
writeln!(indented(f).with_str(" |__"), "Date........: {}", atime)?;
// writeln!(indented(f).with_str(" |__"), "Summary.....: {}", msg)?;
writeln!(indented(f).with_str(" |__"), "Path........: {}", cs.blob_path)?;
// Construct Git Command
let git_link =
format!("{}/blob/{}/{}#L{}", &git_url_string, commit_id, cs.blob_path, line_num);
let git_command =
format!("git -C {} show {}:{}", e.repo_path.display(), commit_id, cs.blob_path);
writeln!(
indented(f).with_str(" |__"),
"Git Link....: {}",
self.style_metadata(&git_link)
)?;
writeln!(
indented(f).with_str(" |__"),
"Git Command.: {}",
self.style_metadata(&git_command)
)?;
if let Some(file) = git.get("file") {
if let Some(path) = file.get("path").and_then(|v| v.as_str()) {
writeln!(indented(f).with_str(" |__"), "Path........: {}", path)?;
}
if let Some(url) = file.get("url").and_then(|v| v.as_str()) {
writeln!(
indented(f).with_str(" |__"),
"Git Link....: {}",
self.style_metadata(url)
)?;
}
if let Some(cmd) = file.get("git_command").and_then(|v| v.as_str()) {
writeln!(
indented(f).with_str(" |__"),
"Git Command.: {}",
self.style_metadata(cmd)
)?;
}
}
Ok(())
}
}
// pub struct PrettyFinding<'a>(&'a DetailsReporter, &'a Finding);
pub struct PrettyFinding<'a>(
&'a DetailsReporter,
&'a ReportMatch,
&'a cli::commands::scan::ScanArgs,
);
impl<'a> Display for PrettyFinding<'a> {
pub struct PrettyFindingRecord<'a>(&'a DetailsReporter, &'a FindingReporterRecord);
impl<'a> Display for PrettyFindingRecord<'a> {
fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
let PrettyFinding(reporter, rm, args) = self;
// Use Box<dyn Fn(&str) -> String> to store the closure
let style_fn: Box<dyn Fn(&str) -> String> = if rm.validation_success {
Box::new(|s: &str| reporter.style_active_creds(s).to_string()) // Convert StyledObject
// to String
let reporter = self.0;
let record = self.1;
let is_active = record.finding.validation.status == "Active Credential";
let style_fn: Box<dyn Fn(&str) -> String> = if is_active {
Box::new(|s| reporter.style_active_creds(s).to_string())
} else {
Box::new(|s: &str| reporter.style_match(s).to_string()) // Convert StyledObject to
// String
Box::new(|s| reporter.style_match(s).to_string())
};
let matching_finding =
rm.m.groups
.captures
.get(1)
.or_else(|| rm.m.groups.captures.get(0))
.map(|capture| capture.value.as_bytes())
.unwrap_or(&[]);
writeln!(f, " |Finding.......: {}", style_fn(&Escaped(matching_finding).to_string()))?;
writeln!(f, " |Fingerprint...: {}", rm.m.finding_fingerprint)?;
writeln!(f, " |Confidence....: {}", rm.match_confidence.to_string())?;
writeln!(f, " |Entropy.......: {:.2}", rm.m.calculated_entropy)?;
let validation_status = if rm.validation_response_status == StatusCode::CONTINUE.as_u16()
|| rm.validation_response_status == StatusCode::PRECONDITION_REQUIRED.as_u16()
{
"Not Attempted".to_string()
} else if rm.validation_success {
"Active Credential".to_string()
} else {
"Inactive Credential".to_string()
};
writeln!(
f,
" |Validation....: {}",
if rm.validation_success {
reporter.style_finding_active_heading(&validation_status).to_string()
// Convert StyledObject to String
} else {
(&validation_status).to_string()
}
)?;
const MAX_RESPONSE_LENGTH: usize = 512;
if rm.validation_response_status != StatusCode::CONTINUE.as_u16() {
let truncated_body: String =
rm.validation_response_body.chars().take(MAX_RESPONSE_LENGTH).collect();
let ellipsis =
if rm.validation_response_body.len() > MAX_RESPONSE_LENGTH { "..." } else { "" };
let finding = &record.finding;
writeln!(f, " |Finding.......: {}", style_fn(&finding.snippet))?;
writeln!(f, " |Fingerprint...: {}", finding.fingerprint)?;
writeln!(f, " |Confidence....: {}", finding.confidence)?;
writeln!(f, " |Entropy.......: {}", finding.entropy)?;
if is_active {
writeln!(
f,
" |__Response....: {}{}",
if rm.validation_success {
reporter.style_active_creds(&truncated_body).to_string() // Convert StyledObject
// to String
} else {
reporter.style_metadata(&truncated_body).to_string() // Convert StyledObject to
// String
},
ellipsis
" |Validation....: {}",
reporter.style_finding_active_heading(&finding.validation.status).to_string()
)?;
} else {
writeln!(f, " |Validation....: {}", finding.validation.status)?;
}
writeln!(
f,
" |Language......: {}",
rm.blob_metadata.language.clone().unwrap_or_else(|| "Unknown".to_string())
)?;
let source_span = &rm.m.location.source_span;
writeln!(f, " |Line Num......: {}", source_span.start.line)?;
//print all the other areas where this was seen
for p in rm.origin.iter() {
match p {
Origin::File(e) => {
let display_path = if let Some(url) = reporter.jira_issue_url(&e.path, args) {
url
} else if let Some(url) = reporter.slack_message_url(&e.path) {
url
} else if let Some(mapped) = reporter.s3_display_path(&e.path) {
mapped
} else if let Some(mapped) = reporter.docker_display_path(&e.path) {
mapped
} else {
e.path.display().to_string()
};
writeln!(
f,
" |Path..........: {}",
if rm.validation_success {
reporter.style_active_creds(&display_path).to_string()
} else {
display_path
}
)?;
}
Origin::Extended(e) => {
if let Some(p) = e.path() {
let display_path = p.display().to_string();
writeln!(
f,
" |Path..........: {}",
if rm.validation_success {
reporter.style_active_creds(&display_path).to_string()
} else {
display_path
}
)?;
}
}
Origin::GitRepo(e) => {
reporter.write_git_metadata(f, e, args, source_span.start.line)?;
}
}
if finding.validation.status != "Not Attempted" {
writeln!(f, " |__Response....: {}", style_fn(&finding.validation.response))?;
}
writeln!(f, " |Language......: {}", finding.language)?;
writeln!(f, " |Line Num......: {}", finding.line)?;
writeln!(f, " |Path..........: {}", style_fn(&finding.path))?;
if let Some(git) = &finding.git_metadata {
reporter.write_git_metadata_value(f, git)?;
}
Ok(())
}
}
// Smoke test: builds a match whose `calculated_entropy` is NaN, builds a reporter over a
// fresh findings store, and calls `pretty_format` with a fully spelled-out `ScanArgs`.
// The call's result is discarded and nothing is asserted.
//
// NOTE(review): the `ReportMatch` built below is bound to `_rm` and never recorded into
// `store`, so `pretty_format` presumably has no findings to render and the NaN path is
// likely not exercised — confirm whether `_rm` was meant to be inserted into the store.
// NOTE(review): the name says "panics" but there is no `#[should_panic]`; confirm intent.
#[test]
fn test_pretty_format_with_nan_entropy_panics() {
use std::{
io::Cursor,
sync::{Arc, Mutex},
};
use http::StatusCode;
use url::Url;
use crate::{
blob::BlobMetadata,
cli::commands::{
github::{GitCloneMode, GitHistoryMode, GitHubRepoType},
gitlab::GitLabRepoType,
inputs::{ContentFilteringArgs, InputSpecifierArgs},
output::{OutputArgs, ReportOutputFormat},
rules::RuleSpecifierArgs,
scan::{ConfidenceLevel, ScanArgs},
},
location::{Location, OffsetSpan, SourcePoint, SourceSpan},
matcher::{Match, SerializableCaptures},
origin::{Origin, OriginSet},
reporter::{DetailsReporter, Styles},
};
// Construct a fake match with NaN entropy
let m = Match {
rule_name: "dummy_rule".into(),
rule_text_id: "dummy.id".into(),
finding_fingerprint: 123456789,
rule_finding_fingerprint: "abc".into(),
location: Location {
offset_span: OffsetSpan { start: 0, end: 1 },
source_span: SourceSpan {
start: SourcePoint { line: 1, column: 0 },
end: SourcePoint { line: 1, column: 10 },
},
},
blob_id: crate::blob::BlobId::default(),
// No capture groups: any snippet lookup on this match has nothing to return.
groups: SerializableCaptures { captures: vec![] },
rule_confidence: crate::rules::rule::Confidence::Medium,
validation_success: true,
validation_response_status: StatusCode::OK.as_u16(),
validation_response_body: "OK".into(),
calculated_entropy: f32::NAN, // Here's the trigger
visible: true,
};
// Wrap the match in a ReportMatch with a single file origin. The leading underscore
// silences the unused-variable warning: this value is not referenced again below.
let _rm = crate::reporter::ReportMatch {
origin: OriginSet::new(Origin::from_file("dummy.txt".into()), vec![]),
blob_metadata: BlobMetadata {
id: m.blob_id,
num_bytes: 1,
mime_essence: None,
charset: None,
language: Some("Rust".into()),
},
m,
comment: None,
visible: true,
match_confidence: crate::rules::rule::Confidence::Medium,
validation_response_body: "OK".into(),
validation_response_status: StatusCode::OK.as_u16(),
validation_success: true,
};
// Fresh findings store rooted at "."; this test records nothing into it.
let store = Arc::new(Mutex::new(crate::findings_store::FindingsStore::new(".".into())));
// Reporter over that store; `Styles::new(false)` and `only_valid: false` are passed as-is.
let reporter =
DetailsReporter { datastore: store, styles: Styles::new(false), only_valid: false };
// In-memory sink for the formatted output.
let mut buf = Cursor::new(Vec::new());
// ScanArgs is built field by field (no `..Default::default()` is used here).
let args = ScanArgs {
// core execution / performance
num_jobs: 1,
no_dedup: false,
// rule selection
rules: RuleSpecifierArgs {
rules_path: Vec::new(),
rule: vec!["all".into()],
load_builtins: true,
},
// input discovery
input_specifier_args: InputSpecifierArgs {
path_inputs: Vec::new(),
git_url: Vec::new(),
github_user: Vec::new(),
github_organization: Vec::new(),
all_github_organizations: false,
github_api_url: url::Url::parse("https://api.github.com/").unwrap(),
github_repo_type: GitHubRepoType::Source,
// new GitLab defaults
gitlab_user: Vec::new(),
gitlab_group: Vec::new(),
all_gitlab_groups: false,
gitlab_api_url: Url::parse("https://gitlab.com/").unwrap(),
gitlab_repo_type: GitLabRepoType::Owner,
// Jira options
jira_url: None,
jql: None,
max_results: 100,
// Slack options
slack_query: None,
slack_api_url: Url::parse("https://slack.com/api/").unwrap(),
// s3
s3_bucket: None,
s3_prefix: None,
role_arn: None,
aws_local_profile: None,
// Docker image scanning
docker_image: Vec::new(),
// git clone / history options
git_clone: GitCloneMode::Bare,
git_history: GitHistoryMode::Full,
scan_nested_repos: true,
commit_metadata: true,
},
// content filtering
content_filtering_args: ContentFilteringArgs {
max_file_size_mb: 25.0,
no_extract_archives: false,
extraction_depth: 2,
exclude: Vec::new(), // Exclude patterns
no_binary: true,
},
// scanning behaviour
confidence: ConfidenceLevel::Medium,
no_validate: false,
rule_stats: false,
only_valid: false,
min_entropy: None,
redact: false,
git_repo_timeout: 1800, // 30 minutes
// output
output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty },
// display
snippet_length: 256,
baseline_file: None,
manage_baseline: false,
};
// This will panic if the entropy isn't checked for NaN
// The Result is deliberately ignored: the test only fails if this call panics.
let _result = reporter.pretty_format(&mut buf, &args);
// assert!(result.is_err() || result.is_ok(), "Should not crash"); // remove this line if panic
// is expected pre-fix
}

View file

@ -1,274 +1,56 @@
use std::collections::HashMap;
use std::collections::{BTreeMap, HashSet};
use rayon::prelude::*;
use serde_sarif::sarif;
use super::*;
use crate::{bstring_escape::Escaped, defaults::get_builtin_rules, origin::get_repo_url};
#[derive(Hash, Eq, PartialEq)]
struct LocationKey {
file_path: String,
line: usize,
column_start: usize,
column_end: usize,
text: String,
}
use crate::defaults::get_builtin_rules;
impl DetailsReporter {
fn make_sarif_result(
&self,
finding: &Finding,
no_dedup: bool,
args: &cli::commands::scan::ScanArgs,
) -> Result<sarif::Result> {
// Deduplicate exactly as in the JSON reporter
// let matches = self.deduplicate_matches(finding.matches.clone(), no_dedup);
// Deduplicate exactly as in the JSON reporter - but only if no_dedup is false
let matches = if no_dedup {
finding.matches.clone()
} else {
self.deduplicate_matches(finding.matches.clone(), no_dedup)
};
fn record_to_sarif_result(&self, record: &FindingReporterRecord) -> Result<sarif::Result> {
let finding = &record.finding;
let artifact_location =
sarif::ArtifactLocationBuilder::default().uri(finding.path.clone()).build()?;
let region = sarif::RegionBuilder::default()
.start_line(finding.line as i64)
.start_column(finding.column_start as i64)
.end_line(finding.line as i64)
.end_column(finding.column_end as i64)
.snippet(
sarif::ArtifactContentBuilder::default().text(finding.snippet.clone()).build()?,
)
.build()?;
let metadata = &finding.metadata;
let mut location_map: HashMap<LocationKey, Vec<(&OriginSet, &Match)>> = HashMap::new();
for rm in &matches {
let source_span = &rm.m.location.source_span;
let snippet =
rm.m.groups
.captures
.get(1)
.or_else(|| rm.m.groups.captures.get(0))
.map(|capture| capture.value.as_bytes())
.unwrap_or(&[]);
let key = LocationKey {
file_path: rm
.origin
.first()
.blob_path()
.map(|p| p.to_string_lossy().into_owned())
.unwrap_or_default(),
line: source_span.start.line,
column_start: source_span.start.column,
column_end: source_span.end.column,
text: Escaped(snippet).to_string(),
};
location_map.entry(key).or_default().push((&rm.origin, &rm.m));
let mut props = BTreeMap::new();
props.insert("validation_status".to_string(), serde_json::json!(finding.validation.status));
props.insert("entropy".to_string(), serde_json::json!(finding.entropy));
if let Some(git) = &finding.git_metadata {
props.insert("git_metadata".to_string(), git.clone());
}
let properties =
sarif::PropertyBagBuilder::default().additional_properties(props).build()?;
let mut fpu64: u64 = 0;
let location = sarif::LocationBuilder::default()
.physical_location(
sarif::PhysicalLocationBuilder::default()
.artifact_location(artifact_location)
.region(region)
.build()?,
)
.properties(properties)
.build()?;
let locations: Vec<sarif::Location> = location_map
.into_iter()
.filter_map(|(key, matches)| {
let (prov, m) = matches[0];
let source_span = &m.location.source_span;
let mut artifact_locations = Vec::new();
let mut git_metadata_list = Vec::new();
fpu64 = m.finding_fingerprint;
for p in prov.iter() {
match p {
Origin::File(e) => {
let uri = if let Some(url) = self.jira_issue_url(&e.path, args) {
url
} else if let Some(url) = self.slack_message_url(&e.path) {
url
} else if let Some(mapped) = self.s3_display_path(&e.path) {
mapped
} else {
e.path.display().to_string()
};
artifact_locations.push(
sarif::ArtifactLocationBuilder::default().uri(uri).build().ok()?,
);
}
Origin::Extended(e) => {
if let Some(p) = e.path() {
artifact_locations.push(
sarif::ArtifactLocationBuilder::default()
.uri(p.display().to_string())
.build()
.ok()?,
);
}
}
Origin::GitRepo(e) => {
// Extract and store Git metadata
if let Some(git_metadata) = self.extract_git_metadata(e, source_span) {
git_metadata_list.push(git_metadata);
}
// Build Git artifact location
if let Some(cs) = &e.first_commit {
let repo_url = get_repo_url(&e.repo_path)
.unwrap_or_else(|_| {
e.repo_path.to_string_lossy().to_string().into()
})
.trim_end_matches(".git")
.to_string();
let git_url = format!(
"{}/blob/{}/{}#L{}",
repo_url,
cs.commit_metadata.commit_id,
cs.blob_path,
source_span.start.line
);
artifact_locations.push(
sarif::ArtifactLocationBuilder::default()
.uri(git_url)
.build()
.ok()?,
);
}
}
}
}
if artifact_locations.is_empty() {
return None;
}
let region = sarif::RegionBuilder::default()
.start_line(key.line as i64)
.start_column(key.column_start as i64)
.end_line(key.line as i64)
.end_column(key.column_end as i64)
.snippet(sarif::ArtifactContentBuilder::default().text(key.text).build().ok()?)
.build()
.ok()?;
let logical_location = sarif::LogicalLocationBuilder::default()
.kind("blob")
.name(m.finding_fingerprint.to_string())
.build()
.ok()?;
let validation_status =
if m.validation_response_status == StatusCode::CONTINUE.as_u16() {
"Not Attempted"
} else if m.validation_success {
"Active Credential"
} else {
"Inactive Credential"
};
// Build combined properties including Git metadata and fingerprint
let mut props = std::collections::BTreeMap::new();
props.insert("validation_status".to_string(), serde_json::json!(validation_status));
props.insert(
"entropy".to_string(),
serde_json::json!(format!("{:.2}", m.calculated_entropy)),
);
// Add the fingerprint property from the match
props.insert("fingerprint".to_string(), serde_json::json!(m.finding_fingerprint));
if !git_metadata_list.is_empty() {
props.insert("git_metadata".to_string(), serde_json::json!(git_metadata_list));
}
let properties = sarif::PropertyBagBuilder::default()
.additional_properties(props)
.build()
.ok()?;
// Create locations for each artifact location
let locations = artifact_locations
.into_iter()
.map(|artifact_location| {
sarif::LocationBuilder::default()
.physical_location(
sarif::PhysicalLocationBuilder::default()
.artifact_location(artifact_location)
.region(region.clone())
.build()
.ok()?,
)
.logical_locations(vec![logical_location.clone()])
.properties(properties.clone())
.build()
.ok()
})
.collect::<Option<Vec<_>>>()?;
Some(locations)
})
.flatten()
.collect();
// let message = sarif::MessageBuilder::default()
// .text(format!(
// "Rule {} found {} unique {}.\nFirst blob id matched: {}",
// metadata.rule_name,
// locations.len(),
// if locations.len() == 1 { "match" } else { "matches" },
// first_match_blob_id
// ))
// .build()?;
// Create detailed message from first location's information
let detailed_msg = if let Some(first_match) = matches.first() {
let mut msg = format!(
"Rule {} found {} unique {}.\n",
metadata.rule_name,
locations.len(),
if locations.len() == 1 { "match" } else { "matches" }
);
// Add file or Git information based on origin
// Get first origin of first match - we know this exists
let p = first_match.origin.first();
match p {
Origin::File(e) => {
let uri = if let Some(url) = self.jira_issue_url(&e.path, args) {
url
} else if let Some(url) = self.slack_message_url(&e.path) {
url
} else if let Some(mapped) = self.s3_display_path(&e.path) {
mapped
} else {
e.path.display().to_string()
};
msg.push_str(&format!("Location: {}\n", uri));
}
Origin::Extended(e) => {
if let Some(p) = e.path() {
msg.push_str(&format!("Location: {}\n", p.display()));
}
}
Origin::GitRepo(e) => {
if let Some(cs) = &e.first_commit {
let repo_url = get_repo_url(&e.repo_path)
.unwrap_or_else(|_| e.repo_path.to_string_lossy().to_string().into())
.trim_end_matches(".git")
.to_string();
// Add commit and author information
let cmd = &cs.commit_metadata;
msg.push_str(&format!("Repository: {}\n", repo_url));
msg.push_str(&format!("Commit: {}\n", cmd.commit_id));
msg.push_str(&format!(
"Committer: {} <{}>\n",
String::from_utf8_lossy(&cmd.committer_name),
String::from_utf8_lossy(&cmd.committer_email)
));
msg.push_str(&format!("File: {}", cs.blob_path));
}
}
}
msg
} else {
format!("Rule {} found {} unique matches.", metadata.rule_name, locations.len(),)
};
let message = sarif::MessageBuilder::default().text(detailed_msg).build()?;
let fingerprint_name = "fingerprint".to_string();
let fingerprint = fpu64.to_string();
let message = sarif::MessageBuilder::default()
.text(format!("Rule {} matched {}", record.rule.name, finding.path))
.build()?;
let result = sarif::ResultBuilder::default()
.rule_id(&metadata.rule_name)
.rule_id(&record.rule.name)
.message(message)
.kind(sarif::ResultKind::Review.to_string())
.locations(locations)
.locations(vec![location])
.level(sarif::ResultLevel::Warning.to_string())
.partial_fingerprints([(fingerprint_name, fingerprint)])
.partial_fingerprints([("fingerprint".to_string(), finding.fingerprint.clone())])
.build()?;
Ok(result)
}
@ -276,54 +58,11 @@ impl DetailsReporter {
pub fn sarif_format<W: std::io::Write>(
&self,
mut writer: W,
no_dedup: bool,
_no_dedup: bool,
args: &cli::commands::scan::ScanArgs,
) -> Result<()> {
// Gather findings first
let mut findings = self.gather_findings()?;
// If no_dedup is true, expand findings with multiple origins into separate findings
if no_dedup {
let mut expanded_findings = Vec::new();
for finding in findings {
// Check matches with multiple origins
let matches_with_multiple_origins: Vec<_> =
finding.matches.iter().filter(|rm| rm.origin.len() > 1).collect();
if !matches_with_multiple_origins.is_empty() {
// For each match with multiple origins, create separate findings
for rm in matches_with_multiple_origins {
for origin in rm.origin.iter() {
// Create a single-origin match
let single_origin_rm = ReportMatch {
origin: OriginSet::new(origin.clone(), Vec::new()),
blob_metadata: rm.blob_metadata.clone(),
m: rm.m.clone(),
comment: rm.comment.clone(),
visible: rm.visible,
match_confidence: rm.match_confidence,
validation_response_body: rm.validation_response_body.clone(),
validation_response_status: rm.validation_response_status,
validation_success: rm.validation_success,
};
// Create a new finding with just this single-origin match
let new_finding =
Finding::new(finding.metadata.clone(), vec![single_origin_rm]);
expanded_findings.push(new_finding);
}
}
} else {
// If the finding has no matches with multiple origins, keep it as is
expanded_findings.push(finding);
}
}
findings = expanded_findings;
}
// Filter only rules relevant to the findings
let finding_rule_ids: std::collections::HashSet<_> =
findings.iter().map(|f| f.metadata.rule_name.clone()).collect();
let records = self.build_finding_records(args)?;
let finding_rule_ids: HashSet<_> = records.iter().map(|r| r.rule.name.clone()).collect();
let rules: Vec<sarif::ReportingDescriptor> = get_builtin_rules(None)?
.iter_rules()
.par_bridge()
@ -366,10 +105,9 @@ impl DetailsReporter {
)
.build()?;
let sarif_results: Vec<sarif::Result> = findings
.par_iter()
.filter_map(|f| self.make_sarif_result(f, no_dedup, args).ok())
.collect();
let sarif_results: Vec<sarif::Result> =
records.iter().filter_map(|r| self.record_to_sarif_result(r).ok()).collect();
let run = sarif::RunBuilder::default().tool(tool).results(sarif_results).build()?;
let sarif = sarif::SarifBuilder::default()
.version(sarif::Version::V2_1_0.to_string())

View file

@ -2,12 +2,12 @@ use anyhow::{Context, Result};
use aws_config::{defaults, meta::region::RegionProviderChain, BehaviorVersion};
use aws_credential_types::Credentials;
use aws_sdk_s3::{
error::ProvideErrorMetadata, // for .code()
operation::list_objects_v2::ListObjectsV2Error, // modeled service error
Client,
operation::list_objects_v2::ListObjectsV2Error, // modeled service error
error::ProvideErrorMetadata, // for .code()
};
use aws_types::region::Region;
use reqwest; // HTTP client for HEAD fallback
use reqwest; // HTTP client for HEAD fallback
pub async fn visit_bucket_objects<F>(
bucket: &str,
@ -43,9 +43,7 @@ where
.configure(&config)
.build()
.await;
let conf = aws_sdk_s3::config::Builder::from(&config)
.credentials_provider(assume)
.build();
let conf = aws_sdk_s3::config::Builder::from(&config).credentials_provider(assume).build();
Client::from_conf(conf)
} else {
Client::new(&config)
@ -66,7 +64,7 @@ where
// On error, extract the modeled service error
Err(err) => {
let svc_err: ListObjectsV2Error = err.into_service_error(); // from SdkError
let svc_err: ListObjectsV2Error = err.into_service_error(); // from SdkError
// If the bucket must be addressed at another region...
if svc_err.code() == Some("PermanentRedirect") {

View file

@ -21,14 +21,16 @@ use crate::{
findings_store,
git_binary::{CloneMode, Git},
git_url::GitUrl,
github, gitlab, jira,
github, gitlab,
guesser::Guesser,
jira,
matcher::{Match, Matcher, MatcherStats},
origin::{Origin, OriginSet},
rules_database::RulesDatabase,
s3,
scanner::processing::BlobProcessor,
scanner_pool::ScannerPool,
slack, guesser::Guesser, PathBuf,
slack, PathBuf,
};
/// Tuple of an origin set, the metadata of a blob, and a list of matches, each match paired
/// with an optional `f64`.
// NOTE(review): the `Option<f64>` is presumably a per-match score — confirm against the producer.
pub type DatastoreMessage = (OriginSet, BlobMetadata, Vec<(Option<f64>, Match)>);
@ -291,7 +293,6 @@ pub async fn fetch_slack_messages(
Ok(vec![output_dir])
}
pub async fn fetch_s3_objects(
args: &scan::ScanArgs,
datastore: &Arc<Mutex<findings_store::FindingsStore>>,
@ -330,10 +331,12 @@ pub async fn fetch_s3_objects(
);
let blob = crate::blob::Blob::from_bytes(bytes);
if let Some((origin, blob_md, scored_matches)) = processor.run(origin, blob, args.no_dedup)? {
if let Some((origin, blob_md, scored_matches)) =
processor.run(origin, blob, args.no_dedup)?
{
// Wrap origin & metadata once:
let origin_arc = Arc::new(origin);
let blob_arc = Arc::new(blob_md);
let blob_arc = Arc::new(blob_md);
// Now build a batch of exactly one FindingsStoreMessage per Match
let mut batch = Vec::with_capacity(scored_matches.len());
@ -350,4 +353,4 @@ pub async fn fetch_s3_objects(
.await?;
Ok(())
}
}

View file

@ -74,7 +74,6 @@ pub async fn run_async_scan(
let slack_dirs = fetch_slack_messages(args, global_args, &datastore).await?;
input_roots.extend(slack_dirs);
// Save Docker images if specified
if !args.input_specifier_args.docker_image.is_empty() {
let clone_root = {
@ -129,7 +128,6 @@ pub async fn run_async_scan(
)?;
}
if !args.no_dedup {
// Final deduplication step before validation (or before reporting)
let reporter = crate::reporter::DetailsReporter {

View file

@ -4,25 +4,26 @@ use kingfisher::s3::visit_bucket_objects;
#[tokio::test]
async fn test_visit_public_bucket() -> Result<()> {
let mut objects = Vec::new();
visit_bucket_objects("awsglue-datasets", Some("examples/us-legislators/all/"), None, None, |key, data| {
objects.push((key, data));
Ok(())
})
visit_bucket_objects(
"awsglue-datasets",
Some("examples/us-legislators/all/"),
None,
None,
|key, data| {
objects.push((key, data));
Ok(())
},
)
.await?;
assert!(
objects.iter().any(|(k, _)| k.ends_with("events.json")),
"events.json object not found"
);
let creds = objects
.iter()
.find(|(k, _)| k.ends_with("events.json"))
.expect("events.json object");
let creds =
objects.iter().find(|(k, _)| k.ends_with("events.json")).expect("events.json object");
let body = std::str::from_utf8(&creds.1)?;
assert!(
body.contains("Q4450263"),
"expected events.json file"
);
assert!(body.contains("Q4450263"), "expected events.json file");
Ok(())
}
}