removed serde_utils and added Authress rule

This commit is contained in:
Mick Grove 2025-08-16 07:34:02 -07:00
commit 80058136af

View file

@ -1,712 +0,0 @@
use std::{
marker::PhantomData,
sync::{
atomic::{AtomicBool, Ordering},
Arc, Mutex,
},
time::{Duration, Instant as StdInstant, Instant},
};
use anyhow::{bail, Context, Result};
use base64::{engine::general_purpose::STANDARD, Engine};
use bstr::BString;
use gix::Repository as GixRepo;
use indicatif::{ProgressBar, ProgressStyle};
use rayon::{
iter::plumbing::Folder,
prelude::{ParallelIterator, *},
};
use serde::{Deserialize, Deserializer};
use tracing::{debug, error};
use crate::{
binary::is_binary,
blob::{Blob, BlobId, BlobIdMap},
cli::commands::{github::GitHistoryMode, scan},
decompress::{decompress_file_to_temp, CompressedContent},
findings_store,
matcher::{Matcher, MatcherStats},
open_git_repo,
origin::{Origin, OriginSet},
rule_profiling::ConcurrentRuleProfiler,
rules_database::RulesDatabase,
scanner::{
processing::BlobProcessor,
runner::{create_datastore_channel, spawn_datastore_writer_thread},
util::is_compressed_file,
},
scanner_pool::ScannerPool,
EnumeratorConfig, EnumeratorFileResult, FileResult, FilesystemEnumerator, FoundInput,
GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator, PathBuf,
};
/// A `Blob` with the `'static` lifetime, i.e. one that does not borrow from a shorter-lived
/// buffer. Used for the items stored in `FileResultIterKind`.
type OwnedBlob = Blob<'static>;
pub fn enumerate_filesystem_inputs(
args: &scan::ScanArgs,
datastore: Arc<Mutex<findings_store::FindingsStore>>,
input_roots: &[PathBuf],
progress_enabled: bool,
rules_db: &RulesDatabase,
enable_profiling: bool,
shared_profiler: Arc<ConcurrentRuleProfiler>,
matcher_stats: &Mutex<MatcherStats>,
) -> Result<()> {
let repo_scan_timeout = Duration::from_secs(args.git_repo_timeout);
let progress = if progress_enabled {
let style =
ProgressStyle::with_template("{spinner} {msg} {total_bytes} [{elapsed_precise}]")
.expect("progress bar style template should compile");
let pb = ProgressBar::new_spinner()
.with_style(style)
.with_message("Scanning files and git repository content...");
pb.enable_steady_tick(Duration::from_millis(500));
pb
} else {
ProgressBar::hidden()
};
let _input_enumerator = || -> Result<FilesystemEnumerator> {
let mut ie = FilesystemEnumerator::new(input_roots, &args)?;
ie.threads(args.num_jobs);
ie.max_filesize(args.content_filtering_args.max_file_size_bytes());
if args.input_specifier_args.git_history == GitHistoryMode::None {
ie.enumerate_git_history(false);
}
let collect_git_metadata = true;
ie.collect_git_metadata(collect_git_metadata);
Ok(ie)
}()
.context("Failed to initialize filesystem enumerator")?;
let (enum_thread, input_recv, exclude_globset) = {
let fs_enumerator = make_fs_enumerator(args, input_roots.into())
.context("Failed to initialize filesystem enumerator")?;
let exclude_globset = fs_enumerator.as_ref().and_then(|ie| ie.exclude_globset());
let channel_size = std::cmp::max(args.num_jobs * 128, 1024);
let (input_send, input_recv) = crossbeam_channel::bounded(channel_size);
let input_enumerator_thread = std::thread::Builder::new()
.name("input_enumerator".to_string())
.spawn(move || -> Result<_> {
if let Some(fs_enumerator) = fs_enumerator {
fs_enumerator.run(input_send.clone())?;
}
Ok(())
})
.context("Failed to enumerate filesystem inputs")?;
(input_enumerator_thread, input_recv, exclude_globset)
};
let enum_cfg = EnumeratorConfig {
enumerate_git_history: match args.input_specifier_args.git_history {
GitHistoryMode::Full => true,
GitHistoryMode::None => false,
},
collect_git_metadata: args.input_specifier_args.commit_metadata,
repo_scan_timeout,
exclude_globset,
};
let (send_ds, recv_ds) = create_datastore_channel(args.num_jobs);
let datastore_writer_thread =
spawn_datastore_writer_thread(datastore, recv_ds, !args.no_dedup)?;
let t1 = Instant::now();
let num_blob_processors = Mutex::new(0u64);
let seen_blobs = BlobIdMap::new();
let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone())));
let matcher = Matcher::new(
&rules_db,
scanner_pool.clone(),
&seen_blobs,
Some(&matcher_stats),
enable_profiling,
Some(shared_profiler),
)?;
let blob_processor_init_time = Mutex::new(t1.elapsed());
let make_blob_processor = || -> BlobProcessor {
let t1 = Instant::now();
*num_blob_processors.lock().unwrap() += 1;
{
let mut init_time = blob_processor_init_time.lock().unwrap();
*init_time += t1.elapsed();
}
BlobProcessor { matcher }
};
let scan_res: Result<()> = input_recv
.into_iter()
.par_bridge()
.filter_map(|input| match (&enum_cfg, input).into_blob_iter() {
Err(e) => {
debug!("Error enumerating input: {e:#}");
None
}
Ok(blob_iter) => blob_iter,
})
.flatten()
.try_for_each_init(
|| (make_blob_processor.clone()(), progress.clone()),
move |(processor, progress), entry| {
let (origin, blob) = match entry {
Err(e) => {
error!("Error loading input: {e:#}");
return Ok(());
}
Ok(entry) => entry,
};
// Check if this is an archive file
let is_archive = if let Origin::File(file_origin) = &origin.first() {
is_compressed_file(&file_origin.path)
} else {
false
};
let is_binary = is_binary(&blob.bytes());
let should_skip = if is_archive {
// For archives: skip only if --no_extract_archives is true
args.content_filtering_args.no_extract_archives
} else {
// For non-archives: skip if it's binary and --no_binary is true
is_binary && args.content_filtering_args.no_binary
};
if should_skip {
progress.suspend(|| {
if is_archive {
debug!("Skipping archive: {}", blob.id);
} else {
debug!("Skipping binary blob: {}", blob.id);
}
});
return Ok(());
}
progress.inc(blob.len().try_into().unwrap());
match processor.run(origin, blob, args.no_dedup, args.redact) {
Ok(None) => {
// nothing to record
}
Ok(Some((origin_set, blob_metadata, vec_of_matches))) => {
for (_, single_match) in vec_of_matches {
// Send each match
send_ds.send((
Arc::new(origin_set.clone()),
Arc::new(blob_metadata.clone()),
single_match,
))?;
}
}
Err(e) => {
debug!("Error scanning input: {e:#}");
}
}
Ok(())
},
);
enum_thread.join().unwrap().context("Failed to enumerate inputs")?;
let (..) = datastore_writer_thread
.join()
.unwrap()
.context("Failed to save results to the datastore")?;
scan_res.context("Failed to scan inputs")?;
progress.finish();
Ok(())
}
/// Builds the `FilesystemEnumerator` for `input_roots`, configured from the command-line
/// arguments.
///
/// Returns `Ok(None)` when `input_roots` is empty. Otherwise the enumerator is configured with
/// the job count, the maximum file size, the git-history mode, the `no_dedup` flag and the
/// exclude patterns taken from `args`; git metadata collection is switched off.
///
/// # Errors
/// Fails when the enumerator cannot be constructed or the exclude patterns are rejected.
fn make_fs_enumerator(
    args: &scan::ScanArgs,
    input_roots: Vec<PathBuf>,
) -> Result<Option<FilesystemEnumerator>> {
    // Nothing to enumerate: the caller treats `None` as "no filesystem inputs".
    if input_roots.is_empty() {
        return Ok(None);
    }

    let mut enumerator = FilesystemEnumerator::new(&input_roots, &args)?;
    enumerator.threads(args.num_jobs);
    enumerator.max_filesize(args.content_filtering_args.max_file_size_bytes());
    if args.input_specifier_args.git_history == GitHistoryMode::None {
        enumerator.enumerate_git_history(false);
    }
    // The dedup setting is forwarded so it also applies while enumerating git history.
    enumerator.no_dedup(args.no_dedup);
    enumerator.set_exclude_patterns(&args.content_filtering_args.exclude)?;
    // Git metadata is never collected by this enumerator.
    enumerator.collect_git_metadata(false);
    Ok(Some(enumerator))
}
// Parallel blob iterators for each kind of enumerated input.
/// Implements parallel iteration for either a single blob or a list of blobs.
///
/// The lifetime `'a` is the lifetime of the `Blob<'a>` items the iterator yields.
struct FileResultIter<'a> {
/// What is yielded: at most one `(origin, blob)` pair, or a batch of archive entries.
iter_kind: FileResultIterKind,
/// Zero-sized marker that ties the otherwise unused lifetime `'a` to the struct.
_marker: PhantomData<&'a ()>,
}
/// Feeds the blob(s) held by a `FileResultIter` into a rayon consumer, wrapping each in `Ok`.
impl<'a> ParallelIterator for FileResultIter<'a> {
    type Item = Result<(OriginSet, Blob<'a>)>;

    fn drive_unindexed<C>(self, consumer: C) -> C::Result
    where
        C: rayon::iter::plumbing::UnindexedConsumer<Self::Item>,
    {
        match self.iter_kind {
            // Exactly one item: hand it to a sequential folder, no splitting needed.
            FileResultIterKind::Single(Some(entry)) => {
                consumer.into_folder().consume(Ok(entry)).complete()
            }
            // No item: complete the folder without consuming anything.
            FileResultIterKind::Single(None) => consumer.into_folder().complete(),
            // Archive entries are distributed over the rayon pool.
            FileResultIterKind::Archive(entries) => {
                entries.into_par_iter().map(Ok).drive_unindexed(consumer)
            }
        }
    }
}
/// Turns a single file into blobs, optionally looking inside compressed files and archives.
impl ParallelBlobIterator for FileResult {
    type Iter<'a> = FileResultIter<'a>;

    /// Loads the file as one blob or, when extraction is enabled and the path looks compressed,
    /// as the decompressed content (one blob) or the archive members (one blob each).
    ///
    /// Returns `Ok(None)` when decompression fails or when an archive is met with
    /// `extraction_depth == 0`. Archive members that cannot be read from disk are skipped.
    ///
    /// # Errors
    /// Fails when the plain file, or a decompressed single file on disk, cannot be loaded.
    fn into_blob_iter<'a>(self) -> Result<Option<Self::Iter<'a>>> {
        let extraction_enabled = self.extract_archives;
        let max_extraction_depth = self.extraction_depth;
        if extraction_enabled && is_compressed_file(&self.path) {
            match decompress_file_to_temp(&self.path) {
                // `_temp_dir` stays bound until the end of this arm; presumably it is the guard
                // of the temporary extraction directory -- TODO confirm.
                Ok((content, _temp_dir)) => match content {
                    // Single-file decompression fully in memory.
                    CompressedContent::Raw(ref data) => {
                        let origin = OriginSet::new(Origin::from_file(self.path.clone()), vec![]);
                        let blob = Blob::from_bytes(data.to_vec());
                        Ok(Some(FileResultIter {
                            iter_kind: FileResultIterKind::Single(Some((origin, blob))),
                            _marker: PhantomData,
                        }))
                    }
                    // Single-file decompression streamed to a file. We read it back into memory
                    // here.
                    CompressedContent::RawFile(path) => {
                        let origin = OriginSet::new(Origin::from_file(self.path.clone()), vec![]);
                        let blob = Blob::from_file(&path)?;
                        Ok(Some(FileResultIter {
                            iter_kind: FileResultIterKind::Single(Some((origin, blob))),
                            _marker: PhantomData,
                        }))
                    }
                    // Multifile archive (in memory).
                    CompressedContent::Archive(ref files) => {
                        if max_extraction_depth == 0 {
                            debug!(
                                "Skipping nested archive (max depth reached): {}",
                                self.path.display()
                            );
                            return Ok(None);
                        }
                        // NOTE(review): the origin is built from the entry name alone, not from
                        // the archive path joined with it -- confirm this is intended.
                        let items = files
                            .iter()
                            .map(|(filename, data)| {
                                let full_path = PathBuf::from(filename);
                                let nested_origin =
                                    OriginSet::new(Origin::from_file(full_path), vec![]);
                                (nested_origin, Blob::from_bytes(data.to_vec()))
                            })
                            .collect();
                        Ok(Some(FileResultIter {
                            iter_kind: FileResultIterKind::Archive(items),
                            _marker: PhantomData,
                        }))
                    }
                    // Multifile archive (files on disk).
                    CompressedContent::ArchiveFiles(ref entries) => {
                        if max_extraction_depth == 0 {
                            debug!(
                                "Skipping nested archive (max depth reached): {}",
                                self.path.display()
                            );
                            return Ok(None);
                        }
                        // Read each extracted file from disk and create a Blob.
                        let mut items = Vec::new();
                        for (filename, disk_path) in entries {
                            let blob = match Blob::from_file(disk_path) {
                                Ok(b) => b,
                                Err(e) => {
                                    debug!(
                                        "Failed to mmap extracted file {}: {}",
                                        disk_path.display(),
                                        e
                                    );
                                    continue; // skip unreadable / unmappable file
                                }
                            };
                            let full_path = PathBuf::from(filename);
                            let nested_origin =
                                OriginSet::new(Origin::from_file(full_path), vec![]);
                            items.push((nested_origin, blob));
                        }
                        Ok(Some(FileResultIter {
                            iter_kind: FileResultIterKind::Archive(items),
                            _marker: PhantomData,
                        }))
                    }
                },
                Err(e) => {
                    debug!("Failed to decompress {}: {}", self.path.display(), e);
                    Ok(None) // Skip on decompression failure
                }
            }
        } else {
            // Not compressed or extraction disabled: read file as a single blob.
            let blob = Blob::from_file(&self.path)
                .with_context(|| format!("Failed to load blob from {}", self.path.display()))?;
            let origin = OriginSet::new(Origin::from_file(self.path.clone()), vec![]);
            Ok(Some(FileResultIter {
                iter_kind: FileResultIterKind::Single(Some((origin, blob))),
                _marker: PhantomData,
            }))
        }
    }
}
/// Parallel iterator over the blobs of one enumerated Git repository.
///
/// The lifetime `'a` is the lifetime of the `Blob<'a>` items the iterator yields.
struct GitRepoResultIter<'a> {
/// The enumeration result (repository handle, path and blob metadata) to iterate over.
inner: GitRepoResult,
/// Point in time after which blob reads are refused with a timeout error.
deadline: std::time::Instant,
/// A marker so the struct itself carries the lifetime.
_marker: std::marker::PhantomData<&'a ()>,
}
impl ParallelBlobIterator for GitRepoResult {
type Iter<'a> = GitRepoResultIter<'a>;
fn into_blob_iter<'a>(self) -> Result<Option<Self::Iter<'a>>> {
// placeholder 1 h deadline; will be overwritten immediately
const PLACEHOLDER: Duration = Duration::from_secs(3600);
Ok(Some(GitRepoResultIter {
inner: self,
deadline: Instant::now() + PLACEHOLDER,
_marker: std::marker::PhantomData,
}))
}
}
/// Loads every blob listed in the enumeration result from the repository, in parallel, and
/// yields it together with its origin(s).
impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> {
type Item = Result<(OriginSet, Blob<'a>)>;
/// Drives `consumer` with one `Result<(OriginSet, Blob)>` per blob.
///
/// Once `deadline` has passed, each remaining blob yields an error instead of being read. Only
/// the first of those errors reaches the consumer; the others carry a sentinel message and are
/// filtered out, so a timed-out repository is reported once.
fn drive_unindexed<C>(self, consumer: C) -> C::Result
where
C: rayon::iter::plumbing::UnindexedConsumer<Self::Item>,
{
// ── shared state ──────────────────────────────────────────────
// `into_sync` produces a repository handle that can be shared with the rayon workers.
let repo_sync = self.inner.repository.into_sync();
let repo_path = Arc::new(self.inner.path.clone());
let deadline = self.deadline;
let flag = Arc::new(AtomicBool::new(false)); // first-timeout gate
self.inner
.blobs
.into_par_iter()
.with_min_len(1024)
// The init closure gives the mapping closure a thread-local repository handle to work with.
.map_init(|| repo_sync.to_thread_local(), {
let repo_path = Arc::clone(&repo_path);
let flag = Arc::clone(&flag);
move |repo: &mut GixRepo, md| -> Result<(OriginSet, Blob)> {
// ── deadline guard (the deadline is whatever the struct was given) ──
if StdInstant::now() > deadline {
// `swap` returns the previous value: `true` means the timeout was already reported.
if flag.swap(true, Ordering::Relaxed) {
bail!("__timeout_silenced__");
}
bail!("blob-read timeout (repo: {})", repo_path.display());
}
// ── load blob ────────────────────────────────
let blob_id = md.blob_oid;
let mut raw = repo.find_object(blob_id)?.try_into_blob()?;
// `mem::take` moves the bytes out of the git object instead of copying them.
let blob = Blob::new(BlobId::from(&blob_id), std::mem::take(&mut raw.data));
// ── build Origin — CLONE Arc & PathBuf ──────
let origin = OriginSet::try_from_iter(md.first_seen.iter().map(|e| {
Origin::from_git_repo_with_first_commit(
Arc::clone(&repo_path),
Arc::clone(&e.commit_metadata), // ← clone Arc
e.path.clone(), // ← clone path
)
}))
// No first-seen entries: fall back to an origin that only names the repository.
.unwrap_or_else(|| Origin::from_git_repo(Arc::clone(&repo_path)).into());
Ok((origin, blob))
}
})
// Drop the sentinel errors produced after the first timeout report.
.filter(|res| {
!matches!(res,
Err(e) if e.to_string() == "__timeout_silenced__"
)
})
.drive_unindexed(consumer)
}
}
/// Parallel iterator over the entries of an enumerator file, read line by line.
///
/// The lifetime `'a` is the lifetime of the `Blob<'a>` items the iterator yields.
struct EnumeratorFileIter<'a> {
/// The enumerator file description; its `path` is used in error messages.
inner: EnumeratorFileResult,
/// Buffered reader over the opened enumerator file.
reader: std::io::BufReader<std::fs::File>,
/// Zero-sized marker that ties the otherwise unused lifetime `'a` to the struct.
_marker: PhantomData<&'a ()>,
}
/// Opens an enumerator file and exposes its entries as a parallel blob iterator.
impl ParallelBlobIterator for EnumeratorFileResult {
    type Iter<'a> = EnumeratorFileIter<'a>;

    /// Opens the file at `self.path` behind a buffered reader.
    ///
    /// # Errors
    /// Fails when the file cannot be opened; the error names the offending path so the log line
    /// is actionable (a bare I/O error does not say which file was involved).
    fn into_blob_iter<'a>(self) -> Result<Option<Self::Iter<'a>>> {
        let file = std::fs::File::open(&self.path).with_context(|| {
            format!("Failed to open enumerator file {}", self.path.display())
        })?;
        let reader = std::io::BufReader::new(file);
        Ok(Some(EnumeratorFileIter { inner: self, reader, _marker: PhantomData }))
    }
}
/// The parallel blob iterator for any kind of found input; one variant per concrete iterator.
enum FoundInputIter<'a> {
/// Blobs coming from a regular file (possibly a decompressed archive).
File(FileResultIter<'a>),
/// Blobs coming from an enumerated Git repository.
GitRepo(GitRepoResultIter<'a>),
/// Blobs coming from the lines of an enumerator file.
EnumeratorFile(EnumeratorFileIter<'a>),
}
// Enumerator file parallelism approach:
//
// - Split into lines sequentially
// - Parallelize JSON deserialization (JSON is an expensive serialization format, but easy to sling
//   around, hence used here -- another format like Arrow or msgpack would be much more efficient)
impl<'a> ParallelIterator for EnumeratorFileIter<'a> {
    type Item = Result<(OriginSet, Blob<'a>)>;

    /// Yields one `(OriginSet, Blob)` per line of the enumerator file.
    ///
    /// Lines that cannot be read or parsed are yielded as `Err` items carrying the file path and
    /// the 1-based line number. Reading stops after the first I/O error that is not a per-line
    /// UTF-8 failure.
    fn drive_unindexed<C>(self, consumer: C) -> C::Result
    where
        C: rayon::iter::plumbing::UnindexedConsumer<Self::Item>,
    {
        use std::io::{BufRead, ErrorKind};
        (1usize..)
            .zip(self.reader.lines())
            // `lines()` never ends on its own when the reader keeps failing (for example when
            // the path is a directory), so discarding errors would spin forever. Stop after the
            // first error, except for `InvalidData`: a line that is not valid UTF-8 is consumed
            // by the reader and the following lines can still be read.
            .scan(false, |reader_failed, (line_num, line)| {
                if *reader_failed {
                    return None;
                }
                if let Err(e) = &line {
                    *reader_failed = e.kind() != ErrorKind::InvalidData;
                }
                Some((line_num, line))
            })
            .par_bridge()
            .map(|(line_num, line)| {
                let line = line.with_context(|| {
                    format!("Error reading enumerator {}:{line_num}", self.inner.path.display())
                })?;
                let e: EnumeratorBlobResult = serde_json::from_str(&line).with_context(|| {
                    format!("Error in enumerator {}:{line_num}", self.inner.path.display())
                })?;
                let origin = OriginSet::new(Origin::from_extended(e.origin), Vec::new());
                let blob = Blob::from_bytes(e.content.as_bytes().to_owned());
                Ok((origin, blob))
            })
            .drive_unindexed(consumer)
    }
}
/// Conversion of an enumerated input into a rayon parallel iterator over the blobs it contains.
trait ParallelBlobIterator {
/// The concrete parallel iterator returned by `into_blob_iter`.
/// It is generic over the lifetime `'a` that the produced `Blob<'a>` carries.
type Iter<'a>: ParallelIterator<Item = Result<(OriginSet, Blob<'a>)>> + 'a
where
Self: 'a;
/// Convert the input into an *optional* parallel iterator of `(OriginSet, Blob)` tuples.
///
/// `Ok(None)` means the input contributes no blobs; `Err` means the input could not be
/// prepared for iteration at all.
fn into_blob_iter<'a>(self) -> Result<Option<Self::Iter<'a>>>
where
Self: 'a;
}
/// Delegates parallel iteration to whichever concrete iterator the variant wraps.
impl<'a> ParallelIterator for FoundInputIter<'a> {
    type Item = Result<(OriginSet, Blob<'a>)>;

    fn drive_unindexed<C>(self, consumer: C) -> C::Result
    where
        C: rayon::iter::plumbing::UnindexedConsumer<Self::Item>,
    {
        match self {
            Self::File(file_iter) => file_iter.drive_unindexed(consumer),
            Self::GitRepo(repo_iter) => repo_iter.drive_unindexed(consumer),
            Self::EnumeratorFile(lines_iter) => lines_iter.drive_unindexed(consumer),
        }
    }
}
/// Dispatches a found input, together with the enumerator configuration, to the matching
/// blob iterator.
impl<'cfg> ParallelBlobIterator for (&'cfg EnumeratorConfig, FoundInput) {
    type Iter<'a>
        = FoundInputIter<'a>
    where
        Self: 'a;

    /// Converts the input into a `FoundInputIter`.
    ///
    /// - Files and enumerator files are forwarded to their own `into_blob_iter`.
    /// - Directories are only considered when git-history enumeration is enabled and a Git
    ///   repository can be opened at the path. The repository is enumerated on a helper thread;
    ///   if that takes longer than `repo_scan_timeout`, the repository is skipped (`Ok(None)`)
    ///   and the helper thread is left to finish on its own.
    ///
    /// The calling thread blocks while the repository is being enumerated.
    ///
    /// # Errors
    /// Fails when the file / enumerator file cannot be prepared or when opening the Git
    /// repository reports an error. Enumeration failures are logged and yield `Ok(None)`.
    fn into_blob_iter<'a>(self) -> Result<Option<Self::Iter<'a>>>
    where
        'cfg: 'a,
    {
        use std::sync::mpsc::RecvTimeoutError;
        use std::time::Instant;
        let (cfg, input) = self;
        match input {
            // ───────────── regular file ─────────────
            FoundInput::File(i) => Ok(i.into_blob_iter()?.map(FoundInputIter::File)),
            // ───────────── directory (possible Git repo) ─────────────
            FoundInput::Directory(i) => {
                let path = &i.path;
                if !cfg.enumerate_git_history {
                    return Ok(None);
                }
                // Try to open a Git repository at that path
                let repository = match open_git_repo(path)? {
                    Some(r) => r,
                    None => return Ok(None),
                };
                debug!("Found Git repository at {}", path.display());
                let t_start = Instant::now();
                let collect_git_metadata = cfg.collect_git_metadata;
                let timeout = cfg.repo_scan_timeout;
                // Spawn an enumerator thread so we can time-out cleanly
                let path_clone = path.to_path_buf();
                let (tx, rx) = std::sync::mpsc::channel();
                let exclude_globset = cfg.exclude_globset.clone();
                let handle = std::thread::spawn(move || {
                    let res = if collect_git_metadata {
                        GitRepoWithMetadataEnumerator::new(
                            &path_clone,
                            repository,
                            exclude_globset,
                        )
                        .run()
                    } else {
                        GitRepoEnumerator::new(&path_clone, repository).run()
                    };
                    // The receiver may already be gone after a timeout; that is fine.
                    let _ = tx.send(res);
                });
                // Block until the result arrives or the timeout (counted from `t_start`)
                // expires. This wakes up as soon as the worker is done, unlike a
                // `try_recv` + `sleep` polling loop.
                let remaining = timeout.saturating_sub(t_start.elapsed());
                let git_result = match rx.recv_timeout(remaining) {
                    Ok(res) => res,
                    Err(RecvTimeoutError::Timeout) => {
                        debug!(
                            "Git repo enumeration at {} timed-out after {:.1}s (> {} s)",
                            path.display(),
                            t_start.elapsed().as_secs_f64(),
                            timeout.as_secs()
                        );
                        // Abandon the worker thread and skip this repo
                        return Ok(None);
                    }
                    // The sender was dropped without sending, i.e. the worker panicked.
                    Err(RecvTimeoutError::Disconnected) => {
                        debug!("Enumerator thread disconnected for {}", path.display());
                        return Ok(None);
                    }
                };
                let _ = handle.join(); // avoid leak
                match git_result {
                    Err(e) => {
                        debug!("Failed to enumerate Git repo at {}: {e}", path.display());
                        Ok(None)
                    }
                    Ok(repo_result) => {
                        debug!(
                            "Enumerated Git repo at {} in {:.2}s",
                            path.display(),
                            t_start.elapsed().as_secs_f64()
                        );
                        // Convert to a blob iterator, then patch the deadline so that blob
                        // reading gets the same time budget, counted from now.
                        repo_result
                            .into_blob_iter() // Option<GitRepoResultIter>
                            .map(|iter| {
                                iter.map(|mut gri| {
                                    gri.deadline = Instant::now() + timeout;
                                    FoundInputIter::GitRepo(gri)
                                })
                            })
                    }
                }
            }
            // ───────────── pre-enumerated JSON file list ─────────────
            FoundInput::EnumeratorFile(i) => {
                Ok(i.into_blob_iter()?.map(FoundInputIter::EnumeratorFile))
            }
        }
    }
}
/// A simple enum describing how we yield file content:
/// - Single: one `(origin, blob)`
/// - Archive: multiple `(origin, blob)` items from a decompressed archive
enum FileResultIterKind {
/// At most one item; `None` yields nothing.
Single(Option<(OriginSet, OwnedBlob)>),
/// One item per archive member that could be loaded.
Archive(Vec<(OriginSet, OwnedBlob)>),
}
/// Blob payload of an enumerator entry, given either as base64 or as plain UTF-8 text.
#[derive(Deserialize)]
pub enum Content {
/// Selected by the `content_base64` key; the string value is base64-decoded into raw bytes.
#[serde(rename = "content_base64")]
Base64(#[serde(deserialize_with = "deserialize_b64_bstring")] BString),
/// Selected by the `content` key; the string value is kept as is.
#[serde(rename = "content")]
Utf8(String),
}
impl Content {
pub fn as_bytes(&self) -> &[u8] {
match self {
Content::Base64(s) => s.as_slice(),
Content::Utf8(s) => s.as_bytes(),
}
}
}
fn deserialize_b64_bstring<'de, D>(deserializer: D) -> Result<BString, D::Error>
where
D: Deserializer<'de>,
{
let encoded = String::deserialize(deserializer)?;
let decoded = STANDARD.decode(&encoded).map_err(serde::de::Error::custom)?;
Ok(decoded.into())
}
// -------------------------------------------------------------------------------------------------
/// An entry deserialized from an extensible enumerator
///
/// One such entry is parsed from each line of an enumerator file.
#[derive(serde::Deserialize)]
struct EnumeratorBlobResult {
/// The blob payload; flattened, so its `content` / `content_base64` key sits directly on the
/// entry object.
#[serde(flatten)]
pub content: Content,
/// Free-form origin description, kept as raw JSON and passed to `Origin::from_extended`.
pub origin: serde_json::Value,
}