From 7d9d3be132574c7c7ade97428319087998bef028 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 25 Oct 2025 17:12:51 -0700 Subject: [PATCH] - Fixed local filesystem scans to keep open_path_as_is enabled when opening Git repositories and only disable it for diff-based scans. - Created Linux and Windows specific installer script - Updated diff-focused scanning so --branch-root-commit can be provided alongside --branch, letting you diff from a chosen commit while targeting a specific branch tip (still defaulting back to the --branch ref when the commit is omitted). --- CHANGELOG.md | 5 + Cargo.toml | 2 +- README.md | 47 +- data/rules/vercel.yml | 2 +- scripts/install-kingfisher.ps1 | 80 +++ scripts/install-kingfisher.sh | 151 +++++ src/cli/commands/inputs.rs | 26 + src/lib.rs | 12 +- src/main.rs | 2 + src/reporter.rs | 2 + src/reporter/json_format.rs | 2 + src/scanner/enumerate.rs | 72 ++- src/scanner/enumerate.rs.orig | 1070 ++++++++++++++++++++++++++++++++ tests/int_allowlist.rs | 2 + tests/int_bitbucket.rs | 2 + tests/int_dedup.rs | 2 + tests/int_github.rs | 2 + tests/int_gitlab.rs | 4 + tests/int_redact.rs | 2 + tests/int_slack.rs | 4 + tests/int_validation_cache.rs | 2 + tests/int_vulnerable_files.rs | 4 + tests/smoke_branch.rs | 132 ++++ 23 files changed, 1608 insertions(+), 21 deletions(-) create mode 100644 scripts/install-kingfisher.ps1 create mode 100755 scripts/install-kingfisher.sh create mode 100644 src/scanner/enumerate.rs.orig diff --git a/CHANGELOG.md b/CHANGELOG.md index d489420..fbe4eaf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to this project will be documented in this file. +## [v1.61.0] +- Fixed local filesystem scans to keep `open_path_as_is` enabled when opening Git repositories and only disable it for diff-based scans. 
+- Created Linux- and Windows-specific installer scripts +- Updated diff-focused scanning so `--branch-root-commit` can be provided alongside `--branch`, letting you diff from a chosen commit while targeting a specific branch tip (still defaulting back to the `--branch` ref when the commit is omitted). + ## [v1.60.0] - Removed the `--bitbucket-username`, `--bitbucket-token`, and `--bitbucket-oauth-token` flags in favour of `KF_BITBUCKET_*` environment variables when authenticating to Bitbucket. - Added provider-specific `kingfisher scan` subcommands (for example `kingfisher scan github …`) that translate into the legacy flags under the hood. The new layout keeps backwards compatibility while removing the wall of provider options from `kingfisher scan --help`. diff --git a/Cargo.toml b/Cargo.toml index 94c2e3f..d85f76f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.60.0" +version = "1.61.0" description = "MongoDB's blazingly fast and accurate secret scanning and validation tool" edition.workspace = true rust-version.workspace = true diff --git a/README.md b/README.md index 3a73552..b7fc292 100644 --- a/README.md +++ b/README.md @@ -166,17 +166,23 @@ brew install kingfisher
-You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform. +Use the bundled installer script to fetch the latest release and place it in +`~/.local/bin` (or a directory of your choice): ```bash # Linux, macOS curl --silent --location \ - https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.sh | \ - sh && \ - ubi --project mongodb/kingfisher --in "$HOME/.local/bin" + https://raw.githubusercontent.com/mongodb/kingfisher/main/scripts/install-kingfisher.sh | \ + bash ``` -This installs and runs `ubi` and then places the `kingfisher` executable in `~/.local/bin` on Unix-like systems. +To install into a custom location, pass the desired directory as an argument: + +```bash +curl --silent --location \ + https://raw.githubusercontent.com/mongodb/kingfisher/main/scripts/install-kingfisher.sh | \ + bash -s -- /opt/kingfisher +```
@@ -184,14 +190,21 @@ This installs and runs `ubi` and then places the `kingfisher` executable in `~/.
-You can easily install using [ubi](https://github.com/houseabsolute/ubi), which downloads the correct binary for your platform. +Download and run the PowerShell installer to place the binary in +`$env:USERPROFILE\bin` (or another directory you specify): ```powershell # Windows -powershell -exec bypass -c "Invoke-WebRequest -URI 'https://raw.githubusercontent.com/houseabsolute/ubi/master/bootstrap/bootstrap-ubi.ps1' -UseBasicParsing | Invoke-Expression" && ubi --project mongodb/kingfisher --in . +Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass -Force +Invoke-WebRequest -Uri 'https://raw.githubusercontent.com/mongodb/kingfisher/main/scripts/install-kingfisher.ps1' -OutFile install-kingfisher.ps1 +./install-kingfisher.ps1 ``` -This installs and runs `ubi` and then places the `kingfisher` executable in the current directory on Windows. +You can provide a custom destination using the `-InstallDir` parameter: + +```powershell +./install-kingfisher.ps1 -InstallDir 'C:\Tools\Kingfisher' +```
@@ -415,6 +428,11 @@ kingfisher scan ./my-project \ Limit scanning to the delta between your default branch and a pull request branch by combining `--since-commit` with `--branch` (defaults to `HEAD`). This only scans files that differ between the two references, which keeps CI runs fast while still blocking new secrets. +Use `--branch-root-commit` alongside `--branch` when you need to include a specific commit (and everything after it) in a diff-focused scan without re-examining earlier history. Provide the branch tip (or other comparison ref) via `--branch`, and pass the commit or merge-base you want to include with `--branch-root-commit`. If you omit `--branch-root-commit`, you can still enable `--branch-root` to fall back to treating the `--branch` ref itself as the inclusive root for backwards compatibility. This is especially useful in long-lived branches where you want to resume scanning from a previous review point or from the commit where a hotfix forked. + +> **How is this different from `--since-commit`?** +> `--since-commit` computes a diff between the branch tip and another ref, so it only inspects files that changed between those two points in history. `--branch-root-commit` rewinds to the parent of the commit you provide and then scans everything introduced from that commit forward, even if the files are unchanged relative to another baseline. Reach for `--since-commit` to keep CI scans fast by checking only the latest delta, and use `--branch-root-commit` when you want to re-audit the full contents of a branch starting at a specific commit. + ```bash kingfisher scan . 
\ --since-commit origin/main \ @@ -436,6 +454,19 @@ kingfisher scan /tmp/SecretsTest --branch feature-1 \ # scan only a specific commit kingfisher scan /tmp/dev/SecretsTest \ --branch baba6ccb453963d3f6136d1ace843e48d7007c3f +# +# scan feature-1 starting at a specific commit (inclusive) +kingfisher scan /tmp/SecretsTest --branch feature-1 \ + --branch-root-commit baba6ccb453963d3f6136d1ace843e48d7007c3f +# +# scan feature-1 starting from the commit where the branch diverged from main +kingfisher scan /tmp/SecretsTest --branch feature-1 \ + --branch-root-commit $(git -C /tmp/SecretsTest merge-base main feature-1) +# +# scan from a hotfix commit that should be re-checked before merging +HOTFIX_COMMIT=$(git -C /tmp/SecretsTest rev-parse hotfix~1) +kingfisher scan /tmp/SecretsTest --branch hotfix \ + --branch-root-commit "$HOTFIX_COMMIT" ``` When the branch under test is already checked out, `--branch HEAD` or omitting `--branch` entirely is sufficient. Kingfisher exits with `200` when any findings are discovered and `205` when validated secrets are present, allowing CI jobs to fail automatically if new credentials slip in. diff --git a/data/rules/vercel.yml b/data/rules/vercel.yml index d649b00..121b5fe 100644 --- a/data/rules/vercel.yml +++ b/data/rules/vercel.yml @@ -8,7 +8,7 @@ rules: (?:.|[\n\r]){0,32}? \b ( - [a-zA-Z0-9]{24} + [A-Z0-9]{24} ) \b confidence: medium diff --git a/scripts/install-kingfisher.ps1 b/scripts/install-kingfisher.ps1 new file mode 100644 index 0000000..5e2405c --- /dev/null +++ b/scripts/install-kingfisher.ps1 @@ -0,0 +1,80 @@ +<# +.SYNOPSIS + Download and install the latest Kingfisher release for Windows. + +.DESCRIPTION + Fetches the most recent GitHub release for mongodb/kingfisher, downloads the + Windows x64 archive, and extracts kingfisher.exe to the destination folder. + By default the script installs into "$env:USERPROFILE\bin". + +.PARAMETER InstallDir + Optional destination directory for the kingfisher.exe binary. 
+ +.EXAMPLE + ./install-kingfisher.ps1 + +.EXAMPLE + ./install-kingfisher.ps1 -InstallDir "C:\\Tools" +#> +param( + [Parameter(Position = 0)] + [string]$InstallDir = (Join-Path $env:USERPROFILE 'bin') +) + +$repo = 'mongodb/kingfisher' +$apiUrl = "https://api.github.com/repos/$repo/releases/latest" +$assetName = 'kingfisher-windows-x64.zip' + +if (-not (Get-Command Invoke-WebRequest -ErrorAction SilentlyContinue)) { + throw 'Invoke-WebRequest is required to download releases.' +} + +if (-not (Get-Command Expand-Archive -ErrorAction SilentlyContinue)) { + throw 'Expand-Archive is required to extract the release archive. Install the PowerShell archive module.' +} + +Write-Host "Fetching latest release metadata for $repo…" +try { + $response = Invoke-WebRequest -Uri $apiUrl -UseBasicParsing + $release = $response.Content | ConvertFrom-Json +} catch { + throw "Failed to retrieve release information from GitHub: $_" +} + +$releaseTag = $release.tag_name +$asset = $release.assets | Where-Object { $_.name -eq $assetName } +if (-not $asset) { + throw "Could not find asset '$assetName' in the latest release." +} + +$tempDir = New-Item -ItemType Directory -Path ([System.IO.Path]::GetTempPath()) -Name ([System.Guid]::NewGuid().ToString()) +$archivePath = Join-Path $tempDir.FullName $assetName + +try { + if ($releaseTag) { + Write-Host "Latest release: $releaseTag" + } + + Write-Host "Downloading $assetName…" + Invoke-WebRequest -Uri $asset.browser_download_url -OutFile $archivePath -UseBasicParsing + + Write-Host 'Extracting archive…' + Expand-Archive -Path $archivePath -DestinationPath $tempDir.FullName -Force + + $binaryPath = Join-Path $tempDir.FullName 'kingfisher.exe' + if (-not (Test-Path $binaryPath)) { + throw 'Extracted archive did not contain kingfisher.exe.' 
+ } + + New-Item -ItemType Directory -Path $InstallDir -Force | Out-Null + $destination = Join-Path $InstallDir 'kingfisher.exe' + Copy-Item -Path $binaryPath -Destination $destination -Force + + Write-Host "Kingfisher installed to: $destination" + Write-Host "Ensure '$InstallDir' is in your PATH environment variable." +} +finally { + if ($tempDir -and (Test-Path $tempDir.FullName)) { + Remove-Item -Path $tempDir.FullName -Recurse -Force + } +} diff --git a/scripts/install-kingfisher.sh b/scripts/install-kingfisher.sh new file mode 100755 index 0000000..295b4b4 --- /dev/null +++ b/scripts/install-kingfisher.sh @@ -0,0 +1,151 @@ +#!/usr/bin/env bash +set -euo pipefail + +REPO="mongodb/kingfisher" +API_URL="https://api.github.com/repos/${REPO}/releases/latest" +DEFAULT_INSTALL_DIR="$HOME/.local/bin" + +usage() { + cat <<'USAGE' +Usage: install-kingfisher.sh [INSTALL_DIR] + +Downloads the latest Kingfisher release for Linux or macOS and installs the +binary into INSTALL_DIR (default: ~/.local/bin). + +The script requires curl, tar, and python3. +USAGE +} + +if [[ "${1-}" == "-h" || "${1-}" == "--help" ]]; then + usage + exit 0 +fi + +INSTALL_DIR="${1:-$DEFAULT_INSTALL_DIR}" + +if ! command -v curl >/dev/null 2>&1; then + echo "Error: curl is required to download releases." >&2 + exit 1 +fi + +if ! command -v tar >/dev/null 2>&1; then + echo "Error: tar is required to extract the release archive." >&2 + exit 1 +fi + +if ! command -v python3 >/dev/null 2>&1; then + echo "Error: python3 is required to process the GitHub API response." >&2 + exit 1 +fi + +OS=$(uname -s) +ARCH=$(uname -m) + +case "$OS" in + Linux) + platform="linux" + ;; + Darwin) + platform="darwin" + ;; + *) + echo "Error: Unsupported operating system '$OS'." >&2 + echo "This installer currently supports Linux and macOS." >&2 + exit 1 + ;; +esac + +case "$ARCH" in + x86_64|amd64) + arch_suffix="x64" + ;; + arm64|aarch64) + arch_suffix="arm64" + ;; + *) + echo "Error: Unsupported architecture '$ARCH'." 
>&2 + echo "This installer currently supports x86_64/amd64 and arm64/aarch64." >&2 + exit 1 + ;; +esac + +asset_name="kingfisher-${platform}-${arch_suffix}.tgz" + +echo "Fetching latest release metadata for ${REPO}…" +release_json=$(curl -fsSL "$API_URL") + +if [[ -z "$release_json" ]]; then + echo "Error: Failed to retrieve release information from GitHub." >&2 + exit 1 +fi + +download_url=$(RELEASE_JSON="$release_json" python3 - "$asset_name" <<'PY' +import json +import sys +import os + +asset_name = sys.argv[1] +try: + release = json.loads(os.environ["RELEASE_JSON"]) +except (json.JSONDecodeError, KeyError) as exc: + sys.stderr.write(f"Error: Failed to parse GitHub response: {exc}\n") + sys.exit(1) + +for asset in release.get("assets", []): + if asset.get("name") == asset_name: + print(asset.get("browser_download_url", "")) + sys.exit(0) + +sys.stderr.write(f"Error: Could not find asset '{asset_name}' in the latest release.\n") +sys.exit(1) +PY +) + +if [[ -z "$download_url" ]]; then + exit 1 +fi + +release_tag=$(RELEASE_JSON="$release_json" python3 - <<'PY' +import json +import sys +import os + +try: + release = json.loads(os.environ["RELEASE_JSON"]) +except (json.JSONDecodeError, KeyError) as exc: + sys.stderr.write(f"Error: Failed to parse GitHub response: {exc}\n") + sys.exit(1) + +print(release.get("tag_name", "")) +PY +) + +tmpdir=$(mktemp -d) +cleanup() { + rm -rf "$tmpdir" +} +trap cleanup EXIT + +archive_path="$tmpdir/$asset_name" + +if [[ -n "$release_tag" ]]; then + echo "Latest release: $release_tag" +fi + +echo "Downloading $asset_name…" +curl -fsSL "$download_url" -o "$archive_path" + +echo "Extracting archive…" +tar -C "$tmpdir" -xzf "$archive_path" + +if [[ ! -f "$tmpdir/kingfisher" ]]; then + echo "Error: Extracted archive did not contain the kingfisher binary." 
>&2 + exit 1 +fi + +mkdir -p "$INSTALL_DIR" +install -m 755 "$tmpdir/kingfisher" "$INSTALL_DIR/kingfisher" + +printf 'Kingfisher installed to: %s/kingfisher\n\n' "$INSTALL_DIR" +printf 'Add the following to your shell configuration if the directory is not already in your PATH:\n export PATH="%s:$PATH"\n' "$INSTALL_DIR" + diff --git a/src/cli/commands/inputs.rs b/src/cli/commands/inputs.rs index fdea286..a04785e 100644 --- a/src/cli/commands/inputs.rs +++ b/src/cli/commands/inputs.rs @@ -332,6 +332,32 @@ pub struct InputSpecifierArgs { visible_alias = "ref" )] pub branch: Option, + + /// Treat the `--branch` commit or ref as the inclusive root for the scan. + /// + /// When enabled, Kingfisher diffs from the parent of the selected commit + /// through the current HEAD of the repository, ensuring the chosen commit + /// and every descendant is scanned exactly once. Providing + /// `--branch-root-commit` will also enable this behaviour automatically. + #[arg( + long = "branch-root", + help_heading = "Git Options", + requires = "branch", + conflicts_with = "since_commit", + action = clap::ArgAction::SetTrue + )] + pub branch_root: bool, + + /// Explicit commit or ref to use as the inclusive branch root. Supplying + /// this flag implicitly enables branch-root scanning even if `--branch-root` + /// is omitted. + #[arg( + long = "branch-root-commit", + value_name = "GIT-REF", + help_heading = "Git Options", + conflicts_with = "since_commit" + )] + pub branch_root_commit: Option, } impl InputSpecifierArgs { diff --git a/src/lib.rs b/src/lib.rs index fb9246c..46c581b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -62,6 +62,7 @@ use tracing::debug; pub struct GitDiffConfig { pub since_ref: Option, pub branch_ref: String, + pub branch_root: Option, } struct EnumeratorConfig { @@ -332,7 +333,16 @@ impl FilesystemEnumerator { /// Opens the given Git repository if it exists, returning None if not. 
pub fn open_git_repo(path: &Path) -> Result> { - let opts = Options::isolated().open_path_as_is(false); + open_git_repo_with_options(path, true) +} + +/// Opens the given Git repository with explicit control over the +/// `open_path_as_is` option, returning None if not. +pub fn open_git_repo_with_options( + path: &Path, + open_path_as_is: bool, +) -> Result> { + let opts = Options::isolated().open_path_as_is(open_path_as_is); match open_opts(path, opts) { Err(gix::open::Error::NotARepository { .. }) => Ok(None), Err(err) => Err(err.into()), diff --git a/src/main.rs b/src/main.rs index 78533a5..a7deda6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -418,6 +418,8 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { diff --git a/src/reporter.rs b/src/reporter.rs index 127a9ad..9ec0f66 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -779,6 +779,8 @@ mod tests { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 6435c71..26d1b27 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -153,6 +153,8 @@ mod tests { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index e1d963f..06cc3c9 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -31,7 +31,7 @@ use crate::{ git_commit_metadata::CommitMetadata, git_repo_enumerator::GitBlobMetadata, matcher::{Matcher, MatcherStats}, - open_git_repo, + 
open_git_repo_with_options, origin::{Origin, OriginSet}, rule_profiling::ConcurrentRuleProfiler, rules_database::RulesDatabase, @@ -60,16 +60,29 @@ pub fn enumerate_filesystem_inputs( ) -> Result<()> { let repo_scan_timeout = Duration::from_secs(args.git_repo_timeout); + let branch_root_enabled = args.input_specifier_args.branch_root + || args.input_specifier_args.branch_root_commit.is_some(); + let diff_config = if args.input_specifier_args.since_commit.is_some() || args.input_specifier_args.branch.is_some() + || branch_root_enabled { + let branch_arg = args.input_specifier_args.branch.clone(); + let branch_root_commit = args.input_specifier_args.branch_root_commit.clone(); + let (branch_ref, branch_root) = if branch_root_enabled { + if let Some(explicit_root) = branch_root_commit { + (branch_arg.clone().unwrap_or_else(|| "HEAD".to_string()), Some(explicit_root)) + } else { + ("HEAD".to_string(), branch_arg.clone()) + } + } else { + (branch_arg.clone().unwrap_or_else(|| "HEAD".to_string()), None) + }; + Some(GitDiffConfig { since_ref: args.input_specifier_args.since_commit.clone(), - branch_ref: args - .input_specifier_args - .branch - .clone() - .unwrap_or_else(|| "HEAD".to_string()), + branch_ref, + branch_root, }) } else { None @@ -609,13 +622,14 @@ impl<'cfg> ParallelBlobIterator for (&'cfg EnumeratorConfig, FoundInput) { // ───────────── directory (possible Git repo) ───────────── FoundInput::Directory(i) => { let path = &i.path; + let open_path_as_is = cfg.git_diff.is_none(); - if cfg.git_diff.is_none() && !cfg.enumerate_git_history { + if open_path_as_is && !cfg.enumerate_git_history { return Ok(None); } // Try to open a Git repository at that path - let repository = match open_git_repo(path)? { + let repository = match open_git_repo_with_options(path, open_path_as_is)? 
{ Some(r) => r, None => return Ok(None), }; @@ -719,7 +733,7 @@ fn enumerate_git_diff_repo( exclude_globset: Option>, collect_commit_metadata: bool, ) -> Result { - let GitDiffConfig { since_ref, branch_ref } = diff_cfg; + let GitDiffConfig { since_ref, branch_ref, branch_root } = diff_cfg; let blobs = { let head_id = resolve_diff_ref(&repository, path, &branch_ref).with_context(|| { @@ -760,6 +774,40 @@ fn enumerate_git_diff_repo( .with_context(|| format!("Failed to read tree for commit {}", base_id.to_hex()))?; base_tree = Some(tree); + } else if let Some(ref branch_root_value) = branch_root { + let root_id = + resolve_diff_ref(&repository, path, branch_root_value).with_context(|| { + format!( + "Failed to resolve --branch-root '{}' in repository {}", + branch_root_value, + path.display() + ) + })?; + + let root_commit = root_id + .object() + .with_context(|| format!("Failed to load commit {} for diffing", root_id.to_hex()))? + .try_into_commit() + .with_context(|| { + format!("Referenced object {} is not a commit", root_id.to_hex()) + })?; + + let mut parent_ids = root_commit.parent_ids(); + if let Some(parent_id) = parent_ids.next() { + let parent_commit = parent_id + .object() + .with_context(|| { + format!("Failed to load parent commit {} for diffing", parent_id.to_hex()) + })? 
+ .try_into_commit() + .with_context(|| { + format!("Referenced object {} is not a commit", parent_id.to_hex()) + })?; + let parent_tree = parent_commit.tree().with_context(|| { + format!("Failed to read tree for commit {}", parent_id.to_hex()) + })?; + base_tree = Some(parent_tree); + } } let changes = repository @@ -1008,7 +1056,11 @@ mod tests { let result = enumerate_git_diff_repo( &repo_path, gix_repo, - GitDiffConfig { since_ref: None, branch_ref: "featurefake".to_string() }, + GitDiffConfig { + since_ref: None, + branch_ref: "featurefake".to_string(), + branch_root: None, + }, None, false, )?; diff --git a/src/scanner/enumerate.rs.orig b/src/scanner/enumerate.rs.orig new file mode 100644 index 0000000..28dcba7 --- /dev/null +++ b/src/scanner/enumerate.rs.orig @@ -0,0 +1,1070 @@ +use std::{ + marker::PhantomData, + path::Path, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, Mutex, + }, + time::{Duration, Instant as StdInstant, Instant}, +}; + +use anyhow::{anyhow, bail, Context, Result}; +use base64::{engine::general_purpose::STANDARD, Engine}; +use bstr::{BString, ByteSlice}; +use gix::{object::tree::diff::ChangeDetached, object::tree::EntryKind, Repository as GixRepo}; +use indicatif::{ProgressBar, ProgressStyle}; +use rayon::{ + iter::plumbing::Folder, + prelude::{ParallelIterator, *}, +}; +use serde::{Deserialize, Deserializer}; +use tracing::{debug, error}; + +use smallvec::smallvec; + +use crate::{ + binary::is_binary, + blob::{Blob, BlobAppearance, BlobId, BlobIdMap}, + cli::commands::{github::GitHistoryMode, scan}, + decompress::{decompress_file_to_temp, CompressedContent}, + findings_store, + git_commit_metadata::CommitMetadata, + git_repo_enumerator::GitBlobMetadata, + matcher::{Matcher, MatcherStats}, + open_git_repo_with_options, + origin::{Origin, OriginSet}, + rule_profiling::ConcurrentRuleProfiler, + rules_database::RulesDatabase, + scanner::{ + processing::BlobProcessor, + runner::{create_datastore_channel, spawn_datastore_writer_thread}, + 
util::is_compressed_file, + }, + scanner_pool::ScannerPool, + DirectoryResult, EnumeratorConfig, EnumeratorFileResult, FileResult, FilesystemEnumerator, + FoundInput, GitDiffConfig, GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator, + PathBuf, +}; + +type OwnedBlob = Blob<'static>; + +pub fn enumerate_filesystem_inputs( + args: &scan::ScanArgs, + datastore: Arc>, + input_roots: &[PathBuf], + progress_enabled: bool, + rules_db: &RulesDatabase, + enable_profiling: bool, + shared_profiler: Arc, + matcher_stats: &Mutex, +) -> Result<()> { + let repo_scan_timeout = Duration::from_secs(args.git_repo_timeout); + + let diff_config = if args.input_specifier_args.since_commit.is_some() + || args.input_specifier_args.branch.is_some() + { + Some(GitDiffConfig { + since_ref: args.input_specifier_args.since_commit.clone(), + branch_ref: args + .input_specifier_args + .branch + .clone() + .unwrap_or_else(|| "HEAD".to_string()), + }) + } else { + None + }; + + let progress = if progress_enabled { + let style = + ProgressStyle::with_template("{spinner} {msg} {total_bytes} [{elapsed_precise}]") + .expect("progress bar style template should compile"); + let pb = ProgressBar::new_spinner() + .with_style(style) + .with_message("Scanning files and git repository content..."); + pb.enable_steady_tick(Duration::from_millis(500)); + pb + } else { + ProgressBar::hidden() + }; + let _input_enumerator = || -> Result { + let mut ie = FilesystemEnumerator::new(input_roots, &args)?; + ie.threads(args.num_jobs); + ie.max_filesize(args.content_filtering_args.max_file_size_bytes()); + if args.input_specifier_args.git_history == GitHistoryMode::None { + ie.enumerate_git_history(false); + } + + let collect_git_metadata = true; + ie.collect_git_metadata(collect_git_metadata); + Ok(ie) + }() + .context("Failed to initialize filesystem enumerator")?; + + let (enum_thread, input_recv, exclude_globset) = { + let fs_enumerator = make_fs_enumerator(args, input_roots.to_vec()) + 
.context("Failed to initialize filesystem enumerator")?; + let exclude_globset = fs_enumerator.as_ref().and_then(|ie| ie.exclude_globset()); + let channel_size = std::cmp::max(args.num_jobs * 128, 1024); + + let (input_send, input_recv) = crossbeam_channel::bounded(channel_size); + let diff_config_for_thread = diff_config.clone(); + let roots_for_thread = input_roots.to_vec(); + let input_enumerator_thread = std::thread::Builder::new() + .name("input_enumerator".to_string()) + .spawn(move || -> Result<_> { + if diff_config_for_thread.is_some() { + for root in roots_for_thread { + input_send + .send(FoundInput::Directory(DirectoryResult { path: root })) + .context("Failed to queue repository for scanning")?; + } + } else if let Some(fs_enumerator) = fs_enumerator { + fs_enumerator.run(input_send.clone())?; + } + Ok(()) + }) + .context("Failed to enumerate filesystem inputs")?; + (input_enumerator_thread, input_recv, exclude_globset) + }; + + let enum_cfg = EnumeratorConfig { + enumerate_git_history: match args.input_specifier_args.git_history { + GitHistoryMode::Full => true, + GitHistoryMode::None => false, + }, + collect_git_metadata: args.input_specifier_args.commit_metadata, + repo_scan_timeout, + exclude_globset: exclude_globset.clone(), + git_diff: diff_config.clone(), + }; + let (send_ds, recv_ds) = create_datastore_channel(args.num_jobs); + let datastore_writer_thread = + spawn_datastore_writer_thread(datastore, recv_ds, !args.no_dedup)?; + + let t1 = Instant::now(); + let num_blob_processors = Mutex::new(0u64); + let seen_blobs = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + + let matcher = Matcher::new( + &rules_db, + scanner_pool.clone(), + &seen_blobs, + Some(&matcher_stats), + enable_profiling, + Some(shared_profiler), + &args.extra_ignore_comments, + args.no_inline_ignore, + )?; + let blob_processor_init_time = Mutex::new(t1.elapsed()); + let make_blob_processor = || -> BlobProcessor { + let t1 = 
Instant::now(); + *num_blob_processors.lock().unwrap() += 1; + { + let mut init_time = blob_processor_init_time.lock().unwrap(); + *init_time += t1.elapsed(); + } + BlobProcessor { matcher } + }; + let scan_res: Result<()> = input_recv + .into_iter() + .par_bridge() + .filter_map(|input| match (&enum_cfg, input).into_blob_iter() { + Err(e) => { + debug!("Error enumerating input: {e:#}"); + None + } + Ok(blob_iter) => blob_iter, + }) + .flatten() + .try_for_each_init( + || (make_blob_processor.clone()(), progress.clone()), + move |(processor, progress), entry| { + let (origin, blob) = match entry { + Err(e) => { + error!("Error loading input: {e:#}"); + return Ok(()); + } + Ok(entry) => entry, + }; + // Check if this is an archive file + let is_archive = if let Origin::File(file_origin) = &origin.first() { + is_compressed_file(&file_origin.path) + } else { + false + }; + let is_binary = is_binary(&blob.bytes()); + let should_skip = if is_archive { + // For archives: skip only if --no_extract_archives is true + args.content_filtering_args.no_extract_archives + } else { + // For non-archives: skip if it's binary and --no_binary is true + is_binary && args.content_filtering_args.no_binary + }; + if should_skip { + progress.suspend(|| { + let path = origin + .first() + .blob_path() + .map(|p| p.display().to_string()) + .unwrap_or_else(|| blob.temp_id().to_string()); + if is_archive { + debug!("Skipping archive: {path}"); + } else { + debug!("Skipping binary blob: {path}"); + } + }); + return Ok(()); + } + progress.inc(blob.len().try_into().unwrap()); + match processor.run(origin, blob, args.no_dedup, args.redact, args.no_base64) { + Ok(None) => { + // nothing to record + } + Ok(Some((origin_set, blob_metadata, vec_of_matches))) => { + for (_, single_match) in vec_of_matches { + // Send each match + send_ds.send(( + Arc::new(origin_set.clone()), + Arc::new(blob_metadata.clone()), + single_match, + ))?; + } + } + Err(e) => { + debug!("Error scanning input: {e:#}"); + } + 
} + Ok(()) + }, + ); + + enum_thread.join().unwrap().context("Failed to enumerate inputs")?; + let (..) = datastore_writer_thread + .join() + .unwrap() + .context("Failed to save results to the datastore")?; + scan_res.context("Failed to scan inputs")?; + progress.finish(); + Ok(()) +} + +/// Initialize a `FilesystemEnumerator` based on the command-line arguments and +/// datastore. Also initialize a `Gitignore` that is the same as that used by +/// the filesystem enumerator. +fn make_fs_enumerator( + args: &scan::ScanArgs, + input_roots: Vec, +) -> Result> { + if input_roots.is_empty() { + Ok(None) + } else { + let mut ie = FilesystemEnumerator::new(&input_roots, &args)?; + ie.threads(args.num_jobs); + ie.max_filesize(args.content_filtering_args.max_file_size_bytes()); + if args.input_specifier_args.git_history == GitHistoryMode::None { + ie.enumerate_git_history(false); + } + + // Pass no_dedup when enumerating git history + ie.no_dedup(args.no_dedup); + + ie.set_exclude_patterns(&args.content_filtering_args.exclude)?; + // Determine whether to collect git metadata or not + let collect_git_metadata = false; + ie.collect_git_metadata(collect_git_metadata); + Ok(Some(ie)) + } +} + +// Rest of the file remains the same... +/// Implements parallel iteration for either a single blob or a list of blobs. 
+struct FileResultIter<'a> { + iter_kind: FileResultIterKind, + _marker: PhantomData<&'a ()>, +} + +impl<'a> ParallelIterator for FileResultIter<'a> { + type Item = Result<(OriginSet, Blob<'a>)>; + + fn drive_unindexed(self, consumer: C) -> C::Result + where + C: rayon::iter::plumbing::UnindexedConsumer, + { + match self.iter_kind { + FileResultIterKind::Single(maybe_one) => { + let mut folder = consumer.into_folder(); + if let Some(one) = maybe_one { + folder = folder.consume(Ok(one)); + } + folder.complete() + } + FileResultIterKind::Archive(items) => { + items.into_par_iter().map(Ok).drive_unindexed(consumer) + } + } + } +} + +impl ParallelBlobIterator for FileResult { + type Iter<'a> = FileResultIter<'a>; + + fn into_blob_iter<'a>(self) -> Result>> { + let extraction_enabled = self.extract_archives; + let max_extraction_depth = self.extraction_depth; + + if extraction_enabled && is_compressed_file(&self.path) { + match decompress_file_to_temp(&self.path) { + Ok((content, _temp_dir)) => match content { + // Single-file decompression fully in memory. + CompressedContent::Raw(ref data) => { + let origin = OriginSet::new(Origin::from_file(self.path.clone()), vec![]); + let blob = Blob::from_bytes(data.to_vec()); + Ok(Some(FileResultIter { + iter_kind: FileResultIterKind::Single(Some((origin, blob))), + _marker: PhantomData, + })) + } + + // Single-file decompression streamed to a file. We read it back into memory + // here. + CompressedContent::RawFile(path) => { + let origin = OriginSet::new(Origin::from_file(self.path.clone()), vec![]); + let blob = Blob::from_file(&path)?; + Ok(Some(FileResultIter { + iter_kind: FileResultIterKind::Single(Some((origin, blob))), + _marker: PhantomData, + })) + } + + // Multi‑file archive (in‑memory). 
+ CompressedContent::Archive(ref files) => { + if max_extraction_depth == 0 { + debug!( + "Skipping nested archive (max depth reached): {}", + self.path.display() + ); + return Ok(None); + } + let items = files + .iter() + .map(|(filename, data)| { + let full_path = PathBuf::from(filename); + let nested_origin = + OriginSet::new(Origin::from_file(full_path), vec![]); + // Construct a FileResult for deeper extraction if needed (not used + // directly here) + let _ = FileResult { + path: self.path.join(filename), + num_bytes: data.len() as u64, + extract_archives: self.extract_archives, + extraction_depth: max_extraction_depth - 1, + }; + (nested_origin, Blob::from_bytes(data.to_vec())) + }) + .collect(); + Ok(Some(FileResultIter { + iter_kind: FileResultIterKind::Archive(items), + _marker: PhantomData, + })) + } + + // Multi‑file archive (files on disk). + CompressedContent::ArchiveFiles(ref entries) => { + if max_extraction_depth == 0 { + debug!( + "Skipping nested archive (max depth reached): {}", + self.path.display() + ); + return Ok(None); + } + // Read each extracted file from disk and create a Blob. 
+ let mut items = Vec::new(); + for (filename, disk_path) in entries { + let blob = match Blob::from_file(disk_path) { + Ok(b) => b, + Err(e) => { + debug!( + "Failed to mmap extracted file {}: {}", + disk_path.display(), + e + ); + continue; // skip unreadable / unmappable file + } + }; + let full_path = PathBuf::from(filename); + let nested_origin = + OriginSet::new(Origin::from_file(full_path), vec![]); + + // Construct a FileResult for deeper extraction if needed (not used + // directly here) + let _ = FileResult { + path: self.path.join(filename), + num_bytes: blob.len() as u64, + extract_archives: self.extract_archives, + extraction_depth: max_extraction_depth - 1, + }; + items.push((nested_origin, blob)); + } + Ok(Some(FileResultIter { + iter_kind: FileResultIterKind::Archive(items), + _marker: PhantomData, + })) + } + }, + Err(e) => { + debug!("Failed to decompress {}: {}", self.path.display(), e); + Ok(None) // Skip on decompression failure + } + } + } else { + // Not compressed or extraction disabled: read file as a single blob. + let blob = Blob::from_file(&self.path) + .with_context(|| format!("Failed to load blob from {}", self.path.display()))?; + let origin = OriginSet::new(Origin::from_file(self.path.clone()), vec![]); + Ok(Some(FileResultIter { + iter_kind: FileResultIterKind::Single(Some((origin, blob))), + _marker: PhantomData, + })) + } + } +} + +// A marker so the struct itself carries the lifetime. 
+struct GitRepoResultIter<'a> { + inner: GitRepoResult, + deadline: std::time::Instant, + _marker: std::marker::PhantomData<&'a ()>, +} + +impl ParallelBlobIterator for GitRepoResult { + type Iter<'a> = GitRepoResultIter<'a>; + + fn into_blob_iter<'a>(self) -> Result>> { + // placeholder 1 h deadline; will be overwritten immediately + const PLACEHOLDER: Duration = Duration::from_secs(3600); + + Ok(Some(GitRepoResultIter { + inner: self, + deadline: Instant::now() + PLACEHOLDER, + _marker: std::marker::PhantomData, + })) + } +} + +impl<'a> rayon::iter::ParallelIterator for GitRepoResultIter<'a> { + type Item = Result<(OriginSet, Blob<'a>)>; + + fn drive_unindexed(self, consumer: C) -> C::Result + where + C: rayon::iter::plumbing::UnindexedConsumer, + { + // ── shared state ────────────────────────────────────────────── + let repo_sync = self.inner.repository.into_sync(); + let repo_path = Arc::new(self.inner.path.clone()); + let deadline = self.deadline; + let flag = Arc::new(AtomicBool::new(false)); // first-timeout gate + + self.inner + .blobs + .into_par_iter() + .with_min_len(1024) + .map_init(|| repo_sync.to_thread_local(), { + let repo_path = Arc::clone(&repo_path); + let flag = Arc::clone(&flag); + + move |repo: &mut GixRepo, md| -> Result<(OriginSet, Blob)> { + // ── 10-minute guard ────────────────────────── + if StdInstant::now() > deadline { + if flag.swap(true, Ordering::Relaxed) { + bail!("__timeout_silenced__"); + } + bail!("blob-read timeout (repo: {})", repo_path.display()); + } + + // ── load blob ──────────────────────────────── + let blob_id = md.blob_oid; + let mut raw = repo.find_object(blob_id)?.try_into_blob()?; + let blob = Blob::new(BlobId::from(&blob_id), std::mem::take(&mut raw.data)); + + // ── build Origin — CLONE Arc & PathBuf ────── + let origin = OriginSet::try_from_iter(md.first_seen.iter().map(|e| { + Origin::from_git_repo_with_first_commit( + Arc::clone(&repo_path), + Arc::clone(&e.commit_metadata), + 
String::from_utf8_lossy(&e.path).to_string(), + ) + })) + .unwrap_or_else(|| Origin::from_git_repo(Arc::clone(&repo_path)).into()); + + Ok((origin, blob)) + } + }) + .filter(|res| { + !matches!(res, + Err(e) if e.to_string() == "__timeout_silenced__" + ) + }) + .drive_unindexed(consumer) + } +} + +struct EnumeratorFileIter<'a> { + inner: EnumeratorFileResult, + reader: std::io::BufReader, + _marker: PhantomData<&'a ()>, +} + +impl ParallelBlobIterator for EnumeratorFileResult { + type Iter<'a> = EnumeratorFileIter<'a>; + + fn into_blob_iter<'a>(self) -> Result>> { + let file = std::fs::File::open(&self.path)?; + let reader = std::io::BufReader::new(file); + Ok(Some(EnumeratorFileIter { inner: self, reader, _marker: PhantomData })) + } +} +enum FoundInputIter<'a> { + File(FileResultIter<'a>), + GitRepo(GitRepoResultIter<'a>), + EnumeratorFile(EnumeratorFileIter<'a>), +} + +// Enumerator file parallelism approach: +// +// - Split into lines sequentially +// - Parallelize JSON deserialization (JSON is an expensive serialization format, but easy to sling +// around, hence used here -- another format like Arrow or msgpack would be much more efficient) + +impl<'a> ParallelIterator for EnumeratorFileIter<'a> { + type Item = Result<(OriginSet, Blob<'a>)>; + + fn drive_unindexed(self, consumer: C) -> C::Result + where + C: rayon::iter::plumbing::UnindexedConsumer, + { + use std::io::BufRead; + (1usize..) 
+ .zip(self.reader.lines()) + .filter_map(|(line_num, line)| line.map(|line| (line_num, line)).ok()) + .par_bridge() + .map(|(line_num, line)| { + let e: EnumeratorBlobResult = serde_json::from_str(&line).with_context(|| { + format!("Error in enumerator {}:{line_num}", self.inner.path.display()) + })?; + // let origin = Origin::from_extended(e.origin).into(); + let origin = OriginSet::new(Origin::from_extended(e.origin), Vec::new()); + let blob = Blob::from_bytes(e.content.as_bytes().to_owned()); + Ok((origin, blob)) + }) + .drive_unindexed(consumer) + } +} + +trait ParallelBlobIterator { + /// The concrete parallel iterator returned by `into_blob_iter`. + /// It is generic over the lifetime `'a` that the produced `Blob<'a>` carries. + type Iter<'a>: ParallelIterator)>> + 'a + where + Self: 'a; + /// Convert the input into an *optional* parallel iterator of `(Origin, Blob)` tuples. + fn into_blob_iter<'a>(self) -> Result>> + where + Self: 'a; +} + +impl<'a> ParallelIterator for FoundInputIter<'a> { + type Item = Result<(OriginSet, Blob<'a>)>; + + fn drive_unindexed(self, consumer: C) -> C::Result + where + C: rayon::iter::plumbing::UnindexedConsumer, + { + match self { + FoundInputIter::File(i) => i.drive_unindexed(consumer), + FoundInputIter::GitRepo(i) => i.drive_unindexed(consumer), + FoundInputIter::EnumeratorFile(i) => i.drive_unindexed(consumer), + } + } +} +impl<'cfg> ParallelBlobIterator for (&'cfg EnumeratorConfig, FoundInput) { + type Iter<'a> + = FoundInputIter<'a> + where + Self: 'a; + + fn into_blob_iter<'a>(self) -> Result>> + where + 'cfg: 'a, + { + use std::time::Instant; + + let (cfg, input) = self; + + match input { + // ───────────── regular file ───────────── + FoundInput::File(i) => Ok(i.into_blob_iter()?.map(FoundInputIter::File)), + + // ───────────── directory (possible Git repo) ───────────── + FoundInput::Directory(i) => { + let path = &i.path; + let open_path_as_is = cfg.git_diff.is_none(); + + if open_path_as_is && 
!cfg.enumerate_git_history { + return Ok(None); + } + + // Try to open a Git repository at that path + let repository = match open_git_repo_with_options(path, open_path_as_is)? { + Some(r) => r, + None => return Ok(None), + }; + + debug!("Found Git repository at {}", path.display()); + let t_start = Instant::now(); + let collect_git_metadata = cfg.collect_git_metadata; + let timeout = cfg.repo_scan_timeout; + + // Spawn an enumerator thread so we can time-out cleanly + let path_clone = path.to_path_buf(); + let (tx, rx) = std::sync::mpsc::channel(); + let exclude_globset = cfg.exclude_globset.clone(); + let diff_cfg = cfg.git_diff.clone(); + let handle = std::thread::spawn(move || { + let res = if let Some(diff_cfg) = diff_cfg { + enumerate_git_diff_repo( + &path_clone, + repository, + diff_cfg, + exclude_globset.clone(), + collect_git_metadata, + ) + } else if collect_git_metadata { + GitRepoWithMetadataEnumerator::new( + &path_clone, + repository, + exclude_globset.clone(), + ) + .run() + } else { + GitRepoEnumerator::new(&path_clone, repository).run() + }; + let _ = tx.send(res); + }); + + // Wait for enumeration, polling every 100 ms + let git_result = loop { + if t_start.elapsed() > timeout { + debug!( + "Git repo enumeration at {} timed-out after {:.1}s (> {} s)", + path.display(), + t_start.elapsed().as_secs_f64(), + timeout.as_secs() + ); + // Abandon the worker thread and skip this repo + return Ok(None); + } + + match rx.try_recv() { + Ok(res) => break res, + Err(std::sync::mpsc::TryRecvError::Empty) => { + std::thread::sleep(std::time::Duration::from_millis(100)); + } + Err(std::sync::mpsc::TryRecvError::Disconnected) => { + debug!("Enumerator thread disconnected for {}", path.display()); + return Ok(None); + } + } + }; + + let _ = handle.join(); // avoid leak + + match git_result { + Err(e) => { + debug!("Failed to enumerate Git repo at {}: {e}", path.display()); + Ok(None) + } + Ok(repo_result) => { + debug!( + "Enumerated Git repo at {} in {:.2}s", + 
path.display(), + t_start.elapsed().as_secs_f64() + ); + + // Convert to a blob iterator, then patch the deadline + repo_result + .into_blob_iter() // Option + .map(|iter| { + iter.map(|mut gri| { + gri.deadline = Instant::now() + timeout; + FoundInputIter::GitRepo(gri) + }) + }) + } + } + } + + // ───────────── pre-enumerated JSON file list ───────────── + FoundInput::EnumeratorFile(i) => { + Ok(i.into_blob_iter()?.map(FoundInputIter::EnumeratorFile)) + } + } + } +} + +fn enumerate_git_diff_repo( + path: &Path, + repository: gix::Repository, + diff_cfg: GitDiffConfig, + exclude_globset: Option>, + collect_commit_metadata: bool, +) -> Result { + let GitDiffConfig { since_ref, branch_ref } = diff_cfg; + + let blobs = { + let head_id = resolve_diff_ref(&repository, path, &branch_ref).with_context(|| { + format!("Failed to resolve --branch '{}' in repository {}", branch_ref, path.display()) + })?; + + let head_commit = head_id + .object() + .with_context(|| format!("Failed to load commit {} for diffing", head_id.to_hex()))? + .try_into_commit() + .with_context(|| format!("Referenced object {} is not a commit", head_id.to_hex()))?; + + let head_tree = head_commit + .tree() + .with_context(|| format!("Failed to read tree for commit {}", head_id.to_hex()))?; + + let mut base_tree = None; + + if let Some(ref since_ref_value) = since_ref { + let base_id = + resolve_diff_ref(&repository, path, since_ref_value).with_context(|| { + format!( + "Failed to resolve --since-commit '{}' in repository {}", + since_ref_value, + path.display() + ) + })?; + + let commit = base_id + .object() + .with_context(|| format!("Failed to load commit {} for diffing", base_id.to_hex()))? 
+ .try_into_commit() + .with_context(|| { + format!("Referenced object {} is not a commit", base_id.to_hex()) + })?; + let tree = commit + .tree() + .with_context(|| format!("Failed to read tree for commit {}", base_id.to_hex()))?; + + base_tree = Some(tree); + } + + let changes = repository + .diff_tree_to_tree(base_tree.as_ref(), Some(&head_tree), None) + .with_context(|| { + if let Some(ref since_ref_value) = since_ref { + format!( + "Failed to compute diff between '{}' and '{}'", + since_ref_value, branch_ref + ) + } else { + format!("Failed to compute tree for '{}'", branch_ref) + } + })?; + + let commit_metadata = if collect_commit_metadata { + let committer = head_commit + .committer() + .with_context(|| format!("Failed to read committer for {}", branch_ref))? + .trim(); + let timestamp = committer.time().unwrap_or_else(|_| gix::date::Time::new(0, 0)); + Arc::new(CommitMetadata { + commit_id: head_commit.id, + committer_name: committer.name.to_str_lossy().into_owned(), + committer_email: committer.email.to_str_lossy().into_owned(), + committer_timestamp: timestamp, + }) + } else { + Arc::new(CommitMetadata { + commit_id: head_commit.id, + committer_name: String::new(), + committer_email: String::new(), + committer_timestamp: gix::date::Time::new(0, 0), + }) + }; + + let mut blobs = Vec::new(); + for change in changes { + let (entry_mode, id, location) = match change { + ChangeDetached::Addition { entry_mode, id, location, .. } => { + (entry_mode, id, location) + } + ChangeDetached::Modification { entry_mode, id, location, .. } => { + (entry_mode, id, location) + } + ChangeDetached::Rewrite { entry_mode, id, location, .. } => { + (entry_mode, id, location) + } + ChangeDetached::Deletion { .. 
} => continue, + }; + + match entry_mode.kind() { + EntryKind::Blob | EntryKind::BlobExecutable | EntryKind::Link => {} + _ => continue, + } + + let relative_path_str = String::from_utf8_lossy(location.as_ref()).into_owned(); + let relative_path = Path::new(&relative_path_str); + if let Some(gs) = &exclude_globset { + if gs.is_match(relative_path) || gs.is_match(&path.join(relative_path)) { + debug!( + "Skipping {} due to --exclude while diffing {}", + relative_path.display(), + path.display() + ); + continue; + } + } + + let appearance = + BlobAppearance { commit_metadata: Arc::clone(&commit_metadata), path: location }; + blobs.push(GitBlobMetadata { blob_oid: id, first_seen: smallvec![appearance] }); + } + + blobs + }; + + Ok(GitRepoResult { repository, path: path.to_owned(), blobs }) +} + +fn resolve_diff_ref<'repo>( + repository: &'repo gix::Repository, + path: &Path, + reference: &str, +) -> Result> { + let mut candidates = reference_candidates(reference); + if candidates.is_empty() { + candidates.push(reference.to_string()); + } + + let mut last_err: Option = None; + for candidate in &candidates { + match repository.rev_parse_single(candidate.as_bytes()) { + Ok(id) => return Ok(id), + Err(err) => last_err = Some(err.into()), + } + } + + let attempted = candidates.join(", "); + let err = last_err.unwrap_or_else(|| { + anyhow!("Reference resolution failed for '{}' without a more specific error", reference) + }); + Err(err).with_context(|| { + if attempted.is_empty() { + format!("Failed to resolve reference '{}' in repository {}", reference, path.display()) + } else { + format!( + "Failed to resolve reference '{}' in repository {} (tried: {})", + reference, + path.display(), + attempted + ) + } + }) +} + +fn reference_candidates(reference: &str) -> Vec { + fn push_unique(vec: &mut Vec, candidate: String) { + if !vec.iter().any(|existing| existing == &candidate) { + vec.push(candidate); + } + } + + let trimmed = reference.trim(); + if trimmed.is_empty() { + 
return Vec::new(); + } + + let mut candidates = Vec::new(); + push_unique(&mut candidates, trimmed.to_string()); + + if trimmed.eq_ignore_ascii_case("HEAD") { + return candidates; + } + + if trimmed.starts_with("refs/") { + return candidates; + } + + push_unique(&mut candidates, format!("refs/heads/{trimmed}")); + push_unique(&mut candidates, format!("refs/tags/{trimmed}")); + + if let Some((remote, rest)) = trimmed.split_once('/') { + if remote == "origin" { + if !rest.is_empty() { + push_unique(&mut candidates, format!("refs/remotes/{remote}/{rest}")); + } + } else if !rest.is_empty() { + push_unique(&mut candidates, format!("refs/remotes/origin/{trimmed}")); + push_unique(&mut candidates, format!("refs/remotes/{remote}/{rest}")); + } + } else { + push_unique(&mut candidates, format!("origin/{trimmed}")); + push_unique(&mut candidates, format!("refs/remotes/origin/{trimmed}")); + } + + candidates +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::path::Path; + + use super::{enumerate_git_diff_repo, GitDiffConfig}; + use anyhow::Result; + use bstr::ByteSlice; + use git2::{Repository as Git2Repository, Signature}; + use gix::{open::Options, open_opts}; + use tempfile::tempdir; + + use super::reference_candidates; + + #[test] + fn reference_candidates_for_plain_branch() { + assert_eq!( + reference_candidates("main"), + vec![ + "main".to_string(), + "refs/heads/main".to_string(), + "refs/tags/main".to_string(), + "origin/main".to_string(), + "refs/remotes/origin/main".to_string(), + ] + ); + } + + #[test] + fn reference_candidates_for_remote_branch() { + assert_eq!( + reference_candidates("origin/feature"), + vec![ + "origin/feature".to_string(), + "refs/heads/origin/feature".to_string(), + "refs/tags/origin/feature".to_string(), + "refs/remotes/origin/feature".to_string(), + ] + ); + } + + #[test] + fn reference_candidates_for_branch_with_path() { + assert_eq!( + reference_candidates("feature/foo"), + vec![ + "feature/foo".to_string(), + 
"refs/heads/feature/foo".to_string(), + "refs/tags/feature/foo".to_string(), + "refs/remotes/origin/feature/foo".to_string(), + "refs/remotes/feature/foo".to_string(), + ] + ); + } + + #[test] + fn reference_candidates_for_explicit_ref() { + assert_eq!(reference_candidates("refs/heads/main"), vec!["refs/heads/main".to_string()]); + } + + #[test] + fn reference_candidates_for_head_symbol() { + assert_eq!(reference_candidates("HEAD"), vec!["HEAD".to_string()]); + } + + #[test] + fn enumerate_git_diff_repo_branch_without_since_scans_head_tree() -> Result<()> { + let temp = tempdir()?; + let repo_path = temp.path().join("repo"); + let repo = Git2Repository::init(&repo_path)?; + let signature = Signature::now("tester", "tester@example.com")?; + + let tracked_file = repo_path.join("secret.txt"); + fs::create_dir_all(tracked_file.parent().unwrap())?; + fs::write(&tracked_file, b"super-secret")?; + + let mut index = repo.index()?; + index.add_path(Path::new("secret.txt"))?; + let tree_id = index.write_tree()?; + let tree = repo.find_tree(tree_id)?; + let commit_id = repo.commit(Some("HEAD"), &signature, &signature, "initial", &tree, &[])?; + let commit = repo.find_commit(commit_id)?; + repo.branch("featurefake", &commit, true)?; + + let git_dir = repo_path.join(".git"); + let gix_repo = open_opts(&git_dir, Options::isolated().open_path_as_is(true))?; + let result = enumerate_git_diff_repo( + &repo_path, + gix_repo, + GitDiffConfig { since_ref: None, branch_ref: "featurefake".to_string() }, + None, + false, + )?; + + assert_eq!(result.blobs.len(), 1, "expected the full branch tree to be enumerated"); + let blob = &result.blobs[0]; + assert_eq!(blob.first_seen.len(), 1); + let appearance_path = blob.first_seen[0].path.to_str_lossy(); + assert_eq!(appearance_path, "secret.txt"); + + Ok(()) + } +} + +/// A simple enum describing how we yield file content: +/// - Single: one `(origin, blob)` +/// - Archive: multiple `(origin, blob)` items from a decompressed archive +enum 
FileResultIterKind { + Single(Option<(OriginSet, OwnedBlob)>), + Archive(Vec<(OriginSet, OwnedBlob)>), +} + +#[derive(Deserialize)] +pub enum Content { + #[serde(rename = "content_base64")] + Base64(#[serde(deserialize_with = "deserialize_b64_bstring")] BString), + + #[serde(rename = "content")] + Utf8(String), +} + +impl Content { + pub fn as_bytes(&self) -> &[u8] { + match self { + Content::Base64(s) => s.as_slice(), + Content::Utf8(s) => s.as_bytes(), + } + } +} + +fn deserialize_b64_bstring<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let encoded = String::deserialize(deserializer)?; + let decoded = STANDARD.decode(&encoded).map_err(serde::de::Error::custom)?; + Ok(decoded.into()) +} + +// ------------------------------------------------------------------------------------------------- +/// An entry deserialized from an extensible enumerator +#[derive(serde::Deserialize)] +struct EnumeratorBlobResult { + #[serde(flatten)] + pub content: Content, + + pub origin: serde_json::Value, +} diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs index 54379a3..d950c47 100644 --- a/tests/int_allowlist.rs +++ b/tests/int_allowlist.rs @@ -120,6 +120,8 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result Result<()> { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index 2f1998c..b6eaae9 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -140,6 +140,8 @@ rules: scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 5.0, diff --git a/tests/int_github.rs b/tests/int_github.rs index 82a0f78..f96dd16 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -127,6 +127,8 @@ fn test_github_remote_scan() -> Result<()> { 
scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index 745f323..71421d1 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -125,6 +125,8 @@ fn test_gitlab_remote_scan() -> Result<()> { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { @@ -271,6 +273,8 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, gcs_bucket: None, gcs_prefix: None, gcs_service_account: None, diff --git a/tests/int_redact.rs b/tests/int_redact.rs index a570cfe..86b9dd1 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -103,6 +103,8 @@ async fn test_redact_hashes_finding_values() -> Result<()> { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/tests/int_slack.rs b/tests/int_slack.rs index dd9df68..d2d9048 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -111,6 +111,8 @@ impl TestContext { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, extra_ignore_comments: Vec::new(), content_filtering_args: ContentFilteringArgs { @@ -248,6 +250,8 @@ async fn test_scan_slack_messages() -> Result<()> { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index ba81576..d868936 100644 --- a/tests/int_validation_cache.rs +++ 
b/tests/int_validation_cache.rs @@ -183,6 +183,8 @@ async fn test_validation_cache_and_depvars() -> Result<()> { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 5dec6b5..9a17797 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -126,6 +126,8 @@ impl TestContext { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, }, content_filtering_args: ContentFilteringArgs { max_file_size_mb: 25.0, @@ -247,6 +249,8 @@ impl TestContext { scan_nested_repos: true, since_commit: None, branch: None, + branch_root: false, + branch_root_commit: None, gcs_bucket: None, gcs_prefix: None, diff --git a/tests/smoke_branch.rs b/tests/smoke_branch.rs index f79402b..927ca2d 100644 --- a/tests/smoke_branch.rs +++ b/tests/smoke_branch.rs @@ -117,3 +117,135 @@ aws_secret_access_key = efnegoUp/WXc3XwlL77dXu1aKIICzvz+n+7Sz88i Ok(()) } + +#[test] +fn scan_branch_root_inclusive_history() -> anyhow::Result<()> { + let dir = tempdir()?; + let repo_dir = dir.path().join("repo"); + let repo = Repository::init(&repo_dir)?; + let signature = Signature::now("tester", "tester@example.com")?; + + let secrets_path = repo_dir.join("secrets.txt"); + + let aws_value = "UpUbsQANRHLf2uuQ7QOlNXPbbtV5fmseW/GgTs5D/"; + let gcp_value = "c4c474d61701fd6fd4191883b8fea9a8411bf771"; + let slack_value = "xoxb-123465789012-0987654321123-AbDcEfGhIjKlMnOpQrStUvWx"; + let github_value = "ghp_aBcDeFgHiJkLmNoqpRsTuVwXyZ1243567890"; + let stripe_value = + "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"; + + let aws_line = "AWS_SECRET_ACCESS_KEY = 'UpUbsQANRHLf2uuQ7QOlNXPbbtV5fmseW/GgTs5D/'"; + let gcp_line = "GCP_PRIVATE_KEY_ID = 
'c4c474d61701fd6fd4191883b8fea9a8411bf771'"; + let slack_line = "SLACK_BOT_TOKEN = 'xoxb-123465789012-0987654321123-AbDcEfGhIjKlMnOpQrStUvWx'"; + let github_line = "GITHUB_TOKEN = 'ghp_aBcDeFgHiJkLmNoqpRsTuVwXyZ1243567890'"; + let stripe_line = concat!( + "STRIPE_SECRET_KEY = '", + "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", + "'", + ); + + fs::write(&secrets_path, aws_line)?; + + let mut index = repo.index()?; + index.add_path(Path::new("secrets.txt"))?; + let tree_id = index.write_tree()?; + let tree = repo.find_tree(tree_id)?; + let initial_commit_id = + repo.commit(Some("HEAD"), &signature, &signature, "Add AWS secret", &tree, &[])?; + let initial_commit = repo.find_commit(initial_commit_id)?; + let initial_commit_hex = initial_commit_id.to_string(); + + let additions = [ + ("Add GCP private key id", gcp_line), + ("Add Slack bot token", slack_line), + ("Add GitHub PAT", github_line), + ("Add Stripe API key", stripe_line), + ]; + + let mut parent_commit = initial_commit; + let mut contents = String::from(aws_line); + + for (message, line) in additions { + contents.push('\n'); + contents.push_str(line); + fs::write(&secrets_path, &contents)?; + + let mut index = repo.index()?; + index.add_path(Path::new("secrets.txt"))?; + let tree_id = index.write_tree()?; + let tree = repo.find_tree(tree_id)?; + let new_commit_id = + repo.commit(Some("HEAD"), &signature, &signature, message, &tree, &[&parent_commit])?; + parent_commit = repo.find_commit(new_commit_id)?; + } + + let latest_commit_hex = parent_commit.id().to_string(); + repo.branch("long-lived", &parent_commit, true)?; + + // Scanning the initial commit without --branch-root should report only the + // secret present at that commit. + Command::cargo_bin("kingfisher")? 
+ .args([ + "scan", + repo_dir.to_str().unwrap(), + "--branch", + initial_commit_hex.as_str(), + "--no-validate", + "--no-update-check", + ]) + .assert() + .code(200) + .stdout( + contains(aws_value) + .and(contains(gcp_value).not()) + .and(contains(slack_value).not()) + .and(contains(github_value).not()) + .and(contains(stripe_value).not()), + ); + + // Using --branch-root should include the selected commit and the remaining + // branch history up to HEAD, surfacing the later secrets too. + Command::cargo_bin("kingfisher")? + .args([ + "scan", + repo_dir.to_str().unwrap(), + "--branch", + initial_commit_hex.as_str(), + "--branch-root", + "--no-validate", + "--no-update-check", + ]) + .assert() + .code(200) + .stdout( + contains(aws_value) + .and(contains(gcp_value)) + .and(contains(slack_value)) + .and(contains(github_value)) + .and(contains(stripe_value)), + ); + + Command::cargo_bin("kingfisher")? + .args([ + "scan", + repo_dir.to_str().unwrap(), + "--branch", + "long-lived", + "--branch-root-commit", + initial_commit_hex.as_str(), + "--no-validate", + "--no-update-check", + ]) + .assert() + .code(200) + .stdout( + contains(aws_value) + .and(contains(gcp_value)) + .and(contains(slack_value)) + .and(contains(github_value)) + .and(contains(stripe_value)) + .and(contains(latest_commit_hex.as_str())), + ); + + Ok(()) +}