forked from mirrors/kingfisher
Improved language detection
This commit is contained in:
parent
7a0ea600d3
commit
0ce72739dc
8 changed files with 104 additions and 204 deletions
|
|
@ -3,7 +3,8 @@
|
|||
All notable changes to this project will be documented in this file.
|
||||
|
||||
## [1.42.0]
|
||||
- Internal refactoring of rule loader and git enumerator
|
||||
- Internal refactoring of rule loader, git enumerator, and filetype guesser
|
||||
- Improved language detection
|
||||
|
||||
## [1.41.0]
|
||||
- Added support for scanning gitlab subgroups, with `kingfisher scan --gitlab-group my-group --gitlab-include-subgroups`
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ publish = false
|
|||
|
||||
[package]
|
||||
name = "kingfisher"
|
||||
version = "1.41.0"
|
||||
version = "1.42.0"
|
||||
description = "MongoDB's blazingly fast secret scanning and validation tool"
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
|
|
@ -141,6 +141,7 @@ tree-sitter-typescript = "0.23.2"
|
|||
tree-sitter-yaml = "0.7.1"
|
||||
streaming-iterator = "0.1.9"
|
||||
tree-sitter-regex = "0.24.3"
|
||||
tree_magic_mini = "3.2"
|
||||
content_inspector = "0.2.4"
|
||||
rustc-hash = "2.1.1"
|
||||
term_size = "0.3.2"
|
||||
|
|
@ -187,6 +188,7 @@ walkdir = "2.5.0"
|
|||
p256 = "0.13.2"
|
||||
ed25519-dalek = { version = "2.2", features = ["pkcs8"] }
|
||||
aws-sdk-s3 = "1.100.0"
|
||||
tokei = "12.1.2"
|
||||
|
||||
[dependencies.tikv-jemallocator]
|
||||
version = "0.6"
|
||||
|
|
|
|||
|
|
@ -1,9 +1,11 @@
|
|||
// Requires: tokei = "12" in Cargo.toml
|
||||
use std::path::Path;
|
||||
use tokei::LanguageType;
|
||||
|
||||
/// The type of content detected.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum ContentType {
|
||||
/// Unprintable or control‑heavy data.
|
||||
/// Unprintable or control-heavy data.
|
||||
BINARY,
|
||||
/// Mostly printable text.
|
||||
TEXT,
|
||||
|
|
@ -30,7 +32,7 @@ impl ContentInspector {
|
|||
|
||||
/// Classify `bytes` as TEXT or BINARY:
|
||||
///
|
||||
/// 1. If null‑byte count > `max_null_bytes` -- `BINARY`.
|
||||
/// 1. If null-byte count > `max_null_bytes` → `BINARY`.
|
||||
/// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` →
|
||||
/// `BINARY`.
|
||||
/// 3. Otherwise, `TEXT`.
|
||||
|
|
@ -76,62 +78,89 @@ impl ContentInspector {
|
|||
Some(mime.to_string())
|
||||
}
|
||||
|
||||
/// Detect UTF‑8 encoding by attempting a lossless conversion.
|
||||
/// Detect UTF-8 encoding by attempting a lossless conversion.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn guess_charset(&self, bytes: &[u8]) -> Option<String> {
|
||||
String::from_utf8(bytes.to_vec()).ok().map(|_| "UTF-8".to_string())
|
||||
}
|
||||
|
||||
/// Guess programming language by extension, else simple content markers.
|
||||
/// Guess programming language with broad coverage using `tokei`.
|
||||
///
|
||||
/// Extension mapping covers common languages (Rust, Python, JS, etc.).
|
||||
/// Fallback checks for `<?php`, `package main`, `public class`, or shebangs.
|
||||
/// Strategy (no disk I/O):
|
||||
/// 1) Try extension via `LanguageType::from_file_extension`.
|
||||
/// 2) Handle common extensionless filenames (e.g., Makefile, Dockerfile, CMakeLists.txt).
|
||||
/// 3) Parse an in-memory shebang (first line) against `LanguageType::shebangs`.
|
||||
/// 4) Minimal content markers as a last resort.
|
||||
///
|
||||
/// Returns the canonical `tokei` language name (e.g., `Rust`, `Bash`, `Python`).
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option<String> {
|
||||
// 1) Extension mapping (fast, no I/O).
|
||||
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
||||
let name = match ext.to_ascii_lowercase().as_str() {
|
||||
"rs" => "Rust",
|
||||
"py" => "Python",
|
||||
"js" => "JavaScript",
|
||||
"ts" => "TypeScript",
|
||||
"java" => "Java",
|
||||
"c" => "C",
|
||||
"cpp" | "cc" | "cxx" => "C++",
|
||||
"go" => "Go",
|
||||
"rb" => "Ruby",
|
||||
"php" => "PHP",
|
||||
"cs" => "C#",
|
||||
"kt" | "kts" => "Kotlin",
|
||||
"scala" => "Scala",
|
||||
"swift" => "Swift",
|
||||
"sh" => "Shell",
|
||||
"pl" => "Perl",
|
||||
"lua" => "Lua",
|
||||
"hs" => "Haskell",
|
||||
"r" => "R",
|
||||
_ => "",
|
||||
};
|
||||
if !name.is_empty() {
|
||||
return Some(name.to_string());
|
||||
if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) {
|
||||
return Some(lang.name().to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure).
|
||||
if let Some(file) = path.file_name().and_then(|f| f.to_str()) {
|
||||
match file {
|
||||
"Makefile" | "makefile" => {
|
||||
return Some(LanguageType::Makefile.name().to_string());
|
||||
}
|
||||
"Dockerfile" | "dockerfile" => {
|
||||
return Some(LanguageType::Dockerfile.name().to_string());
|
||||
}
|
||||
"CMakeLists.txt" => {
|
||||
return Some(LanguageType::CMake.name().to_string());
|
||||
}
|
||||
"Rakefile" | "rakefile" => {
|
||||
return Some(LanguageType::Rakefile.name().to_string());
|
||||
}
|
||||
// Common ecosystem files; map to their primary language where sensible.
|
||||
"Gemfile" | "gemfile" => {
|
||||
return Some(LanguageType::Ruby.name().to_string());
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// 3) Shebang detection (in-memory): compare the first line to known shebangs.
|
||||
if let Some(first_line) = content.split(|&b| b == b'\n').next() {
|
||||
if first_line.starts_with(b"#!") {
|
||||
if let Ok(line) = std::str::from_utf8(first_line) {
|
||||
for &lang in LanguageType::list() {
|
||||
for &sb in lang.shebangs() {
|
||||
if line.starts_with(sb) {
|
||||
return Some(lang.name().to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 4) Lightweight content markers to catch a few ubiquitous cases without I/O.
|
||||
let s = String::from_utf8_lossy(content);
|
||||
if s.contains("<?php") {
|
||||
Some("PHP".to_string())
|
||||
} else if s.contains("package main") {
|
||||
Some("Go".to_string())
|
||||
} else if s.contains("public class") {
|
||||
Some("Java".to_string())
|
||||
} else if s.contains("#!/usr/bin/env bash") || s.contains("#!/bin/bash") {
|
||||
Some("Shell".to_string())
|
||||
} else if s.contains("#!/usr/bin/env python") {
|
||||
Some("Python".to_string())
|
||||
} else {
|
||||
None
|
||||
return Some(LanguageType::Php.name().to_string());
|
||||
}
|
||||
if s.contains("package main") {
|
||||
return Some(LanguageType::Go.name().to_string());
|
||||
}
|
||||
if s.contains("public class") {
|
||||
return Some(LanguageType::Java.name().to_string());
|
||||
}
|
||||
if s.contains("#!/usr/bin/env bash") || s.contains("#!/bin/bash") {
|
||||
return Some(LanguageType::Bash.name().to_string());
|
||||
}
|
||||
if s.contains("#!/usr/bin/env python") {
|
||||
return Some(LanguageType::Python.name().to_string());
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -175,11 +204,21 @@ mod tests {
|
|||
#[test]
|
||||
fn language_guess() {
|
||||
let ins = ContentInspector::default();
|
||||
assert_eq!(ins.guess_language(&PathBuf::from("main.rs"), b""), Some("Rust".into()));
|
||||
assert_eq!(ins.guess_language(&PathBuf::from("x"), b"<?php echo; ?>"), Some("PHP".into()));
|
||||
assert_eq!(
|
||||
ins.guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi"),
|
||||
Some("Shell".into())
|
||||
);
|
||||
|
||||
// Compare case-insensitively by lowercasing both sides.
|
||||
let rust = ins
|
||||
.guess_language(&PathBuf::from("main.rs"), b"")
|
||||
.map(|s| s.to_ascii_lowercase());
|
||||
assert_eq!(rust, Some("rust".into()));
|
||||
|
||||
let php = ins
|
||||
.guess_language(&PathBuf::from("x"), b"<?php echo; ?>")
|
||||
.map(|s| s.to_ascii_lowercase());
|
||||
assert_eq!(php, Some("php".into()));
|
||||
|
||||
let bash = ins
|
||||
.guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi")
|
||||
.map(|s| s.to_ascii_lowercase());
|
||||
assert_eq!(bash, Some("bash".into()));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
129
src/guesser.rs
129
src/guesser.rs
|
|
@ -1,129 +0,0 @@
|
|||
// Fixed guesser.rs
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
use crate::content_type::ContentInspector;
|
||||
|
||||
/// Borrowed input handed to the guesser: content bytes, optionally paired
/// with the path they came from.
///
/// The enum only holds references, so it is cheap to copy.
#[derive(Debug, Clone, Copy)]
pub enum Input<'a> {
    /// Content only; no path is known.
    Bytes(&'a [u8]),
    /// Content together with the path it is associated with.
    PathAndBytes(&'a Path, &'a [u8]),
}

impl<'a> Input<'a> {
    /// Wrap bare content bytes.
    #[must_use]
    pub fn from_bytes(bytes: &'a [u8]) -> Self {
        Self::Bytes(bytes)
    }

    /// Wrap content bytes together with their path.
    #[must_use]
    pub fn from_path_and_bytes(path: &'a Path, bytes: &'a [u8]) -> Self {
        Self::PathAndBytes(path, bytes)
    }
}
|
||||
/// Result from content guessing.
///
/// Holds an optional MIME type, a list of MIME parameters (for example
/// `charset`), and an optional content/language guess. `Default` yields a
/// guess with every part empty.
#[derive(Debug, Default)]
pub struct Guess {
    /// MIME type, e.g. `text/plain`.
    mime_type: Option<String>,
    /// `(name, value)` MIME parameters, in insertion order.
    mime_params: Vec<(String, String)>,
    /// Language name guessed for the content.
    content_guess: Option<String>,
}

impl Guess {
    /// The guessed MIME type, if any.
    #[must_use]
    pub fn path_guess(&self) -> Option<&str> {
        self.mime_type.as_deref()
    }

    /// The guessed content language, if any.
    #[must_use]
    pub fn content_guess(&self) -> Option<&str> {
        self.content_guess.as_deref()
    }

    /// The MIME type without parameters; always equal to `path_guess()`.
    #[must_use]
    pub fn essence_str(&self) -> Option<&str> {
        // Delegate so the two accessors cannot drift apart.
        self.path_guess()
    }

    /// Value of the first MIME parameter whose name equals `param`.
    ///
    /// Returns an owned copy of the value, or `None` when no such parameter
    /// was recorded.
    #[must_use]
    pub fn get_param(&self, param: &str) -> Option<String> {
        self.mime_params
            .iter()
            .find_map(|(name, value)| (name == param).then(|| value.clone()))
    }
}
|
||||
/// Content guesser with configurable inspector
|
||||
pub struct Guesser {
|
||||
inspector: ContentInspector,
|
||||
}
|
||||
|
||||
impl Guesser {
|
||||
pub fn new() -> Result<Self> {
|
||||
Ok(Self { inspector: ContentInspector::default() })
|
||||
}
|
||||
|
||||
pub fn guess(&self, input: Input<'_>) -> Guess {
|
||||
let mut guess = Guess { mime_type: None, mime_params: Vec::new(), content_guess: None };
|
||||
match input {
|
||||
Input::Bytes(bytes) => {
|
||||
guess.mime_type = Some("text/plain".to_string());
|
||||
if let Some(charset) = self.inspector.guess_charset(bytes) {
|
||||
guess.mime_params.push(("charset".to_string(), charset));
|
||||
}
|
||||
}
|
||||
Input::PathAndBytes(path, bytes) => {
|
||||
// Try to get MIME type from extension
|
||||
match self.inspector.guess_mime_type(path) {
|
||||
Some(mime) => guess.mime_type = Some(mime),
|
||||
None => guess.mime_type = Some("application/octet-stream".into()),
|
||||
}
|
||||
|
||||
// Charset (if textual)
|
||||
if let Some(charset) = self.inspector.guess_charset(bytes) {
|
||||
guess.mime_params.push(("charset".into(), charset));
|
||||
}
|
||||
// Try to guess language
|
||||
guess.content_guess = self.inspector.guess_language(path, bytes);
|
||||
}
|
||||
}
|
||||
guess
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::*;

    /// Exercises both `Input` variants through `Guesser::guess`.
    #[test]
    fn test_guesser() -> Result<()> {
        let guesser = Guesser::new()?;
        let bytes = b"Hello World";

        // Bytes only: MIME type is the fixed `text/plain`, no language guess.
        // `assert_eq!` already prints both sides on failure, so no custom
        // message is needed.
        let guess = guesser.guess(Input::from_bytes(bytes));
        assert_eq!(guess.path_guess(), Some("text/plain"));
        assert_eq!(guess.content_guess(), None);

        // Path and bytes: this test expects the fallback MIME type for
        // `test.rs`, and a language guess of `Rust`.
        let guess = guesser.guess(Input::from_path_and_bytes(Path::new("test.rs"), bytes));
        assert_eq!(guess.path_guess(), Some("application/octet-stream"));
        assert_eq!(guess.content_guess(), Some("Rust"));

        Ok(())
    }
}
|
||||
|
|
@ -18,7 +18,6 @@ mod git_repo_enumerator;
|
|||
pub mod git_url;
|
||||
pub mod github;
|
||||
pub mod gitlab;
|
||||
pub mod guesser;
|
||||
pub mod jira;
|
||||
pub mod liquid_filters;
|
||||
pub mod location;
|
||||
|
|
|
|||
|
|
@ -25,7 +25,6 @@ use crate::{
|
|||
cli::commands::{github::GitHistoryMode, scan},
|
||||
decompress::{decompress_file_to_temp, CompressedContent},
|
||||
findings_store,
|
||||
guesser::Guesser,
|
||||
matcher::{Matcher, MatcherStats},
|
||||
open_git_repo,
|
||||
origin::{Origin, OriginSet},
|
||||
|
|
@ -130,12 +129,11 @@ pub fn enumerate_filesystem_inputs(
|
|||
let make_blob_processor = || -> BlobProcessor {
|
||||
let t1 = Instant::now();
|
||||
*num_blob_processors.lock().unwrap() += 1;
|
||||
let guesser = Guesser::new().expect("should be able to create filetype guesser");
|
||||
{
|
||||
let mut init_time = blob_processor_init_time.lock().unwrap();
|
||||
*init_time += t1.elapsed();
|
||||
}
|
||||
BlobProcessor { matcher, guesser }
|
||||
BlobProcessor { matcher }
|
||||
};
|
||||
let scan_res: Result<()> = input_recv
|
||||
.into_iter()
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ use tracing::{debug_span, trace};
|
|||
|
||||
use crate::{
|
||||
blob::{Blob, BlobMetadata},
|
||||
guesser::{Guesser, Input},
|
||||
content_type::ContentInspector,
|
||||
location::LocationMapping,
|
||||
matcher::{Match, Matcher, OwnedBlobMatch, ScanResult},
|
||||
origin::{Origin, OriginSet},
|
||||
|
|
@ -12,11 +12,10 @@ use crate::{
|
|||
Path,
|
||||
};
|
||||
|
||||
/// A combined matcher, content type guesser, and a number of parameters that
|
||||
/// don't change within one `scan` run
|
||||
/// A matcher along with parameters that remain constant during a single
|
||||
/// `scan` run
|
||||
pub struct BlobProcessor<'a> {
|
||||
pub matcher: Matcher<'a>,
|
||||
pub guesser: Guesser,
|
||||
}
|
||||
|
||||
impl<'a> BlobProcessor<'a> {
|
||||
|
|
@ -61,7 +60,7 @@ impl<'a> BlobProcessor<'a> {
|
|||
if matches.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
let md = MetadataResult::from_blob_and_origin(&self.guesser, &blob, &origin);
|
||||
let md = MetadataResult::from_blob_and_origin(&blob, &origin);
|
||||
let metadata = BlobMetadata {
|
||||
id: blob.id,
|
||||
num_bytes: blob.len(),
|
||||
|
|
@ -109,19 +108,13 @@ struct MetadataResult {
|
|||
charset: Option<String>,
|
||||
}
|
||||
impl MetadataResult {
|
||||
fn from_blob_and_origin(guesser: &Guesser, blob: &Blob, origin: &OriginSet) -> MetadataResult {
|
||||
fn from_blob_and_origin(blob: &Blob, origin: &OriginSet) -> MetadataResult {
|
||||
let blob_path: Option<&'_ Path> = origin.iter().find_map(|p| p.blob_path());
|
||||
let input = match blob_path {
|
||||
None => Input::from_bytes(&blob.bytes()), // Use Input directly
|
||||
Some(blob_path) => {
|
||||
Input::from_path_and_bytes(blob_path, &blob.bytes()) // Use Input
|
||||
// directly
|
||||
}
|
||||
};
|
||||
let guess = guesser.guess(input);
|
||||
let mime_essence = guess.path_guess().map(|s| s.to_string());
|
||||
let language = guess.content_guess().map(ToOwned::to_owned);
|
||||
let charset = guess.path_guess().and_then(|_| guess.get_param("charset")); // Call get_param on Guess directly
|
||||
let bytes = blob.bytes();
|
||||
let mime_essence = Some(tree_magic_mini::from_u8(bytes).to_string());
|
||||
let inspector = ContentInspector::default();
|
||||
let language = blob_path.and_then(|p| inspector.guess_language(p, bytes));
|
||||
let charset = inspector.guess_charset(bytes);
|
||||
MetadataResult { mime_essence, language, charset }
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,9 +18,7 @@ use crate::{
|
|||
confluence, findings_store,
|
||||
git_binary::{CloneMode, Git},
|
||||
git_url::GitUrl,
|
||||
github, gitlab,
|
||||
guesser::Guesser,
|
||||
jira,
|
||||
github, gitlab, jira,
|
||||
matcher::{Match, Matcher, MatcherStats},
|
||||
origin::{Origin, OriginSet},
|
||||
rules_database::RulesDatabase,
|
||||
|
|
@ -356,8 +354,7 @@ pub async fn fetch_s3_objects(
|
|||
enable_profiling,
|
||||
Some(shared_profiler.clone()),
|
||||
)?;
|
||||
let guesser = Guesser::new().expect("should be able to create filetype guesser");
|
||||
let mut processor = BlobProcessor { matcher, guesser };
|
||||
let mut processor = BlobProcessor { matcher };
|
||||
|
||||
let progress = if progress_enabled {
|
||||
let style =
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue