From 0ce72739dc1faaac2b8a87f5892043ec08aad4a1 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Fri, 15 Aug 2025 16:08:46 -0700 Subject: [PATCH] Improved language detection --- CHANGELOG.md | 3 +- Cargo.toml | 4 +- src/content_type.rs | 133 ++++++++++++++++++++++++-------------- src/guesser.rs | 129 ------------------------------------ src/lib.rs | 1 - src/scanner/enumerate.rs | 4 +- src/scanner/processing.rs | 27 +++----- src/scanner/repos.rs | 7 +- 8 files changed, 104 insertions(+), 204 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e49a4e5..f826736 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,8 @@ All notable changes to this project will be documented in this file. ## [1.42.0] -- Internal refactoring of rule loader and git enumerator +- Internal refactoring of rule loader, git enumerator, and filetype guesser +- Improved language detection ## [1.41.0] - Added support for scanning gitlab subgroups, with `kingfisher scan --gitlab-group my-group --gitlab-include-subgroups` diff --git a/Cargo.toml b/Cargo.toml index db985c4..7d79adc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.41.0" +version = "1.42.0" description = "MongoDB's blazingly fast secret scanning and validation tool" edition.workspace = true rust-version.workspace = true @@ -141,6 +141,7 @@ tree-sitter-typescript = "0.23.2" tree-sitter-yaml = "0.7.1" streaming-iterator = "0.1.9" tree-sitter-regex = "0.24.3" +tree_magic_mini = "3.2" content_inspector = "0.2.4" rustc-hash = "2.1.1" term_size = "0.3.2" @@ -187,6 +188,7 @@ walkdir = "2.5.0" p256 = "0.13.2" ed25519-dalek = { version = "2.2", features = ["pkcs8"] } aws-sdk-s3 = "1.100.0" +tokei = "12.1.2" [dependencies.tikv-jemallocator] version = "0.6" diff --git a/src/content_type.rs b/src/content_type.rs index 110f62c..af623ed 100644 --- a/src/content_type.rs +++ b/src/content_type.rs @@ -1,9 +1,11 @@ +// Requires: tokei = "12" in Cargo.toml use std::path::Path; +use tokei::LanguageType; /// The type of content detected. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ContentType { - /// Unprintable or control‑heavy data. + /// Unprintable or control-heavy data. BINARY, /// Mostly printable text. TEXT, @@ -30,7 +32,7 @@ impl ContentInspector { /// Classify `bytes` as TEXT or BINARY: /// - /// 1. If null‑byte count > `max_null_bytes` -- `BINARY`. + /// 1. If null-byte count > `max_null_bytes` -- `BINARY`. /// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` → /// `BINARY`. /// 3. Otherwise, `TEXT`. @@ -76,62 +78,89 @@ impl ContentInspector { Some(mime.to_string()) } - /// Detect UTF‑8 encoding by attempting a lossless conversion. + /// Detect UTF-8 encoding by attempting a lossless conversion. #[inline] #[must_use] pub fn guess_charset(&self, bytes: &[u8]) -> Option { String::from_utf8(bytes.to_vec()).ok().map(|_| "UTF-8".to_string()) } - /// Guess programming language by extension, else simple content markers. + /// Guess programming language with broad coverage using `tokei`. /// - /// Extension mapping covers common languages (Rust, Python, JS, etc.). - /// Fallback checks for ` Option { + // 1) Extension mapping (fast, no I/O). if let Some(ext) = path.extension().and_then(|e| e.to_str()) { - let name = match ext.to_ascii_lowercase().as_str() { - "rs" => "Rust", - "py" => "Python", - "js" => "JavaScript", - "ts" => "TypeScript", - "java" => "Java", - "c" => "C", - "cpp" | "cc" | "cxx" => "C++", - "go" => "Go", - "rb" => "Ruby", - "php" => "PHP", - "cs" => "C#", - "kt" | "kts" => "Kotlin", - "scala" => "Scala", - "swift" => "Swift", - "sh" => "Shell", - "pl" => "Perl", - "lua" => "Lua", - "hs" => "Haskell", - "r" => "R", - _ => "", - }; - if !name.is_empty() { - return Some(name.to_string()); + if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) { + return Some(lang.name().to_string()); } } + // 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure). + if let Some(file) = path.file_name().and_then(|f| f.to_str()) { + match file { + "Makefile" | "makefile" => { + return Some(LanguageType::Makefile.name().to_string()); + } + "Dockerfile" | "dockerfile" => { + return Some(LanguageType::Dockerfile.name().to_string()); + } + "CMakeLists.txt" => { + return Some(LanguageType::CMake.name().to_string()); + } + "Rakefile" | "rakefile" => { + return Some(LanguageType::Rakefile.name().to_string()); + } + // Common ecosystem files; map to their primary language where sensible. + "Gemfile" | "gemfile" => { + return Some(LanguageType::Ruby.name().to_string()); + } + _ => {} + } + } + + // 3) Shebang detection (in-memory): compare the first line to known shebangs. + if let Some(first_line) = content.split(|&b| b == b'\n').next() { + if first_line.starts_with(b"#!") { + if let Ok(line) = std::str::from_utf8(first_line) { + for &lang in LanguageType::list() { + for &sb in lang.shebangs() { + if line.starts_with(sb) { + return Some(lang.name().to_string()); + } + } + } + } + } + } + + // 4) Lightweight content markers to catch a few ubiquitous cases without I/O. let s = String::from_utf8_lossy(content); if s.contains(""), Some("PHP".into())); - assert_eq!( - ins.guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi"), - Some("Shell".into()) - ); + + // Compare case-insensitively by lowercasing both sides. + let rust = ins + .guess_language(&PathBuf::from("main.rs"), b"") + .map(|s| s.to_ascii_lowercase()); + assert_eq!(rust, Some("rust".into())); + + let php = ins + .guess_language(&PathBuf::from("x"), b"") + .map(|s| s.to_ascii_lowercase()); + assert_eq!(php, Some("php".into())); + + let bash = ins + .guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi") + .map(|s| s.to_ascii_lowercase()); + assert_eq!(bash, Some("bash".into())); } } diff --git a/src/guesser.rs b/src/guesser.rs index 3483473..e69de29 100644 --- a/src/guesser.rs +++ b/src/guesser.rs @@ -1,129 +0,0 @@ -// Fixed guesser.rs -use std::path::Path; - -use anyhow::Result; - -use crate::content_type::ContentInspector; - -pub enum Input<'a> { - Bytes(&'a [u8]), - PathAndBytes(&'a Path, &'a [u8]), -} -impl<'a> Input<'a> { - pub fn from_bytes(bytes: &'a [u8]) -> Self { - Self::Bytes(bytes) - } - - pub fn from_path_and_bytes(path: &'a Path, bytes: &'a [u8]) -> Self { - Self::PathAndBytes(path, bytes) - } -} -/// Result from content guessing -#[derive(Debug, Default)] -pub struct Guess { - mime_type: Option, - mime_params: Vec<(String, String)>, - content_guess: Option, -} -impl Guess { - pub fn path_guess(&self) -> Option<&str> { - self.mime_type.as_deref() - } - - pub fn content_guess(&self) -> Option<&str> { - self.content_guess.as_deref() - } - - pub fn essence_str(&self) -> Option<&str> { - self.mime_type.as_deref() - } - - pub fn get_param(&self, param: &str) -> Option { - self.mime_params.iter().find(|(p, _)| p == param).map(|(_, v)| v.clone()) - } -} -/// Content guesser with configurable inspector -pub struct Guesser { - inspector: ContentInspector, -} - -impl Guesser { - pub fn new() -> Result { - Ok(Self { inspector: ContentInspector::default() }) - } - - pub fn guess(&self, input: Input<'_>) -> Guess { - let mut guess = Guess { mime_type: None, mime_params: Vec::new(), content_guess: None }; - match input { - Input::Bytes(bytes) => { - guess.mime_type = Some("text/plain".to_string()); - if let Some(charset) = self.inspector.guess_charset(bytes) { - guess.mime_params.push(("charset".to_string(), charset)); - } - } - Input::PathAndBytes(path, bytes) => { - // Try to get MIME type from extension - match self.inspector.guess_mime_type(path) { - Some(mime) => guess.mime_type = Some(mime), - None => guess.mime_type = Some("application/octet-stream".into()), - } - - // Charset (if textual) - if let Some(charset) = self.inspector.guess_charset(bytes) { - guess.mime_params.push(("charset".into(), charset)); - } - // Try to guess language - guess.content_guess = self.inspector.guess_language(path, bytes); - } - } - guess - } -} - -#[cfg(test)] -mod tests { - use std::path::PathBuf; - - use super::*; - - #[test] - fn test_guesser() -> Result<()> { - let guesser = Guesser::new()?; - // Test bytes only - let bytes = b"Hello World"; - let guess = guesser.guess(Input::from_bytes(bytes)); - assert_eq!( - guess.path_guess(), - Some("text/plain"), - "expected: {:?}, got: {:?}", - Some("text/plain"), - guess.path_guess() - ); - assert_eq!( - guess.content_guess(), - None, - "expected: {:?}, got: {:?}", - None::, - guess.content_guess() - ); - - // Test path and bytes - let path = PathBuf::from("test.rs"); - let guess = guesser.guess(Input::from_path_and_bytes(&path, bytes)); - assert_eq!( - guess.path_guess(), - Some("application/octet-stream"), - "expected: {:?}, got: {:?}", - Some("application/octet-stream"), - guess.path_guess() - ); - assert_eq!( - guess.content_guess(), - Some("Rust"), - "expected: {:?}, got: {:?}", - Some("Rust"), - guess.content_guess() - ); - Ok(()) - } -} diff --git a/src/lib.rs b/src/lib.rs index 4c3763f..4a95419 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,7 +18,6 @@ mod git_repo_enumerator; pub mod git_url; pub mod github; pub mod gitlab; -pub mod guesser; pub mod jira; pub mod liquid_filters; pub mod location; diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index 87a8011..2b9f958 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -25,7 +25,6 @@ use crate::{ cli::commands::{github::GitHistoryMode, scan}, decompress::{decompress_file_to_temp, CompressedContent}, findings_store, - guesser::Guesser, matcher::{Matcher, MatcherStats}, open_git_repo, origin::{Origin, OriginSet}, @@ -130,12 +129,11 @@ pub fn enumerate_filesystem_inputs( let make_blob_processor = || -> BlobProcessor { let t1 = Instant::now(); *num_blob_processors.lock().unwrap() += 1; - let guesser = Guesser::new().expect("should be able to create filetype guesser"); { let mut init_time = blob_processor_init_time.lock().unwrap(); *init_time += t1.elapsed(); } - BlobProcessor { matcher, guesser } + BlobProcessor { matcher } }; let scan_res: Result<()> = input_recv .into_iter() diff --git a/src/scanner/processing.rs b/src/scanner/processing.rs index e1551b0..e88393f 100644 --- a/src/scanner/processing.rs +++ b/src/scanner/processing.rs @@ -4,7 +4,7 @@ use tracing::{debug_span, trace}; use crate::{ blob::{Blob, BlobMetadata}, - guesser::{Guesser, Input}, + content_type::ContentInspector, location::LocationMapping, matcher::{Match, Matcher, OwnedBlobMatch, ScanResult}, origin::{Origin, OriginSet}, @@ -12,11 +12,10 @@ use crate::{ Path, }; -/// A combined matcher, content type guesser, and a number of parameters that -/// don't change within one `scan` run +/// A matcher along with parameters that remain constant during a single +/// `scan` run pub struct BlobProcessor<'a> { pub matcher: Matcher<'a>, - pub guesser: Guesser, } impl<'a> BlobProcessor<'a> { @@ -61,7 +60,7 @@ impl<'a> BlobProcessor<'a> { if matches.is_empty() { return Ok(None); } - let md = MetadataResult::from_blob_and_origin(&self.guesser, &blob, &origin); + let md = MetadataResult::from_blob_and_origin(&blob, &origin); let metadata = BlobMetadata { id: blob.id, num_bytes: blob.len(), @@ -109,19 +108,13 @@ struct MetadataResult { charset: Option, } impl MetadataResult { - fn from_blob_and_origin(guesser: &Guesser, blob: &Blob, origin: &OriginSet) -> MetadataResult { + fn from_blob_and_origin(blob: &Blob, origin: &OriginSet) -> MetadataResult { let blob_path: Option<&'_ Path> = origin.iter().find_map(|p| p.blob_path()); - let input = match blob_path { - None => Input::from_bytes(&blob.bytes()), // Use Input directly - Some(blob_path) => { - Input::from_path_and_bytes(blob_path, &blob.bytes()) // Use Input - // directly - } - }; - let guess = guesser.guess(input); - let mime_essence = guess.path_guess().map(|s| s.to_string()); - let language = guess.content_guess().map(ToOwned::to_owned); - let charset = guess.path_guess().and_then(|_| guess.get_param("charset")); // Call get_param on Guess directly + let bytes = blob.bytes(); + let mime_essence = Some(tree_magic_mini::from_u8(bytes).to_string()); + let inspector = ContentInspector::default(); + let language = blob_path.and_then(|p| inspector.guess_language(p, bytes)); + let charset = inspector.guess_charset(bytes); MetadataResult { mime_essence, language, charset } } } diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index e5052c1..e3bb3e9 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -18,9 +18,7 @@ use crate::{ confluence, findings_store, git_binary::{CloneMode, Git}, git_url::GitUrl, - github, gitlab, - guesser::Guesser, - jira, + github, gitlab, jira, matcher::{Match, Matcher, MatcherStats}, origin::{Origin, OriginSet}, rules_database::RulesDatabase, @@ -356,8 +354,7 @@ pub async fn fetch_s3_objects( enable_profiling, Some(shared_profiler.clone()), )?; - let guesser = Guesser::new().expect("should be able to create filetype guesser"); - let mut processor = BlobProcessor { matcher, guesser }; + let mut processor = BlobProcessor { matcher }; let progress = if progress_enabled { let style =