use once_cell::sync::Lazy; use std::path::Path; use tokei::LanguageType; // Precompute all (shebang_prefix_bytes, language) pairs once. // Sort longest-first so more specific shebangs win. static SHEBANG_PREFIXES: Lazy> = Lazy::new(|| { let mut v = Vec::new(); for &lang in LanguageType::list() { for &sb in lang.shebangs() { v.push((sb.as_bytes(), lang)); } } // Longest prefix first to prefer e.g. "#!/usr/bin/env python3" over "#!/usr/bin/env python" v.sort_by(|a, b| b.0.len().cmp(&a.0.len())); v }); /// The type of content detected. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ContentType { /// Unprintable or control-heavy data. BINARY, /// Mostly printable text. TEXT, } /// Heuristic thresholds for text vs. binary detection. pub struct ContentInspector { max_null_bytes: usize, max_control_ratio: f64, } impl Default for ContentInspector { fn default() -> Self { Self { max_null_bytes: 4, max_control_ratio: 0.3 } } } impl ContentInspector { /// Create a new inspector with default thresholds. #[inline] pub fn new() -> Self { Default::default() } /// Classify `bytes` as TEXT or BINARY: /// /// 1. If null-byte count > `max_null_bytes` -- `BINARY`. /// 2. Else if (control chars excluding `\n`, `\r`, `\t`) / total > `max_control_ratio` → /// `BINARY`. /// 3. Otherwise, `TEXT`. #[inline] #[must_use] pub fn inspect(&self, bytes: &[u8]) -> ContentType { let nulls = bytes.iter().filter(|&&b| b == 0).count(); if nulls > self.max_null_bytes { return ContentType::BINARY; } let controls = bytes.iter().filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t')).count(); let ratio = if bytes.is_empty() { 0.0 } else { controls as f64 / bytes.len() as f64 }; if ratio > self.max_control_ratio { ContentType::BINARY } else { ContentType::TEXT } } /// Guess MIME type from `path` extension. /// /// Returns: /// - `Some(mime)` if the extension is one of the known text or image types. /// - `None` if there is no extension or it’s unrecognized. #[inline] #[must_use] pub fn guess_mime_type(&self, path: &Path) -> Option { let ext = path.extension()?.to_str()?.to_ascii_lowercase(); let mime = match ext.as_str() { "txt" | "md" | "rst" => "text/plain", "html" | "htm" => "text/html", "css" => "text/css", "js" => "application/javascript", "json" => "application/json", "xml" => "application/xml", "pdf" => "application/pdf", "jpg" | "jpeg" => "image/jpeg", "png" => "image/png", "gif" => "image/gif", _ => return None, }; Some(mime.to_string()) } /// Detect UTF-8 encoding by attempting a lossless conversion. #[inline] #[must_use] pub fn guess_charset(&self, bytes: &[u8]) -> Option { String::from_utf8(bytes.to_vec()).ok().map(|_| "UTF-8".to_string()) } /// Guess programming language with broad coverage using `tokei`. /// /// Strategy (no disk I/O): /// 1) Try extension via `LanguageType::from_file_extension`. /// 2) Handle common extensionless filenames (e.g., Makefile, Dockerfile, CMakeLists.txt). /// 3) Parse an in-memory shebang (first line) against `LanguageType::shebangs`. /// 4) Minimal content markers as a last resort. /// /// Returns the canonical `tokei` language name (e.g., `Rust`, `Bash`, `Python`). #[inline] #[must_use] pub fn guess_language(&self, path: &Path, content: &[u8]) -> Option { // 1) Extension mapping (fast, no I/O). if let Some(ext) = path.extension().and_then(|e| e.to_str()) { if let Some(lang) = LanguageType::from_file_extension(&ext.to_ascii_lowercase()) { return Some(lang.name().to_string()); } } // 2) Well-known filenames with no/odd extensions (avoid from_path to keep this pure). if let Some(file) = path.file_name().and_then(|f| f.to_str()) { match file { "Makefile" | "makefile" => { return Some(LanguageType::Makefile.name().to_string()); } "Dockerfile" | "dockerfile" => { return Some(LanguageType::Dockerfile.name().to_string()); } "CMakeLists.txt" => { return Some(LanguageType::CMake.name().to_string()); } "Rakefile" | "rakefile" => { return Some(LanguageType::Rakefile.name().to_string()); } // Common ecosystem files; map to their primary language where sensible. "Gemfile" | "gemfile" => { return Some(LanguageType::Ruby.name().to_string()); } _ => {} } } // 3) Shebang detection (in-memory): match by longest prefix, byte-wise (no UTF-8 needed). if let Some(first_line) = content.split(|&b| b == b'\n').next() { if first_line.starts_with(b"#!") { for (prefix, lang) in SHEBANG_PREFIXES.iter() { if first_line.starts_with(prefix) { return Some(lang.name().to_string()); } } } } // 4) Lightweight content markers to catch a few ubiquitous cases without I/O. let s = String::from_utf8_lossy(content); if s.contains(" ContentType { ContentInspector::default().inspect(bytes) } #[cfg(test)] mod tests { use std::path::PathBuf; use super::*; #[test] fn binary_vs_text() { let ins = ContentInspector::default(); let bin = vec![0, 1, 2, 0, 0, 0, 5]; assert_eq!(ins.inspect(&bin), ContentType::BINARY); let txt = b"Hello\nWorld"; assert_eq!(ins.inspect(txt), ContentType::TEXT); } #[test] fn mime_guess() { let ins = ContentInspector::default(); assert_eq!(ins.guess_mime_type(&PathBuf::from("a.md")), Some("text/plain".into())); assert_eq!(ins.guess_mime_type(&PathBuf::from("img.png")), Some("image/png".into())); assert_eq!(ins.guess_mime_type(&PathBuf::from("x.xyz")), None); } #[test] fn charset_guess() { let ins = ContentInspector::default(); assert_eq!(ins.guess_charset("ok".as_bytes()), Some("UTF-8".into())); assert_eq!(ins.guess_charset(&[0xFF, 0xFE, 0xFD]), None); } #[test] fn language_guess() { let ins = ContentInspector::default(); // Compare case-insensitively by lowercasing both sides. let rust = ins.guess_language(&PathBuf::from("main.rs"), b"").map(|s| s.to_ascii_lowercase()); assert_eq!(rust, Some("rust".into())); let php = ins .guess_language(&PathBuf::from("x"), b"") .map(|s| s.to_ascii_lowercase()); assert_eq!(php, Some("php".into())); let bash = ins .guess_language(&PathBuf::from("run"), b"#!/bin/bash\necho hi") .map(|s| s.to_ascii_lowercase()); assert_eq!(bash, Some("bash".into())); } }