From fa640e2c38b2ead5342a5188e89e24b4dc6fefba Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Mon, 23 Feb 2026 20:06:43 -0700 Subject: [PATCH] Python bytecode (.pyc) scanning: extracts string constants from compiled Python --- CHANGELOG.md | 1 + README.md | 3 +- src/lib.rs | 1 + src/pyc.rs | 779 +++++++++++++++++++++++++++++++++++++++ src/scanner/enumerate.rs | 22 +- src/scanner/util.rs | 9 + 6 files changed, 813 insertions(+), 2 deletions(-) create mode 100644 src/pyc.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 5518c0f..96ab59f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file. ## [v1.85.0] - SQLite database scanning: kingfisher now detects and extracts SQLite files (`.db`, `.sqlite`, `.sqlite3`, etc.), dumping each table as SQL text with named columns so secrets stored in database rows are scannable. Controlled by the existing `--extract-archives` flag. +- Python bytecode (.pyc) scanning: extracts string constants from compiled Python (`.pyc`, `.pyo`) files via marshal parsing so secrets embedded in bytecode are scannable. Controlled by `--extract-archives`. - Performance: pipelined ODB enumeration — scanning now begins while blob OIDs are still being discovered, overlapping I/O with pattern matching. - Performance: skip blobs smaller than 20 bytes during enumeration (too small to contain any secret). - Performance: preserve pack-ascending blob order in the metadata path for better I/O locality when Rayon splits work. 
diff --git a/README.md b/README.md index 4ed1404..5899efa 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ Designed for offensive security engineers and blue-teamers alike, Kingfisher hel - **Broad AI SaaS coverage**: finds and validates tokens for OpenAI, Anthropic, Google Gemini, Cohere, AWS Bedrock, Voyage AI, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and many more - **Compressed Files**: Supports extracting and scanning compressed files for secrets - **SQLite Database Scanning**: Automatically extracts and scans SQLite database contents for secrets stored in table rows +- **Python Bytecode (.pyc) Scanning**: Extracts and scans string constants from compiled Python (`.pyc`, `.pyo`) files - **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) - **Checksum-aware detection**: verifies tokens with built-in checksums (e.g., GitHub, Confluent, Zuplo) — no API calls required - **Built-in Report Viewer**: Visualize and triage findings locally with `kingfisher view ./report-file.json` @@ -611,7 +612,7 @@ Since then it has evolved far beyond that starting point, introducing live valid - **Baseline management** to suppress known findings over time - **Tree-sitter parsing** layered on Hyperscan for language-aware detection - **More scan targets** (GitLab, Bitbucket, Gitea, Jira, Confluence, Slack, S3, GCS, Docker, Hugging Face, etc.) 
-- **Compressed Files** and **SQLite database** scanning support +- **Compressed Files**, **SQLite database**, and **Python bytecode (.pyc)** scanning support - **New storage model** (in-memory + Bloom filter, replacing SQLite) - **Unified workflow** with JSON/BSON/SARIF outputs - **Cross-platform builds** for Linux, macOS, and Windows diff --git a/src/lib.rs b/src/lib.rs index 316560e..74888eb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -40,6 +40,7 @@ pub mod location; pub mod matcher; pub mod origin; pub mod parser; +pub mod pyc; pub mod reporter; pub mod rule_loader; pub mod rule_profiling; diff --git a/src/pyc.rs b/src/pyc.rs new file mode 100644 index 0000000..23aa4e0 --- /dev/null +++ b/src/pyc.rs @@ -0,0 +1,779 @@ +use std::io::{self, Cursor, Read}; +use std::path::Path; + +use anyhow::{bail, Context, Result}; +use tracing::debug; + +const MAX_RECURSION_DEPTH: usize = 256; +const MAX_TOTAL_BYTES: usize = 64 * 1024 * 1024; +const MAX_COLLECTION_LEN: u32 = 1_000_000; + +const FLAG_REF: u8 = 0x80; + +const TYPE_NULL: u8 = b'0'; +const TYPE_NONE: u8 = b'N'; +const TYPE_FALSE: u8 = b'F'; +const TYPE_TRUE: u8 = b'T'; +const TYPE_STOPITER: u8 = b'S'; +const TYPE_ELLIPSIS: u8 = b'.'; +const TYPE_INT: u8 = b'i'; +const TYPE_INT64: u8 = b'I'; +const TYPE_FLOAT: u8 = b'f'; +const TYPE_BINARY_FLOAT: u8 = b'g'; +const TYPE_COMPLEX: u8 = b'x'; +const TYPE_BINARY_COMPLEX: u8 = b'y'; +const TYPE_LONG: u8 = b'l'; +const TYPE_STRING: u8 = b's'; +const TYPE_INTERNED: u8 = b't'; +const TYPE_REF: u8 = b'r'; +const TYPE_TUPLE: u8 = b'('; +const TYPE_LIST: u8 = b'['; +const TYPE_DICT: u8 = b'{'; +const TYPE_CODE: u8 = b'c'; +const TYPE_UNICODE: u8 = b'u'; +const TYPE_SET: u8 = b'<'; +const TYPE_FROZENSET: u8 = b'>'; +const TYPE_ASCII: u8 = b'a'; +const TYPE_ASCII_INTERNED: u8 = b'A'; +const TYPE_SMALL_TUPLE: u8 = b')'; +const TYPE_SHORT_ASCII: u8 = b'z'; +const TYPE_SHORT_ASCII_INTERNED: u8 = b'Z'; + +/// Code object layout varies by Python version. 
+#[derive(Debug, Clone, Copy)]
+enum CodeFormat {
+    /// Python 3.0-3.7: 5 leading i32s, 8 objects, 1 i32, 1 object
+    V33,
+    /// Python 3.8-3.10: 6 leading i32s, 8 objects, 1 i32, 1 object
+    V38,
+    /// Python 3.11 and later: 5 leading i32s, 8 objects, 1 i32, 2 objects.
+    ///
+    /// The objects before `firstlineno` are code, consts, names,
+    /// localsplusnames, localspluskinds, filename, name and qualname; the two
+    /// trailing objects are linetable and exceptiontable.
+    V311,
+}
+
+impl CodeFormat {
+    /// Number of 32-bit integers stored before the first marshalled object.
+    fn leading_longs(self) -> usize {
+        match self {
+            CodeFormat::V33 | CodeFormat::V311 => 5,
+            CodeFormat::V38 => 6,
+        }
+    }
+
+    /// Number of marshalled objects stored between the leading integers and
+    /// `firstlineno`.
+    fn middle_objects(self) -> usize {
+        match self {
+            // 3.11 removed varnames/freevars/cellvars and added
+            // localsplusnames/localspluskinds/qualname, so the count did not
+            // change. Reading a ninth object would consume `firstlineno` as a
+            // type byte and abort the parse after the first nested code object.
+            CodeFormat::V33 | CodeFormat::V38 | CodeFormat::V311 => 8,
+        }
+    }
+
+    /// Number of marshalled objects stored after `firstlineno`.
+    fn trailing_objects(self) -> usize {
+        match self {
+            CodeFormat::V33 | CodeFormat::V38 => 1,
+            CodeFormat::V311 => 2,
+        }
+    }
+}
+
+/// Determine the header size and code format from the 2-byte magic number.
+///
+/// Returns `None` for magic numbers outside the supported Python 3 range.
+fn pyc_version_info(magic: u16) -> Option<(usize, CodeFormat)> {
+    match magic {
+        // Python 3.0-3.2: 8-byte header
+        3000..=3189 => Some((8, CodeFormat::V33)),
+        // Python 3.3-3.6: 12-byte header
+        3190..=3379 => Some((12, CodeFormat::V33)),
+        // Python 3.7: 16-byte header, same code format as 3.3
+        3380..=3399 => Some((16, CodeFormat::V33)),
+        // Python 3.8-3.10: 16-byte header
+        // NOTE(review): 3450..=3494 look like 3.11 pre-release magics; they
+        // are still parsed with the 3.8 layout here - confirm whether they
+        // should be rejected instead.
+        3400..=3494 => Some((16, CodeFormat::V38)),
+        // Python 3.11 and later: 16-byte header
+        3495..=3700 => Some((16, CodeFormat::V311)),
+        _ => None,
+    }
+}
+
+/// Marshal type tag for `slice` objects (marshal format version 5, Python 3.14).
+const TYPE_SLICE: u8 = b':';
+
+/// Walks a marshal payload, collecting every string-like constant it meets and
+/// discarding everything else.
+struct MarshalReader<'a> {
+    /// Read position over the marshal payload (the bytes after the header).
+    cursor: Cursor<&'a [u8]>,
+    /// Code object layout selected from the `.pyc` magic number.
+    code_format: CodeFormat,
+    /// Collected strings, newline-separated, capped at `MAX_TOTAL_BYTES`.
+    strings: Vec<u8>,
+    /// Current nesting depth of `read_object` calls.
+    depth: usize,
+}
+
+impl<'a> MarshalReader<'a> {
+    /// Creates a reader positioned at the start of `data`.
+    fn new(data: &'a [u8], code_format: CodeFormat) -> Self {
+        Self {
+            cursor: Cursor::new(data),
+            code_format,
+            strings: Vec::new(),
+            depth: 0,
+        }
+    }
+
+    /// Current read offset, clamped to the payload length.
+    fn offset(&self) -> usize {
+        let len = self.cursor.get_ref().len();
+        // Clamp in u64 first so the narrowing cast can never truncate.
+        self.cursor.position().min(len as u64) as usize
+    }
+
+    /// Reads exactly `N` bytes, failing with `UnexpectedEof` on short input.
+    fn read_array<const N: usize>(&mut self) -> io::Result<[u8; N]> {
+        let mut buf = [0u8; N];
+        self.cursor.read_exact(&mut buf)?;
+        Ok(buf)
+    }
+
+    fn read_u8(&mut self) -> io::Result<u8> {
+        self.read_array::<1>().map(|[byte]| byte)
+    }
+
+    fn read_i32(&mut self) -> io::Result<i32> {
+        self.read_array::<4>().map(i32::from_le_bytes)
+    }
+
+    fn read_u32(&mut self) -> io::Result<u32> {
+        self.read_array::<4>().map(u32::from_le_bytes)
+    }
+
+    /// Returns the next byte without consuming it.
+    fn peek_u8(&self) -> io::Result<u8> {
+        self.cursor
+            .get_ref()
+            .get(self.offset())
+            .copied()
+            .ok_or_else(|| io::Error::from(io::ErrorKind::UnexpectedEof))
+    }
+
+    /// Borrows the next `len` bytes from the payload and advances past them.
+    ///
+    /// The length is checked against the remaining input before anything is
+    /// touched, so a bogus length field costs nothing and fails with
+    /// `UnexpectedEof`.
+    fn read_bytes(&mut self, len: usize) -> io::Result<&'a [u8]> {
+        let data: &'a [u8] = *self.cursor.get_ref();
+        let start = self.offset();
+        let end = start
+            .checked_add(len)
+            .filter(|&end| end <= data.len())
+            .ok_or_else(|| io::Error::from(io::ErrorKind::UnexpectedEof))?;
+        self.cursor.set_position(end as u64);
+        Ok(&data[start..end])
+    }
+
+    /// Advances past `n` bytes, failing with `UnexpectedEof` if fewer remain.
+    fn skip(&mut self, n: usize) -> io::Result<()> {
+        self.read_bytes(n).map(|_| ())
+    }
+
+    /// Skips a field stored as one length byte followed by that many bytes.
+    fn skip_short_counted(&mut self) -> io::Result<()> {
+        let len = usize::from(self.read_u8()?);
+        self.skip(len)
+    }
+
+    /// Appends `data` to the output, newline-separated from earlier strings.
+    ///
+    /// Empty strings are ignored. Output stops growing at `MAX_TOTAL_BYTES`;
+    /// the string that crosses the limit is truncated.
+    fn collect_string(&mut self, data: &[u8]) {
+        if data.is_empty() || self.strings.len() >= MAX_TOTAL_BYTES {
+            return;
+        }
+        if !self.strings.is_empty() {
+            self.strings.push(b'\n');
+        }
+        let allowed = MAX_TOTAL_BYTES.saturating_sub(self.strings.len());
+        let take = data.len().min(allowed);
+        self.strings.extend_from_slice(&data[..take]);
+    }
+
+    /// Reads one marshalled object, recursing into containers and code objects.
+    ///
+    /// Fails when nesting exceeds `MAX_RECURSION_DEPTH`, on truncated input,
+    /// on an oversized length field, or on an unknown type byte. Strings
+    /// collected before the failure stay in `self.strings`.
+    fn read_object(&mut self) -> Result<()> {
+        if self.depth > MAX_RECURSION_DEPTH {
+            bail!("marshal recursion depth exceeded");
+        }
+        self.depth += 1;
+        let result = self.read_object_inner();
+        self.depth -= 1;
+        result
+    }
+
+    /// Reads `count` consecutive objects.
+    fn read_objects(&mut self, count: usize) -> Result<()> {
+        for _ in 0..count {
+            self.read_object()?;
+        }
+        Ok(())
+    }
+
+    /// Reads a u32-length-prefixed string and collects it; `kind` names the
+    /// type in the error message.
+    fn read_counted_string(&mut self, kind: &str) -> Result<()> {
+        let len = self.read_u32()? as usize;
+        if len > MAX_TOTAL_BYTES {
+            bail!("{kind} length {len} exceeds limit");
+        }
+        let data = self.read_bytes(len)?;
+        self.collect_string(data);
+        Ok(())
+    }
+
+    /// Reads a u32-count-prefixed run of objects; `kind` names the container
+    /// in the error message.
+    fn read_sequence(&mut self, kind: &str) -> Result<()> {
+        let n = self.read_u32()?;
+        if n > MAX_COLLECTION_LEN {
+            bail!("{kind} length {n} exceeds limit");
+        }
+        // Bounded by MAX_COLLECTION_LEN above, so the cast is lossless.
+        self.read_objects(n as usize)
+    }
+
+    fn read_object_inner(&mut self) -> Result<()> {
+        let raw_type = self.read_u8().context("unexpected EOF reading type byte")?;
+        // FLAG_REF only registers the object in marshal's back-reference
+        // table. That table is not rebuilt here: a TYPE_REF can only point at
+        // an object that was already walked, so its strings were collected the
+        // first time it appeared.
+        let type_byte = raw_type & !FLAG_REF;
+
+        match type_byte {
+            TYPE_NULL | TYPE_NONE | TYPE_STOPITER | TYPE_ELLIPSIS | TYPE_TRUE | TYPE_FALSE => {}
+
+            // Fixed-width payloads carry no text.
+            TYPE_INT | TYPE_REF => self.skip(4)?,
+            TYPE_INT64 | TYPE_BINARY_FLOAT => self.skip(8)?,
+            TYPE_BINARY_COMPLEX => self.skip(16)?,
+
+            // Text-encoded numbers: one length byte, then that many bytes.
+            TYPE_FLOAT => self.skip_short_counted()?,
+            TYPE_COMPLEX => {
+                self.skip_short_counted()?;
+                self.skip_short_counted()?;
+            }
+
+            TYPE_LONG => {
+                // Signed digit count; each digit occupies two bytes.
+                let digits = self.read_i32()?.unsigned_abs();
+                let byte_len = (digits as usize)
+                    .checked_mul(2)
+                    .context("marshal long digit count overflows usize")?;
+                self.skip(byte_len)?;
+            }
+
+            TYPE_STRING | TYPE_INTERNED => self.read_counted_string("string")?,
+            TYPE_UNICODE => self.read_counted_string("unicode")?,
+            TYPE_ASCII | TYPE_ASCII_INTERNED => self.read_counted_string("ascii")?,
+
+            TYPE_SHORT_ASCII | TYPE_SHORT_ASCII_INTERNED => {
+                let len = usize::from(self.read_u8()?);
+                let data = self.read_bytes(len)?;
+                self.collect_string(data);
+            }
+
+            TYPE_TUPLE => self.read_sequence("tuple")?,
+            TYPE_LIST => self.read_sequence("list")?,
+            TYPE_SET | TYPE_FROZENSET => self.read_sequence("set")?,
+
+            TYPE_SMALL_TUPLE => {
+                let n = usize::from(self.read_u8()?);
+                self.read_objects(n)?;
+            }
+
+            // start, stop, step
+            TYPE_SLICE => self.read_objects(3)?,
+
+            TYPE_DICT => {
+                // Key/value pairs, terminated by TYPE_NULL in key position.
+                while self.peek_u8()? != TYPE_NULL {
+                    self.read_object()?;
+                    self.read_object()?;
+                }
+                self.skip(1)?;
+            }
+
+            TYPE_CODE => {
+                self.read_code_object()?;
+            }
+
+            other => {
+                debug!("unknown marshal type byte 0x{other:02x}, stopping parse");
+                bail!("unknown marshal type 0x{other:02x}");
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Reads the body of a code object using the layout in `self.code_format`.
+    fn read_code_object(&mut self) -> Result<()> {
+        let fmt = self.code_format;
+
+        self.skip(fmt.leading_longs() * 4)?;
+        self.read_objects(fmt.middle_objects())?;
+        // firstlineno
+        self.skip(4)?;
+        self.read_objects(fmt.trailing_objects())
+    }
+}
+
+/// Extract all string constants from a `.pyc` file.
+///
+/// Returns the extracted strings concatenated with newlines, suitable for
+/// scanning. Returns an empty vec if the file contains no extractable strings.
+///
+/// A marshal parse error is not fatal: it is logged at debug level and the
+/// strings collected up to that point are returned. An unsupported magic
+/// number is logged and yields an empty vec.
+///
+/// # Errors
+///
+/// Fails if the file cannot be read, is shorter than 8 bytes, does not carry
+/// the `\r\n` magic suffix, or is shorter than the header its magic implies.
+pub fn extract_pyc_strings(path: &Path) -> Result<Vec<u8>> {
+    let data = std::fs::read(path)
+        .with_context(|| format!("failed to read .pyc file: {}", path.display()))?;
+
+    if data.len() < 8 {
+        bail!("file too small to be a valid .pyc: {} bytes", data.len());
+    }
+
+    // Bytes 2-3 should be \r\n
+    if data[2] != b'\r' || data[3] != b'\n' {
+        bail!("invalid .pyc magic suffix (expected \\r\\n)");
+    }
+    let magic = u16::from_le_bytes([data[0], data[1]]);
+
+    let (header_size, code_format) = match pyc_version_info(magic) {
+        Some(info) => info,
+        None => {
+            debug!(
+                "unsupported .pyc magic number {magic} in {}, skipping",
+                path.display()
+            );
+            return Ok(Vec::new());
+        }
+    };
+
+    // `get` is `None` exactly when the file is shorter than the header.
+    let marshal_data = match data.get(header_size..) {
+        Some(rest) => rest,
+        None => bail!(
+            ".pyc header requires {header_size} bytes but file is only {} bytes",
+            data.len()
+        ),
+    };
+    if marshal_data.is_empty() {
+        return Ok(Vec::new());
+    }
+
+    let mut reader = MarshalReader::new(marshal_data, code_format);
+    if let Err(e) = reader.read_object() {
+        // Best effort: keep whatever was collected before the parse failed.
+        debug!(
+            "marshal parse error in {} (extracted {} bytes before error): {e:#}",
+            path.display(),
+            reader.strings.len()
+        );
+    }
+
+    Ok(reader.strings)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Magic number, `\r\n`, then zero padding up to `header_size` bytes.
+    fn make_pyc_header(magic: u16, header_size: usize) -> Vec<u8> {
+        let mut header = magic.to_le_bytes().to_vec();
+        header.extend_from_slice(b"\r\n");
+        // Fill remaining header bytes with zeros
+        header.resize(header_size, 0);
+        header
+    }
+
+    /// A complete `.pyc` image: header followed by `payload`.
+    fn pyc(magic: u16, header_size: usize, payload: &[u8]) -> Vec<u8> {
+        let mut data = make_pyc_header(magic, header_size);
+        data.extend_from_slice(payload);
+        data
+    }
+
+    /// Writes `data` to a temporary file and runs the extractor on it.
+    fn extract(data: &[u8]) -> Result<Vec<u8>> {
+        let tmp = tempfile::NamedTempFile::new().unwrap();
+        std::fs::write(tmp.path(), data).unwrap();
+        extract_pyc_strings(tmp.path())
+    }
+
+    /// Type tag, little-endian u32 length, then the payload bytes.
+    fn marshal_counted(tag: u8, payload: &[u8]) -> Vec<u8> {
+        let mut buf = vec![tag];
+        buf.extend_from_slice(&(payload.len() as u32).to_le_bytes());
+        buf.extend_from_slice(payload);
+        buf
+    }
+
+    fn marshal_short_ascii(s: &str) -> Vec<u8> {
+        assert!(s.len() < 256);
+        let mut buf = vec![TYPE_SHORT_ASCII, s.len() as u8];
+        buf.extend_from_slice(s.as_bytes());
+        buf
+    }
+
+    fn marshal_ascii(s: &str) -> Vec<u8> {
+        marshal_counted(TYPE_ASCII, s.as_bytes())
+    }
+
+    fn marshal_unicode(s: &str) -> Vec<u8> {
+        marshal_counted(TYPE_UNICODE, s.as_bytes())
+    }
+
+    fn marshal_string(s: &[u8]) -> Vec<u8> {
+        marshal_counted(TYPE_STRING, s)
+    }
+
+    fn marshal_none() -> Vec<u8> {
+        vec![TYPE_NONE]
+    }
+
+    fn marshal_int(val: i32) -> Vec<u8> {
+        let mut buf = vec![TYPE_INT];
+        buf.extend_from_slice(&val.to_le_bytes());
+        buf
+    }
+
+    fn marshal_small_tuple(items: &[Vec<u8>]) -> Vec<u8> {
+        assert!(items.len() < 256);
+        let mut buf = vec![TYPE_SMALL_TUPLE, items.len() as u8];
+        buf.extend(items.concat());
+        buf
+    }
+
+    fn marshal_tuple(items: &[Vec<u8>]) -> Vec<u8> {
+        let mut buf = vec![TYPE_TUPLE];
+        buf.extend_from_slice(&(items.len() as u32).to_le_bytes());
+        buf.extend(items.concat());
+        buf
+    }
+
+    /// Build a minimal Python 3.8 code object with the given consts tuple.
+    fn marshal_code_38(consts: Vec<u8>, names: Vec<u8>) -> Vec<u8> {
+        let mut buf = vec![TYPE_CODE];
+        // 6 leading i32s (argcount, posonlyargcount, kwonlyargcount, nlocals,
+        // stacksize, flags)
+        buf.extend_from_slice(&[0u8; 6 * 4]);
+        // 8 middle objects: code, consts, names, varnames, freevars, cellvars,
+        // filename, name
+        buf.extend(marshal_string(b"")); // code (bytecode)
+        buf.extend(consts); // consts
+        buf.extend(names); // names
+        buf.extend(marshal_small_tuple(&[])); // varnames
+        buf.extend(marshal_small_tuple(&[])); // freevars
+        buf.extend(marshal_small_tuple(&[])); // cellvars
+        buf.extend(marshal_short_ascii("")); // filename
+        buf.extend(marshal_short_ascii("")); // name
+        // firstlineno
+        buf.extend_from_slice(&1i32.to_le_bytes());
+        // 1 trailing object: lnotab
+        buf.extend(marshal_string(b""));
+        buf
+    }
+
+    #[test]
+    fn extracts_short_ascii_string() {
+        // Python 3.8
+        let data = pyc(3413, 16, &marshal_short_ascii("secret_api_key_12345"));
+        assert_eq!(extract(&data).unwrap(), b"secret_api_key_12345");
+    }
+
+    #[test]
+    fn extracts_ascii_string() {
+        let data = pyc(3413, 16, &marshal_ascii("AKIAIOSFODNN7EXAMPLE"));
+        assert_eq!(extract(&data).unwrap(), b"AKIAIOSFODNN7EXAMPLE");
+    }
+
+    #[test]
+    fn extracts_unicode_string() {
+        let data = pyc(3413, 16, &marshal_unicode("password=hunter2"));
+        assert_eq!(extract(&data).unwrap(), b"password=hunter2");
+    }
+
+    #[test]
+    fn extracts_strings_from_tuple() {
+        let tuple = marshal_small_tuple(&[
+            marshal_none(),
+            marshal_short_ascii("first"),
+            marshal_int(42),
+            marshal_short_ascii("second"),
+        ]);
+        let data = pyc(3413, 16, &tuple);
+        assert_eq!(extract(&data).unwrap(), b"first\nsecond");
+    }
+
+    #[test]
+    fn extracts_strings_from_code_object() {
+        let consts = marshal_small_tuple(&[
+            marshal_none(),
+            marshal_short_ascii("ghp_abc123def456"),
+        ]);
+        let names = marshal_small_tuple(&[marshal_short_ascii("api_key")]);
+        let data = pyc(3413, 16, &marshal_code_38(consts, names));
+        let result = extract(&data).unwrap();
+        let result_str = String::from_utf8_lossy(&result);
+        assert!(result_str.contains("ghp_abc123def456"), "missing secret from consts");
+        assert!(result_str.contains("api_key"), "missing name");
+    }
+
+    #[test]
+    fn handles_large_tuple() {
+        let items: Vec<Vec<u8>> =
+            (0..50).map(|i| marshal_short_ascii(&format!("item_{i}"))).collect();
+        let data = pyc(3413, 16, &marshal_tuple(&items));
+        let result = extract(&data).unwrap();
+        let result_str = String::from_utf8_lossy(&result);
+        assert!(result_str.contains("item_0"));
+        assert!(result_str.contains("item_49"));
+    }
+
+    #[test]
+    fn handles_python_33_header() {
+        // Python 3.3
+        let data = pyc(3230, 12, &marshal_short_ascii("py33_secret"));
+        assert_eq!(extract(&data).unwrap(), b"py33_secret");
+    }
+
+    #[test]
+    fn handles_python_37_header() {
+        // Python 3.7
+        let data = pyc(3394, 16, &marshal_short_ascii("py37_secret"));
+        assert_eq!(extract(&data).unwrap(), b"py37_secret");
+    }
+
+    #[test]
+    fn handles_python_311_header() {
+        // Python 3.11
+        let data = pyc(3495, 16, &marshal_short_ascii("py311_secret"));
+        assert_eq!(extract(&data).unwrap(), b"py311_secret");
+    }
+
+    #[test]
+    fn rejects_file_too_small() {
+        assert!(extract(&[0u8; 4]).is_err());
+    }
+
+    #[test]
+    fn rejects_bad_magic_suffix() {
+        let mut data = vec![0x00, 0x0D, 0x00, 0x00]; // wrong suffix
+        data.resize(16, 0);
+        assert!(extract(&data).is_err());
+    }
+
+    #[test]
+    fn skips_unknown_magic() {
+        let mut data = vec![0x00, 0x00, b'\r', b'\n'];
+        data.resize(16, 0);
+        data.extend(marshal_short_ascii("should_not_appear"));
+        assert!(extract(&data).unwrap().is_empty());
+    }
+
+    #[test]
+    fn handles_empty_marshal_data() {
+        let data = make_pyc_header(3413, 16);
+        assert!(extract(&data).unwrap().is_empty());
+    }
+
+    #[test]
+    fn handles_flag_ref_bit() {
+        // Short ASCII with FLAG_REF set
+        let mut payload = vec![TYPE_SHORT_ASCII | FLAG_REF, 5];
+        payload.extend_from_slice(b"hello");
+        let data = pyc(3413, 16, &payload);
+        assert_eq!(extract(&data).unwrap(), b"hello");
+    }
+
+    #[test]
+    fn handles_nested_tuples() {
+        let inner = marshal_small_tuple(&[marshal_short_ascii("inner_secret")]);
+        let outer = marshal_small_tuple(&[marshal_short_ascii("outer"), inner]);
+        let data = pyc(3413, 16, &outer);
+        let result = extract(&data).unwrap();
+        let result_str = String::from_utf8_lossy(&result);
+        assert!(result_str.contains("outer"));
+        assert!(result_str.contains("inner_secret"));
+    }
+
+    #[test]
+    fn handles_type_string_bytes() {
+        let data = pyc(3413, 16, &marshal_string(b"raw_bytes_secret"));
+        assert_eq!(extract(&data).unwrap(), b"raw_bytes_secret");
+    }
+
+    /// Build a minimal code object in the layout exercised under the 3.13+
+    /// magic: 5 leading i32s, 8 middle objects, firstlineno, 2 trailing objects.
+    fn marshal_code_313(consts: Vec<u8>, names: Vec<u8>) -> Vec<u8> {
+        let mut buf = vec![TYPE_CODE];
+        // 5 leading i32s (argcount, posonlyargcount, kwonlyargcount, stacksize,
+        // flags)
+        buf.extend_from_slice(&[0u8; 5 * 4]);
+        // 8 middle objects: code, consts, names, localsplusnames,
+        // localspluskinds, filename, name, qualname
+        buf.extend(marshal_string(b"")); // code
+        buf.extend(consts); // consts
+        buf.extend(names); // names
+        buf.extend(marshal_small_tuple(&[])); // localsplusnames
+        buf.extend(marshal_string(b"")); // localspluskinds
+        // Non-empty so the filename assertion in the test below can actually
+        // fail; empty strings are never collected.
+        buf.extend(marshal_short_ascii("secrets.py")); // filename
+        buf.extend(marshal_short_ascii("")); // name
+        buf.extend(marshal_short_ascii("")); // qualname
+        // firstlineno
+        buf.extend_from_slice(&1i32.to_le_bytes());
+        // 2 trailing objects: linetable, exceptiontable
+        buf.extend(marshal_string(b""));
+        buf.extend(marshal_string(b""));
+        buf
+    }
+
+    #[test]
+    fn extracts_strings_from_code_object_v313() {
+        let consts = marshal_small_tuple(&[
+            marshal_none(),
+            marshal_short_ascii("sk-proj-ABCDEF123456"),
+        ]);
+        let names = marshal_small_tuple(&[marshal_short_ascii("openai_key")]);
+        let mut data = make_pyc_header(3627, 16); // Python 3.14
+        data.extend(marshal_code_313(consts, names));
+        let tmp = tempfile::NamedTempFile::new().unwrap();
+        std::fs::write(tmp.path(), &data).unwrap();
+        let result = extract_pyc_strings(tmp.path()).unwrap();
+        let result_str = String::from_utf8_lossy(&result);
+        assert!(
+            result_str.contains("sk-proj-ABCDEF123456"),
+            "missing secret from consts"
+        );
+        assert!(result_str.contains("openai_key"), "missing name");
+        assert!(result_str.contains("secrets.py"), "missing filename");
+    }
+
+    #[test]
+    fn extracts_from_real_pyc() {
+        use std::process::Command;
+        if Command::new("python3").arg("--version").output().is_err() {
+            return; // skip if python3 not available
+        }
+        let tmp_dir = tempfile::tempdir().unwrap();
+        let py_path = tmp_dir.path().join("test_secrets.py");
+        let pyc_path = tmp_dir.path().join("test_secrets.pyc");
+        std::fs::write(
+            &py_path,
+            "DB_PASSWORD = 'xK9#mP2$vL5nQ8wR'\nAPI_ENDPOINT = 'https://api.example.com/v1'\n",
+        )
+        .unwrap();
+        // Paths are passed through argv rather than spliced into the Python
+        // source, so quotes or backslashes in the temp path cannot break it.
+        let status = Command::new("python3")
+            .args([
+                "-c",
+                "import py_compile, sys; \
+                 py_compile.compile(sys.argv[1], cfile=sys.argv[2], doraise=True)",
+            ])
+            .arg(&py_path)
+            .arg(&pyc_path)
+            .status();
+        match status {
+            Ok(exit) if exit.success() => {}
+            _ => return, // skip if compilation fails
+        }
+        let result = extract_pyc_strings(&pyc_path).unwrap();
+        let result_str = String::from_utf8_lossy(&result);
+        assert!(
+            result_str.contains("DB_PASSWORD"),
+            "expected to find variable name in extracted pyc content, got: {result_str}"
+        );
+        assert!(
+            result_str.contains("xK9#mP2$vL5nQ8wR"),
+            "expected to find secret string in extracted pyc content, got: {result_str}"
+        );
+    }
+}
diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs
index 8dfa895..e964483 100644
--- a/src/scanner/enumerate.rs
+++ b/src/scanner/enumerate.rs
@@ -39,9 +39,10 @@ use crate::{
     scanner::{
         processing::BlobProcessor,
         runner::{create_datastore_channel, spawn_datastore_writer_thread},
-        util::{is_compressed_file, is_sqlite_file},
+        util::{is_compressed_file, is_pyc_file, is_sqlite_file},
     },
     scanner_pool::ScannerPool,
+    pyc::extract_pyc_strings,
     sqlite::extract_sqlite_contents,
     DirectoryResult, EnumeratorConfig, EnumeratorFileResult, FileResult, FilesystemEnumerator,
     FoundInput, GitDiffConfig, GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator,
@@ -361,6 +362,25 @@ impl ParallelBlobIterator for FileResult {
                         Ok(None)
                     }
                 }
+            } else if extraction_enabled && is_pyc_file(&self.path) {
+                match extract_pyc_strings(&self.path) {
+                    Ok(strings) if strings.is_empty() => {
+                        debug!("No strings found in .pyc file: {}", self.path.display());
+                        Ok(None)
+                    }
+                    Ok(strings) => {
+                        let origin = OriginSet::new(Origin::from_file(self.path.clone()), vec![]);
+                        let blob = Blob::from_bytes(strings);
+                        Ok(Some(FileResultIter {
+                            iter_kind: FileResultIterKind::Single(Some((origin, blob))),
+                            _marker: PhantomData,
+                        }))
+                    }
+                    Err(e) => {
+                        debug!("Failed to extract .pyc file {}: {e:#}", self.path.display());
+                        Ok(None)
+                    }
+                }
             } else if extraction_enabled && is_compressed_file(&self.path) {
                 match decompress_file_to_temp(&self.path) {
                     Ok((content, _temp_dir)) => match content {
diff --git a/src/scanner/util.rs b/src/scanner/util.rs
index 41a425b..649b58e 100644
--- a/src/scanner/util.rs
+++ b/src/scanner/util.rs
@@ -37,6 +37,16 @@ const SQLITE_EXTENSIONS: &[&str] = &["db", "sqlite", "sqlite3", "db3", "s3db", "
 #[allow(dead_code)]
 pub const SQLITE_MAGIC: &[u8; 16] = b"SQLite format 3\0";
 
+/// Returns `true` when `path` has a compiled-Python extension (`pyc` or
+/// `pyo`), compared case-insensitively.
+pub fn is_pyc_file(path: &Path) -> bool {
+    path.extension()
+        .and_then(|ext| ext.to_str())
+        .map_or(false, |ext| {
+            ext.eq_ignore_ascii_case("pyc") || ext.eq_ignore_ascii_case("pyo")
+        })
+}
+
 pub fn is_sqlite_file(path: &Path) -> bool {
     if let Some(ext) = path.extension().and_then(|s| s.to_str()) {
         let ext_lower = ext.to_lowercase();