From 1f4ccb8144173b34ff4ab3545d71799bc3f52ccd Mon Sep 17 00:00:00 2001
From: Mick Grove <mick.grove@mongodb.com>
Date: Sun, 22 Feb 2026 23:35:18 -0700
Subject: [PATCH] Automatically extracts and scans SQLite database contents for
 secrets stored in table rows

---
 CHANGELOG.md             |   1 +
 Cargo.toml               |   3 +-
 README.md                |   5 +-
 src/lib.rs               |   1 +
 src/scanner/enumerate.rs |  30 ++++-
 src/scanner/util.rs      |  22 ++++
 src/sqlite.rs            | 260 +++++++++++++++++++++++++++++++++++++++
 tests/smoke_sqlite.rs    |  48 ++++++++
 8 files changed, 365 insertions(+), 5 deletions(-)
 create mode 100644 src/sqlite.rs
 create mode 100644 tests/smoke_sqlite.rs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ca97715..5518c0f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
 All notable changes to this project will be documented in this file.
 
 ## [v1.85.0]
+- SQLite database scanning: kingfisher now detects and extracts SQLite files (`.db`, `.sqlite`, `.sqlite3`, etc.), dumping each table as SQL text with named columns so secrets stored in database rows are scannable. Controlled by the existing `--extract-archives` flag.
 - Performance: pipelined ODB enumeration — scanning now begins while blob OIDs are still being discovered, overlapping I/O with pattern matching.
 - Performance: skip blobs smaller than 20 bytes during enumeration (too small to contain any secret).
 - Performance: preserve pack-ascending blob order in the metadata path for better I/O locality when Rayon splits work.
diff --git a/Cargo.toml b/Cargo.toml
index 5c47363..ebb5e40 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,7 +48,7 @@ http = "1.4"
 
 [package]
 name = "kingfisher"
-version = "1.84.0"
+version = "1.85.0"
 description = "MongoDB's blazingly fast and accurate secret scanning and validation tool"
 edition.workspace = true
 rust-version.workspace = true
@@ -198,6 +198,7 @@ zip = { version = "2.4.2", default-features = false, features = ["deflate", "def
 tar = "0.4.44"
 lzma-rs = "0.3.0"
 asar = "0.3.0"
+rusqlite = { version = "0.34", features = ["bundled"] }
 blake3 = "1.8.2"
 memchr = "2.7"
 memmap2 = "0.9.9"
diff --git a/README.md b/README.md
index 876210d..4ed1404 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,7 @@ Designed for offensive security engineers and blue-teamers alike, Kingfisher hel
 - **Blast Radius Mapping**: instantly map leaked keys to their effective cloud identities and exposed resources with `--access-map`. Supports AWS, GCP, Azure, GitHub, Gitlab, and more token support coming.
 - **Broad AI SaaS coverage**: finds and validates tokens for OpenAI, Anthropic, Google Gemini, Cohere, AWS Bedrock, Voyage AI, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and many more
 - **Compressed Files**: Supports extracting and scanning compressed files for secrets
+- **SQLite Database Scanning**: Automatically extracts and scans SQLite database contents for secrets stored in table rows
 - **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md))
 - **Checksum-aware detection**: verifies tokens with built-in checksums (e.g., GitHub, Confluent, Zuplo) — no API calls required
 - **Built-in Report Viewer**: Visualize and triage findings locally with `kingfisher view ./report-file.json`
@@ -600,7 +601,7 @@ kingfisher scan /tmp/repo --branch feature-1 \
 
 # Lineage and Evolution
 
-Kingfisher began as an internal fork of Nosey Parker, used as a high-performance foundation for secret detection. 
+Kingfisher began as an internal fork of [Nosey Parker](https://github.com/praetorian-inc/noseyparker), used as a high-performance foundation for secret detection. 
 
 Since then it has evolved far beyond that starting point, introducing live validation, hundreds of new rules, additional scan targets, and major architectural changes across nearly every subsystem.
 
@@ -610,7 +611,7 @@ Since then it has evolved far beyond that starting point, introducing live valid
 - **Baseline management** to suppress known findings over time  
 - **Tree-sitter parsing** layered on Hyperscan for language-aware detection  
 - **More scan targets** (GitLab, Bitbucket, Gitea, Jira, Confluence, Slack, S3, GCS, Docker, Hugging Face, etc.)  
-- **Compressed Files** scanning support added
+- **Compressed Files** and **SQLite database** scanning support
 - **New storage model** (in-memory + Bloom filter, replacing SQLite)  
 - **Unified workflow** with JSON/BSON/SARIF outputs  
 - **Cross-platform builds** for Linux, macOS, and Windows
diff --git a/src/lib.rs b/src/lib.rs
index 0ac558e..316560e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -51,6 +51,7 @@ pub mod scanner;
 pub mod scanner_pool;
 pub mod slack;
 pub mod snippet;
+pub mod sqlite;
 pub mod update;
 pub mod util;
 pub mod validation;
diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs
index f8d166e..8dfa895 100644
--- a/src/scanner/enumerate.rs
+++ b/src/scanner/enumerate.rs
@@ -39,9 +39,10 @@ use crate::{
     scanner::{
         processing::BlobProcessor,
         runner::{create_datastore_channel, spawn_datastore_writer_thread},
-        util::is_compressed_file,
+        util::{is_compressed_file, is_sqlite_file},
     },
     scanner_pool::ScannerPool,
+    sqlite::extract_sqlite_contents,
     DirectoryResult, EnumeratorConfig, EnumeratorFileResult, FileResult, FilesystemEnumerator,
     FoundInput, GitDiffConfig, GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator,
     PathBuf,
@@ -335,7 +336,32 @@ impl ParallelBlobIterator for FileResult {
         let extraction_enabled = self.extract_archives;
         let max_extraction_depth = self.extraction_depth;
 
-        if extraction_enabled && is_compressed_file(&self.path) {
+        if extraction_enabled && is_sqlite_file(&self.path) {
+            match extract_sqlite_contents(&self.path) {
+                Ok(tables) if tables.is_empty() => {
+                    debug!("No tables found in SQLite database: {}", self.path.display());
+                    Ok(None)
+                }
+                Ok(tables) => {
+                    let items = tables
+                        .into_iter()
+                        .map(|(logical_name, data)| {
+                            let full_path = self.path.join(logical_name);
+                            let origin = OriginSet::new(Origin::from_file(full_path), vec![]);
+                            (origin, Blob::from_bytes(data))
+                        })
+                        .collect();
+                    Ok(Some(FileResultIter {
+                        iter_kind: FileResultIterKind::Archive(items),
+                        _marker: PhantomData,
+                    }))
+                }
+                Err(e) => {
+                    debug!("Failed to extract SQLite database {}: {e:#}", self.path.display());
+                    Ok(None)
+                }
+            }
+        } else if extraction_enabled && is_compressed_file(&self.path) {
             match decompress_file_to_temp(&self.path) {
                 Ok((content, _temp_dir)) => match content {
                     // Single-file decompression fully in memory.
diff --git a/src/scanner/util.rs b/src/scanner/util.rs
index 53fff6d..41a425b 100644
--- a/src/scanner/util.rs
+++ b/src/scanner/util.rs
@@ -30,3 +30,25 @@ pub fn is_compressed_file(path: &Path) -> bool {
         false
     }
 }
+
+/// File extensions (lowercase, without the dot) recognised by `is_sqlite_file`.
+const SQLITE_EXTENSIONS: &[&str] = &["db", "sqlite", "sqlite3", "db3", "s3db", "sl3"];
+/// SQLite header magic, for spotting extensionless DBs (Chrome `Cookies`, `History`, `Web Data`).
+#[allow(dead_code)]
+pub const SQLITE_MAGIC: &[u8; 16] = b"SQLite format 3\0";
+
+/// Whether `path` is SQLite by extension *and* header magic, so other `.db` files skip extraction.
+pub fn is_sqlite_file(path: &Path) -> bool {
+    let ext = path.extension().and_then(|s| s.to_str()).unwrap_or_default();
+    let mut header = [0u8; 16];
+    SQLITE_EXTENSIONS.iter().any(|e| e.eq_ignore_ascii_case(ext))
+        && std::fs::File::open(path)
+            .is_ok_and(|mut f| std::io::Read::read_exact(&mut f, &mut header).is_ok())
+        && has_sqlite_magic(&header)
+}
+
+/// Report whether `data` begins with the 16-byte SQLite header magic.
+#[allow(dead_code)]
+pub fn has_sqlite_magic(data: &[u8]) -> bool {
+    data.starts_with(SQLITE_MAGIC)
+}
diff --git a/src/sqlite.rs b/src/sqlite.rs
new file mode 100644
index 0000000..3fc1b8e
--- /dev/null
+++ b/src/sqlite.rs
@@ -0,0 +1,260 @@
+use std::fmt::Write as FmtWrite;
+use std::path::Path;
+
+use anyhow::{Context, Result};
+use rusqlite::{Connection, OpenFlags};
+use tracing::debug;
+
+/// Upper bound on rows dumped per table; `dump_table` notes when it truncates.
+const MAX_ROWS_PER_TABLE: usize = 100_000;
+/// Soft cap (256 MiB) on the combined size of all table dumps for one database.
+const MAX_TOTAL_BYTES: usize = 256 * 1024 * 1024;
+
+/// Extract all user tables from a SQLite database as SQL text.
+///
+/// Returns a vec of `(logical_name, sql_text)` pairs, one per table.
+/// Each entry contains the CREATE TABLE statement followed by INSERT
+/// statements with explicit column names so that keyword-based secret
+/// detectors can match column names like "api_key" near their values.
+///
+/// `logical_name` is `<table>.sql` with `/`, `\` and `:` in the table name
+/// replaced by `_`: callers join it onto the database path, and a rooted or
+/// nested name would otherwise attribute findings to an unrelated path.
+///
+/// # Errors
+/// Fails if the database cannot be opened or its table list cannot be read.
+/// A table that fails to dump is logged at debug level and skipped.
+pub fn extract_sqlite_contents(path: &Path) -> Result<Vec<(String, Vec<u8>)>> {
+    let conn = Connection::open_with_flags(path, OpenFlags::SQLITE_OPEN_READ_ONLY)
+        .with_context(|| format!("Failed to open SQLite database: {}", path.display()))?;
+    conn.busy_timeout(std::time::Duration::from_secs(5))?;
+
+    let tables = list_user_tables(&conn)?;
+    if tables.is_empty() {
+        debug!("SQLite database has no user tables: {}", path.display());
+        return Ok(Vec::new());
+    }
+
+    let mut results = Vec::with_capacity(tables.len());
+    let mut total_bytes: usize = 0;
+    for (table_name, create_sql) in &tables {
+        if total_bytes >= MAX_TOTAL_BYTES {
+            debug!(
+                "SQLite extraction hit total size limit ({MAX_TOTAL_BYTES} bytes), \
+                 skipping remaining tables in {}",
+                path.display()
+            );
+            break;
+        }
+        match dump_table(&conn, table_name, create_sql, MAX_TOTAL_BYTES - total_bytes) {
+            Ok(sql_text) => {
+                total_bytes += sql_text.len();
+                let logical_name = format!("{}.sql", table_name.replace(['/', '\\', ':'], "_"));
+                results.push((logical_name, sql_text.into_bytes()));
+            }
+            Err(e) => debug!("Failed to dump table '{table_name}' from {}: {e:#}", path.display()),
+        }
+    }
+    Ok(results)
+}
+
+/// List all user tables (excluding SQLite's reserved `sqlite_*` tables) along
+/// with their CREATE TABLE SQL, ordered by name.
+///
+/// A NULL `sql` entry is returned as an empty string instead of failing the
+/// whole listing.
+fn list_user_tables(conn: &Connection) -> Result<Vec<(String, String)>> {
+    // `_` is a single-character wildcard in LIKE, so it is escaped here;
+    // unescaped, it would also filter out ordinary tables such as `sqlitedata`.
+    let mut stmt = conn.prepare(
+        "SELECT name, sql FROM sqlite_master \
+         WHERE type = 'table' AND name NOT LIKE 'sqlite\\_%' ESCAPE '\\' \
+         ORDER BY name",
+    )?;
+    let tables = stmt
+        .query_map([], |row| {
+            let name: String = row.get(0)?;
+            Ok((name, row.get::<_, Option<String>>(1)?.unwrap_or_default()))
+        })?
+        .collect::<Result<Vec<_>, _>>()?;
+    Ok(tables)
+}
+
+/// Dump a single table as SQL text: the CREATE statement followed by one
+/// `INSERT INTO "<table>" (<columns>) VALUES (...);` line per row.
+///
+/// Stops early, appending a `-- (truncated ...)` comment, after
+/// `MAX_ROWS_PER_TABLE` rows or once the output has reached `remaining_budget`
+/// bytes (checked before each row, so the last row may overshoot).
+///
+/// # Errors
+/// Fails if the columns cannot be listed or a row or value cannot be read.
+fn dump_table(
+    conn: &Connection,
+    table_name: &str,
+    create_sql: &str,
+    remaining_budget: usize,
+) -> Result<String> {
+    let mut out = String::with_capacity(4096);
+    writeln!(out, "{create_sql};")?;
+    let col_names = column_names(conn, table_name)?;
+    if col_names.is_empty() {
+        return Ok(out);
+    }
+    let columns_fragment = col_names
+        .iter()
+        .map(|c| format!("\"{}\"", c.replace('"', "\"\"")))
+        .collect::<Vec<_>>()
+        .join(",");
+    // Quote the table name once and reuse it, so the emitted INSERT escapes
+    // embedded double quotes exactly like the SELECT that reads the rows.
+    let quoted_table = format!("\"{}\"", table_name.replace('"', "\"\""));
+    let insert_prefix = format!("INSERT INTO {quoted_table} ({columns_fragment}) VALUES (");
+    let mut stmt = conn.prepare(&format!("SELECT * FROM {quoted_table}"))?;
+    let mut rows = stmt.query([])?;
+    let mut rows_emitted: usize = 0;
+    while let Some(row) = rows.next()? {
+        if rows_emitted >= MAX_ROWS_PER_TABLE {
+            writeln!(out, "-- (truncated after {MAX_ROWS_PER_TABLE} rows)")?;
+            break;
+        }
+        if out.len() >= remaining_budget {
+            writeln!(out, "-- (truncated: size limit reached)")?;
+            break;
+        }
+        out.push_str(&insert_prefix);
+        for i in 0..col_names.len() {
+            if i > 0 {
+                out.push(',');
+            }
+            write_value(&mut out, row, i)?;
+        }
+        out.push_str(");\n");
+        rows_emitted += 1;
+    }
+    Ok(out)
+}
+
+/// Column names of `table_name`, in the order `SELECT *` yields its values.
+///
+/// Read from a prepared `SELECT * ... LIMIT 0` (only the result-set metadata
+/// is used) rather than `PRAGMA table_info`: the pragma omits generated
+/// columns that `SELECT *` still returns, which would pair dumped values with
+/// the wrong column names.
+///
+/// Fails if the `SELECT` cannot be prepared.
+fn column_names(conn: &Connection, table_name: &str) -> Result<Vec<String>> {
+    let quoted = table_name.replace('"', "\"\"");
+    let stmt = conn.prepare(&format!("SELECT * FROM \"{quoted}\" LIMIT 0"))?;
+    let names = stmt.column_names().into_iter().map(str::to_owned).collect();
+    Ok(names)
+}
+
+/// Append column `idx` of `row` to `out` as a SQL literal: bare NULL/number,
+/// `'text'` with `'` doubled (lossy UTF-8), or `X'..'` upper-case hex for blobs.
+fn write_value(out: &mut String, row: &rusqlite::Row<'_>, idx: usize) -> Result<()> {
+    use rusqlite::types::ValueRef;
+    match row.get_ref(idx)? {
+        ValueRef::Null => out.push_str("NULL"),
+        ValueRef::Integer(int) => write!(out, "{int}")?,
+        ValueRef::Real(real) => write!(out, "{real}")?,
+        ValueRef::Text(raw) => {
+            let escaped = String::from_utf8_lossy(raw).replace('\'', "''");
+            write!(out, "'{escaped}'")?;
+        }
+        ValueRef::Blob(bytes) => {
+            out.push_str("X'");
+            bytes.iter().try_for_each(|byte| write!(out, "{byte:02X}"))?;
+            out.push('\'');
+        }
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::NamedTempFile;
+
+    /// Create a temp-file database by running `sql`; the returned guard keeps
+    /// the file on disk until it is dropped.
+    fn db_with(sql: &str) -> NamedTempFile {
+        let tmp = NamedTempFile::new().unwrap();
+        Connection::open(tmp.path()).unwrap().execute_batch(sql).unwrap();
+        tmp
+    }
+
+    /// Two-table fixture with secret-looking values in `user_info.api_key`.
+    fn create_test_db() -> NamedTempFile {
+        db_with(
+            "CREATE TABLE user_info (id INTEGER PRIMARY KEY, username TEXT, api_key TEXT);
+             INSERT INTO user_info VALUES (1, 'alice', 'ghp_abc123def456ghi789jkl012mno345pqr678');
+             INSERT INTO user_info VALUES (2, 'bob', 'AKIAIOSFODNN7EXAMPLE');
+             CREATE TABLE config (key TEXT, value TEXT);
+             INSERT INTO config VALUES ('db_password', 's3cret!passw0rd');",
+        )
+    }
+
+    #[test]
+    fn extracts_all_tables() {
+        let db = create_test_db();
+        let results = extract_sqlite_contents(db.path()).unwrap();
+        assert_eq!(results.len(), 2);
+
+        let names: Vec<&str> = results.iter().map(|(n, _)| n.as_str()).collect();
+        assert!(names.contains(&"config.sql"));
+        assert!(names.contains(&"user_info.sql"));
+    }
+
+    #[test]
+    fn output_contains_column_names_and_values() {
+        let db = create_test_db();
+        let results = extract_sqlite_contents(db.path()).unwrap();
+
+        let user_info = results.iter().find(|(n, _)| n == "user_info.sql").unwrap();
+        let sql = String::from_utf8_lossy(&user_info.1);
+
+        assert!(sql.contains("CREATE TABLE"));
+        assert!(sql.contains("\"api_key\""));
+        assert!(sql.contains("ghp_abc123def456ghi789jkl012mno345pqr678"));
+        assert!(sql.contains("INSERT INTO"));
+    }
+
+    /// A table without rows still yields its CREATE statement, but no INSERTs.
+    #[test]
+    fn handles_empty_table() {
+        let db = db_with("CREATE TABLE empty_table (id INTEGER);");
+        let results = extract_sqlite_contents(db.path()).unwrap();
+        assert_eq!(results.len(), 1);
+        let sql = String::from_utf8_lossy(&results[0].1);
+        assert!(sql.contains("CREATE TABLE"));
+        assert!(!sql.contains("INSERT INTO"));
+    }
+
+    /// A valid database with no user tables produces no logical files.
+    #[test]
+    fn handles_database_without_tables() {
+        let db = db_with("CREATE TABLE t (id INTEGER); DROP TABLE t;");
+        assert!(extract_sqlite_contents(db.path()).unwrap().is_empty());
+    }
+
+    #[test]
+    fn handles_nonexistent_file() {
+        let result = extract_sqlite_contents(Path::new("/nonexistent/database.db"));
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn handles_special_characters_in_values() {
+        let db = db_with(
+            "CREATE TABLE t (id INTEGER PRIMARY KEY, val TEXT);
+             INSERT INTO t VALUES (1, 'it''s a test');
+             INSERT INTO t VALUES (2, NULL);",
+        );
+
+        let results = extract_sqlite_contents(db.path()).unwrap();
+        let sql = String::from_utf8_lossy(&results[0].1);
+        assert!(sql.contains("'it''s a test'"));
+        assert!(sql.contains("NULL"));
+    }
+}
diff --git a/tests/smoke_sqlite.rs b/tests/smoke_sqlite.rs
new file mode 100644
index 0000000..e187abe
--- /dev/null
+++ b/tests/smoke_sqlite.rs
@@ -0,0 +1,48 @@
+use assert_cmd::prelude::*;
+
+/// End-to-end smoke test: a GitHub PAT stored in a SQLite row is reported by
+/// `kingfisher scan`, and the finding's path names the per-table logical file.
+#[test]
+fn smoke_scan_sqlite_database() -> anyhow::Result<()> {
+    use rusqlite::Connection;
+    use std::process::Command;
+
+    let github_pat = "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";
+    let expected_exit_code = 200;
+
+    let dir = tempfile::tempdir()?;
+    let db_path = dir.path().join("secrets.db");
+    // Scope the connection so it is dropped before the scanner opens the file.
+    {
+        let conn = Connection::open(&db_path)?;
+        conn.execute_batch(&format!(
+            "CREATE TABLE user_info (id INTEGER PRIMARY KEY, username TEXT, api_key TEXT);
+             INSERT INTO user_info VALUES (1, 'alice', '{github_pat}');"
+        ))?;
+    }
+
+    // The secret should be found and the path should reference the table-level
+    // logical file (secrets.db/user_info.sql).
+    // NOTE(review): no extraction flag is passed, so this presumably relies on
+    // extraction being enabled by default — confirm.
+    let assertion = Command::new(assert_cmd::cargo::cargo_bin!("kingfisher"))
+        .args([
+            "scan",
+            db_path.to_str().unwrap(),
+            "--confidence=low",
+            "--format",
+            "json",
+            "--no-update-check",
+        ])
+        .assert()
+        .code(expected_exit_code)
+        .stdout(predicates::str::contains(github_pat));
+    let stdout = String::from_utf8_lossy(&assertion.get_output().stdout);
+    assert!(
+        stdout.contains("user_info.sql"),
+        "Expected table-level path in finding, got: {stdout}"
+    );
+
+    dir.close()?;
+    Ok(())
+}