- Added '--repo-artifacts' flag to scan repository issues, gists/snippets, and wikis when cloning via '--git-url'

This commit is contained in:
Mick Grove 2025-08-20 20:41:11 -07:00
commit 6e4c94ddc3
19 changed files with 470 additions and 22 deletions

View file

@ -1,4 +1,10 @@
use std::{env, sync::Arc, time::Duration};
use std::{
collections::HashSet,
env, fs,
path::{Path, PathBuf},
sync::{Arc, Mutex},
time::Duration,
};
use anyhow::{Context, Result};
use indicatif::{ProgressBar, ProgressStyle};
@ -7,8 +13,12 @@ use octorust::{
types::{Order, ReposListOrgSort, ReposListOrgType, ReposListUserType},
Client,
};
use serde_json::Value;
use url::Url;
use crate::{findings_store, git_url::GitUrl};
use std::str::FromStr;
#[derive(Debug)]
pub struct RepoSpecifiers {
pub user: Vec<String>,
@ -161,3 +171,190 @@ pub async fn list_repositories(
}
Ok(())
}
/// Splits a repository URL into its `(host, owner, repo)` parts.
///
/// The owner and repository are the first and second path segments of the
/// URL. A trailing `.git` on the repository segment is removed.
///
/// Returns `None` when the URL does not parse, has no host, cannot expose
/// path segments, or has fewer than two path segments.
fn parse_repo(repo_url: &GitUrl) -> Option<(String, String, String)> {
    let parsed = Url::parse(repo_url.as_str()).ok()?;
    let host = parsed.host_str()?.to_owned();

    let mut parts = parsed.path_segments()?;
    let owner = parts.next()?.to_owned();
    let raw_repo = parts.next()?;
    // Only a literal ".git" suffix is dropped; any other name is kept verbatim.
    let repo = raw_repo.strip_suffix(".git").unwrap_or(raw_repo).to_owned();

    Some((host, owner, repo))
}
/// Derives the `<repo>.wiki.git` URL that sits next to `repo_url`.
///
/// The result always uses the `https` scheme together with the host, owner
/// and repository name extracted from `repo_url`.
///
/// Returns `None` when `repo_url` cannot be split into host/owner/repo or
/// when the derived string is rejected by [`GitUrl::from_str`].
pub fn wiki_url(repo_url: &GitUrl) -> Option<GitUrl> {
    parse_repo(repo_url).and_then(|(host, owner, repo)| {
        let candidate = format!("https://{host}/{owner}/{repo}.wiki.git");
        GitUrl::from_str(&candidate).ok()
    })
}
/// Downloads the issues of a GitHub repository and the gists of its owner
/// into `output_root`, registering a web link for every file written.
///
/// Layout on disk:
/// * `output_root/github_issues/<owner>/<repo>/issue_<number>.md` — one file
///   per issue, containing `# <title>` followed by the issue body.
/// * `output_root/github_gists/<owner>/<gist id>/<file name>` — one file per
///   gist file whose `content` is present in the gist detail response.
///
/// If the `KF_GITHUB_TOKEN` environment variable is set and non-empty it is
/// sent as a bearer token on every request. With a token, gists returned by
/// the authenticated `/gists` listing are downloaded as well when their
/// owner login equals the repository owner.
///
/// # Arguments
/// * `repo_url` - repository URL; only its owner and repo segments are used.
/// * `ignore_certs` - when `true`, invalid TLS certificates are accepted.
/// * `output_root` - directory under which the artifact trees are created.
/// * `datastore` - findings store that receives a `(file path, URL)` link for
///   every file written.
///
/// # Returns
/// The issue and/or gist directories that ended up non-empty.
///
/// # Errors
/// Fails when `repo_url` cannot be parsed, the HTTP client cannot be built,
/// a request or JSON decode fails, or a directory/file cannot be written.
/// A non-success HTTP status on a listing page is not an error; it simply
/// ends that listing.
///
/// # Panics
/// Panics if the `datastore` mutex is poisoned.
pub async fn fetch_repo_items(
    repo_url: &GitUrl,
    ignore_certs: bool,
    output_root: &Path,
    datastore: &Arc<Mutex<findings_store::FindingsStore>>,
) -> Result<Vec<PathBuf>> {
    let (_, owner, repo) = parse_repo(repo_url).context("invalid GitHub repo URL")?;
    let client = reqwest::Client::builder().danger_accept_invalid_certs(ignore_certs).build()?;
    // Read the token once instead of hitting the environment for every request.
    let token_value = env::var("KF_GITHUB_TOKEN").ok().filter(|t| !t.is_empty());
    let token = token_value.as_deref();
    let mut dirs = Vec::new();

    // Issues
    let issues_dir = output_root.join("github_issues").join(&owner).join(&repo);
    fs::create_dir_all(&issues_dir)?;
    let mut page = 1u32;
    loop {
        let url = format!(
            "https://api.github.com/repos/{owner}/{repo}/issues?state=all&per_page=100&page={page}"
        );
        let issues = match fetch_json_page(&client, &url, token).await? {
            Some(items) => items,
            None => break,
        };
        for issue in &issues {
            let number = issue.get("number").and_then(|v| v.as_u64()).unwrap_or(0);
            let title = issue.get("title").and_then(|v| v.as_str()).unwrap_or("");
            let body = issue.get("body").and_then(|v| v.as_str()).unwrap_or("");
            let content = format!("# {title}\n\n{body}");
            let file_path = issues_dir.join(format!("issue_{number}.md"));
            fs::write(&file_path, content)?;
            let url = format!("https://github.com/{owner}/{repo}/issues/{number}");
            // The guard is a temporary, so the lock is released at the end of the statement.
            datastore.lock().unwrap().register_repo_link(file_path, url);
        }
        page += 1;
    }
    if issues_dir.read_dir().ok().and_then(|mut d| d.next()).is_some() {
        dirs.push(issues_dir);
    }

    // Gists
    let gists_dir = output_root.join("github_gists").join(&owner);
    fs::create_dir_all(&gists_dir)?;
    // Gist ids already downloaded, so a gist present in both listings is fetched once.
    let mut seen = HashSet::new();

    // Public gists for the owner
    let mut page = 1u32;
    loop {
        let url = format!("https://api.github.com/users/{owner}/gists?per_page=100&page={page}");
        let gists = match fetch_json_page(&client, &url, token).await? {
            Some(items) => items,
            None => break,
        };
        for gist in &gists {
            if let Some(id) = gist.get("id").and_then(|v| v.as_str()) {
                if seen.insert(id.to_string()) {
                    download_gist(&client, token, id, &gists_dir, datastore).await?;
                }
            }
        }
        page += 1;
    }

    // Private gists for authenticated user if they own the repo
    if token.is_some() {
        let mut page = 1u32;
        loop {
            let url = format!("https://api.github.com/gists?per_page=100&page={page}");
            let gists = match fetch_json_page(&client, &url, token).await? {
                Some(items) => items,
                None => break,
            };
            for gist in &gists {
                let owner_login =
                    gist.get("owner").and_then(|o| o.get("login")).and_then(|v| v.as_str());
                if owner_login != Some(owner.as_str()) {
                    continue;
                }
                if let Some(id) = gist.get("id").and_then(|v| v.as_str()) {
                    if seen.insert(id.to_string()) {
                        download_gist(&client, token, id, &gists_dir, datastore).await?;
                    }
                }
            }
            page += 1;
        }
    }
    if gists_dir.read_dir().ok().and_then(|mut d| d.next()).is_some() {
        dirs.push(gists_dir);
    }

    Ok(dirs)
}

/// Builds a GET request carrying the `kingfisher` user agent and, when a
/// token is supplied, bearer authentication.
fn github_api_get(
    client: &reqwest::Client,
    url: &str,
    token: Option<&str>,
) -> reqwest::RequestBuilder {
    let req = client.get(url).header("User-Agent", "kingfisher");
    match token {
        Some(t) => req.bearer_auth(t),
        None => req,
    }
}

/// Fetches one page of a JSON-array listing.
///
/// Returns `Ok(None)` when the response status is not a success or the page
/// is empty, which callers treat as the end of pagination.
async fn fetch_json_page(
    client: &reqwest::Client,
    url: &str,
    token: Option<&str>,
) -> Result<Option<Vec<Value>>> {
    let resp = github_api_get(client, url, token).send().await?;
    if !resp.status().is_success() {
        return Ok(None);
    }
    let items: Vec<Value> = resp.json().await?;
    Ok(if items.is_empty() { None } else { Some(items) })
}

/// Reduces a server-supplied name to its final path component.
///
/// Returns `None` for names with no usable final component (empty, `.`,
/// `..`, a bare root), so the result can never escape the directory it is
/// joined onto.
fn safe_file_name(name: &str) -> Option<PathBuf> {
    Path::new(name).file_name().map(PathBuf::from)
}

/// Downloads a single gist and writes each file that has `content` into
/// `gists_dir/<id>/`, registering a `https://gist.github.com/<id>` link for
/// every file written.
///
/// Gist ids and file names come from the HTTP response, so both are passed
/// through [`safe_file_name`] before touching the filesystem; entries without
/// a usable name are skipped.
async fn download_gist(
    client: &reqwest::Client,
    token: Option<&str>,
    id: &str,
    gists_dir: &Path,
    datastore: &Arc<Mutex<findings_store::FindingsStore>>,
) -> Result<()> {
    let gist_dir = match safe_file_name(id) {
        Some(dir_name) => gists_dir.join(dir_name),
        None => return Ok(()),
    };
    let detail: Value = github_api_get(client, &format!("https://api.github.com/gists/{id}"), token)
        .send()
        .await?
        .json()
        .await?;
    let files = match detail.get("files").and_then(|v| v.as_object()) {
        Some(files) => files,
        None => return Ok(()),
    };
    fs::create_dir_all(&gist_dir)?;
    for (fname, fobj) in files {
        let content = match fobj.get("content").and_then(|v| v.as_str()) {
            Some(content) => content,
            None => continue,
        };
        let file_path = match safe_file_name(fname) {
            Some(safe) => gist_dir.join(safe),
            None => continue,
        };
        fs::write(&file_path, content)?;
        let url = format!("https://gist.github.com/{id}");
        // The guard is a temporary, so the lock is released at the end of the statement.
        datastore.lock().unwrap().register_repo_link(file_path, url);
    }
    Ok(())
}