use std::{ collections::HashSet, env, fs, path::{Path, PathBuf}, sync::{Arc, Mutex}, time::Duration, }; use anyhow::{Context, Result}; use globset::{Glob, GlobSet, GlobSetBuilder}; use indicatif::{ProgressBar, ProgressStyle}; use octorust::{ auth::Credentials, types::{Order, ReposListOrgSort, ReposListOrgType, ReposListUserType}, Client, }; use serde_json::Value; use tracing::warn; use url::Url; use crate::{findings_store, git_url::GitUrl, validation::GLOBAL_USER_AGENT}; use std::str::FromStr; #[derive(Debug)] pub struct RepoSpecifiers { pub user: Vec, pub organization: Vec, pub all_organizations: bool, pub repo_filter: RepoType, pub exclude_repos: Vec, } impl RepoSpecifiers { pub fn is_empty(&self) -> bool { self.user.is_empty() && self.organization.is_empty() && !self.all_organizations } } #[derive(Debug, Clone)] pub enum RepoType { All, Source, Fork, } impl From for ReposListUserType { fn from(repo_type: RepoType) -> Self { match repo_type { RepoType::All => ReposListUserType::All, RepoType::Source => ReposListUserType::Owner, RepoType::Fork => ReposListUserType::Member, } } } impl From for ReposListOrgType { fn from(repo_type: RepoType) -> Self { match repo_type { RepoType::All => ReposListOrgType::All, RepoType::Source => ReposListOrgType::Sources, RepoType::Fork => ReposListOrgType::Forks, } } } fn normalize_repo_identifier(owner: &str, repo: &str) -> Option { let owner = owner.trim().trim_matches('/'); let repo = repo.trim().trim_matches('/'); let repo = repo.strip_suffix(".git").unwrap_or(repo); if owner.is_empty() || repo.is_empty() { return None; } Some(format!("{}/{}", owner.to_lowercase(), repo.to_lowercase())) } fn parse_repo_name_from_path(path: &str) -> Option { let trimmed = path.trim().trim_matches('/'); if trimmed.is_empty() { return None; } let mut parts = trimmed.split('/'); let owner = parts.next()?; let repo = parts.next()?; if parts.next().is_some() { return None; } normalize_repo_identifier(owner, repo) } fn parse_repo_name_from_url(repo_url: &str) -> Option { let url = Url::parse(repo_url).ok()?; parse_repo_name_from_path(url.path()) } fn parse_excluded_repo(raw: &str) -> Option { let trimmed = raw.trim(); if trimmed.is_empty() { return None; } if let Some(name) = parse_repo_name_from_url(trimmed) { return Some(name); } if let Some(idx) = trimmed.rfind(':') { if let Some(name) = parse_repo_name_from_path(&trimmed[idx + 1..]) { return Some(name); } } parse_repo_name_from_path(trimmed) } struct ExcludeMatcher { exact: HashSet, globs: Option, } impl ExcludeMatcher { fn is_empty(&self) -> bool { self.exact.is_empty() && self.globs.is_none() } fn matches(&self, name: &str) -> bool { if self.exact.contains(name) { return true; } if let Some(globs) = &self.globs { return globs.is_match(name); } false } } fn looks_like_glob(pattern: &str) -> bool { pattern.contains('*') || pattern.contains('?') || pattern.contains('[') } fn build_exclude_matcher(exclude_repos: &[String]) -> ExcludeMatcher { let mut exact = HashSet::new(); let mut glob_builder = GlobSetBuilder::new(); let mut has_glob = false; for raw in exclude_repos { match parse_excluded_repo(raw) { Some(name) => { if looks_like_glob(&name) { match Glob::new(&name) { Ok(glob) => { glob_builder.add(glob); has_glob = true; } Err(err) => { warn!("Ignoring invalid GitHub exclusion pattern '{raw}': {err}"); exact.insert(name); } } } else { exact.insert(name); } } None => { warn!("Ignoring invalid GitHub exclusion '{raw}' (expected owner/repo)"); } } } let globs = if has_glob { match glob_builder.build() { Ok(set) => Some(set), Err(err) => { warn!("Failed to build GitHub exclusion patterns: {err}"); None } } } else { None }; ExcludeMatcher { exact, globs } } fn should_exclude_repo(clone_url: &str, excludes: &ExcludeMatcher) -> bool { if excludes.is_empty() { return false; } if let Some(name) = parse_repo_name_from_url(clone_url) { return excludes.matches(&name); } false } fn create_github_client(github_url: &url::Url, ignore_certs: bool) -> Result> { // Try personal access token let credentials = if let Ok(token) = env::var("KF_GITHUB_TOKEN") { Credentials::Token(token) } else { Credentials::Token("".to_string()) // Anonymous access }; let mut client_builder = reqwest::Client::builder(); if ignore_certs { client_builder = client_builder.danger_accept_invalid_certs(ignore_certs); } let reqwest_client = client_builder.build().context("Failed to build HTTP client")?; let http_client = reqwest_middleware::ClientBuilder::new(reqwest_client).build(); let mut client = Client::custom( concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION")), credentials, http_client, ); // Override host if not using api.github.com if github_url.host_str() != Some("api.github.com") { client.with_host_override(github_url.as_str()); } Ok(Arc::new(client)) } pub async fn enumerate_repo_urls( repo_specifiers: &RepoSpecifiers, github_url: url::Url, ignore_certs: bool, mut progress: Option<&mut ProgressBar>, ) -> Result> { let client = create_github_client(&github_url, ignore_certs)?; let mut repo_urls = Vec::new(); let exclude_set = build_exclude_matcher(&repo_specifiers.exclude_repos); let user_repo_type: ReposListUserType = repo_specifiers.repo_filter.clone().into(); let org_repo_type: ReposListOrgType = repo_specifiers.repo_filter.clone().into(); for username in &repo_specifiers.user { let repos = client .repos() .list_all_for_user( username, user_repo_type.clone(), ReposListOrgSort::Created, Order::Desc, ) .await?; repo_urls.extend(repos.body.into_iter().filter_map(|repo| { let clone_url = repo.clone_url; if should_exclude_repo(&clone_url, &exclude_set) { None } else { Some(clone_url) } })); if let Some(progress) = progress.as_mut() { progress.inc(1); } } let orgs = if repo_specifiers.all_organizations { let mut all_orgs = Vec::new(); let org_list = client.orgs().list_all(100).await?; all_orgs.extend(org_list.body.into_iter().map(|org| org.login)); all_orgs } else { repo_specifiers.organization.clone() }; for org_name in orgs { let repos = client .repos() .list_all_for_org( &org_name, org_repo_type.clone(), ReposListOrgSort::Created, Order::Desc, ) .await?; repo_urls.extend(repos.body.into_iter().filter_map(|repo| { let clone_url = repo.clone_url; if should_exclude_repo(&clone_url, &exclude_set) { None } else { Some(clone_url) } })); if let Some(progress) = progress.as_mut() { progress.inc(1); } } repo_urls.sort(); repo_urls.dedup(); Ok(repo_urls) } pub async fn list_repositories( api_url: Url, ignore_certs: bool, progress_enabled: bool, users: &[String], orgs: &[String], all_orgs: bool, exclude_repos: &[String], repo_filter: RepoType, ) -> Result<()> { let repo_specifiers = RepoSpecifiers { user: users.to_vec(), organization: orgs.to_vec(), all_organizations: all_orgs, repo_filter, exclude_repos: exclude_repos.to_vec(), }; // Create a progress bar just for displaying status // let mut progress = ProgressBar::new_spinner("Fetching repositories...", // true,); let mut progress = if progress_enabled { let style = ProgressStyle::with_template("{spinner} {msg} [{elapsed_precise}]") .expect("progress bar style template should compile"); let pb = ProgressBar::new_spinner().with_style(style).with_message("Fetching repositories"); pb.enable_steady_tick(Duration::from_millis(500)); pb } else { ProgressBar::hidden() }; let repo_urls = enumerate_repo_urls(&repo_specifiers, api_url, ignore_certs, Some(&mut progress)).await?; // Print repositories for url in repo_urls { println!("{}", url); } Ok(()) } fn parse_repo(repo_url: &GitUrl) -> Option<(String, String, String)> { let url = Url::parse(repo_url.as_str()).ok()?; let host = url.host_str()?.to_string(); let mut segments = url.path_segments()?; let owner = segments.next()?.to_string(); let mut repo = segments.next()?.to_string(); if let Some(stripped) = repo.strip_suffix(".git") { repo = stripped.to_string(); } Some((host, owner, repo)) } pub fn wiki_url(repo_url: &GitUrl) -> Option { let (host, owner, repo) = parse_repo(repo_url)?; let wiki = format!("https://{host}/{owner}/{repo}.wiki.git"); GitUrl::from_str(&wiki).ok() } pub async fn fetch_repo_items( repo_url: &GitUrl, ignore_certs: bool, output_root: &Path, datastore: &Arc>, ) -> Result> { let (_, owner, repo) = parse_repo(repo_url).context("invalid GitHub repo URL")?; let client = reqwest::Client::builder().danger_accept_invalid_certs(ignore_certs).build()?; let mut dirs = Vec::new(); // Issues let issues_dir = output_root.join("github_issues").join(&owner).join(&repo); fs::create_dir_all(&issues_dir)?; let mut page = 1; loop { let url = format!( "https://api.github.com/repos/{owner}/{repo}/issues?state=all&per_page=100&page={page}" ); let mut req = client.get(&url).header("User-Agent", GLOBAL_USER_AGENT.as_str()); if let Ok(token) = env::var("KF_GITHUB_TOKEN") { if !token.is_empty() { req = req.bearer_auth(token); } } let resp = req.send().await?; if !resp.status().is_success() { break; } let issues: Vec = resp.json().await?; if issues.is_empty() { break; } for issue in issues { let number = issue.get("number").and_then(|v| v.as_u64()).unwrap_or(0); let title = issue.get("title").and_then(|v| v.as_str()).unwrap_or(""); let body = issue.get("body").and_then(|v| v.as_str()).unwrap_or(""); let content = format!("# {title}\n\n{body}"); let file_path = issues_dir.join(format!("issue_{number}.md")); fs::write(&file_path, content)?; let url = format!("https://github.com/{owner}/{repo}/issues/{number}"); let mut ds = datastore.lock().unwrap(); ds.register_repo_link(file_path, url); } page += 1; } if issues_dir.read_dir().ok().and_then(|mut d| d.next()).is_some() { dirs.push(issues_dir); } // Gists let gists_dir = output_root.join("github_gists").join(&owner); fs::create_dir_all(&gists_dir)?; let mut seen = HashSet::new(); // Public gists for the owner page = 1; loop { let url = format!("https://api.github.com/users/{owner}/gists?per_page=100&page={page}"); let mut req = client.get(&url).header("User-Agent", GLOBAL_USER_AGENT.as_str()); if let Ok(token) = env::var("KF_GITHUB_TOKEN") { if !token.is_empty() { req = req.bearer_auth(&token); } } let resp = req.send().await?; if !resp.status().is_success() { break; } let gists: Vec = resp.json().await?; if gists.is_empty() { break; } for gist in gists { if let Some(id) = gist.get("id").and_then(|v| v.as_str()) { if seen.insert(id.to_string()) { let mut req_g = client .get(&format!("https://api.github.com/gists/{id}")) .header("User-Agent", GLOBAL_USER_AGENT.as_str()); if let Ok(token) = env::var("KF_GITHUB_TOKEN") { if !token.is_empty() { req_g = req_g.bearer_auth(&token); } } let detail: Value = req_g.send().await?.json().await?; if let Some(files) = detail.get("files").and_then(|v| v.as_object()) { let gist_dir = gists_dir.join(id); fs::create_dir_all(&gist_dir)?; for (fname, fobj) in files { if let Some(content) = fobj.get("content").and_then(|v| v.as_str()) { let file_path = gist_dir.join(fname); fs::write(&file_path, content)?; let url = format!("https://gist.github.com/{id}"); let mut ds = datastore.lock().unwrap(); ds.register_repo_link(file_path, url); } } } } } } page += 1; } // Private gists for authenticated user if they own the repo if let Ok(token) = env::var("KF_GITHUB_TOKEN") { if !token.is_empty() { page = 1; loop { let url = format!("https://api.github.com/gists?per_page=100&page={page}"); let resp = client .get(&url) .header("User-Agent", GLOBAL_USER_AGENT.as_str()) .bearer_auth(&token) .send() .await?; if !resp.status().is_success() { break; } let gists: Vec = resp.json().await?; if gists.is_empty() { break; } for gist in gists { let owner_login = gist.get("owner").and_then(|o| o.get("login")).and_then(|v| v.as_str()); if owner_login == Some(owner.as_str()) { if let Some(id) = gist.get("id").and_then(|v| v.as_str()) { if seen.insert(id.to_string()) { let detail: Value = client .get(&format!("https://api.github.com/gists/{id}")) .header("User-Agent", GLOBAL_USER_AGENT.as_str()) .bearer_auth(&token) .send() .await? .json() .await?; if let Some(files) = detail.get("files").and_then(|v| v.as_object()) { let gist_dir = gists_dir.join(id); fs::create_dir_all(&gist_dir)?; for (fname, fobj) in files { if let Some(content) = fobj.get("content").and_then(|v| v.as_str()) { let file_path = gist_dir.join(fname); fs::write(&file_path, content)?; let url = format!("https://gist.github.com/{id}"); let mut ds = datastore.lock().unwrap(); ds.register_repo_link(file_path, url); } } } } } } } page += 1; } } } if gists_dir.read_dir().ok().and_then(|mut d| d.next()).is_some() { dirs.push(gists_dir); } Ok(dirs) } #[cfg(test)] mod tests { use super::*; #[test] fn parse_excluded_repo_variants() { assert_eq!(parse_excluded_repo("Owner/Repo").as_deref(), Some("owner/repo")); assert_eq!(parse_excluded_repo("owner/repo.git").as_deref(), Some("owner/repo")); assert_eq!( parse_excluded_repo("https://github.com/Owner/Repo.git").as_deref(), Some("owner/repo") ); assert_eq!( parse_excluded_repo("git@github.com:Owner/Repo.git").as_deref(), Some("owner/repo") ); assert_eq!( parse_excluded_repo("ssh://git@github.example.com/Owner/Repo.git").as_deref(), Some("owner/repo") ); assert_eq!( parse_excluded_repo(" https://github.com/Owner/Repo ").as_deref(), Some("owner/repo") ); assert_eq!(parse_excluded_repo("not-a-repo"), None); } #[test] fn should_exclude_repo_matches_normalized_names() { let excludes = build_exclude_matcher(&vec!["Owner/Repo".to_string()]); assert!(should_exclude_repo("https://github.com/owner/repo.git", &excludes)); assert!(!should_exclude_repo("https://github.com/owner/other.git", &excludes)); } #[test] fn should_exclude_repo_matches_ssh_urls() { let excludes = build_exclude_matcher(&vec!["owner/repo".to_string()]); assert!(should_exclude_repo("ssh://git@github.example.com/owner/repo.git", &excludes)); } #[test] fn should_exclude_repo_matches_globs() { let excludes = build_exclude_matcher(&vec!["owner/*-archive".to_string()]); assert!(should_exclude_repo("https://github.com/owner/project-archive.git", &excludes)); assert!(!should_exclude_repo("https://github.com/owner/project.git", &excludes)); } }