use std::{ collections::HashSet, env, fs, path::{Path, PathBuf}, sync::{Arc, Mutex}, time::Duration, }; use anyhow::{Context, Result}; use indicatif::{ProgressBar, ProgressStyle}; use octorust::{ auth::Credentials, types::{Order, ReposListOrgSort, ReposListOrgType, ReposListUserType}, Client, }; use reqwest::StatusCode; use serde::Deserialize; use serde_json::Value; use tracing::{info, warn}; use url::Url; use crate::{findings_store, git_url::GitUrl, validation::GLOBAL_USER_AGENT}; use std::str::FromStr; #[derive(Deserialize)] struct GitHubContributor { login: Option, } #[derive(Deserialize)] struct GitHubRepo { clone_url: String, } #[derive(Debug)] pub struct RepoSpecifiers { pub user: Vec, pub organization: Vec, pub all_organizations: bool, pub repo_filter: RepoType, pub exclude_repos: Vec, } impl RepoSpecifiers { pub fn is_empty(&self) -> bool { self.user.is_empty() && self.organization.is_empty() && !self.all_organizations } } #[derive(Debug, Clone)] pub enum RepoType { All, Source, Fork, } impl From for ReposListUserType { fn from(repo_type: RepoType) -> Self { match repo_type { RepoType::All => ReposListUserType::All, RepoType::Source => ReposListUserType::Owner, RepoType::Fork => ReposListUserType::Member, } } } impl From for ReposListOrgType { fn from(repo_type: RepoType) -> Self { match repo_type { RepoType::All => ReposListOrgType::All, RepoType::Source => ReposListOrgType::Sources, RepoType::Fork => ReposListOrgType::Forks, } } } fn normalize_repo_identifier(owner: &str, repo: &str) -> Option { let owner = owner.trim().trim_matches('/'); let repo = repo.trim().trim_matches('/'); let repo = repo.strip_suffix(".git").unwrap_or(repo); if owner.is_empty() || repo.is_empty() { return None; } Some(format!("{}/{}", owner.to_lowercase(), repo.to_lowercase())) } fn parse_repo_name_from_path(path: &str) -> Option { let trimmed = path.trim().trim_matches('/'); if trimmed.is_empty() { return None; } let mut parts = trimmed.split('/'); let owner = parts.next()?; let repo = parts.next()?; if parts.next().is_some() { return None; } normalize_repo_identifier(owner, repo) } fn parse_repo_name_from_url(repo_url: &str) -> Option { let url = Url::parse(repo_url).ok()?; parse_repo_name_from_path(url.path()) } fn parse_excluded_repo(raw: &str) -> Option { let trimmed = raw.trim(); if trimmed.is_empty() { return None; } if let Some(name) = parse_repo_name_from_url(trimmed) { return Some(name); } if let Some(idx) = trimmed.rfind(':') { if let Some(name) = parse_repo_name_from_path(&trimmed[idx + 1..]) { return Some(name); } } parse_repo_name_from_path(trimmed) } use crate::git_host; fn build_exclude_matcher(exclude_repos: &[String]) -> git_host::ExcludeMatcher { git_host::build_exclude_matcher(exclude_repos, |raw| parse_excluded_repo(raw), "GitHub") } fn should_exclude_repo(clone_url: &str, excludes: &git_host::ExcludeMatcher) -> bool { git_host::should_exclude_repo(clone_url, excludes, parse_repo_name_from_url) } fn create_github_client(github_url: &url::Url, ignore_certs: bool) -> Result> { // Try personal access token let credentials = if let Ok(token) = env::var("KF_GITHUB_TOKEN") { Credentials::Token(token) } else { Credentials::Token("".to_string()) // Anonymous access }; let mut client_builder = reqwest::Client::builder(); if ignore_certs { client_builder = client_builder.danger_accept_invalid_certs(ignore_certs); } let reqwest_client = client_builder.build().context("Failed to build HTTP client")?; let http_client = reqwest_middleware_octorust::ClientBuilder::new(reqwest_client).build(); let mut client = Client::custom( concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION")), credentials, http_client, ); // Override host if not using api.github.com if github_url.host_str() != Some("api.github.com") { client.with_host_override(github_url.as_str()); } Ok(Arc::new(client)) } fn normalize_api_base(api_url: &Url) -> Url { let mut base = api_url.clone(); if !base.path().ends_with('/') { let path = format!("{}/", base.path()); base.set_path(&path); } base } pub async fn enumerate_contributor_repo_urls( repo_url: &GitUrl, github_api_url: &Url, ignore_certs: bool, exclude_repos: &[String], repo_clone_limit: Option, progress_enabled: bool, ) -> Result> { let (_, owner, repo) = parse_repo(repo_url).context("invalid GitHub repo URL")?; let exclude_set = build_exclude_matcher(exclude_repos); let client = reqwest::Client::builder().danger_accept_invalid_certs(ignore_certs).build()?; let token = env::var("KF_GITHUB_TOKEN").ok().filter(|t| !t.is_empty()); let api_base = normalize_api_base(github_api_url); let mut contributor_logins = Vec::new(); let mut seen_contributors = HashSet::new(); let mut page = 1; loop { let mut url = api_base .join(&format!("repos/{owner}/{repo}/contributors")) .context("Failed to build GitHub contributors URL")?; url.query_pairs_mut().append_pair("per_page", "100").append_pair("page", &page.to_string()); let mut req = client.get(url).header("User-Agent", GLOBAL_USER_AGENT.as_str()); if let Some(token) = token.as_ref() { req = req.bearer_auth(token); } let resp = req.send().await?; if !resp.status().is_success() { warn_on_rate_limit("GitHub", resp.status(), "listing contributors"); break; } let contributors: Vec = resp.json().await?; if contributors.is_empty() { break; } for contributor in contributors { if let Some(login) = contributor.login { if seen_contributors.insert(login.clone()) { contributor_logins.push(login); } } } page += 1; } let (per_user_limit, total_limit) = determine_contributor_repo_limits(repo_clone_limit, contributor_logins.len(), "GitHub"); let progress = build_contributor_progress_bar( progress_enabled, contributor_logins.len() as u64, "Enumerating GitHub contributor repositories...", ); let mut repo_urls = Vec::new(); let mut total_repo_count = 0usize; for login in contributor_logins { if let Some(total_limit) = total_limit { if total_repo_count >= total_limit { break; } } let mut user_repo_count = 0usize; page = 1; loop { if let Some(per_user_limit) = per_user_limit { if user_repo_count >= per_user_limit { break; } } if let Some(total_limit) = total_limit { if total_repo_count >= total_limit { break; } } let mut url = api_base .join(&format!("users/{login}/repos")) .context("Failed to build GitHub user repos URL")?; url.query_pairs_mut() .append_pair("per_page", "100") .append_pair("page", &page.to_string()) .append_pair("type", "all") .append_pair("sort", "updated") .append_pair("direction", "desc"); let mut req = client.get(url).header("User-Agent", GLOBAL_USER_AGENT.as_str()); if let Some(token) = token.as_ref() { req = req.bearer_auth(token); } let resp = req.send().await?; if !resp.status().is_success() { warn_on_rate_limit("GitHub", resp.status(), "listing user repositories"); break; } let repos: Vec = resp.json().await?; if repos.is_empty() { break; } for repo in repos { if let Some(per_user_limit) = per_user_limit { if user_repo_count >= per_user_limit { break; } } if let Some(total_limit) = total_limit { if total_repo_count >= total_limit { break; } } if should_exclude_repo(&repo.clone_url, &exclude_set) { continue; } repo_urls.push(repo.clone_url); user_repo_count += 1; total_repo_count += 1; } page += 1; } progress.inc(1); } repo_urls.sort(); repo_urls.dedup(); progress.finish_and_clear(); Ok(repo_urls) } fn warn_on_rate_limit(service: &str, status: StatusCode, action: &str) { if status == StatusCode::FORBIDDEN || status == StatusCode::TOO_MANY_REQUESTS { warn!("{service} API rate limit or access restriction while {action}: HTTP {status}"); } } fn determine_contributor_repo_limits( repo_clone_limit: Option, user_count: usize, service: &str, ) -> (Option, Option) { let Some(limit) = repo_clone_limit else { return (None, None); }; if user_count == 0 { return (Some(0), Some(limit)); } if user_count > limit { let per_user_limit = std::cmp::max(1, limit / 100); info!( "Found {user_count} {service} contributors which exceeds repo-clone-limit {limit}. \ Consider increasing repo-clone-limit; sampling {per_user_limit} repos per user until the limit is reached." ); return (Some(per_user_limit), Some(limit)); } let per_user_limit = std::cmp::max(1, limit / user_count); (Some(per_user_limit), Some(limit)) } fn build_contributor_progress_bar( progress_enabled: bool, length: u64, message: &str, ) -> ProgressBar { if progress_enabled { let style = ProgressStyle::with_template("{spinner} {msg} {pos}/{len} [{elapsed_precise}]") .expect("progress bar style template should compile"); let pb = ProgressBar::new(length).with_style(style).with_message(message.to_string()); pb.enable_steady_tick(Duration::from_millis(500)); pb } else { ProgressBar::hidden() } } pub async fn enumerate_repo_urls( repo_specifiers: &RepoSpecifiers, github_url: url::Url, ignore_certs: bool, mut progress: Option<&mut ProgressBar>, ) -> Result> { let client = create_github_client(&github_url, ignore_certs)?; let mut repo_urls = Vec::new(); let exclude_set = build_exclude_matcher(&repo_specifiers.exclude_repos); let user_repo_type: ReposListUserType = repo_specifiers.repo_filter.clone().into(); let org_repo_type: ReposListOrgType = repo_specifiers.repo_filter.clone().into(); for username in &repo_specifiers.user { let repos = client .repos() .list_all_for_user( username, user_repo_type.clone(), ReposListOrgSort::Created, Order::Desc, ) .await?; repo_urls.extend(repos.body.into_iter().filter_map(|repo| { let clone_url = repo.clone_url; if should_exclude_repo(&clone_url, &exclude_set) { None } else { Some(clone_url) } })); if let Some(progress) = progress.as_mut() { progress.inc(1); } } let orgs = if repo_specifiers.all_organizations { let mut all_orgs = Vec::new(); let org_list = client.orgs().list_all(100).await?; all_orgs.extend(org_list.body.into_iter().map(|org| org.login)); all_orgs } else { repo_specifiers.organization.clone() }; for org_name in orgs { let repos = client .repos() .list_all_for_org( &org_name, org_repo_type.clone(), ReposListOrgSort::Created, Order::Desc, ) .await?; repo_urls.extend(repos.body.into_iter().filter_map(|repo| { let clone_url = repo.clone_url; if should_exclude_repo(&clone_url, &exclude_set) { None } else { Some(clone_url) } })); if let Some(progress) = progress.as_mut() { progress.inc(1); } } repo_urls.sort(); repo_urls.dedup(); Ok(repo_urls) } pub async fn list_repositories( api_url: Url, ignore_certs: bool, progress_enabled: bool, users: &[String], orgs: &[String], all_orgs: bool, exclude_repos: &[String], repo_filter: RepoType, ) -> Result<()> { let repo_specifiers = RepoSpecifiers { user: users.to_vec(), organization: orgs.to_vec(), all_organizations: all_orgs, repo_filter, exclude_repos: exclude_repos.to_vec(), }; // Create a progress bar just for displaying status // let mut progress = ProgressBar::new_spinner("Fetching repositories...", // true,); let mut progress = if progress_enabled { let style = ProgressStyle::with_template("{spinner} {msg} [{elapsed_precise}]") .expect("progress bar style template should compile"); let pb = ProgressBar::new_spinner().with_style(style).with_message("Fetching repositories"); pb.enable_steady_tick(Duration::from_millis(500)); pb } else { ProgressBar::hidden() }; let repo_urls = enumerate_repo_urls(&repo_specifiers, api_url, ignore_certs, Some(&mut progress)).await?; // Print repositories for url in repo_urls { println!("{}", url); } Ok(()) } fn parse_repo(repo_url: &GitUrl) -> Option<(String, String, String)> { let url = Url::parse(repo_url.as_str()).ok()?; let host = url.host_str()?.to_string(); let mut segments = url.path_segments()?; let owner = segments.next()?.to_string(); let mut repo = segments.next()?.to_string(); if let Some(stripped) = repo.strip_suffix(".git") { repo = stripped.to_string(); } Some((host, owner, repo)) } pub fn wiki_url(repo_url: &GitUrl) -> Option { let (host, owner, repo) = parse_repo(repo_url)?; let wiki = format!("https://{host}/{owner}/{repo}.wiki.git"); GitUrl::from_str(&wiki).ok() } pub async fn fetch_repo_items( repo_url: &GitUrl, ignore_certs: bool, output_root: &Path, datastore: &Arc>, ) -> Result> { let (_, owner, repo) = parse_repo(repo_url).context("invalid GitHub repo URL")?; let client = reqwest::Client::builder().danger_accept_invalid_certs(ignore_certs).build()?; let mut dirs = Vec::new(); // Issues let issues_dir = output_root.join("github_issues").join(&owner).join(&repo); fs::create_dir_all(&issues_dir)?; let mut page = 1; loop { let url = format!( "https://api.github.com/repos/{owner}/{repo}/issues?state=all&per_page=100&page={page}" ); let mut req = client.get(&url).header("User-Agent", GLOBAL_USER_AGENT.as_str()); if let Ok(token) = env::var("KF_GITHUB_TOKEN") { if !token.is_empty() { req = req.bearer_auth(token); } } let resp = req.send().await?; if !resp.status().is_success() { break; } let issues: Vec = resp.json().await?; if issues.is_empty() { break; } for issue in issues { let number = issue.get("number").and_then(|v| v.as_u64()).unwrap_or(0); let title = issue.get("title").and_then(|v| v.as_str()).unwrap_or(""); let body = issue.get("body").and_then(|v| v.as_str()).unwrap_or(""); let content = format!("# {title}\n\n{body}"); let file_path = issues_dir.join(format!("issue_{number}.md")); fs::write(&file_path, content)?; let url = format!("https://github.com/{owner}/{repo}/issues/{number}"); let mut ds = datastore.lock().unwrap(); ds.register_repo_link(file_path, url); } page += 1; } if issues_dir.read_dir().ok().and_then(|mut d| d.next()).is_some() { dirs.push(issues_dir); } // Gists let gists_dir = output_root.join("github_gists").join(&owner); fs::create_dir_all(&gists_dir)?; let mut seen = HashSet::new(); // Public gists for the owner page = 1; loop { let url = format!("https://api.github.com/users/{owner}/gists?per_page=100&page={page}"); let mut req = client.get(&url).header("User-Agent", GLOBAL_USER_AGENT.as_str()); if let Ok(token) = env::var("KF_GITHUB_TOKEN") { if !token.is_empty() { req = req.bearer_auth(&token); } } let resp = req.send().await?; if !resp.status().is_success() { break; } let gists: Vec = resp.json().await?; if gists.is_empty() { break; } for gist in gists { if let Some(id) = gist.get("id").and_then(|v| v.as_str()) { if seen.insert(id.to_string()) { let mut req_g = client .get(&format!("https://api.github.com/gists/{id}")) .header("User-Agent", GLOBAL_USER_AGENT.as_str()); if let Ok(token) = env::var("KF_GITHUB_TOKEN") { if !token.is_empty() { req_g = req_g.bearer_auth(&token); } } let detail: Value = req_g.send().await?.json().await?; if let Some(files) = detail.get("files").and_then(|v| v.as_object()) { let gist_dir = gists_dir.join(id); fs::create_dir_all(&gist_dir)?; for (fname, fobj) in files { if let Some(content) = fobj.get("content").and_then(|v| v.as_str()) { let file_path = gist_dir.join(fname); fs::write(&file_path, content)?; let url = format!("https://gist.github.com/{id}"); let mut ds = datastore.lock().unwrap(); ds.register_repo_link(file_path, url); } } } } } } page += 1; } // Private gists for authenticated user if they own the repo if let Ok(token) = env::var("KF_GITHUB_TOKEN") { if !token.is_empty() { page = 1; loop { let url = format!("https://api.github.com/gists?per_page=100&page={page}"); let resp = client .get(&url) .header("User-Agent", GLOBAL_USER_AGENT.as_str()) .bearer_auth(&token) .send() .await?; if !resp.status().is_success() { break; } let gists: Vec = resp.json().await?; if gists.is_empty() { break; } for gist in gists { let owner_login = gist.get("owner").and_then(|o| o.get("login")).and_then(|v| v.as_str()); if owner_login == Some(owner.as_str()) { if let Some(id) = gist.get("id").and_then(|v| v.as_str()) { if seen.insert(id.to_string()) { let detail: Value = client .get(&format!("https://api.github.com/gists/{id}")) .header("User-Agent", GLOBAL_USER_AGENT.as_str()) .bearer_auth(&token) .send() .await? .json() .await?; if let Some(files) = detail.get("files").and_then(|v| v.as_object()) { let gist_dir = gists_dir.join(id); fs::create_dir_all(&gist_dir)?; for (fname, fobj) in files { if let Some(content) = fobj.get("content").and_then(|v| v.as_str()) { let file_path = gist_dir.join(fname); fs::write(&file_path, content)?; let url = format!("https://gist.github.com/{id}"); let mut ds = datastore.lock().unwrap(); ds.register_repo_link(file_path, url); } } } } } } } page += 1; } } } if gists_dir.read_dir().ok().and_then(|mut d| d.next()).is_some() { dirs.push(gists_dir); } Ok(dirs) } #[cfg(test)] mod tests { use super::*; #[test] fn parse_excluded_repo_variants() { assert_eq!(parse_excluded_repo("Owner/Repo").as_deref(), Some("owner/repo")); assert_eq!(parse_excluded_repo("owner/repo.git").as_deref(), Some("owner/repo")); assert_eq!( parse_excluded_repo("https://github.com/Owner/Repo.git").as_deref(), Some("owner/repo") ); assert_eq!( parse_excluded_repo("git@github.com:Owner/Repo.git").as_deref(), Some("owner/repo") ); assert_eq!( parse_excluded_repo("ssh://git@github.example.com/Owner/Repo.git").as_deref(), Some("owner/repo") ); assert_eq!( parse_excluded_repo(" https://github.com/Owner/Repo ").as_deref(), Some("owner/repo") ); assert_eq!(parse_excluded_repo("not-a-repo"), None); } #[test] fn should_exclude_repo_matches_normalized_names() { let excludes = build_exclude_matcher(&vec!["Owner/Repo".to_string()]); assert!(should_exclude_repo("https://github.com/owner/repo.git", &excludes)); assert!(!should_exclude_repo("https://github.com/owner/other.git", &excludes)); } #[test] fn should_exclude_repo_matches_ssh_urls() { let excludes = build_exclude_matcher(&vec!["owner/repo".to_string()]); assert!(should_exclude_repo("ssh://git@github.example.com/owner/repo.git", &excludes)); } #[test] fn should_exclude_repo_matches_globs() { let excludes = build_exclude_matcher(&vec!["owner/*-archive".to_string()]); assert!(should_exclude_repo("https://github.com/owner/project-archive.git", &excludes)); assert!(!should_exclude_repo("https://github.com/owner/project.git", &excludes)); } }