kingfisher/src/validation/httpvalidation.rs
2026-01-14 21:45:55 -08:00

608 lines
21 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use std::{collections::BTreeMap, future::Future, str::FromStr, time::Duration};
use crate::validation::GLOBAL_USER_AGENT;
use anyhow::{anyhow, Error, Result};
use http::StatusCode;
use liquid::Object;
use quick_xml::de::from_str as xml_from_str;
use reqwest::{
header,
header::{HeaderMap, HeaderName, HeaderValue},
Client, Method, RequestBuilder, Response, Url,
};
use serde::de::IgnoredAny;
use sha1::{Digest, Sha1};
use tokio::time::sleep;
use tracing::debug;
use crate::rules::rule::ResponseMatcher;
/// Build a deterministic cache key from the immutable parts of an HTTP request.
///
/// * `method` case-insensitive HTTP verb (“GET”, “POST”…)
/// * `url` fully-qualified URL (any query string should already be present)
/// * `headers` *logical* headers you intend to send (template-rendered, lower-level additions
/// such as `User-Agent` may be appended by the caller)
///
/// The parts are concatenated with `\0` separators before hashing to avoid accidental
/// collisions such as `"GET/foo"` vs `"GE" + "T/foo"`.
pub fn generate_http_cache_key_parts(
method: &str,
url: &Url,
headers: &BTreeMap<String, String>,
) -> String {
let method = method.to_uppercase(); // ensure "get" == "GET"
let url = url.as_str(); // canonical form from `reqwest::Url`
let mut hasher = Sha1::new();
hasher.update(method.as_bytes());
hasher.update(b"\0");
hasher.update(url.as_bytes());
hasher.update(b"\0");
// Collect headers sorted lexicographically (BTreeMap is already sorted),
// then hash as `key:value\0`
for (k, v) in headers {
hasher.update(k.as_bytes());
hasher.update(b":");
hasher.update(v.as_bytes());
hasher.update(b"\0");
}
// Hex-encode and prefix so callers can tell this key came from HTTP logic
format!("HTTP:{:x}", hasher.finalize())
}
/// Parse an HTTP method from a string.
pub fn parse_http_method(method_str: &str) -> Result<Method, String> {
Method::from_str(method_str).map_err(|_| format!("Invalid HTTP method: {}", method_str))
}
/// Build a reqwest RequestBuilder using the provided parameters.
pub fn build_request_builder(
client: &Client,
method_str: &str,
url: &Url,
headers: &BTreeMap<String, String>,
body: &Option<String>,
timeout: Duration,
parser: &liquid::Parser,
globals: &liquid::Object,
) -> Result<RequestBuilder, String> {
let method = parse_http_method(method_str).map_err(|err_msg| {
debug!("{}", err_msg);
err_msg
})?;
let mut request_builder = client.request(method, url.clone()).timeout(timeout);
let custom_headers = process_headers(headers, parser, globals, url)
.map_err(|e| format!("Error processing headers: {}", e))?;
// Prepare a standard set of headers.
let user_agent = GLOBAL_USER_AGENT.as_str();
let standard_headers = [
(header::USER_AGENT, user_agent),
(
header::ACCEPT,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
),
(header::ACCEPT_LANGUAGE, "en-US,en;q=0.5"),
(header::ACCEPT_ENCODING, "gzip, deflate, br"),
(header::CONNECTION, "keep-alive"),
];
// Start with the standard headers and then overlay any custom headers so
// caller-specified values take precedence over defaults.
let mut combined_headers = HeaderMap::new();
for (name, value) in &standard_headers {
if let Ok(hv) = HeaderValue::from_str(value) {
combined_headers.insert(name.clone(), hv);
}
}
for (name, value) in custom_headers.iter() {
combined_headers.insert(name.clone(), value.clone());
}
request_builder = request_builder.headers(combined_headers);
// If a body template is provided, parse and render it
if let Some(body_template) = body {
let template = parser
.parse(body_template)
.map_err(|e| format!("Error parsing body template: {}", e))?;
let rendered_body = template
.render(globals)
.map_err(|e| format!("Error rendering body template: {}", e))?;
request_builder = request_builder.body(rendered_body);
}
Ok(request_builder)
}
/// Process headers from a BTreeMap, rendering any Liquid templates.
pub fn process_headers(
headers: &BTreeMap<String, String>,
parser: &liquid::Parser,
globals: &Object,
url: &Url,
) -> Result<HeaderMap> {
let mut headers_map = HeaderMap::new();
for (key, value) in headers {
// Render the template
let template = match parser.parse(value) {
Ok(t) => t,
Err(e) => {
debug!("Error parsing Liquid template for '{}': {}", key, e);
continue;
}
};
let header_value = match template.render(globals) {
Ok(s) => s,
Err(e) => {
debug!(
"Failed to render header template. URL = <{}> | Key '{}': {}",
url.as_str(),
key,
e
);
continue;
}
};
// Clean key and value
let cleaned_key = key.trim().replace(&['\n', '\r'][..], "");
let cleaned_value = header_value.trim().replace(&['\n', '\r'][..], "");
// Validate header name
let name = match HeaderName::from_str(&cleaned_key) {
Ok(n) => n,
Err(e) => {
debug!(
"Invalid header name. URL = <{}> | Key '{}': {}",
url.as_str(),
cleaned_key,
e
);
continue;
}
};
// Validate header value
let value = match HeaderValue::from_str(&cleaned_value) {
Ok(v) => v,
Err(e) => {
debug!(
"Invalid header value. URL = <{}> | Value '{}': {}",
url.as_str(),
cleaned_value,
e
);
continue;
}
};
headers_map.insert(name, value);
}
Ok(headers_map)
}
/// Exponentialbackoff retry helper that always returns `Result<T, anyhow::Error>`
async fn retry_with_backoff<F, Fut, T>(
mut operation: F,
is_retryable: impl Fn(&Result<T, Error>, usize) -> bool,
max_retries: usize,
backoff_min: Duration,
backoff_max: Duration,
) -> Result<T, Error>
where
F: FnMut() -> Fut,
Fut: Future<Output = Result<T, Error>>,
{
let mut retries = 0;
while retries <= max_retries {
let result = operation().await;
// If this result is *not* retryable, return it directly (Ok or Err).
if !is_retryable(&result, retries) {
return result;
}
retries += 1;
if retries > max_retries {
break;
}
let backoff = backoff_min.saturating_mul(2u32.pow(retries as u32)).min(backoff_max);
sleep(backoff).await;
}
Err(anyhow!("Max retries reached"))
}
pub async fn retry_multipart_request<F, Fut>(
mut build_request: F,
max_retries: usize,
backoff_min: Duration,
backoff_max: Duration,
) -> Result<Response, Error>
where
F: FnMut() -> Fut,
Fut: Future<Output = RequestBuilder>,
{
retry_with_backoff(
// 1) operation: build + send
move || {
let fut = build_request();
async move {
let rb = fut.await;
rb.send().await.map_err(Error::from)
}
},
// 2) same retry logic
|res: &Result<_, Error>, _attempt| match res {
Ok(resp)
if matches!(
resp.status(),
StatusCode::BAD_GATEWAY
| StatusCode::SERVICE_UNAVAILABLE
| StatusCode::GATEWAY_TIMEOUT
) =>
{
true
}
Err(_) => true,
_ => false,
},
max_retries,
backoff_min,
backoff_max,
)
.await
}
pub async fn retry_request(
request_builder: RequestBuilder,
max_retries: u32,
backoff_min: Duration,
backoff_max: Duration,
) -> Result<Response, Error> {
retry_with_backoff(
// 1) operation: clone + send, yielding Result<Response, Error>
move || {
let rb =
request_builder.try_clone().expect("retry_request: failed to clone RequestBuilder");
async move { rb.send().await.map_err(Error::from) }
},
// 2) is_retryable: transient HTTP status or network error
|res: &Result<_, Error>, _attempt| match res {
Ok(resp)
if matches!(
resp.status(),
StatusCode::BAD_GATEWAY
| StatusCode::SERVICE_UNAVAILABLE
| StatusCode::GATEWAY_TIMEOUT
) =>
{
true
}
Err(_) => true,
_ => false,
},
max_retries as usize,
backoff_min,
backoff_max,
)
.await
}
/// Return `true` when the body is very likely HTML.
fn body_looks_like_html(body: &str, headers: &HeaderMap) -> bool {
// ---- 1. header heuristic ---------------------------------------------
let header_says_html = headers
.get("content-type")
.and_then(|v| v.to_str().ok())
.map(|ct| {
let ct = ct.to_ascii_lowercase();
ct.contains("text/html") || ct.contains("application/xhtml")
})
.unwrap_or(false);
// ---- 2. early-body scan (<=1024 bytes) --------------------------------
// Find the last character boundary at or before 1024 bytes to avoid UTF-8 boundary issues
// Walk backward at most 3 bytes (UTF-8 max char size is 4 bytes) to find valid boundary
let mut end = 1024.min(body.len());
while end > 0 && !body.is_char_boundary(end) {
end -= 1;
}
let probe = &body[..end];
// Trim any leading whitespace so we still catch HTML that starts after newlines/indentation.
let trimmed = probe.trim_start_matches(|c: char| c.is_whitespace());
let probe = trimmed.to_ascii_lowercase();
let body_looks_htmlish = probe.starts_with('<') && probe.contains("<html");
// ⇒ Only HTML if **both** header and body agree
header_says_html && body_looks_htmlish
}
/// Validate the response by checking word and status matchers.
pub fn validate_response(
matchers: &[ResponseMatcher],
body: &str,
status: &StatusCode,
headers: &HeaderMap,
html_allowed: bool,
) -> bool {
// Since match_all_types is always true here, we simply require all word and status conditions
// to hold.
let word_ok = matchers
.iter()
.filter_map(|m| {
if let ResponseMatcher::WordMatch { words, match_all_words, negative, .. } = m {
let raw = if *match_all_words {
words.iter().all(|w| body.contains(w))
} else {
words.iter().any(|w| body.contains(w))
};
Some(if *negative { !raw } else { raw })
} else {
None
}
})
.all(|b| b);
let status_ok = matchers
.iter()
.filter_map(|m| {
if let ResponseMatcher::StatusMatch {
status: expected,
match_all_status,
negative,
..
} = m
{
let raw = if *match_all_status {
expected.iter().all(|s| s.to_string() == status.as_str())
} else {
expected.iter().any(|s| s.to_string() == status.as_str())
};
Some(if *negative { !raw } else { raw })
} else {
None
}
})
.all(|b| b);
// ── Header checks ──────────────────────────────────────────
let header_ok = matchers
.iter()
.filter_map(|m| {
if let ResponseMatcher::HeaderMatch { header, expected, match_all_values, .. } = m {
// header names are case-insensitive
let val = headers
.get(header)
.and_then(|v| v.to_str().ok())
.unwrap_or_default()
.to_ascii_lowercase();
Some(if *match_all_values {
expected.iter().all(|e| val.contains(&e.to_ascii_lowercase()))
} else {
expected.iter().any(|e| val.contains(&e.to_ascii_lowercase()))
})
} else {
None
}
})
.all(|b| b);
// ----- JsonValid ----------------------------------------------------------
let json_ok = matchers
.iter()
.filter_map(|m| {
if matches!(m, ResponseMatcher::JsonValid { .. }) {
Some(serde_json::from_str::<serde_json::Value>(body).is_ok())
} else {
None
}
})
.all(|b| b);
let xml_ok = matchers
.iter()
.filter_map(|m| {
if matches!(m, ResponseMatcher::XmlValid { .. }) {
// succeeds if `body` is well-formed XML
Some(xml_from_str::<IgnoredAny>(body).is_ok())
} else {
None
}
})
.all(|b| b);
let html_detected = body_looks_like_html(body, headers);
let html_ok = html_allowed || !html_detected;
// // ── debug line ─-
// debug!(
// "validate_response -- word:{}, status:{}, header:{}, json:{}, xml:{} ⇒ {}",
// word_ok, status_ok, header_ok, json_ok, xml_ok, all_ok
// );
// // ──────────────────────────────────────────────────────────────
let all_ok = word_ok && status_ok && header_ok && json_ok && xml_ok && html_ok;
all_ok
}
#[cfg(test)]
mod tests {
use std::sync::Once;
use wiremock::{
matchers::{method, path},
Mock, MockServer, ResponseTemplate,
};
use super::*;
static INIT: Once = Once::new();
fn init() {
INIT.call_once(|| {
let _ = tracing_subscriber::fmt::try_init();
});
}
#[test]
fn test_build_request_builder() {
init();
let client = Client::builder()
.gzip(true) // enable gzip
.deflate(true) // enable deflate
.brotli(true) // enable brotli
.build()
.expect("building reqwest client");
let parser = liquid::ParserBuilder::with_stdlib().build().unwrap();
let globals = liquid::Object::new();
let headers = BTreeMap::from([
("Content-Type".to_string(), "application/json".to_string()),
("Accept".to_string(), "application/custom".to_string()),
]);
let url = Url::from_str("https://example.com").unwrap();
let result = build_request_builder(
&client,
"GET",
&url,
&headers,
&None,
Duration::from_secs(10),
&parser,
&globals,
)
.expect("building request");
let req = result.build().expect("finalizing request");
assert_eq!(
req.headers().get(header::ACCEPT).and_then(|v| v.to_str().ok()),
Some("application/custom"),
);
}
#[tokio::test]
async fn test_retry_request() {
init();
let mock_server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/test"))
.respond_with(ResponseTemplate::new(200))
.mount(&mock_server)
.await;
let client = Client::builder()
.gzip(true) // enable gzip
.deflate(true) // enable deflate
.brotli(true) // enable brotli
.build()
.expect("building reqwest client");
let request_builder = client.get(&format!("{}/test", mock_server.uri()));
let response = retry_request(
request_builder,
3,
Duration::from_millis(50),
Duration::from_millis(200),
)
.await;
assert!(response.is_ok());
}
#[test]
fn test_validate_response() {
// --- arrange ----------------------------------------------------------
let matchers = vec![ResponseMatcher::WordMatch {
r#type: "word-match".to_string(),
words: vec!["test".to_string()],
match_all_words: true,
negative: false,
}];
let status = StatusCode::OK;
let body = "This is a test";
let headers = HeaderMap::new(); // empty header map
let html_allowed = false;
// --- act --------------------------------------------------------------
let result = validate_response(&matchers, body, &status, &headers, html_allowed);
// --- assert -----------------------------------------------------------
assert!(result);
}
#[test]
fn test_validate_response_slack_webhook() {
// Build matchers equivalent to rule kingfisher.slack.4
let matchers = vec![
ResponseMatcher::WordMatch {
r#type: "word-match".to_string(),
words: vec!["invalid_payload".to_string()],
match_all_words: false, // rule omits this → default is false
negative: false,
},
ResponseMatcher::WordMatch {
r#type: "word-match".to_string(),
words: vec!["invalid_token".to_string()],
match_all_words: false,
negative: true, // body must *not* contain “invalid_token”
},
];
// Simulate the real Slack response you posted
let body = "invalid_payload";
let status = StatusCode::BAD_REQUEST; // 400
let mut headers = HeaderMap::new();
headers.insert(header::CONTENT_TYPE, HeaderValue::from_static("text/plain"));
// Call validate_response with html_allowed = false
let ok = validate_response(&matchers, body, &status, &headers, false);
// 4It *should* be valid (true) because all matcher conditions hold
assert!(ok, "Slack webhook response should be considered ACTIVE");
}
#[test]
fn test_body_looks_like_html_trims_whitespace() {
let mut headers = HeaderMap::new();
headers.insert(header::CONTENT_TYPE, HeaderValue::from_static("text/html; charset=utf-8"));
let body = "\n\n \n<!DOCTYPE html>\n<html lang=\"en\"><body>page</body></html>";
assert!(body_looks_like_html(body, &headers));
}
#[test]
fn test_html_response_rejected_when_not_allowed() {
let matchers = vec![ResponseMatcher::StatusMatch {
r#type: "status-match".to_string(),
status: vec![StatusCode::OK.into()],
match_all_status: false,
negative: false,
}];
let mut headers = HeaderMap::new();
headers.insert(header::CONTENT_TYPE, HeaderValue::from_static("text/html; charset=utf-8"));
let body = "\n<html><body>Sign in</body></html>";
let ok = validate_response(&matchers, body, &StatusCode::OK, &headers, false);
assert!(!ok, "HTML responses should be rejected unless explicitly allowed");
}
#[test]
fn test_body_looks_like_html_utf8_boundary() {
// Test case for UTF-8 boundary issue: multi-byte character at 1024-byte boundary
// This reproduces the bug where slicing at byte 1024 would panic if it's in the middle
// of a multi-byte character (e.g., Chinese character '业' spans bytes 1023..1026)
let mut headers = HeaderMap::new();
headers.insert(header::CONTENT_TYPE, HeaderValue::from_static("text/html; charset=utf-8"));
// HTML at the start, with padding to push a multi-byte char to byte 1024
// This mirrors the real crash: HTML response from Gitee with Chinese chars
let html_start = "<!DOCTYPE html><html lang=\"zh-CN\"><head><title>";
let padding_len = 1023 - html_start.len();
let body = format!(
"{}{}业</title></head><body>Gitee</body></html>",
html_start,
"x".repeat(padding_len)
);
// Verify our test setup: multi-byte char should be at byte 1023
assert_eq!(body.as_bytes()[1023], 0xE4, "Expected first byte of '业' at position 1023");
// This should not panic AND should correctly identify HTML
let result = body_looks_like_html(&body, &headers);
assert!(
result,
"Should correctly identify HTML even with multi-byte characters at boundary"
);
}
}