kingfisher/src/validation/utils.rs
2025-11-10 21:32:52 -08:00

249 lines
9.2 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use reqwest::Url;
use tokio::net::lookup_host;
use crate::validation::SerializableCaptures;
/// Return (NAME, value, start, end) for the captures we care about.
///
/// * Named captures keep their (upper-cased) name
/// * Among unnamed captures, keep **only the first one** and call it "TOKEN"
pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String, usize, usize)> {
let mut saw_unnamed = false;
captures
.captures
.iter()
.filter_map(|cap| {
if let Some(name) = &cap.name {
Some((name.to_uppercase(), cap.value.to_string(), cap.start, cap.end))
} else if !saw_unnamed {
saw_unnamed = true;
Some(("TOKEN".to_string(), cap.value.to_string(), cap.start, cap.end))
} else {
// Ignore any additional unnamed captures (e.g., from unintended groups)
None
}
})
.collect()
}
pub fn find_closest_variable(
captures: &[(String, String, usize, usize)],
target_value: &String,
target_variable_name: &str,
search_variable_name: &str,
) -> Option<String> {
// Collect the positions of the target variable for the provided value so we can
// compare relative offsets with candidate variables.
let mut target_positions = Vec::new();
for (name, value, start, end) in captures {
if name == target_variable_name && value == target_value {
target_positions.push((*start, *end));
}
}
if target_positions.is_empty() {
return None;
}
// Prefer candidates that appear before the target value (same logical block), but
// fall back to overlapping values and then to those that appear after the target
// value when no better match exists. This avoids pairing with the next block when
// multiple credentials are close together in the same file.
let mut best_before: Option<(usize, String)> = None;
let mut best_overlap: Option<(usize, String)> = None;
let mut best_after: Option<(usize, String)> = None;
for (target_start, target_end) in target_positions.iter().copied() {
for (name, value, start, end) in captures {
if name != search_variable_name {
continue;
}
if *end <= target_start {
// Candidate is before the target; choose the one closest to the target start.
let distance = target_start - *end;
match &mut best_before {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_before = Some((distance, value.clone()));
}
_ => {}
}
} else if *start >= target_end {
// Candidate is after the target; choose the one closest to the target end.
let distance = *start - target_end;
match &mut best_after {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_after = Some((distance, value.clone()));
}
_ => {}
}
} else {
// Candidate overlaps the target treat as an exact match.
let distance = 0usize;
match &mut best_overlap {
Some((best_distance, best_value)) if distance < *best_distance => {
*best_distance = distance;
*best_value = value.clone();
}
None => {
best_overlap = Some((distance, value.clone()));
}
_ => {}
}
}
}
}
best_before.or(best_overlap).or(best_after).map(|(_, value)| value)
}
pub async fn check_url_resolvable(url: &Url) -> Result<(), Box<dyn std::error::Error>> {
let host = url.host_str().ok_or("No host in URL")?;
let port = url.port().unwrap_or(if url.scheme() == "https" { 443 } else { 80 });
let addr = format!("{}:{}", host, port);
lookup_host(addr).await?.next().ok_or_else(|| "Failed to resolve URL".into()).map(|_| ())
}
// -----------------------------------------------------------------------------
// tests
// -----------------------------------------------------------------------------
//
#[cfg(test)]
mod tests {
use super::*;
use crate::matcher::{SerializableCapture, SerializableCaptures};
use pretty_assertions::assert_eq;
use smallvec::smallvec;
#[test]
fn single_unnamed_capture_is_returned() {
let captures = SerializableCaptures {
captures: smallvec![SerializableCapture {
name: None,
match_number: 0, // This test is for a rule with *no* explicit captures
start: 1,
end: 4,
value: "abc",
}],
};
let result = process_captures(&captures);
assert_eq!(result, vec![("TOKEN".to_string(), "abc".to_string(), 1usize, 4usize)]);
}
#[test]
fn includes_whole_match_when_multiple() {
let captures = SerializableCaptures {
captures: smallvec![
// --- FIX ---
// This test simulated a regex like `(abc)de(?P<foo>bcd)`.
// With our fix, group 0 ("abcde") is NOT serialized.
// We only get the explicit captures (group 1 and "foo").
SerializableCapture {
// This is group 1 (unnamed)
name: None,
match_number: 1, // Corrected match_number
start: 1,
end: 4,
value: "bcd",
},
SerializableCapture {
// This is group 2 (named "foo")
name: Some("foo".to_string()),
match_number: 2, // Corrected match_number
start: 1,
end: 4,
value: "bcd",
},
],
};
let result = process_captures(&captures);
// --- FIX ---
// The expected result now only contains the explicit captures.
// The first unnamed capture ("bcd") becomes "TOKEN".
assert_eq!(
result,
vec![
("TOKEN".to_string(), "bcd".to_string(), 1usize, 4usize),
("FOO".to_string(), "bcd".to_string(), 1usize, 4usize),
]
);
// --- END FIX ---
}
#[test]
fn includes_whole_match_and_unnamed_groups() {
let captures = SerializableCaptures {
captures: smallvec![
// --- FIX ---
// This test simulated a regex like `(?P<foo>aa)bb(cc)`.
// With our fix, group 0 ("aabbcc") is NOT serialized.
// We only get the explicit captures ("foo" and group 2).
SerializableCapture {
// This is group 1 (named "foo")
name: Some("foo".to_string()),
match_number: 1, // Corrected match_number
start: 0,
end: 2,
value: "aa",
},
SerializableCapture {
// This is group 2 (unnamed)
name: None,
match_number: 2, // Corrected match_number
start: 4,
end: 6,
value: "cc"
},
],
};
let result = process_captures(&captures);
// --- FIX ---
// The expected result no longer contains the full match ("aabbcc").
// The first (and only) unnamed capture ("cc") is now correctly labeled "TOKEN".
assert_eq!(
result,
vec![
("FOO".to_string(), "aa".to_string(), 0usize, 2usize), // From named group 1
("TOKEN".to_string(), "cc".to_string(), 4usize, 6usize), // From unnamed group 2
]
);
// --- END FIX ---
}
#[test]
fn prefers_closest_preceding_variable() {
let captures = vec![
("TOKEN".to_string(), "secret".to_string(), 75usize, 115usize),
("AKID".to_string(), "preceding".to_string(), 30usize, 50usize),
("AKID".to_string(), "following".to_string(), 180usize, 200usize),
];
let result =
find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap();
assert_eq!(result, "preceding".to_string());
}
#[test]
fn falls_back_to_following_when_no_preceding() {
let captures = vec![
("TOKEN".to_string(), "secret".to_string(), 10usize, 50usize),
("AKID".to_string(), "after".to_string(), 60usize, 80usize),
];
let result =
find_closest_variable(&captures, &"secret".to_string(), "TOKEN", "AKID").unwrap();
assert_eq!(result, "after".to_string());
}
}