diff --git a/Cargo.lock b/Cargo.lock index 23d1ca2..171a4c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -97,7 +97,7 @@ checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" [[package]] name = "archive-pdf-urls" -version = "0.3.1" +version = "0.4.0" dependencies = [ "clap", "env_logger", @@ -1773,7 +1773,7 @@ dependencies = [ [[package]] name = "waybackmachine-client" -version = "0.3.1" +version = "0.4.0" dependencies = [ "chrono", "mockito", diff --git a/Cargo.toml b/Cargo.toml index c1487f6..37cfb71 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "archive-pdf-urls" -version = "0.3.1" +version = "0.4.0" authors = ["Javier Arias "] edition = "2021" license = "Apache-2.0" @@ -18,4 +18,4 @@ log = "0.4.21" lopdf = "0.32.0" regex = "1.10.4" tokio = { version = "1.36.0", features = ["full"] } -waybackmachine-client = { version = "=0.3.1", path = "waybackmachine-client"} +waybackmachine-client = { version = "=0.4.0", path = "waybackmachine-client"} diff --git a/waybackmachine-client/Cargo.toml b/waybackmachine-client/Cargo.toml index 3f5f153..6072d49 100644 --- a/waybackmachine-client/Cargo.toml +++ b/waybackmachine-client/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "waybackmachine-client" -version = "0.3.1" +version = "0.4.0" authors = ["Javier Arias "] edition = "2021" license = "Apache-2.0" diff --git a/waybackmachine-client/src/archivableurl.rs b/waybackmachine-client/src/archivableurl.rs index ae33cbc..33ec1a9 100644 --- a/waybackmachine-client/src/archivableurl.rs +++ b/waybackmachine-client/src/archivableurl.rs @@ -7,6 +7,9 @@ pub struct ArchivableUrl { pub url: Url, } +/// List of domains that block wayback requests +const EXCLUDED_DOMAINS: &[&str] = &["archive.org", "jstor.org", "diw.de"]; + impl ArchivableUrl { /// Parses and validates the URL for archiving pub fn parse(url: &str) -> Result { @@ -24,14 +27,16 @@ impl ArchivableUrl { // Check if the host is excluded match host { - Host::Domain(domain) if domain.contains("localhost") => { - return Err(Error::InvalidUrl(self.url.to_string())); - } - Host::Domain(domain) if domain.contains("archive.org") => { - return Err(Error::ExcludedUrl(self.url.to_string())); - } - Host::Domain(domain) if domain.contains("jstor.org") => { - return Err(Error::ExcludedUrl(self.url.to_string())); + Host::Domain(domain) => { + if domain.contains("localhost") { + return Err(Error::InvalidUrl(self.url.to_string())); + } + + for &pattern in EXCLUDED_DOMAINS { + if domain.contains(pattern) { + return Err(Error::ExcludedUrl(self.url.to_string())); + } + } } Host::Ipv4(ipv4) if ipv4.is_loopback() @@ -153,4 +158,14 @@ mod tests { assert!(result.is_err()); assert_eq!(result.err(), Some(Error::ExcludedUrl(url.to_string()))); } + + #[test] + fn excluded_domains() { + for &domain in EXCLUDED_DOMAINS { + let url = format!("https://{}/some-path", domain); + let result = ArchivableUrl::parse(&url); + assert!(result.is_err()); + assert_eq!(result.err(), Some(Error::ExcludedUrl(url))); + } + } }