From 3e5171631da9195bc8a555cf163f9e6e196a9a2f Mon Sep 17 00:00:00 2001 From: Jens Reimann Date: Tue, 29 Oct 2024 13:37:01 +0100 Subject: [PATCH] feat: allow ignoring missing (404) files when importing --- Cargo.lock | 20 +++---- Cargo.toml | 8 +-- common/src/lib.rs | 1 + common/src/serde.rs | 4 ++ modules/importer/src/model/csaf.rs | 4 ++ modules/importer/src/model/sbom.rs | 6 +- modules/importer/src/runner/csaf/mod.rs | 11 +++- modules/importer/src/runner/csaf/report.rs | 57 +++++++++++-------- modules/importer/src/runner/csaf/storage.rs | 11 ++-- modules/importer/src/runner/sbom/mod.rs | 11 +++- modules/importer/src/runner/sbom/report.rs | 61 +++++++++++++-------- modules/importer/src/runner/sbom/storage.rs | 14 +++-- modules/importer/src/test.rs | 1 + openapi.yaml | 4 ++ server/src/sample_data.rs | 3 + xtask/schema/generate-dump.json | 6 ++ xtask/src/dataset.rs | 2 + 17 files changed, 149 insertions(+), 75 deletions(-) create mode 100644 common/src/serde.rs diff --git a/Cargo.lock b/Cargo.lock index cd403f10b..ce8b2811b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1739,9 +1739,9 @@ dependencies = [ [[package]] name = "csaf-walker" -version = "0.9.3" +version = "0.10.0-alpha.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5af10591ad4a6b6a56f14753f0d3341ac5e12b3c1fa171c8f554b1af87ce995f" +checksum = "0cde0dba0cac57fef9d7842e815404a1b13125e47920cef4f2d79eb508959924" dependencies = [ "anyhow", "async-trait", @@ -1752,7 +1752,7 @@ dependencies = [ "csv", "digest", "filetime", - "fluent-uri 0.2.0", + "fluent-uri 0.3.2", "futures", "hickory-resolver", "html-escape", @@ -2538,9 +2538,9 @@ dependencies = [ [[package]] name = "fluent-uri" -version = "0.2.0" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d77395429e0ce700a8378be6625660a4aa00ca5dc5cd1527193ebd0946cc9b3" +checksum = "1918b65d96df47d3591bed19c5cca17e3fa5d0707318e4b5ef2eae01764df7e5" dependencies = [ "borrow-or-share", "ref-cast", @@ -6378,9 +6378,9 @@ dependencies = [ [[package]] name = "sbom-walker" -version = "0.9.3" +version = "0.10.0-alpha.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d230a1fc4bb9fea74333adbad2d466fd85927f8acec6a5234d9c4e96d838b8" +checksum = "a4bb03045c0f81c22061f69f7701521dce1f64eaf3d1e79d06daf2be3ee0cf08" dependencies = [ "anyhow", "async-trait", @@ -6390,7 +6390,7 @@ dependencies = [ "cyclonedx-bom", "digest", "filetime", - "fluent-uri 0.2.0", + "fluent-uri 0.3.2", "futures", "http 1.1.0", "humantime", @@ -9207,9 +9207,9 @@ dependencies = [ [[package]] name = "walker-common" -version = "0.9.3" +version = "0.10.0-alpha.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a874683afa5b5b913bcecbf74d4d3692f7a77849093c3645616d2d980f9989d" +checksum = "992cd6ccc9f26d93d83005b5d74365b1ef38451770924a1f03170649f93a7622" dependencies = [ "anyhow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index d3eacfb9d..06b5b6702 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ clap = "4" concat-idents = "1" cpe = "0.1.5" csaf = { version = "0.5.0", default-features = false } -csaf-walker = { version = "0.9.0", default-features = false } +csaf-walker = { version = "0.10.0-alpha.1", default-features = false } cve = "0.3.1" cyclonedx-bom = "0.7.0" env_logger = "0.11.0" @@ -108,7 +108,7 @@ ring = "0.17.8" roxmltree = "0.20.0" rstest = "0.23.0" rust-s3 = "0.35" -sbom-walker = { version = "0.9.0", default-features = false, features = ["crypto-openssl", "cyclonedx-bom", "spdx-rs"] } +sbom-walker = { version = "0.10.0-alpha.1", default-features = false, features = ["crypto-openssl", "cyclonedx-bom", "spdx-rs"] } schemars = "0.8" sea-orm = { version = "~1.0", features = ["debug-print"] } # See https://www.sea-ql.org/blog/2024-08-04-sea-orm-1.0/#release-planning sea-orm-migration = "~1.0" @@ -147,8 +147,8 @@ utoipa-redoc = { version = "5.0.0", features = ["actix-web"] } utoipa-swagger-ui = "8.0.3" uuid = "1.7.0" walkdir = "2.5" -walker-common = "0.9.3" -walker-extras = "0.9.0" +walker-common = "0.10.0-alpha.1" +walker-extras = "0.10.0-alpha.1" zip = "2.2.0" trustify-auth = { path = "common/auth", features = ["actix", "swagger"] } diff --git a/common/src/lib.rs b/common/src/lib.rs index 22bd69981..604e7519d 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -12,6 +12,7 @@ pub mod package; pub mod purl; pub mod reqwest; pub mod sbom; +pub mod serde; pub mod time; pub mod tls; pub mod uuid; diff --git a/common/src/serde.rs b/common/src/serde.rs new file mode 100644 index 000000000..adb6827d6 --- /dev/null +++ b/common/src/serde.rs @@ -0,0 +1,4 @@ +/// Check if a value is its default value. +pub fn is_default(value: &D) -> bool { + value == &Default::default() +} diff --git a/modules/importer/src/model/csaf.rs b/modules/importer/src/model/csaf.rs index 5bf098cc0..5a3085cce 100644 --- a/modules/importer/src/model/csaf.rs +++ b/modules/importer/src/model/csaf.rs @@ -1,4 +1,5 @@ use super::*; +use trustify_common::serde::is_default; #[derive( Clone, @@ -24,6 +25,9 @@ pub struct CsafImporter { #[serde(default, skip_serializing_if = "Option::is_none")] pub fetch_retries: Option, + + #[serde(default, skip_serializing_if = "is_default")] + pub ignore_missing: bool, } impl Deref for CsafImporter { diff --git a/modules/importer/src/model/sbom.rs b/modules/importer/src/model/sbom.rs index 6403c12af..c7c1963ed 100644 --- a/modules/importer/src/model/sbom.rs +++ b/modules/importer/src/model/sbom.rs @@ -1,5 +1,5 @@ use super::*; -use trustify_common::model::BinaryByteSize; +use trustify_common::{model::BinaryByteSize, serde::is_default}; #[derive( Clone, @@ -31,6 +31,9 @@ pub struct SbomImporter { #[serde(default, skip_serializing_if = "Option::is_none")] pub fetch_retries: Option, + + #[serde(default, skip_serializing_if = "is_default")] + pub ignore_missing: bool, } impl Deref for SbomImporter { @@ -78,6 +81,7 @@ mod test { only_patterns: vec![], size_limit: Some(bytesize::ByteSize::mib(1234).into()), fetch_retries: None, + ignore_missing: false, } ); diff --git a/modules/importer/src/runner/csaf/mod.rs b/modules/importer/src/runner/csaf/mod.rs index 7e8591450..addc29fde 100644 --- a/modules/importer/src/runner/csaf/mod.rs +++ b/modules/importer/src/runner/csaf/mod.rs @@ -20,6 +20,8 @@ use csaf_walker::{ walker::Walker, }; use parking_lot::Mutex; +use reqwest::StatusCode; +use std::collections::HashSet; use std::{sync::Arc, time::SystemTime}; use tracing::instrument; use trustify_module_ingestor::{graph::Graph, service::IngestorService}; @@ -46,6 +48,7 @@ impl super::ImportRunner { v3_signatures, only_patterns, fetch_retries, + ignore_missing, } = importer; let report = Arc::new(Mutex::new(ReportBuilder::new())); @@ -71,7 +74,13 @@ impl super::ImportRunner { // wrap storage with report - let storage = CsafReportVisitor(ReportVisitor::new(report.clone(), storage)); + let storage = CsafReportVisitor { + next: ReportVisitor::new(report.clone(), storage), + ignore_errors: match ignore_missing { + true => HashSet::from_iter([StatusCode::NOT_FOUND]), + false => HashSet::new(), + }, + }; // validate (called by retriever) diff --git a/modules/importer/src/runner/csaf/report.rs b/modules/importer/src/runner/csaf/report.rs index b5d9d716b..6eec80793 100644 --- a/modules/importer/src/runner/csaf/report.rs +++ b/modules/importer/src/runner/csaf/report.rs @@ -5,51 +5,62 @@ use crate::runner::{ report::{Phase, ReportVisitor}, }; use csaf_walker::{ - retrieve::RetrievalError, + source::{HttpSource, HttpSourceError}, validation::{ValidatedAdvisory, ValidatedVisitor, ValidationContext, ValidationError}, }; +use reqwest::StatusCode; +use std::collections::HashSet; use trustify_module_ingestor::service; -use walker_common::utils::url::Urlify; +use walker_common::{fetcher, retrieve::RetrievalError, utils::url::Urlify}; -pub struct CsafReportVisitor(pub ReportVisitor>); +pub struct CsafReportVisitor { + pub next: ReportVisitor>, + pub ignore_errors: HashSet, +} -impl ValidatedVisitor for CsafReportVisitor { - type Error = as ValidatedVisitor>::Error; - type Context = as ValidatedVisitor>::Context; +impl ValidatedVisitor for CsafReportVisitor { + type Error = as ValidatedVisitor>::Error; + type Context = as ValidatedVisitor>::Context; async fn visit_context( &self, context: &ValidationContext<'_>, ) -> Result { - self.0.next.visit_context(context).await + self.next.next.visit_context(context).await } async fn visit_advisory( &self, context: &Self::Context, - result: Result, + result: Result>, ) -> Result<(), Self::Error> { let file = result.url().to_string(); - self.0.report.lock().tick(); + self.next.report.lock().tick(); - let result = self.0.next.visit_advisory(context, result).await; + let result = self.next.next.visit_advisory(context, result).await; if let Err(err) = &result { match err { - StorageError::Validation(ValidationError::Retrieval( - RetrievalError::InvalidResponse { code, .. }, - )) => { - self.0.report.lock().add_error( + StorageError::Validation(ValidationError::Retrieval(err)) => { + self.next.report.lock().add_error( Phase::Retrieval, file, - format!("retrieval of document failed: {code}"), + format!("retrieval of document failed: {err}"), ); - if code.is_client_error() { - // If it's a client error, there's no need to re-try. We simply claim - // success after we logged it. - return Ok(()); + // handle client error as non-retry error + + if let RetrievalError::Source { + err: HttpSourceError::Fetcher(fetcher::Error::Request(err)), + discovered: _, + } = err + { + if let Some(status) = err.status() { + if self.ignore_errors.contains(&status) { + return Ok(()); + } + } } } StorageError::Validation(ValidationError::DigestMismatch { @@ -57,7 +68,7 @@ impl ValidatedVisitor for CsafReportVisitor { actual, .. }) => { - self.0.report.lock().add_error( + self.next.report.lock().add_error( Phase::Validation, file, format!("digest mismatch - expected: {expected}, actual: {actual}"), @@ -68,7 +79,7 @@ impl ValidatedVisitor for CsafReportVisitor { return Ok(()); } StorageError::Validation(ValidationError::Signature { error, .. }) => { - self.0.report.lock().add_error( + self.next.report.lock().add_error( Phase::Validation, file, format!("unable to verify signature: {error}"), @@ -79,7 +90,7 @@ impl ValidatedVisitor for CsafReportVisitor { return Ok(()); } StorageError::Processing(err) => { - self.0.report.lock().add_error( + self.next.report.lock().add_error( Phase::Upload, file, format!("processing failed: {err}"), @@ -90,7 +101,7 @@ impl ValidatedVisitor for CsafReportVisitor { return Ok(()); } StorageError::Storage(err) => { - self.0.report.lock().add_error( + self.next.report.lock().add_error( Phase::Upload, file, format!("upload failed: {err}"), diff --git a/modules/importer/src/runner/csaf/storage.rs b/modules/importer/src/runner/csaf/storage.rs index 327ba59ca..aaa258d3a 100644 --- a/modules/importer/src/runner/csaf/storage.rs +++ b/modules/importer/src/runner/csaf/storage.rs @@ -1,6 +1,7 @@ use crate::runner::{common::storage::StorageError, context::RunContext, report::ReportBuilder}; -use csaf_walker::validation::{ - ValidatedAdvisory, ValidatedVisitor, ValidationContext, ValidationError, +use csaf_walker::{ + source::Source, + validation::{ValidatedAdvisory, ValidatedVisitor, ValidationContext, ValidationError}, }; use parking_lot::Mutex; use std::sync::Arc; @@ -16,8 +17,8 @@ pub struct StorageVisitor { pub labels: Labels, } -impl ValidatedVisitor for StorageVisitor { - type Error = StorageError; +impl ValidatedVisitor for StorageVisitor { + type Error = StorageError>; type Context = (); async fn visit_context(&self, _: &ValidationContext<'_>) -> Result { @@ -27,7 +28,7 @@ impl ValidatedVisitor for StorageVisitor { async fn visit_advisory( &self, _context: &Self::Context, - result: Result, + result: Result>, ) -> Result<(), Self::Error> { let doc = result?; let location = doc.context.url().to_string(); diff --git a/modules/importer/src/runner/sbom/mod.rs b/modules/importer/src/runner/sbom/mod.rs index fafc4d2f4..4ed9109ae 100644 --- a/modules/importer/src/runner/sbom/mod.rs +++ b/modules/importer/src/runner/sbom/mod.rs @@ -13,12 +13,14 @@ use crate::{ server::context::WalkerProgress, }; use parking_lot::Mutex; +use reqwest::StatusCode; use sbom_walker::{ retrieve::RetrievingVisitor, source::{HttpOptions, HttpSource}, validation::ValidationVisitor, walker::Walker, }; +use std::collections::HashSet; use std::{sync::Arc, time::SystemTime}; use tracing::instrument; use trustify_module_ingestor::{graph::Graph, service::IngestorService}; @@ -49,6 +51,7 @@ impl super::ImportRunner { only_patterns, size_limit, fetch_retries, + ignore_missing, } = importer; let url = Url::parse(&source).map_err(|err| ScannerError::Critical(err.into()))?; @@ -74,7 +77,13 @@ impl super::ImportRunner { // wrap storage with report - let storage = SbomReportVisitor(ReportVisitor::new(report.clone(), storage)); + let storage = SbomReportVisitor { + next: ReportVisitor::new(report.clone(), storage), + ignore_errors: match ignore_missing { + true => HashSet::from_iter([StatusCode::NOT_FOUND]), + false => HashSet::new(), + }, + }; // validate (called by retriever) diff --git a/modules/importer/src/runner/sbom/report.rs b/modules/importer/src/runner/sbom/report.rs index f44baaf9c..be1aeff5b 100644 --- a/modules/importer/src/runner/sbom/report.rs +++ b/modules/importer/src/runner/sbom/report.rs @@ -4,52 +4,65 @@ use crate::runner::{ report::{Phase, ReportVisitor}, sbom::storage::StorageVisitor, }; +use reqwest::StatusCode; use sbom_walker::{ - retrieve::RetrievalError, - validation::{ValidatedSbom, ValidatedVisitor, ValidationContext, ValidationError}, + source::{HttpSource, HttpSourceError}, + validation::{ValidatedSbom, ValidatedVisitor, ValidationContext}, }; +use std::collections::HashSet; use trustify_module_ingestor::service; -use walker_common::utils::url::Urlify; +use walker_common::{ + fetcher, retrieve::RetrievalError, utils::url::Urlify, validate::ValidationError, +}; -pub struct SbomReportVisitor(pub ReportVisitor>); +pub struct SbomReportVisitor { + pub next: ReportVisitor>, + pub ignore_errors: HashSet, +} -impl ValidatedVisitor for SbomReportVisitor { - type Error = as ValidatedVisitor>::Error; - type Context = as ValidatedVisitor>::Context; +impl ValidatedVisitor for SbomReportVisitor { + type Error = as ValidatedVisitor>::Error; + type Context = as ValidatedVisitor>::Context; async fn visit_context( &self, context: &ValidationContext<'_>, ) -> Result { - self.0.next.visit_context(context).await + self.next.next.visit_context(context).await } async fn visit_sbom( &self, context: &Self::Context, - result: Result, + result: Result>, ) -> Result<(), Self::Error> { let file = result.url().to_string(); - self.0.report.lock().tick(); + self.next.report.lock().tick(); - let result = self.0.next.visit_sbom(context, result).await; + let result = self.next.next.visit_sbom(context, result).await; if let Err(err) = &result { match err { - StorageError::Validation(ValidationError::Retrieval( - RetrievalError::InvalidResponse { code, .. }, - )) => { - self.0.report.lock().add_error( + StorageError::Validation(ValidationError::Retrieval(err)) => { + self.next.report.lock().add_error( Phase::Retrieval, file, - format!("retrieval of document failed: {code}"), + format!("retrieval of document failed: {err}"), ); - if code.is_client_error() { - // If it's a client error, there's no need to re-try. We simply claim - // success after we logged it. - return Ok(()); + // handle client error as non-retry error + + if let RetrievalError::Source { + err: HttpSourceError::Fetcher(fetcher::Error::Request(err)), + discovered: _, + } = err + { + if let Some(status) = err.status() { + if self.ignore_errors.contains(&status) { + return Ok(()); + } + } } } StorageError::Validation(ValidationError::DigestMismatch { @@ -57,7 +70,7 @@ impl ValidatedVisitor for SbomReportVisitor { actual, .. }) => { - self.0.report.lock().add_error( + self.next.report.lock().add_error( Phase::Validation, file, format!("digest mismatch - expected: {expected}, actual: {actual}"), @@ -68,7 +81,7 @@ impl ValidatedVisitor for SbomReportVisitor { return Ok(()); } StorageError::Validation(ValidationError::Signature { error, .. }) => { - self.0.report.lock().add_error( + self.next.report.lock().add_error( Phase::Validation, file, format!("unable to verify signature: {error}"), @@ -79,7 +92,7 @@ impl ValidatedVisitor for SbomReportVisitor { return Ok(()); } StorageError::Processing(err) => { - self.0.report.lock().add_error( + self.next.report.lock().add_error( Phase::Upload, file, format!("processing failed: {err}"), @@ -90,7 +103,7 @@ impl ValidatedVisitor for SbomReportVisitor { return Ok(()); } StorageError::Storage(err) => { - self.0.report.lock().add_error( + self.next.report.lock().add_error( Phase::Upload, file, format!("upload failed: {err}"), diff --git a/modules/importer/src/runner/sbom/storage.rs b/modules/importer/src/runner/sbom/storage.rs index b4121205c..a377c396f 100644 --- a/modules/importer/src/runner/sbom/storage.rs +++ b/modules/importer/src/runner/sbom/storage.rs @@ -5,13 +5,15 @@ use crate::runner::{ report::{Message, Phase, ReportBuilder}, }; use parking_lot::Mutex; -use sbom_walker::validation::{ - ValidatedSbom, ValidatedVisitor, ValidationContext, ValidationError, +use sbom_walker::{ + source::HttpSource, + validation::{ValidatedSbom, ValidatedVisitor, ValidationContext}, }; use std::sync::Arc; use trustify_entity::labels::Labels; use trustify_module_ingestor::service::{Format, IngestorService}; -use walker_common::{compression::decompress_opt, utils::url::Urlify}; +use walker_common::utils::url::Urlify; +use walker_common::{compression::decompress_opt, validate::ValidationError}; pub struct StorageVisitor { pub context: C, @@ -23,8 +25,8 @@ pub struct StorageVisitor { pub report: Arc>, } -impl ValidatedVisitor for StorageVisitor { - type Error = StorageError; +impl ValidatedVisitor for StorageVisitor { + type Error = StorageError>; type Context = (); async fn visit_context( @@ -37,7 +39,7 @@ impl ValidatedVisitor for StorageVisitor { async fn visit_sbom( &self, _context: &Self::Context, - result: Result, + result: Result>, ) -> Result<(), Self::Error> { let doc = result?; let file = doc.possibly_relative_url(); diff --git a/modules/importer/src/test.rs b/modules/importer/src/test.rs index 1a47fccbf..e46cb0eb0 100644 --- a/modules/importer/src/test.rs +++ b/modules/importer/src/test.rs @@ -30,6 +30,7 @@ fn mock_configuration(source: impl Into) -> ImporterConfiguration { size_limit: None, fetch_retries: None, + ignore_missing: false, }) } diff --git a/openapi.yaml b/openapi.yaml index d8b5e6b2f..fc8c499e9 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -2318,6 +2318,8 @@ components: - integer - 'null' minimum: 0 + ignoreMissing: + type: boolean onlyPatterns: type: array items: @@ -3276,6 +3278,8 @@ components: - integer - 'null' minimum: 0 + ignoreMissing: + type: boolean keys: type: array items: diff --git a/server/src/sample_data.rs b/server/src/sample_data.rs index ad1aad91e..84f96c214 100644 --- a/server/src/sample_data.rs +++ b/server/src/sample_data.rs @@ -159,6 +159,7 @@ pub async fn sample_data(db: trustify_common::db::Database) -> anyhow::Result<() only_patterns: vec![], size_limit: None, fetch_retries: Some(50), + ignore_missing: false, })).await?; add( @@ -175,6 +176,7 @@ pub async fn sample_data(db: trustify_common::db::Database) -> anyhow::Result<() v3_signatures: true, only_patterns: vec![], fetch_retries: Some(50), + ignore_missing: false, }), ) .await?; @@ -193,6 +195,7 @@ pub async fn sample_data(db: trustify_common::db::Database) -> anyhow::Result<() v3_signatures: true, only_patterns: vec!["^cve-2024-".into()], fetch_retries: Some(50), + ignore_missing: false, }), ) .await?; diff --git a/xtask/schema/generate-dump.json b/xtask/schema/generate-dump.json index aba6f33bc..48d1c89ec 100644 --- a/xtask/schema/generate-dump.json +++ b/xtask/schema/generate-dump.json @@ -157,6 +157,9 @@ "format": "uint", "minimum": 0.0 }, + "ignoreMissing": { + "type": "boolean" + }, "labels": { "description": "Labels which will be applied to the ingested documents.", "allOf": [ @@ -464,6 +467,9 @@ "format": "uint", "minimum": 0.0 }, + "ignoreMissing": { + "type": "boolean" + }, "keys": { "type": "array", "items": { diff --git a/xtask/src/dataset.rs b/xtask/src/dataset.rs index 2b5a4bcd7..66300bb93 100644 --- a/xtask/src/dataset.rs +++ b/xtask/src/dataset.rs @@ -64,6 +64,7 @@ impl GenerateDump { only_patterns: vec![], size_limit: self.size_limit, fetch_retries: self.fetch_retries, + ignore_missing: false, }), ImporterConfiguration::Csaf(CsafImporter { common: default_common("Red Hat VEX documents from 2024"), @@ -71,6 +72,7 @@ impl GenerateDump { v3_signatures: true, only_patterns: vec!["^cve-2024-".into()], fetch_retries: self.fetch_retries, + ignore_missing: false, }) ];