diff --git a/Cargo.lock b/Cargo.lock index 373cf72..faee86f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -424,8 +424,10 @@ dependencies = [ "serde", "serde_json", "tar", + "tempfile", "tracing", "tracing-subscriber", + "walkdir", "yara", "yara-sys", "zip", @@ -1471,6 +1473,15 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.23" @@ -1682,14 +1693,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.10.1" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" dependencies = [ "cfg-if", "fastrand", + "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2014,6 +2026,16 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -2121,6 +2143,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -2154,6 +2185,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.5" diff --git a/Cargo.toml b/Cargo.toml index c91648e..ae0996c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,8 +15,10 @@ reqwest = {version = "0.12.4", features = ["blocking", "json", "gzip"]} serde = {version = "1.0.203", features = ["derive"]} serde_json = "1.0.121" tar = "0.4.40" +tempfile = "3.12.0" tracing = "0.1.40" tracing-subscriber = {version = "0.3.18", features = ["env-filter"]} +walkdir = "2.5.0" yara = "0.27.0" yara-sys = {version = "0.27.0", features = ["yara-static"]} zip = "2.1.6" diff --git a/src/client.rs b/src/client.rs index b2fd446..771e7fa 100644 --- a/src/client.rs +++ b/src/client.rs @@ -2,25 +2,16 @@ mod methods; mod models; use chrono::{DateTime, TimeDelta, Utc}; +use flate2::read::GzDecoder; pub use methods::*; pub use models::*; +use tempfile::{tempdir, tempfile, TempDir}; -use crate::APP_CONFIG; use color_eyre::Result; -use flate2::read::GzDecoder; use reqwest::{blocking::Client, Url}; -use std::{ - io::{Cursor, Read}, - time::Duration, -}; +use std::{io, time::Duration}; use tracing::{error, info, trace, warn}; -/// Type alias representing a tar archive -pub type TarballType = tar::Archive>>; - -/// Type alias representing a zip archive -pub type ZipType = zip::ZipArchive>>; - pub struct AuthState { pub access_token: String, pub expires_at: DateTime, @@ -154,25 +145,37 @@ impl DragonflyClient { } } -pub fn fetch_tarball(http_client: &Client, download_url: &Url) -> Result { - let response = http_client.get(download_url.clone()).send()?; +/// Download and unpack a tarball, return the [`TempDir`] containing the contents. +fn extract_tarball(response: R) -> Result { + let mut tarball = tar::Archive::new(GzDecoder::new(response)); + let tmpdir = tempdir()?; + tarball.unpack(tmpdir.path())?; + Ok(tmpdir) +} - let decompressed = GzDecoder::new(response); - let mut cursor: Cursor> = Cursor::new(Vec::new()); - decompressed - .take(APP_CONFIG.max_scan_size) - .read_to_end(cursor.get_mut())?; +/// Download and extract a zip, return the [`TempDir`] containing the contents. +fn extract_zipfile(mut response: R) -> Result { + let mut file = tempfile()?; - Ok(tar::Archive::new(cursor)) -} + // first write the archive to a file because `response` isn't Seek, which is needed by + // `zip::ZipArchive::new` + io::copy(&mut response, &mut file)?; + + let mut zip = zip::ZipArchive::new(file)?; + let tmpdir = tempdir()?; + zip.extract(tmpdir.path())?; -pub fn fetch_zipfile(http_client: &Client, download_url: &Url) -> Result { - let response = http_client.get(download_url.to_string()).send()?; + Ok(tmpdir) +} - let mut cursor = Cursor::new(Vec::new()); - response - .take(APP_CONFIG.max_scan_size) - .read_to_end(cursor.get_mut())?; +pub fn download_distribution(http_client: &Client, download_url: Url) -> Result { + // This conversion is fast as per the docs + let is_tarball = download_url.as_str().ends_with(".tar.gz"); + let response = http_client.get(download_url).send()?; - Ok(zip::ZipArchive::new(cursor)?) + if is_tarball { + extract_tarball(response) + } else { + extract_zipfile(response) + } } diff --git a/src/scanner.rs b/src/scanner.rs index 5999fd0..969adf0 100644 --- a/src/scanner.rs +++ b/src/scanner.rs @@ -1,77 +1,72 @@ -use std::{ - collections::HashSet, - io::Read, - path::{Path, PathBuf}, -}; +use std::{collections::HashSet, path::Path}; use color_eyre::Result; use reqwest::{blocking::Client, Url}; +use tempfile::TempDir; +use walkdir::WalkDir; use yara::Rules; use crate::client::DistributionScanResult; use crate::{ - client::{ - fetch_tarball, fetch_zipfile, FileScanResult, Job, SubmitJobResultsSuccess, TarballType, - ZipType, - }, + client::{download_distribution, FileScanResult, Job, SubmitJobResultsSuccess}, exts::RuleExt, utils::create_inspector_url, }; -/// Scan an archive format using Yara rules. -trait Scan { - fn scan(&mut self, rules: &Rules) -> Result>; -} - -impl Scan for TarballType { - /// Scan a tarball against the given rule set - fn scan(&mut self, rules: &Rules) -> Result> { - let file_scan_results = self - .entries()? - .filter_map(Result::ok) - .map(|mut tarfile| { - let path = tarfile.path()?.to_path_buf(); - scan_file(&mut tarfile, &path, rules) - }) - .filter_map(Result::ok) - .collect(); - - Ok(file_scan_results) - } -} - -impl Scan for ZipType { - /// Scan a zipfile against the given rule set - fn scan(&mut self, rules: &Rules) -> Result> { - let mut file_scan_results = Vec::new(); - for idx in 0..self.len() { - let mut file = self.by_index(idx)?; - let path = PathBuf::from(file.name()); - let scan_results = scan_file(&mut file, &path, rules)?; - file_scan_results.push(scan_results); - } - - Ok(file_scan_results) - } -} - /// A distribution consisting of an archive and an inspector url. struct Distribution { - file: Box, + dir: TempDir, inspector_url: Url, download_url: Url, } impl Distribution { fn scan(&mut self, rules: &Rules) -> Result { - let results = self.file.scan(rules)?; + let mut file_scan_results: Vec = Vec::new(); + for entry in WalkDir::new(self.dir.path()) + .into_iter() + .filter_map(|dirent| dirent.into_iter().find(|de| de.file_type().is_file())) + { + let file_scan_result = self.scan_file(entry.path(), rules)?; + file_scan_results.push(file_scan_result); + } Ok(DistributionScanResults::new( - results, + file_scan_results, self.inspector_url.clone(), &self.download_url, )) } + + /// Scan a file given it's path, and compiled rules. + /// + /// # Arguments + /// * `path` - The path of the file to scan. + /// * `rules` - The compiled rule set to scan this file against + fn scan_file(&self, path: &Path, rules: &Rules) -> Result { + let rules = rules + .scan_file(path, 10)? + .into_iter() + .filter(|rule| { + let filetypes = rule.get_filetypes(); + filetypes.is_empty() + || filetypes + .iter() + .any(|filetype| path.to_string_lossy().ends_with(filetype)) + }) + .collect(); + + Ok(FileScanResult::new( + self.relative_to_archive_root(path)?, + rules, + )) + } + + /// Make the path relative to the archive root + fn relative_to_archive_root(&self, path: &Path) -> Result { + // Use strip prefix to remove the tempdir path, then skip the archive dir + Ok(path.strip_prefix(self.dir.path())?.iter().skip(1).collect()) + } } /// Struct representing the results of a scanned distribution @@ -229,12 +224,10 @@ pub fn scan_all_distributions( let download_url: Url = distribution.parse().unwrap(); let inspector_url = create_inspector_url(&job.name, &job.version, &download_url); + let dir = download_distribution(http_client, download_url.clone())?; + let mut dist = Distribution { - file: if distribution.ends_with(".tar.gz") { - Box::new(fetch_tarball(http_client, &download_url)?) - } else { - Box::new(fetch_zipfile(http_client, &download_url)?) - }, + dir, inspector_url, download_url, }; @@ -245,42 +238,20 @@ pub fn scan_all_distributions( Ok(distribution_scan_results) } -/// Scan a file given it implements `Read`. -/// -/// # Arguments -/// * `path` - The path corresponding to this file -/// * `rules` - The compiled rule set to scan this file against -fn scan_file(file: &mut impl Read, path: &Path, rules: &Rules) -> Result { - let mut buffer = Vec::new(); - file.read_to_end(&mut buffer)?; - - let rules = rules - .scan_mem(&buffer, 10)? - .into_iter() - .filter(|rule| { - let filetypes = rule.get_filetypes(); - filetypes.is_empty() - || filetypes - .iter() - .any(|filetype| path.to_string_lossy().ends_with(filetype)) - }) - .collect(); - - Ok(FileScanResult::new(path.to_path_buf(), rules)) -} - #[cfg(test)] mod tests { use std::collections::HashMap; + use std::io::Write; use std::{collections::HashSet, path::PathBuf}; use yara::Compiler; - use super::{scan_file, DistributionScanResults, PackageScanResults}; + use super::{DistributionScanResults, PackageScanResults}; use crate::client::{ DistributionScanResult, Match, MetadataValue, PatternMatch, Range, RuleMatch, ScanResultSerializer, SubmitJobResultsError, SubmitJobResultsSuccess, }; use crate::test::make_file_scan_result; + use tempfile::{tempdir, tempdir_in}; #[test] fn test_scan_result_success_serialization() { @@ -447,10 +418,26 @@ mod tests { let compiler = Compiler::new().unwrap().add_rules_str(rules).unwrap(); let rules = compiler.compile_rules().unwrap(); - let result = - scan_file(&mut "I love Rust!".as_bytes(), &PathBuf::default(), &rules).unwrap(); - assert_eq!(result.path, PathBuf::default()); + let tempdir = tempdir().unwrap(); + let archive_root = tempfile::Builder::new().tempdir_in(tempdir.path()).unwrap(); + + let mut tmpfile = tempfile::NamedTempFile::new_in(archive_root.path()).unwrap(); + + writeln!(&mut tmpfile, "I hate Rust >:(").unwrap(); + + let distro = super::Distribution { + dir: tempdir, + download_url: "https://example.com".parse().unwrap(), + inspector_url: "https://example.com".parse().unwrap(), + }; + + let result = distro.scan_file(tmpfile.path(), &rules).unwrap(); + + assert_eq!( + result.path, + tmpfile.path().strip_prefix(archive_root.path()).unwrap() + ); assert_eq!( RuleMatch { identifier: "contains_rust".to_string(), @@ -470,4 +457,54 @@ mod tests { assert_eq!(result.calculate_score(), 5); } + + #[test] + fn test_relative_to_archive_root() { + let tempdir = tempdir().unwrap(); + + let input_path = &tempdir.path().join("package-name").join("README.md"); + let expected_path = PathBuf::from("README.md"); + + let distro = super::Distribution { + dir: tempdir, + download_url: "https://example.com".parse().unwrap(), + inspector_url: "https://example.com".parse().unwrap(), + }; + + let result = distro.relative_to_archive_root(input_path).unwrap(); + + assert_eq!(expected_path, result); + } + + #[test] + fn scan_skips_directories() { + let rules = r#" + rule contains_rust { + meta: + weight = 5 + strings: + $rust = "rust" nocase + condition: + $rust + } + "#; + + let compiler = Compiler::new().unwrap().add_rules_str(rules).unwrap(); + + let rules = compiler.compile_rules().unwrap(); + let tempdir = tempdir().unwrap(); + let _subtempdir = tempdir_in(tempdir.path()).unwrap(); + let mut tempfile = tempfile::NamedTempFile::new_in(tempdir.path()).unwrap(); + writeln!(&mut tempfile, "rust").unwrap(); + + let mut distro = super::Distribution { + dir: tempdir, + download_url: "https://example.com".parse().unwrap(), + inspector_url: "https://example.com".parse().unwrap(), + }; + + let results = distro.scan(&rules).unwrap(); + + assert_eq!(results.distro_scan_results.files.len(), 1); + } }