diff --git a/Cargo.lock b/Cargo.lock index bbc00a6..8d626ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -696,6 +696,12 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.6.0" @@ -1144,6 +1150,15 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +[[package]] +name = "encoding_rs" +version = "0.8.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" +dependencies = [ + "cfg-if", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -1192,6 +1207,21 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1613,6 +1643,22 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + 
"http-body-util", + "hyper 1.4.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.6" @@ -1892,6 +1938,23 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -1957,12 +2020,50 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "openssl" +version = "0.10.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.72", +] + [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "outref" version = "0.5.1" @@ -2220,7 +2321,7 @@ dependencies = [ 
[[package]] name = "redacter" -version = "0.2.0" +version = "0.3.0" dependencies = [ "async-recursion", "async-trait", @@ -2239,6 +2340,7 @@ dependencies = [ "indicatif", "mime", "mime_guess", + "reqwest", "rsb_derive", "rvstruct", "serde", @@ -2248,6 +2350,7 @@ dependencies = [ "thiserror", "tokio", "tokio-util", + "url", "zip", ] @@ -2283,19 +2386,23 @@ dependencies = [ "async-compression", "base64 0.22.1", "bytes", + "encoding_rs", "futures-core", "futures-util", + "h2 0.4.5", "http 1.1.0", "http-body 1.0.1", "http-body-util", "hyper 1.4.1", "hyper-rustls 0.27.2", + "hyper-tls", "hyper-util", "ipnet", "js-sys", "log", "mime", "mime_guess", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -2308,7 +2415,9 @@ dependencies = [ "serde_json", "serde_urlencoded", "sync_wrapper 1.0.1", + "system-configuration", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.0", "tokio-util", "tower-service", @@ -2384,7 +2493,7 @@ version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", @@ -2573,7 +2682,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags", + "bitflags 2.6.0", "core-foundation", "core-foundation-sys", "libc", @@ -2818,6 +2927,27 @@ dependencies = [ "futures-core", ] +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tempfile" version = "3.11.0" @@ -2926,6 +3056,16 @@ dependencies = [ "syn 2.0.72", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -3164,6 +3304,12 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" diff --git a/Cargo.toml b/Cargo.toml index 59acf06..93c1c8d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "redacter" -version = "0.2.0" +version = "0.3.0" edition = "2021" authors = ["Abdulla Abdurakhmanov "] license = "Apache-2.0" @@ -18,7 +18,8 @@ description = "Copy & Redact files cli tool utilizing Data Loss Prevention (DLP) default = [] ci-gcp = [] # For testing on CI/GCP ci-aws = [] # For testing on CI/AWS -ci = ["ci-gcp", "ci-aws"] +ci-ms-presidio = [] # For testing on CI/MS Presidio +ci = ["ci-gcp", "ci-aws", "ci-ms-presidio"] [dependencies] @@ -48,6 +49,8 @@ csv-async = { version = "1", default-features = false, features = ["tokio", "tok aws-config = { version = "1", features = ["behavior-version-latest"] } aws-sdk-s3 = { version = "1" } aws-sdk-comprehend = { version = "1" } +url = "2" +reqwest = { version = "0.12", features = ["multipart", "h2"] } [dev-dependencies] diff --git a/README.md b/README.md index 5bea1d0..9e8e533 100644 --- a/README.md +++ b/README.md @@ -20,11 
+20,16 @@ Google Cloud Platform's DLP API. * Amazon Simple Storage Service (S3) * Zip files * **DLP Integration:** - * GCP DLP API for accurate and customizable redaction for: + * [Google Cloud Platform DLP](https://cloud.google.com/security/products/dlp?hl=en) for accurate and customizable + redaction for: * text, html, json files * structured data table files (csv) * images (jpeg, png, bpm, gif) - * AWS Comprehend PII redaction for text files. + * [AWS Comprehend](https://aws.amazon.com/comprehend/) PII redaction for text files. + * [Microsoft Presidio](https://microsoft.github.io/presidio/) for PII redaction (open source project that you can + install on-prem). + * text, html, json files + * images * ... more DLP providers can be added in the future. * **CLI:** Easy-to-use command-line interface for streamlined workflows. * Built with Rust to ensure speed, safety, and reliability. @@ -58,7 +63,7 @@ Options: -f, --filename-filter Filter by name using glob patterns such as *.txt -d, --redact - Redacter type [possible values: gcp-dlp, aws-comprehend-dlp] + Redacter type [possible values: gcp-dlp, aws-comprehend, ms-presidio] --gcp-project-id GCP project id that will be used to redact and bill API calls --allow-unsupported-copies @@ -69,6 +74,10 @@ Options: CSV delimiter (default is ',' --aws-region AWS region for AWS Comprehend DLP redacter + --ms-presidio-text-analyze-url + URL for text analyze endpoint for MsPresidio redacter + --ms-presidio-image-redact-url + URL for image redact endpoint for MsPresidio redacter -h, --help Print help ``` @@ -91,12 +100,19 @@ Source/destination can be a local file or directory, or a file in GCS, S3, or a To be able to use GCP DLP you need to authenticate using `gcloud auth application-default login` or provide a service account key using `GOOGLE_APPLICATION_CREDENTIALS` environment variable. 
-### AWS Comprehend DLP +### AWS Comprehend To be able to use AWS Comprehend DLP you need to authenticate using `aws configure` or provide a service account. To provide an AWS region use `--aws-region` option since AWS Comprehend may not be available in all regions. AWS Comprehend DLP is only available for unstructured text files. +### Microsoft Presidio + +To be able to use Microsoft Presidio DLP you need to have a running instance of the Presidio API. +You can use Docker to run it locally or deploy it to your infrastructure. +You need to provide the URLs for text analysis and image redaction endpoints using `--ms-presidio-text-analyze-url` and +`--ms-presidio-image-redact-url` options. + ## Examples: ```sh @@ -128,6 +144,12 @@ and/or by size: redacter cp -m 1024 ... ``` +MS Presidio redacter: + +```sh +redacter cp -d ms-presidio --ms-presidio-text-analyze-url http://localhost:5002/analyze --ms-presidio-image-redact-url http://localhost:5003/redact ... +``` + ## Security considerations - Your file contents are sent to the DLP API for redaction. Make sure you trust the DLP API provider. 
diff --git a/src/args.rs b/src/args.rs index 251dce3..4077618 100644 --- a/src/args.rs +++ b/src/args.rs @@ -3,6 +3,7 @@ use crate::errors::AppError; use crate::redacters::{GcpDlpRedacterOptions, RedacterOptions, RedacterProviderOptions}; use clap::*; use std::fmt::Display; +use url::Url; #[derive(Parser, Debug)] #[command(author, about)] @@ -40,7 +41,8 @@ pub enum CliCommand { #[derive(ValueEnum, Debug, Clone)] pub enum RedacterType { GcpDlp, - AwsComprehendDlp, + AwsComprehend, + MsPresidio, } impl std::str::FromStr for RedacterType { @@ -49,7 +51,8 @@ impl std::str::FromStr for RedacterType { fn from_str(s: &str) -> Result { match s { "gcp-dlp" => Ok(RedacterType::GcpDlp), - "aws-comprehend-dlp" => Ok(RedacterType::AwsComprehendDlp), + "aws-comprehend" => Ok(RedacterType::AwsComprehend), + "ms-presidio" => Ok(RedacterType::MsPresidio), _ => Err(format!("Unknown redacter type: {}", s)), } } @@ -59,7 +62,8 @@ impl Display for RedacterType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { RedacterType::GcpDlp => write!(f, "gcp-dlp"), - RedacterType::AwsComprehendDlp => write!(f, "aws-comprehend-dlp"), + RedacterType::AwsComprehend => write!(f, "aws-comprehend"), + RedacterType::MsPresidio => write!(f, "ms-presidio"), } } } @@ -95,6 +99,12 @@ pub struct RedacterArgs { #[arg(long, help = "AWS region for AWS Comprehend DLP redacter")] pub aws_region: Option, + + #[arg(long, help = "URL for text analyze endpoint for MsPresidio redacter")] + pub ms_presidio_text_analyze_url: Option, + + #[arg(long, help = "URL for image redact endpoint for MsPresidio redacter")] + pub ms_presidio_image_redact_url: Option, } impl TryInto for RedacterArgs { @@ -110,11 +120,28 @@ impl TryInto for RedacterArgs { message: "GCP project id is required for GCP DLP redacter".to_string(), }), }, - Some(RedacterType::AwsComprehendDlp) => Ok(RedacterProviderOptions::AwsComprehendDlp( - crate::redacters::AwsComprehendDlpRedacterOptions { + 
Some(RedacterType::AwsComprehend) => Ok(RedacterProviderOptions::AwsComprehend( + crate::redacters::AwsComprehendRedacterOptions { region: self.aws_region.map(aws_config::Region::new), }, )), + Some(RedacterType::MsPresidio) => { + if self.ms_presidio_text_analyze_url.is_none() + && self.ms_presidio_image_redact_url.is_none() + { + return Err(AppError::RedacterConfigError { + message: + "MsPresidio requires text analyze/image URL specified (at least one)" + .to_string(), + }); + } + Ok(RedacterProviderOptions::MsPresidio( + crate::redacters::MsPresidioRedacterOptions { + text_analyze_url: self.ms_presidio_text_analyze_url, + image_redact_url: self.ms_presidio_image_redact_url, + }, + )) + } None => Err(AppError::RedacterConfigError { message: "Redacter type is required".to_string(), }), diff --git a/src/errors.rs b/src/errors.rs index 16f5466..1034196 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -39,6 +39,8 @@ pub enum AppError { AwsSdkError(#[from] Box), #[error("MIME error")] MimeError(#[from] mime::FromStrError), + #[error("HTTP client error")] + HttpClientError(#[from] reqwest::Error), #[error("Zip error")] ZipError(#[from] zip::result::ZipError), #[error("CSV parser error")] diff --git a/src/filesystems/mod.rs b/src/filesystems/mod.rs index 87416c1..90bea54 100644 --- a/src/filesystems/mod.rs +++ b/src/filesystems/mod.rs @@ -22,6 +22,16 @@ pub use file_matcher::*; #[derive(Debug, Clone, ValueStruct)] pub struct RelativeFilePath(pub String); +impl RelativeFilePath { + pub fn filename(&self) -> String { + self.value() + .split('/') + .last() + .map(|s| s.to_string()) + .unwrap_or_default() + } +} + #[derive(Debug, Clone)] pub struct AbsoluteFilePath { pub file_path: String, diff --git a/src/redacters/aws_comprehend.rs b/src/redacters/aws_comprehend.rs index 142d598..38e2668 100644 --- a/src/redacters/aws_comprehend.rs +++ b/src/redacters/aws_comprehend.rs @@ -10,21 +10,21 @@ use aws_config::Region; use rvstruct::ValueStruct; #[derive(Debug, Clone)] -pub 
struct AwsComprehendDlpRedacterOptions { +pub struct AwsComprehendRedacterOptions { pub region: Option, } #[derive(Clone)] -pub struct AwsComprehendDlpRedacter<'a> { +pub struct AwsComprehendRedacter<'a> { client: aws_sdk_comprehend::Client, redacter_options: RedacterOptions, reporter: &'a AppReporter<'a>, } -impl<'a> AwsComprehendDlpRedacter<'a> { +impl<'a> AwsComprehendRedacter<'a> { pub async fn new( redacter_options: RedacterOptions, - aws_dlp_options: AwsComprehendDlpRedacterOptions, + aws_dlp_options: AwsComprehendRedacterOptions, reporter: &'a AppReporter<'a>, ) -> AppResult { let region_provider = aws_config::meta::region::RegionProviderChain::first_try( @@ -87,7 +87,7 @@ impl<'a> AwsComprehendDlpRedacter<'a> { } } -impl<'a> Redacter for AwsComprehendDlpRedacter<'a> { +impl<'a> Redacter for AwsComprehendRedacter<'a> { async fn redact(&self, input: RedacterDataItem) -> AppResult { match &input.content { RedacterDataItemContent::Value(_) => self.redact_text_file(input).await, @@ -143,8 +143,8 @@ mod tests { let input = RedacterDataItem { file_ref, content }; let redacter_options = RedacterOptions { - provider_options: RedacterProviderOptions::AwsComprehendDlp( - AwsComprehendDlpRedacterOptions { + provider_options: RedacterProviderOptions::AwsComprehend( + AwsComprehendRedacterOptions { region: Some(Region::new(test_aws_region.clone())), }, ), @@ -153,9 +153,9 @@ mod tests { csv_delimiter: None, }; - let redacter = AwsComprehendDlpRedacter::new( + let redacter = AwsComprehendRedacter::new( redacter_options, - AwsComprehendDlpRedacterOptions { + AwsComprehendRedacterOptions { region: Some(Region::new(test_aws_region)), }, &reporter, diff --git a/src/redacters/mod.rs b/src/redacters/mod.rs index beae5bd..d9e049a 100644 --- a/src/redacters/mod.rs +++ b/src/redacters/mod.rs @@ -15,6 +15,9 @@ pub use gcp_dlp::*; mod aws_comprehend; pub use aws_comprehend::*; +mod ms_presidio; +pub use ms_presidio::*; + #[derive(Debug, Clone)] pub struct RedacterDataItem { pub 
content: RedacterDataItemContent, @@ -37,7 +40,8 @@ pub enum RedacterDataItemContent { #[derive(Clone)] pub enum Redacters<'a> { GcpDlp(GcpDlpRedacter<'a>), - AwsComprehendDlp(AwsComprehendDlpRedacter<'a>), + AwsComprehendDlp(AwsComprehendRedacter<'a>), + MsPresidio(MsPresidioRedacter<'a>), } #[derive(Debug, Clone)] @@ -51,14 +55,16 @@ pub struct RedacterOptions { #[derive(Debug, Clone)] pub enum RedacterProviderOptions { GcpDlp(GcpDlpRedacterOptions), - AwsComprehendDlp(AwsComprehendDlpRedacterOptions), + AwsComprehend(AwsComprehendRedacterOptions), + MsPresidio(MsPresidioRedacterOptions), } impl Display for RedacterOptions { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self.provider_options { RedacterProviderOptions::GcpDlp(_) => write!(f, "gcp-dlp"), - RedacterProviderOptions::AwsComprehendDlp(_) => write!(f, "aws-comprehend-dlp"), + RedacterProviderOptions::AwsComprehend(_) => write!(f, "aws-comprehend"), + RedacterProviderOptions::MsPresidio(_) => write!(f, "ms-presidio"), } } } @@ -72,16 +78,14 @@ impl<'a> Redacters<'a> { RedacterProviderOptions::GcpDlp(ref options) => Ok(Redacters::GcpDlp( GcpDlpRedacter::new(redacter_options.clone(), options.clone(), reporter).await?, )), - RedacterProviderOptions::AwsComprehendDlp(ref options) => { - Ok(Redacters::AwsComprehendDlp( - AwsComprehendDlpRedacter::new( - redacter_options.clone(), - options.clone(), - reporter, - ) + RedacterProviderOptions::AwsComprehend(ref options) => Ok(Redacters::AwsComprehendDlp( + AwsComprehendRedacter::new(redacter_options.clone(), options.clone(), reporter) .await?, - )) - } + )), + RedacterProviderOptions::MsPresidio(ref options) => Ok(Redacters::MsPresidio( + MsPresidioRedacter::new(redacter_options.clone(), options.clone(), reporter) + .await?, + )), } } @@ -239,6 +243,7 @@ impl<'a> Redacter for Redacters<'a> { match self { Redacters::GcpDlp(redacter) => redacter.redact(input).await, Redacters::AwsComprehendDlp(redacter) => 
redacter.redact(input).await, + Redacters::MsPresidio(redacter) => redacter.redact(input).await, } } @@ -251,6 +256,7 @@ impl<'a> Redacter for Redacters<'a> { Redacters::AwsComprehendDlp(redacter) => { redacter.redact_supported_options(file_ref).await } + Redacters::MsPresidio(redacter) => redacter.redact_supported_options(file_ref).await, } } @@ -258,6 +264,7 @@ impl<'a> Redacter for Redacters<'a> { match self { Redacters::GcpDlp(redacter) => redacter.options(), Redacters::AwsComprehendDlp(redacter) => redacter.options(), + Redacters::MsPresidio(redacter) => redacter.options(), } } }
diff --git a/src/redacters/ms_presidio.rs b/src/redacters/ms_presidio.rs
new file mode 100644
index 0000000..7d64cf0
--- /dev/null
+++ b/src/redacters/ms_presidio.rs
@@ -0,0 +1,318 @@
+use rvstruct::ValueStruct;
+use serde::{Deserialize, Serialize};
+use url::Url;
+
+use crate::errors::AppError;
+use crate::filesystems::FileSystemRef;
+use crate::redacters::{
+    RedactSupportedOptions, Redacter, RedacterDataItem, RedacterDataItemContent, RedacterOptions,
+    Redacters,
+};
+use crate::reporter::AppReporter;
+use crate::AppResult;
+
+/// Endpoints of the Microsoft Presidio services used for redaction.
+///
+/// Both URLs are optional; a content kind whose endpoint is `None` is reported
+/// as unsupported by `redact_supported_options`.
+#[derive(Debug, Clone)]
+pub struct MsPresidioRedacterOptions {
+    pub text_analyze_url: Option<Url>,
+    pub image_redact_url: Option<Url>,
+}
+
+/// Redacter that sends content to Microsoft Presidio HTTP endpoints.
+#[derive(Clone)]
+pub struct MsPresidioRedacter<'a> {
+    client: reqwest::Client,
+    ms_presidio_options: MsPresidioRedacterOptions,
+    redacter_options: RedacterOptions,
+    reporter: &'a AppReporter<'a>,
+}
+
+/// JSON body posted to the text analyze endpoint.
+#[derive(Serialize, Clone, Debug)]
+struct MsPresidioAnalyzeRequest {
+    text: String,
+    language: String,
+}
+
+/// One finding of the analyze response, reduced to the fields read here.
+#[derive(Deserialize, Clone, Debug)]
+struct MsPresidioAnalyzedItem {
+    entity_type: String,
+    start: Option<usize>,
+    end: Option<usize>,
+}
+
+impl<'a> MsPresidioRedacter<'a> {
+    /// List of entity types that should be disallowed for redacting
+    /// since they produce a lot of false positives
+    const DISALLOW_ENTITY_TYPES: [&'static str; 1] = ["US_DRIVER_LICENSE"];
+
+    /// Creates a redacter backed by a new `reqwest::Client`.
+    pub async fn new(
+        redacter_options: RedacterOptions,
+        ms_presidio_options: MsPresidioRedacterOptions,
+        reporter: &'a AppReporter<'a>,
+    ) -> AppResult<Self> {
+        let client = reqwest::Client::new();
+        Ok(Self {
+            client,
+            ms_presidio_options,
+            redacter_options,
+            reporter,
+        })
+    }
+
+    /// Overwrites the characters in `[start, end)` with `X`.
+    ///
+    /// A missing `start` means "from the beginning" and a missing `end` means
+    /// "to the end"; when both are missing nothing is masked. Offsets are
+    /// clamped to the buffer length and an inverted range is ignored, so a
+    /// malformed finding cannot cause a panic.
+    fn mask_range(chars: &mut [char], start: Option<usize>, end: Option<usize>) {
+        if start.is_none() && end.is_none() {
+            return;
+        }
+        let len = chars.len();
+        let from = start.unwrap_or(0).min(len);
+        let to = end.unwrap_or(len).min(len);
+        if from < to {
+            chars[from..to].fill('X');
+        }
+    }
+
+    /// Sends the text to the analyze endpoint and masks every finding with `X`.
+    ///
+    /// Fails with `RedacterConfigError` when no analyze URL is configured and
+    /// with `SystemError` for non-text input or a non-success/non-JSON response.
+    pub async fn redact_text_file(
+        &self,
+        input: RedacterDataItem,
+    ) -> AppResult<RedacterDataItemContent> {
+        self.reporter.report(format!(
+            "Redacting a text file: {} ({:?})",
+            input.file_ref.relative_path.value(),
+            input.file_ref.media_type
+        ))?;
+        let text_content = match input.content {
+            RedacterDataItemContent::Value(content) => Ok(content),
+            _ => Err(AppError::SystemError {
+                message: "Unsupported item for text redacting".to_string(),
+            }),
+        }?;
+
+        let analyze_url = self
+            .ms_presidio_options
+            .text_analyze_url
+            .as_ref()
+            .ok_or_else(|| AppError::RedacterConfigError {
+                message: "Text analyze URL is not configured".to_string(),
+            })?;
+        let analyze_request = MsPresidioAnalyzeRequest {
+            text: text_content.clone(),
+            language: "en".to_string(),
+        };
+        let response = self
+            .client
+            .post(analyze_url.clone())
+            .json(&analyze_request)
+            .send()
+            .await?;
+        // Compare only the media type so that parameters such as
+        // `; charset=utf-8` do not make a valid JSON response look invalid.
+        let is_json = response
+            .headers()
+            .get(reqwest::header::CONTENT_TYPE)
+            .and_then(|value| value.to_str().ok())
+            .and_then(|value| value.parse::<mime::Mime>().ok())
+            .map(|media| media.essence_str() == mime::APPLICATION_JSON.essence_str())
+            .unwrap_or(false);
+        if !response.status().is_success() || !is_json {
+            let response_status = response.status();
+            let response_text = response.text().await.unwrap_or_default();
+            return Err(AppError::SystemError {
+                message: format!(
+                    "Failed to analyze text: {}. HTTP status: {}.",
+                    response_text, response_status
+                ),
+            });
+        }
+        let response_items: Vec<MsPresidioAnalyzedItem> = response.json().await?;
+        // Presidio is a Python service, so its offsets count characters rather
+        // than UTF-8 bytes. Mask on a `char` buffer: slicing the `String` with
+        // these offsets panics or mis-redacts once the text is not pure ASCII.
+        let mut redacted_chars: Vec<char> = text_content.chars().collect();
+        response_items
+            .iter()
+            .filter(|item| !Self::DISALLOW_ENTITY_TYPES.contains(&item.entity_type.as_str()))
+            .for_each(|item| Self::mask_range(&mut redacted_chars, item.start, item.end));
+        Ok(RedacterDataItemContent::Value(
+            redacted_chars.into_iter().collect(),
+        ))
+    }
+
+    /// Uploads the image to the redact endpoint and returns the redacted bytes.
+    ///
+    /// Fails with `RedacterConfigError` when no redact URL is configured and
+    /// with `SystemError` for non-image input or a non-success response.
+    pub async fn redact_image_file(
+        &self,
+        input: RedacterDataItem,
+    ) -> AppResult<RedacterDataItemContent> {
+        let redact_url = self
+            .ms_presidio_options
+            .image_redact_url
+            .as_ref()
+            .ok_or_else(|| AppError::RedacterConfigError {
+                message: "Image redact URL is not configured".to_string(),
+            })?;
+
+        match input.content {
+            RedacterDataItemContent::Image { mime_type, data } => {
+                self.reporter.report(format!(
+                    "Redacting an image file: {} ({:?})",
+                    input.file_ref.relative_path.value(),
+                    input.file_ref.media_type
+                ))?;
+                // `mime_str` fails on an unparsable MIME string; surface that as
+                // an error instead of panicking.
+                let file_part = reqwest::multipart::Part::bytes(data.to_vec())
+                    .file_name(input.file_ref.relative_path.filename())
+                    .mime_str(mime_type.as_ref())?;
+                let form = reqwest::multipart::Form::new().part("image", file_part);
+                let response = self
+                    .client
+                    .post(redact_url.clone())
+                    .multipart(form)
+                    .send()
+                    .await?;
+                if !response.status().is_success() {
+                    let response_status = response.status();
+                    let response_text = response.text().await.unwrap_or_default();
+                    return Err(AppError::SystemError {
+                        message: format!(
+                            "Failed to redact image: {}. HTTP status: {}.",
+                            response_text, response_status
+                        ),
+                    });
+                }
+                let redacted_image_bytes = response.bytes().await?;
+                Ok(RedacterDataItemContent::Image {
+                    mime_type,
+                    data: redacted_image_bytes,
+                })
+            }
+            _ => Err(AppError::SystemError {
+                message: "Unsupported item for image redacting".to_string(),
+            }),
+        }
+    }
+}
+
+impl<'a> Redacter for MsPresidioRedacter<'a> {
+    /// Dispatches on the content kind; tables are not supported by this redacter.
+    async fn redact(&self, input: RedacterDataItem) -> AppResult<RedacterDataItemContent> {
+        match &input.content {
+            RedacterDataItemContent::Value(_) => self.redact_text_file(input).await,
+            RedacterDataItemContent::Image { .. } => self.redact_image_file(input).await,
+            RedacterDataItemContent::Table { .. } => Err(AppError::SystemError {
+                message: "Attempt to redact of unsupported table type".to_string(),
+            }),
+        }
+    }
+
+    /// Reports whether the file's media type can be redacted with the configured endpoints.
+    async fn redact_supported_options(
+        &self,
+        file_ref: &FileSystemRef,
+    ) -> AppResult<RedactSupportedOptions> {
+        Ok(match file_ref.media_type.as_ref() {
+            Some(media_type)
+                if Redacters::is_mime_text(media_type)
+                    && self.ms_presidio_options.text_analyze_url.is_some() =>
+            {
+                RedactSupportedOptions::Supported
+            }
+            Some(media_type)
+                if Redacters::is_mime_table(media_type)
+                    && self.ms_presidio_options.text_analyze_url.is_some() =>
+            {
+                RedactSupportedOptions::SupportedAsText
+            }
+            Some(media_type)
+                if Redacters::is_mime_image(media_type)
+                    && self.ms_presidio_options.image_redact_url.is_some() =>
+            {
+                RedactSupportedOptions::Supported
+            }
+            _ => RedactSupportedOptions::Unsupported,
+        })
+    }
+
+    fn options(&self) -> &RedacterOptions {
+        &self.redacter_options
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use console::Term;
+
+    use crate::redacters::RedacterProviderOptions;
+
+    use super::*;
+
+    #[tokio::test]
+    #[cfg_attr(not(feature = "ci-ms-presidio"), ignore)]
+    async fn redact_text_file_test() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        let term = Term::stdout();
+        let reporter: AppReporter = AppReporter::from(&term);
+        let test_analyze_url: Url = Url::parse(
+            std::env::var("TEST_MS_PRESIDIO_ANALYZE_URL")
+                .expect("TEST_MS_PRESIDIO_ANALYZE_URL required")
+                .as_str(),
+        )?;
+        let test_content = "Hello, John";
+
+        let file_ref = FileSystemRef {
+            relative_path: "temp_file.txt".into(),
+            media_type: Some(mime::TEXT_PLAIN),
+            file_size: Some(test_content.len() as u64),
+        };
+
+        let content = RedacterDataItemContent::Value(test_content.to_string());
+        let input = RedacterDataItem { file_ref, content };
+
+        let redacter_options = RedacterOptions {
+            provider_options: RedacterProviderOptions::MsPresidio(MsPresidioRedacterOptions {
+                text_analyze_url: Some(test_analyze_url.clone()),
+                image_redact_url: None,
+            }),
+            allow_unsupported_copies: false,
+            csv_headers_disable: false,
+            csv_delimiter: None,
+        };
+
+        let redacter = MsPresidioRedacter::new(
+            redacter_options,
+            MsPresidioRedacterOptions {
+                text_analyze_url: Some(test_analyze_url),
+                image_redact_url: None,
+            },
+            &reporter,
+        )
+        .await?;
+
+        let redacted_content = redacter.redact(input).await?;
+        match redacted_content {
+            RedacterDataItemContent::Value(value) => {
+                assert_eq!(value, "Hello, XXXX");
+            }
+            _ => panic!("Unexpected redacted content type"),
+        }
+
+        Ok(())
+    }
+}