From 2a874ef43ff8b7179552af3a4462f545a16dac88 Mon Sep 17 00:00:00 2001 From: Abdulla Abdurakhmanov Date: Mon, 19 Aug 2024 12:52:45 +0200 Subject: [PATCH] Image to text OCR conversion implementation to redact images with text only DLPs (#24) --- .gitattributes | 2 + .gitignore | 1 + Cargo.lock | 147 ++++- Cargo.toml | 12 +- src/commands/copy_command.rs | 155 ++--- src/common_types.rs | 10 + src/errors.rs | 20 + src/file_converters/mod.rs | 35 +- src/file_converters/ocr.rs | 6 + src/file_converters/ocr_ocrs.rs | 120 ++++ src/file_systems/clipboard.rs | 4 +- src/main.rs | 2 +- src/redacters/aws_comprehend.rs | 3 - src/redacters/gcp_dlp.rs | 3 - src/redacters/gemini_llm.rs | 12 +- src/redacters/mod.rs | 2 - src/redacters/ms_presidio.rs | 12 - src/redacters/open_ai_llm.rs | 3 - src/redacters/simple_image_redacter.rs | 31 +- src/redacters/stream_redacter.rs | 777 ++++++++++++++++--------- test-fixtures/media/form-example.png | 3 + 21 files changed, 959 insertions(+), 401 deletions(-) create mode 100644 .gitattributes create mode 100644 src/file_converters/ocr.rs create mode 100644 src/file_converters/ocr_ocrs.rs create mode 100644 test-fixtures/media/form-example.png diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..503b5ad --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.rten filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index b0e73dd..b8c3af6 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ *.swp tmp/ lib/ +models/ diff --git a/Cargo.lock b/Cargo.lock index f416814..5038b9e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1335,6 +1335,27 @@ dependencies = [ "subtle", ] +[[package]] +name = "dirs" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.48.0", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -1453,6 +1474,16 @@ dependencies = [ "subtle", ] +[[package]] +name = "flatbuffers" +version = "22.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ae1bfc84d904f75e7ef6f8796b020c606a9e8e271e2004c0a74f7edeedba45f" +dependencies = [ + "bitflags 1.3.2", + "rustc_version", +] + [[package]] name = "flate2" version = "1.0.31" @@ -2251,7 +2282,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", +] + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + +[[package]] +name = "libredox" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +dependencies = [ + "bitflags 2.6.0", + "libc", ] [[package]] @@ -2488,6 +2535,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "number_prefix" version = "0.4.0" @@ -2602,6 +2659,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "ocrs" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04022287d9279e74c3d718cc7fd06fa57b48998bb2a24eb6e8582934220212a0" +dependencies = [ + "anyhow", + "rayon", + "rten", + "rten-imageproc", + "rten-tensor", + "thiserror", + "wasm-bindgen", +] + [[package]] name = "once_cell" version = "1.19.0" @@ -2614,6 +2686,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "outref" version = "0.5.1" @@ -3072,6 +3150,7 @@ dependencies = [ name = "redacter" version = "0.9.0" dependencies = [ + "anyhow", "arboard", "async-recursion", "async-trait", @@ -3084,6 +3163,7 @@ dependencies = [ "clap", "console", "csv-async", + "dirs", "futures", "gcloud-sdk", "globset", @@ -3092,10 +3172,13 @@ dependencies = [ "indicatif", "mime", "mime_guess", + "ocrs", "pdfium-render", "rand", "reqwest", "rsb_derive", + "rten", + "rten-imageproc", "rvstruct", "serde", "serde_json", @@ -3120,6 +3203,17 @@ dependencies = [ "bitflags 2.6.0", ] +[[package]] +name = "redox_users" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" +dependencies = [ + "getrandom", + "libredox", + "thiserror", +] + [[package]] name = "regex" version = "1.10.6" @@ -3263,6 +3357,57 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "rten" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09c030cdf90e64c5eeeba389ca59da14b0a106b1b8366c15591251bb6a2e777f" +dependencies = [ + "flatbuffers", + "libm", + "num_cpus", + "rayon", + "rten-simd", + "rten-tensor", + "rten-vecmath", + "rustc-hash 1.1.0", + "smallvec", + "wasm-bindgen", +] + +[[package]] +name = "rten-imageproc" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ba61077269b2b2c90445bfd55fb798dcd544b56e7fd78faaea51940b8e429ae" +dependencies = [ + "rten-tensor", +] + +[[package]] +name = "rten-simd" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eb16da64e0d08ce56dc17d8304ab2da541176ee30430c0b0e581a7841a660ae" + +[[package]] +name = "rten-tensor" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f5e53d2e43bb736e89e4ea41b707e024190f8ba47c3eddf5a3c2d022089909" +dependencies = [ + "smallvec", +] + +[[package]] +name = "rten-vecmath" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56eccc46a7e7a2df2cebb7ba95e613a01942a01e0f2f2f7d6122176ab7372e9f" +dependencies = [ + "rten-simd", +] + [[package]] name = "rustc-demangle" version = "0.1.24" diff --git a/Cargo.toml b/Cargo.toml index bd79244..b04ad40 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,16 +15,18 @@ categories = ["command-line-utilities"] description = "Copy & Redact cli tool to securely copy and redact files removing Personal Identifiable Information (PII) across various filesystems." [features] -default = ["pdf-render", "clipboard"] +default = ["pdf-render", "clipboard", "ocr"] ci-gcp = [] # For testing on CI/GCP ci-aws = [] # For testing on CI/AWS ci-ms-presidio = [] # For testing on CI/MS Presidiom ci-gcp-llm = [] # For testing on CI/GCP with LLM models ci-open-ai = [] # For testing on CI/OpenAIP ci-clibpoard = [] # For testing on CI/Clipboard +ci-ocr = [] # For testing on CI/OCR ci = ["ci-gcp", "ci-aws", "ci-ms-presidio", "ci-gcp-llm", "ci-open-ai", "ci-clibpoard"] pdf-render = ["pdfium-render"] clipboard = ["arboard"] +ocr = ["ocrs", "rten", "rten-imageproc"] [dependencies] @@ -43,6 +45,7 @@ sha2 = "0.10" async-trait = "0.1" hex = "0.4" thiserror = "1" +anyhow = "1" sync_wrapper = { version = "1", features = ["futures"] } async-recursion = "1" mime = "0.3" @@ -62,8 +65,13 @@ rand = "0.8" pdfium-render = { version = "0.8", features = ["thread_safe", "image"], optional = true } image = "0.25" bytes = { version = "1" } -arboard = { version = "3", features = ["image"], optional = true } serde_json = "1" +arboard = { version = "3", features = ["image"], optional = true } +ocrs = { version = "0.8", optional = true } +rten = { version = "0.10", optional = true } +rten-imageproc = { version = "0.10", optional = true } +dirs = "5.0.1" + [dev-dependencies] diff --git a/src/commands/copy_command.rs b/src/commands/copy_command.rs index 01186b3..02d12c5 100644 --- a/src/commands/copy_command.rs +++ b/src/commands/copy_command.rs @@ -2,10 +2,7 @@ use crate::errors::AppError; use crate::file_converters::FileConverters; use crate::file_systems::{DetectFileSystem, FileSystemConnection, FileSystemRef}; use crate::file_tools::{FileMatcher, FileMatcherResult, FileMimeOverride}; -use crate::redacters::{ - redact_stream, RedactSupportedOptions, Redacter, RedacterBaseOptions, RedacterOptions, - Redacters, -}; +use crate::redacters::{Redacter, RedacterBaseOptions, RedacterOptions, Redacters, StreamRedacter}; use crate::reporter::AppReporter; use crate::AppResult; use console::{pad_str, Alignment, Style, Term}; @@ -49,48 +46,17 @@ pub async fn command_copy( options: CopyCommandOptions, redacter_options: Option, ) -> AppResult { - let bold_style = Style::new().bold(); - let redacted_output = if let Some(ref options) = redacter_options.as_ref() { - bold_style - .clone() - .green() - .apply_to(format!("✓ Yes ({})", &options)) - } else { - bold_style.clone().red().apply_to("✗ No".to_string()) - }; - let sampling_output = if let Some(ref sampling_size) = redacter_options - .as_ref() - .and_then(|o| o.base_options.sampling_size) - { - Style::new().apply_to(format!("{} bytes.", sampling_size)) - } else { - Style::new().dim().apply_to("-".to_string()) - }; - - let mut file_converters = FileConverters::new(); - file_converters.init().await?; - - let converter_style = Style::new(); - let pdf_support_output = if file_converters.pdf_image_converter.is_some() { - converter_style - .clone() - .green() - .apply_to("✓ Yes".to_string()) - } else { - converter_style.clone().dim().apply_to("✗ No".to_string()) - }; + let term_reporter = AppReporter::from(term); + let file_converters = FileConverters::new().init(&term_reporter).await?; - term.write_line( - format!( - "Copying from {} to {}.\nRedacting: {}.\nSampling: {}\nPDF to image support: {}\n", - bold_style.clone().white().apply_to(source), - bold_style.clone().yellow().apply_to(destination), - redacted_output, - sampling_output, - pdf_support_output, - ) - .as_str(), - )?; + report_copy_info( + term, + source, + destination, + &redacter_options, + &file_converters, + ) + .await?; let bar = ProgressBar::new(1); bar.set_style( @@ -131,6 +97,7 @@ pub async fn command_copy( .iter() .map(|file| file.file_size.unwrap_or(0)) .sum(); + let bold_style = Style::new().bold(); bar.println( format!( "Found {} files. Total size: {}", @@ -196,6 +163,65 @@ pub async fn command_copy( copy_result } +async fn report_copy_info( + term: &Term, + source: &str, + destination: &str, + redacter_options: &Option, + file_converters: &FileConverters<'_>, +) -> AppResult<()> { + let bold_style = Style::new().bold(); + let redacted_output = if let Some(ref options) = redacter_options.as_ref() { + bold_style + .clone() + .green() + .apply_to(format!("✓ Yes ({})", &options)) + } else { + bold_style.clone().red().apply_to("✗ No".to_string()) + }; + let sampling_output = if let Some(ref sampling_size) = redacter_options + .as_ref() + .and_then(|o| o.base_options.sampling_size) + { + Style::new().apply_to(format!("{} bytes.", sampling_size)) + } else { + Style::new().dim().apply_to("-".to_string()) + }; + + let converter_style = Style::new(); + let pdf_support_output = if file_converters.pdf_image_converter.is_some() { + converter_style + .clone() + .green() + .apply_to("✓ Yes".to_string()) + } else { + converter_style.clone().dim().apply_to("✗ No".to_string()) + }; + + let ocr_support_output = if file_converters.ocr.is_some() { + converter_style + .clone() + .green() + .apply_to("✓ Yes".to_string()) + } else { + converter_style.clone().dim().apply_to("✗ No".to_string()) + }; + + term.write_line( + format!( + "Copying from {} to {}.\nRedacting: {}. | Sampling: {} | PDF to image support: {} | OCR support: {}\n", + bold_style.clone().white().apply_to(source), + bold_style.clone().yellow().apply_to(destination), + redacted_output, + sampling_output, + pdf_support_output, + ocr_support_output, + ) + .as_str(), + )?; + Ok(()) +} + enum TransferFileResult { Copied, Skipped, @@ -214,7 +240,7 @@ async fn transfer_and_redact_file< destination_fs: &mut DFS, options: &CopyCommandOptions, redacter: &Option<(RedacterBaseOptions, Vec)>, - file_converters: &FileConverters, + file_converters: &FileConverters<'a>, ) -> AppResult { let bold_style = Style::new().bold().white(); let (base_file_ref, source_reader) = source_fs.download(source_file_ref).await?; @@ -312,31 +338,28 @@ async fn redact_upload_file< dest_file_ref: &FileSystemRef, options: &CopyCommandOptions, redacter_with_options: &(RedacterBaseOptions, Vec), - file_converters: &FileConverters, + file_converters: &FileConverters<'a>, ) -> AppResult { let (redacter_base_options, redacters) = redacter_with_options; - let mut support_redacters = Vec::new(); + let stream_redacter = StreamRedacter::new(redacter_base_options, file_converters, bar); + let dest_file_ref_overridden = options .file_mime_override .override_for_file_ref(dest_file_ref.clone()); - for redacter in redacters { - let redacter_supported_options = redacter - .redact_supported_options(&dest_file_ref_overridden) - .await?; - if redacter_supported_options != RedactSupportedOptions::Unsupported { - support_redacters.push(redacter); - } - } - if !support_redacters.is_empty() { - match redact_stream( - &support_redacters, - redacter_base_options, - source_reader, - &dest_file_ref_overridden, - file_converters, - bar, - ) - .await + + let (redact_plan, supported_redacters) = stream_redacter + .create_redact_plan(redacters, &dest_file_ref_overridden) + .await?; + + if !supported_redacters.is_empty() { + match stream_redacter + .redact_stream( + source_reader, + redact_plan, + &supported_redacters, + &dest_file_ref_overridden, + ) + .await { Ok(redacted_result) if redacted_result.number_of_redactions > 0 diff --git a/src/common_types.rs b/src/common_types.rs index 4e2840b..e529bb2 100644 --- a/src/common_types.rs +++ b/src/common_types.rs @@ -1,7 +1,17 @@ use rvstruct::ValueStruct; +use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, ValueStruct)] pub struct GcpProjectId(String); #[derive(Debug, Clone, ValueStruct)] pub struct AwsAccountId(String); + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TextImageCoords { + pub x1: f32, + pub y1: f32, + pub x2: f32, + pub y2: f32, + pub text: Option, +} diff --git a/src/errors.rs b/src/errors.rs index ce14db1..1cc58c6 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -46,8 +46,19 @@ pub enum AppError { SystemTimeError(#[from] SystemTimeError), #[error("JSON serialization error: {0}")] JsonSerializeError(#[from] serde_json::Error), + #[cfg(feature = "ocr")] + #[error("Model load error: {0}")] + OcrModelLoadError(#[from] rten::ModelLoadError), + #[cfg(feature = "ocr")] + #[error("OCR image error: {0}")] + OcrImageError(#[from] ocrs::ImageSourceError), #[error("System error: {message}")] SystemError { message: String }, + #[error("System error: {message}")] + SystemErrorWithCause { + message: String, + cause: Box, + }, } impl< @@ -67,3 +78,12 @@ impl Self::GoogleCloudRestSdkApiError(Box::new(err)) } } + +impl From for AppError { + fn from(err: anyhow::Error) -> Self { + Self::SystemErrorWithCause { + message: err.to_string(), + cause: Box::new(err), + } + } +} diff --git a/src/file_converters/mod.rs b/src/file_converters/mod.rs index 8b1f178..cb943e8 100644 --- a/src/file_converters/mod.rs +++ b/src/file_converters/mod.rs @@ -1,35 +1,44 @@ +use crate::file_converters::ocr::Ocr; use crate::file_converters::pdf::PdfToImage; +use crate::reporter::AppReporter; use crate::AppResult; +pub mod ocr; pub mod pdf; #[cfg(feature = "pdf-render")] mod pdf_image_converter; -pub struct FileConverters { - pub pdf_image_converter: Option>, +#[cfg(feature = "ocr")] +mod ocr_ocrs; + +pub struct FileConverters<'a> { + pub pdf_image_converter: Option>, + pub ocr: Option>, } -impl FileConverters { +impl<'a> FileConverters<'a> { pub fn new() -> Self { Self { pdf_image_converter: None, + ocr: None, } } - #[cfg(feature = "pdf-render")] - pub async fn init(&mut self) -> AppResult<()> { - match crate::file_converters::pdf_image_converter::PdfImageConverter::new().ok() { - Some(pdf_image_converter) => { + pub async fn init(mut self, app_reporter: &'a AppReporter<'a>) -> AppResult { + #[cfg(feature = "pdf-render")] + { + if let Ok(pdf_image_converter) = pdf_image_converter::PdfImageConverter::new() { self.pdf_image_converter = Some(Box::new(pdf_image_converter)); - Ok(()) } - None => Ok(()), } - } + #[cfg(feature = "ocr")] + { + if let Ok(ocr) = ocr_ocrs::Ocrs::new(app_reporter) { + self.ocr = Some(Box::new(ocr)); + } + } - #[cfg(not(feature = "pdf-render"))] - pub async fn init(&mut self) -> AppResult<()> { - Ok(()) + Ok(self) } } diff --git a/src/file_converters/ocr.rs b/src/file_converters/ocr.rs new file mode 100644 index 0000000..7e65f29 --- /dev/null +++ b/src/file_converters/ocr.rs @@ -0,0 +1,6 @@ +use crate::common_types::TextImageCoords; +use crate::AppResult; + +pub trait Ocr { + fn image_to_text(&self, image: image::DynamicImage) -> AppResult>; +} diff --git a/src/file_converters/ocr_ocrs.rs b/src/file_converters/ocr_ocrs.rs new file mode 100644 index 0000000..f40dab4 --- /dev/null +++ b/src/file_converters/ocr_ocrs.rs @@ -0,0 +1,120 @@ +use crate::common_types::TextImageCoords; +use crate::errors::AppError; +use crate::file_converters::ocr::Ocr; +use crate::reporter::AppReporter; +use crate::AppResult; +use ocrs::{ImageSource, OcrEngine, OcrEngineParams, OcrInput, TextItem}; +use std::path::PathBuf; + +pub struct Ocrs<'a> { + ocr_engine: OcrEngine, + #[allow(dead_code)] + app_reporter: &'a AppReporter<'a>, +} + +impl<'a> Ocrs<'a> { + pub fn new(app_reporter: &'a AppReporter<'a>) -> AppResult { + let find_models_dir = Self::find_models_dir()?; + app_reporter.report(format!( + "Loading OCR models from {}", + find_models_dir.to_string_lossy() + ))?; + let detection_model_path = find_models_dir.join("text-detection.rten"); + let rec_model_path = find_models_dir.join("text-recognition.rten"); + let detection_model = rten::Model::load_file(detection_model_path)?; + let recognition_model = rten::Model::load_file(rec_model_path)?; + let ocr_engine = OcrEngine::new(OcrEngineParams { + detection_model: Some(detection_model), + recognition_model: Some(recognition_model), + ..Default::default() + })?; + Ok(Self { + ocr_engine, + app_reporter, + }) + } + + fn find_models_dir() -> AppResult { + let executable = std::env::current_exe()?; + let current_dir = executable.parent().map(|p| p.to_path_buf()); + + vec![ + current_dir.clone().map(|p| p.join("models").join("ocrs")), + current_dir + .clone() + .and_then(|p| p.parent().map(|p| p.join("share").join("ocrs"))), + dirs::home_dir().map(|p| p.join(".cache").join("ocrs")), + ] + .into_iter() + .collect::>>() + .iter() + .flatten() + .find(|p| p.exists()) + .cloned() + .ok_or_else(|| AppError::SystemError { + message: "Could not find models directory".to_string(), + }) + } +} + +impl<'a> Ocr for Ocrs<'a> { + fn image_to_text(&self, image: image::DynamicImage) -> AppResult> { + let rgb_image = image.to_rgb8(); + let image_source = ImageSource::from_bytes(rgb_image.as_raw(), rgb_image.dimensions())?; + let input: OcrInput = self.ocr_engine.prepare_input(image_source)?; + let word_rects = self.ocr_engine.detect_words(&input)?; + let line_rects = self.ocr_engine.find_text_lines(&input, &word_rects); + let mut text_image_coords = vec![]; + for text_line in self + .ocr_engine + .recognize_text(&input, &line_rects)? + .into_iter() + .flatten() + { + let mut current_word = "".to_string(); + let mut current_word_rect: Option = None; + + for char in text_line.chars() { + match current_word_rect { + None => { + current_word_rect = Some(char.rect); + current_word = char.char.to_string(); + } + Some(ref current_rect) if char.char == ' ' => { + text_image_coords.push(TextImageCoords { + text: Some(current_word.clone()), + x1: current_rect.left() as f32, + y1: current_rect.top() as f32, + x2: current_rect.right() as f32, + y2: current_rect.bottom() as f32, + }); + current_word_rect = None; + } + Some(current_rect) => { + current_word_rect = Some(current_rect.union(char.rect)); + current_word.push(char.char); + } + } + } + } + Ok(text_image_coords) + } +} + +#[allow(unused_imports)] +mod tests { + use super::*; + use console::Term; + + #[test] + #[cfg_attr(not(feature = "ci-ocr"), ignore)] + fn test_recognise_png_file() -> AppResult<()> { + let term = Term::stdout(); + let app_reporter = AppReporter::from(&term); + let ocrs = Ocrs::new(&app_reporter)?; + let image = image::open("test-fixtures/media/form-example.png")?; + let text_image_coords = ocrs.image_to_text(image)?; + assert!(text_image_coords.len() > 10); + Ok(()) + } +} diff --git a/src/file_systems/clipboard.rs b/src/file_systems/clipboard.rs index b87fcb6..6d8b1c0 100644 --- a/src/file_systems/clipboard.rs +++ b/src/file_systems/clipboard.rs @@ -180,7 +180,7 @@ mod tests { let term = Term::stdout(); let reporter: AppReporter = AppReporter::from(&term); - let mut fs = DetectFileSystem::open(&format!("clipboard://"), &reporter).await?; + let mut fs = DetectFileSystem::open("clipboard://", &reporter).await?; let test_content = "Test content"; @@ -214,7 +214,7 @@ mod tests { let term = Term::stdout(); let reporter: AppReporter = AppReporter::from(&term); - let mut fs = DetectFileSystem::open(&format!("clipboard://"), &reporter).await?; + let mut fs = DetectFileSystem::open("clipboard://", &reporter).await?; let test_content: image::RgbaImage = RgbaImage::new(100, 100); let mut writer = std::io::Cursor::new(Vec::new()); diff --git a/src/main.rs b/src/main.rs index 8bd89f5..71abcc6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -82,7 +82,7 @@ async fn handle_args(cli: CliArgs, term: &Term) -> AppResult<()> { .await?; term.write_line( format!( - "{} -> {}\n{} files processed.\n{} files skipped.", + "\n{} -> {}: {} files processed. {} files skipped.", source, destination, Style::new() diff --git a/src/redacters/aws_comprehend.rs b/src/redacters/aws_comprehend.rs index 5e06ef6..dfb511f 100644 --- a/src/redacters/aws_comprehend.rs +++ b/src/redacters/aws_comprehend.rs @@ -96,9 +96,6 @@ impl<'a> Redacter for AwsComprehendRedacter<'a> { Some(media_type) if Redacters::is_mime_text(media_type) => { RedactSupportedOptions::Supported } - Some(media_type) if Redacters::is_mime_table(media_type) => { - RedactSupportedOptions::SupportedAsText - } _ => RedactSupportedOptions::Unsupported, }) } diff --git a/src/redacters/gcp_dlp.rs b/src/redacters/gcp_dlp.rs index dbe0ca9..8e28bd2 100644 --- a/src/redacters/gcp_dlp.rs +++ b/src/redacters/gcp_dlp.rs @@ -281,9 +281,6 @@ impl<'a> Redacter for GcpDlpRedacter<'a> { Some(media_type) if Self::check_supported_image_type(media_type) => { RedactSupportedOptions::Supported } - Some(media_type) if Redacters::is_mime_pdf(media_type) => { - RedactSupportedOptions::SupportedAsImages - } _ => RedactSupportedOptions::Unsupported, }) } diff --git a/src/redacters/gemini_llm.rs b/src/redacters/gemini_llm.rs index a98b590..9ff85c3 100644 --- a/src/redacters/gemini_llm.rs +++ b/src/redacters/gemini_llm.rs @@ -1,9 +1,9 @@ use crate::args::RedacterType; -use crate::common_types::GcpProjectId; +use crate::common_types::{GcpProjectId, TextImageCoords}; use crate::errors::AppError; use crate::file_systems::FileSystemRef; use crate::redacters::{ - redact_image_at_coords, PiiImageCoords, RedactSupportedOptions, Redacter, RedacterDataItem, + redact_image_at_coords, RedactSupportedOptions, Redacter, RedacterDataItem, RedacterDataItemContent, Redacters, }; use crate::reporter::AppReporter; @@ -306,7 +306,7 @@ impl<'a> GeminiLlmRedacter<'a> { ) => acc + text, _ => acc, }); - let pii_image_coords: Vec = + let pii_image_coords: Vec = serde_json::from_str(&content_json)?; Ok(RedacterDataItem { file_ref: input.file_ref, @@ -357,12 +357,6 @@ impl<'a> Redacter for GeminiLlmRedacter<'a> { Some(media_type) if Redacters::is_mime_image(media_type) => { RedactSupportedOptions::Supported } - Some(media_type) if Redacters::is_mime_table(media_type) => { - RedactSupportedOptions::SupportedAsText - } - Some(media_type) if Redacters::is_mime_pdf(media_type) => { - RedactSupportedOptions::SupportedAsImages - } _ => RedactSupportedOptions::Unsupported, }) } diff --git a/src/redacters/mod.rs b/src/redacters/mod.rs index 334adec..5a7e714 100644 --- a/src/redacters/mod.rs +++ b/src/redacters/mod.rs @@ -157,8 +157,6 @@ impl<'a> Redacters<'a> { #[derive(Debug, Clone, PartialEq, Eq)] pub enum RedactSupportedOptions { Supported, - SupportedAsText, - SupportedAsImages, Unsupported, } diff --git a/src/redacters/ms_presidio.rs b/src/redacters/ms_presidio.rs index 467b568..7a59cce 100644 --- a/src/redacters/ms_presidio.rs +++ b/src/redacters/ms_presidio.rs @@ -194,24 +194,12 @@ impl<'a> Redacter for MsPresidioRedacter<'a> { { RedactSupportedOptions::Supported } - Some(media_type) - if Redacters::is_mime_table(media_type) - && self.ms_presidio_options.text_analyze_url.is_some() => - { - RedactSupportedOptions::SupportedAsText - } Some(media_type) if Redacters::is_mime_image(media_type) && self.ms_presidio_options.image_redact_url.is_some() => { RedactSupportedOptions::Supported } - Some(media_type) - if Redacters::is_mime_pdf(media_type) - && self.ms_presidio_options.image_redact_url.is_some() => - { - RedactSupportedOptions::SupportedAsImages - } _ => RedactSupportedOptions::Unsupported, }) } diff --git a/src/redacters/open_ai_llm.rs b/src/redacters/open_ai_llm.rs index c99d422..a120f67 100644 --- a/src/redacters/open_ai_llm.rs +++ b/src/redacters/open_ai_llm.rs @@ -162,9 +162,6 @@ impl<'a> Redacter for OpenAiLlmRedacter<'a> { Some(media_type) if Redacters::is_mime_text(media_type) => { RedactSupportedOptions::Supported } - Some(media_type) if Redacters::is_mime_table(media_type) => { - RedactSupportedOptions::SupportedAsText - } _ => RedactSupportedOptions::Unsupported, }) } diff --git a/src/redacters/simple_image_redacter.rs b/src/redacters/simple_image_redacter.rs index c752bb5..c8c678c 100644 --- a/src/redacters/simple_image_redacter.rs +++ b/src/redacters/simple_image_redacter.rs @@ -1,23 +1,14 @@ +use crate::common_types::TextImageCoords; use crate::errors::AppError; use crate::AppResult; use bytes::Bytes; -use image::ImageFormat; +use image::{ImageFormat, RgbaImage}; use mime::Mime; -use serde::{Deserialize, Serialize}; - -#[derive(Debug, Clone, Deserialize, Serialize)] -pub struct PiiImageCoords { - pub x1: f32, - pub y1: f32, - pub x2: f32, - pub y2: f32, - pub text: Option, -} pub fn redact_image_at_coords( mime: Mime, data: Bytes, - pii_coords: Vec, + pii_coords: Vec, approximation_factor: f32, ) -> AppResult { let image_format = ImageFormat::from_mime_type(&mime).ok_or_else(|| AppError::SystemError { @@ -25,7 +16,18 @@ pub fn redact_image_at_coords( })?; let image = image::load_from_memory_with_format(&data, image_format)?; let mut image = image.to_rgba8(); - for PiiImageCoords { x1, y1, x2, y2, .. } in pii_coords { + redact_rgba_image_at_coords(&mut image, &pii_coords, approximation_factor); + let mut output = std::io::Cursor::new(Vec::new()); + image.write_to(&mut output, image_format)?; + Ok(output.into_inner().into()) +} + +pub fn redact_rgba_image_at_coords( + image: &mut RgbaImage, + pii_coords: &Vec, + approximation_factor: f32, +) { + for TextImageCoords { x1, y1, x2, y2, .. } in pii_coords { for x in ((x1 - x1 * approximation_factor) as u32)..((x2 + x2 * approximation_factor) as u32) { @@ -38,7 +40,4 @@ pub fn redact_image_at_coords( } } } - let mut output = std::io::Cursor::new(Vec::new()); - image.write_to(&mut output, image_format)?; - Ok(output.into_inner().into()) } diff --git a/src/redacters/stream_redacter.rs b/src/redacters/stream_redacter.rs index af5dcd7..c4b7ff8 100644 --- a/src/redacters/stream_redacter.rs +++ b/src/redacters/stream_redacter.rs @@ -1,315 +1,556 @@ use crate::errors::AppError; +use crate::file_converters::ocr::Ocr; use crate::file_converters::pdf::{PdfInfo, PdfPageInfo, PdfToImage}; use crate::file_converters::FileConverters; use crate::file_systems::FileSystemRef; use crate::redacters::{ - RedactSupportedOptions, Redacter, RedacterBaseOptions, RedacterDataItem, - RedacterDataItemContent, Redacters, + redact_rgba_image_at_coords, RedactSupportedOptions, Redacter, RedacterBaseOptions, + RedacterDataItem, RedacterDataItemContent, Redacters, }; use crate::AppResult; use futures::{Stream, TryStreamExt}; use image::ImageFormat; use indicatif::ProgressBar; +use std::collections::HashSet; pub struct RedactStreamResult { pub number_of_redactions: usize, pub stream: Box> + Send + Sync + Unpin + 'static>, } -pub async fn redact_stream< - S: Stream> + Send + Unpin + Sync + 'static, ->( - redacters: &Vec<&impl Redacter>, - redacter_base_options: &RedacterBaseOptions, - input: S, - file_ref: &FileSystemRef, - file_converters: &FileConverters, - bar: &ProgressBar, -) -> AppResult { - let mut redacters_supported_options = Vec::with_capacity(redacters.len()); - for redacter in redacters { - let supported_options = redacter.redact_supported_options(file_ref).await?; - redacters_supported_options.push((*redacter, supported_options)); +pub struct StreamRedacter<'a> { + redacter_base_options: &'a RedacterBaseOptions, + file_converters: &'a FileConverters<'a>, + bar: &'a ProgressBar, +} + +pub struct StreamRedactPlan { + pub apply_pdf_image_converter: bool, + pub apply_ocr: bool, + pub leave_data_table_as_text: bool, +} + +impl<'a> StreamRedacter<'a> { + pub fn new( + redacter_base_options: &'a RedacterBaseOptions, + file_converters: &'a FileConverters<'a>, + bar: &'a ProgressBar, + ) -> Self { + Self { + redacter_base_options, + file_converters, + bar, + } } - let mut redacted = stream_to_redact_item( - redacter_base_options, - input, - file_ref, - &redacters_supported_options, - ) - .await?; - let mut number_of_redactions = 0; + pub async fn create_redact_plan( + &'a self, + redacters: &'a Vec, + file_ref: &FileSystemRef, + ) -> AppResult<(StreamRedactPlan, Vec<&'a impl Redacter>)> { + let mut stream_redact_plan = StreamRedactPlan { + apply_pdf_image_converter: false, + apply_ocr: false, + leave_data_table_as_text: false, + }; + // Supports natively + let mut supported_redacters = Vec::new(); + for redacter in redacters { + let supported_options = redacter.redact_supported_options(file_ref).await?; + if supported_options == RedactSupportedOptions::Supported { + supported_redacters.push(redacter); + } + } - for (index, (redacter, options)) in redacters_supported_options.iter().enumerate() { - let width = " ".repeat(index); - match options { - RedactSupportedOptions::Supported => { - bar.println(format!( - "{width}↳ Redacting using {} redacter", - redacter.redacter_type() - )); - redacted = redacter.redact(redacted).await?; - number_of_redactions += 1; + if supported_redacters.is_empty() { + match &file_ref.media_type { + Some(file_ref_media) => { + // Supports with conversion + if Redacters::is_mime_table(file_ref_media) { + for redacter in redacters { + let supported_options = redacter + .redact_supported_options(&FileSystemRef { + media_type: Some(mime::TEXT_PLAIN), + ..file_ref.clone() + }) + .await?; + if supported_options == RedactSupportedOptions::Supported { + supported_redacters.push(redacter); + } + } + if !supported_redacters.is_empty() { + stream_redact_plan.leave_data_table_as_text = true; + } + } else if self.file_converters.pdf_image_converter.is_some() + && Redacters::is_mime_pdf(file_ref_media) + { + for redacter in redacters { + let supported_options = redacter + .redact_supported_options(&FileSystemRef { + media_type: Some(mime::IMAGE_PNG), + ..file_ref.clone() + }) + .await?; + if supported_options == RedactSupportedOptions::Supported { + supported_redacters.push(redacter); + } + } + + if !supported_redacters.is_empty() { + stream_redact_plan.apply_pdf_image_converter = true; + } + + if supported_redacters.is_empty() && self.file_converters.ocr.is_some() { + for redacter in redacters { + let supported_options = redacter + .redact_supported_options(&FileSystemRef { + media_type: Some(mime::TEXT_PLAIN), + ..file_ref.clone() + }) + .await?; + if supported_options == RedactSupportedOptions::Supported { + supported_redacters.push(redacter); + } + } + if !supported_redacters.is_empty() { + stream_redact_plan.apply_pdf_image_converter = true; + stream_redact_plan.apply_ocr = true; + } + } + } else if self.file_converters.ocr.is_some() + && Redacters::is_mime_image(file_ref_media) + { + for redacter in redacters { + let supported_options = redacter + .redact_supported_options(&FileSystemRef { + media_type: Some(mime::TEXT_PLAIN), + ..file_ref.clone() + }) + .await?; + if supported_options == RedactSupportedOptions::Supported { + supported_redacters.push(redacter); + } + } + if !supported_redacters.is_empty() { + stream_redact_plan.apply_ocr = true; + } + } + } + None => {} } - RedactSupportedOptions::SupportedAsImages => { - match file_converters.pdf_image_converter { - Some(ref converter) => { - redacted = redact_pdf_with_images_converter( - file_ref, - bar, - redacted, - *redacter, - &width, - converter.as_ref(), - ) - .await?; + } + + Ok((stream_redact_plan, supported_redacters)) + } + + pub async fn redact_stream< + S: Stream> + Send + Unpin + Sync + 'static, + >( + &'a self, + input: S, + redact_plan: StreamRedactPlan, + redacters: &[&'a impl Redacter], + file_ref: &FileSystemRef, + ) -> AppResult { + let mut redacted = self + .stream_to_redact_item(self.redacter_base_options, input, file_ref, &redact_plan) + .await?; + let mut number_of_redactions = 0; + + for (index, redacter) in redacters.iter().enumerate() { + let width = " ".repeat(index); + if redact_plan.apply_pdf_image_converter { + match ( + &self.file_converters.pdf_image_converter, + &self.file_converters.ocr, + ) { + (Some(ref pdf_to_image), _) if !redact_plan.apply_ocr => { + redacted = self + .redact_pdf_with_images_converter( + file_ref, + redacted, + *redacter, + &width, + pdf_to_image.as_ref(), + None, + ) + .await?; number_of_redactions += 1; } - None => { - bar.println(format!( + (Some(ref pdf_to_image), Some(ref ocr)) => { + redacted = self + .redact_pdf_with_images_converter( + file_ref, + redacted, + *redacter, + &width, + pdf_to_image.as_ref(), + Some(ocr.as_ref()), + ) + .await?; + number_of_redactions += 1; + } + (None, Some(_)) => { + self.bar.println(format!( "{width}↲ Skipping redaction because PDF to image converter is not available", )); } + (Some(_), None) => { + self.bar.println(format!( + "{width}↲ Skipping redaction because OCR is not available", + )); + } + (None, None) => { + self.bar.println(format!( + "{width}↲ Skipping redaction because PDF/OCR are not available", + )); + } } - } - RedactSupportedOptions::SupportedAsText => { - if matches!(redacted.content, RedacterDataItemContent::Value(_)) { - bar.println(format!( - "{width}↳ Redacting as text using {} redacter", - redacter.redacter_type() - )); - redacted = redacter.redact(redacted).await?; - number_of_redactions += 1; + } else if redact_plan.apply_ocr { + match self.file_converters.ocr { + Some(ref ocr) => { + redacted = self + .redact_with_ocr_converter( + file_ref, + redacted, + *redacter, + &width, + ocr.as_ref(), + ) + .await?; + number_of_redactions += 1; + } + None => { + self.bar.println(format!( + "{width}↲ Skipping redaction because OCR is not available", + )); + } } + } else { + self.bar.println(format!( + "{width}↳ Redacting using {} redacter", + redacter.redacter_type() + )); + redacted = redacter.redact(redacted).await?; + number_of_redactions += 1; } - RedactSupportedOptions::Unsupported => {} } - } - let output_stream = match redacted.content { - RedacterDataItemContent::Value(content) => { - let bytes = bytes::Bytes::from(content.into_bytes()); - Box::new(futures::stream::iter(vec![Ok(bytes)])) - } - RedacterDataItemContent::Image { data, .. } => { - Box::new(futures::stream::iter(vec![Ok(data)])) - } - RedacterDataItemContent::Pdf { data } => Box::new(futures::stream::iter(vec![Ok(data)])), - RedacterDataItemContent::Table { headers, rows } => { - let mut writer = csv_async::AsyncWriter::from_writer(vec![]); - writer.write_record(headers).await?; - for row in rows { - writer.write_record(row).await?; + let output_stream = match redacted.content { + RedacterDataItemContent::Value(content) => { + let bytes = bytes::Bytes::from(content.into_bytes()); + Box::new(futures::stream::iter(vec![Ok(bytes)])) } - writer.flush().await?; - let bytes = bytes::Bytes::from(writer.into_inner().await?); - Box::new(futures::stream::iter(vec![Ok(bytes)])) - } - }; + RedacterDataItemContent::Image { data, .. } => { + Box::new(futures::stream::iter(vec![Ok(data)])) + } + RedacterDataItemContent::Pdf { data } => { + Box::new(futures::stream::iter(vec![Ok(data)])) + } + RedacterDataItemContent::Table { headers, rows } => { + let mut writer = csv_async::AsyncWriter::from_writer(vec![]); + writer.write_record(headers).await?; + for row in rows { + writer.write_record(row).await?; + } + writer.flush().await?; + let bytes = bytes::Bytes::from(writer.into_inner().await?); + Box::new(futures::stream::iter(vec![Ok(bytes)])) + } + }; - Ok(RedactStreamResult { - number_of_redactions, - stream: output_stream, - }) -} + Ok(RedactStreamResult { + number_of_redactions, + stream: output_stream, + }) + } -async fn stream_to_redact_item< - S: Stream> + Send + Unpin + Sync + 'static, ->( - redacter_base_options: &RedacterBaseOptions, - input: S, - file_ref: &FileSystemRef, - redacters_supported_options: &[(&impl Redacter, RedactSupportedOptions)], -) -> AppResult { - match file_ref.media_type { - Some(ref mime) - if Redacters::is_mime_text(mime) - || (Redacters::is_mime_table(mime) - && redacters_supported_options - .iter() - .any(|(_, o)| matches!(o, RedactSupportedOptions::SupportedAsText)) - && !redacters_supported_options - .iter() - .all(|(_, o)| matches!(o, RedactSupportedOptions::Supported))) => - { - stream_to_text_redact_item(redacter_base_options, input, file_ref).await - } - Some(ref mime) if Redacters::is_mime_image(mime) => { - stream_to_image_redact_item(input, file_ref, mime.clone()).await - } - Some(ref mime) if Redacters::is_mime_table(mime) => { - stream_to_table_redact_item(redacter_base_options, input, file_ref).await - } - Some(ref mime) if Redacters::is_mime_pdf(mime) => { - stream_to_pdf_redact_item(input, file_ref).await + async fn stream_to_redact_item< + S: Stream> + Send + Unpin + Sync + 'static, + >( + &'a self, + redacter_base_options: &RedacterBaseOptions, + input: S, + file_ref: &FileSystemRef, + redact_plan: &StreamRedactPlan, + ) -> AppResult { + match file_ref.media_type { + Some(ref mime) + if Redacters::is_mime_text(mime) + || (Redacters::is_mime_table(mime) && redact_plan.leave_data_table_as_text) => + { + self.stream_to_text_redact_item(input, file_ref).await + } + Some(ref mime) if Redacters::is_mime_image(mime) => { + self.stream_to_image_redact_item(input, file_ref, mime.clone()) + .await + } + Some(ref mime) if Redacters::is_mime_table(mime) => { + self.stream_to_table_redact_item(redacter_base_options, input, file_ref) + .await + } + Some(ref mime) if Redacters::is_mime_pdf(mime) => { + self.stream_to_pdf_redact_item(input, file_ref).await + } + Some(ref mime) => Err(AppError::SystemError { + message: format!("Media type {} is not supported for redaction", mime), + }), + None => Err(AppError::SystemError { + message: "Media type is not provided to redact".to_string(), + }), } - Some(ref mime) => Err(AppError::SystemError { - message: format!("Media type {} is not supported for redaction", mime), - }), - None => Err(AppError::SystemError { - message: "Media type is not provided to redact".to_string(), - }), } -} -async fn stream_to_text_redact_item< - S: Stream> + Send + Unpin + Sync + 'static, ->( - redacter_base_options: &RedacterBaseOptions, - input: S, - file_ref: &FileSystemRef, -) -> AppResult { - let all_chunks: Vec = input.try_collect().await?; - let all_bytes = all_chunks.concat(); - let whole_content = String::from_utf8(all_bytes).map_err(|e| AppError::SystemError { - message: format!("Failed to convert bytes to string: {}", e), - })?; - let content = if let Some(sampling_size) = redacter_base_options.sampling_size { - let sampling_size = std::cmp::min(sampling_size, whole_content.len()); - whole_content - .chars() - .take(sampling_size) - .collect::() - } else { - whole_content - }; - Ok(RedacterDataItem { - content: RedacterDataItemContent::Value(content), - file_ref: file_ref.clone(), - }) -} + async fn stream_to_text_redact_item< + S: Stream> + Send + Unpin + Sync + 'static, + >( + &'a self, + input: S, + file_ref: &FileSystemRef, + ) -> AppResult { + let all_chunks: Vec = input.try_collect().await?; + let all_bytes = all_chunks.concat(); + let whole_content = String::from_utf8(all_bytes).map_err(|e| AppError::SystemError { + message: format!("Failed to convert bytes to string: {}", e), + })?; + let content = if let Some(sampling_size) = self.redacter_base_options.sampling_size { + let sampling_size = std::cmp::min(sampling_size, whole_content.len()); + whole_content + .chars() + .take(sampling_size) + .collect::() + } else { + whole_content + }; + Ok(RedacterDataItem { + content: RedacterDataItemContent::Value(content), + file_ref: file_ref.clone(), + }) + } -async fn stream_to_table_redact_item< - S: Stream> + Send + Unpin + Sync + 'static, ->( - redacter_base_options: &RedacterBaseOptions, - input: S, - file_ref: &FileSystemRef, -) -> AppResult { - let reader = tokio_util::io::StreamReader::new( - input.map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err)), - ); - let mut reader = csv_async::AsyncReaderBuilder::default() - .has_headers(!redacter_base_options.csv_headers_disable) - .delimiter( - redacter_base_options - .csv_delimiter - .as_ref() - .cloned() - .unwrap_or(b','), - ) - .create_reader(reader); - let headers = if !redacter_base_options.csv_headers_disable { - reader - .headers() - .await? - .into_iter() - .map(|h| h.to_string()) - .collect() - } else { - vec![] - }; - let records: Vec = reader.records().try_collect().await?; - Ok(RedacterDataItem { - content: RedacterDataItemContent::Table { - headers, - rows: records - .iter() - .map(|r| r.iter().map(|c| c.to_string()).collect()) - .collect(), - }, - file_ref: file_ref.clone(), - }) -} + async fn stream_to_table_redact_item< + S: Stream> + Send + Unpin + Sync + 'static, + >( + &'a self, + redacter_base_options: &RedacterBaseOptions, + input: S, + file_ref: &FileSystemRef, + ) -> AppResult { + let reader = tokio_util::io::StreamReader::new( + input.map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err)), + ); + let mut reader = csv_async::AsyncReaderBuilder::default() + .has_headers(!redacter_base_options.csv_headers_disable) + .delimiter( + redacter_base_options + .csv_delimiter + .as_ref() + .cloned() + .unwrap_or(b','), + ) + .create_reader(reader); + let headers = if !redacter_base_options.csv_headers_disable { + reader + .headers() + .await? + .into_iter() + .map(|h| h.to_string()) + .collect() + } else { + vec![] + }; + let records: Vec = reader.records().try_collect().await?; + Ok(RedacterDataItem { + content: RedacterDataItemContent::Table { + headers, + rows: records + .iter() + .map(|r| r.iter().map(|c| c.to_string()).collect()) + .collect(), + }, + file_ref: file_ref.clone(), + }) + } -async fn stream_to_image_redact_item< - S: Stream> + Send + Unpin + Sync + 'static, ->( - input: S, - file_ref: &FileSystemRef, - mime: mime::Mime, -) -> AppResult { - let all_chunks: Vec = input.try_collect().await?; - let all_bytes = all_chunks.concat(); - Ok(RedacterDataItem { - content: RedacterDataItemContent::Image { - mime_type: mime.clone(), - data: all_bytes.into(), - }, - file_ref: file_ref.clone(), - }) -} + async fn stream_to_image_redact_item< + S: Stream> + Send + Unpin + Sync + 'static, + >( + &'a self, + input: S, + file_ref: &FileSystemRef, + mime: mime::Mime, + ) -> AppResult { + let all_chunks: Vec = input.try_collect().await?; + let all_bytes = all_chunks.concat(); + Ok(RedacterDataItem { + content: RedacterDataItemContent::Image { + mime_type: mime.clone(), + data: all_bytes.into(), + }, + file_ref: file_ref.clone(), + }) + } -async fn stream_to_pdf_redact_item< - S: Stream> + Send + Unpin + Sync + 'static, ->( - input: S, - file_ref: &FileSystemRef, -) -> AppResult { - let all_chunks: Vec = input.try_collect().await?; - let all_bytes = all_chunks.concat(); - Ok(RedacterDataItem { - content: RedacterDataItemContent::Pdf { - data: all_bytes.into(), - }, - file_ref: file_ref.clone(), - }) -} + async fn stream_to_pdf_redact_item< + S: Stream> + Send + Unpin + Sync + 'static, + >( + &'a self, + input: S, + file_ref: &FileSystemRef, + ) -> AppResult { + let all_chunks: Vec = input.try_collect().await?; + let all_bytes = all_chunks.concat(); + Ok(RedacterDataItem { + content: RedacterDataItemContent::Pdf { + data: all_bytes.into(), + }, + file_ref: file_ref.clone(), + }) + } -async fn redact_pdf_with_images_converter( - file_ref: &FileSystemRef, - bar: &ProgressBar, - redacted: RedacterDataItem, - redacter: &impl Redacter, - width: &String, - converter: &dyn PdfToImage, -) -> Result { - match redacted.content { - RedacterDataItemContent::Pdf { data } => { - bar.println(format!( - "{width}↳ Redacting using {} redacter and converting the PDF to images", - redacter.redacter_type() - )); - let pdf_info = converter.convert_to_images(data)?; - bar.println(format!( - "{width} ↳ Converting {pdf_info_pages} images", - pdf_info_pages = pdf_info.pages.len() - )); - let mut redacted_pages = Vec::with_capacity(pdf_info.pages.len()); - for page in pdf_info.pages { - let mut png_image_bytes = std::io::Cursor::new(Vec::new()); - page.page_as_images - .write_to(&mut png_image_bytes, ImageFormat::Png)?; - let image_to_redact = RedacterDataItem { - content: RedacterDataItemContent::Image { - mime_type: mime::IMAGE_PNG, - data: png_image_bytes.into_inner().into(), + async fn redact_pdf_with_images_converter( + &'a self, + file_ref: &FileSystemRef, + redacted: RedacterDataItem, + redacter: &impl Redacter, + width: &String, + converter: &dyn PdfToImage, + ocr: Option<&dyn Ocr>, + ) -> Result { + match redacted.content { + RedacterDataItemContent::Pdf { data } => { + self.bar.println(format!( + "{width}↳ Redacting using {} redacter and converting the PDF to images", + redacter.redacter_type() + )); + let pdf_info = converter.convert_to_images(data)?; + self.bar.println(format!( + "{width} ↳ Converting {pdf_info_pages} images", + pdf_info_pages = pdf_info.pages.len() + )); + let mut redacted_pages = Vec::with_capacity(pdf_info.pages.len()); + for page in pdf_info.pages { + let mut png_image_bytes = std::io::Cursor::new(Vec::new()); + page.page_as_images + .write_to(&mut png_image_bytes, ImageFormat::Png)?; + let image_to_redact = RedacterDataItem { + content: RedacterDataItemContent::Image { + mime_type: mime::IMAGE_PNG, + data: png_image_bytes.into_inner().into(), + }, + file_ref: file_ref.clone(), + }; + let redacted_image = if let Some(ocr_engine) = ocr { + self.redact_with_ocr_converter( + file_ref, + image_to_redact, + redacter, + &format!(" {}", width), + ocr_engine, + ) + .await? + } else { + redacter.redact(image_to_redact).await? + }; + if let RedacterDataItemContent::Image { data, .. } = redacted_image.content { + redacted_pages.push(PdfPageInfo { + page_as_images: image::load_from_memory_with_format( + &data, + ImageFormat::Png, + )?, + ..page + }); + } + } + let redacted_pdf_info = PdfInfo { + pages: redacted_pages, + }; + let redact_pdf_as_images = converter.images_to_pdf(redacted_pdf_info)?; + Ok(RedacterDataItem { + content: RedacterDataItemContent::Pdf { + data: redact_pdf_as_images, }, file_ref: file_ref.clone(), - }; - let redacted_image = redacter.redact(image_to_redact).await?; - if let RedacterDataItemContent::Image { data, .. } = redacted_image.content { - redacted_pages.push(PdfPageInfo { - page_as_images: image::load_from_memory_with_format( - &data, - ImageFormat::Png, - )?, - ..page - }); + }) + } + _ => Ok(redacted), + } + } + + async fn redact_with_ocr_converter( + &'a self, + file_ref: &FileSystemRef, + redacted: RedacterDataItem, + redacter: &impl Redacter, + width: &String, + ocr: &dyn Ocr, + ) -> Result { + match &redacted.content { + RedacterDataItemContent::Image { data, mime_type } => { + match ImageFormat::from_mime_type(mime_type) { + Some(image_format) => { + self.bar.println(format!( + "{width}↳ Redacting using {} redacter and converting the image to text using OCR engine", + redacter.redacter_type() + )); + let image = image::load_from_memory_with_format(data, image_format)?; + let text_coords = ocr.image_to_text(image.clone())?; + let text = text_coords + .iter() + .map(|coord| coord.text.clone()) + .collect::>>() + .into_iter() + .flatten() + .collect::>() + .join(" "); + + let redacted_text = redacter + .redact(RedacterDataItem { + content: RedacterDataItemContent::Value(text), + file_ref: file_ref.clone(), + }) + .await?; + + match redacted_text.content { + RedacterDataItemContent::Value(content) => { + let words_set: HashSet<&str> = + HashSet::from_iter(content.split(" ").collect::>()); + let mut redacted_image = image.to_rgba8(); + for text_coord in text_coords { + if let Some(text) = &text_coord.text { + if !words_set.contains(text.as_str()) { + redact_rgba_image_at_coords( + &mut redacted_image, + &vec![text_coord], + 0.10, + ); + } + } + } + let mut output = std::io::Cursor::new(Vec::new()); + redacted_image.write_to(&mut output, image_format)?; + Ok(RedacterDataItem { + file_ref: file_ref.clone(), + content: RedacterDataItemContent::Image { + mime_type: mime_type.clone(), + data: output.into_inner().into(), + }, + }) + } + _ => Err(AppError::SystemError { + message: "Redacted text is not returned as text".to_string(), + }), + } + } + None => { + self.bar.println(format!( + "{width}↲ Skipping redaction through OCR because image format is not supported", + )); + Ok(redacted) + } } } - let redacted_pdf_info = PdfInfo { - pages: redacted_pages, - }; - let redact_pdf_as_images = converter.images_to_pdf(redacted_pdf_info)?; - Ok(RedacterDataItem { - content: RedacterDataItemContent::Pdf { - data: redact_pdf_as_images, - }, - file_ref: file_ref.clone(), - }) + _ => Ok(redacted), } - _ => Ok(redacted), } } diff --git a/test-fixtures/media/form-example.png b/test-fixtures/media/form-example.png new file mode 100644 index 0000000..1473181 --- /dev/null +++ b/test-fixtures/media/form-example.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bc295b7a4fcf808235013540800bc7386253a94741ab54cf3013792dd5ad742 +size 308951