diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..503b5ad --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.rten filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index b0e73dd..b8c3af6 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ *.swp tmp/ lib/ +models/ diff --git a/Cargo.lock b/Cargo.lock index 65a01a3..5038b9e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1335,6 +1335,27 @@ dependencies = [ "subtle", ] +[[package]] +name = "dirs" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.48.0", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -2270,6 +2291,16 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +[[package]] +name = "libredox" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +dependencies = [ + "bitflags 2.6.0", + "libc", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -2655,6 +2686,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "outref" version = 
"0.5.1" @@ -3113,6 +3150,7 @@ dependencies = [ name = "redacter" version = "0.9.0" dependencies = [ + "anyhow", "arboard", "async-recursion", "async-trait", @@ -3125,6 +3163,7 @@ dependencies = [ "clap", "console", "csv-async", + "dirs", "futures", "gcloud-sdk", "globset", @@ -3138,6 +3177,8 @@ dependencies = [ "rand", "reqwest", "rsb_derive", + "rten", + "rten-imageproc", "rvstruct", "serde", "serde_json", @@ -3162,6 +3203,17 @@ dependencies = [ "bitflags 2.6.0", ] +[[package]] +name = "redox_users" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" +dependencies = [ + "getrandom", + "libredox", + "thiserror", +] + [[package]] name = "regex" version = "1.10.6" diff --git a/Cargo.toml b/Cargo.toml index 907c419..b04ad40 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,16 +15,18 @@ categories = ["command-line-utilities"] description = "Copy & Redact cli tool to securely copy and redact files removing Personal Identifiable Information (PII) across various filesystems." 
[features] -default = ["pdf-render", "clipboard"] +default = ["pdf-render", "clipboard", "ocr"] ci-gcp = [] # For testing on CI/GCP ci-aws = [] # For testing on CI/AWS ci-ms-presidio = [] # For testing on CI/MS Presidiom ci-gcp-llm = [] # For testing on CI/GCP with LLM models ci-open-ai = [] # For testing on CI/OpenAIP ci-clibpoard = [] # For testing on CI/Clipboard +ci-ocr = [] # For testing on CI/OCR ci = ["ci-gcp", "ci-aws", "ci-ms-presidio", "ci-gcp-llm", "ci-open-ai", "ci-clibpoard"] pdf-render = ["pdfium-render"] clipboard = ["arboard"] +ocr = ["ocrs", "rten", "rten-imageproc"] [dependencies] @@ -43,6 +45,7 @@ sha2 = "0.10" async-trait = "0.1" hex = "0.4" thiserror = "1" +anyhow = "1" sync_wrapper = { version = "1", features = ["futures"] } async-recursion = "1" mime = "0.3" @@ -65,6 +68,9 @@ bytes = { version = "1" } serde_json = "1" arboard = { version = "3", features = ["image"], optional = true } ocrs = { version = "0.8", optional = true } +rten = { version = "0.10", optional = true } +rten-imageproc = { version = "0.10", optional = true } +dirs = "5.0.1" diff --git a/src/commands/copy_command.rs b/src/commands/copy_command.rs index 01186b3..da20fe3 100644 --- a/src/commands/copy_command.rs +++ b/src/commands/copy_command.rs @@ -49,48 +49,17 @@ pub async fn command_copy( options: CopyCommandOptions, redacter_options: Option, ) -> AppResult { - let bold_style = Style::new().bold(); - let redacted_output = if let Some(ref options) = redacter_options.as_ref() { - bold_style - .clone() - .green() - .apply_to(format!("✓ Yes ({})", &options)) - } else { - bold_style.clone().red().apply_to("✗ No".to_string()) - }; - let sampling_output = if let Some(ref sampling_size) = redacter_options - .as_ref() - .and_then(|o| o.base_options.sampling_size) - { - Style::new().apply_to(format!("{} bytes.", sampling_size)) - } else { - Style::new().dim().apply_to("-".to_string()) - }; + let term_reporter = AppReporter::from(term); + let file_converters = 
FileConverters::new().init(&term_reporter).await?; - let mut file_converters = FileConverters::new(); - file_converters.init().await?; - - let converter_style = Style::new(); - let pdf_support_output = if file_converters.pdf_image_converter.is_some() { - converter_style - .clone() - .green() - .apply_to("✓ Yes".to_string()) - } else { - converter_style.clone().dim().apply_to("✗ No".to_string()) - }; - - term.write_line( - format!( - "Copying from {} to {}.\nRedacting: {}.\nSampling: {}\nPDF to image support: {}\n", - bold_style.clone().white().apply_to(source), - bold_style.clone().yellow().apply_to(destination), - redacted_output, - sampling_output, - pdf_support_output, - ) - .as_str(), - )?; + report_copy_info( + term, + source, + destination, + &redacter_options, + &file_converters, + ) + .await?; let bar = ProgressBar::new(1); bar.set_style( @@ -131,6 +100,7 @@ pub async fn command_copy( .iter() .map(|file| file.file_size.unwrap_or(0)) .sum(); + let bold_style = Style::new().bold(); bar.println( format!( "Found {} files. 
Total size: {}", @@ -196,6 +166,65 @@ pub async fn command_copy( copy_result } +async fn report_copy_info( + term: &Term, + source: &str, + destination: &str, + redacter_options: &Option, + file_converters: &FileConverters<'_>, +) -> AppResult<()> { + let bold_style = Style::new().bold(); + let redacted_output = if let Some(ref options) = redacter_options.as_ref() { + bold_style + .clone() + .green() + .apply_to(format!("✓ Yes ({})", &options)) + } else { + bold_style.clone().red().apply_to("✗ No".to_string()) + }; + let sampling_output = if let Some(ref sampling_size) = redacter_options + .as_ref() + .and_then(|o| o.base_options.sampling_size) + { + Style::new().apply_to(format!("{} bytes.", sampling_size)) + } else { + Style::new().dim().apply_to("-".to_string()) + }; + + let converter_style = Style::new(); + let pdf_support_output = if file_converters.pdf_image_converter.is_some() { + converter_style + .clone() + .green() + .apply_to("✓ Yes".to_string()) + } else { + converter_style.clone().dim().apply_to("✗ No".to_string()) + }; + + let ocr_support_output = if file_converters.ocr.is_some() { + converter_style + .clone() + .green() + .apply_to("✓ Yes".to_string()) + } else { + converter_style.clone().dim().apply_to("✗ No".to_string()) + }; + + term.write_line( + format!( + "Copying from {} to {}.\nRedacting: {}. 
| Sampling: {} | PDF to image support: {} | OCR support: {}\n", + bold_style.clone().white().apply_to(source), + bold_style.clone().yellow().apply_to(destination), + redacted_output, + sampling_output, + pdf_support_output, + ocr_support_output, + ) + .as_str(), + )?; + Ok(()) +} + enum TransferFileResult { Copied, Skipped, @@ -214,7 +243,7 @@ async fn transfer_and_redact_file< destination_fs: &mut DFS, options: &CopyCommandOptions, redacter: &Option<(RedacterBaseOptions, Vec)>, - file_converters: &FileConverters, + file_converters: &FileConverters<'a>, ) -> AppResult { let bold_style = Style::new().bold().white(); let (base_file_ref, source_reader) = source_fs.download(source_file_ref).await?; @@ -312,7 +341,7 @@ async fn redact_upload_file< dest_file_ref: &FileSystemRef, options: &CopyCommandOptions, redacter_with_options: &(RedacterBaseOptions, Vec), - file_converters: &FileConverters, + file_converters: &FileConverters<'a>, ) -> AppResult { let (redacter_base_options, redacters) = redacter_with_options; let mut support_redacters = Vec::new(); diff --git a/src/errors.rs b/src/errors.rs index ce14db1..1cc58c6 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -46,8 +46,19 @@ pub enum AppError { SystemTimeError(#[from] SystemTimeError), #[error("JSON serialization error: {0}")] JsonSerializeError(#[from] serde_json::Error), + #[cfg(feature = "ocr")] + #[error("Model load error: {0}")] + OcrModelLoadError(#[from] rten::ModelLoadError), + #[cfg(feature = "ocr")] + #[error("OCR image error: {0}")] + OcrImageError(#[from] ocrs::ImageSourceError), #[error("System error: {message}")] SystemError { message: String }, + #[error("System error: {message}")] + SystemErrorWithCause { + message: String, + cause: Box, + }, } impl< @@ -67,3 +78,12 @@ impl Self::GoogleCloudRestSdkApiError(Box::new(err)) } } + +impl From for AppError { + fn from(err: anyhow::Error) -> Self { + Self::SystemErrorWithCause { + message: err.to_string(), + cause: Box::new(err), + } + } +} diff --git 
a/src/file_converters/mod.rs b/src/file_converters/mod.rs
index be2607f..bf25847 100644
--- a/src/file_converters/mod.rs
+++ b/src/file_converters/mod.rs
@@ -1,5 +1,6 @@
 use crate::file_converters::ocr::Ocr;
 use crate::file_converters::pdf::PdfToImage;
+use crate::reporter::AppReporter;
 use crate::AppResult;
 
 pub mod ocr;
@@ -8,12 +9,16 @@ pub mod pdf;
 #[cfg(feature = "pdf-render")]
 mod pdf_image_converter;
 
-pub struct FileConverters {
-    pub pdf_image_converter: Option<Box<dyn PdfToImage>>,
-    pub ocr: Option<Box<dyn Ocr>>,
+#[cfg(feature = "ocr")]
+mod ocr_ocrs;
+
+/// Optional file converters; `init` fills in those that are compiled in and load.
+pub struct FileConverters<'a> {
+    pub pdf_image_converter: Option<Box<dyn PdfToImage + 'a>>,
+    pub ocr: Option<Box<dyn Ocr + 'a>>,
 }
 
-impl FileConverters {
+impl<'a> FileConverters<'a> {
     pub fn new() -> Self {
         Self {
             pdf_image_converter: None,
@@ -21,19 +26,23 @@ impl FileConverters {
         }
     }
 
-    #[cfg(feature = "pdf-render")]
-    pub async fn init(&mut self) -> AppResult<()> {
-        match crate::file_converters::pdf_image_converter::PdfImageConverter::new().ok() {
-            Some(pdf_image_converter) => {
+    /// Enables each converter whose cargo feature is compiled in and whose backend
+    /// initialises successfully; a backend that fails to initialise is left as
+    /// `None` instead of failing the whole call.
+    pub async fn init(mut self, app_reporter: &'a AppReporter<'a>) -> AppResult<Self> {
+        #[cfg(feature = "pdf-render")]
+        {
+            if let Ok(pdf_image_converter) = pdf_image_converter::PdfImageConverter::new() {
                 self.pdf_image_converter = Some(Box::new(pdf_image_converter));
-                Ok(())
             }
-            None => Ok(()),
         }
-    }
+        #[cfg(feature = "ocr")]
+        {
+            if let Ok(ocr) = ocr_ocrs::Ocrs::new(app_reporter) {
+                self.ocr = Some(Box::new(ocr));
+            }
+        }
 
-    #[cfg(not(feature = "pdf-render"))]
-    pub async fn init(&mut self) -> AppResult<()> {
-        Ok(())
+        Ok(self)
     }
 }
diff --git a/src/file_converters/ocr_ocrs.rs b/src/file_converters/ocr_ocrs.rs
new file mode 100644
index 0000000..617bd19
--- /dev/null
+++ b/src/file_converters/ocr_ocrs.rs
@@ -0,0 +1,113 @@
+use crate::common_types::TextImageCoords;
+use crate::errors::AppError;
+use crate::file_converters::ocr::Ocr;
+use crate::reporter::AppReporter;
+use crate::AppResult;
+use ocrs::{ImageSource, OcrEngine, OcrEngineParams, OcrInput, TextItem};
+use std::path::PathBuf;
+
+/// OCR backend that runs the `ocrs` engine on `rten` models loaded from disk.
+pub struct Ocrs<'a> {
+    ocr_engine: OcrEngine,
+    app_reporter: &'a AppReporter<'a>,
+}
+
+impl<'a> Ocrs<'a> {
+    /// Finds the models directory, reports it via `app_reporter`, and builds the
+    /// engine from `text-detection.rten` and `text-recognition.rten` found there.
+    ///
+    /// # Errors
+    /// Returns an error when no models directory exists, a model cannot be
+    /// loaded, or the engine cannot be created.
+    pub fn new(app_reporter: &'a AppReporter<'a>) -> AppResult<Self> {
+        let find_models_dir = Self::find_models_dir()?;
+        app_reporter.report(format!(
+            "Loading OCR models from {}",
+            find_models_dir.to_string_lossy()
+        ))?;
+        let detection_model_path = find_models_dir.join("text-detection.rten");
+        let rec_model_path = find_models_dir.join("text-recognition.rten");
+        let detection_model = rten::Model::load_file(detection_model_path)?;
+        let recognition_model = rten::Model::load_file(rec_model_path)?;
+        let ocr_engine = OcrEngine::new(OcrEngineParams {
+            detection_model: Some(detection_model),
+            recognition_model: Some(recognition_model),
+            ..Default::default()
+        })?;
+        Ok(Self {
+            ocr_engine,
+            app_reporter,
+        })
+    }
+
+    /// Returns the first existing candidate, probed in order:
+    /// `<exe dir>/models/ocrs`, `<exe dir>/../share/ocrs`, `<home>/.cache/ocrs`.
+    fn find_models_dir() -> AppResult<PathBuf> {
+        let executable = std::env::current_exe()?;
+        let current_dir = executable.parent();
+
+        vec![
+            current_dir.map(|p| p.join("models").join("ocrs")),
+            current_dir.and_then(|p| p.parent().map(|p| p.join("share").join("ocrs"))),
+            dirs::home_dir().map(|p| p.join(".cache").join("ocrs")),
+        ]
+        .into_iter()
+        .flatten()
+        .find(|p| p.exists())
+        .ok_or_else(|| AppError::SystemError {
+            message: "Could not find models directory".to_string(),
+        })
+    }
+}
+
+impl<'a> Ocr for Ocrs<'a> {
+    /// Recognises text in `image` and returns one entry per word, each carrying
+    /// the union of the rectangles of the word's characters.
+    ///
+    /// Every recognised line is split on spaces; runs of spaces yield no entry
+    /// and the last word of a line is emitted like any other.
+    fn image_to_text(&self, image: image::DynamicImage) -> AppResult<Vec<TextImageCoords>> {
+        let rgb_image = image.to_rgb8();
+        let image_source = ImageSource::from_bytes(rgb_image.as_raw(), rgb_image.dimensions())?;
+        let input: OcrInput = self.ocr_engine.prepare_input(image_source)?;
+        let word_rects = self.ocr_engine.detect_words(&input)?;
+        let line_rects = self.ocr_engine.find_text_lines(&input, &word_rects);
+        let recognised_lines = self.ocr_engine.recognize_text(&input, &line_rects)?;
+        let mut text_image_coords = vec![];
+        for text_line in recognised_lines.into_iter().flatten() {
+            for word in text_line.chars().split(|c| c.char == ' ') {
+                // An empty slice (leading, trailing or repeated space) has no
+                // rectangle to reduce, so it is skipped here.
+                let word_rect = word.iter().map(|c| c.rect).reduce(|acc, r| acc.union(r));
+                if let Some(rect) = word_rect {
+                    text_image_coords.push(TextImageCoords {
+                        text: Some(word.iter().map(|c| c.char).collect()),
+                        x1: rect.left() as f32,
+                        y1: rect.top() as f32,
+                        x2: rect.right() as f32,
+                        y2: rect.bottom() as f32,
+                    });
+                }
+            }
+        }
+        Ok(text_image_coords)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use console::Term;
+
+    #[test]
+    #[cfg_attr(not(feature = "ci-ocr"), ignore)]
+    fn test_recognise_png_file() -> AppResult<()> {
+        let term = Term::stdout();
+        let app_reporter = AppReporter::from(&term);
+        let ocrs = Ocrs::new(&app_reporter)?;
+        let image = image::open("test-fixtures/media/form-example.png")?;
+        let text_image_coords = ocrs.image_to_text(image)?;
+        assert!(text_image_coords.len() > 10);
+        Ok(())
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 8bd89f5..71abcc6 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -82,7 +82,7 @@ async fn handle_args(cli: CliArgs, term: &Term) -> AppResult<()> {
             .await?;
             term.write_line(
                 format!(
-                    "{} -> {}\n{} files processed.\n{} files skipped.",
+                    "\n{} -> {}: {} files processed. {} files skipped.",
                     source,
                     destination,
                     Style::new()
diff --git a/src/redacters/stream_redacter.rs b/src/redacters/stream_redacter.rs
index af5dcd7..56dc088 100644
--- a/src/redacters/stream_redacter.rs
+++ b/src/redacters/stream_redacter.rs
@@ -23,7 +23,7 @@ pub async fn redact_stream<
     redacter_base_options: &RedacterBaseOptions,
     input: S,
     file_ref: &FileSystemRef,
-    file_converters: &FileConverters,
+    file_converters: &FileConverters<'_>,
     bar: &ProgressBar,
 ) -> AppResult {
     let mut redacters_supported_options = Vec::with_capacity(redacters.len());
diff --git a/test-fixtures/media/form-example.png b/test-fixtures/media/form-example.png
new file mode 100644
index 0000000..1473181
--- /dev/null
+++ b/test-fixtures/media/form-example.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5bc295b7a4fcf808235013540800bc7386253a94741ab54cf3013792dd5ad742
+size 308951