Skip to content

Commit

Permalink
OCR WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
abdolence committed Aug 18, 2024
1 parent d5fa6d0 commit f97d14e
Show file tree
Hide file tree
Showing 11 changed files with 294 additions and 60 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.rten filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
*.swp
tmp/
lib/
models/
52 changes: 52 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,18 @@ categories = ["command-line-utilities"]
description = "Copy & Redact cli tool to securely copy and redact files removing Personal Identifiable Information (PII) across various filesystems."

[features]
default = ["pdf-render", "clipboard"]
default = ["pdf-render", "clipboard", "ocr"]
ci-gcp = [] # For testing on CI/GCP
ci-aws = [] # For testing on CI/AWS
ci-ms-presidio = [] # For testing on CI/MS Presidio
ci-gcp-llm = [] # For testing on CI/GCP with LLM models
ci-open-ai = [] # For testing on CI/OpenAI
ci-clibpoard = [] # For testing on CI/Clipboard
ci-ocr = [] # For testing on CI/OCR
ci = ["ci-gcp", "ci-aws", "ci-ms-presidio", "ci-gcp-llm", "ci-open-ai", "ci-clibpoard"]
pdf-render = ["pdfium-render"]
clipboard = ["arboard"]
ocr = ["ocrs", "rten", "rten-imageproc"]


[dependencies]
Expand All @@ -43,6 +45,7 @@ sha2 = "0.10"
async-trait = "0.1"
hex = "0.4"
thiserror = "1"
anyhow = "1"
sync_wrapper = { version = "1", features = ["futures"] }
async-recursion = "1"
mime = "0.3"
Expand All @@ -65,6 +68,9 @@ bytes = { version = "1" }
serde_json = "1"
arboard = { version = "3", features = ["image"], optional = true }
ocrs = { version = "0.8", optional = true }
rten = { version = "0.10", optional = true }
rten-imageproc = { version = "0.10", optional = true }
dirs = "5.0.1"



Expand Down
115 changes: 72 additions & 43 deletions src/commands/copy_command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,48 +49,17 @@ pub async fn command_copy(
options: CopyCommandOptions,
redacter_options: Option<RedacterOptions>,
) -> AppResult<CopyCommandResult> {
let bold_style = Style::new().bold();
let redacted_output = if let Some(ref options) = redacter_options.as_ref() {
bold_style
.clone()
.green()
.apply_to(format!("✓ Yes ({})", &options))
} else {
bold_style.clone().red().apply_to("✗ No".to_string())
};
let sampling_output = if let Some(ref sampling_size) = redacter_options
.as_ref()
.and_then(|o| o.base_options.sampling_size)
{
Style::new().apply_to(format!("{} bytes.", sampling_size))
} else {
Style::new().dim().apply_to("-".to_string())
};
let term_reporter = AppReporter::from(term);
let file_converters = FileConverters::new().init(&term_reporter).await?;

let mut file_converters = FileConverters::new();
file_converters.init().await?;

let converter_style = Style::new();
let pdf_support_output = if file_converters.pdf_image_converter.is_some() {
converter_style
.clone()
.green()
.apply_to("✓ Yes".to_string())
} else {
converter_style.clone().dim().apply_to("✗ No".to_string())
};

term.write_line(
format!(
"Copying from {} to {}.\nRedacting: {}.\nSampling: {}\nPDF to image support: {}\n",
bold_style.clone().white().apply_to(source),
bold_style.clone().yellow().apply_to(destination),
redacted_output,
sampling_output,
pdf_support_output,
)
.as_str(),
)?;
report_copy_info(
term,
source,
destination,
&redacter_options,
&file_converters,
)
.await?;

let bar = ProgressBar::new(1);
bar.set_style(
Expand Down Expand Up @@ -131,6 +100,7 @@ pub async fn command_copy(
.iter()
.map(|file| file.file_size.unwrap_or(0))
.sum();
let bold_style = Style::new().bold();
bar.println(
format!(
"Found {} files. Total size: {}",
Expand Down Expand Up @@ -196,6 +166,65 @@ pub async fn command_copy(
copy_result
}

/// Prints a short summary of the copy operation to the terminal.
///
/// The summary names the source and destination and reports whether redaction
/// is enabled, the configured sampling size (if any), and whether the
/// PDF-to-image and OCR converters are present in `file_converters`.
///
/// # Errors
///
/// Propagates any error returned while writing the summary line to `term`.
async fn report_copy_info(
    term: &Term,
    source: &str,
    destination: &str,
    redacter_options: &Option<RedacterOptions>,
    file_converters: &FileConverters<'_>,
) -> AppResult<()> {
    let bold = Style::new().bold();
    let plain = Style::new();

    // Shared rendering for the "is this converter available?" flags.
    let availability = |available: bool| {
        if available {
            plain.clone().green().apply_to("✓ Yes".to_string())
        } else {
            plain.clone().dim().apply_to("✗ No".to_string())
        }
    };

    let redacting = match redacter_options {
        Some(options) => bold
            .clone()
            .green()
            .apply_to(format!("✓ Yes ({})", options)),
        None => bold.clone().red().apply_to("✗ No".to_string()),
    };

    let configured_sampling = redacter_options
        .as_ref()
        .and_then(|o| o.base_options.sampling_size);
    let sampling = match configured_sampling {
        Some(size) => Style::new().apply_to(format!("{} bytes.", size)),
        None => Style::new().dim().apply_to("-".to_string()),
    };

    let pdf_support = availability(file_converters.pdf_image_converter.is_some());
    let ocr_support = availability(file_converters.ocr.is_some());

    let summary = format!(
        "Copying from {} to {}.\nRedacting: {}. | Sampling: {} | PDF to image support: {} | OCR support: {}\n",
        bold.clone().white().apply_to(source),
        bold.clone().yellow().apply_to(destination),
        redacting,
        sampling,
        pdf_support,
        ocr_support,
    );
    term.write_line(summary.as_str())?;
    Ok(())
}

enum TransferFileResult {
Copied,
Skipped,
Expand All @@ -214,7 +243,7 @@ async fn transfer_and_redact_file<
destination_fs: &mut DFS,
options: &CopyCommandOptions,
redacter: &Option<(RedacterBaseOptions, Vec<impl Redacter>)>,
file_converters: &FileConverters,
file_converters: &FileConverters<'a>,
) -> AppResult<TransferFileResult> {
let bold_style = Style::new().bold().white();
let (base_file_ref, source_reader) = source_fs.download(source_file_ref).await?;
Expand Down Expand Up @@ -312,7 +341,7 @@ async fn redact_upload_file<
dest_file_ref: &FileSystemRef,
options: &CopyCommandOptions,
redacter_with_options: &(RedacterBaseOptions, Vec<impl Redacter>),
file_converters: &FileConverters,
file_converters: &FileConverters<'a>,
) -> AppResult<TransferFileResult> {
let (redacter_base_options, redacters) = redacter_with_options;
let mut support_redacters = Vec::new();
Expand Down
20 changes: 20 additions & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,19 @@ pub enum AppError {
SystemTimeError(#[from] SystemTimeError),
#[error("JSON serialization error: {0}")]
JsonSerializeError(#[from] serde_json::Error),
#[cfg(feature = "ocr")]
#[error("Model load error: {0}")]
OcrModelLoadError(#[from] rten::ModelLoadError),
#[cfg(feature = "ocr")]
#[error("OCR image error: {0}")]
OcrImageError(#[from] ocrs::ImageSourceError),
#[error("System error: {message}")]
SystemError { message: String },
#[error("System error: {message}")]
SystemErrorWithCause {
message: String,
cause: Box<dyn std::fmt::Debug + Send + Sync + 'static>,
},
}

impl<
Expand All @@ -67,3 +78,12 @@ impl<T: std::fmt::Debug + Send + Sync + 'static>
Self::GoogleCloudRestSdkApiError(Box::new(err))
}
}

/// Converts an `anyhow::Error` into `AppError::SystemErrorWithCause`, using the
/// error's display text as the message and keeping the error itself as the cause.
impl From<anyhow::Error> for AppError {
    fn from(err: anyhow::Error) -> Self {
        // Render the message first: boxing the error below moves `err`.
        let message = err.to_string();
        Self::SystemErrorWithCause {
            message,
            cause: Box::new(err),
        }
    }
}
33 changes: 19 additions & 14 deletions src/file_converters/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::file_converters::ocr::Ocr;
use crate::file_converters::pdf::PdfToImage;
use crate::reporter::AppReporter;
use crate::AppResult;

pub mod ocr;
Expand All @@ -8,32 +9,36 @@ pub mod pdf;
#[cfg(feature = "pdf-render")]
mod pdf_image_converter;

pub struct FileConverters {
pub pdf_image_converter: Option<Box<dyn PdfToImage + 'static>>,
pub ocr: Option<Box<dyn Ocr + 'static>>,
#[cfg(feature = "ocr")]
mod ocr_ocrs;

pub struct FileConverters<'a> {
pub pdf_image_converter: Option<Box<dyn PdfToImage + 'a>>,
pub ocr: Option<Box<dyn Ocr + 'a>>,
}

impl FileConverters {
impl<'a> FileConverters<'a> {
pub fn new() -> Self {
Self {
pdf_image_converter: None,
ocr: None,
}
}

#[cfg(feature = "pdf-render")]
pub async fn init(&mut self) -> AppResult<()> {
match crate::file_converters::pdf_image_converter::PdfImageConverter::new().ok() {
Some(pdf_image_converter) => {
pub async fn init(mut self, app_reporter: &'a AppReporter<'a>) -> AppResult<Self> {
#[cfg(feature = "pdf-render")]
{
if let Some(pdf_image_converter) = pdf_image_converter::PdfImageConverter::new().ok() {
self.pdf_image_converter = Some(Box::new(pdf_image_converter));
Ok(())
}
None => Ok(()),
}
}
#[cfg(feature = "ocr")]
{
if let Some(ocr) = ocr_ocrs::Ocrs::new(app_reporter).ok() {
self.ocr = Some(Box::new(ocr));
}
}

#[cfg(not(feature = "pdf-render"))]
pub async fn init(&mut self) -> AppResult<()> {
Ok(())
Ok(self)
}
}
Loading

0 comments on commit f97d14e

Please sign in to comment.