Skip to content

Commit

Permalink
Docs
Browse files Browse the repository at this point in the history
  • Loading branch information
abdolence committed Aug 14, 2024
1 parent 2c7d894 commit 1c6c43b
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 57 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "redacter"
version = "0.6.0"
version = "0.7.0"
edition = "2021"
authors = ["Abdulla Abdurakhmanov <[email protected]>"]
license = "Apache-2.0"
Expand Down
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,14 @@ Google Cloud Platform's DLP API.
* text, html, json files
* structured data table files (csv)
* images (jpeg, png, bmp, gif)
* PDF files (rendering as images)
* [AWS Comprehend](https://aws.amazon.com/comprehend/) PII redaction:
* text, html, csv, json files
* [Microsoft Presidio](https://microsoft.github.io/presidio/) for PII redaction (open source project that you can
install on-prem).
* text, html, csv, json files
* images
* PDF files (rendering as images)
* [Gemini LLM](https://ai.google.dev/gemini-api/docs) based redaction
* text, html, csv, json files
* [Open AI LLM](https://openai.com/) based redaction
Expand Down Expand Up @@ -151,6 +153,25 @@ Optionally, you can provide a model name using `--open-ai-model` option. Default
You can specify multiple redacters using `--redact` option multiple times.
The tool will apply redaction in the order of the redacters specified.

## PDF redaction

PDF redaction is supported by rendering PDF files as images and redacting them.
To render and convert PDF files, the tool uses the external library `Pdfium` (the C++ PDF library used by the Google Chromium
project).
This library needs to be installed separately on your system.

Installation instructions:

- Download the latest release for your system, for example from
[Pdfium releases](https://github.com/bblanchon/pdfium-binaries/releases).
- Extract the archive and copy the library file `libpdfium.so` to one of the following directories:
- The directory where the redacter tool is installed (such as `/usr/local/bin`)
- The `lib` directory next to the tool's directory (for example, `/usr/local/lib/` if you have installed the tool in `/usr/local/bin`)
- A directory listed in the `LD_LIBRARY_PATH` environment variable

If the library is detected correctly, it will be reported in the tool output as:
> PDF to image support: ✓ Yes
## Examples:

```sh
Expand Down
6 changes: 5 additions & 1 deletion src/commands/copy_command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ pub async fn command_copy(

let converter_style = Style::new();
let pdf_support_output = if file_converters.pdf_image_converter.is_some() {
converter_style.clone().green().apply_to(format!("✓ Yes"))
converter_style
.clone()
.green()
.apply_to("✓ Yes".to_string())
} else {
converter_style.clone().dim().apply_to("✗ No".to_string())
};
Expand Down Expand Up @@ -192,6 +195,7 @@ enum TransferFileResult {
Skipped,
}

#[allow(clippy::too_many_arguments)]
async fn transfer_and_redact_file<
'a,
SFS: FileSystemConnection<'a>,
Expand Down
1 change: 0 additions & 1 deletion src/file_converters/pdf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ impl PdfImageConverter {
Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(
&current_dir
.parent()
.clone()
.map(|p| p.join("lib"))
.unwrap_or(current_dir.clone()),
))
Expand Down
122 changes: 69 additions & 53 deletions src/redacters/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pub use gemini_llm::*;

mod open_ai_llm;
use crate::args::RedacterType;
use crate::file_converters::pdf::{PdfInfo, PdfPageInfo};
use crate::file_converters::pdf::{PdfInfo, PdfPageInfo, PdfToImage};
use crate::file_converters::FileConverters;
pub use open_ai_llm::*;

Expand Down Expand Up @@ -242,58 +242,17 @@ pub async fn redact_stream<
}
RedactSupportedOptions::SupportedAsImages => {
match file_converters.pdf_image_converter {
Some(ref converter) => match redacted.content {
RedacterDataItemContent::Pdf { data } => {
bar.println(format!(
"{width}↳ Redacting using {} redacter and converting the PDF to images",
redacter.redacter_type()
));
let pdf_info = converter.convert_to_images(data)?;
bar.println(format!(
"{width} ↳ Converting {pdf_info_pages} images",
pdf_info_pages = pdf_info.pages.len()
));
let mut redacted_pages = Vec::with_capacity(pdf_info.pages.len());
for page in pdf_info.pages {
let mut png_image_bytes = std::io::Cursor::new(Vec::new());
page.page_as_images
.write_to(&mut png_image_bytes, ImageFormat::Png)?;
let image_to_redact = RedacterDataItem {
content: RedacterDataItemContent::Image {
mime_type: mime::IMAGE_PNG,
data: png_image_bytes.into_inner().into(),
},
file_ref: file_ref.clone(),
};
let redacted_image = redacter.redact(image_to_redact).await?;
match redacted_image.content {
RedacterDataItemContent::Image { data, .. } => {
redacted_pages.push(PdfPageInfo {
page_as_images: image::load_from_memory_with_format(
&data,
ImageFormat::Png,
)?,
..page
});
}
_ => {}
}
}
let redacted_pdf_info = PdfInfo {
pages: redacted_pages,
..pdf_info
};
let redact_pdf_as_images =
converter.images_to_pdf(redacted_pdf_info)?;
redacted = RedacterDataItem {
content: RedacterDataItemContent::Pdf {
data: redact_pdf_as_images,
},
file_ref: file_ref.clone(),
}
}
_ => {}
},
Some(ref converter) => {
redacted = redact_pdf_with_images_converter(
file_ref,
bar,
redacted,
*redacter,
&width,
converter.as_ref(),
)
.await?
}
None => {
bar.println(format!(
"{width}↲ Skipping redaction because PDF to image converter is not available",
Expand Down Expand Up @@ -471,3 +430,60 @@ async fn stream_to_pdf_redact_item<
file_ref: file_ref.clone(),
})
}

/// Redacts a PDF item by rendering its pages as images and redacting every image.
///
/// When `redacted` carries PDF content, the PDF is converted to per-page images with
/// `converter`, each page is encoded as PNG and passed through `redacter`, and the
/// redacted pages are assembled back into a PDF that is returned with a clone of
/// `file_ref`. Items with any other content are returned unchanged.
///
/// `width` is the text prepended to every progress message printed on `bar`.
///
/// # Errors
///
/// Propagates failures from the PDF-to-image conversion, PNG encoding and decoding,
/// the redacter itself, and the final image-to-PDF conversion.
async fn redact_pdf_with_images_converter(
    file_ref: &FileSystemRef,
    bar: &ProgressBar,
    redacted: RedacterDataItem,
    redacter: &impl Redacter,
    width: &str,
    converter: &dyn PdfToImage,
) -> Result<RedacterDataItem, AppError> {
    match redacted.content {
        RedacterDataItemContent::Pdf { data } => {
            bar.println(format!(
                "{width}↳ Redacting using {} redacter and converting the PDF to images",
                redacter.redacter_type()
            ));
            let pdf_info = converter.convert_to_images(data)?;
            bar.println(format!(
                "{width} ↳ Converting {pdf_info_pages} images",
                pdf_info_pages = pdf_info.pages.len()
            ));
            let mut redacted_pages = Vec::with_capacity(pdf_info.pages.len());
            for page in pdf_info.pages {
                // The redacter works on encoded image bytes, so serialize the rendered page as PNG.
                let mut png_image_bytes = std::io::Cursor::new(Vec::new());
                page.page_as_images
                    .write_to(&mut png_image_bytes, ImageFormat::Png)?;
                let image_to_redact = RedacterDataItem {
                    content: RedacterDataItemContent::Image {
                        mime_type: mime::IMAGE_PNG,
                        data: png_image_bytes.into_inner().into(),
                    },
                    file_ref: file_ref.clone(),
                };
                let redacted_image = redacter.redact(image_to_redact).await?;
                // NOTE(review): a page whose redacted content is not an image is omitted
                // from the resulting PDF — confirm this is the intended behaviour.
                if let RedacterDataItemContent::Image { data, .. } = redacted_image.content {
                    redacted_pages.push(PdfPageInfo {
                        // Decode the redacted PNG back into an image; the remaining page
                        // fields are carried over from the original page.
                        page_as_images: image::load_from_memory_with_format(
                            &data,
                            ImageFormat::Png,
                        )?,
                        ..page
                    });
                }
            }
            let redacted_pdf_info = PdfInfo {
                pages: redacted_pages,
            };
            let redact_pdf_as_images = converter.images_to_pdf(redacted_pdf_info)?;
            Ok(RedacterDataItem {
                content: RedacterDataItemContent::Pdf {
                    data: redact_pdf_as_images,
                },
                file_ref: file_ref.clone(),
            })
        }
        // Non-PDF content is passed through untouched.
        _ => Ok(redacted),
    }
}

0 comments on commit 1c6c43b

Please sign in to comment.