Skip to content

Commit

Permalink
Open AI LLM redacting support (#8)
Browse files Browse the repository at this point in the history
* Open AI LLM as DLP

* Documentation update
  • Loading branch information
abdolence authored Aug 9, 2024
1 parent 27e5be7 commit 3f953d7
Show file tree
Hide file tree
Showing 7 changed files with 292 additions and 17 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ ci-gcp = [] # For testing on CI/GCP
ci-aws = [] # For testing on CI/AWS
ci-ms-presidio = [] # For testing on CI/MS Presidio
ci-gcp-llm = [] # For testing on CI/GCP with LLM models
ci = ["ci-gcp", "ci-aws", "ci-ms-presidio", "ci-gcp-llm"]
ci-open-ai = [] # For testing on CI/OpenAI
ci = ["ci-gcp", "ci-aws", "ci-ms-presidio", "ci-gcp-llm", "ci-open-ai"]


[dependencies]
Expand Down
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ Google Cloud Platform's DLP API.
* images
* [Gemini LLM](https://ai.google.dev/gemini-api/docs) based redaction
* text, html, csv, json files
* [Open AI LLM](https://openai.com/) based redaction
* text, html, csv, json files
* ... more DLP providers can be added in the future.
* **CLI:** Easy-to-use command-line interface for streamlined workflows.
* Built with Rust to ensure speed, safety, and reliability.
Expand Down Expand Up @@ -67,7 +69,7 @@ Options:
-f, --filename-filter <FILENAME_FILTER>
Filter by name using glob patterns such as *.txt
-d, --redact <REDACT>
Redacter type [possible values: gcp-dlp, aws-comprehend, ms-presidio, gemini-llm]
Redacter type [possible values: gcp-dlp, aws-comprehend, ms-presidio, gemini-llm, open-ai-llm]
--gcp-project-id <GCP_PROJECT_ID>
GCP project id that will be used to redact and bill API calls
--allow-unsupported-copies
Expand All @@ -86,6 +88,10 @@ Options:
Gemini model name for Gemini LLM redacter. Default is 'models/gemini-1.5-flash'
--sampling-size <SAMPLING_SIZE>
Sampling size in bytes before redacting files. Disabled by default
--open-ai-api-key <OPEN_AI_API_KEY>
API key for OpenAI LLM redacter
--open-ai-model <OPEN_AI_MODEL>
Open AI model name for OpenAI LLM redacter. Default is 'gpt-4o-mini'
-h, --help
Print help
```
Expand Down Expand Up @@ -135,6 +141,11 @@ To be able to use GCP DLP you need to:
official [instructions](https://ai.google.dev/gemini-api/docs/oauth#set-cloud).
- provide a GCP project id using `--gcp-project-id` option.

### Open AI LLM

To be able to use Open AI LLM you need to provide an API key using the `--open-ai-api-key` command line option.
Optionally, you can provide a model name using the `--open-ai-model` option. The default is `gpt-4o-mini`.

## Examples:

```sh
Expand Down
25 changes: 24 additions & 1 deletion src/args.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use crate::common_types::GcpProjectId;
use crate::errors::AppError;
use crate::redacters::{
GcpDlpRedacterOptions, GeminiLlmModelName, RedacterOptions, RedacterProviderOptions,
GcpDlpRedacterOptions, GeminiLlmModelName, OpenAiLlmApiKey, OpenAiModelName, RedacterOptions,
RedacterProviderOptions,
};
use clap::*;
use std::fmt::Display;
Expand Down Expand Up @@ -62,6 +63,7 @@ pub enum RedacterType {
AwsComprehend,
MsPresidio,
GeminiLlm,
OpenAiLlm,
}

impl std::str::FromStr for RedacterType {
Expand All @@ -85,6 +87,7 @@ impl Display for RedacterType {
RedacterType::AwsComprehend => write!(f, "aws-comprehend"),
RedacterType::MsPresidio => write!(f, "ms-presidio"),
RedacterType::GeminiLlm => write!(f, "gemini-llm"),
RedacterType::OpenAiLlm => write!(f, "openai-llm"),
}
}
}
Expand Down Expand Up @@ -138,6 +141,15 @@ pub struct RedacterArgs {
help = "Sampling size in bytes before redacting files. Disabled by default"
)]
pub sampling_size: Option<usize>,

#[arg(long, help = "API key for OpenAI LLM redacter")]
pub open_ai_api_key: Option<OpenAiLlmApiKey>,

#[arg(
long,
help = "Open AI model name for OpenAI LLM redacter. Default is 'gpt-4o-mini'"
)]
pub open_ai_model: Option<OpenAiModelName>,
}

impl TryInto<RedacterOptions> for RedacterArgs {
Expand Down Expand Up @@ -186,6 +198,17 @@ impl TryInto<RedacterOptions> for RedacterArgs {
gemini_model: self.gemini_model,
},
)),
Some(RedacterType::OpenAiLlm) => Ok(RedacterProviderOptions::OpenAiLlm(
crate::redacters::OpenAiLlmRedacterOptions {
api_key: self
.open_ai_api_key
.ok_or_else(|| AppError::RedacterConfigError {
message: "OpenAI API key is required for OpenAI LLM redacter"
.to_string(),
})?,
model: self.open_ai_model,
},
)),
None => Err(AppError::RedacterConfigError {
message: "Redacter type is required".to_string(),
}),
Expand Down
3 changes: 2 additions & 1 deletion src/commands/copy_command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,9 @@ async fn transfer_and_redact_file<
};
bar.println(
format!(
"Copying {} ({}) to {}. Size: {}",
"Copying {} ({},{}) to {}. Size: {}",
bold_style.apply_to(&base_resolved_file_ref.file_path),
base_resolved_file_ref.scheme,
file_ref
.media_type
.as_ref()
Expand Down
12 changes: 0 additions & 12 deletions src/filesystems/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,18 +38,6 @@ pub struct AbsoluteFilePath {
pub scheme: String,
}

impl AbsoluteFilePath {
    /// Renders this location as a single string of the form
    /// `scheme://file_path`, combining the `scheme` and `file_path` fields.
    pub fn value(&self) -> String {
        format!(
            "{scheme}://{path}",
            scheme = self.scheme,
            path = self.file_path
        )
    }
}

impl RelativeFilePath {
    /// Reports whether this relative path is treated as a directory:
    /// `true` exactly when its string value ends with a trailing `/`.
    pub fn is_dir(&self) -> bool {
        let path = self.value();
        path.ends_with('/')
    }
}

#[derive(Debug, Clone)]
pub struct FileSystemRef {
pub relative_path: RelativeFilePath,
Expand Down
14 changes: 13 additions & 1 deletion src/redacters/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::errors::AppError;
use crate::filesystems::FileSystemRef;
use crate::reporter::AppReporter;
use crate::AppResult;
Expand All @@ -16,9 +17,11 @@ mod ms_presidio;
pub use ms_presidio::*;

mod gemini_llm;
use crate::errors::AppError;
pub use gemini_llm::*;

mod open_ai_llm;
pub use open_ai_llm::*;

#[derive(Debug, Clone)]
pub struct RedacterDataItem {
pub content: RedacterDataItemContent,
Expand All @@ -44,6 +47,7 @@ pub enum Redacters<'a> {
AwsComprehendDlp(AwsComprehendRedacter<'a>),
MsPresidio(MsPresidioRedacter<'a>),
GeminiLlm(GeminiLlmRedacter<'a>),
OpenAiLlm(OpenAiLlmRedacter<'a>),
}

#[derive(Debug, Clone)]
Expand All @@ -61,6 +65,7 @@ pub enum RedacterProviderOptions {
AwsComprehend(AwsComprehendRedacterOptions),
MsPresidio(MsPresidioRedacterOptions),
GeminiLlm(GeminiLlmRedacterOptions),
OpenAiLlm(OpenAiLlmRedacterOptions),
}

impl Display for RedacterOptions {
Expand All @@ -70,6 +75,7 @@ impl Display for RedacterOptions {
RedacterProviderOptions::AwsComprehend(_) => write!(f, "aws-comprehend-dlp"),
RedacterProviderOptions::MsPresidio(_) => write!(f, "ms-presidio"),
RedacterProviderOptions::GeminiLlm(_) => write!(f, "gemini-llm"),
RedacterProviderOptions::OpenAiLlm(_) => write!(f, "openai-llm"),
}
}
}
Expand All @@ -94,6 +100,9 @@ impl<'a> Redacters<'a> {
RedacterProviderOptions::GeminiLlm(ref options) => Ok(Redacters::GeminiLlm(
GeminiLlmRedacter::new(redacter_options.clone(), options.clone(), reporter).await?,
)),
RedacterProviderOptions::OpenAiLlm(ref options) => Ok(Redacters::OpenAiLlm(
OpenAiLlmRedacter::new(redacter_options.clone(), options.clone(), reporter).await?,
)),
}
}

Expand Down Expand Up @@ -147,6 +156,7 @@ impl<'a> Redacter for Redacters<'a> {
Redacters::AwsComprehendDlp(redacter) => redacter.redact(input).await,
Redacters::MsPresidio(redacter) => redacter.redact(input).await,
Redacters::GeminiLlm(redacter) => redacter.redact(input).await,
Redacters::OpenAiLlm(redacter) => redacter.redact(input).await,
}
}

Expand All @@ -161,6 +171,7 @@ impl<'a> Redacter for Redacters<'a> {
}
Redacters::MsPresidio(redacter) => redacter.redact_supported_options(file_ref).await,
Redacters::GeminiLlm(redacter) => redacter.redact_supported_options(file_ref).await,
Redacters::OpenAiLlm(redacter) => redacter.redact_supported_options(file_ref).await,
}
}

Expand All @@ -170,6 +181,7 @@ impl<'a> Redacter for Redacters<'a> {
Redacters::AwsComprehendDlp(redacter) => redacter.options(),
Redacters::MsPresidio(redacter) => redacter.options(),
Redacters::GeminiLlm(redacter) => redacter.options(),
Redacters::OpenAiLlm(redacter) => redacter.options(),
}
}
}
Expand Down
Loading

0 comments on commit 3f953d7

Please sign in to comment.