How can we use an internal mirror of the Giskard metadata for testing prompt injection? #1967
-
To run the prompt injection scan, Giskard loads its metadata from its own GitHub repository: the class PromptInjectionDataLoader (giskard/llm/loaders/prompt_injections.py) defines INJECTION_DATA_URL = "https://raw.githubusercontent.com/Giskard-AI/prompt-injections/v0.0.2/prompt_injections.csv". The problem is that many companies do not allow access to external websites during execution. Thank you,
Beta Was this translation helpful? Give feedback.
Replies: 1 comment
-
Hello, As of today we don't have any way to override the loading of the prompts used by the detector. We will look into implementing this feature. For now, however, you'll probably have to declare a custom detector that extends the existing one:
from typing import Optional
import pandas as pd
from giskard.datasets.base import Dataset
from giskard.llm.evaluators.string_matcher import StringMatcherConfig
from giskard.scanner.decorators import detector
from giskard.scanner.llm import LLMPromptInjectionDetector
import ast
def from_records_to_configs(records):
    """Build one ``StringMatcherConfig`` per record.

    Args:
        records: Iterable of mappings, e.g. the result of
            ``DataFrame.to_dict("records")``. Keys that are not annotated
            fields of ``StringMatcherConfig`` are dropped before the
            config is constructed.

    Returns:
        A list of ``StringMatcherConfig`` objects, in the same order as
        ``records``.
    """
    # The accepted field names are identical for every row, so resolve them
    # once instead of rebuilding a list for every key of every record.
    config_fields = set(StringMatcherConfig.__annotations__)
    return [
        StringMatcherConfig(**{k: v for k, v in row.items() if k in config_fields})
        for row in records
    ]
class CustomPromptInjectionDataLoader:
    """Load prompt-injection prompts and their metadata from local CSV files.

    Two CSV files are read, both indexed by a column named ``index``, and
    joined on that index. The ``expected_strings`` column of the metadata
    file is parsed with ``ast.literal_eval``. The methods of this class read
    the columns ``prompt``, ``name``, ``group_mapping``, ``description`` and
    ``deviation_description`` from the joined frame.

    Args:
        num_samples: If not ``None``, number of rows randomly sampled from
            the joined data the first time it is loaded.
        prompts_path: Source of the prompts CSV, passed to
            ``pandas.read_csv``. Defaults to ``"my_csv.csv"``.
        meta_path: Source of the metadata CSV, passed to
            ``pandas.read_csv``. Defaults to ``"my_meta.csv"``.
    """

    def __init__(
        self,
        num_samples: Optional[int] = None,
        prompts_path="my_csv.csv",
        meta_path="my_meta.csv",
    ):
        self.num_samples = num_samples
        self.prompts_path = prompts_path
        self.meta_path = meta_path
        # Filled lazily by the ``df`` property on first access.
        self._df = None

    def load_dataset_from_group(self, features, group) -> "Dataset":
        """Return a ``Dataset`` holding the prompts of ``group``.

        Every name in ``features`` becomes a column containing the same
        prompts, indexed like the prompts themselves.
        """
        prompts = self.prompts_from_group(group)
        frame = pd.DataFrame(
            {feature: prompts for feature in features},
            index=prompts.index,
        )
        return Dataset(
            df=frame,
            name="Injection Prompts",
            target=None,
            cat_columns=None,
            validation=False,
        )

    @property
    def df(self):
        """Joined prompts/metadata frame, loaded and cached on first access."""
        if self._df is None:
            prompts_df = pd.read_csv(self.prompts_path, index_col=["index"])
            meta_df = pd.read_csv(self.meta_path, index_col=["index"])
            # The CSV stores Python literals as text; turn them back into
            # Python objects.
            meta_df["expected_strings"] = meta_df["expected_strings"].apply(ast.literal_eval)
            joined = prompts_df.join(meta_df)
            # Sample only while filling the cache so that every later access
            # sees the same rows.
            if self.num_samples is not None:
                joined = joined.sample(self.num_samples)
            self._df = joined
        return self._df

    @property
    def names(self):
        """Values of the ``name`` column, as a list."""
        return self.df["name"].tolist()

    @property
    def groups(self):
        """Distinct values of the ``group_mapping`` column, as a list."""
        return self.df["group_mapping"].unique().tolist()

    def df_from_group(self, group):
        """Return the rows whose ``group_mapping`` equals ``group``."""
        return self.df.loc[self.df["group_mapping"] == group]

    def prompts_from_group(self, group):
        """Return the ``prompt`` column of the rows belonging to ``group``."""
        return self.df_from_group(group)["prompt"]

    def configs_from_group(self, group):
        """Return the matcher configs built from the rows of ``group``.

        The ``prompt`` column is dropped; the remaining columns of each row
        are handed to ``from_records_to_configs``.
        """
        records = self.df_from_group(group).drop(["prompt"], axis=1).to_dict("records")
        return from_records_to_configs(records)

    def group_description(self, group):
        """Return the ``description`` of the first row of ``group``.

        Raises:
            IndexError: If no row belongs to ``group``.
        """
        descriptions = self.df_from_group(group)["description"].to_list()
        return descriptions[0]

    def group_deviation_description(self, group):
        """Return the ``deviation_description`` of the first row of ``group``.

        Raises:
            IndexError: If no row belongs to ``group``.
        """
        descriptions = self.df_from_group(group)["deviation_description"].to_list()
        return descriptions[0]
@detector(
    "llm_prompt_injection",
    tags=["custom_prompt_injection", "jailbreak", "prompt_injection", "llm", "generative", "text_generation"],
)
class CustomLLMPromptInjectionDetector(LLMPromptInjectionDetector):
    """Prompt-injection detector that gets its prompts from ``CustomPromptInjectionDataLoader``.

    Args:
        num_samples: Forwarded to the parent constructor and to the data
            loader created by ``data_loader``.
        threshold: Forwarded to the parent constructor.
    """

    def __init__(self, num_samples: Optional[int] = None, threshold: float = 0.5):
        super().__init__(num_samples, threshold)

    @property
    def data_loader(self):
        """Return the custom data loader, creating it on first access."""
        # NOTE(review): relies on the parent __init__ setting ``_data_loader``
        # (to None) and ``num_samples`` -- confirm against the giskard version
        # in use.
        if self._data_loader is None:
            self._data_loader = CustomPromptInjectionDataLoader(num_samples=self.num_samples)
        return self._data_loader
Then you'll be able to run this detector using the `custom_prompt_injection` tag, for example by passing it to the `only` argument of `giskard.scan`.
Beta Was this translation helpful? Give feedback.
Hello,
As of today we don't have any way to override the loading of prompts that are being used by the detector. We will take a look into implementing this feature.
However for now I think that you'll probably have to declare a custom detector extending the existing one: