How can we use internal or Giskard meta data mirror for testing prompt injection #1967

sturiot · 2024-07-01T17:01:15Z

sturiot
Jul 1, 2024

To run the prompt injection scan, Giskard loads its metadata from its own GitHub repository.

class PromptInjectionDataLoader (giskard/llm/loaders/prompt_injections.py)

INJECTION_DATA_URL = "https://raw.githubusercontent.com/Giskard-AI/prompt-injections/v0.0.2/prompt_injections.csv"
GISKARD_META_URL = "https://raw.githubusercontent.com/Giskard-AI/prompt-injections/v0.0.2/giskard_meta_data.csv"

The problem is that many companies do not allow access to external websites at execution time.
How can we override these URLs to use internal mirrors, load the data locally, or use custom prompts for the analysis?

Thank you,

Answered by kevinmessiaen

Jul 8, 2024

Hello,

As of today we don't have any way to override the loading of prompts that are being used by the detector. We will take a look into implementing this feature.

However for now I think that you'll probably have to declare a custom detector extending the existing one:

from typing import Optional

import pandas as pd

from giskard.datasets.base import Dataset
from giskard.llm.evaluators.string_matcher import StringMatcherConfig
from giskard.scanner.decorators import detector
from giskard.scanner.llm import LLMPromptInjectionDetector

import ast

def from_records_to_configs(records):
    configs = []
    for row in records:
        kwargs = {k: v for k, v in row.items() if k in list(Stri…

View full answer

kevinmessiaen · 2024-07-08T07:09:43Z

kevinmessiaen
Jul 8, 2024
Maintainer

Hello,

As of today we don't have any way to override the loading of prompts that are being used by the detector. We will take a look into implementing this feature.

However for now I think that you'll probably have to declare a custom detector extending the existing one:

from typing import Optional

import pandas as pd

from giskard.datasets.base import Dataset
from giskard.llm.evaluators.string_matcher import StringMatcherConfig
from giskard.scanner.decorators import detector
from giskard.scanner.llm import LLMPromptInjectionDetector

import ast

def from_records_to_configs(records):
    """Build one ``StringMatcherConfig`` per record, ignoring unrelated keys.

    Args:
        records: Iterable of mappings (for example the output of
            ``DataFrame.to_dict("records")``). Each mapping may carry extra
            keys; only keys that are annotated directly on
            ``StringMatcherConfig`` are forwarded to its constructor.

    Returns:
        A list of ``StringMatcherConfig`` instances, one per input record,
        in input order.
    """
    # Hoisted out of the loop so the field lookup is done once, and stored as
    # a set so each membership test is O(1) instead of scanning a fresh list.
    allowed_keys = frozenset(StringMatcherConfig.__annotations__)
    return [
        StringMatcherConfig(**{key: value for key, value in row.items() if key in allowed_keys})
        for row in records
    ]

class CustomPromptInjectionDataLoader:
    """Load prompt-injection prompts and their evaluation metadata from CSV files.

    Two CSV files are read and joined on their ``index`` column:

    * the prompts file, which must provide the columns ``index`` and ``prompt``;
    * the metadata file, which must provide ``index``, ``name``,
      ``group_mapping``, ``description``, ``deviation_description`` and
      ``expected_strings`` (a Python literal stored as text, parsed with
      ``ast.literal_eval``).

    Args:
        num_samples: If given, at most this many rows are kept, drawn at
            random. When it exceeds the number of available rows, all rows
            are kept instead of raising.
        prompts_path: Location of the prompts CSV. It is passed unchanged to
            ``pandas.read_csv``, so a local path or an internal mirror URL
            can be used. Defaults to ``"my_csv.csv"``.
        meta_path: Location of the metadata CSV, handled like
            ``prompts_path``. Defaults to ``"my_meta.csv"``.
    """

    def __init__(
            self,
            num_samples: Optional[int] = None,
            *,
            prompts_path: str = "my_csv.csv",
            meta_path: str = "my_meta.csv",
    ):
        self.num_samples = num_samples
        self.prompts_path = prompts_path
        self.meta_path = meta_path
        # Lazily populated by the ``df`` property on first access.
        self._df = None

    def load_dataset_from_group(self, features, group) -> "Dataset":
        """Return a ``Dataset`` in which every feature column holds the group's prompts.

        Args:
            features: Names of the columns to create; each one receives the
                same series of prompts.
            group: Value of ``group_mapping`` selecting the prompts.
        """
        prompts = self.prompts_from_group(group)
        prompts = pd.DataFrame({feature: prompts for feature in features}, index=prompts.index)
        return Dataset(
            df=prompts,
            name="Injection Prompts",
            target=None,
            cat_columns=None,
            validation=False,
        )

    @property
    def df(self):
        """Joined prompts/metadata frame, loaded on first access and then cached."""
        if self._df is None:
            prompt_injections_df = pd.read_csv(self.prompts_path, index_col=["index"])
            meta_df = pd.read_csv(self.meta_path, index_col=["index"])
            # The CSV stores the expected strings as text; turn them back
            # into Python objects (literal_eval only accepts literals).
            meta_df["expected_strings"] = meta_df["expected_strings"].apply(ast.literal_eval)
            joined = prompt_injections_df.join(meta_df)

            if self.num_samples is not None:
                # Clamp so a small custom CSV does not make ``sample`` raise.
                joined = joined.sample(min(self.num_samples, len(joined)))

            # Cache only once fully built, so a failure above never leaves a
            # partially prepared (unsampled) frame behind.
            self._df = joined

        return self._df

    @property
    def names(self):
        """List of the ``name`` column values, in frame order."""
        return self.df["name"].tolist()

    @property
    def groups(self):
        """List of distinct ``group_mapping`` values, in order of first appearance."""
        return self.df["group_mapping"].unique().tolist()

    def df_from_group(self, group):
        """Return the rows whose ``group_mapping`` equals ``group``."""
        return self.df.loc[self.df["group_mapping"] == group]

    def prompts_from_group(self, group):
        """Return the ``prompt`` column (a Series) for the given group."""
        return self.df_from_group(group)["prompt"]

    def configs_from_group(self, group):
        """Return the matcher configs built from the group's non-prompt columns."""
        configs_records = self.df_from_group(group).drop(["prompt"], axis=1).to_dict("records")
        return from_records_to_configs(configs_records)

    def group_description(self, group):
        """Return the ``description`` of the first row of the group.

        Raises:
            IndexError: If no row belongs to ``group``.
        """
        group_description = self.df_from_group(group)["description"].to_list()
        return group_description[0]

    def group_deviation_description(self, group):
        """Return the ``deviation_description`` of the first row of the group.

        Raises:
            IndexError: If no row belongs to ``group``.
        """
        group_deviation_description = self.df_from_group(group)["deviation_description"].to_list()
        return group_deviation_description[0]


@detector(
    "llm_prompt_injection",
    tags=["custom_prompt_injection", "jailbreak", "prompt_injection", "llm", "generative", "text_generation"],
)
class CustomLLMPromptInjectionDetector(LLMPromptInjectionDetector):
    """Prompt-injection detector that reads its prompts through ``CustomPromptInjectionDataLoader``.

    Everything except the ``data_loader`` property is inherited from
    ``LLMPromptInjectionDetector``.
    """

    def __init__(self, num_samples: Optional[int] = None, threshold: float = 0.5):
        """Forward ``num_samples`` and ``threshold`` unchanged to the parent constructor."""
        super().__init__(num_samples, threshold)

    @property
    def data_loader(self):
        """Return the custom data loader, creating and caching it on first use."""
        # NOTE(review): relies on ``_data_loader`` already existing (presumably
        # set to None by the parent ``__init__``) - confirm against the base class.
        loader = self._data_loader
        if loader is None:
            loader = CustomPromptInjectionDataLoader(num_samples=self.num_samples)
            self._data_loader = loader
        return loader

Then you'll be able to run this detector using the `only` parameter: `giskard.scan(model, dataset, only=['custom_prompt_injection'])`

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Giskard

How can we use internal or Giskard meta data mirror for testing prompt injection #1967

{{title}}

Replies: 1 comment

{{title}}

Select a reply

Giskard

How can we use internal or Giskard meta data mirror for testing prompt injection #1967

sturiot Jul 1, 2024

Replies: 1 comment

kevinmessiaen Jul 8, 2024 Maintainer

sturiot
Jul 1, 2024

kevinmessiaen
Jul 8, 2024
Maintainer