from _types import AttackType, Focus
from openai import OpenAI

OpenAIClient = OpenAI

SUMMARIZE_ABSTRACT_PROMPT = """\
You will be provided with an abstract of a scientific paper. \
Compress this abstract into 1-2 sentences. Use very concise language usable as \
bullet points on a slide deck. Respond ONLY with your summary.
"""

ASSIGN_LABEL_PROMPT = """\
You will be provided with an abstract of a scientific paper. \
Assess the most applicable focus label based on the target audience, \
research focus, produced materials, and key outcomes.
{labels}
Respond with ONLY ONE of the labels above. Do not include anything else in your response.
"""

# Attack Type descriptions

EVASION_DESCRIPTION = """\
Model Evasion is an adversarial attack aimed at bypassing or evading a machine
learning model's defenses, usually to make it produce incorrect outputs or behave
in ways that favor the attacker. In this context, the adversary doesn't try to
"break" the model or extract data from it (like in model inversion) but instead
seeks to manipulate the model's behavior in a way that allows them to achieve a
desired outcome, such as bypassing detection systems or generating misleading predictions.
"""

EXTRACTION_DESCRIPTION = """\
Model Extraction refers to an attack where an adversary tries to replicate or steal
the functionality of a machine learning model by querying it and using the outputs
to build a copy of the original model. This type of attack doesn't necessarily involve
extracting sensitive data used for training, as in model inversion, but instead focuses
on how the model behaves—its predictions and outputs—in order to create a surrogate or
shadow model that behaves similarly to the original.
"""

INVERSION_DESCRIPTION = """\
Model inversion refers to a set of techniques in machine learning where an attacker
tries to extract confidential information from a trained AI model by interacting with
it in specific ways, often through extensive querying. By doing so, the attacker may
be able to infer details about the data used to train the model. These details can
range from personal information to the reconstruction of private or sensitive datasets,
potentially revealing confidential information.
"""

POISONING_DESCRIPTION = """\
Model Poisoning is an attack on machine learning models where an adversary intentionally
manipulates data in the training set to impact how a model behaves. Unlike attacks like
model inversion or model extraction, which focus on extracting information from the model,
model poisoning targets the model during its training phase. By introducing misleading,
incorrect, or adversarial data, attackers can manipulate a model's behavior, often without
detection, leading to significant security, reliability, and ethical risks.
"""

PROMPT_INJECTION_DESCRIPTION = """\
Prompt injection is a critical vulnerability in Large Language Models (LLMs), where malicious
users manipulate model behavior by crafting inputs that override, bypass, or exploit how the
model follows instructions. This vulnerability has become more pronounced with the widespread
use of generative AI systems, enabling attackers to induce unintended responses that may lead
to data leakage, misinformation, or system disruptions.
"""

ATTACK_TYPE_DESCRIPTIONS: dict[AttackType, str] = {
    AttackType.ModelEvasion: EVASION_DESCRIPTION,
    AttackType.ModelExtraction: EXTRACTION_DESCRIPTION,
    AttackType.ModelInversion: INVERSION_DESCRIPTION,
    AttackType.ModelPoisoning: POISONING_DESCRIPTION,
    AttackType.PromptInjection: PROMPT_INJECTION_DESCRIPTION,
    AttackType.Other: "None of the above",
}

ASSIGN_ATTACK_TYPE_PROMPT = """\
You will be provided with an abstract of a scientific paper. \
Assess the most applicable attack type label based on the \
research focus, produced materials, and key outcomes.
{types}
If you feel like none of the types apply, you can respond with "Other".
Respond with ONLY ONE of the labels above. Do not include anything else in your response.
"""

# Model Evasion
# Model Extraction
# Model Inversion
# Model Poisoning
# Prompt Injection


def get_openai_client(token: str) -> OpenAIClient:
    """Create an OpenAI client authenticated with the given API token."""
    return OpenAI(api_key=token)


def summarize_abstract_with_openai(client: OpenAIClient, abstract: str) -> str:
    """Summarize a paper abstract into one or two concise sentences."""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SUMMARIZE_ABSTRACT_PROMPT},
            {"role": "user", "content": abstract},
        ],
        temperature=0.5,
        max_tokens=100,
    )
    return response.choices[0].message.content.strip()  # type: ignore


def get_focus_label_from_abstract(client: OpenAIClient, abstract: str) -> Focus | None:
    """Assign a Focus label to an abstract, or None if the reply is not a valid label."""
    system_prompt = ASSIGN_LABEL_PROMPT.format(
        labels="\n".join([f"- {f.value}" for f in Focus])
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": abstract},
        ],
        temperature=0.5,
        max_tokens=10,
    )
    content = response.choices[0].message.content.strip()  # type: ignore
    if content not in [f.value for f in Focus]:
        return None
    return Focus(content)


def get_attack_type_from_abstract(client: OpenAIClient, abstract: str) -> AttackType | None:
    """Assign an AttackType label to an abstract, or None if the reply is not a valid label."""
    system_prompt = ASSIGN_ATTACK_TYPE_PROMPT.format(
        types="\n".join([f"- `{t.value}`: {ATTACK_TYPE_DESCRIPTIONS[t]}" for t in AttackType])
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": abstract},
        ],
        temperature=0.5,
        max_tokens=10,
    )
    content = response.choices[0].message.content.strip()  # type: ignore
    content = content.strip("`")  # the prompt wraps labels in backticks, so drop any echoed ones
    if content not in [t.value for t in AttackType]:
        print(f"Invalid attack type: {content}")
        return None
    return AttackType(content)
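

# Usage sketch: one way these helpers might be wired together for a single abstract.
# Assumes the API key is exposed via the OPENAI_API_KEY environment variable; the
# abstract below is only a placeholder for illustration.
if __name__ == "__main__":
    import os

    client = get_openai_client(os.environ["OPENAI_API_KEY"])
    abstract = "Example abstract text describing an adversarial attack on an image classifier."

    print(summarize_abstract_with_openai(client, abstract))
    print(get_focus_label_from_abstract(client, abstract))
    print(get_attack_type_from_abstract(client, abstract))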