import copy
from typing import Optional
import requests
from loguru import logger
from pydantic import Field
from typing_extensions import Annotated
from data_juicer.utils.mm_utils import (SpecialTokens, image_byte_to_base64,
insert_texts_after_placeholders,
load_image_byte,
remove_non_special_tokens,
remove_special_tokens)
from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_IMAGES
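
# Built-in system prompts for the predefined generation modes; the 'custom'
# mode uses the `system_prompt` argument passed to the op instead.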
SYSTEM_PROMPTS = {
    'reasoning':
"You are an AI visual assistant that can analyze a single image. The task is to use the provided image, create a plausible question about the image, and provide the answer in detail.\n\nYou can create complex questions beyond describing the scene. Make the question challenging by not including the visual content details in the question so that the user needs to reason about that first.\n\nTo answer such questions, you should require first understanding the visual content, then based on the background knowledge or reasoning, either explain why the things are happening that way, or provide guides and help to user's request. \n\nPlease give the Q&A content directly and separate questions and answers with Q and A.", # noqa: E501
'description':
'You are an AI visual assistant that can analyze a single image. The task is to use the provided image, create a reasonable question that describes the content of the image, and provide the answer in detail.\n\nPlease give the Q&A content directly and separate questions and answers with Q and A.', # noqa: E501
'conversation':
'You are an AI visual assistant, and you are seeing a single image.\n\nDesign a conversation between you and a person asking about this image. The answers should be in a tone that a visual AI assistant is seeing the image and answering the question. Ask diverse questions and give corresponding answers.\n\nInclude questions asking about the visual content of the image, including the object types, counting the objects, object actions, object locations, relative positions between objects, etc. Only include questions that have definite answers:\n(1) one can see the content in the image that the question asks about and can answer confidently;\n(2) one can determine confidently from the image that it is not in the image.\nDo not ask any question that cannot be answered confidently.\n\nConversation also include complex questions that are relevant to the content in the image, for example, asking about background knowledge of the objects in the image, asking to discuss about events happening in the image, etc. Again, do not ask about uncertain details.\nProvide detailed answers when answering complex questions. For example, give detailed examples or reasoning steps to make the content more convincing and well-organized. Please give the content of the conversation directly and separate questions and answers with Q and A' # noqa: E501
}
def call_gpt_vision_api(api_key,
system_prompt,
user_prompt,
base64_image,
max_tokens=500,
temperature=1.0,
model='gpt-4-vision-preview'):
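    """
    Call the OpenAI GPT-4 Vision chat completions API with a system prompt,
    a user prompt and a single base64-encoded image, and return the generated
    text, or None if the request fails for any reason.
    """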
api_url = 'https://api.openai.com/v1/chat/completions'
headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {api_key}'
}
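    # Chat completions payload: one system message plus one user turn that
    # carries both the text prompt and the image as a base64 data URL
    # ('detail': 'low' keeps image-token usage small).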
    data = {
        'model': model,
        'messages': [{
            'role': 'system',
            'content': system_prompt
        }, {
            'role': 'user',
            'content': [{
                'type': 'text',
                'text': user_prompt
            }, {
                'type': 'image_url',
                'image_url': {
                    'url': f'data:image/jpeg;base64,{base64_image}',
                    'detail': 'low'
                }
            }]
        }],
        'max_tokens': max_tokens,
        'temperature': temperature
    }
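    # Send the request; on success the generated text is in
    # choices[0].message.content of the chat completions response.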
try:
response = requests.post(api_url, headers=headers, json=data)
response.raise_for_status()
result = response.json()
if 'choices' in result and result['choices']:
            return result['choices'][0]['message']['content']
else:
logger.warning('No results returned from the API, return None.')
return None
except requests.exceptions.HTTPError as errh:
if errh.response.status_code == 401:
logger.warning('Invalid API key provided.')
elif errh.response.status_code == 429:
logger.warning(
'API request limit has been reached. Please try again later.')
else:
logger.warning(f'HTTP error occurred: {errh}')
except requests.exceptions.ConnectionError:
logger.warning('Network error occurred. Please check your connection.')
except requests.exceptions.Timeout:
logger.warning('The request timed out. Please try again later.')
except requests.exceptions.RequestException as err:
        logger.warning(f'An error occurred: {err}')
except Exception as e:
logger.warning(f'An unexpected error occurred: {e}')
logger.warning('API request failed, return None.')
return None
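

# Example call (hypothetical API key and image bytes), mirroring how the
# mapper below invokes this helper:
#   caption = call_gpt_vision_api(
#       api_key='sk-...',
#       system_prompt=SYSTEM_PROMPTS['description'],
#       user_prompt='',
#       base64_image=image_byte_to_base64(image_bytes))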
@OPERATORS.register_module('image_captioning_from_gpt4v_mapper')
@LOADED_IMAGES.register_module('image_captioning_from_gpt4v_mapper')
class ImageCaptioningFromGPT4VMapper(Mapper):
"""Mapper to generate samples whose texts are generated based on
gpt-4-visison and the image."""
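    # this op works on whole batches: process_batched() receives a dict of
    # lists (one list per field) rather than a single sample dict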
_batched_op = True
def __init__(self,
mode: str = 'description',
api_key: str = '',
max_token: int = 500,
temperature: Annotated[float, Field(ge=0, le=1)] = 1.0,
system_prompt: str = '',
user_prompt: str = '',
user_prompt_key: Optional[str] = None,
keep_original_sample: bool = True,
any_or_all: str = 'any',
*args,
**kwargs):
"""
Initialization method.
        :param mode: mode of text generation from images; can be one of
            ['reasoning', 'description', 'conversation', 'custom'].
        :param api_key: the API key to authenticate the request.
        :param max_token: the maximum number of tokens to generate.
            Default is 500.
        :param temperature: controls the randomness of the output (range
            from 0 to 1). Default is 1.0.
        :param system_prompt: a string prompt used to set the context of the
            conversation and provide global guidance or rules for GPT-4
            Vision so that it generates responses in the expected way. This
            parameter is only used when `mode` is set to `custom`.
        :param user_prompt: a string prompt to guide the generation of
            GPT-4 Vision for each sample. It's '' by default, which means no
            prompt is provided.
        :param user_prompt_key: the key name of the field in samples that
            stores the prompt for each sample. It's used to set different
            prompts for different samples. If it's None, the prompt in the
            parameter `user_prompt` is used. It's None by default.
        :param keep_original_sample: whether to keep the original sample. If
            it's set to False, only the generated samples are kept in the
            final dataset and the original samples are removed. It's True
            by default.
        :param any_or_all: keep this sample with 'any' or 'all' strategy of
            all images. 'any': keep this sample if generation succeeds for
            any image. 'all': keep this sample only if generation succeeds
            for all images.
:param args: extra args
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
        if mode not in ['reasoning', 'description', 'conversation', 'custom']:
            raise ValueError(
                f'Mode [{mode}] is not supported. '
                f'Can only be one of '
                f'["reasoning", "description", "conversation", "custom"].')
        if mode == 'custom':
            self.system_prompt = system_prompt
            logger.info('The parameter `mode` is set to `[custom]`. '
                        'Data-Juicer will use `system_prompt` to generate '
                        'text.')
        else:
            self.system_prompt = SYSTEM_PROMPTS[mode]
            logger.info(
                f'The parameter `mode` is set to [{mode}]. Data-Juicer will '
                f'use the default prompt to generate text.')
self.mode = mode
self.api_key = api_key
self.max_token = max_token
self.temperature = temperature
self.user_prompt = user_prompt
self.user_prompt_key = user_prompt_key
self.keep_original_sample = keep_original_sample
self.any_or_all = any_or_all
self.extra_args = kwargs
# report a warning when both user_prompt and user_prompt_key are set
if self.user_prompt and self.user_prompt_key:
logger.warning(
'Both the parameter `user_prompt` and `user_prompt_key` are '
'set. Data-Juicer will consider `user_prompt_key` first.')
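
    # Turn one original sample into at most one generated sample; return []
    # when the sample has no images or the any/all keep condition fails.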
def _process_single_sample(self, sample):
# there is no image in this sample
if self.image_key not in sample or not sample[self.image_key]:
return []
# the generated results
generated_sample = copy.deepcopy(sample)
generated_sample[self.text_key] = ''
# load all image(s)
loaded_image_keys = sample[self.image_key]
images = {}
for loaded_image_key in loaded_image_keys:
if loaded_image_key not in images:
# avoid loading the same images
image = load_image_byte(loaded_image_key)
images[loaded_image_key] = image
# construct user prompts
if self.user_prompt_key and isinstance(sample[self.user_prompt_key],
str):
# check user_prompt_key is not None, and it's a str in the sample
prompt_texts = sample[self.user_prompt_key]
elif self.user_prompt and isinstance(self.user_prompt, str):
# check prompt is not None, and it's a str
prompt_texts = self.user_prompt
else:
prompt_texts = ''
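
        # `offset` tracks how many image placeholders have been consumed so
        # far, so every SpecialTokens.image token in a chunk is paired with
        # the corresponding entry in `loaded_image_keys`.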
offset = 0
# do generation for each image chunk by chunk
for chunk in sample[self.text_key].split(SpecialTokens.eoc):
# skip empty chunks or contents after the last eoc token
if not chunk.strip():
continue
else:
img_count = chunk.count(SpecialTokens.image)
text_with_only_special_tokens = remove_non_special_tokens(
chunk)
generated_text_single_chunk = []
for image_key in loaded_image_keys[offset:offset + img_count]:
image = images[image_key]
res = call_gpt_vision_api(self.api_key, self.system_prompt,
prompt_texts,
image_byte_to_base64(image),
self.max_token, self.temperature)
generated_text_single_chunk.append(res)
if self.any_or_all == 'all' and not all(
generated_text_single_chunk):
return []
# insert the generated text according to given mode
place_holders = [SpecialTokens.image] * img_count
new_generated_text_per_chunk = insert_texts_after_placeholders(
original_string=text_with_only_special_tokens,
placeholders=place_holders,
new_texts=generated_text_single_chunk)
            generated_sample[self.text_key] += \
                f'{new_generated_text_per_chunk}{SpecialTokens.eoc}'
offset += img_count
if self.any_or_all == 'any' and not remove_special_tokens(
generated_sample[self.text_key]):
return []
return [generated_sample]
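
    # Batched samples arrive as a dict of lists (one list per field); convert
    # to a list of per-sample dicts, generate for each sample, then convert
    # back before returning.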
def process_batched(self, samples):
# reconstruct samples from "dict of lists" to "list of dicts"
reconstructed_samples = []
for i in range(len(samples[self.text_key])):
reconstructed_samples.append(
{key: samples[key][i]
for key in samples})
samples_after_generation = []
# do generation for each sample within the batch
for ori_sample in reconstructed_samples:
if self.keep_original_sample:
samples_after_generation.append(ori_sample)
generated_samples = self._process_single_sample(ori_sample)
if len(generated_samples) != 0:
samples_after_generation.extend(generated_samples)
        # reconstruct samples from "list of dicts" to "dict of lists"
        if not samples_after_generation:
            # nothing was kept or generated for this batch
            return {key: [] for key in samples}
        keys = samples_after_generation[0].keys()
res_samples = {}
for key in keys:
res_samples[key] = [s[key] for s in samples_after_generation]
return res_samples
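

# A minimal usage sketch (hypothetical paths and key values; assumes the
# default 'text'/'images' sample keys and a valid OpenAI API key):
#   op = ImageCaptioningFromGPT4VMapper(mode='description', api_key='sk-...')
#   batch = {
#       'text': [f'{SpecialTokens.image} a photo {SpecialTokens.eoc}'],
#       'images': [['path/to/image.jpg']],
#   }
#   new_batch = op.process_batched(batch)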