Add mllm_mapper #400

Draft: wants to merge 13 commits into base: main
6 changes: 6 additions & 0 deletions configs/config_all.yaml
@@ -101,6 +101,12 @@ process:
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- mllm_mapper: # use MLLMs for visual question answering tasks
hf_model: 'liuhaotian/llava-v1.6-vicuna-7b' # model name of the MLLM on huggingface
max_new_tokens: 256 # the maximum number of new tokens generated by the model
temperature: 0.2 # used to control the randomness of generated text
      top_p: null # nucleus sampling: sample the next token from the smallest set of tokens whose cumulative probability reaches p
      num_beams: 1 # beam size for beam search; larger values usually improve quality at the cost of speed
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
sequential: false # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
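For reference, the YAML entry above maps one-to-one onto the constructor of the operator introduced later in this PR; a minimal sketch of the equivalent direct construction (instantiating it may trigger a download of the LLaVA checkpoint from HuggingFace, and YAML `null` arrives in Python as `None`):

from data_juicer.ops.mapper.mllm_mapper import MllmMapper

# Same values as the `- mllm_mapper:` block in configs/config_all.yaml.
op = MllmMapper(
    hf_model='liuhaotian/llava-v1.6-vicuna-7b',
    max_new_tokens=256,
    temperature=0.2,
    top_p=None,  # YAML `null` -> Python None
    num_beams=1,
)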
4 changes: 3 additions & 1 deletion data_juicer/ops/mapper/__init__.py
@@ -4,7 +4,7 @@
clean_ip_mapper, clean_links_mapper, expand_macro_mapper,
extract_qa_mapper, fix_unicode_mapper, image_blur_mapper,
image_captioning_from_gpt4v_mapper, image_captioning_mapper,
image_diffusion_mapper, image_face_blur_mapper,
image_diffusion_mapper, image_face_blur_mapper, mllm_mapper,
nlpaug_en_mapper, nlpcda_zh_mapper,
punctuation_normalization_mapper, remove_bibliography_mapper,
remove_comments_mapper, remove_header_mapper,
@@ -39,6 +39,7 @@
from .image_captioning_mapper import ImageCaptioningMapper
from .image_diffusion_mapper import ImageDiffusionMapper
from .image_face_blur_mapper import ImageFaceBlurMapper
from .mllm_mapper import MllmMapper
from .nlpaug_en_mapper import NlpaugEnMapper
from .nlpcda_zh_mapper import NlpcdaZhMapper
from .punctuation_normalization_mapper import PunctuationNormalizationMapper
@@ -118,6 +119,7 @@
'AudioFFmpegWrappedMapper',
'VideoSplitByDurationMapper',
'VideoFaceBlurMapper',
'MllmMapper'
]

# yapf: enable
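With the import and `__all__` entry above in place, the new operator is exported from the mapper package; a quick sanity check (assuming data_juicer is installed from this branch):

from data_juicer.ops.mapper import MllmMapper

# The class should resolve without an ImportError; it is also registered
# under the op name 'mllm_mapper' via the OPERATORS registry.
print(MllmMapper.__name__)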
110 changes: 110 additions & 0 deletions data_juicer/ops/mapper/mllm_mapper.py
@@ -0,0 +1,110 @@
import torch

from data_juicer.ops.base_op import OPERATORS, Mapper
from data_juicer.ops.op_fusion import LOADED_IMAGES
from data_juicer.utils.mm_utils import load_image
from data_juicer.utils.model_utils import get_model, prepare_model

OP_NAME = 'mllm_mapper'
torch.set_num_threads(1)


@LOADED_IMAGES.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class MllmMapper(Mapper):
"""Mapper to use MLLMs for visual question answering tasks.
Recommended model list: [
liuhaotian/llava-v1.6-vicuna-7b
]
"""
_accelerator = 'cuda'

def __init__(self,
hf_model: str = 'liuhaotian/llava-v1.6-vicuna-7b',
max_new_tokens=256,
temperature=0.2,
top_p=None,
num_beams=1,
*args,
**kwargs):
"""
Initialization method.
        :param hf_model: huggingface model id.
:param max_new_tokens: the maximum number of new tokens
generated by the model.
:param temperature: used to control the randomness of \
generated text. The higher the temperature, the more \
random and creative the generated text will be.
        :param top_p: nucleus sampling threshold: the next token is \
            sampled from the smallest set of tokens whose cumulative \
            probability reaches p.
        :param num_beams: beam size used for beam search; larger values \
            usually improve generation quality at the cost of speed.
:param args: extra args
:param kwargs: extra args
"""
super().__init__(*args, num_proc=1, **kwargs)

self.hf_model = hf_model
self.model_key = prepare_model(model_type='huggingface',
pretrained_model_name_or_path=hf_model)
self.max_new_tokens = max_new_tokens
self.temperature = temperature
self.top_p = top_p
self.num_beams = num_beams

def process(self, sample=None, rank=None):

# there is no image in this sample
if self.image_key not in sample or not sample[self.image_key]:
return sample

# load images
loaded_image_keys = sample[self.image_key]
images = {}
for loaded_image_key in loaded_image_keys:
if loaded_image_key not in images:
                # avoid loading the same image multiple times
image = load_image(loaded_image_key)
images[loaded_image_key] = image

model, processor = get_model(model_key=self.model_key,
rank=rank,
use_cuda=self.use_cuda())

conversation = [
{
'role':
'user',
'content': [
{
'type': 'text',
'text': sample[self.text_key]
},
{
'type': 'image'
},
],
},
]
prompt = processor.apply_chat_template(conversation,
add_generation_prompt=True)

sample[self.text_key] = []

for image_key in images:
inputs = processor(images=images[image_key],
text=prompt,
return_tensors='pt').to(model.device)

response = model.generate(**inputs,
max_new_tokens=self.max_new_tokens,
temperature=self.temperature,
top_p=self.top_p,
num_beams=self.num_beams)

output = processor.decode(response.cpu()[0],
skip_special_tokens=True)

sample[self.text_key].append(output)

return sample
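A minimal end-to-end sketch of calling the new operator on a single sample (the image path and question are illustrative; running this needs a GPU with enough memory for the 7B LLaVA checkpoint):

from data_juicer.ops.mapper.mllm_mapper import MllmMapper

op = MllmMapper(hf_model='liuhaotian/llava-v1.6-vicuna-7b',
                max_new_tokens=256,
                temperature=0.2,
                num_beams=1)

# `text` carries the question and `images` a list of image paths,
# matching the operator's default text_key/image_key.
sample = {
    'text': 'Describe this image.',
    'images': ['./example.jpg'],  # hypothetical local image
}

result = op.process(sample)
# The text field is replaced by a list with one decoded model output per image.
print(result['text'])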
3 changes: 2 additions & 1 deletion docs/Operators.md
@@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types.
| Type | Number | Description |
|-----------------------------------|:------:|-------------------------------------------------|
| [ Formatter ]( #formatter ) | 7 | Discovers, loads, and canonicalizes source data |
| [ Mapper ]( #mapper ) | 43 | Edits and transforms samples |
| [ Mapper ]( #mapper ) | 44 | Edits and transforms samples |
| [ Filter ]( #filter ) | 41 | Filters out low-quality samples |
| [ Deduplicator ]( #deduplicator ) | 5 | Detects and removes duplicate samples |
| [ Selector ]( #selector ) | 4 | Selects top samples based on ranking |
@@ -64,6 +64,7 @@ All the specific operators are listed below, each featured with several capabilities.
| image_captioning_mapper | Multimodal | - | generate samples whose captions are generated based on another model (such as blip2) and the figure within the original sample |
| image_diffusion_mapper | Multimodal | - | Generate and augment images by stable diffusion model |
| image_face_blur_mapper | Image | - | Blur faces detected in images |
| mllm_mapper | Multimodal | en, zh | Use multimodal large language models for image-text question answering tasks |
| nlpaug_en_mapper | General | en | Simply augments texts in English based on the `nlpaug` library |
| nlpcda_zh_mapper | General | zh | Simply augments texts in Chinese based on the `nlpcda` library |
| punctuation_normalization_mapper | General | en, zh | Normalizes various Unicode punctuations to their ASCII equivalents |
3 changes: 2 additions & 1 deletion docs/Operators_ZH.md
@@ -11,7 +11,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
| 类型 | 数量 | 描述 |
|------------------------------------|:--:|---------------|
| [ Formatter ]( #formatter ) | 7 | 发现、加载、规范化原始数据 |
| [ Mapper ]( #mapper ) | 43 | 对数据样本进行编辑和转换 |
| [ Mapper ]( #mapper ) | 44 | 对数据样本进行编辑和转换 |
| [ Filter ]( #filter ) | 41 | 过滤低质量样本 |
| [ Deduplicator ]( #deduplicator ) | 5 | 识别、删除重复样本 |
| [ Selector ]( #selector ) | 4 | 基于排序选取高质量样本 |
@@ -63,6 +63,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
| image_captioning_mapper | Multimodal | - | 生成样本,其标题是根据另一个辅助模型(例如 blip2)和原始样本中的图形生成的。 |
| image_diffusion_mapper | Multimodal | - | 用stable diffusion生成图像,对图像进行增强 |
| image_face_blur_mapper | Image | - | 对图像中的人脸进行模糊处理 |
| mllm_mapper | Multimodal | en, zh | 使用多模态大语言模型执行图文问答任务 |
| nlpaug_en_mapper | General | en | 使用`nlpaug`库对英语文本进行简单增强 |
| nlpcda_zh_mapper | General | zh | 使用`nlpcda`库对中文文本进行简单增强 |
| punctuation_normalization_mapper | General | en, zh | 将各种 Unicode 标点符号标准化为其 ASCII 等效项 |
36 changes: 36 additions & 0 deletions tests/ops/mapper/test_mllm_mapper.py
@@ -0,0 +1,36 @@
import os
import unittest
from data_juicer.ops.mapper.mllm_mapper import MllmMapper
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase

class MllmMapperTest(DataJuicerTestCaseBase):

text_key = 'text'
image_key = "images"

def _run_mllm(self, enable_vllm=False):
op = MllmMapper(
hf_model='liuhaotian/llava-v1.6-vicuna-7b',
temperature=0.9,
top_p=0.95,
max_new_tokens=512
)

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
'data')
img2_path = os.path.join(data_path, 'img2.jpg')
img3_path = os.path.join(data_path, 'img3.jpg')

samples = [
{self.text_key: 'Describe this image.', self.image_key: [img2_path, img3_path]},
]

for sample in samples:
result = op.process(sample)
print(f'Output results: {result}')

def test_mllm(self):
self._run_mllm()


if __name__ == '__main__':
unittest.main()
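Since the test module ends with unittest.main(), it can be run directly on a GPU machine (the first run will download the LLaVA checkpoint):

python tests/ops/mapper/test_mllm_mapper.py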