From ac691be322ac14841b1e4643331473d7b3dfd196 Mon Sep 17 00:00:00 2001 From: Junming Yang <60545459+junming-yang@users.noreply.github.com> Date: Fri, 19 Jul 2024 17:38:18 +0800 Subject: [PATCH 1/9] update --- requirements.txt | 1 + vlmeval/vlm/cogvlm.py | 32 +++++++++++++++++++++++----- vlmeval/vlm/emu.py | 2 +- vlmeval/vlm/internvl_chat.py | 41 ++++++++++++++++++++++++++++++------ 4 files changed, 63 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index cfec36885..c4fc42366 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,3 +30,4 @@ validators visual_genome xlsxwriter xtuner +accelerate \ No newline at end of file diff --git a/vlmeval/vlm/cogvlm.py b/vlmeval/vlm/cogvlm.py index d5d1ece94..963d90c7b 100644 --- a/vlmeval/vlm/cogvlm.py +++ b/vlmeval/vlm/cogvlm.py @@ -51,11 +51,33 @@ class CogVlm(BaseModel): def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=None, **kwargs): assert model_path is not None - model = AutoModelForCausalLM.from_pretrained( - model_path, - torch_dtype=torch.bfloat16, - trust_remote_code=True, - ).to('cuda').eval() + from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model + + with init_empty_weights(): + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + + local_rank = int(os.environ.get('LOCAL_RANK', 0)) + device_num = torch.cuda.device_count() + + device_1 = local_rank + device_2 = local_rank + device_num // 2 + no_split_module = model._no_split_modules + + device_map = infer_auto_device_map( + model, + max_memory={ + device_1: '22GiB', + device_2: '22GiB' + }, + no_split_module_classes=no_split_module) + model = dispatch_model( + model, + device_map=device_map).eval() self.kwargs = kwargs if tokenizer_name: diff --git a/vlmeval/vlm/emu.py b/vlmeval/vlm/emu.py index 1051c799b..ce8b1b141 100644 --- a/vlmeval/vlm/emu.py +++ b/vlmeval/vlm/emu.py @@ -21,7 +21,7 @@ def __init__(self, from transformers import AutoModelForCausalLM, AutoTokenizer from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model - local_rank = os.environ.get('LOCAL_RANK', 0) + local_rank = int(os.environ.get('LOCAL_RANK', 0)) device_num = torch.cuda.device_count() assert local_rank * 2 <= device_num, 'The number of devices does not match the world size' diff --git a/vlmeval/vlm/internvl_chat.py b/vlmeval/vlm/internvl_chat.py index 563714357..003e4ea15 100644 --- a/vlmeval/vlm/internvl_chat.py +++ b/vlmeval/vlm/internvl_chat.py @@ -149,13 +149,40 @@ def __init__(self, model_path='OpenGVLab/InternVL-Chat-V1-5', load_in_8bit=False else: device = torch.cuda.current_device() self.device = device - self.model = AutoModel.from_pretrained( - model_path, - torch_dtype=torch.bfloat16, - trust_remote_code=True, - load_in_8bit=load_in_8bit).eval() - if not load_in_8bit: - self.model = self.model.to(device) + from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model + + self.model_path = model_path + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False) + + with init_empty_weights(): + model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, + trust_remote_code=True, + load_in_8bit=load_in_8bit).eval() + + local_rank = int(os.environ.get('LOCAL_RANK', 0)) + device_num = torch.cuda.device_count() + + device_1 = local_rank + device_2 = local_rank + device_num // 2 + no_split_module = model._no_split_modules + + 
device_map = infer_auto_device_map( + model, + max_memory={ + device_1: '40GiB', + device_2: '40GiB' + }, + no_split_module_classes=no_split_module) + + model = dispatch_model( + model, + device_map=device_map, + offload_folder="offload").eval() + + if not load_in_8bit: + self.model = self.model.to(device) + + self.model = model self.image_size = self.model.config.vision_config.image_size self.version = version From acc0a02087b69ff63e4defbe54dfeac681f10984 Mon Sep 17 00:00:00 2001 From: Junming Yang <60545459+junming-yang@users.noreply.github.com> Date: Mon, 22 Jul 2024 20:02:58 +0800 Subject: [PATCH 2/9] update --- vlmeval/vlm/internvl_chat.py | 7 +++---- vlmeval/vlm/omnilmm.py | 24 ++++++++++++++++++++++-- vlmeval/vlm/pandagpt.py | 27 ++++++++++++++++++++++++--- 3 files changed, 49 insertions(+), 9 deletions(-) diff --git a/vlmeval/vlm/internvl_chat.py b/vlmeval/vlm/internvl_chat.py index 003e4ea15..d9f1558a5 100644 --- a/vlmeval/vlm/internvl_chat.py +++ b/vlmeval/vlm/internvl_chat.py @@ -154,10 +154,9 @@ def __init__(self, model_path='OpenGVLab/InternVL-Chat-V1-5', load_in_8bit=False self.model_path = model_path self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False) - with init_empty_weights(): - model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, - trust_remote_code=True, - load_in_8bit=load_in_8bit).eval() + model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, + trust_remote_code=True, + load_in_8bit=load_in_8bit).eval() local_rank = int(os.environ.get('LOCAL_RANK', 0)) device_num = torch.cuda.device_count() diff --git a/vlmeval/vlm/omnilmm.py b/vlmeval/vlm/omnilmm.py index f5c4ea414..faa4e2e9e 100644 --- a/vlmeval/vlm/omnilmm.py +++ b/vlmeval/vlm/omnilmm.py @@ -6,6 +6,7 @@ from .base import BaseModel from ..smp import * from ..dataset import DATASET_TYPE +from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model DEFAULT_IMAGE_TOKEN = '' @@ -73,7 +74,26 @@ class OmniLMM12B(BaseModel): def __init__(self, model_path, root, **kwargs) -> None: sys.path.append(root) - model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path) + with init_empty_weights(): + model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path) + + local_rank = int(os.environ.get('LOCAL_RANK', 0)) + device_num = torch.cuda.device_count() + + device_1 = local_rank + device_2 = local_rank + device_num // 2 + device_map = infer_auto_device_map( + model, + max_memory={ + device_1: '22GiB', + device_2: '22GiB' + }, + no_split_module_classes=['Eva','MistralDecoderLayer', 'ModuleList', 'Resampler']) + print(device_map) + model = dispatch_model( + model, + device_map=device_map).eval() + self.model = model self.image_token_len = image_token_len self.image_transform = img_processor @@ -90,7 +110,7 @@ def __init__(self, model_path, root, **kwargs) -> None: torch.cuda.empty_cache() def generate_inner(self, message, dataset=None): - prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + prompt, image_path = self.message_to_promptimg(message) try: image = Image.open(image_path).convert('RGB') except: diff --git a/vlmeval/vlm/pandagpt.py b/vlmeval/vlm/pandagpt.py index 47821de7e..57d5f06f4 100644 --- a/vlmeval/vlm/pandagpt.py +++ b/vlmeval/vlm/pandagpt.py @@ -3,7 +3,8 @@ import os.path as osp import warnings from .base import BaseModel - +from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model +import os class PandaGPT(BaseModel): @@ -40,14 
+41,34 @@ def __init__(self, name, root=None, **kwargs): delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu')) model.load_state_dict(delta_ckpt, strict=False) torch.cuda.empty_cache() - self.model = model.eval().half().cuda() + # self.model = model.eval().half().cuda() + + local_rank = int(os.environ.get('LOCAL_RANK', 0)) + device_num = torch.cuda.device_count() + + device_1 = local_rank + device_2 = local_rank + device_num // 2 + + device_map = infer_auto_device_map( + model, + max_memory={ + device_1: '32GiB', + device_2: '32GiB' + }, + no_split_module_classes=['LlamaDecoderLayer', 'VisionTransformer']) + device_map['llama_model.base_model.model.lm_head'] = device_map['llama_proj'] = device_1 + print(device_map) + model = dispatch_model( + model, + device_map=device_map).eval() + self.model = model kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001} kwargs_default.update(kwargs) self.kwargs = kwargs_default warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') def generate_inner(self, message, dataset=None): - prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + prompt, image_path = self.message_to_promptimg(message) struct = { 'prompt': prompt, 'image_paths': [image_path], From 609c0e00ffef8ed83cd95350944804650ef8b4ee Mon Sep 17 00:00:00 2001 From: Junming Yang <60545459+junming-yang@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:07:08 +0800 Subject: [PATCH 3/9] update --- requirements.txt | 2 +- vlmeval/smp/vlm.py | 41 ++++++++++++++++++++++++++++++++++++ vlmeval/vlm/cogvlm.py | 6 +++--- vlmeval/vlm/internvl_chat.py | 10 ++++----- vlmeval/vlm/omnilmm.py | 4 ++-- vlmeval/vlm/pandagpt.py | 2 +- 6 files changed, 53 insertions(+), 12 deletions(-) diff --git a/requirements.txt b/requirements.txt index c4fc42366..32f09ab51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +accelerate einops gradio huggingface_hub @@ -30,4 +31,3 @@ validators visual_genome xlsxwriter xtuner -accelerate \ No newline at end of file diff --git a/vlmeval/smp/vlm.py b/vlmeval/smp/vlm.py index 4241e0e2a..4c06b6b44 100644 --- a/vlmeval/smp/vlm.py +++ b/vlmeval/smp/vlm.py @@ -8,6 +8,7 @@ import base64 from PIL import Image Image.MAX_IMAGE_PIXELS = 1e9 +import torch def rescale_img(img, tgt=None): @@ -169,3 +170,43 @@ def circular_pred(df, extract_func=None): flag_map = {k: v for k, v in flag_map.items() if valid_map[k]} flags = list(flag_map.values()) return np.mean(flags) + + +def get_memory(): + total_memory = torch.cuda.get_device_properties(0).total_memory + total_mem = total_memory / 1024 / 1024 / 1024 + return total_mem + + +def build_device_map(model, defualt_map=None, no_split=None): + total_num_gpus = torch.cuda.device_count() + rank, world_size = get_rank_and_world_size() + + alpha = 1 if world_size == total_num_gpus else 0.96 + beta = 1 if world_size == total_num_gpus else 0.8 + num_gpus = total_num_gpus // world_size + memory_map = {} + per_gpu_mem = get_memory() * alpha + memory_map.update({rank: f'{beta * per_gpu_mem:.2f}GiB'}) + for gpu_id in range(1, num_gpus): + memory_map.update({rank + gpu_id * world_size: f'{per_gpu_mem:.2f}GiB'}) + + no_split_module = model._no_split_modules + no_split_module = no_split_module.extend(no_split) if no_split is not None else no_split_module + device_map = infer_auto_device_map( + model, + max_memory=memory_map, + no_split_module_classes=no_split_module + ) + if no_split is not None: + for i in no_split: + 
device_map[i] = rank + + try: + model = dispatch_model( + model, + device_map=device_map).eval() + except: + assert model is not None, f"Model can not be loaded to {world_size} process with {get_memory() * total_num_gpus} GiB, + try to decrease --proc-per-node or increase gpu memory." + return model, device_map diff --git a/vlmeval/vlm/cogvlm.py b/vlmeval/vlm/cogvlm.py index 963d90c7b..b8aa36c20 100644 --- a/vlmeval/vlm/cogvlm.py +++ b/vlmeval/vlm/cogvlm.py @@ -52,7 +52,7 @@ class CogVlm(BaseModel): def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=None, **kwargs): assert model_path is not None from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model - + with init_empty_weights(): model = AutoModelForCausalLM.from_pretrained( model_path, @@ -60,14 +60,14 @@ def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=No low_cpu_mem_usage=True, trust_remote_code=True, ) - + local_rank = int(os.environ.get('LOCAL_RANK', 0)) device_num = torch.cuda.device_count() device_1 = local_rank device_2 = local_rank + device_num // 2 no_split_module = model._no_split_modules - + device_map = infer_auto_device_map( model, max_memory={ diff --git a/vlmeval/vlm/internvl_chat.py b/vlmeval/vlm/internvl_chat.py index d9f1558a5..59e3d950b 100644 --- a/vlmeval/vlm/internvl_chat.py +++ b/vlmeval/vlm/internvl_chat.py @@ -150,10 +150,10 @@ def __init__(self, model_path='OpenGVLab/InternVL-Chat-V1-5', load_in_8bit=False device = torch.cuda.current_device() self.device = device from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model - + self.model_path = model_path self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False) - + model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True, load_in_8bit=load_in_8bit).eval() @@ -164,7 +164,7 @@ def __init__(self, model_path='OpenGVLab/InternVL-Chat-V1-5', load_in_8bit=False device_1 = local_rank device_2 = local_rank + device_num // 2 no_split_module = model._no_split_modules - + device_map = infer_auto_device_map( model, max_memory={ @@ -172,11 +172,11 @@ def __init__(self, model_path='OpenGVLab/InternVL-Chat-V1-5', load_in_8bit=False device_2: '40GiB' }, no_split_module_classes=no_split_module) - + model = dispatch_model( model, device_map=device_map, - offload_folder="offload").eval() + offload_folder='offload').eval() if not load_in_8bit: self.model = self.model.to(device) diff --git a/vlmeval/vlm/omnilmm.py b/vlmeval/vlm/omnilmm.py index faa4e2e9e..dae7dc2c8 100644 --- a/vlmeval/vlm/omnilmm.py +++ b/vlmeval/vlm/omnilmm.py @@ -76,7 +76,7 @@ def __init__(self, model_path, root, **kwargs) -> None: sys.path.append(root) with init_empty_weights(): model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path) - + local_rank = int(os.environ.get('LOCAL_RANK', 0)) device_num = torch.cuda.device_count() @@ -93,7 +93,7 @@ def __init__(self, model_path, root, **kwargs) -> None: model = dispatch_model( model, device_map=device_map).eval() - + self.model = model self.image_token_len = image_token_len self.image_transform = img_processor diff --git a/vlmeval/vlm/pandagpt.py b/vlmeval/vlm/pandagpt.py index 57d5f06f4..81382b4de 100644 --- a/vlmeval/vlm/pandagpt.py +++ b/vlmeval/vlm/pandagpt.py @@ -42,7 +42,7 @@ def __init__(self, name, root=None, **kwargs): model.load_state_dict(delta_ckpt, strict=False) torch.cuda.empty_cache() # self.model = model.eval().half().cuda() - + local_rank = 
int(os.environ.get('LOCAL_RANK', 0)) device_num = torch.cuda.device_count() From 722b78d6307f45706fd93a6a1e59698e85d18b3a Mon Sep 17 00:00:00 2001 From: Junming Yang <60545459+junming-yang@users.noreply.github.com> Date: Thu, 1 Aug 2024 19:32:23 +0800 Subject: [PATCH 4/9] update --- vlmeval/smp/vlm.py | 35 ++++++++++++++++----------- vlmeval/vlm/emu.py | 23 +++--------------- vlmeval/vlm/internvl_chat.py | 47 +++++++++--------------------------- vlmeval/vlm/omnilmm.py | 21 +++------------- vlmeval/vlm/pandagpt.py | 28 ++++----------------- 5 files changed, 44 insertions(+), 110 deletions(-) diff --git a/vlmeval/smp/vlm.py b/vlmeval/smp/vlm.py index 4c06b6b44..afdd441dc 100644 --- a/vlmeval/smp/vlm.py +++ b/vlmeval/smp/vlm.py @@ -7,8 +7,11 @@ import os.path as osp import base64 from PIL import Image -Image.MAX_IMAGE_PIXELS = 1e9 import torch +from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model +from .misc import get_rank_and_world_size + +Image.MAX_IMAGE_PIXELS = 1e9 def rescale_img(img, tgt=None): @@ -178,35 +181,39 @@ def get_memory(): return total_mem -def build_device_map(model, defualt_map=None, no_split=None): +def build_device_map(model, default_map=None, no_split=None, alpha=0.97, beta=0.9): total_num_gpus = torch.cuda.device_count() rank, world_size = get_rank_and_world_size() - - alpha = 1 if world_size == total_num_gpus else 0.96 - beta = 1 if world_size == total_num_gpus else 0.8 + if world_size == total_num_gpus: + return model.cuda() + num_gpus = total_num_gpus // world_size memory_map = {} - per_gpu_mem = get_memory() * alpha + per_gpu_mem = 45 * alpha memory_map.update({rank: f'{beta * per_gpu_mem:.2f}GiB'}) for gpu_id in range(1, num_gpus): memory_map.update({rank + gpu_id * world_size: f'{per_gpu_mem:.2f}GiB'}) - - no_split_module = model._no_split_modules - no_split_module = no_split_module.extend(no_split) if no_split is not None else no_split_module + if hasattr(model, '_no_split_modules'): + no_split_module = model._no_split_modules + else: + no_split_module = [] + if no_split is not None: + no_split_module = list(set((no_split_module + no_split))) device_map = infer_auto_device_map( model, max_memory=memory_map, no_split_module_classes=no_split_module ) - if no_split is not None: - for i in no_split: + if default_map is not None: + for i in default_map: device_map[i] = rank - + for value in device_map.values(): + assert value != 'disk', 'Please check and make sure to have enough memory to load model.' try: model = dispatch_model( model, device_map=device_map).eval() except: - assert model is not None, f"Model can not be loaded to {world_size} process with {get_memory() * total_num_gpus} GiB, - try to decrease --proc-per-node or increase gpu memory." 
+ assert model is not None, f"""Model can not be loaded to {world_size} process with {get_memory() * total_num_gpus} GiB, + try to decrease --proc-per-node or increase gpu memory.""" return model, device_map diff --git a/vlmeval/vlm/emu.py b/vlmeval/vlm/emu.py index ce8b1b141..0088fe5f8 100644 --- a/vlmeval/vlm/emu.py +++ b/vlmeval/vlm/emu.py @@ -27,12 +27,6 @@ def __init__(self, assert local_rank * 2 <= device_num, 'The number of devices does not match the world size' assert device_num >= 2, 'You need at least 2 GPUs to use EMU' - device_1 = local_rank - device_2 = local_rank + device_num // 2 - - torch.cuda.set_device(device_1) - torch.cuda.set_device(device_2) - tokenizer = AutoTokenizer.from_pretrained(model_path) # "BAAI/Emu2-Chat" self.tokenizer = tokenizer with init_empty_weights(): @@ -42,20 +36,9 @@ def __init__(self, low_cpu_mem_usage=True, trust_remote_code=True) - device_map = infer_auto_device_map( - model, - max_memory={ - device_1: '38GiB', - device_2: '38GiB' - }, - no_split_module_classes=['Block', 'LlamaDecoderLayer']) - - # input and output logits should be on same device - device_map['model.decoder.lm.lm_head'] = device_1 - - model = dispatch_model( - model, - device_map=device_map).eval() + no_split = ['Block', 'LlamaDecoderLayer'] + default_map = ['model.decoder.lm.lm_head'] + model, _ = build_device_map(model, default_map, no_split) self.model = model kwargs_default = dict(max_new_tokens=512, length_penalty=-1) diff --git a/vlmeval/vlm/internvl_chat.py b/vlmeval/vlm/internvl_chat.py index 59e3d950b..7ce7a6f4a 100644 --- a/vlmeval/vlm/internvl_chat.py +++ b/vlmeval/vlm/internvl_chat.py @@ -137,52 +137,27 @@ def __init__(self, model_path='OpenGVLab/InternVL-Chat-V1-5', load_in_8bit=False self.model_path = model_path self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False) - if listinstr(['InternVL2-Llama3-76B'], model_path): - device_map = split_model(model_path.split('/')[1]) - self.model = AutoModel.from_pretrained( + if not load_in_8bit: + model = AutoModel.from_pretrained( model_path, torch_dtype=torch.bfloat16, load_in_8bit=load_in_8bit, trust_remote_code=True, low_cpu_mem_usage=True, - device_map=device_map).eval() + device_map='cpu').eval() + default_map = [ + 'vision_model', 'mlp1', 'language_model.model.tok_embeddings', + 'language_model.model.embed_tokens', 'language_model.output', + 'language_model.model.norm', 'language_model.lm_head' + ] + model, _ = build_device_map(model, default_map) else: - device = torch.cuda.current_device() - self.device = device - from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model - - self.model_path = model_path - self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False) - model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True, load_in_8bit=load_in_8bit).eval() - - local_rank = int(os.environ.get('LOCAL_RANK', 0)) - device_num = torch.cuda.device_count() - - device_1 = local_rank - device_2 = local_rank + device_num // 2 - no_split_module = model._no_split_modules - - device_map = infer_auto_device_map( - model, - max_memory={ - device_1: '40GiB', - device_2: '40GiB' - }, - no_split_module_classes=no_split_module) - - model = dispatch_model( - model, - device_map=device_map, - offload_folder='offload').eval() - - if not load_in_8bit: - self.model = self.model.to(device) - + self.device = torch.cuda.current_device() + self.model_path = model_path self.model = model - 
self.image_size = self.model.config.vision_config.image_size self.version = version self.kwargs = kwargs diff --git a/vlmeval/vlm/omnilmm.py b/vlmeval/vlm/omnilmm.py index dae7dc2c8..3e17fc02c 100644 --- a/vlmeval/vlm/omnilmm.py +++ b/vlmeval/vlm/omnilmm.py @@ -76,23 +76,10 @@ def __init__(self, model_path, root, **kwargs) -> None: sys.path.append(root) with init_empty_weights(): model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path) - - local_rank = int(os.environ.get('LOCAL_RANK', 0)) - device_num = torch.cuda.device_count() - - device_1 = local_rank - device_2 = local_rank + device_num // 2 - device_map = infer_auto_device_map( - model, - max_memory={ - device_1: '22GiB', - device_2: '22GiB' - }, - no_split_module_classes=['Eva','MistralDecoderLayer', 'ModuleList', 'Resampler']) - print(device_map) - model = dispatch_model( - model, - device_map=device_map).eval() + + default_map = ['lm_head', 'model.norm', 'model.resampler', 'model.layers'] + no_split = ['Eva','MistralDecoderLayer', 'ModuleList', 'Resampler'] + model, _ = build_device_map(model, default_map, no_split) self.model = model self.image_token_len = image_token_len diff --git a/vlmeval/vlm/pandagpt.py b/vlmeval/vlm/pandagpt.py index 81382b4de..0c85e6591 100644 --- a/vlmeval/vlm/pandagpt.py +++ b/vlmeval/vlm/pandagpt.py @@ -3,8 +3,7 @@ import os.path as osp import warnings from .base import BaseModel -from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model -import os +from ..smp import * class PandaGPT(BaseModel): @@ -41,27 +40,10 @@ def __init__(self, name, root=None, **kwargs): delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu')) model.load_state_dict(delta_ckpt, strict=False) torch.cuda.empty_cache() - # self.model = model.eval().half().cuda() - - local_rank = int(os.environ.get('LOCAL_RANK', 0)) - device_num = torch.cuda.device_count() - - device_1 = local_rank - device_2 = local_rank + device_num // 2 - - device_map = infer_auto_device_map( - model, - max_memory={ - device_1: '32GiB', - device_2: '32GiB' - }, - no_split_module_classes=['LlamaDecoderLayer', 'VisionTransformer']) - device_map['llama_model.base_model.model.lm_head'] = device_map['llama_proj'] = device_1 - print(device_map) - model = dispatch_model( - model, - device_map=device_map).eval() - self.model = model + default_map = ['llama_model.base_model.model.lm_head', 'llama_proj'] + no_split_list = ['LlamaDecoderLayer', 'VisionTransformer'] + model, _ = build_device_map(model, default_map, no_split_list) + self.model = model.eval() kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001} kwargs_default.update(kwargs) self.kwargs = kwargs_default From 42acc727f049df5acd9fe3bab0835273cde0d67c Mon Sep 17 00:00:00 2001 From: Junming Yang <60545459+junming-yang@users.noreply.github.com> Date: Thu, 1 Aug 2024 19:35:06 +0800 Subject: [PATCH 5/9] update --- vlmeval/smp/vlm.py | 13 +++++-------- vlmeval/vlm/internvl_chat.py | 9 +++++---- vlmeval/vlm/omnilmm.py | 4 ++-- vlmeval/vlm/pandagpt.py | 3 +++ 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/vlmeval/smp/vlm.py b/vlmeval/smp/vlm.py index afdd441dc..6c33c15f7 100644 --- a/vlmeval/smp/vlm.py +++ b/vlmeval/smp/vlm.py @@ -186,7 +186,7 @@ def build_device_map(model, default_map=None, no_split=None, alpha=0.97, beta=0. 
rank, world_size = get_rank_and_world_size() if world_size == total_num_gpus: return model.cuda() - + num_gpus = total_num_gpus // world_size memory_map = {} per_gpu_mem = 45 * alpha @@ -209,11 +209,8 @@ def build_device_map(model, default_map=None, no_split=None, alpha=0.97, beta=0. device_map[i] = rank for value in device_map.values(): assert value != 'disk', 'Please check and make sure to have enough memory to load model.' - try: - model = dispatch_model( - model, - device_map=device_map).eval() - except: - assert model is not None, f"""Model can not be loaded to {world_size} process with {get_memory() * total_num_gpus} GiB, - try to decrease --proc-per-node or increase gpu memory.""" + + model = dispatch_model( + model, + device_map=device_map).eval() return model, device_map diff --git a/vlmeval/vlm/internvl_chat.py b/vlmeval/vlm/internvl_chat.py index 7ce7a6f4a..c3ed34358 100644 --- a/vlmeval/vlm/internvl_chat.py +++ b/vlmeval/vlm/internvl_chat.py @@ -146,15 +146,16 @@ def __init__(self, model_path='OpenGVLab/InternVL-Chat-V1-5', load_in_8bit=False low_cpu_mem_usage=True, device_map='cpu').eval() default_map = [ - 'vision_model', 'mlp1', 'language_model.model.tok_embeddings', + 'vision_model', 'mlp1', 'language_model.model.tok_embeddings', 'language_model.model.embed_tokens', 'language_model.output', 'language_model.model.norm', 'language_model.lm_head' ] model, _ = build_device_map(model, default_map) else: - model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, - trust_remote_code=True, - load_in_8bit=load_in_8bit).eval() + model = AutoModel.from_pretrained( + model_path, torch_dtype=torch.bfloat16, + trust_remote_code=True, + load_in_8bit=load_in_8bit).eval() self.device = torch.cuda.current_device() self.model_path = model_path self.model = model diff --git a/vlmeval/vlm/omnilmm.py b/vlmeval/vlm/omnilmm.py index 3e17fc02c..9b176caf6 100644 --- a/vlmeval/vlm/omnilmm.py +++ b/vlmeval/vlm/omnilmm.py @@ -76,9 +76,9 @@ def __init__(self, model_path, root, **kwargs) -> None: sys.path.append(root) with init_empty_weights(): model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path) - + default_map = ['lm_head', 'model.norm', 'model.resampler', 'model.layers'] - no_split = ['Eva','MistralDecoderLayer', 'ModuleList', 'Resampler'] + no_split = ['Eva', 'MistralDecoderLayer', 'ModuleList', 'Resampler'] model, _ = build_device_map(model, default_map, no_split) self.model = model diff --git a/vlmeval/vlm/pandagpt.py b/vlmeval/vlm/pandagpt.py index 0c85e6591..2bbf24d7a 100644 --- a/vlmeval/vlm/pandagpt.py +++ b/vlmeval/vlm/pandagpt.py @@ -5,6 +5,7 @@ from .base import BaseModel from ..smp import * + class PandaGPT(BaseModel): INSTALL_REQ = True @@ -40,9 +41,11 @@ def __init__(self, name, root=None, **kwargs): delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu')) model.load_state_dict(delta_ckpt, strict=False) torch.cuda.empty_cache() + default_map = ['llama_model.base_model.model.lm_head', 'llama_proj'] no_split_list = ['LlamaDecoderLayer', 'VisionTransformer'] model, _ = build_device_map(model, default_map, no_split_list) + self.model = model.eval() kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001} kwargs_default.update(kwargs) From 1e7e545aaec19bc9e47eb49aa76e58e164eb0f72 Mon Sep 17 00:00:00 2001 From: Junming Yang <60545459+junming-yang@users.noreply.github.com> Date: Thu, 1 Aug 2024 19:39:58 +0800 Subject: [PATCH 6/9] fix --- vlmeval/vlm/omnilmm.py | 2 +- 
vlmeval/vlm/pandagpt.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vlmeval/vlm/omnilmm.py b/vlmeval/vlm/omnilmm.py index f27721bc6..afdefe28a 100644 --- a/vlmeval/vlm/omnilmm.py +++ b/vlmeval/vlm/omnilmm.py @@ -122,7 +122,7 @@ def __init__(self, model_path, root, **kwargs) -> None: torch.cuda.empty_cache() def generate_inner(self, message, dataset=None): - prompt, image_path = self.message_to_promptimg(message) + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) try: image = Image.open(image_path).convert('RGB') except: diff --git a/vlmeval/vlm/pandagpt.py b/vlmeval/vlm/pandagpt.py index 2bbf24d7a..805a55d22 100644 --- a/vlmeval/vlm/pandagpt.py +++ b/vlmeval/vlm/pandagpt.py @@ -53,7 +53,7 @@ def __init__(self, name, root=None, **kwargs): warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') def generate_inner(self, message, dataset=None): - prompt, image_path = self.message_to_promptimg(message) + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) struct = { 'prompt': prompt, 'image_paths': [image_path], From b69b438696b6d454fb1e0188d371e66b5b191911 Mon Sep 17 00:00:00 2001 From: Junming Yang <60545459+junming-yang@users.noreply.github.com> Date: Thu, 1 Aug 2024 19:43:51 +0800 Subject: [PATCH 7/9] support cogvlm --- vlmeval/vlm/cogvlm.py | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/vlmeval/vlm/cogvlm.py b/vlmeval/vlm/cogvlm.py index b8aa36c20..12db53b11 100644 --- a/vlmeval/vlm/cogvlm.py +++ b/vlmeval/vlm/cogvlm.py @@ -27,9 +27,9 @@ def __init__(self, model_path='THUDM/glm-4v-9b', **kwargs): self.end_text_token = '<|endoftext|>' def generate_inner(self, message, dataset=None): - prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + prompt, image_path = self.message_to_promptimg(message) image = Image.open(image_path).convert('RGB') - if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']: + if dataset is not None and DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']: prompt += '\nShort Answer.' 
inputs = self.tokenizer.apply_chat_template( [{'role': 'user', 'image': image, 'content': prompt}], @@ -51,8 +51,8 @@ class CogVlm(BaseModel): def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=None, **kwargs): assert model_path is not None - from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model - + from accelerate import init_empty_weights + with init_empty_weights(): model = AutoModelForCausalLM.from_pretrained( model_path, @@ -60,25 +60,8 @@ def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=No low_cpu_mem_usage=True, trust_remote_code=True, ) - - local_rank = int(os.environ.get('LOCAL_RANK', 0)) - device_num = torch.cuda.device_count() - - device_1 = local_rank - device_2 = local_rank + device_num // 2 - no_split_module = model._no_split_modules - - device_map = infer_auto_device_map( - model, - max_memory={ - device_1: '22GiB', - device_2: '22GiB' - }, - no_split_module_classes=no_split_module) - model = dispatch_model( - model, - device_map=device_map).eval() - + model, _ = build_device_map(model) + self.kwargs = kwargs if tokenizer_name: tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name) From 228ea091561bc5941cc4dda47cb01a198166e7d2 Mon Sep 17 00:00:00 2001 From: Junming Yang <60545459+junming-yang@users.noreply.github.com> Date: Thu, 1 Aug 2024 19:45:08 +0800 Subject: [PATCH 8/9] update --- vlmeval/vlm/cogvlm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vlmeval/vlm/cogvlm.py b/vlmeval/vlm/cogvlm.py index 12db53b11..ddec88960 100644 --- a/vlmeval/vlm/cogvlm.py +++ b/vlmeval/vlm/cogvlm.py @@ -52,7 +52,7 @@ class CogVlm(BaseModel): def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=None, **kwargs): assert model_path is not None from accelerate import init_empty_weights - + with init_empty_weights(): model = AutoModelForCausalLM.from_pretrained( model_path, @@ -61,7 +61,7 @@ def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=No trust_remote_code=True, ) model, _ = build_device_map(model) - + self.kwargs = kwargs if tokenizer_name: tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name) From cae15c6373b651dacef9594252dae9c299a1ea49 Mon Sep 17 00:00:00 2001 From: Junming Yang <60545459+junming-yang@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:16:35 +0800 Subject: [PATCH 9/9] fix --- vlmeval/smp/vlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vlmeval/smp/vlm.py b/vlmeval/smp/vlm.py index 0122eeef1..c65ea239e 100644 --- a/vlmeval/smp/vlm.py +++ b/vlmeval/smp/vlm.py @@ -199,7 +199,7 @@ def build_device_map(model, default_map=None, no_split=None, alpha=0.97, beta=0. num_gpus = total_num_gpus // world_size memory_map = {} - per_gpu_mem = 45 * alpha + per_gpu_mem = get_memory() * alpha memory_map.update({rank: f'{beta * per_gpu_mem:.2f}GiB'}) for gpu_id in range(1, num_gpus): memory_map.update({rank + gpu_id * world_size: f'{per_gpu_mem:.2f}GiB'})
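
Usage note: a minimal sketch of how the build_device_map helper introduced in
vlmeval/smp/vlm.py by this series is meant to be called from a model wrapper,
mirroring the call sites touched above (emu.py, internvl_chat.py, omnilmm.py,
pandagpt.py, cogvlm.py). The model id and module names below are placeholders
for illustration only. The snippet assumes a multi-GPU node launched via
torchrun so that get_rank_and_world_size() reports the per-process rank, and
that each process sees more GPUs than there are processes (otherwise the
helper simply returns model.cuda(), as in the patched code).

    import torch
    from transformers import AutoModel
    from vlmeval.smp.vlm import build_device_map  # the wrappers reach it via `from ..smp import *`

    # Load the checkpoint on CPU first; dispatch_model inside build_device_map
    # then shards it across the GPUs assigned to this rank.
    model = AutoModel.from_pretrained(
        'org/placeholder-vlm',               # placeholder model id, not from this series
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        device_map='cpu',
    ).eval()

    # default_map pins the listed submodules onto this rank's own GPU (e.g. so
    # input embeddings and output logits share a device); no_split lists module
    # classes that must not be partitioned across devices.
    model, device_map = build_device_map(
        model,
        default_map=['lm_head'],             # placeholder module names
        no_split=['LlamaDecoderLayer'],
    )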