diff --git a/asuka_README.md b/misc/asuka_README.md
similarity index 100%
rename from asuka_README.md
rename to misc/asuka_README.md
diff --git a/misc/requirements.txt b/misc/requirements.txt
new file mode 100644
index 0000000..0ff2d84
--- /dev/null
+++ b/misc/requirements.txt
@@ -0,0 +1,15 @@
+timm==0.5.4
+Pillow
+blobfile
+mypy
+numpy
+pytest
+requests
+einops
+tensorboardX
+deepspeed==0.6.5
+scipy
+sh
+wandb
+ftfy
+regex
\ No newline at end of file
diff --git a/misc/train.sh b/misc/train.sh
new file mode 100644
index 0000000..ad3537e
--- /dev/null
+++ b/misc/train.sh
@@ -0,0 +1,8 @@
+SEG_CONFIG=configs/eva2_hybrid/Segmenter_EVA02_large_24_512_slide_80k.py
+PRETRAIN_CKPT=pretrained/eva02_L_pt_m38m_p14to16.pt
+
+python -m torch.distributed.launch \
+--use_env train.py --launcher pytorch \
+    ${SEG_CONFIG} \
+    --seed 0 --deterministic --gpu-ids 4 5 6 7 \
+    --options model.backbone.pretrained=${PRETRAIN_CKPT}
\ No newline at end of file
diff --git a/test.py b/test.py
index 2c6f2bc..2c7305d 100644
--- a/test.py
+++ b/test.py
@@ -19,7 +19,7 @@
 from mmseg.models import build_segmentor
 from mmseg.utils import build_ddp, build_dp, get_device, setup_multi_processes
 
-from backbone import eva2
+from mmseg.models.backbones import EVA2
 
 
 def parse_args():
diff --git a/tools/dist_train.sh b/tools/dist_train.sh
index 5b43fff..5c41bb3 100644
--- a/tools/dist_train.sh
+++ b/tools/dist_train.sh
@@ -1,9 +1,8 @@
 #!/usr/bin/env bash
 
-CONFIG=$1
-GPUS=$2
-PORT=${PORT:-29500}
+CONFIG="configs/eva2_hybrid/Segmenter_EVA02_large_24_512_slide_80k.py"
+GPUS=4
 
 PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
-python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
-    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
+python -m torch.distributed.launch --nproc_per_node=${GPUS} --master_port=29500 \
+    $(dirname "$0")/train.py ${CONFIG} --launcher pytorch ${@:3}
diff --git a/train.py b/train.py
index c02a85e..51d00ae 100644
--- a/train.py
+++ b/train.py
@@ -3,9 +3,11 @@
 import os
 import os.path as osp
 import time
+import bitsandbytes as bnb
 
 import mmcv
 import torch
+import torch.nn as nn
 from mmcv.runner import init_dist
 from mmcv.utils import Config, DictAction, get_git_hash
 
@@ -16,7 +18,24 @@
 from mmseg.models import build_segmentor
 from mmseg.utils import collect_env, get_root_logger
 
-from backbone import eva2
+from mmseg.models.backbones import EVA2
+import loralib as lora
+
+from transformers import (
+    PreTrainedModel,
+    PretrainedConfig,
+    AutoModelForCausalLM,
+    BitsAndBytesConfig,
+)
+from peft import (
+    prepare_model_for_kbit_training,
+    LoraConfig,
+    get_peft_model,
+    PeftModel
+)
+from peft.tuners.lora import LoraLayer
+from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
+
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Train a segmentor')
@@ -62,6 +81,73 @@ def parse_args():
     return args
 
 
+def find_eva_linear_names(model):
+    lora_module_names = set()
+    for name, module in model.backbone.named_modules():
+        if isinstance(module, bnb.nn.Linear4bit):
+            names = name.split('.')
+            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
+
+    if 'lm_head' in lora_module_names:
+        lora_module_names.remove('lm_head')
+    return list(lora_module_names)
+
+
+def get_accelerate_model(args, model, checkpoint_dir=None):
+    pconfig=PretrainedConfig(is_encoder_decoder=True,torch_dtype=torch.float32)
+    prtr=PreTrainedModel(pconfig)
+    # prtr.save_pretrained('workbench/pretrained/')
+
+    model = AutoModelForCausalLM.from_pretrained(
+        None,
+        state_dict=model.state_dict(),
+        config=prtr,
+        load_in_4bit=True,
+        device_map='auto',
+        max_memory={0: '5120MB'},
+        quantization_config=BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type='nf4',
+        ),
+        torch_dtype=torch.bfloat16,
+    )
+
+    setattr(model, 'model_parallel', True)
+    setattr(model, 'is_parallelizable', True)
+
+    model.config.torch_dtype=torch.bfloat16
+    model=prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
+    model.gradient_checkpointing_enable()
+
+    if checkpoint_dir is not None:
+        print("Loading adapters from checkpoint.")
+        model = PeftModel.from_pretrained(model, osp.join(checkpoint_dir, 'adapter_model'), is_trainable=True)
+    else:
+        print(f'Adding LoRA modules...')
+        modules = find_eva_linear_names(model)
+        config = LoraConfig(
+            r=args.lora_rank,
+            lora_alpha=args.lora_alpha,
+            target_modules=modules,
+            lora_dropout=0.1,
+            bias="none",
+        )
+        model=get_peft_model(model, config)
+
+    for name, module in model.named_modules():
+        if isinstance(module, LoraLayer):
+            module = module.to(torch.bfloat16)
+        if 'norm' in name:
+            module = module.to(torch.float32)
+        if 'lm_head' in name or 'embed_tokens' in name:
+            if hasattr(module, 'weight') and module.weight.dtype == torch.float32:
+                module = module.to(torch.bfloat16)
+
+    return model
+
+
 def main():
     args = parse_args()
 
@@ -132,9 +218,12 @@ def main():
     model = build_segmentor(
         cfg.model,
         train_cfg=cfg.get('train_cfg'),
-        test_cfg=cfg.get('test_cfg'))
+        test_cfg=cfg.get('test_cfg')
+    )
 
-    logger.info(model)
+    # for k,v in model.named_parameters():
+    #     print('{}: {}'.format(k, v.requires_grad))
+    # logger.info(model)
 
     datasets = [build_dataset(cfg.data.train)]
     if len(cfg.workflow) == 2:
@@ -151,6 +240,7 @@ def main():
             PALETTE=datasets[0].PALETTE)
     # add an attribute for visualization convenience
     model.CLASSES = datasets[0].CLASSES
+
     train_segmentor(
         model,
         datasets,
@@ -162,4 +252,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()
\ No newline at end of file
diff --git a/train.sh b/train.sh
new file mode 100644
index 0000000..ab1a2d9
--- /dev/null
+++ b/train.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+GPUS=4
+CONFIGS="configs/eva2_hybrid/Segmenter_EVA02_large_24_512_slide_80k.py"
+
+python -m torch.distributed.launch --nproc_per_node=${GPUS} \
+    --use_env train.py --launcher pytorch \
+    ${CONFIGS} --seed 0 --deterministic --gpus ${GPUS} \
+    # --load-from workbench/iter_60000.pth
\ No newline at end of file
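
Note (sketch, not part of the patch): the hunks above add get_accelerate_model() and find_eva_linear_names() to train.py but never call them, and they read args.lora_rank and args.lora_alpha even though parse_args() is not shown gaining those flags. The snippet below is one plausible way to close that gap; the flag names, defaults, and the call site are assumptions, not something this diff specifies.

import argparse


def add_lora_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    # Hypothetical glue: expose the LoRA hyper-parameters that
    # get_accelerate_model() expects to find on `args`.
    parser.add_argument('--lora-rank', type=int, default=16,
                        help='rank r of the LoRA update matrices (assumed default)')
    parser.add_argument('--lora-alpha', type=int, default=32,
                        help='LoRA scaling factor alpha (assumed default)')
    return parser


# One plausible call site in main(), right after build_segmentor() and before
# train_segmentor() (assumed, not shown in the patch):
#     model = build_segmentor(cfg.model, train_cfg=cfg.get('train_cfg'),
#                             test_cfg=cfg.get('test_cfg'))
#     model = get_accelerate_model(args, model, checkpoint_dir=None)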
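
Separately, train.py now imports loralib as lora, but the import is unused in the hunks shown. For contrast with the peft/bitsandbytes route taken in get_accelerate_model(), a manual loralib approach would look roughly like the sketch below; the helper name and hyper-parameters are illustrative assumptions.

import torch.nn as nn
import loralib as lora


def swap_linear_for_lora(module: nn.Module, r: int = 16, alpha: int = 32) -> None:
    # Recursively replace plain nn.Linear children with lora.Linear so that
    # low-rank adapters can be trained while the original weights stay frozen.
    for name, child in module.named_children():
        if isinstance(child, nn.Linear):
            adapted = lora.Linear(child.in_features, child.out_features,
                                  r=r, lora_alpha=alpha,
                                  bias=child.bias is not None)
            adapted.weight.data.copy_(child.weight.data)
            if child.bias is not None:
                adapted.bias.data.copy_(child.bias.data)
            setattr(module, name, adapted)
        else:
            swap_linear_for_lora(child, r=r, alpha=alpha)


# Possible usage on the segmentor's backbone, keeping only LoRA weights trainable:
#     swap_linear_for_lora(model.backbone, r=16, alpha=32)
#     lora.mark_only_lora_as_trainable(model.backbone)
#     torch.save(lora.lora_state_dict(model.backbone), 'workbench/lora_only.pth')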