diff --git a/requirements.txt b/requirements.txt
index 8f0ca6d..2fe255a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
-torch>=1.6.0
-transformers==3.1.0
-tqdm>=4.36.1
-numpy==1.16.0
-packaging
+torch>=1.9.0
+transformers==4.12.5
+tqdm>=4.62.1
+numpy==1.20.0
+packaging==21.0
diff --git a/src/run_transformer_ner.py b/src/run_transformer_ner.py
index 7b00cb6..5e40d6a 100644
--- a/src/run_transformer_ner.py
+++ b/src/run_transformer_ner.py
@@ -23,8 +23,11 @@ def main():
     # add arguments
     parser.add_argument("--model_type", default='bert', type=str, required=True,
                         help="valid values: bert, roberta or xlnet")
-    parser.add_argument("--pretrained_model", type=str, required=True,
+    parser.add_argument("--pretrained_model", type=str, default=None,
                         help="The pretrained model file or directory for fine tuning.")
+    # resume training on an NER model; if set, it overwrites pretrained_model
+    parser.add_argument("--resume_from_model", type=str, default=None,
+                        help="The NER model file or directory for continued fine-tuning.")
     parser.add_argument("--config_name", default=None, type=str,
                         help="Pretrained config name or path if not the same as pretrained_model")
     parser.add_argument("--tokenizer_name", default=None, type=str,
@@ -116,6 +119,14 @@ def main():
     global_args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     logger.info("Task will use cuda device: GPU_{}.".format(torch.cuda.current_device())
                 if torch.cuda.device_count() else 'Task will use CPU.')
+    # if resume_from_model is set, it overwrites pretrained_model
+    if global_args.resume_from_model:
+        global_args.pretrained_model = global_args.resume_from_model
+
+    if global_args.resume_from_model is None and global_args.pretrained_model is None:
+        raise RuntimeError("""Neither resume_from_model nor pretrained_model is set.
+        You have to specify one of them.""")
+
     # if args.tokenizer_name and args.config_name are not specially set, set them as pretrained_model
     if not global_args.tokenizer_name:
         global_args.tokenizer_name = global_args.pretrained_model
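For reference, a minimal self-contained sketch (not the repo's code; argument values are placeholders) of the resolution rule the change above introduces: --resume_from_model, when given, overrides --pretrained_model, and at least one of the two must be supplied.

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--pretrained_model", type=str, default=None)
    parser.add_argument("--resume_from_model", type=str, default=None)
    # example invocation: resume from a previously saved NER model (path is a placeholder)
    args = parser.parse_args(["--resume_from_model", "./saved_ner_model"])

    if args.resume_from_model:
        # resume_from_model takes precedence over pretrained_model
        args.pretrained_model = args.resume_from_model
    if args.pretrained_model is None:
        raise RuntimeError("Neither resume_from_model nor pretrained_model is set; specify one of them.")

    print(args.pretrained_model)  # ./saved_ner_model
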
diff --git a/src/transformer_ner/data_utils.py b/src/transformer_ner/data_utils.py
index 872bc4f..74c37e7 100644
--- a/src/transformer_ner/data_utils.py
+++ b/src/transformer_ner/data_utils.py
@@ -96,7 +96,8 @@ def get_labels(self, default='bert', customized_label2idx=None):
         _, dev_labels = self._read_data(self.data_dir / "dev.txt", task='train')
 
         if dev_labels.intersection(train_labels) != dev_labels:
-            self.logger.warning("dev set has label ({}) not appeared in train set.".format({e for e in dev_labels if e not in train_labels}))
+            self.logger.warning("dev set has labels ({}) that do not appear in the train set.".format(
+                {e for e in dev_labels if e not in train_labels}))
 
         for lb in sorted(train_labels, key=lambda x: x.split("-")[-1]):
             if lb not in label2idx:
diff --git a/src/transformer_ner/task.py b/src/transformer_ner/task.py
index 5bd2330..2431a55 100644
--- a/src/transformer_ner/task.py
+++ b/src/transformer_ner/task.py
@@ -12,7 +12,9 @@
 import os
 import random
+import warnings
 from pathlib import Path
+import traceback
 
 import numpy as np
 import torch
 
@@ -72,6 +74,7 @@ def load_model(args, new_model_dir=None):
         model = model(config=args.config)
         model.load_state_dict(state_dict=ckpt)
     except AttributeError as Ex:
+        args.logger.error(traceback.format_exc())
         args.logger.warning(
             """The model seems save using model.save instead of model.state_dict,
             attempt to directly using the loaded checkpoint as model.""")
@@ -495,6 +498,18 @@
     else:
         label2idx = json_load(os.path.join(args.new_model_dir, "label2idx.json"))
 
+    # if resume_from_model is used, the labels in the new data must be exactly
+    # the same as the labels used to train the previous model
+    if args.do_train and args.resume_from_model is not None:
+        label2idx_from_old = json_load(Path(args.resume_from_model) / "label2idx.json")
+        assert len(label2idx_from_old) == len(label2idx), """expect the same label2idx, but the resume model has {};
+            the current data produces {}""".format(label2idx_from_old, label2idx)
+        for k in label2idx.keys():
+            assert k in label2idx_from_old, f"the label {k} is not in the old label2idx; " \
+                "check your data and make sure annotations are consistent across the two datasets"
+        warnings.warn("will overwrite label2idx with the label2idx from the old model to make sure labels are mapped correctly.")
+        label2idx = label2idx_from_old
+
     num_labels = len(label2idx)
     idx2label = {v: k for k, v in label2idx.items()}
     args.num_labels = num_labels
@@ -513,13 +528,17 @@
             args.tokenizer_name, do_lower_case=args.do_lower_case, add_prefix_space=True)
     else:
         tokenizer = model_tokenizer.from_pretrained(args.tokenizer_name, do_lower_case=args.do_lower_case)
-    tokenizer.add_tokens(NEXT_TOKEN)
-    config = model_config.from_pretrained(args.config_name, num_labels=num_labels)
-    config.use_crf = args.use_crf
-    config.label2idx = args.label2idx
-    config.use_focal_loss = args.focal_loss
-    config.focal_loss_gamma = args.focal_loss_gamma
-    args.logger.info("New Model Config:\n{}".format(config))
+
+    if args.resume_from_model is None:
+        tokenizer.add_tokens(NEXT_TOKEN)
+        config = model_config.from_pretrained(args.config_name, num_labels=num_labels)
+        config.use_crf = args.use_crf
+        config.label2idx = args.label2idx
+        config.use_focal_loss = args.focal_loss
+        config.focal_loss_gamma = args.focal_loss_gamma
+        args.logger.info("New Model Config:\n{}".format(config))
+    else:
+        config = model_config.from_pretrained(args.config_name, num_labels=num_labels)
 
     if args.pretrained_model == "microsoft/deberta-xlarge-v2":
         raise NotImplementedError("""the deberta-xlarge-v2 tokenizer is different from other deberta models
@@ -527,13 +546,17 @@
                                   you can try other debata models: microsoft/deberta-base, microsoft/deberta-large, microsoft/deberta-xlarge""")
 
-    model = model_model.from_pretrained(args.pretrained_model, config=config)
+    if args.resume_from_model is not None:
+        args.config = config
+        model = load_model(args, args.resume_from_model)
+    else:
+        model = model_model.from_pretrained(args.pretrained_model, config=config)
+        # add a control token used to combine sentences that are too long to fit max_seq_len
+        model.resize_token_embeddings(len(tokenizer))
+        config.vocab_size = len(tokenizer)
+        args.config = model.config
 
-    # #add an control token for combine sentence if it is too long to fit max_seq_len
-    model.resize_token_embeddings(len(tokenizer))
-    config.vocab_size = len(tokenizer)
     args.tokenizer = tokenizer
-    args.config = model.config
 
     model.to(args.device)
 
     train_examples = ner_data_processor.get_train_examples()
@@ -561,7 +584,7 @@
     # predict - test.txt file prediction (if you need predict many files, use 'run_transformer_batch_prediction')
     if args.do_predict:
         args.config = model_config.from_pretrained(args.new_model_dir, num_labels=num_labels)
-        args.use_crf = args.config.use_crf
+        # args.use_crf = args.config.use_crf
         # args.model_type = args.config.model_type
         if args.model_type in {"roberta", "bart", "longformer"}:
             # we need to set add_prefix_space to True for roberta, longformer, and Bart (any tokenizer from GPT-2)
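For reference, a minimal standalone sketch (not the repo's code; the function name and error text are illustrative) of the label-consistency rule the resume path enforces above: the label set derived from the new training data must match the label2idx.json saved with the resumed model, and the old mapping is reused so every label keeps the id it had when the classifier head was first trained.

    import json
    from pathlib import Path

    def check_resume_labels(resume_model_dir, new_label2idx):
        # label2idx.json is saved alongside the fine-tuned NER model
        old_label2idx = json.loads((Path(resume_model_dir) / "label2idx.json").read_text())
        missing = set(new_label2idx) - set(old_label2idx)
        if len(old_label2idx) != len(new_label2idx) or missing:
            raise ValueError(f"label mismatch between the resumed model and the new data: {missing}")
        # reuse the old mapping so label ids stay aligned with the saved classification head
        return old_label2idx
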
diff --git a/src/transformer_ner/test_transfomer.py b/src/transformer_ner/test_transfomer.py
index 481da19..6aaf6e2 100644
--- a/src/transformer_ner/test_transfomer.py
+++ b/src/transformer_ner/test_transfomer.py
@@ -8,21 +8,24 @@
 
 
 class Args:
-    def __init__(self, model_type, pretrained_model):
+    def __init__(self, model_type, pretrained_model, do_train=True, do_predict=True,
+                 new_model_dir=None, resume_from_model=None):
         self.model_type = model_type
-        self.pretrained_model = pretrained_model
+        self.pretrained_model = pretrained_model if resume_from_model is None else resume_from_model
         self.config_name = self.pretrained_model
         self.tokenizer_name = self.pretrained_model
         self.do_lower_case = True
         self.overwrite_model_dir = True
-        self.data_dir = Path(__file__).resolve().parent.parent.parent/'test_data/conll-2003'
+        self.data_dir = Path(__file__).resolve().parent.parent.parent / 'test_data/conll-2003'
         self.data_has_offset_information = False
-        self.new_model_dir = Path(__file__).resolve().parent.parent.parent/f'new_ner_model/{model_type}_new_ner_model'
-        self.predict_output_file = Path(__file__).resolve().parent.parent.parent/f"new_ner_model/{model_type}_new_ner_model/pred.txt"
+        self.new_model_dir = new_model_dir if new_model_dir is not None else Path(
+            __file__).resolve().parent.parent.parent / f'new_ner_model/{model_type}_new_ner_model'
+        self.predict_output_file = Path(
+            __file__).resolve().parent.parent.parent / f"new_ner_model/{model_type}_new_ner_model/pred.txt"
         self.overwrite_output_dir = True
         self.max_seq_length = 16
-        self.do_train = True
-        self.do_predict = True
+        self.do_train = do_train
+        self.do_predict = do_predict
         self.model_selection_scoring = "strict-f_score-1"
         self.train_batch_size = 4
         self.eval_batch_size = 4
@@ -30,8 +33,8 @@ def __init__(self, model_type, pretrained_model):
         self.seed = 13
         self.logger = TransformerNERLogger(
             logger_level="i",
-            logger_file=Path(__file__).resolve().parent.parent.parent/"new_ner_model/log.txt").get_logger()
-        self.num_train_epochs = 2
+            logger_file=Path(__file__).resolve().parent.parent.parent / "new_ner_model/log.txt").get_logger()
+        self.num_train_epochs = 1
         self.gradient_accumulation_steps = 1
         self.do_warmup = True
         self.label2idx = None
@@ -46,23 +49,47 @@ def __init__(self, model_type, pretrained_model):
         self.fp16 = False
         self.local_rank = -1
         self.device = "cpu"
-        self.train_steps = 100
+        self.train_steps = 10
         self.early_stop = -1
         self.progress_bar = True
         self.save_model_core = True
         self.use_crf = False
         self.focal_loss = False
         self.focal_loss_gamma = 2
+        self.resume_from_model = resume_from_model
 
 
 def test():
-    for each in [('deberta', "microsoft/deberta-base"),
-                 ('bert', 'bert-base-uncased'),
+    # test training
+    for each in [('bert', 'bert-base-uncased'),
+                 ('deberta', "microsoft/deberta-base"),
                  ('roberta', 'roberta-base'),
                  ('xlnet', 'xlnet-base-cased')]:
-        args = Args(each[0], each[1])
+        args = Args(each[0], each[1], do_train=True, do_predict=False)
         run_task(args)
 
 
+def test1():
+    # test prediction
+    args = Args("bert", 'bert-base-uncased', do_train=False, do_predict=True,
+                new_model_dir=Path(
+                    __file__).resolve().parent.parent.parent / "new_ner_model" / "bert-base-uncased_conll2003")
+    run_task(args)
+
+
+def test2():
+    # test continued training from an existing NER model
+    args = Args("bert", 'bert-base-uncased', do_train=True, do_predict=True,
+                resume_from_model=Path(
+                    __file__).resolve().parent.parent.parent / "new_ner_model" / "bert-base-uncased_conll2003")
+    run_task(args)
+
+
 if __name__ == '__main__':
-    test()
+    which_test = input("run which test? 1 (training), 2 (prediction), anything else (resume training): ")
+    if which_test == "1":
+        test()
+    elif which_test == "2":
+        test1()
+    else:
+        test2()
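For reference, a hedged driver sketch of how the resume path is exercised end to end with the test harness above: train and save a model first, then build Args with resume_from_model pointing at the saved directory. The module import paths and the saved-model location are assumptions (they mirror the paths hard-coded in test1/test2), not something this diff guarantees.

    # illustrative only; assumes src/ is on PYTHONPATH and that a model was already
    # trained and saved under new_ner_model/bert-base-uncased_conll2003
    from pathlib import Path

    from transformer_ner.task import run_task
    from transformer_ner.test_transfomer import Args

    saved_model = Path("new_ner_model") / "bert-base-uncased_conll2003"

    # continued fine-tuning: resume_from_model replaces pretrained_model inside Args,
    # so the tokenizer, config, and label2idx saved with the model are reused
    args = Args("bert", "bert-base-uncased", do_train=True, do_predict=True,
                resume_from_model=saved_model)
    run_task(args)
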