add continuous training from an existing NER model; two datasets must… #22

Merged (2 commits) on Dec 4, 2021
13 changes: 12 additions & 1 deletion src/run_transformer_ner.py
@@ -23,8 +23,11 @@ def main():
# add arguments
parser.add_argument("--model_type", default='bert', type=str, required=True,
help="valid values: bert, roberta or xlnet")
parser.add_argument("--pretrained_model", type=str, required=True,
parser.add_argument("--pretrained_model", type=str, default=None,
help="The pretrained model file or directory for fine tuning.")
# resume training from an existing NER model; if set, this overrides pretrained_model
parser.add_argument("--resume_from_model", type=str, default=None,
help="The NER model file or directory for continuous fine tuning.")
parser.add_argument("--config_name", default=None, type=str,
help="Pretrained config name or path if not the same as pretrained_model")
parser.add_argument("--tokenizer_name", default=None, type=str,
@@ -116,6 +119,14 @@ def main():
global_args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info("Task will use cuda device: GPU_{}.".format(torch.cuda.current_device()) if torch.cuda.device_count() else 'Task will use CPU.')

# if resume_from_model is set, it overrides pretrained_model
if global_args.resume_from_model:
global_args.pretrained_model = global_args.resume_from_model

if global_args.resume_from_model is None and global_args.pretrained_model is None:
raise RuntimeError("""Neither resume_from_model nor pretrained_model is set.
You must specify one of them.""")

# if args.tokenizer_name and args.config_name are not explicitly set, default them to pretrained_model
if not global_args.tokenizer_name:
global_args.tokenizer_name = global_args.pretrained_model
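For readers skimming the diff, the net effect of these two hunks is a simple precedence rule: resume_from_model, when given, takes priority over pretrained_model, and at least one of the two must be supplied. Below is a small standalone sketch of that rule; the argument names come from the diff, while the helper function and example values are illustrative, not part of the repository.

# Illustrative only: mirrors the precedence/validation logic added to main().
def resolve_base_model(pretrained_model=None, resume_from_model=None):
    if resume_from_model:
        # continue training from an existing NER model
        return resume_from_model
    if pretrained_model is None:
        raise RuntimeError("Neither resume_from_model nor pretrained_model is set; "
                           "you must specify one of them.")
    # fresh fine-tuning from a pretrained language model
    return pretrained_model

assert resolve_base_model(pretrained_model="bert-base-uncased") == "bert-base-uncased"
assert resolve_base_model(pretrained_model="bert-base-uncased",
                          resume_from_model="./old_ner_model") == "./old_ner_model"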
3 changes: 2 additions & 1 deletion src/transformer_ner/data_utils.py
@@ -96,7 +96,8 @@ def get_labels(self, default='bert', customized_label2idx=None):
_, dev_labels = self._read_data(self.data_dir / "dev.txt", task='train')

if dev_labels.intersection(train_labels) != dev_labels:
self.logger.warning("dev set has label ({}) not appeared in train set.".format({e for e in dev_labels if e not in train_labels}))
self.logger.warning("dev set has label ({}) not appeared in train set.".format(
{e for e in dev_labels if e not in train_labels}))

for lb in sorted(train_labels, key=lambda x: x.split("-")[-1]):
if lb not in label2idx:
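The reflowed warning above is the existing sanity check that every dev-set label also appears in the train set. A minimal standalone illustration of that set logic, with made-up label values (data_utils.py reads the real ones from train.txt and dev.txt):

# Illustrative label sets only.
train_labels = {"O", "B-PER", "I-PER", "B-LOC"}
dev_labels = {"O", "B-PER", "B-MISC"}

if dev_labels.intersection(train_labels) != dev_labels:
    unseen = {e for e in dev_labels if e not in train_labels}
    print(f"dev set has labels ({unseen}) that do not appear in the train set.")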
49 changes: 36 additions & 13 deletions src/transformer_ner/task.py
@@ -12,7 +12,9 @@

import os
import random
import warnings
from pathlib import Path
import traceback

import numpy as np
import torch
@@ -72,6 +74,7 @@ def load_model(args, new_model_dir=None):
model = model(config=args.config)
model.load_state_dict(state_dict=ckpt)
except AttributeError as Ex:
args.logger.error(traceback.format_exc())
args.logger.warning(
"""The model seems to have been saved with model.save instead of model.state_dict();
attempting to use the loaded checkpoint directly as the model.""")
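The added traceback logging makes the fallback path in load_model easier to debug: if the checkpoint is not a state_dict (for example, it was saved with model.save), the full stack trace is logged and the loaded object is used directly. Below is a self-contained sketch of that pattern; load_checkpoint and ckpt_path are hypothetical names, not the repository's API.

import logging
import traceback

import torch

logger = logging.getLogger(__name__)

def load_checkpoint(model, ckpt_path):
    # load the checkpoint and try to treat it as a state_dict first
    ckpt = torch.load(ckpt_path, map_location="cpu")
    try:
        model.load_state_dict(state_dict=ckpt)
        return model
    except AttributeError:
        logger.error(traceback.format_exc())
        logger.warning("Checkpoint does not look like a state_dict; "
                       "using the loaded object directly as the model.")
        return ckpt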
@@ -495,6 +498,18 @@ def run_task(args):
else:
label2idx = json_load(os.path.join(args.new_model_dir, "label2idx.json"))

# if resume_from_model is used,
# the labels in the new data must be exactly the same as those used to train the previous model
if args.do_train and args.resume_from_model is not None:
label2idx_from_old = json_load(Path(args.resume_from_model) / "label2idx.json")
assert len(label2idx_from_old) == len(label2idx), """expected the same label2idx but got the old one from the resumed model as {};
the new one from the current data is {}""".format(label2idx_from_old, label2idx)
for k in label2idx.keys():
assert k in label2idx_from_old, f"the label {k} is not in the old label2idx; " \
"check your data and make sure annotations are consistent across the two datasets"
warnings.warn("will overwrite label2idx with the label2idx from the old model to make sure labels are mapped correctly.")
label2idx = label2idx_from_old

num_labels = len(label2idx)
idx2label = {v: k for k, v in label2idx.items()}
args.num_labels = num_labels
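The block above is the core safety check for continuous training: the label set derived from the new dataset must match the one the resumed model was trained with, and the old mapping is kept so label indices stay aligned with the old classifier head. A standalone sketch of the same idea follows; reconcile_label2idx is a hypothetical helper, not the repository's code.

import json
import warnings
from pathlib import Path

def reconcile_label2idx(new_label2idx, old_model_dir):
    # label2idx.json is stored alongside the saved model from the original training run
    old_label2idx = json.loads((Path(old_model_dir) / "label2idx.json").read_text())
    assert len(old_label2idx) == len(new_label2idx), (
        f"expected the same label2idx, but the resumed model has {old_label2idx} "
        f"while the current data yields {new_label2idx}")
    for k in new_label2idx:
        assert k in old_label2idx, (
            f"label {k} is not in the old label2idx; make sure annotations are "
            "consistent across the two datasets")
    warnings.warn("overwriting label2idx with the mapping from the old model "
                  "so label indices stay aligned.")
    return old_label2idx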
@@ -513,27 +528,35 @@ def run_task(args):
args.tokenizer_name, do_lower_case=args.do_lower_case, add_prefix_space=True)
else:
tokenizer = model_tokenizer.from_pretrained(args.tokenizer_name, do_lower_case=args.do_lower_case)
tokenizer.add_tokens(NEXT_TOKEN)
config = model_config.from_pretrained(args.config_name, num_labels=num_labels)
config.use_crf = args.use_crf
config.label2idx = args.label2idx
config.use_focal_loss = args.focal_loss
config.focal_loss_gamma = args.focal_loss_gamma
args.logger.info("New Model Config:\n{}".format(config))

if args.resume_from_model is None:
tokenizer.add_tokens(NEXT_TOKEN)
config = model_config.from_pretrained(args.config_name, num_labels=num_labels)
config.use_crf = args.use_crf
config.label2idx = args.label2idx
config.use_focal_loss = args.focal_loss
config.focal_loss_gamma = args.focal_loss_gamma
args.logger.info("New Model Config:\n{}".format(config))
else:
config = model_config.from_pretrained(args.config_name, num_labels=num_labels)

if args.pretrained_model == "microsoft/deberta-xlarge-v2":
raise NotImplementedError("""the deberta-xlarge-v2 tokenizer is different from other deberta models,
so support for deberta-xlarge-v2 is not implemented.
you can try other deberta models: microsoft/deberta-base,
microsoft/deberta-large, microsoft/deberta-xlarge""")

model = model_model.from_pretrained(args.pretrained_model, config=config)
if args.resume_from_model is not None:
args.config = config
model = load_model(args, args.resume_from_model)
else:
model = model_model.from_pretrained(args.pretrained_model, config=config)
# add a control token for combining sentences that are too long to fit max_seq_len
model.resize_token_embeddings(len(tokenizer))
config.vocab_size = len(tokenizer)
args.config = model.config

# add a control token for combining sentences that are too long to fit max_seq_len
model.resize_token_embeddings(len(tokenizer))
config.vocab_size = len(tokenizer)
args.tokenizer = tokenizer
args.config = model.config
model.to(args.device)

train_examples = ner_data_processor.get_train_examples()
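The key difference between the two branches above is that a resumed model already carries the extended vocabulary and config saved during its original training run, so only the fresh fine-tuning branch adds the control token and resizes the embedding matrix. Below is a self-contained illustration of that step using the Hugging Face Auto classes; the "[NEXT]" token string and num_labels value are made up here, while the repository uses its own NEXT_TOKEN constant and model-specific classes.

from transformers import AutoConfig, AutoModelForTokenClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained("bert-base-uncased", num_labels=9)
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", config=config)

tokenizer.add_tokens(["[NEXT]"])              # control token used to join over-long sentences
model.resize_token_embeddings(len(tokenizer)) # grow the embedding matrix to the new vocab size
config.vocab_size = len(tokenizer)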
@@ -561,7 +584,7 @@ def run_task(args):
# predict - test.txt file prediction (if you need predict many files, use 'run_transformer_batch_prediction')
if args.do_predict:
args.config = model_config.from_pretrained(args.new_model_dir, num_labels=num_labels)
args.use_crf = args.config.use_crf
# args.use_crf = args.config.use_crf
# args.model_type = args.config.model_type
if args.model_type in {"roberta", "bart", "longformer"}:
# we need to set add_prefix_space to True for roberta, longformer, and Bart (any tokenizer from GPT-2)
55 changes: 41 additions & 14 deletions src/transformer_ner/test_transfomer.py
@@ -8,30 +8,33 @@

Comment from the Contributor Author:
You can skip this since this is for test purposes. But you can get an idea of how the new function has been tested from here.


class Args:
def __init__(self, model_type, pretrained_model):
def __init__(self, model_type, pretrained_model, do_train=True, do_predict=True,
new_model_dir=None, resume_from_model=None):
self.model_type = model_type
self.pretrained_model = pretrained_model
self.pretrained_model = pretrained_model if resume_from_model is None else resume_from_model
self.config_name = self.pretrained_model
self.tokenizer_name = self.pretrained_model
self.do_lower_case = True
self.overwrite_model_dir = True
self.data_dir = Path(__file__).resolve().parent.parent.parent/'test_data/conll-2003'
self.data_dir = Path(__file__).resolve().parent.parent.parent / 'test_data/conll-2003'
self.data_has_offset_information = False
self.new_model_dir = Path(__file__).resolve().parent.parent.parent/f'new_ner_model/{model_type}_new_ner_model'
self.predict_output_file = Path(__file__).resolve().parent.parent.parent/f"new_ner_model/{model_type}_new_ner_model/pred.txt"
self.new_model_dir = new_model_dir if new_model_dir is not None else Path(
__file__).resolve().parent.parent.parent / f'new_ner_model/{model_type}_new_ner_model'
self.predict_output_file = Path(
__file__).resolve().parent.parent.parent / f"new_ner_model/{model_type}_new_ner_model/pred.txt"
self.overwrite_output_dir = True
self.max_seq_length = 16
self.do_train = True
self.do_predict = True
self.do_train = do_train
self.do_predict = do_predict
self.model_selection_scoring = "strict-f_score-1"
self.train_batch_size = 4
self.eval_batch_size = 4
self.learning_rate = 0.00001
self.seed = 13
self.logger = TransformerNERLogger(
logger_level="i",
logger_file=Path(__file__).resolve().parent.parent.parent/"new_ner_model/log.txt").get_logger()
self.num_train_epochs = 2
logger_file=Path(__file__).resolve().parent.parent.parent / "new_ner_model/log.txt").get_logger()
self.num_train_epochs = 1
self.gradient_accumulation_steps = 1
self.do_warmup = True
self.label2idx = None
@@ -46,23 +49,47 @@ def __init__(self, model_type, pretrained_model):
self.fp16 = False
self.local_rank = -1
self.device = "cpu"
self.train_steps = 100
self.train_steps = 10
self.early_stop = -1
self.progress_bar = True
self.save_model_core = True
self.use_crf = False
self.focal_loss = False
self.focal_loss_gamma = 2
self.resume_from_model = resume_from_model


def test():
for each in [('deberta', "microsoft/deberta-base"),
('bert', 'bert-base-uncased'),
# test training
for each in [('bert', 'bert-base-uncased'),
('deberta', "microsoft/deberta-base"),
('roberta', 'roberta-base'),
('xlnet', 'xlnet-base-cased')]:
args = Args(each[0], each[1])
args = Args(each[0], each[1], do_train=True, do_predict=False)
run_task(args)


def test1():
# test prediction
args = Args("bert", 'bert-base-uncased', do_train=False, do_predict=True,
new_model_dir=Path(
__file__).resolve().parent.parent.parent / "new_ner_model" / "bert-base-uncased_conll2003")
run_task(args)


def test2():
# test continuous training from existing NER model
args = Args("bert", 'bert-base-uncased', do_train=True, do_predict=True,
resume_from_model=Path(
__file__).resolve().parent.parent.parent / "new_ner_model" / "bert-base-uncased_conll2003")
run_task(args)


if __name__ == '__main__':
test()
which_test = input("run which test? 1 = training, 2 = prediction, anything else = continuous training: ")
if which_test == "1":
test()
elif which_test == "2":
test1()
else:
test2()