From 4b94afa9f2c787efc1a32a7135f3671cc133ec1e Mon Sep 17 00:00:00 2001
From: louishsu
Date: Sun, 22 Aug 2021 15:59:36 +0800
Subject: [PATCH] all data, rdrop1.0

---
 TODO                                          |  3 +-
 args/bert_span-alldata.json                   | 57 +++++++++++++++
 args/bert_span-alldata_rdrop1.0.json          | 57 +++++++++++++++
 ...rt_span-ctx0.1.json => bert_span-dev.json} | 10 +-
 eda.ipynb                                     |  7 +-
 main.py                                       |  6 +-
 prepare_data.py                               |  2 +-
 run_span.py                                   | 69 ++++++++++++++++---
 8 files changed, 191 insertions(+), 20 deletions(-)
 create mode 100644 args/bert_span-alldata.json
 create mode 100644 args/bert_span-alldata_rdrop1.0.json
 rename args/{bert_span-ctx0.1.json => bert_span-dev.json} (89%)

diff --git a/TODO b/TODO
index 67a3d61..24f71af 100644
--- a/TODO
+++ b/TODO
@@ -1,3 +1,4 @@
 1. context-aware
 2. regex match
-3. 是否有实体重叠问题
\ No newline at end of file
+3. 是否有实体重叠问题
+4. R-Drop
\ No newline at end of file
diff --git a/args/bert_span-alldata.json b/args/bert_span-alldata.json
new file mode 100644
index 0000000..742543e
--- /dev/null
+++ b/args/bert_span-alldata.json
@@ -0,0 +1,57 @@
+{
+    "version": "alldata",
+    "device": "cuda:0",
+    "n_gpu": 1,
+    "task_name": "ner",
+    "dataset_name": "cail_ner",
+    "data_dir": "./data/ner-ctx0-train1.0-seed42/",
+    "train_file": "train.json",
+    "dev_file": "dev.json",
+    "test_file": "dev.json",
+    "model_type": "bert_span",
+    "model_name_or_path": "/home/louishsu/NewDisk/Garage/weights/transformers/chinese-roberta-wwm",
+    "output_dir": "output/",
+    "max_span_length": 30,
+    "width_embedding_dim": 128,
+    "optimizer": "adamw",
+    "augment_context_aware_p": null,
+    "rdrop_alpha": null,
+    "scheme": "IOB2",
+    "loss_type": "ce",
+    "config_name": "",
+    "tokenizer_name": "",
+    "cache_dir": "cache/",
+    "train_max_seq_length": 512,
+    "eval_max_seq_length": 512,
+    "do_train": true,
+    "do_eval": false,
+    "do_predict": true,
+    "evaluate_during_training": false,
+    "evaluate_each_epoch": true,
+    "do_lower_case": true,
+    "do_fgm": false,
+    "fgm_epsilon": 1.0,
+    "fgm_name": "word_embeddings",
+    "per_gpu_train_batch_size": 16,
+    "per_gpu_eval_batch_size": 24,
+    "gradient_accumulation_steps": 2,
+    "learning_rate": 2e-05,
+    "other_learning_rate": 1e-3,
+    "weight_decay": 0.01,
+    "adam_epsilon": 1e-08,
+    "max_grad_norm": 1.0,
+    "num_train_epochs": 10.0,
+    "max_steps": -1,
+    "warmup_proportion": 0.1,
+    "logging_steps": 50,
+    "save_steps": 50,
+    "save_best_checkpoints": true,
+    "eval_all_checkpoints": false,
+    "predict_checkpoints": 0,
+    "no_cuda": false,
+    "overwrite_output_dir": true,
+    "seed": 42,
+    "fp16": false,
+    "fp16_opt_level": "O1",
+    "local_rank": -1
+}
diff --git a/args/bert_span-alldata_rdrop1.0.json b/args/bert_span-alldata_rdrop1.0.json
new file mode 100644
index 0000000..3ed0985
--- /dev/null
+++ b/args/bert_span-alldata_rdrop1.0.json
@@ -0,0 +1,57 @@
+{
+    "version": "alldata_rdrop1.0",
+    "device": "cuda:0",
+    "n_gpu": 1,
+    "task_name": "ner",
+    "dataset_name": "cail_ner",
+    "data_dir": "./data/ner-ctx0-train1.0-seed42/",
+    "train_file": "train.json",
+    "dev_file": "dev.json",
+    "test_file": "dev.json",
+    "model_type": "bert_span",
+    "model_name_or_path": "/home/louishsu/NewDisk/Garage/weights/transformers/chinese-roberta-wwm",
+    "output_dir": "output/",
+    "max_span_length": 30,
+    "width_embedding_dim": 128,
+    "optimizer": "adamw",
+    "augment_context_aware_p": null,
+    "rdrop_alpha": 1.0,
+    "scheme": "IOB2",
+    "loss_type": "ce",
+    "config_name": "",
+    "tokenizer_name": "",
+    "cache_dir": "cache/",
+    "train_max_seq_length": 512,
+    "eval_max_seq_length": 512,
+    "do_train": true,
+    "do_eval": false,
+    "do_predict": true,
+    "evaluate_during_training": false,
+    "evaluate_each_epoch": true,
+    "do_lower_case": true,
+    "do_fgm": false,
+    "fgm_epsilon": 1.0,
+    "fgm_name": "word_embeddings",
+    "per_gpu_train_batch_size": 8,
+    "per_gpu_eval_batch_size": 8,
+    "gradient_accumulation_steps": 2,
+    "learning_rate": 2e-05,
+    "other_learning_rate": 1e-3,
+    "weight_decay": 0.01,
+    "adam_epsilon": 1e-08,
+    "max_grad_norm": 1.0,
+    "num_train_epochs": 10.0,
+    "max_steps": -1,
+    "warmup_proportion": 0.1,
+    "logging_steps": 50,
+    "save_steps": 50,
+    "save_best_checkpoints": true,
+    "eval_all_checkpoints": false,
+    "predict_checkpoints": 0,
+    "no_cuda": false,
+    "overwrite_output_dir": true,
+    "seed": 42,
+    "fp16": false,
+    "fp16_opt_level": "O1",
+    "local_rank": -1
+}
diff --git a/args/bert_span-ctx0.1.json b/args/bert_span-dev.json
similarity index 89%
rename from args/bert_span-ctx0.1.json
rename to args/bert_span-dev.json
index 2cd52a9..5bcf87f 100644
--- a/args/bert_span-ctx0.1.json
+++ b/args/bert_span-dev.json
@@ -1,5 +1,5 @@
 {
-    "version": "ctx0.1",
+    "version": "dev",
     "device": "cuda:0",
     "n_gpu": 1,
     "task_name": "ner",
@@ -14,7 +14,7 @@
     "max_span_length": 30,
     "width_embedding_dim": 128,
     "optimizer": "adamw",
-    "augment_context_aware_p": 0.1,
+    "augment_context_aware_p": null,
     "scheme": "IOB2",
     "loss_type": "ce",
     "config_name": "",
@@ -31,11 +31,11 @@
     "do_fgm": false,
     "fgm_epsilon": 1.0,
     "fgm_name": "word_embeddings",
-    "per_gpu_train_batch_size": 16,
-    "per_gpu_eval_batch_size": 24,
+    "per_gpu_train_batch_size": 8,
+    "per_gpu_eval_batch_size": 8,
     "gradient_accumulation_steps": 2,
     "learning_rate": 2e-05,
-    "other_learning_rate": 1e-3,
+    "other_learning_rate": 5e-4,
     "weight_decay": 0.01,
     "adam_epsilon": 1e-08,
     "max_grad_norm": 1.0,
diff --git a/eda.ipynb b/eda.ipynb
index 7287bdb..ba272ea 100644
--- a/eda.ipynb
+++ b/eda.ipynb
@@ -120,7 +120,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "id": "9489bc8e",
+   "id": "09c87de7",
    "metadata": {},
    "outputs": [
    {
@@ -255,8 +255,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
-   "id": "6387604c",
+   "execution_count": 21,
+   "id": "57ff4a99",
    "metadata": {},
    "outputs": [
    {
@@ -290,6 +290,7 @@
    "for label, entities in label_entities_map.items():\n",
    "    print(label, utils.LABEL_MEANING_MAP[label])\n",
    "    print(Counter([len(entity) for entity in entities]))\n",
+   "    entities = sorted(list(set(entities)), key=len)\n",
    "    with open(f\"tmp/{label}_{utils.LABEL_MEANING_MAP[label]}.txt\", \"w\") as f:\n",
    "        f.writelines([entity + \"\\n\" for entity in entities])"
   ]
diff --git a/main.py b/main.py
index 2485345..f40611f 100644
--- a/main.py
+++ b/main.py
@@ -9,14 +9,16 @@
 
 
 def main():
+    context_window = 510
     json_file = "./args/bert_span-baseline.json"
+
     parser = NerArgumentParser()
     args = parser.parse_args_from_json(json_file=json_file)
     args.test_file = "test.json"
     args.do_train = False
     args.do_eval = False
     args.do_test = True
-    args.per_gpu_eval_batch_size = 6
+    args.per_gpu_eval_batch_size = 1
     parser.save_args_to_json("./args/pred.json", args)
 
     version = args.version
@@ -27,7 +29,9 @@ def main():
 
     # upload
     raw_samples = utils.load_raw(infile)
+    raw_samples = utils.add_context(raw_samples, context_window)
     utils.save_samples(os.path.join(data_dir, "test.json"), raw_samples)
+
     os.system(f"sudo /home/user/miniconda/bin/python3 run_span.py ./args/pred.json")
     os.system(f"sudo cp ./output/ner-{dataset_name}-{model_type}-{version}-{seed}/test_prediction.json {outfile}")
 
diff --git a/prepare_data.py b/prepare_data.py
index 054fc15..91aee90 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -15,7 +15,7 @@
                         help="Data files.")
     parser.add_argument("--context_window", default=0, type=int,
                         help="Size of context window.")
-    parser.add_argument("--train_split_ratio", default=0.8, type=float,
+    parser.add_argument("--train_split_ratio", default=1.0, type=float,
                         help="Size of training data.")
     parser.add_argument("--output_dir", type=str, default="./data/")
     parser.add_argument("--seed", default=42, type=int,
diff --git a/run_span.py b/run_span.py
index a26447c..eed71f4 100644
--- a/run_span.py
+++ b/run_span.py
@@ -1,4 +1,5 @@
 import os
+import re
 import sys
 import time
 import json
@@ -11,6 +12,7 @@
 
 import torch
 from torch import nn
+from torch.nn import functional as F
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 import numpy as np
@@ -186,6 +188,39 @@ def forward(
         attentions=outputs.attentions,
     )
 
+def compute_kl_loss(p, q, pad_mask=None):
+
+    p_loss = F.kl_div(F.log_softmax(p, dim=-1), F.softmax(q, dim=-1), reduction='none')
+    q_loss = F.kl_div(F.log_softmax(q, dim=-1), F.softmax(p, dim=-1), reduction='none')
+
+    # pad_mask is for seq-level tasks
+    if pad_mask is not None:
+        p_loss.masked_fill_(pad_mask, 0.)
+        q_loss.masked_fill_(pad_mask, 0.)
+
+    # Choose "sum" or "mean" reduction depending on the task
+    p_loss = p_loss.mean()
+    q_loss = q_loss.mean()
+
+    loss = (p_loss + q_loss) / 2
+    return loss
+
+def forward_rdrop(cls, alpha, **kwargs):
+    outputs1 = forward(cls, **kwargs)
+    if outputs1.loss is None or alpha <= 0.: return outputs1
+
+    outputs2 = forward(cls, **kwargs)
+    rdrop_loss = compute_kl_loss(
+        outputs1["logits"], outputs2["logits"],
+        kwargs["span_mask"].unsqueeze(-1) == 0)
+    total_loss = (outputs1["loss"] + outputs2["loss"]) / 2. + alpha * rdrop_loss
+    return TokenClassifierOutput(
+        loss=total_loss,
+        logits=outputs1["logits"],
+        hidden_states=outputs1.hidden_states,
+        attentions=outputs1.attentions,
+    )
+
 class BertSpanV2ForNer(BertPreTrainedModel):
 
     def __init__(self, config):
@@ -197,8 +232,10 @@ def __init__(self, config):
             config.max_span_length, config.width_embedding_dim)
         self.init_weights()
 
-    def forward(self, *args, **kwargs):
-        return forward(self, *args, **kwargs)
+    def forward(self, **kwargs):
+        if args.rdrop_alpha is not None:
+            return forward_rdrop(self, args.rdrop_alpha, **kwargs)
+        return forward(self, **kwargs)
 
 
 class NeZhaSpanV2ForNer(NeZhaPreTrainedModel):
@@ -211,8 +248,10 @@ def __init__(self, config):
             config.max_span_length, config.width_embedding_dim)
         self.init_weights()
 
-    def forward(self, *args, **kwargs):
-        return forward(self, *args, **kwargs)
+    def forward(self, **kwargs):
+        if args.rdrop_alpha is not None:
+            return forward_rdrop(self, args.rdrop_alpha, **kwargs)
+        return forward(self, **kwargs)
 
 
 class NerArgumentParser(ArgumentParser):
@@ -255,6 +294,7 @@ def build_arguments(self):
         self.add_argument("--width_embedding_dim", default=150, type=int)
         self.add_argument("--optimizer", default="adamw", type=str)
         self.add_argument("--augment_context_aware_p", default=None, type=float)
+        self.add_argument("--rdrop_alpha", default=None, type=float)
 
         # Other parameters
         self.add_argument('--scheme', default='IOB2', type=str,
@@ -461,6 +501,16 @@ def __call__(self, example):
             "sent_start": sent_start,
             "sent_end": sent_start + len(tokens)
         }]
+
+# TODO:
+class ReDataMasking:
+
+    def __init__(self):
+        self.nc_reobj = re.compile("(现金)?(人民币)?[0-9]+(\.[0-9]+)?余?元(现金)?(人民币)?")
+
+    def __call__(self, example):
+        ...
+
 class Example2Feature:
     
     def __init__(self, tokenizer, label2id, max_seq_length, max_span_length):
@@ -749,10 +799,11 @@ def train(args, model, processor, tokenizer):
                         torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                         torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                         logger.info("Saving optimizer and scheduler states to %s", output_dir)
-                if args.local_rank in [-1, 0] and not args.save_best_checkpoints and \
-                    args.save_steps > 0 and global_step % args.save_steps == 0:
+                if args.local_rank in [-1, 0] and \
+                    args.save_steps > 0 and global_step % args.save_steps == 0:
                     # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(
+                        global_step if not args.save_best_checkpoints else 999999))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
                     model_to_save = (
@@ -861,7 +912,7 @@ def predict(args, model, processor, tokenizer, prefix=""):
             preds = model.span.decode_batch(logits, batch["spans"], batch["span_mask"])
             pred, input_len = preds[0], batch["input_len"][0]
             pred = [(id2label[t], b, e) for t, b, e in pred if id2label[t] != "O"]
-            pred = get_ner_tags(pred, input_len - 2),
+            pred = get_ner_tags(pred, input_len - 2)
             pred = pred[batch["sent_start"][0]: batch["sent_end"][0]]
             label_entities_map = {label: [] for label in LABEL_MEANING_MAP.keys()}
             for t, b, e in get_entities(pred):
@@ -915,7 +966,7 @@ def load_dataset(args, processor, tokenizer, data_type='train'):
         args = parser.parse_args_from_json(json_file=os.path.abspath(sys.argv[1]))
     else:
         args = parser.build_arguments().parse_args()
-    # args = parser.parse_args_from_json(json_file="args/bert_span-baseline.json")
+    # args = parser.parse_args_from_json(json_file="args/bert_span-alldata_rdrop1.0.json")
 
     # Set seed before initializing model.
     seed_everything(args.seed)
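
Note (illustrative, not part of the commit): with "rdrop_alpha" set, forward_rdrop runs the span model twice under different dropout masks and adds a symmetric KL penalty between the two logit distributions, following R-Drop. A minimal standalone sketch of the same loss composition, assuming logits of shape (batch, num_spans, num_labels), a boolean mask of shape (batch, num_spans, 1) that is True on padded spans, and the hypothetical helper name rdrop_total_loss:

    from torch.nn import functional as F

    def rdrop_total_loss(loss1, loss2, logits1, logits2, pad_mask, alpha=1.0):
        # Symmetric KL between the two dropout-perturbed predictions, zeroed on padded spans.
        p_loss = F.kl_div(F.log_softmax(logits1, dim=-1), F.softmax(logits2, dim=-1), reduction='none')
        q_loss = F.kl_div(F.log_softmax(logits2, dim=-1), F.softmax(logits1, dim=-1), reduction='none')
        p_loss = p_loss.masked_fill(pad_mask, 0.).mean()
        q_loss = q_loss.masked_fill(pad_mask, 0.).mean()
        kl = (p_loss + q_loss) / 2
        # Same composition as forward_rdrop: mean of the two CE losses plus alpha * KL.
        return (loss1 + loss2) / 2. + alpha * kl

With the patched run_span.py, the new configuration would be launched as, for example, python run_span.py args/bert_span-alldata_rdrop1.0.json (the script reads the JSON path from sys.argv[1]).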