
Commit

all data, rdrop1.0
louishsu committed Aug 22, 2021
1 parent 7364419 commit 4b94afa
Showing 8 changed files with 191 additions and 20 deletions.
3 changes: 2 additions & 1 deletion TODO
@@ -1,3 +1,4 @@
1. context-aware
2. regex match
3. check for entity overlap issues
3. check for entity overlap issues
4. R-Drop
57 changes: 57 additions & 0 deletions args/bert_span-alldata.json
@@ -0,0 +1,57 @@
{
"version": "alldata",
"device": "cuda:0",
"n_gpu": 1,
"task_name": "ner",
"dataset_name": "cail_ner",
"data_dir": "./data/ner-ctx0-train1.0-seed42/",
"train_file": "train.json",
"dev_file": "dev.json",
"test_file": "dev.json",
"model_type": "bert_span",
"model_name_or_path": "/home/louishsu/NewDisk/Garage/weights/transformers/chinese-roberta-wwm",
"output_dir": "output/",
"max_span_length": 30,
"width_embedding_dim": 128,
"optimizer": "adamw",
"augment_context_aware_p": null,
"rdrop_alpha": null,
"scheme": "IOB2",
"loss_type": "ce",
"config_name": "",
"tokenizer_name": "",
"cache_dir": "cache/",
"train_max_seq_length": 512,
"eval_max_seq_length": 512,
"do_train": true,
"do_eval": false,
"do_predict": true,
"evaluate_during_training": false,
"evaluate_each_epoch": true,
"do_lower_case": true,
"do_fgm": false,
"fgm_epsilon": 1.0,
"fgm_name": "word_embeddings",
"per_gpu_train_batch_size": 16,
"per_gpu_eval_batch_size": 24,
"gradient_accumulation_steps": 2,
"learning_rate": 2e-05,
"other_learning_rate": 1e-3,
"weight_decay": 0.01,
"adam_epsilon": 1e-08,
"max_grad_norm": 1.0,
"num_train_epochs": 10.0,
"max_steps": -1,
"warmup_proportion": 0.1,
"logging_steps": 50,
"save_steps": 50,
"save_best_checkpoints": true,
"eval_all_checkpoints": false,
"predict_checkpoints": 0,
"no_cuda": false,
"overwrite_output_dir": true,
"seed": 42,
"fp16": false,
"fp16_opt_level": "O1",
"local_rank": -1
}
57 changes: 57 additions & 0 deletions args/bert_span-alldata_rdrop1.0.json
@@ -0,0 +1,57 @@
{
"version": "alldata_rdrop1.0",
"device": "cuda:0",
"n_gpu": 1,
"task_name": "ner",
"dataset_name": "cail_ner",
"data_dir": "./data/ner-ctx0-train1.0-seed42/",
"train_file": "train.json",
"dev_file": "dev.json",
"test_file": "dev.json",
"model_type": "bert_span",
"model_name_or_path": "/home/louishsu/NewDisk/Garage/weights/transformers/chinese-roberta-wwm",
"output_dir": "output/",
"max_span_length": 30,
"width_embedding_dim": 128,
"optimizer": "adamw",
"augment_context_aware_p": null,
"rdrop_alpha": 1.0,
"scheme": "IOB2",
"loss_type": "ce",
"config_name": "",
"tokenizer_name": "",
"cache_dir": "cache/",
"train_max_seq_length": 512,
"eval_max_seq_length": 512,
"do_train": true,
"do_eval": false,
"do_predict": true,
"evaluate_during_training": false,
"evaluate_each_epoch": true,
"do_lower_case": true,
"do_fgm": false,
"fgm_epsilon": 1.0,
"fgm_name": "word_embeddings",
"per_gpu_train_batch_size": 8,
"per_gpu_eval_batch_size": 8,
"gradient_accumulation_steps": 2,
"learning_rate": 2e-05,
"other_learning_rate": 1e-3,
"weight_decay": 0.01,
"adam_epsilon": 1e-08,
"max_grad_norm": 1.0,
"num_train_epochs": 10.0,
"max_steps": -1,
"warmup_proportion": 0.1,
"logging_steps": 50,
"save_steps": 50,
"save_best_checkpoints": true,
"eval_all_checkpoints": false,
"predict_checkpoints": 0,
"no_cuda": false,
"overwrite_output_dir": true,
"seed": 42,
"fp16": false,
"fp16_opt_level": "O1",
"local_rank": -1
}
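The two configs above differ only in "version", "rdrop_alpha" (null vs. 1.0), and the per-GPU batch sizes. A minimal launch sketch, assuming the repository root as working directory (run_span.py reads the config path from sys.argv[1], as the diff further down shows):

    python3 run_span.py ./args/bert_span-alldata.json
    python3 run_span.py ./args/bert_span-alldata_rdrop1.0.json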
10 changes: 5 additions & 5 deletions args/bert_span-ctx0.1.json → args/bert_span-dev.json
@@ -1,5 +1,5 @@
{
"version": "ctx0.1",
"version": "dev",
"device": "cuda:0",
"n_gpu": 1,
"task_name": "ner",
@@ -14,7 +14,7 @@
"max_span_length": 30,
"width_embedding_dim": 128,
"optimizer": "adamw",
"augment_context_aware_p": 0.1,
"augment_context_aware_p": null,
"scheme": "IOB2",
"loss_type": "ce",
"config_name": "",
@@ -31,11 +31,11 @@
"do_fgm": false,
"fgm_epsilon": 1.0,
"fgm_name": "word_embeddings",
"per_gpu_train_batch_size": 16,
"per_gpu_eval_batch_size": 24,
"per_gpu_train_batch_size": 8,
"per_gpu_eval_batch_size": 8,
"gradient_accumulation_steps": 2,
"learning_rate": 2e-05,
"other_learning_rate": 1e-3,
"other_learning_rate": 5e-4,
"weight_decay": 0.01,
"adam_epsilon": 1e-08,
"max_grad_norm": 1.0,
7 changes: 4 additions & 3 deletions eda.ipynb
@@ -120,7 +120,7 @@
{
"cell_type": "code",
"execution_count": 11,
"id": "9489bc8e",
"id": "09c87de7",
"metadata": {},
"outputs": [
{
@@ -255,8 +255,8 @@
},
{
"cell_type": "code",
"execution_count": 20,
"id": "6387604c",
"execution_count": 21,
"id": "57ff4a99",
"metadata": {},
"outputs": [
{
@@ -290,6 +290,7 @@
"for label, entities in label_entities_map.items():\n",
" print(label, utils.LABEL_MEANING_MAP[label])\n",
" print(Counter([len(entity) for entity in entities]))\n",
" entities = sorted(list(set(entities)), key=len)\n",
" with open(f\"tmp/{label}_{utils.LABEL_MEANING_MAP[label]}.txt\", \"w\") as f:\n",
" f.writelines([entity + \"\\n\" for entity in entities])"
]
6 changes: 5 additions & 1 deletion main.py
@@ -9,14 +9,16 @@

def main():

context_window = 510
json_file = "./args/bert_span-baseline.json"

parser = NerArgumentParser()
args = parser.parse_args_from_json(json_file=json_file)
args.test_file = "test.json"
args.do_train = False
args.do_eval = False
args.do_test = True
args.per_gpu_eval_batch_size = 6
args.per_gpu_eval_batch_size = 1
parser.save_args_to_json("./args/pred.json", args)

version = args.version
@@ -27,7 +29,9 @@ def main():

# upload
raw_samples = utils.load_raw(infile)
raw_samples = utils.add_context(raw_samples, context_window)
utils.save_samples(os.path.join(data_dir, "test.json"), raw_samples)

os.system(f"sudo /home/user/miniconda/bin/python3 run_span.py ./args/pred.json")
os.system(f"sudo cp ./output/ner-{dataset_name}-{model_type}-{version}-{seed}/test_prediction.json {outfile}")

2 changes: 1 addition & 1 deletion prepare_data.py
@@ -15,7 +15,7 @@
help="Data files.")
parser.add_argument("--context_window", default=0, type=int,
help="Size of context window.")
parser.add_argument("--train_split_ratio", default=0.8, type=float,
parser.add_argument("--train_split_ratio", default=1.0, type=float,
help="Size of training data.")
parser.add_argument("--output_dir", type=str, default="./data/")
parser.add_argument("--seed", default=42, type=int,
69 changes: 60 additions & 9 deletions run_span.py
@@ -1,4 +1,5 @@
import os
import re
import sys
import time
import json
@@ -11,6 +12,7 @@

import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import numpy as np
@@ -186,6 +188,39 @@ def forward(
attentions=outputs.attentions,
)

def compute_kl_loss(p, q, pad_mask=None):

p_loss = F.kl_div(F.log_softmax(p, dim=-1), F.softmax(q, dim=-1), reduction='none')
q_loss = F.kl_div(F.log_softmax(q, dim=-1), F.softmax(p, dim=-1), reduction='none')

# pad_mask is for seq-level tasks
if pad_mask is not None:
p_loss.masked_fill_(pad_mask, 0.)
q_loss.masked_fill_(pad_mask, 0.)

    # choose "sum" or "mean" reduction depending on the task
p_loss = p_loss.mean()
q_loss = q_loss.mean()

loss = (p_loss + q_loss) / 2
return loss

def forward_rdrop(cls, alpha, **kwargs):
outputs1 = forward(cls, **kwargs)
if outputs1.loss is None or alpha <= 0.: return outputs1

outputs2 = forward(cls, **kwargs)
rdrop_loss = compute_kl_loss(
outputs1["logits"], outputs2["logits"],
kwargs["span_mask"].unsqueeze(-1) == 0)
total_loss = (outputs1["loss"] + outputs2["loss"]) / 2. + alpha * rdrop_loss
return TokenClassifierOutput(
loss=total_loss,
logits=outputs1["logits"],
hidden_states=outputs1.hidden_states,
attentions=outputs1.attentions,
)

class BertSpanV2ForNer(BertPreTrainedModel):

def __init__(self, config):
@@ -197,8 +232,10 @@ def __init__(self, config):
config.max_span_length, config.width_embedding_dim)
self.init_weights()

def forward(self, *args, **kwargs):
return forward(self, *args, **kwargs)
def forward(self, **kwargs):
if args.rdrop_alpha is not None:
return forward_rdrop(self, args.rdrop_alpha, **kwargs)
return forward(self, **kwargs)

class NeZhaSpanV2ForNer(NeZhaPreTrainedModel):

@@ -211,8 +248,10 @@ def __init__(self, config):
config.max_span_length, config.width_embedding_dim)
self.init_weights()

def forward(self, *args, **kwargs):
return forward(self, *args, **kwargs)
def forward(self, **kwargs):
if args.rdrop_alpha is not None:
return forward_rdrop(self, args.rdrop_alpha, **kwargs)
return forward(self, **kwargs)

class NerArgumentParser(ArgumentParser):

@@ -255,6 +294,7 @@ def build_arguments(self):
self.add_argument("--width_embedding_dim", default=150, type=int)
self.add_argument("--optimizer", default="adamw", type=str)
self.add_argument("--augment_context_aware_p", default=None, type=float)
self.add_argument("--rdrop_alpha", default=None, type=float)

# Other parameters
self.add_argument('--scheme', default='IOB2', type=str,
@@ -461,6 +501,16 @@ def __call__(self, example):
"sent_end": sent_start + len(tokens)
}]

# TODO:
class ReDataMasking:

def __init__(self):
self.nc_reobj = re.compile(r"(现金)?(人民币)?[0-9]+(\.[0-9]+)?余?元(现金)?(人民币)?")

def __call__(self, example):
...


class Example2Feature:

def __init__(self, tokenizer, label2id, max_seq_length, max_span_length):
@@ -749,10 +799,11 @@ def train(args, model, processor, tokenizer):
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
logger.info("Saving optimizer and scheduler states to %s", output_dir)
if args.local_rank in [-1, 0] and not args.save_best_checkpoints and \
args.save_steps > 0 and global_step % args.save_steps == 0:
if args.local_rank in [-1, 0] and \
args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(
global_step if not args.save_best_checkpoints else 999999))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = (
@@ -861,7 +912,7 @@ def predict(args, model, processor, tokenizer, prefix=""):
preds = model.span.decode_batch(logits, batch["spans"], batch["span_mask"])
pred, input_len = preds[0], batch["input_len"][0]
pred = [(id2label[t], b, e) for t, b, e in pred if id2label[t] != "O"]
pred = get_ner_tags(pred, input_len - 2),
pred = get_ner_tags(pred, input_len - 2)
pred = pred[batch["sent_start"][0]: batch["sent_end"][0]]
label_entities_map = {label: [] for label in LABEL_MEANING_MAP.keys()}
for t, b, e in get_entities(pred):
@@ -915,7 +966,7 @@ def load_dataset(args, processor, tokenizer, data_type='train'):
args = parser.parse_args_from_json(json_file=os.path.abspath(sys.argv[1]))
else:
args = parser.build_arguments().parse_args()
# args = parser.parse_args_from_json(json_file="args/bert_span-baseline.json")
# args = parser.parse_args_from_json(json_file="args/bert_span-alldata_rdrop1.0.json")

# Set seed before initializing model.
seed_everything(args.seed)
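For reference, a minimal sketch of the objective that the new forward_rdrop path computes (shapes below are hypothetical; in run_span.py the logits come from two dropout-perturbed forward passes over the same span-classification batch): the two cross-entropy losses are averaged and a symmetric KL term between the two output distributions is added with weight rdrop_alpha.

import torch
import torch.nn.functional as F

# Hypothetical logits from two stochastic forward passes over the same batch,
# shaped (batch, num_spans, num_labels) to mirror the span classifier.
p = torch.randn(2, 4, 8)
q = torch.randn(2, 4, 8)

# Symmetric KL, matching compute_kl_loss in the diff above.
p_loss = F.kl_div(F.log_softmax(p, dim=-1), F.softmax(q, dim=-1), reduction="none").mean()
q_loss = F.kl_div(F.log_softmax(q, dim=-1), F.softmax(p, dim=-1), reduction="none").mean()
kl = (p_loss + q_loss) / 2

ce1, ce2, alpha = 1.20, 1.35, 1.0           # illustrative cross-entropy values and rdrop_alpha
total_loss = (ce1 + ce2) / 2 + alpha * kl   # the quantity forward_rdrop returns as loss

With rdrop_alpha left at null (the bert_span-alldata.json run), the rdrop branch is skipped and the plain forward is used.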
