Commit

Add spaCy config for training ents
meghanabhange committed Jun 3, 2021
0 parents commit 2237ecc
Showing 11 changed files with 30,620 additions and 0 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -0,0 +1,7 @@
# MeasNER : Bio NER for Counts and Measurements

### Installation

```
pip install XXXXX
```
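
Once the packaged pipeline is installed, it can be loaded like any other spaCy package. A minimal usage sketch, assuming the package built by the project's `package` command installs as `en_measner` (derived from the `lang` and `name` vars in project.yml) and using a purely illustrative sentence:

```
import spacy

# Load the packaged pipeline (name assumed from project.yml vars: lang + name).
nlp = spacy.load("en_measner")

# Run the trained NER component over a biomedical-style sentence.
doc = nlp("Patients received 5 mg of the compound twice daily for 14 days.")
for ent in doc.ents:
    print(ent.text, ent.label_)
```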
9,144 changes: 9,144 additions & 0 deletions assets/dev.json

Large diffs are not rendered by default.

5,170 changes: 5,170 additions & 0 deletions assets/test.json

Large diffs are not rendered by default.

15,828 changes: 15,828 additions & 0 deletions assets/train.json

Large diffs are not rendered by default.

83 changes: 83 additions & 0 deletions configs/base_config.cfg
@@ -0,0 +1,83 @@
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
[paths]
train = null
dev = null

[system]
gpu_allocator = "pytorch"

[nlp]
lang = "en"
pipeline = ["transformer","ner"]
batch_size = 128

[components]

[components.transformer]
factory = "transformer"

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
tokenizer_config = {"use_fast": true}

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 500

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 5e-5

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256

[initialize]
vectors = null
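
As the comment at the top notes, this partial config must be filled with spaCy's defaults before training. A quick sketch of the standard spaCy v3 CLI steps for filling and then sanity-checking the resulting config, using the corpus paths this project produces elsewhere:

```
# Fill in all unspecified defaults to produce the full config.
python -m spacy init fill-config configs/base_config.cfg configs/config.cfg

# Validate the filled config and resolve the data paths before a full run.
python -m spacy debug config configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy
```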
137 changes: 137 additions & 0 deletions configs/config.cfg
@@ -0,0 +1,137 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = "pytorch"
seed = 0

[nlp]
lang = "en"
pipeline = ["transformer","ner"]
batch_size = 128
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.ner]
factory = "ner"
moves = null
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
pooling = {"@layers":"reduce_mean.v1"}
upstream = "*"

[components.transformer]
factory = "transformer"
max_batch_items = 4096
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.transformer.model.tokenizer_config]
use_fast = true

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 500
gold_preproc = false
limit = 0
augmenter = null

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256
get_length = null

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.00005

[training.score_weights]
ents_per_type = null
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0

[pretraining]

[initialize]
vectors = null
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
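
Values such as `${paths.train}` and `${system.seed}` are interpolated by spaCy when the config is loaded. A small sketch of inspecting the filled config programmatically; the override values shown are simply the corpus paths this project uses:

```
from spacy.util import load_config

# Load the config, overriding the null paths and interpolating ${...} references.
config = load_config(
    "configs/config.cfg",
    overrides={"paths.train": "corpus/train.spacy", "paths.dev": "corpus/dev.spacy"},
    interpolate=True,
)
print(config["nlp"]["pipeline"])           # ['transformer', 'ner']
print(config["corpora"]["train"]["path"])  # corpus/train.spacy
```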
106 changes: 106 additions & 0 deletions project.yml
@@ -0,0 +1,106 @@
title: "MeasNER : Bio NER for Counts and Measurements"
description: "NER trained with BiomedNLP-PubMedBERT-base-uncased-abstract on MeasEval data for Counts and Measurements"
# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  name: "measner"
  lang: "en"
  train: "train.json"
  dev: "dev.json"
  test: "test.json"
  version: "0.1"
  # Set your GPU ID, -1 is CPU
  gpu_id: 0

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories: ["assets", "corpus", "configs", "training", "scripts", "packages"]

# Assets that should be downloaded or available in the directory. We're shipping
# them with the project, so they won't have to be downloaded.
assets:
  - dest: "assets/train.json"
    description: "Train Data"
  - dest: "assets/dev.json"
    description: "Eval Data"

# Workflows are sequences of commands (see below) executed in order. You can
# run them via "spacy project run [workflow]". If a command's inputs/outputs
# haven't changed, it won't be re-run.
workflows:
  all:
    - convert
    - create-config
    - train
    - evaluate
  train-measner:
    - convert
    - create-config
    - train
    - evaluate
    - package

# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  - name: "convert"
    help: "Convert the data to spaCy's binary format"
    script:
      - "python scripts/convert.py ${vars.lang} assets/${vars.train} corpus/train.spacy"
      - "python scripts/convert.py ${vars.lang} assets/${vars.dev} corpus/dev.spacy"
      - "python scripts/convert.py ${vars.lang} assets/${vars.test} corpus/test.spacy"
    deps:
      - "assets/${vars.train}"
      - "assets/${vars.dev}"
      - "assets/${vars.test}"
      - "scripts/convert.py"
    outputs:
      - "corpus/train.spacy"
      - "corpus/dev.spacy"
      - "corpus/test.spacy"

  - name: "create-config"
    help: "Create a new config with an NER pipeline component"
    script:
      - "python -m spacy init fill-config configs/base_config.cfg configs/config.cfg"
    outputs:
      - "configs/config.cfg"

  - name: "train"
    help: "Train the NER model"
    script:
      - "python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --training.eval_frequency 10 --training.patience 50 --gpu-id ${vars.gpu_id}"
    deps:
      - "configs/config.cfg"
      - "corpus/train.spacy"
      - "corpus/dev.spacy"
    outputs:
      - "training/model-best"

  - name: "evaluate"
    help: "Evaluate the model and export metrics"
    script:
      - "python -m spacy evaluate training/model-best corpus/dev.spacy --output training/metrics.json"
    deps:
      - "corpus/dev.spacy"
      - "training/model-best"
    outputs:
      - "training/metrics.json"

  - name: package
    help: "Package the trained model as a pip package"
    script:
      - "python -m spacy package training/model-best packages --name ${vars.name} --version ${vars.version} --code measner.py --force"
    deps:
      - "training/model-best"
    outputs_no_cache:
      - "packages/${vars.lang}_${vars.name}-${vars.version}/dist/${vars.lang}_${vars.name}-${vars.version}.tar.gz"

  - name: visualize-model
    help: Visualize the model's output interactively using Streamlit
    script:
      - "streamlit run scripts/visualize_model.py training/model-best \"I saw Shaka Khan in London.\""
    deps:
      - "scripts/visualize_model.py"
      - "training/model-best"
4 changes: 4 additions & 0 deletions requirements.txt
@@ -0,0 +1,4 @@
spacy
spacy_transformers
sentencepiece
protobuf
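
With these requirements installed, the whole pipeline can be driven through the spaCy project CLI using the workflows defined in project.yml above, for example:

```
pip install -r requirements.txt

# Convert data, create the config, train, and evaluate.
python -m spacy project run all

# Same steps plus packaging of the trained model.
python -m spacy project run train-measner
```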
17 changes: 17 additions & 0 deletions scripts/__init__.py
@@ -0,0 +1,17 @@
import logging

# Project-wide logger: writes JSON-style records to denomme.log and
# human-readable records to the console.
LOGGER = logging.Logger("denomme")
file_handler = logging.FileHandler("denomme.log")
stream_handler = logging.StreamHandler()

stream_formatter = logging.Formatter("%(asctime)-15s %(levelname)-8s %(message)s")
file_formatter = logging.Formatter(
    "{'time':'%(asctime)s', 'name': '%(name)s', "
    "'level': '%(levelname)s', 'message': '%(message)s'}"
)

file_handler.setFormatter(file_formatter)
stream_handler.setFormatter(stream_formatter)

LOGGER.addHandler(file_handler)
LOGGER.addHandler(stream_handler)
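
A minimal usage sketch of this shared logger from another script in the package (the message is illustrative):

```
from scripts import LOGGER

# Messages go to both denomme.log (JSON-ish format) and the console.
LOGGER.info("Starting corpus conversion")
```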