Commit

Add spaCy config for training ents
meghanabhange committed Jun 3, 2021
0 parents commit 2237ecc
Showing 11 changed files with 30,620 additions and 0 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -0,0 +1,7 @@
# MeasNER : Bio NER for Counts and Measurements

### Installation

```
pip install XXXXX
```
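
Once the packaged pipeline is installed, it can be loaded like any other spaCy package. A minimal usage sketch, assuming the package built by the project's `package` command installs as `en_measner` (derived from the `lang` and `name` vars in project.yml) and using a purely illustrative sentence:

```
import spacy

# Load the packaged pipeline (name assumed from project.yml vars: lang + name).
nlp = spacy.load("en_measner")

# Run the trained NER component over a biomedical-style sentence.
doc = nlp("Patients received 5 mg of the compound twice daily for 14 days.")
for ent in doc.ents:
    print(ent.text, ent.label_)
```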
9,144 changes: 9,144 additions & 0 deletions assets/dev.json

Large diffs are not rendered by default.

5,170 changes: 5,170 additions & 0 deletions assets/test.json

Large diffs are not rendered by default.

15,828 changes: 15,828 additions & 0 deletions assets/train.json

Large diffs are not rendered by default.

83 changes: 83 additions & 0 deletions configs/base_config.cfg
@@ -0,0 +1,83 @@
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
[paths]
train = null
dev = null

[system]
gpu_allocator = "pytorch"

[nlp]
lang = "en"
pipeline = ["transformer","ner"]
batch_size = 128

[components]

[components.transformer]
factory = "transformer"

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
tokenizer_config = {"use_fast": true}

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 500

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 5e-5

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256

[initialize]
vectors = null
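
As the comment at the top notes, this partial config must be filled with spaCy's defaults before training. A quick sketch of the standard spaCy v3 CLI steps for filling and then sanity-checking the resulting config, using the corpus paths this project produces elsewhere:

```
# Fill in all unspecified defaults to produce the full config.
python -m spacy init fill-config configs/base_config.cfg configs/config.cfg

# Validate the filled config and resolve the data paths before a full run.
python -m spacy debug config configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy
```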
137 changes: 137 additions & 0 deletions configs/config.cfg
@@ -0,0 +1,137 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = "pytorch"
seed = 0

[nlp]
lang = "en"
pipeline = ["transformer","ner"]
batch_size = 128
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.ner]
factory = "ner"
moves = null
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
pooling = {"@layers":"reduce_mean.v1"}
upstream = "*"

[components.transformer]
factory = "transformer"
max_batch_items = 4096
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.transformer.model.tokenizer_config]
use_fast = true

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 500
gold_preproc = false
limit = 0
augmenter = null

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256
get_length = null

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.00005

[training.score_weights]
ents_per_type = null
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0

[pretraining]

[initialize]
vectors = null
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
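
Values such as `${paths.train}` and `${system.seed}` are interpolated by spaCy when the config is loaded. A small sketch of inspecting the filled config programmatically; the override values shown are simply the corpus paths this project uses:

```
from spacy.util import load_config

# Load the config, overriding the null paths and interpolating ${...} references.
config = load_config(
    "configs/config.cfg",
    overrides={"paths.train": "corpus/train.spacy", "paths.dev": "corpus/dev.spacy"},
    interpolate=True,
)
print(config["nlp"]["pipeline"])           # ['transformer', 'ner']
print(config["corpora"]["train"]["path"])  # corpus/train.spacy
```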
106 changes: 106 additions & 0 deletions project.yml
@@ -0,0 +1,106 @@
title: "MeasNER : Bio NER for Counts and Measurements"
description: "NER trained with BiomedNLP-PubMedBERT-base-uncased-abstract on MeasEval data for Counts and Measurements"
# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  name: "measner"
  lang: "en"
  train: "train.json"
  dev: "dev.json"
  test: "test.json"
  version: "0.1"
  # Set your GPU ID, -1 is CPU
  gpu_id: 0

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories: ["assets", "corpus", "configs", "training", "scripts", "packages"]

# Assets that should be downloaded or available in the directory. We're shipping
# them with the project, so they won't have to be downloaded.
assets:
  - dest: "assets/train.json"
    description: "Train Data"
  - dest: "assets/dev.json"
    description: "Eval Data"

# Workflows are sequences of commands (see below) executed in order. You can
# run them via "spacy project run [workflow]". If a command's inputs/outputs
# haven't changed, it won't be re-run.
workflows:
  all:
    - convert
    - create-config
    - train
    - evaluate
  train-measner:
    - convert
    - create-config
    - train
    - evaluate
    - package

# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  - name: "convert"
    help: "Convert the data to spaCy's binary format"
    script:
      - "python scripts/convert.py ${vars.lang} assets/${vars.train} corpus/train.spacy"
      - "python scripts/convert.py ${vars.lang} assets/${vars.dev} corpus/dev.spacy"
      - "python scripts/convert.py ${vars.lang} assets/${vars.test} corpus/test.spacy"
    deps:
      - "assets/${vars.train}"
      - "assets/${vars.dev}"
      - "assets/${vars.test}"
      - "scripts/convert.py"
    outputs:
      - "corpus/train.spacy"
      - "corpus/dev.spacy"
      - "corpus/test.spacy"

  - name: "create-config"
    help: "Create a new config with an NER pipeline component"
    script:
      - "python -m spacy init fill-config configs/base_config.cfg configs/config.cfg"
    outputs:
      - "configs/config.cfg"

  - name: "train"
    help: "Train the NER model"
    script:
      - "python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --training.eval_frequency 10 --training.patience 50 --gpu-id ${vars.gpu_id}"
    deps:
      - "configs/config.cfg"
      - "corpus/train.spacy"
      - "corpus/dev.spacy"
    outputs:
      - "training/model-best"

  - name: "evaluate"
    help: "Evaluate the model and export metrics"
    script:
      - "python -m spacy evaluate training/model-best corpus/dev.spacy --output training/metrics.json"
    deps:
      - "corpus/dev.spacy"
      - "training/model-best"
    outputs:
      - "training/metrics.json"

  - name: package
    help: "Package the trained model as a pip package"
    script:
      - "python -m spacy package training/model-best packages --name ${vars.name} --version ${vars.version} --code measner.py --force"
    deps:
      - "training/model-best"
    outputs_no_cache:
      - "packages/${vars.lang}_${vars.name}-${vars.version}/dist/${vars.lang}_${vars.name}-${vars.version}.tar.gz"

  - name: visualize-model
    help: Visualize the model's output interactively using Streamlit
    script:
      - "streamlit run scripts/visualize_model.py training/model-best \"I saw Shaka Khan in London.\""
    deps:
      - "scripts/visualize_model.py"
      - "training/model-best"
4 changes: 4 additions & 0 deletions requirements.txt
@@ -0,0 +1,4 @@
spacy
spacy_transformers
sentencepiece
protobuf
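
With these requirements installed, the whole pipeline can be driven through the spaCy project CLI using the workflows defined in project.yml above, for example:

```
pip install -r requirements.txt

# Convert data, create the config, train, and evaluate.
python -m spacy project run all

# Same steps plus packaging of the trained model.
python -m spacy project run train-measner
```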
17 changes: 17 additions & 0 deletions scripts/__init__.py
@@ -0,0 +1,17 @@
import logging

# Project-wide logger: writes JSON-style records to denomme.log and
# human-readable records to the console.
LOGGER = logging.Logger("denomme")
file_handler = logging.FileHandler("denomme.log")
stream_handler = logging.StreamHandler()

stream_formatter = logging.Formatter("%(asctime)-15s %(levelname)-8s %(message)s")
file_formatter = logging.Formatter(
    "{'time':'%(asctime)s', 'name': '%(name)s', "
    "'level': '%(levelname)s', 'message': '%(message)s'}"
)

file_handler.setFormatter(file_formatter)
stream_handler.setFormatter(stream_formatter)

LOGGER.addHandler(file_handler)
LOGGER.addHandler(stream_handler)
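
A minimal usage sketch of this shared logger from another script in the package (the message is illustrative):

```
from scripts import LOGGER

# Messages go to both denomme.log (JSON-ish format) and the console.
LOGGER.info("Starting corpus conversion")
```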