-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 2237ecc
Showing
11 changed files
with
30,620 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# MeasNER : Bio NER for Counts and Measurements | ||
|
||
### Installation | ||
|
||
``` | ||
pip install XXXXX | ||
``` |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
# This is an auto-generated partial config. To use it with 'spacy train' | ||
# you can run spacy init fill-config to auto-fill all default settings: | ||
# python -m spacy init fill-config ./base_config.cfg ./config.cfg | ||
[paths] | ||
train = null | ||
dev = null | ||
|
||
[system] | ||
gpu_allocator = "pytorch" | ||
|
||
[nlp] | ||
lang = "en" | ||
pipeline = ["transformer","ner"] | ||
batch_size = 128 | ||
|
||
[components] | ||
|
||
[components.transformer] | ||
factory = "transformer" | ||
|
||
[components.transformer.model] | ||
@architectures = "spacy-transformers.TransformerModel.v1" | ||
name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract" | ||
tokenizer_config = {"use_fast": true} | ||
|
||
[components.transformer.model.get_spans] | ||
@span_getters = "spacy-transformers.strided_spans.v1" | ||
window = 128 | ||
stride = 96 | ||
|
||
[components.ner] | ||
factory = "ner" | ||
|
||
[components.ner.model] | ||
@architectures = "spacy.TransitionBasedParser.v2" | ||
state_type = "ner" | ||
extra_state_tokens = false | ||
hidden_width = 64 | ||
maxout_pieces = 2 | ||
use_upper = false | ||
nO = null | ||
|
||
[components.ner.model.tok2vec] | ||
@architectures = "spacy-transformers.TransformerListener.v1" | ||
grad_factor = 1.0 | ||
|
||
[components.ner.model.tok2vec.pooling] | ||
@layers = "reduce_mean.v1" | ||
|
||
[corpora] | ||
|
||
[corpora.train] | ||
@readers = "spacy.Corpus.v1" | ||
path = ${paths.train} | ||
max_length = 500 | ||
|
||
[corpora.dev] | ||
@readers = "spacy.Corpus.v1" | ||
path = ${paths.dev} | ||
max_length = 0 | ||
|
||
[training] | ||
accumulate_gradient = 3 | ||
dev_corpus = "corpora.dev" | ||
train_corpus = "corpora.train" | ||
|
||
[training.optimizer] | ||
@optimizers = "Adam.v1" | ||
|
||
[training.optimizer.learn_rate] | ||
@schedules = "warmup_linear.v1" | ||
warmup_steps = 250 | ||
total_steps = 20000 | ||
initial_rate = 5e-5 | ||
|
||
[training.batcher] | ||
@batchers = "spacy.batch_by_padded.v1" | ||
discard_oversize = true | ||
size = 2000 | ||
buffer = 256 | ||
|
||
[initialize] | ||
vectors = null |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
[paths] | ||
train = null | ||
dev = null | ||
vectors = null | ||
init_tok2vec = null | ||
|
||
[system] | ||
gpu_allocator = "pytorch" | ||
seed = 0 | ||
|
||
[nlp] | ||
lang = "en" | ||
pipeline = ["transformer","ner"] | ||
batch_size = 128 | ||
disabled = [] | ||
before_creation = null | ||
after_creation = null | ||
after_pipeline_creation = null | ||
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} | ||
|
||
[components] | ||
|
||
[components.ner] | ||
factory = "ner" | ||
moves = null | ||
update_with_oracle_cut_size = 100 | ||
|
||
[components.ner.model] | ||
@architectures = "spacy.TransitionBasedParser.v2" | ||
state_type = "ner" | ||
extra_state_tokens = false | ||
hidden_width = 64 | ||
maxout_pieces = 2 | ||
use_upper = false | ||
nO = null | ||
|
||
[components.ner.model.tok2vec] | ||
@architectures = "spacy-transformers.TransformerListener.v1" | ||
grad_factor = 1.0 | ||
pooling = {"@layers":"reduce_mean.v1"} | ||
upstream = "*" | ||
|
||
[components.transformer] | ||
factory = "transformer" | ||
max_batch_items = 4096 | ||
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} | ||
|
||
[components.transformer.model] | ||
@architectures = "spacy-transformers.TransformerModel.v1" | ||
name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract" | ||
|
||
[components.transformer.model.get_spans] | ||
@span_getters = "spacy-transformers.strided_spans.v1" | ||
window = 128 | ||
stride = 96 | ||
|
||
[components.transformer.model.tokenizer_config] | ||
use_fast = true | ||
|
||
[corpora] | ||
|
||
[corpora.dev] | ||
@readers = "spacy.Corpus.v1" | ||
path = ${paths.dev} | ||
max_length = 0 | ||
gold_preproc = false | ||
limit = 0 | ||
augmenter = null | ||
|
||
[corpora.train] | ||
@readers = "spacy.Corpus.v1" | ||
path = ${paths.train} | ||
max_length = 500 | ||
gold_preproc = false | ||
limit = 0 | ||
augmenter = null | ||
|
||
[training] | ||
accumulate_gradient = 3 | ||
dev_corpus = "corpora.dev" | ||
train_corpus = "corpora.train" | ||
seed = ${system.seed} | ||
gpu_allocator = ${system.gpu_allocator} | ||
dropout = 0.1 | ||
patience = 1600 | ||
max_epochs = 0 | ||
max_steps = 20000 | ||
eval_frequency = 200 | ||
frozen_components = [] | ||
before_to_disk = null | ||
|
||
[training.batcher] | ||
@batchers = "spacy.batch_by_padded.v1" | ||
discard_oversize = true | ||
size = 2000 | ||
buffer = 256 | ||
get_length = null | ||
|
||
[training.logger] | ||
@loggers = "spacy.ConsoleLogger.v1" | ||
progress_bar = false | ||
|
||
[training.optimizer] | ||
@optimizers = "Adam.v1" | ||
beta1 = 0.9 | ||
beta2 = 0.999 | ||
L2_is_weight_decay = true | ||
L2 = 0.01 | ||
grad_clip = 1.0 | ||
use_averages = false | ||
eps = 0.00000001 | ||
|
||
[training.optimizer.learn_rate] | ||
@schedules = "warmup_linear.v1" | ||
warmup_steps = 250 | ||
total_steps = 20000 | ||
initial_rate = 0.00005 | ||
|
||
[training.score_weights] | ||
ents_per_type = null | ||
ents_f = 1.0 | ||
ents_p = 0.0 | ||
ents_r = 0.0 | ||
|
||
[pretraining] | ||
|
||
[initialize] | ||
vectors = null | ||
init_tok2vec = ${paths.init_tok2vec} | ||
vocab_data = null | ||
lookups = null | ||
before_init = null | ||
after_init = null | ||
|
||
[initialize.components] | ||
|
||
[initialize.tokenizer] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
title: "MeasNER : Bio NER for Counts and Measurements" | ||
description: "NER trained with BiomedNLP-PubMedBERT-base-uncased-abstract on MeasEval data for Counts and Measurements" | ||
# Variables can be referenced across the project.yml using ${vars.var_name} | ||
vars: | ||
name: "measner" | ||
lang: "en" | ||
train: "train.json" | ||
dev: "dev.json" | ||
test: "test.json" | ||
version: "0.1" | ||
# Set your GPU ID, -1 is CPU | ||
gpu_id: 0 | ||
|
||
# These are the directories that the project needs. The project CLI will make | ||
# sure that they always exist. | ||
directories: ["assets", "corpus", "configs", "training", "scripts", "packages"] | ||
|
||
# Assets that should be downloaded or available in the directory. We're shipping | ||
# them with the project, so they won't have to be downloaded. | ||
assets: | ||
- dest: "assets/train.json" | ||
description: "Train Data" | ||
- dest: "assets/dev.json" | ||
description: "Eval Data" | ||
|
||
# Workflows are sequences of commands (see below) executed in order. You can | ||
# run them via "spacy project run [workflow]". If a command's inputs/outputs | ||
# haven't changed, it won't be re-run. | ||
workflows: | ||
all: | ||
- convert | ||
- create-config | ||
- train | ||
- evaluate | ||
train-measner: | ||
- convert | ||
- create-config | ||
- train | ||
- evaluate | ||
- package | ||
|
||
# Project commands, specified in a style similar to CI config files (e.g. Azure | ||
# pipelines). The name is the command name that lets you trigger the command | ||
# via "spacy project run [command] [path]". The help message is optional and | ||
# shown when executing "spacy project run [optional command] [path] --help". | ||
commands: | ||
- name: "convert" | ||
help: "Convert the data to spaCy's binary format" | ||
script: | ||
- "python scripts/convert.py ${vars.lang} assets/${vars.train} corpus/train.spacy" | ||
- "python scripts/convert.py ${vars.lang} assets/${vars.dev} corpus/dev.spacy" | ||
- "python scripts/convert.py ${vars.lang} assets/${vars.test} corpus/test.spacy" | ||
deps: | ||
- "assets/${vars.train}" | ||
- "assets/${vars.dev}" | ||
- "assets/${vars.test}" | ||
- "scripts/convert.py" | ||
outputs: | ||
- "corpus/train.spacy" | ||
- "corpus/dev.spacy" | ||
- "corpus/test.spacy" | ||
|
||
- name: "create-config" | ||
help: "Create a new config with an NER pipeline component" | ||
script: | ||
- "python -m spacy init fill-config configs/base_config.cfg configs/config.cfg" | ||
outputs: | ||
- "configs/config.cfg" | ||
|
||
- name: "train" | ||
help: "Train the NER model" | ||
script: | ||
- "python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --training.eval_frequency 10 --training.patience 50 --gpu-id ${vars.gpu_id}" | ||
deps: | ||
- "configs/config.cfg" | ||
- "corpus/train.spacy" | ||
- "corpus/dev.spacy" | ||
outputs: | ||
- "training/model-best" | ||
|
||
- name: "evaluate" | ||
help: "Evaluate the model and export metrics" | ||
script: | ||
- "python -m spacy evaluate training/model-best corpus/dev.spacy --output training/metrics.json" | ||
deps: | ||
- "corpus/dev.spacy" | ||
- "training/model-best" | ||
outputs: | ||
- "training/metrics.json" | ||
|
||
- name: package | ||
help: "Package the trained model as a pip package" | ||
script: | ||
- "python -m spacy package training/model-best packages --name ${vars.name} --version ${vars.version} --code measner.py --force" | ||
deps: | ||
- "training/model-best" | ||
outputs_no_cache: | ||
- "packages/${vars.lang}_${vars.name}-${vars.version}/dist/${vars.lang}_${vars.name}-${vars.version}.tar.gz" | ||
|
||
- name: visualize-model | ||
help: Visualize the model's output interactively using Streamlit | ||
script: | ||
- "streamlit run scripts/visualize_model.py training/model-best \"I saw Shaka Khan in London.\"" | ||
deps: | ||
- "scripts/visualize_model.py" | ||
- "training/model-best" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
spacy | ||
spacy_transformers | ||
sentencepiece | ||
protobuf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
"""Logging setup: a shared ``LOGGER`` that writes to a log file and the console."""
import logging

# Obtain the logger through the logging manager instead of instantiating
# ``logging.Logger`` directly, so ``logging.getLogger("denomme")`` anywhere
# else in the process returns this same, configured object.
LOGGER = logging.getLogger("denomme")
# A directly constructed Logger has no parent and so emits records of every
# standard level; keep that verbosity explicitly rather than inheriting the
# root logger's WARNING threshold.
LOGGER.setLevel(logging.DEBUG)
# Likewise keep records away from the root logger's handlers, so an
# application that configures root logging does not see each message twice.
LOGGER.propagate = False

# File output goes to "denomme.log" (relative to the working directory);
# console output uses StreamHandler's default stream.
file_handler = logging.FileHandler("denomme.log")
stream_handler = logging.StreamHandler()

stream_formatter = logging.Formatter("%(asctime)-15s %(levelname)-8s %(message)s")
# Built from adjacent literals instead of a backslash continuation inside the
# string, which would silently embed the next line's indentation in the format.
file_formatter = logging.Formatter(
    "{'time':'%(asctime)s', 'name': '%(name)s', "
    "'level': '%(levelname)s', 'message': '%(message)s'}"
)

file_handler.setFormatter(file_formatter)
stream_handler.setFormatter(stream_formatter)

# The logger is now a process-wide singleton, so attach the handlers only
# once even if this module is executed again (e.g. via importlib.reload).
if not LOGGER.handlers:
    LOGGER.addHandler(file_handler)
    LOGGER.addHandler(stream_handler)
Oops, something went wrong.