Fix broken streamlit (#392)
* fix: rename pubmed bert everywhere

* build: pin python to <3.10

* fix: use custom load archive to fix model naming issue

* build: try bumping poetry build action

* build: allow macos failures in ci
JohnGiorgi authored May 27, 2024
1 parent a77bdd4 commit 3f15c67
Showing 11 changed files with 3,531 additions and 3,407 deletions.
11 changes: 8 additions & 3 deletions .github/workflows/ci.yml
@@ -11,12 +11,17 @@ on:

jobs:
  build:

    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest]
        python-version: [3.8, 3.9]
        include:
          - os: macos-latest
            allow-failure: true
          - os: ubuntu-latest
            allow-failure: false

    steps:
      - uses: actions/checkout@v3
@@ -25,7 +30,7 @@ jobs:
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Poetry
-        uses: abatilo/actions-poetry@v2.1.5
+        uses: abatilo/actions-poetry@v3.0.0
      - name: Install dependencies with Poetry
        run: |
          poetry install
@@ -52,4 +57,4 @@ jobs:
          # very reliable but we don't want to report a failure
          # in the github UI just because the coverage report failed to
          # be published.
          fail_ci_if_error: false
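
The new allow-failure matrix variable is not a built-in Actions key: it only takes effect where the workflow consumes it, presumably via continue-on-error on the job or on the failure-prone steps (the consuming line falls outside the visible hunks). A minimal sketch of the assumed wiring, with a hypothetical test step:

      - name: Run tests
        run: poetry run pytest
        continue-on-error: ${{ matrix.allow-failure }}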
6,800 changes: 3,406 additions & 3,394 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -36,7 +36,7 @@ classifiers = [
exclude = ["tests", "test_fixtures", "training_config"]

[tool.poetry.dependencies]
-python = "^3.8"
+python = "^3.8,<3.10"
typer = { extras = ["all"], version = "^0.4.0" }
validators = "^0.20.0"
more-itertools = "^8.10.0"
@@ -58,6 +58,7 @@ codecov = "^2.1.12"
# Required to run the demo. Streamlit installs all dev dependencies, so we stick them here.
streamlit = "^1.12.0"
pyvis = "^0.2.1"
altair = "^4.0.0"

# This configuration is adapted from: https://github.com/allenai/allennlp/blob/main/pyproject.toml
[tool.black]
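Poetry's caret constraint alone would allow any Python below 4.0; the added upper bound is what actually excludes 3.10. Concretely, "^3.8,<3.10" resolves to the PEP 440 range >=3.8,<3.10, which can be sanity-checked with the packaging library (an illustration, not part of this commit):

from packaging.specifiers import SpecifierSet

spec = SpecifierSet(">=3.8,<3.10")  # what Poetry's "^3.8,<3.10" resolves to
print("3.9.18" in spec)   # True: 3.9.x still satisfies the pin
print("3.10.0" in spec)   # False: 3.10+ is excluded, per the commit message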
110 changes: 108 additions & 2 deletions seq2rel/seq2rel.py
@@ -1,16 +1,24 @@
import logging
import os
+import shutil
+from os import PathLike
from pathlib import Path
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

import torch
from allennlp.common import util as common_util
+from allennlp.common.file_utils import cached_path
-from allennlp.models.archival import load_archive
+from allennlp.common.meta import Meta
+from allennlp.common.params import Params
+from allennlp.models import archival
from allennlp.predictors import Predictor
from more_itertools import chunked
from validators.url import url

from seq2rel.common.util import sanitize_text

logger = logging.getLogger(__name__)

PRETRAINED_MODELS = {
    "cdr": "https://github.com/JohnGiorgi/seq2rel/releases/download/pretrained-models/cdr.tar.gz",
    "cdr_hints": "https://github.com/JohnGiorgi/seq2rel/releases/download/pretrained-models/cdr_hints.tar.gz",
@@ -21,6 +29,103 @@
"docred": "https://github.com/JohnGiorgi/seq2rel/releases/download/pretrained-models/docred.tar.gz",
}

# Needed strictly to rename
# microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext to
# microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext;
# otherwise identical to allennlp.models.archival.load_archive.
def load_archive(
    archive_file: Union[str, PathLike],
    cuda_device: int = -1,
    overrides: Union[str, Dict[str, Any]] = "",
    weights_file: Optional[str] = None,
) -> archival.Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    # Parameters

    archive_file : `Union[str, PathLike]`
        The archive file to load the model from.
    cuda_device : `int`, optional (default = `-1`)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides : `Union[str, Dict[str, Any]]`, optional (default = `""`)
        JSON overrides to apply to the unarchived `Params` object.
    weights_file : `str`, optional (default = `None`)
        The weights file to use. If unspecified, weights.th in the archive_file will be used.
    """
    # Redirect to the cache, if necessary.
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}")

    meta: Optional[Meta] = None

    tempdir = None
    try:
        if os.path.isdir(resolved_archive_file):
            serialization_dir = resolved_archive_file
        else:
            with archival.extracted_archive(resolved_archive_file, cleanup=False) as tempdir:
                serialization_dir = tempdir

        if weights_file:
            weights_path = weights_file
        else:
            weights_path = archival.get_weights_path(serialization_dir)

        # Load config.
        config = Params.from_file(os.path.join(serialization_dir, archival.CONFIG_NAME), overrides)

        # Rename microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext to
        # microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext.
        def rename_pretrained_model(config: Dict[str, Any]) -> None:
            # Walk the (nested) config dict, replacing the old model name in
            # place. Note that values nested inside lists are not traversed.
            for key, value in config.items():
                if isinstance(value, dict):
                    rename_pretrained_model(value)
                elif value == "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext":
                    config[key] = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"

        config = config.as_dict()
        rename_pretrained_model(config)
        config = Params(config)
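        # Editorial illustration (not part of this commit): after the walk, a
        # config such as
        #     {"model": {"source_embedder": {"model_name":
        #         "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"}}}
        # carries the BiomedBERT name instead, at any nesting depth; the exact
        # key path here is hypothetical.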

        # Instantiate model and dataset readers. Use a duplicate of the config, as it will get consumed.
        dataset_reader, validation_dataset_reader = archival._load_dataset_readers(
            config.duplicate(), serialization_dir
        )
        model = archival._load_model(
            config.duplicate(), weights_path, serialization_dir, cuda_device
        )

        # Load meta.
        meta_path = os.path.join(serialization_dir, archival.META_NAME)
        if os.path.exists(meta_path):
            meta = Meta.from_path(meta_path)
    finally:
        if tempdir is not None:
            logger.info(f"removing temporary unarchived model dir at {tempdir}")
            shutil.rmtree(tempdir, ignore_errors=True)

    # Check version compatibility.
    if meta is not None:
        archival._check_version_compatibility(archive_file, meta)

    return archival.Archive(
        model=model,
        config=config,
        dataset_reader=dataset_reader,
        validation_dataset_reader=validation_dataset_reader,
        meta=meta,
    )
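
# Editorial usage sketch (not part of this commit): the patched loader is a
# drop-in replacement for AllenNLP's, so an archive whose config still
# references the old PubMedBERT name loads cleanly:
#
#     archive = load_archive(PRETRAINED_MODELS["cdr"], cuda_device=-1)
#     predictor = Predictor.from_archive(archive)
#
# Predictor.from_archive is the standard AllenNLP entry point; the specific
# predictor registered for seq2rel is not shown in this diff.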


class Seq2Rel:
    """A simple interface to the model for the purposes of extracting entities and relations from text.
@@ -82,6 +187,7 @@ def __init__(self, pretrained_model_name_or_path: str, **kwargs: Any) -> None:
        overrides = {
            "model.source_embedder.token_embedders.tokens.load_weights": False,
        }

        # Allow user to update these with kwargs.
        if "overrides" in kwargs:
            overrides.update(kwargs.pop("overrides"))
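The hunk above hard-codes one default override, load_weights: False, presumably to skip fetching the transformer's original pretrained weights, since the archive's fine-tuned weights are loaded over them anyway; any user-supplied overrides are then merged on top via kwargs. Downstream usage is unchanged by this commit; a sketch, assuming the callable interface from the project README and a hypothetical override key:

from seq2rel import Seq2Rel

# "cdr" resolves to the pretrained-model URL in PRETRAINED_MODELS above.
seq2rel = Seq2Rel("cdr", overrides={"dataset_reader.max_length": 512})  # hypothetical key
print(seq2rel("Famotidine-associated delirium."))  # illustrative input text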
2 changes: 1 addition & 1 deletion test_fixtures/experiment.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 16; // Max length of input text
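The same one-line rename is applied to every training config below, because the checkpoint was renamed on the Hugging Face Hub from PubMedBERT to BiomedBERT (the commit message's "rename pubmed bert everywhere"). A quick sketch for confirming the new ID resolves, using standard transformers calls (not part of this commit):

from transformers import AutoModel, AutoTokenizer

name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(name)  # downloads/caches the renamed checkpoint
model = AutoModel.from_pretrained(name)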
2 changes: 1 addition & 1 deletion training_config/cdr.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
2 changes: 1 addition & 1 deletion training_config/cdr_hints.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
2 changes: 1 addition & 1 deletion training_config/dgm.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
2 changes: 1 addition & 1 deletion training_config/dgm_hints.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
2 changes: 1 addition & 1 deletion training_config/gda.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
2 changes: 1 addition & 1 deletion training_config/gda_hints.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
