Fix broken streamlit (#392)
* fix: rename pubmed bert everywhere

* build: pin python to <3.10

* fix: use custom load archive to fix model naming issue

* build: try bumping poetry build action

* build: allow macos failures in ci
JohnGiorgi authored May 27, 2024
1 parent a77bdd4 commit 3f15c67
Showing 11 changed files with 3,531 additions and 3,407 deletions.
11 changes: 8 additions & 3 deletions .github/workflows/ci.yml
@@ -11,12 +11,17 @@ on:

jobs:
  build:

    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest]
        python-version: [3.8, 3.9]
        include:
          - os: macos-latest
            allow-failure: true
          - os: ubuntu-latest
            allow-failure: false

    steps:
      - uses: actions/checkout@v3
@@ -25,7 +30,7 @@ jobs:
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Poetry
-        uses: abatilo/actions-poetry@v2.1.5
+        uses: abatilo/actions-poetry@v3.0.0
      - name: Install dependencies with Poetry
        run: |
          poetry install
@@ -52,4 +57,4 @@ jobs:
          # very reliable but we don't want to report a failure
          # in the github UI just because the coverage report failed to
          # be published.
          fail_ci_if_error: false
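
The new allow-failure matrix variable is not a built-in Actions key: it only takes effect where the workflow consumes it, presumably via continue-on-error on the job or on the failure-prone steps (the consuming line falls outside the visible hunks). A minimal sketch of the assumed wiring, with a hypothetical test step:

      - name: Run tests
        run: poetry run pytest
        continue-on-error: ${{ matrix.allow-failure }}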
6,800 changes: 3,406 additions & 3,394 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -36,7 +36,7 @@ classifiers = [
exclude = ["tests", "test_fixtures", "training_config"]

[tool.poetry.dependencies]
-python = "^3.8"
+python = "^3.8,<3.10"
typer = { extras = ["all"], version = "^0.4.0" }
validators = "^0.20.0"
more-itertools = "^8.10.0"
@@ -58,6 +58,7 @@ codecov = "^2.1.12"
# Required to run the demo. Streamlit installs all dev dependencies, so we stick them here.
streamlit = "^1.12.0"
pyvis = "^0.2.1"
altair = "^4.0.0"

# This configuration is adapted from: https://github.com/allenai/allennlp/blob/main/pyproject.toml
[tool.black]
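Poetry's caret constraint alone would allow any Python below 4.0; the added upper bound is what actually excludes 3.10. Concretely, "^3.8,<3.10" resolves to the PEP 440 range >=3.8,<3.10, which can be sanity-checked with the packaging library (an illustration, not part of this commit):

from packaging.specifiers import SpecifierSet

spec = SpecifierSet(">=3.8,<3.10")  # what Poetry's "^3.8,<3.10" resolves to
print("3.9.18" in spec)   # True: 3.9.x still satisfies the pin
print("3.10.0" in spec)   # False: 3.10+ is excluded, per the commit message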
110 changes: 108 additions & 2 deletions seq2rel/seq2rel.py
@@ -1,16 +1,24 @@
import logging
import os
+import shutil
+from os import PathLike
from pathlib import Path
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

import torch
from allennlp.common import util as common_util
+from allennlp.common.file_utils import cached_path
-from allennlp.models.archival import load_archive
+from allennlp.common.meta import Meta
+from allennlp.common.params import Params
+from allennlp.models import archival
from allennlp.predictors import Predictor
from more_itertools import chunked
from validators.url import url

from seq2rel.common.util import sanitize_text

logger = logging.getLogger(__name__)

PRETRAINED_MODELS = {
    "cdr": "https://github.com/JohnGiorgi/seq2rel/releases/download/pretrained-models/cdr.tar.gz",
    "cdr_hints": "https://github.com/JohnGiorgi/seq2rel/releases/download/pretrained-models/cdr_hints.tar.gz",
@@ -21,6 +29,103 @@
"docred": "https://github.com/JohnGiorgi/seq2rel/releases/download/pretrained-models/docred.tar.gz",
}

# Needed strictly to rename
# microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext to
# microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext;
# otherwise identical to allennlp.models.archival.load_archive.
def load_archive(
    archive_file: Union[str, PathLike],
    cuda_device: int = -1,
    overrides: Union[str, Dict[str, Any]] = "",
    weights_file: Optional[str] = None,
) -> archival.Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    # Parameters

    archive_file : `Union[str, PathLike]`
        The archive file to load the model from.
    cuda_device : `int`, optional (default = `-1`)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides : `Union[str, Dict[str, Any]]`, optional (default = `""`)
        JSON overrides to apply to the unarchived `Params` object.
    weights_file : `str`, optional (default = `None`)
        The weights file to use. If unspecified, weights.th in the archive_file will be used.
    """
    # Redirect to the cache, if necessary.
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}")

    meta: Optional[Meta] = None

    tempdir = None
    try:
        if os.path.isdir(resolved_archive_file):
            serialization_dir = resolved_archive_file
        else:
            with archival.extracted_archive(resolved_archive_file, cleanup=False) as tempdir:
                serialization_dir = tempdir

        if weights_file:
            weights_path = weights_file
        else:
            weights_path = archival.get_weights_path(serialization_dir)

        # Load config.
        config = Params.from_file(os.path.join(serialization_dir, archival.CONFIG_NAME), overrides)

        # Rename microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext to
        # microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext.
        def rename_pretrained_model(config: Dict[str, Any]) -> None:
            # Walk the (nested) config dict, replacing the old model name in
            # place. Note that values nested inside lists are not traversed.
            for key, value in config.items():
                if isinstance(value, dict):
                    rename_pretrained_model(value)
                elif value == "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext":
                    config[key] = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"

        config = config.as_dict()
        rename_pretrained_model(config)
        config = Params(config)
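        # Editorial illustration (not part of this commit): after the walk, a
        # config such as
        #     {"model": {"source_embedder": {"model_name":
        #         "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"}}}
        # carries the BiomedBERT name instead, at any nesting depth; the exact
        # key path here is hypothetical.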

        # Instantiate model and dataset readers. Use a duplicate of the config, as it will get consumed.
        dataset_reader, validation_dataset_reader = archival._load_dataset_readers(
            config.duplicate(), serialization_dir
        )
        model = archival._load_model(
            config.duplicate(), weights_path, serialization_dir, cuda_device
        )

        # Load meta.
        meta_path = os.path.join(serialization_dir, archival.META_NAME)
        if os.path.exists(meta_path):
            meta = Meta.from_path(meta_path)
    finally:
        if tempdir is not None:
            logger.info(f"removing temporary unarchived model dir at {tempdir}")
            shutil.rmtree(tempdir, ignore_errors=True)

    # Check version compatibility.
    if meta is not None:
        archival._check_version_compatibility(archive_file, meta)

    return archival.Archive(
        model=model,
        config=config,
        dataset_reader=dataset_reader,
        validation_dataset_reader=validation_dataset_reader,
        meta=meta,
    )
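
# Editorial usage sketch (not part of this commit): the patched loader is a
# drop-in replacement for AllenNLP's, so an archive whose config still
# references the old PubMedBERT name loads cleanly:
#
#     archive = load_archive(PRETRAINED_MODELS["cdr"], cuda_device=-1)
#     predictor = Predictor.from_archive(archive)
#
# Predictor.from_archive is the standard AllenNLP entry point; the specific
# predictor registered for seq2rel is not shown in this diff.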


class Seq2Rel:
    """A simple interface to the model for the purposes of extracting entities and relations from text.
@@ -82,6 +187,7 @@ def __init__(self, pretrained_model_name_or_path: str, **kwargs: Any) -> None:
        overrides = {
            "model.source_embedder.token_embedders.tokens.load_weights": False,
        }

        # Allow user to update these with kwargs.
        if "overrides" in kwargs:
            overrides.update(kwargs.pop("overrides"))
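The hunk above hard-codes one default override, load_weights: False, presumably to skip fetching the transformer's original pretrained weights, since the archive's fine-tuned weights are loaded over them anyway; any user-supplied overrides are then merged on top via kwargs. Downstream usage is unchanged by this commit; a sketch, assuming the callable interface from the project README and a hypothetical override key:

from seq2rel import Seq2Rel

# "cdr" resolves to the pretrained-model URL in PRETRAINED_MODELS above.
seq2rel = Seq2Rel("cdr", overrides={"dataset_reader.max_length": 512})  # hypothetical key
print(seq2rel("Famotidine-associated delirium."))  # illustrative input text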
2 changes: 1 addition & 1 deletion test_fixtures/experiment.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 16; // Max length of input text
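The same one-line rename is applied to every training config below, because the checkpoint was renamed on the Hugging Face Hub from PubMedBERT to BiomedBERT (the commit message's "rename pubmed bert everywhere"). A quick sketch for confirming the new ID resolves, using standard transformers calls (not part of this commit):

from transformers import AutoModel, AutoTokenizer

name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(name)  # downloads/caches the renamed checkpoint
model = AutoModel.from_pretrained(name)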
2 changes: 1 addition & 1 deletion training_config/cdr.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
2 changes: 1 addition & 1 deletion training_config/cdr_hints.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
2 changes: 1 addition & 1 deletion training_config/dgm.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
2 changes: 1 addition & 1 deletion training_config/dgm_hints.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
2 changes: 1 addition & 1 deletion training_config/gda.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
2 changes: 1 addition & 1 deletion training_config/gda_hints.jsonnet
@@ -3,7 +3,7 @@
// The pretrained model to use as encoder. This is a reasonable default for biomedical text.
// Should be a registered name in the Transformers library (see https://huggingface.co/models)
// OR a path on disk to a serialized transformer model.
-local model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext";
+local model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext";

// These are reasonable defaults.
local max_length = 512; // Max length of input text
