Add ONNX support for Embeddings and Pipelines, closes #109
davidmezzetti committed Aug 31, 2021
1 parent f2512f5 commit 2ed7c7f
Showing 13 changed files with 250 additions and 72 deletions.
26 changes: 8 additions & 18 deletions docs/pipelines/onnx.md
@@ -1,33 +1,23 @@
 # HFOnnx
 
-Exports a Hugging Face Transformer model to ONNX.
+Exports a Hugging Face Transformer model to ONNX. Currently, this works best with classification/pooling/qa models. Work is ongoing for sequence to
+sequence models (summarization, transcription, translation).
 
 Example on how to use the pipeline below.
 
 ```python
-from onnxruntime import InferenceSession, SessionOptions
-from transformers import AutoTokenizer
-
-from txtai.pipeline import HFOnnx
+from txtai.pipeline import HFOnnx, Labels
 
-# Normalize logits using sigmoid function
-sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
+# Model path
+path = "distilbert-base-uncased-finetuned-sst-2-english"
 
 # Export model to ONNX
 onnx = HFOnnx()
-model = onnx("distilbert-base-uncased-finetuned-sst-2-english", "sequence-classification", "model.onnx", True)
-
-# Build ONNX session
-options = SessionOptions()
-session = InferenceSession(model, options)
-
-# Tokenize
-tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
-tokens = tokenizer(["I am happy"], return_tensors="np")
+model = onnx(path, "text-classification", "model.onnx", True)
 
 # Run inference and validate
-outputs = session.run(None, dict(tokens))
-outputs = sigmoid(outputs[0])
+labels = Labels((model, path), dynamic=False)
+labels("I am happy")
 ```

::: txtai.pipeline.HFOnnx.__init__
5 changes: 3 additions & 2 deletions setup.py
@@ -19,6 +19,8 @@
     "uvicorn>=0.12.1",
 ]
 
+extras["model"] = ["onnxruntime>=1.8.1"]
+
 extras["pipeline"] = [
     "fasttext>=0.9.2",
     "nltk>=3.5",
@@ -33,15 +35,14 @@
     "annoy>=1.16.3",
     "fasttext>=0.9.2",
     "hnswlib>=0.5.0",
-    "onnxruntime>=1.8.1",
     "pymagnitude-lite>=0.1.43",
     "scikit-learn>=0.23.1",
     "sentence-transformers>=2.0.0",
 ]
 
 extras["workflow"] = ["apache-libcloud>=3.3.1", "pillow>=7.2.0", "requests>=2.24.0"]
 
-extras["all"] = extras["api"] + extras["pipeline"] + extras["similarity"] + extras["workflow"]
+extras["all"] = extras["api"] + extras["model"] + extras["pipeline"] + extras["similarity"] + extras["workflow"]

setup(
name="txtai",
1 change: 1 addition & 0 deletions src/python/txtai/models/__init__.py
@@ -3,4 +3,5 @@
 """
 
 from .models import Models
+from .onnx import OnnxModel
 from .pooling import MeanPooling, Pooling
39 changes: 39 additions & 0 deletions src/python/txtai/models/models.py
@@ -2,8 +2,14 @@
 Models module
 """
 
+import os
+
 import torch
 
+from transformers import AutoModel, AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
+
+from .onnx import OnnxModel
+
 
 class Models:
     """
@@ -81,3 +87,36 @@ def reference(deviceid):
         """
 
         return "cpu" if deviceid < 0 else "cuda:{}".format(deviceid)
+
+    @staticmethod
+    def load(path, task="default"):
+        """
+        Loads a machine learning model. Handles multiple model frameworks (ONNX, Transformers).
+
+        Args:
+            path: path to model
+            task: task name used to lookup model configuration
+
+        Returns:
+            machine learning model
+        """
+
+        # Detect ONNX models
+        if isinstance(path, bytes) or (isinstance(path, str) and os.path.isfile(path)):
+            return OnnxModel(path)
+
+        # Return path, if path isn't a string
+        if not isinstance(path, str):
+            return path
+
+        # Transformer models
+        config = {
+            "default": AutoModel.from_pretrained,
+            "question-answering": AutoModelForQuestionAnswering.from_pretrained,
+            "summarization": AutoModelForSeq2SeqLM.from_pretrained,
+            "text-classification": AutoModelForSequenceClassification.from_pretrained,
+            "zero-shot-classification": AutoModelForSequenceClassification.from_pretrained,
+        }
+
+        # Load model for supported tasks. Return path for unsupported tasks.
+        return config[task](path) if task in config else path
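
A quick sketch of how this dispatch behaves (file and model names hypothetical; assumes the new onnxruntime "model" extra is installed):

```python
from txtai.models import Models

# Bytes or an existing local file are treated as ONNX models
model = Models.load("model.onnx")

# Hub ids dispatch on the task name via the config table above
classifier = Models.load("distilbert-base-uncased-finetuned-sst-2-english", "text-classification")

# Unsupported task names fall through and return the path unchanged
path = Models.load("t5-small", "translation")
```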
104 changes: 104 additions & 0 deletions src/python/txtai/models/onnx.py
@@ -0,0 +1,104 @@
"""
ONNX module
"""

# Conditional import
try:
from onnxruntime import InferenceSession, SessionOptions

ONNX_RUNTIME = True
except ImportError:
ONNX_RUNTIME = False

import numpy as np
import torch

from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.modeling_auto import (
MODEL_MAPPING,
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
)
from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING
from transformers.modeling_utils import PreTrainedModel

# pylint: disable=W0223
class OnnxModel(PreTrainedModel):
"""
Provides a Transformers/PyTorch compatible interface for ONNX models. Handles casting inputs
and outputs with minimal to no copying of data.
"""

def __init__(self, model):
"""
Creates a new OnnxModel.
Args:
model: path to model or InferenceSession
"""

if not ONNX_RUNTIME:
raise ImportError('onnxruntime is not available - install "model" extra to enable')

super().__init__(PretrainedConfig())

# Create ONNX session
self.model = InferenceSession(model, SessionOptions())

# Add references for this class to supported AutoModel classes
name = self.__class__.__name__
if name not in MODEL_MAPPING:
MODEL_MAPPING[name] = self.__class__
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING[name] = self.__class__
MODEL_FOR_QUESTION_ANSWERING_MAPPING[name] = self.__class__

# Add references for this class to support pipeline AutoTokenizers
if type(self.config) not in TOKENIZER_MAPPING:
TOKENIZER_MAPPING[type(self.config)] = None

def forward(self, **inputs):
"""
Runs inputs through an ONNX model and returns outputs. This method handles casting inputs
and outputs between torch tensors and numpy arrays as shared memory (no copy).
Args:
inputs: model inputs
Returns:
model outputs
"""

inputs = self.parse(inputs)

# Run inputs through ONNX model
results = self.model.run(None, inputs)

# pylint: disable=E1101
return torch.from_numpy(np.array(results))

def parse(self, inputs):
"""
Parse model inputs and handle converting to ONNX compatible inputs.
Args:
inputs: model inputs
Returns:
ONNX compatible model inputs
"""

features = {}

# Select features from inputs
for key in ["input_ids", "attention_mask", "token_type_ids"]:
if key in inputs:
value = inputs[key]

# Cast torch tensors to numpy
if hasattr(value, "cpu"):
value = value.cpu().numpy()

# Cast to numpy array if not already one
features[key] = np.asarray(value)

return features
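
A rough usage sketch of the class above (file name hypothetical, assuming a model exported by HFOnnx and its matching tokenizer); it drops in where a PreTrainedModel is expected:

```python
from transformers import AutoTokenizer
from txtai.models import OnnxModel

# Wrap an exported ONNX file in a Transformers-compatible model
model = OnnxModel("model.onnx")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

# parse() casts torch tensors to numpy for onnxruntime, forward() wraps results back in a tensor
tokens = tokenizer(["I am happy"], return_tensors="pt")
outputs = model(**tokens)
```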
11 changes: 6 additions & 5 deletions src/python/txtai/models/pooling.py
@@ -7,7 +7,7 @@
 
 from torch import nn
 
-from transformers import AutoModel, AutoTokenizer
+from transformers import AutoTokenizer
 
 from .models import Models
 
@@ -17,21 +17,22 @@ class Pooling(nn.Module):
     Builds pooled vectors using outputs from a transformers model.
     """
 
-    def __init__(self, path, device, batch=32, maxlength=None):
+    def __init__(self, path, device, tokenizer=None, batch=32, maxlength=None):
         """
         Creates a new Pooling model.
 
         Args:
-            path: path to transformers model
+            path: path to model, accepts Hugging Face model hub id or local path
             device: tensor device id
+            tokenizer: optional path to tokenizer
             batch: batch size
             maxlength: max sequence length
         """
 
         super().__init__()
 
-        self.model = AutoModel.from_pretrained(path)
-        self.tokenizer = AutoTokenizer.from_pretrained(path)
+        self.model = Models.load(path)
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer if tokenizer else path)
        self.device = Models.device(device)
 
         # Detect unbounded tokenizer typically found in older models
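
Since an exported ONNX file carries no tokenizer files, the new tokenizer parameter lets the vocabulary load from a different location than the weights; a minimal sketch (paths hypothetical, assuming -1 selects the CPU device as elsewhere in Models):

```python
from txtai.models import Pooling

# Weights from a local ONNX export, tokenizer from the original hub id
pooling = Pooling("embeddings.onnx", device=-1, tokenizer="sentence-transformers/bert-base-nli-mean-tokens")
```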
15 changes: 8 additions & 7 deletions src/python/txtai/pipeline/hfonnx.py
@@ -18,8 +18,7 @@
 
 from torch.onnx import export
 
-from transformers import AutoModel, AutoModelForCausalLM, AutoModelForMultipleChoice, AutoModelForQuestionAnswering
-from transformers import AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoTokenizer
+from transformers import AutoModel, AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoTokenizer
 
 from ..models import MeanPooling
 from .tensors import Tensors
 
@@ -40,6 +39,9 @@ def __call__(self, path, task="default", output=None, quantize=False, opset=12):
             output: optional output model path, defaults to return byte array if None
             quantize: if model should be quantized (requires onnx to be installed), defaults to False
             opset: onnx opset, defaults to 12
+
+        Returns:
+            path to model output or model as bytes depending on output parameter
         """
 
         inputs, outputs, model = self.parameters(task)
@@ -141,12 +143,7 @@ def parameters(self, task):
 
         config = {
             "default": (OrderedDict({"last_hidden_state": {0: "batch", 1: "sequence"}}), AutoModel.from_pretrained),
-            "causal-lm": (OrderedDict({"logits": {0: "batch", 1: "sequence"}}), AutoModelForCausalLM.from_pretrained),
             "pooling": (OrderedDict({"embeddings": {0: "batch", 1: "sequence"}}), lambda x: MeanPoolingOnnx(x, -1)),
-            "seq2seq-lm": (OrderedDict({"logits": {0: "batch", 1: "decoder_sequence"}}), AutoModelForSeq2SeqLM.from_pretrained),
-            "sequence-classification": (OrderedDict({"logits": {0: "batch"}}), AutoModelForSequenceClassification.from_pretrained),
-            "token-classification": (OrderedDict({"logits": {0: "batch", 1: "sequence"}}), AutoModelForTokenClassification.from_pretrained),
-            "multiple-choice": (OrderedDict({"logits": {0: "batch"}}), AutoModelForMultipleChoice.from_pretrained),
             "question-answering": (
                 OrderedDict(
                     {
                         ...
                     }
                 ),
                 AutoModelForQuestionAnswering.from_pretrained,
             ),
+            "text-classification": (OrderedDict({"logits": {0: "batch"}}), AutoModelForSequenceClassification.from_pretrained),
         }
 
+        # Aliases
+        config["zero-shot-classification"] = config["text-classification"]
+
         return (inputs,) + config[task]
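
With the task table trimmed to the supported set, the remaining names cover embedding, classification and QA exports; a sketch of a pooling export (model id and file name hypothetical):

```python
from txtai.pipeline import HFOnnx

onnx = HFOnnx()

# "pooling" wraps the base model with MeanPoolingOnnx so the ONNX graph outputs embeddings directly
model = onnx("sentence-transformers/bert-base-nli-mean-tokens", "pooling", "embeddings.onnx", quantize=True)
```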
2 changes: 1 addition & 1 deletion src/python/txtai/pipeline/hfpipeline.py
@@ -36,7 +36,7 @@ def __init__(self, task, path=None, quantize=False, gpu=False, model=None):
 
         # Transformer pipeline task
         if isinstance(path, tuple):
-            self.pipeline = pipeline(task, model=path[0], tokenizer=path[1], device=deviceid)
+            self.pipeline = pipeline(task, model=Models.load(path[0], task), tokenizer=path[1], device=deviceid)
         else:
             self.pipeline = pipeline(task, model=path, tokenizer=path, device=deviceid)
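
Because txtai's Hugging Face pipelines all route through HFPipeline, this one-line change gives each of them ONNX support via the (model, tokenizer) tuple form; a sketch with extractive QA (model id hypothetical, assuming the Questions pipeline):

```python
from txtai.pipeline import HFOnnx, Questions

path = "distilbert-base-cased-distilled-squad"

# Export QA model, keeping it in memory as bytes (no output path)
model = HFOnnx()(path, "question-answering")

# Model bytes route through Models.load, tokenizer loads from the hub id
questions = Questions((model, path))
questions(["What color is the sky"], ["The sky is blue"])
```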
4 changes: 1 addition & 3 deletions src/python/txtai/vectors/factory.py
@@ -2,8 +2,6 @@
 Factory module
 """
 
-import os
-
 from .transformers import TransformersVectors
 from .words import WordVectors, WORDS
 
@@ -58,6 +56,6 @@ def method(config):
 
         # Infer method from path, if blank
         if not method and path:
-            method = "words" if os.path.isfile(path) else "transformers"
+            method = "words" if WordVectors.isDatabase(path) else "transformers"
 
         return method
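
This matters because a local file path can now be an ONNX model: the old os.path.isfile check would have routed any local file to word vectors. A sketch of the new routing (file names hypothetical, assuming this module's VectorsFactory class):

```python
from txtai.vectors.factory import VectorsFactory

# SQLite-backed word vectors file resolves to "words"
VectorsFactory.method({"path": "vectors.magnitude"})

# A local ONNX file is not SQLite, so it resolves to "transformers"
VectorsFactory.method({"path": "embeddings.onnx"})
```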
8 changes: 6 additions & 2 deletions src/python/txtai/vectors/transformers.py
@@ -2,6 +2,7 @@
 Transformers module
 """
 
+import os
 import pickle
 import tempfile
 
@@ -14,7 +15,7 @@
     SENTENCE_TRANSFORMERS = False
 
 from .base import Vectors
-from ..models import MeanPooling, Models
+from ..models import MeanPooling, Models, Pooling
 from ..pipeline.tokenizer import Tokenizer
 
@@ -32,7 +33,10 @@ def load(self, path):
 
         # Build embeddings with transformers (default)
         if transformers:
-            return MeanPooling(path, device=deviceid)
+            if isinstance(path, bytes) or (isinstance(path, str) and os.path.isfile(path)):
+                return Pooling(path, device=deviceid, tokenizer=self.config.get("tokenizer"))
+
+            return MeanPooling(path, device=deviceid, tokenizer=self.config.get("tokenizer"))
 
         if not SENTENCE_TRANSFORMERS:
             raise ImportError('sentence-transformers is not available - install "similarity" extra to enable')
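
End to end, this lets an Embeddings index run on an exported ONNX encoder, with the tokenizer supplied through config; a sketch (model id and file name hypothetical):

```python
from txtai.embeddings import Embeddings
from txtai.pipeline import HFOnnx

# Export a pooling model to a local ONNX file
onnx = HFOnnx()
onnx("sentence-transformers/bert-base-nli-mean-tokens", "pooling", "embeddings.onnx", quantize=True)

# A local ONNX path routes to Pooling; tokenizer comes from config
embeddings = Embeddings({"path": "embeddings.onnx", "tokenizer": "sentence-transformers/bert-base-nli-mean-tokens"})
print(embeddings.similarity("feel good story", ["Maine man wins $1M from $25 lottery ticket"]))
```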
22 changes: 22 additions & 0 deletions src/python/txtai/vectors/words.py
@@ -153,6 +153,28 @@ def lookup(self, tokens):
 
         return self.model.query(tokens)
 
+    @staticmethod
+    def isDatabase(path):
+        """
+        Checks if this is a SQLite database file which is the file format used for word vectors databases.
+
+        Args:
+            path: path to check
+
+        Returns:
+            True if this is a SQLite database
+        """
+
+        if isinstance(path, str) and os.path.isfile(path) and os.path.getsize(path) >= 100:
+            # Read 100 byte SQLite header
+            with open(path, "rb") as f:
+                header = f.read(100)
+
+            # Check for SQLite header
+            return header.startswith(b"SQLite format 3\000")
+
+        return False
+
     @staticmethod
     def build(data, size, mincount, path):
         """
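
The check sniffs the 16-byte magic string that opens SQLite's 100-byte file header, the on-disk format pymagnitude uses; a quick sketch (file names hypothetical):

```python
from txtai.vectors.words import WordVectors

WordVectors.isDatabase("vectors.magnitude")  # True for a pymagnitude SQLite file
WordVectors.isDatabase("embeddings.onnx")    # False, ONNX files are not SQLite
WordVectors.isDatabase(b"model bytes")       # False, only string paths are checked
```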