Commit

Release 0.3.0 (#49)
* add half precision and transformations logic

* fix bugs

* add support to gpu

* add support to gpu

* fix minor bug in gpu code

* fix minor bug in gpu code

* refactor quantization

* add dataset interface

* fix bug

* fix bugs with dataset api

* fix bug

* update test

* solve minor issues

* fix error in cuda

* fix error with tensorRT

* fix bug in tvm and change name of quantization_ths

* add resources

* Modify readme (#48)

* Create section - integration with other libraries

* update readme (work-in-progress)

* Minor Readme update

* rename notebook

* update notebook

* rename notebook

* Rename notebook

* Rename notebook

* Rename notebook

* Rename notebook

* Rename notebook

* update version to 0.3.0

* Update readme with latest release information

* Benchmarks

* Update readme

* Update readme with benchmarks

* Update readme, minor changes

* solve api issue with tf

* fix typos in benchmarks

Co-authored-by: morgoth95 <[email protected]>
Co-authored-by: Emile Courthoud <[email protected]>
Co-authored-by: Nebuly <[email protected]>
4 people authored May 10, 2022
1 parent 8de0f7f commit b2a6a5a
Showing 43 changed files with 3,842 additions and 786 deletions.
448 changes: 280 additions & 168 deletions README.md

Large diffs are not rendered by default.

147 changes: 126 additions & 21 deletions nebullvm/api/frontend/huggingface.py
@@ -1,11 +1,23 @@
from collections import OrderedDict
from tempfile import TemporaryDirectory
from typing import Tuple, Union, List, Iterable, Dict, Any, Type
from typing import (
Tuple,
Union,
List,
Iterable,
Dict,
Any,
Type,
Callable,
Optional,
Sequence,
)

import numpy as np
import torch

from nebullvm import optimize_torch_model
from nebullvm.api.frontend.utils import ifnone, QUANTIZATION_METRIC_MAP
from nebullvm.base import DataType, ModelCompiler
from nebullvm.inference_learners.base import (
PytorchBaseInferenceLearner,
@@ -161,7 +173,7 @@ def _save_wrapper_extra_info(self):
def _load_wrapper_extra_info(builder_inputs: Dict) -> Dict:
return builder_inputs

def predict(self, *args, **kwargs) -> Any:
def run(self, *args, **kwargs) -> Any:
"""Run the underlying optimized model for getting a prediction.
The method has a hybrid interface. It accepts inputs either as
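
An illustrative sketch of the renamed entry point (not part of the diff): it assumes an optimized learner named optimized_model, as returned by optimize_huggingface_model, together with its tokenizer, and that run accepts the tokenizer's encoded tensors as keyword arguments.

# Illustrative only: `optimized_model` and `tokenizer` are assumed to exist.
encoded = tokenizer("An example sentence.", return_tensors="pt")
# `run` replaces the old `predict` entry point.
outputs = optimized_model.run(**encoded)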
@@ -290,18 +302,60 @@ def _get_extra_optimizer(
return [HuggingFaceOptimizer(hugging_face_params={})]


class _HFDataset(Sequence):
def __init__(
self,
input_texts: List,
ys: Optional[List],
keywords: List[str],
batch_size: int,
tokenizer: PreTrainedTokenizer,
tokenizer_args: Dict,
):
self._input_texts = input_texts
self._ys = ys
self._bs = batch_size
self._keys = keywords
self._tokenizer = tokenizer
if self._tokenizer.pad_token is None:
self._tokenizer.pad_token = self._tokenizer.eos_token
_tokenizer_args = {"truncation": True, "padding": True}
_tokenizer_args.update(tokenizer_args)
self._tokenizer_args = _tokenizer_args

def __getitem__(self, item: int):
pointer = self._bs * item
if pointer >= len(self):
raise IndexError
mini_batch = self._input_texts[
pointer : pointer + self._bs # noqa E203
]
if self._ys is not None:
mini_batch_y = self._ys[pointer : pointer + self._bs] # noqa E203
else:
mini_batch_y = None
encoded_inputs = self._tokenizer(mini_batch, **self._tokenizer_args)
return tuple(encoded_inputs[key] for key in self._keys), mini_batch_y

def __len__(self):
return len(self._input_texts)


def optimize_huggingface_model(
model: PreTrainedModel,
tokenizer: PreTrainedTokenizer,
target_text: str,
input_texts: List[str],
batch_size: int,
max_input_sizes: List[Tuple[int, ...]],
save_dir: str,
extra_input_info: List[Dict] = None,
use_static_shape: bool = False,
use_torch_api: bool = False,
tokenizer_args: Dict = None,
quantization_ths: float = None,
ignore_compilers: List[str] = None,
perf_loss_ths: float = None,
perf_metric: Union[str, Callable] = None,
ys: List = None,
):
"""Optimize the HuggingFace model.
@@ -316,7 +370,12 @@ def optimize_huggingface_model(
model (PreTrainedModel): HuggingFace transformers model.
tokenizer (PreTrainedTokenizer): Tokenizer used for building model's
inputs.
target_text (str): Example of text to be given as model input.
input_texts (List[str]): Texts either from the training set or similar
to the ones the model will receive at inference time. If
`perf_loss_ths` is passed, the input texts will be used for computing
the drop in precision and for setting the quantization parameters. If
you selected a quantization metric needing the input labels, you need
to provide them for each input in the `ys` argument.
batch_size (int): Batch size needed for the model.
max_input_sizes (List[Tuple[int]]): List containing the maximum size of
all the input tensors of the model.
@@ -339,19 +398,51 @@ def optimize_huggingface_model(
succeeds. In case of failure of the torch API, a second attempt
will be made with the ONNX interface.
tokenizer_args (Dict, optional): Extra args needed for the tokenizer.
quantization_ths (float, optional): Tolerated relative error for
performing quantization before compiling the model. If no value
is given, no quantization will be performed.
ignore_compilers (List[str], optional): List of DL compilers we want
to ignore while running the optimization. Compiler name should be
one between "tvm", "tensor RT", "openvino" and "onnxruntime".
perf_loss_ths (float, optional): Tolerated relative error for
performing approximation techniques before compiling the model.
If no value is given, no optimization will be performed. Note that
it will not be used for compilers using the torch API when
`use_torch_api` is `True`; in that case only dynamic quantization
will be performed, since no data is given as input.
perf_metric (Union[Callable, str], optional): The metric to
be used for accepting or refusing a precision-reduction
optimization proposal. If none is given but a `perf_loss_ths` is
received, the `nebullvm.measure.compute_relative_difference`
metric will be used as the default one. A user-defined metric can
be passed as a function accepting as inputs two tuples of tensors
(produced by the baseline and the quantized model) and the related
original labels.
For more information see
`nebullvm.measure.compute_relative_difference` and
`nebullvm.measure.compute_accuracy_drop`. `perf_metric` also
accepts a string containing the metric name. At the current
stage the supported metrics are `"precision"` and `"accuracy"`.
ys: List of target labels. For each input in `input_texts` there should
be a corresponding label. Note that this feature is only used for
estimating the accuracy drop while running precision-reduction
techniques. It will be ignored if these techniques are not
activated.
"""
if perf_loss_ths is not None and ys is None and perf_metric == "accuracy":
raise ValueError(
"You cannot select the accuracy as quantization metric without "
"providing valid labels!"
)
if isinstance(perf_metric, str):
perf_metric = QUANTIZATION_METRIC_MAP.get(perf_metric)
tokenizer_args = tokenizer_args or {}
tokenizer_args.update({"return_tensors": "pt"})
output_structure, output_type = _get_output_structure(
text=target_text,
text=input_texts[0],
model=model,
tokenizer=tokenizer,
tokenizer_args=tokenizer_args,
)
input_example = tokenizer(target_text, **tokenizer_args)
input_example = tokenizer(input_texts[0], **tokenizer_args)
input_types = [_extract_input_type(v) for v in input_example.values()] or [
"int"
] * len(input_example)
@@ -370,23 +461,37 @@ def optimize_huggingface_model(
extra_input_info=extra_input_info,
use_torch_api=use_torch_api,
dynamic_axis=_get_dynamic_axis(
text=target_text,
text=input_texts[0],
tokenizer=tokenizer,
model=model,
tokenizer_args=tokenizer_args,
)
if not use_static_shape
else None,
quantization_ths=quantization_ths,
ignore_compilers=[ModelCompiler.TENSOR_RT.value]
if use_static_shape
else [
ModelCompiler.TENSOR_RT.value,
ModelCompiler.APACHE_TVM.value,
],
custom_optimizers=_get_extra_optimizer(model.config)
if quantization_ths is None
else None,
perf_loss_ths=perf_loss_ths,
perf_metric=perf_metric,
dataloader=_HFDataset(
input_texts,
ys,
list(wrapper_model.inputs_types.keys()),
batch_size,
tokenizer,
tokenizer_args,
),
ignore_compilers=list(
set(
(
[ModelCompiler.TENSOR_RT.value]
if use_static_shape
else [
ModelCompiler.TENSOR_RT.value,
ModelCompiler.APACHE_TVM.value,
]
)
+ ifnone(ignore_compilers, [])
)
),
custom_optimizers=_get_extra_optimizer(model.config),
)
final_model = HuggingFaceInferenceLearner(
core_inference_learner=optimized_model,
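A minimal sketch of how the _HFDataset wrapper added in this diff batches and tokenizes its inputs. The tokenizer checkpoint and texts are placeholders, and the class is a private helper of this module, so the snippet is purely illustrative.

from transformers import AutoTokenizer
from nebullvm.api.frontend.huggingface import _HFDataset  # private helper, imported only for illustration

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
dataset = _HFDataset(
    input_texts=["first example sentence", "second example sentence"],
    ys=None,                                   # no labels in this sketch
    keywords=["input_ids", "attention_mask"],  # keys read from the tokenizer output
    batch_size=2,
    tokenizer=tokenizer,
    tokenizer_args={"return_tensors": "pt"},
)
inputs, labels = dataset[0]  # tuple of tensors (one per keyword) and the labels (None here)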
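
The perf_metric argument documented in the docstring above also accepts a user-defined callable. A hedged sketch of such a metric, assuming it receives the baseline outputs, the precision-reduced outputs, and the original labels, and returns a scalar error that is compared against perf_loss_ths:

import torch

def relative_output_error(baseline_outputs, reduced_outputs, ys=None):
    # Hypothetical user-defined metric: mean relative difference between the
    # tensors produced by the baseline model and by the precision-reduced one.
    diffs = [
        torch.mean(torch.abs(b - r) / (torch.abs(b) + 1e-8))
        for b, r in zip(baseline_outputs, reduced_outputs)
    ]
    return float(torch.stack(diffs).mean())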

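Putting the new 0.3.0 signature together, a hedged end-to-end sketch of a call to optimize_huggingface_model; the checkpoint name, input sizes, save directory and threshold are illustrative assumptions rather than values taken from this commit.

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from nebullvm.api.frontend.huggingface import optimize_huggingface_model

model_name = "distilbert-base-uncased-finetuned-sst-2-english"  # placeholder checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

input_texts = ["a great movie", "a terrible movie"]  # toy data
ys = [1, 0]  # one label per input text, required by the "accuracy" metric

optimized_model = optimize_huggingface_model(
    model=model,
    tokenizer=tokenizer,
    input_texts=input_texts,
    batch_size=1,
    max_input_sizes=[(128,), (128,)],  # assumed maximum sizes for input_ids and attention_mask
    save_dir="./optimized_model",
    use_static_shape=True,
    perf_loss_ths=0.05,       # tolerate up to ~5% loss on the chosen metric
    perf_metric="accuracy",   # resolved through QUANTIZATION_METRIC_MAP
    ys=ys,
)
# The returned learner exposes the run() method shown earlier in this diff.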