Commit

Release 0.3.0 (#49)
* add half precision and transformations logic

* fix bugs

* add support to gpu

* add support to gpu

* fix minor bug in gpu code

* fix minor bug in gpu code

* refactor quantization

* add dataset interface

* fix bug

* fix bugs with dataset api

* fix bug

* update test

* solve minor issues

* fix error in cuda

* fix error with tensorRT

* fix bug in tvm and change name of quantization_ths

* add resources

* Modify readme (#48)

* Create section - integration with other libraries

* update readme (work-in-progress)

* Minor Readme update

* rename notebook

* update notebook

* rename notebook

* Rename notebook

* Rename notebook

* Rename notebook

* Rename notebook

* Rename notebook

* update version to 0.3.0

* Update readme with latest release information

* Benchmarks

* Update readme

* Update readme with benchmarks

* Update readme, minor changes

* solve api issue with tf

* fix typos in benchmarks

Co-authored-by: morgoth95 <[email protected]>
Co-authored-by: Emile Courthoud <[email protected]>
Co-authored-by: Nebuly <[email protected]>
4 people authored May 10, 2022
1 parent 8de0f7f commit b2a6a5a
Showing 43 changed files with 3,842 additions and 786 deletions.
448 changes: 280 additions & 168 deletions README.md

Large diffs are not rendered by default.

147 changes: 126 additions & 21 deletions nebullvm/api/frontend/huggingface.py
@@ -1,11 +1,23 @@
from collections import OrderedDict
from tempfile import TemporaryDirectory
from typing import Tuple, Union, List, Iterable, Dict, Any, Type
from typing import (
Tuple,
Union,
List,
Iterable,
Dict,
Any,
Type,
Callable,
Optional,
Sequence,
)

import numpy as np
import torch

from nebullvm import optimize_torch_model
from nebullvm.api.frontend.utils import ifnone, QUANTIZATION_METRIC_MAP
from nebullvm.base import DataType, ModelCompiler
from nebullvm.inference_learners.base import (
PytorchBaseInferenceLearner,
@@ -161,7 +173,7 @@ def _save_wrapper_extra_info(self):
def _load_wrapper_extra_info(builder_inputs: Dict) -> Dict:
return builder_inputs

def predict(self, *args, **kwargs) -> Any:
def run(self, *args, **kwargs) -> Any:
"""Run the underlying optimized model for getting a prediction.
The method has a hybrid interface. It accepts inputs either as
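
An illustrative sketch of the renamed entry point (not part of the diff): it assumes an optimized learner named optimized_model, as returned by optimize_huggingface_model, together with its tokenizer, and that run accepts the tokenizer's encoded tensors as keyword arguments.

# Illustrative only: `optimized_model` and `tokenizer` are assumed to exist.
encoded = tokenizer("An example sentence.", return_tensors="pt")
# `run` replaces the old `predict` entry point.
outputs = optimized_model.run(**encoded)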
@@ -290,18 +302,60 @@ def _get_extra_optimizer(
return [HuggingFaceOptimizer(hugging_face_params={})]


class _HFDataset(Sequence):
def __init__(
self,
input_texts: List,
ys: Optional[List],
keywords: List[str],
batch_size: int,
tokenizer: PreTrainedTokenizer,
tokenizer_args: Dict,
):
self._input_texts = input_texts
self._ys = ys
self._bs = batch_size
self._keys = keywords
self._tokenizer = tokenizer
if self._tokenizer.pad_token is None:
self._tokenizer.pad_token = self._tokenizer.eos_token
_tokenizer_args = {"truncation": True, "padding": True}
_tokenizer_args.update(tokenizer_args)
self._tokenizer_args = _tokenizer_args

def __getitem__(self, item: int):
pointer = self._bs * item
if pointer >= len(self):
raise IndexError
mini_batch = self._input_texts[
pointer : pointer + self._bs # noqa E203
]
if self._ys is not None:
mini_batch_y = self._ys[pointer : pointer + self._bs] # noqa E203
else:
mini_batch_y = None
encoded_inputs = self._tokenizer(mini_batch, **self._tokenizer_args)
return tuple(encoded_inputs[key] for key in self._keys), mini_batch_y

def __len__(self):
return len(self._input_texts)


def optimize_huggingface_model(
model: PreTrainedModel,
tokenizer: PreTrainedTokenizer,
target_text: str,
input_texts: List[str],
batch_size: int,
max_input_sizes: List[Tuple[int, ...]],
save_dir: str,
extra_input_info: List[Dict] = None,
use_static_shape: bool = False,
use_torch_api: bool = False,
tokenizer_args: Dict = None,
quantization_ths: float = None,
ignore_compilers: List[str] = None,
perf_loss_ths: float = None,
perf_metric: Union[str, Callable] = None,
ys: List = None,
):
"""Optimize the HuggingFace model.
@@ -316,7 +370,12 @@ def optimize_huggingface_model(
model (PreTrainedModel): HuggingFace transformers model.
tokenizer (PreTrainedTokenizer): Tokenizer used for building model's
inputs.
target_text (str): Example of text to be given as model input.
input_texts (List[str]): Texts either from the training set or similar
to the ones the model will receive at inference time. If
`perf_loss_ths` is passed, the input texts will be used for computing
the drop in precision and for setting the quantization parameters. If
you selected a quantization metric needing the input labels, you need
to provide them for each input in the `ys` argument.
batch_size (int): Batch size needed for the model.
max_input_sizes (List[Tuple[int]]): List containing the maximum size of
all the input tensors of the model.
@@ -339,19 +398,51 @@ def optimize_huggingface_model(
succeeds. In case of failure of the torch API, a second attempt
will be made with the ONNX interface.
tokenizer_args (Dict, optional): Extra args needed for the tokenizer.
quantization_ths (float, optional): Tolerated relative error for
performing quantization before compiling the model. If no value
is given, no quantization will be performed.
ignore_compilers (List[str], optional): List of DL compilers we want
to ignore while running the optimization. Compiler name should be
one between "tvm", "tensor RT", "openvino" and "onnxruntime".
perf_loss_ths (float, optional): Tolerated relative error for
performing approximation techniques before compiling the model.
If no value is given, no optimization will be performed. Note that
it will not be used for compilers using the torch API when
`use_torch_api` is `True`; in that case only dynamic quantization
will be performed, since no data is given as input.
perf_metric (Union[Callable, str], optional): The metric to
be used for accepting or refusing a precision-reduction
optimization proposal. If none is given but a `perf_loss_ths` is
received, the `nebullvm.measure.compute_relative_difference`
metric will be used as the default one. A user-defined metric can
be passed as a function accepting as inputs two tuples of tensors
(produced by the baseline and the quantized model) and the related
original labels.
For more information see
`nebullvm.measure.compute_relative_difference` and
`nebullvm.measure.compute_accuracy_drop`. `perf_metric` also
accepts a string containing the metric name. At the current
stage the supported metrics are `"precision"` and `"accuracy"`.
ys: List of target labels. For each input in `input_texts` there should
be a corresponding label. Note that this feature is only used for
estimating the accuracy drop while running precision-reduction
techniques. It will be ignored if these techniques are not
activated.
"""
if perf_loss_ths is not None and ys is None and perf_metric == "accuracy":
raise ValueError(
"You cannot select the accuracy as quantization metric without "
"providing valid labels!"
)
if isinstance(perf_metric, str):
perf_metric = QUANTIZATION_METRIC_MAP.get(perf_metric)
tokenizer_args = tokenizer_args or {}
tokenizer_args.update({"return_tensors": "pt"})
output_structure, output_type = _get_output_structure(
text=target_text,
text=input_texts[0],
model=model,
tokenizer=tokenizer,
tokenizer_args=tokenizer_args,
)
input_example = tokenizer(target_text, **tokenizer_args)
input_example = tokenizer(input_texts[0], **tokenizer_args)
input_types = [_extract_input_type(v) for v in input_example.values()] or [
"int"
] * len(input_example)
@@ -370,23 +461,37 @@ def optimize_huggingface_model(
extra_input_info=extra_input_info,
use_torch_api=use_torch_api,
dynamic_axis=_get_dynamic_axis(
text=target_text,
text=input_texts[0],
tokenizer=tokenizer,
model=model,
tokenizer_args=tokenizer_args,
)
if not use_static_shape
else None,
quantization_ths=quantization_ths,
ignore_compilers=[ModelCompiler.TENSOR_RT.value]
if use_static_shape
else [
ModelCompiler.TENSOR_RT.value,
ModelCompiler.APACHE_TVM.value,
],
custom_optimizers=_get_extra_optimizer(model.config)
if quantization_ths is None
else None,
perf_loss_ths=perf_loss_ths,
perf_metric=perf_metric,
dataloader=_HFDataset(
input_texts,
ys,
list(wrapper_model.inputs_types.keys()),
batch_size,
tokenizer,
tokenizer_args,
),
ignore_compilers=list(
set(
(
[ModelCompiler.TENSOR_RT.value]
if use_static_shape
else [
ModelCompiler.TENSOR_RT.value,
ModelCompiler.APACHE_TVM.value,
]
)
+ ifnone(ignore_compilers, [])
)
),
custom_optimizers=_get_extra_optimizer(model.config),
)
final_model = HuggingFaceInferenceLearner(
core_inference_learner=optimized_model,
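A minimal sketch of how the _HFDataset wrapper added in this diff batches and tokenizes its inputs. The tokenizer checkpoint and texts are placeholders, and the class is a private helper of this module, so the snippet is purely illustrative.

from transformers import AutoTokenizer
from nebullvm.api.frontend.huggingface import _HFDataset  # private helper, imported only for illustration

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
dataset = _HFDataset(
    input_texts=["first example sentence", "second example sentence"],
    ys=None,                                   # no labels in this sketch
    keywords=["input_ids", "attention_mask"],  # keys read from the tokenizer output
    batch_size=2,
    tokenizer=tokenizer,
    tokenizer_args={"return_tensors": "pt"},
)
inputs, labels = dataset[0]  # tuple of tensors (one per keyword) and the labels (None here)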
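
The perf_metric argument documented in the docstring above also accepts a user-defined callable. A hedged sketch of such a metric, assuming it receives the baseline outputs, the precision-reduced outputs, and the original labels, and returns a scalar error that is compared against perf_loss_ths:

import torch

def relative_output_error(baseline_outputs, reduced_outputs, ys=None):
    # Hypothetical user-defined metric: mean relative difference between the
    # tensors produced by the baseline model and by the precision-reduced one.
    diffs = [
        torch.mean(torch.abs(b - r) / (torch.abs(b) + 1e-8))
        for b, r in zip(baseline_outputs, reduced_outputs)
    ]
    return float(torch.stack(diffs).mean())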

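Putting the new 0.3.0 signature together, a hedged end-to-end sketch of a call to optimize_huggingface_model; the checkpoint name, input sizes, save directory and threshold are illustrative assumptions rather than values taken from this commit.

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from nebullvm.api.frontend.huggingface import optimize_huggingface_model

model_name = "distilbert-base-uncased-finetuned-sst-2-english"  # placeholder checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

input_texts = ["a great movie", "a terrible movie"]  # toy data
ys = [1, 0]  # one label per input text, required by the "accuracy" metric

optimized_model = optimize_huggingface_model(
    model=model,
    tokenizer=tokenizer,
    input_texts=input_texts,
    batch_size=1,
    max_input_sizes=[(128,), (128,)],  # assumed maximum sizes for input_ids and attention_mask
    save_dir="./optimized_model",
    use_static_shape=True,
    perf_loss_ths=0.05,       # tolerate up to ~5% loss on the chosen metric
    perf_metric="accuracy",   # resolved through QUANTIZATION_METRIC_MAP
    ys=ys,
)
# The returned learner exposes the run() method shown earlier in this diff.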