diff --git a/Dockerfile b/Dockerfile
index c0f2e80a..23d824a3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -60,3 +60,4 @@ RUN if [ "$COMPILER" = "all" ] || [ "$COMPILER" = "tvm" ] ; then \
     fi

 ENV SIGOPT_PROJECT="tmp"
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.8/dist-packages/tensorrt
diff --git a/apps/accelerate/speedster/docs/en/docs/advanced_options.md b/apps/accelerate/speedster/docs/en/docs/advanced_options.md
index 223b9146..ee84a1cd 100644
--- a/apps/accelerate/speedster/docs/en/docs/advanced_options.md
+++ b/apps/accelerate/speedster/docs/en/docs/advanced_options.md
@@ -12,6 +12,7 @@ In particular, we will overview:
 - [Optimization Time: constrained vs unconstrained](#optimization-time-constrained-vs-unconstrained)
 - [Selecting specific compilers/compressors](#select-specific-compilerscompressors)
 - [Using dynamic shape](#using-dynamic-shape)
+- [Enable TensorrtExecutionProvider for ONNXRuntime on GPU](#enable-tensorrtexecutionprovider-for-onnxruntime-on-gpu)
 - [Custom models](#custom-models)
 - [Store the performances of all the optimization techniques](#store-the-performances-of-all-the-optimization-techniques)
 - [Set number of threads](#set-number-of-threads)
@@ -106,7 +107,7 @@ Default: False.

 `device`: str, optional

-Device used for inference, it can be cpu or gpu. gpu will be used if available, otherwise cpu.
+Device used for inference; it can be cpu or gpu/cuda (both the gpu and cuda options are supported). A specific gpu can be selected using the notation gpu:1 or cuda:1. gpu will be used if available, otherwise cpu.

 Default: None.
@@ -135,6 +136,16 @@ optimized_model = optimize_model(
 )
 ```

+If we are working on a multi-gpu machine and want to use a specific gpu, we can use:
+
+```python
+from speedster import optimize_model
+
+optimized_model = optimize_model(
+    model, input_data=input_data, device="cuda:1" # also device="gpu:1" is supported
+)
+```
+
 ## Optimization Time: constrained vs unconstrained

 One of the first options that can be customized in `Speedster` is the `optimization_time` parameter. In order to optimize the model, `Speedster` will try a list of compilers which allow to keep the same accuracy of the original model. In addition to compilers, it can also use other techniques such as pruning, quantization, and other compression techniques which can lead to a little drop in accuracy and may require some time to complete.
@@ -162,11 +173,15 @@ optimized_model = optimize_model(
 # You can find the list of all compilers and compressors below
 # COMPILER_LIST = [
 # "deepsparse",
-# "tensor_rt",
+# "tensor_rt", # Skips all the TensorRT pipelines
+# "torch_tensor_rt", # Skips only the TensorRT pipeline for PyTorch
+# "onnx_tensor_rt", # Skips only the TensorRT pipeline for ONNX
 # "torchscript",
 # "onnxruntime",
 # "tflite",
-# "tvm",
+# "tvm", # Skips all the TVM pipelines
+# "onnx_tvm", # Skips only the TVM pipeline for ONNX
+# "torch_tvm", # Skips only the TVM pipeline for PyTorch
 # "openvino",
 # "bladedisc",
 # "intel_neural_compressor",
@@ -238,6 +253,15 @@ optimized_model = optimize_model(
 )
 ```

+## Enable TensorrtExecutionProvider for ONNXRuntime on GPU
+
+By default, `Speedster` will use the `CUDAExecutionProvider` for ONNXRuntime on GPU. If you want to use the `TensorrtExecutionProvider` instead, you must add the TensorRT installation path to the LD_LIBRARY_PATH env variable.
+If you installed TensorRT through the nebullvm auto_installer, you can do it by running the following command in the terminal:
+
+```bash
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"<path-to-python>/site-packages/tensorrt"
+```
+
 ## Custom models

 `Speedster` is designed to optimize models that take as inputs and return in output only tensors or np.ndarrays (and dictionaries/strings for huggingface). Some models may require instead a custom input, for example a dictionary where the keys are the names of the inputs and the values are the input tensors, or may return a dictionary as output. We can optimize such models with `Speedster` by defining a model wrapper.
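A quick way to confirm that ONNXRuntime can actually see the TensorRT libraries after exporting LD_LIBRARY_PATH is to list the available execution providers. This is a minimal sketch, not part of the patch above; it assumes a GPU build of onnxruntime is installed:

```python
import onnxruntime as ort

# With the TensorRT libraries on LD_LIBRARY_PATH, a GPU build of onnxruntime
# should report "TensorrtExecutionProvider" among the available providers.
print(ort.get_available_providers())
```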
diff --git a/nebullvm/operations/optimizations/compilers/pytorch.py b/nebullvm/operations/optimizations/compilers/pytorch.py
index c2d7345b..92b80b98 100644
--- a/nebullvm/operations/optimizations/compilers/pytorch.py
+++ b/nebullvm/operations/optimizations/compilers/pytorch.py
@@ -92,6 +92,8 @@ def _compile_model(
         if quantization_type is QuantizationType.HALF:
             input_sample = [
                 t.to(self.device.to_torch_format()).half()
+                if torch.is_floating_point(t)
+                else t.to(self.device.to_torch_format())
                 for t in input_sample
             ]
         else:
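The pytorch.py change above guards the half-precision cast so that only floating-point inputs are converted, while integer tensors (for example token ids) keep their dtype. A standalone sketch of the same guard, using made-up example tensors:

```python
import torch

# Example inputs: float features plus integer token ids (ids must keep their dtype).
input_sample = [torch.randn(2, 3), torch.randint(0, 100, (2, 3))]

device = "cuda" if torch.cuda.is_available() else "cpu"

# Cast only floating-point tensors to half precision; move everything to the device.
converted = [
    t.to(device).half() if torch.is_floating_point(t) else t.to(device)
    for t in input_sample
]

print([t.dtype for t in converted])  # [torch.float16, torch.int64]
```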
diff --git a/nebullvm/tools/base.py b/nebullvm/tools/base.py
index 8e0d002c..a14114d4 100644
--- a/nebullvm/tools/base.py
+++ b/nebullvm/tools/base.py
@@ -198,7 +198,7 @@ def get_total_memory(self) -> int:
         )

     def get_free_memory(self) -> int:
-        # Return total memory in bytes using nvidia-smi in bytes
+        # Return free memory in bytes using nvidia-smi
         if self.type is DeviceType.CPU:
             raise Exception("CPU does not have memory")
         else:
diff --git a/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb b/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb
index 9040fd98..639a7ab0 100644
--- a/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb
+++ b/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_BERT_with_Speedster.ipynb
@@ -113,6 +113,32 @@
     "## Model and Dataset setup"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "cf24c4c4",
+   "metadata": {},
+   "source": [
+    "Add the TensorRT installation path to the LD_LIBRARY_PATH env variable in order to activate the TensorrtExecutionProvider for ONNXRuntime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cf8ff74",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\" # Change this path according to your TensorRT location\n",
+    "\n",
+    "if os.path.exists(tensorrt_path):\n",
+    "    os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n",
+    "else:\n",
+    "    print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "e4d55115",
diff --git a/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb b/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb
index 823a372f..f1c84434 100644
--- a/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb
+++ b/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_DistilBERT_with_Speedster.ipynb
@@ -113,6 +113,32 @@
     "## Model and Dataset setup"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "cf24c4c4",
+   "metadata": {},
+   "source": [
+    "Add the TensorRT installation path to the LD_LIBRARY_PATH env variable in order to activate the TensorrtExecutionProvider for ONNXRuntime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cf8ff74",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\" # Change this path according to your TensorRT location\n",
+    "\n",
+    "if os.path.exists(tensorrt_path):\n",
+    "    os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n",
+    "else:\n",
+    "    print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "e4d55115",
diff --git a/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb b/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb
index 7885c299..aad3d7f5 100644
--- a/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb
+++ b/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_GPT2_with_Speedster.ipynb
@@ -113,6 +113,32 @@
     "## Model and Dataset setup"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "cf24c4c4",
+   "metadata": {},
+   "source": [
+    "Add the TensorRT installation path to the LD_LIBRARY_PATH env variable in order to activate the TensorrtExecutionProvider for ONNXRuntime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cf8ff74",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\" # Change this path according to your TensorRT location\n",
+    "\n",
+    "if os.path.exists(tensorrt_path):\n",
+    "    os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n",
+    "else:\n",
+    "    print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "e4d55115",
diff --git a/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb b/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb
index 63fe2a7f..5f326984 100644
--- a/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb
+++ b/notebooks/speedster/huggingface/Accelerate_Hugging_Face_PyTorch_T5_with_Speedster.ipynb
@@ -104,7 +104,7 @@
    },
    "outputs": [],
    "source": [
-    "!python -m nebullvm.installers.auto_installer --backends huggingface --compilers all"
+    "!python -m nebullvm.installers.auto_installer --frameworks huggingface --compilers all"
    ]
   },
  {
@@ -117,6 +117,32 @@
     "## Model and Dataset setup"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "cf24c4c4",
+   "metadata": {},
+   "source": [
+    "Add the TensorRT installation path to the LD_LIBRARY_PATH env variable in order to activate the TensorrtExecutionProvider for ONNXRuntime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cf8ff74",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\" # Change this path according to your TensorRT location\n",
+    "\n",
+    "if os.path.exists(tensorrt_path):\n",
+    "    os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n",
+    "else:\n",
+    "    print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "e4d55115",
diff --git a/notebooks/speedster/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb b/notebooks/speedster/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb
index cfce1b38..91d3bcec 100644
--- a/notebooks/speedster/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb
+++ b/notebooks/speedster/huggingface/Accelerate_Hugging_Face_TensorFlow_BERT_with_Speedster.ipynb
@@ -113,6 +113,32 @@
     "## Model and Dataset setup"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "cf24c4c4",
+   "metadata": {},
+   "source": [
+    "Add the TensorRT installation path to the LD_LIBRARY_PATH env variable in order to activate the TensorrtExecutionProvider for ONNXRuntime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cf8ff74",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "tensorrt_path = \"/usr/local/lib/python3.8/dist-packages/tensorrt\" # Change this path according to your TensorRT location\n",
+    "\n",
+    "if os.path.exists(tensorrt_path):\n",
+    "    os.environ['LD_LIBRARY_PATH'] += f\":{tensorrt_path}\"\n",
+    "else:\n",
+    "    print(\"Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "e4d55115",
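The notebook cells above extend LD_LIBRARY_PATH with `os.environ['LD_LIBRARY_PATH'] += ...`, which assumes the variable is already set. A small sketch of the same idea that also handles an unset variable, assuming the same default dist-packages path used in the notebooks:

```python
import os

# Assumed install location; adjust to wherever the tensorrt package lives.
tensorrt_path = "/usr/local/lib/python3.8/dist-packages/tensorrt"

if os.path.exists(tensorrt_path):
    # Append the TensorRT directory, creating the variable if it is unset.
    current = os.environ.get("LD_LIBRARY_PATH", "")
    os.environ["LD_LIBRARY_PATH"] = f"{current}:{tensorrt_path}" if current else tensorrt_path
else:
    print("Unable to find TensorRT path. ONNXRuntime won't use TensorrtExecutionProvider.")
```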