diff --git a/nebullvm/config.py b/nebullvm/config.py index 2a628f22..97a0c2d9 100644 --- a/nebullvm/config.py +++ b/nebullvm/config.py @@ -1,7 +1,7 @@ from nebullvm.optional_modules.torch import torch -VERSION = "0.7.0" +VERSION = "0.7.1" LEARNER_METADATA_FILENAME = "metadata.json" ONNX_OPSET_VERSION = 13 NEBULLVM_DEBUG_FILE = "nebullvm_debug.json" diff --git a/nebullvm/operations/optimizations/compilers/pytorch.py b/nebullvm/operations/optimizations/compilers/pytorch.py index 66d908b4..1e605cc8 100644 --- a/nebullvm/operations/optimizations/compilers/pytorch.py +++ b/nebullvm/operations/optimizations/compilers/pytorch.py @@ -76,16 +76,22 @@ def execute( model, quantization_type, input_tfms, train_input_data ) - self.compiled_model = self._compile_model(model, input_data) + self.compiled_model = self._compile_model( + model, input_data, quantization_type + ) def _compile_model( self, model: Union[Module, GraphModule], input_data: DataManager, + quantization_type: QuantizationType, ) -> ScriptModule: input_sample = input_data.get_list(1)[0] if self.device is Device.GPU: - input_sample = [t.cuda() for t in input_sample] + if quantization_type is QuantizationType.HALF: + input_sample = [t.cuda().half() for t in input_sample] + else: + input_sample = [t.cuda() for t in input_sample] if not isinstance(model, torch.fx.GraphModule): model.eval() diff --git a/nebullvm/operations/optimizations/compilers/tensor_rt.py b/nebullvm/operations/optimizations/compilers/tensor_rt.py index 0441f15a..5d51207e 100644 --- a/nebullvm/operations/optimizations/compilers/tensor_rt.py +++ b/nebullvm/operations/optimizations/compilers/tensor_rt.py @@ -1,4 +1,5 @@ import abc +import copy import os import subprocess from pathlib import Path @@ -153,7 +154,9 @@ def _compile_model( with torch_tensorrt.logging.errors(): trt_model = torch_tensorrt.compile( - model, + model + if dtype is not torch.half + else copy.deepcopy(model).half(), inputs=[ torch_tensorrt.Input( tensor.shape, diff --git a/notebooks/speedster/pytorch/Accelerate_PyTorch_YOLO_with_Speedster.ipynb b/notebooks/speedster/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb similarity index 100% rename from notebooks/speedster/pytorch/Accelerate_PyTorch_YOLO_with_Speedster.ipynb rename to notebooks/speedster/pytorch/Accelerate_PyTorch_YOLOv5_with_Speedster.ipynb diff --git a/notebooks/speedster/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb b/notebooks/speedster/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb new file mode 100644 index 00000000..04a8e429 --- /dev/null +++ b/notebooks/speedster/pytorch/Accelerate_PyTorch_YOLOv8_with_Speedster.ipynb @@ -0,0 +1,363 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3c977e4a", + "metadata": { + "id": "3c977e4a" + }, + "source": [ + "![nebullvm nebuly AI accelerate inference optimize DeepLearning](https://user-images.githubusercontent.com/38586138/201391643-a80407e5-2c28-409c-90c9-327795cd27e8.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Accelerate Ultralytics YOLOv8 with Speedster" + ] + }, + { + "cell_type": "markdown", + "id": "6cfcd562", + "metadata": { + "id": "6cfcd562" + }, + "source": [ + "Hi and welcome 👋\n", + "\n", + "In this notebook we will discover how in just a few steps you can speed up the response time of deep learning model inference using the Speedster app from the open-source library nebullvm.\n", + "\n", + "With Speedster's latest API, you can speed up models up to 10 times without any loss of accuracy (option A), or accelerate 
them up to 20-30 times by defining the amount of accuracy/precision you are willing to trade off for an even lower response time (option B). To accelerate your model, Speedster takes advantage of various optimization techniques such as deep learning compilers (in both option A and option B), quantization, half precision, and so on (option B).\n",
+    "\n",
+    "Let's jump to the code."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%env CUDA_VISIBLE_DEVICES=0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Install Speedster"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install speedster"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python -m nebullvm.installers.auto_installer --backends torch-full --compilers all"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Install Ultralytics YOLOv8"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install ultralytics"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load YOLOv8s"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from ultralytics import YOLO\n",
+    "\n",
+    "yolo = YOLO('yolov8s.pt')"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's create some dummy test data and look at the original output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_data = torch.randn(1, 3, 640, 640)\n",
+    "yolo.model(test_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The original YOLOv8 model returns a tuple where the first element is a tensor and the second is a list of tensors. Speedster currently supports only models that return tensors, so we need to create a wrapper to work around this:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class YOLOWrapper(torch.nn.Module):\n",
+    "    def __init__(self, yolo_model):\n",
+    "        super().__init__()\n",
+    "        self.model = yolo_model.model\n",
+    "\n",
+    "    def forward(self, x, *args, **kwargs):\n",
+    "        res = self.model(x)\n",
+    "        return res[0], *tuple(res[1])\n",
+    "\n",
+    "model_wrapper = YOLOWrapper(yolo)"
+   ]
+  },
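+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check, we can run the wrapper on the dummy input and verify that it now returns a flat tuple of tensors (the exact number of outputs depends on the YOLOv8 head):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The wrapper should return only tensors, which is what Speedster expects\n",
+    "wrapped_out = model_wrapper(test_data)\n",
+    "print(len(wrapped_out), [t.shape for t in wrapped_out])"
+   ]
+  },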
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## YOLOv8s Optimization with GPU"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can now optimize the model using Speedster:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from speedster import optimize_model\n",
+    "\n",
+    "# Provide some input data for the model\n",
+    "input_data = [((torch.randn(1, 3, 640, 640), ), torch.tensor([0])) for i in range(100)]\n",
+    "\n",
+    "# Run Speedster optimization\n",
+    "optimized_model = optimize_model(\n",
+    "    model_wrapper, input_data=input_data, metric_drop_ths=0.1, store_latencies=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can finally restore the original output format by wrapping the optimized model in a new class:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class OptimizedYOLO(torch.nn.Module):\n",
+    "    def __init__(self, optimized_model):\n",
+    "        super().__init__()\n",
+    "        self.model = optimized_model\n",
+    "\n",
+    "    def forward(self, x, *args, **kwargs):\n",
+    "        res = self.model(x)\n",
+    "        return res[0], list(res[1:])\n",
+    "\n",
+    "optimized_wrapper = OptimizedYOLO(optimized_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "optimized_wrapper(test_data.cuda())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## YOLOv8s Optimization with CPU"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from speedster import optimize_model\n",
+    "from ultralytics import YOLO\n",
+    "\n",
+    "yolo = YOLO('yolov8s.pt')\n",
+    "model_wrapper = YOLOWrapper(yolo)\n",
+    "\n",
+    "# Provide some input data for the model\n",
+    "input_data = [((torch.randn(1, 3, 640, 640), ), torch.tensor([0])) for i in range(100)]\n",
+    "\n",
+    "# Run Speedster optimization\n",
+    "optimized_model = optimize_model(\n",
+    "    model_wrapper, input_data=input_data, metric_drop_ths=0.1, store_latencies=True, device=\"cpu\"\n",
+    ")\n",
+    "\n",
+    "optimized_wrapper = OptimizedYOLO(optimized_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "optimized_wrapper(test_data)"
+   ]
+  },
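+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To get a rough idea of the speedup on CPU, we can time the original and the optimized wrappers on the dummy input. This is only a minimal sketch: the numbers vary with hardware, and a rigorous benchmark would use real images and more iterations:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "def benchmark(model, data, n=30):\n",
+    "    # A few warm-up runs, then the average latency over n inferences\n",
+    "    with torch.no_grad():\n",
+    "        for _ in range(5):\n",
+    "            model(data)\n",
+    "        start = time.perf_counter()\n",
+    "        for _ in range(n):\n",
+    "            model(data)\n",
+    "    return (time.perf_counter() - start) / n\n",
+    "\n",
+    "print(f'original:  {benchmark(model_wrapper, test_data):.4f} s/batch')\n",
+    "print(f'optimized: {benchmark(optimized_wrapper, test_data):.4f} s/batch')"
+   ]
+  },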
"cell_type": "markdown", + "id": "6308ddd7", + "metadata": {}, + "source": [ + "We can then load again the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9946f6b", + "metadata": {}, + "outputs": [], + "source": [ + "from nebullvm.operations.inference_learners.base import LearnerMetadata\n", + "\n", + "optimized_model = LearnerMetadata.read(\"model_save_path\").load_model(\"model_save_path\")\n", + "optimized_wrapper = OptimizedYOLO(optimized_model)" + ] + }, + { + "cell_type": "markdown", + "id": "d50807de", + "metadata": { + "id": "d50807de" + }, + "source": [ + "What an amazing result, right?!? Stay tuned for more cool content from the Nebuly team :) " + ] + }, + { + "cell_type": "markdown", + "id": "c90ff6e5", + "metadata": {}, + "source": [ + "