Hi, I have done some benchmarking using the ./trtexec binary. I used it to measure the inference speed of a model and to compare whether a larger batch size or multiple streams gives better inference throughput. The results show that running the model with batch size 1 on 10 streams is faster than running a single model with batch size 10. I wanted to reproduce this with real images (because ./trtexec generates random inputs), but I don't get the same inference speed. What is the correct way to run multiple streams so the models actually execute in parallel?
The multi-stream result was measured with:
./trtexec --loadEngine=path/to/engine.engine --streams=10 --iterations=100
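For the batch-size comparison, the corresponding single-engine run would look roughly like this; the input tensor names and shapes below are placeholders and depend on how the engine was built (this assumes an explicit-batch engine with a dynamic batch dimension):
./trtexec --loadEngine=path/to/engine.engine --shapes=input0:10x1x256x256,input1:10x1x256x256 --iterations=100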
My Python code, where I try to create multiple execution contexts, one per CUDA stream:
import torch
import numpy as np
import time
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
from PIL import Image
from utils import make_query_image
def load_image(img_path):
    with open(img_path, 'rb') as f:
        img = Image.open(f).convert('RGB')
    return img

def preprocess_image(img_path, img_size=(256, 256)):
    img = load_image(img_path)
    resized_img = make_query_image(img, img_size)
    img_array = np.array(resized_img)
    img_tensor = torch.from_numpy(img_array)[None] / 255.0
    img_tensor = img_tensor.unsqueeze(0).to(device=device, dtype=torch.float32)
    return img_tensor
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem, name):
        self.host = host_mem
        self.device = device_mem
        self.name = name

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
class TRTInference:
    def __init__(self, engine_path, dtype=np.float32):
        self.engine_path = engine_path
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        # One execution context, one CUDA stream and one set of buffers per
        # parallel inference (num_contexts is the module-level setting below).
        self.contexts = [self.engine.create_execution_context() for _ in range(num_contexts)]
        self.streams = [cuda.Stream() for _ in range(num_contexts)]
        self.buffers = [self.allocate_buffers() for _ in range(num_contexts)]

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    def allocate_buffers(self):
        # Allocate pinned host memory and device memory for every binding
        # (assumes the engine has static input/output shapes).
        inputs = []
        outputs = []
        bindings = []
        for i in range(self.engine.num_bindings):
            binding = self.engine[i]
            size = trt.volume(self.engine.get_tensor_shape(binding))
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
                inputs.append(HostDeviceMem(host_mem, device_mem, self.engine.get_tensor_name(i)))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem, self.engine.get_tensor_name(i)))
        return inputs, outputs, bindings
    def infer(self, img0, img1, context_idx):
        context = self.contexts[context_idx]
        stream = self.streams[context_idx]
        inputs, outputs, bindings = self.buffers[context_idx]
        img0 = img0.cpu().numpy().astype(np.float32).ravel()
        img1 = img1.cpu().numpy().astype(np.float32).ravel()
        np.copyto(inputs[0].host, img0)
        np.copyto(inputs[1].host, img1)
        # Copy inputs to the device, run inference and copy outputs back,
        # all enqueued on this context's stream.
        for inp in inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, stream)
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        for out in outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, stream)
        # Blocking here means each call finishes before the next one starts,
        # so successive calls do not overlap on the GPU.
        stream.synchronize()
        output1 = outputs[0].host.reshape((1, 1024, 1024))
        output2 = outputs[1].host.reshape((1, 1024, 1024))
        return output1, output2
# Path to the TensorRT engine
trt_engine_path = "Feature_Matching_Model.engine"
img0_path = "path/to/img0.png"
image1_paths = [
    "path/to/img1.png",
    "path/to/img2.png",
    "path/to/img3.png",
    "path/to/img4.png",
    "path/to/img5.png",
]

# Number of parallel execution contexts
num_contexts = 10

# Load and preprocess images
device = 'cuda' if torch.cuda.is_available() else 'cpu'
img0 = preprocess_image(img0_path)

# Preprocess all target images
preprocessed_images = [preprocess_image(img_path) for img_path in image1_paths]

# Instantiate the TRT model with multiple contexts
trt_model = TRTInference(trt_engine_path)

# Warm-up loop to discard the initial inference overhead
warmup_iterations = 100
print("Warming up...")
for _ in range(warmup_iterations):
    for idx in range(len(preprocessed_images)):
        trt_model.infer(img0, preprocessed_images[idx], context_idx=idx % num_contexts)
print("Warm-up completed. Starting timed inferences...")

start_time = time.time()
# Run inference in parallel without ThreadPoolExecutor
results = []
for idx in range(len(preprocessed_images)):
    output1, output2 = trt_model.infer(img0, preprocessed_images[idx], context_idx=idx % num_contexts)
    results.append((output1, output2))
infer_time = time.time() - start_time
print("Inference time:", infer_time)
print("Results shape:", np.shape(results))