Hi, I have done some benchmarking using the ./trtexec binary. I used it to measure the inference speed of a model and to compare whether a larger batch size or multiple streams gives better inference throughput. The results show that running the model with batch size 1 on 10 streams is faster than running a single model with batch size 10. I wanted to reproduce this with real images (because ./trtexec generates random inputs), but I don't get the same inference speed. What is the correct way to run multiple streams so the models actually execute in parallel?
The multi-stream result was measured with:
./trtexec --loadEngine=path/to/engine.engine --streams=10 --iterations=100
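For the batch-size comparison, the corresponding single-engine run would look roughly like this; the input tensor names and shapes below are placeholders and depend on how the engine was built (this assumes an explicit-batch engine with a dynamic batch dimension):
./trtexec --loadEngine=path/to/engine.engine --shapes=input0:10x1x256x256,input1:10x1x256x256 --iterations=100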
My Python code, where I try to create multiple execution contexts, one per CUDA stream:
import torch
import numpy as np
import time
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
from PIL import Image
from utils import make_query_image
def load_image(img_path):
    with open(img_path, 'rb') as f:
        img = Image.open(f).convert('RGB')
    return img

def preprocess_image(img_path, img_size=(256, 256)):
    img = load_image(img_path)
    resized_img = make_query_image(img, img_size)
    img_array = np.array(resized_img)
    img_tensor = torch.from_numpy(img_array)[None] / 255.0
    img_tensor = img_tensor.unsqueeze(0).to(device=device, dtype=torch.float32)
    return img_tensor
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem, name):
        self.host = host_mem
        self.device = device_mem
        self.name = name

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
class TRTInference:
    def __init__(self, engine_path, dtype=np.float32):
        self.engine_path = engine_path
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        # One execution context, one CUDA stream and one set of buffers per
        # parallel inference (num_contexts is the module-level setting below).
        self.contexts = [self.engine.create_execution_context() for _ in range(num_contexts)]
        self.streams = [cuda.Stream() for _ in range(num_contexts)]
        self.buffers = [self.allocate_buffers() for _ in range(num_contexts)]

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    def allocate_buffers(self):
        # Allocate pinned host memory and device memory for every binding
        # (assumes the engine has static input/output shapes).
        inputs = []
        outputs = []
        bindings = []
        for i in range(self.engine.num_bindings):
            binding = self.engine[i]
            size = trt.volume(self.engine.get_tensor_shape(binding))
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
                inputs.append(HostDeviceMem(host_mem, device_mem, self.engine.get_tensor_name(i)))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem, self.engine.get_tensor_name(i)))
        return inputs, outputs, bindings
    def infer(self, img0, img1, context_idx):
        context = self.contexts[context_idx]
        stream = self.streams[context_idx]
        inputs, outputs, bindings = self.buffers[context_idx]
        img0 = img0.cpu().numpy().astype(np.float32).ravel()
        img1 = img1.cpu().numpy().astype(np.float32).ravel()
        np.copyto(inputs[0].host, img0)
        np.copyto(inputs[1].host, img1)
        # Copy inputs to the device, run inference and copy outputs back,
        # all enqueued on this context's stream.
        for inp in inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, stream)
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        for out in outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, stream)
        # Blocking here means each call finishes before the next one starts,
        # so successive calls do not overlap on the GPU.
        stream.synchronize()
        output1 = outputs[0].host.reshape((1, 1024, 1024))
        output2 = outputs[1].host.reshape((1, 1024, 1024))
        return output1, output2
# Path to the TensorRT engine
trt_engine_path = "Feature_Matching_Model.engine"
img0_path = "path/to/img0.png"
image1_paths = [
    "path/to/img1.png",
    "path/to/img2.png",
    "path/to/img3.png",
    "path/to/img4.png",
    "path/to/img5.png",
]

# Number of parallel execution contexts
num_contexts = 10

# Load and preprocess images
device = 'cuda' if torch.cuda.is_available() else 'cpu'
img0 = preprocess_image(img0_path)

# Preprocess all target images
preprocessed_images = [preprocess_image(img_path) for img_path in image1_paths]

# Instantiate the TRT model with multiple contexts
trt_model = TRTInference(trt_engine_path)

# Warm-up loop to discard the initial inference overhead
warmup_iterations = 100
print("Warming up...")
for _ in range(warmup_iterations):
    for idx in range(len(preprocessed_images)):
        trt_model.infer(img0, preprocessed_images[idx], context_idx=idx % num_contexts)
print("Warm-up completed. Starting timed inferences...")

start_time = time.time()
# Run inference in parallel without ThreadPoolExecutor
results = []
for idx in range(len(preprocessed_images)):
    output1, output2 = trt_model.infer(img0, preprocessed_images[idx], context_idx=idx % num_contexts)
    results.append((output1, output2))
infer_time = time.time() - start_time
print("Inference time:", infer_time)
print("Results shape:", np.shape(results))