文章目录
前言
公司最近需要对大疆的妙算3适配目标检测算法进行预研,本文记录下适配的过程。
一、妙算3是什么?
大疆妙算3是大疆(DJI)推出的一款高性能、小型化的机载计算机,它的核心定位是为机器人、无人机及其他智能移动设备提供强大的边缘计算能力,可以理解为专为空中和地面移动平台设计的“超级大脑”。
本质上,妙算3是基于NVIDIA Orin开发者套件的一款产品,所以NVIDIA Jetson的整套工具链和使用方法在妙算3上同样适用。不同的是,Jetson开放了root权限,而妙算3没有开放root权限,因此妙算3无法直接通过apt-get install安装所需的包与库。
二、环境准备
1.查看设备信息
jtop

2.下载项目源码
git clone -b v7.0 https://github.com/ultralytics/yolov5.git
git clone -b yolov5-v7.0 https://github.com/wang-xinyu/tensorrtx.git
三、trt-检测
3.1 使用 .pt生成 .wts 文件
cd yolov5/
# yolov5s.pt下载,wget无法使用就手动下载
wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt
cp [PATH-TO-TENSORRTX]/yolov5/gen_wts.py .
python3 gen_wts.py -w yolov5s.pt -o yolov5s.wts
# 将会生成一个'yolov5s.wts'文件,这里按需通过pip安装所需的库
3.2 构建 tensorrtx/yolov5 并运行
[PATH-TO-TENSORRTX]/yolov5/src/config.h可以修改类别数目kNumClass,批处理数kBatchSize
[PATH-TO-TENSORRTX]/yolov5/下的CMakeLists.txt替换成下面的文件
# Build script for the tensorrtx YOLOv5 sample: builds the TensorRT plugin
# library (myplugins) and the yolov5_det executable, linking TensorRT, the
# CUDA runtime and OpenCV.
cmake_minimum_required(VERSION 3.10)
project(yolov5)
add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
# NOTE(review): option() normally takes a help string before the default
# value; confirm that this two-argument form does what is intended.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Release) # switched to Release for better performance
# Detect a Jetson device: any machine exposing /proc/device-tree/model is
# treated as a Jetson, and its model string is kept for the arch selection below.
if(EXISTS "/proc/device-tree/model")
file(READ "/proc/device-tree/model" JETSON_MODEL)
set(IS_JETSON TRUE)
message(STATUS "Building on Jetson device: ${JETSON_MODEL}")
else()
set(IS_JETSON FALSE)
endif()
# Locate OpenCV
find_package(OpenCV REQUIRED)
if(OpenCV_FOUND)
message(STATUS "Found OpenCV: ${OpenCV_VERSION}")
message(STATUS "OpenCV include dir: ${OpenCV_INCLUDE_DIRS}")
message(STATUS "OpenCV libraries: ${OpenCV_LIBS}")
else()
message(FATAL_ERROR "OpenCV not found!")
endif()
# Use the legacy CUDA support (FindCUDA) to avoid enable_language problems
if(IS_JETSON)
# On Jetson, use the FindCUDA module
find_package(CUDA REQUIRED)
if(CUDA_FOUND)
message(STATUS "Found CUDA: ${CUDA_VERSION}")
message(STATUS "CUDA include dir: ${CUDA_INCLUDE_DIRS}")
message(STATUS "CUDA libraries: ${CUDA_LIBRARIES}")
# Set the CUDA NVCC flags
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
# Jetson-specific GPU architecture, chosen by matching the model string
if(JETSON_MODEL MATCHES "Nano")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_53")
elseif(JETSON_MODEL MATCHES "TX2")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_62")
elseif(JETSON_MODEL MATCHES "Xavier")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_72")
elseif(JETSON_MODEL MATCHES "Orin")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_87")
else()
# Unrecognized model string: falls back to sm_53.
# NOTE(review): verify this fallback is suitable for the target board.
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_53")
endif()
else()
message(FATAL_ERROR "CUDA not found on Jetson!")
endif()
# System (aarch64) headers and libraries, currently disabled
#include_directories(/usr/include/aarch64-linux-gnu/)
#link_directories(/usr/lib/aarch64-linux-gnu/)
# CUDA headers and libraries
# NOTE(review): the CUDA 11.4 install path is hard-coded below; adjust it if
# the toolkit lives elsewhere on the target device.
#include_directories(${CUDA_INCLUDE_DIRS})
include_directories(/usr/local/cuda-11.4/include)
#link_directories(${CUDA_LIBRARIES})
link_directories(/usr/local/cuda-11.4/lib64/)
endif()
include_directories(${PROJECT_SOURCE_DIR}/src/)
include_directories(${PROJECT_SOURCE_DIR}/plugin/)
# Add the OpenCV include directories
include_directories(${OpenCV_INCLUDE_DIRS})
#file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp)
#file(GLOB_RECURSE PLUGIN_SRCS ${PROJECT_SOURCE_DIR}/plugin/*.cu)
# Collect both .cpp and .cu sources from src/, and the plugin .cu sources
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
file(GLOB_RECURSE PLUGIN_SRCS ${PROJECT_SOURCE_DIR}/plugin/*.cu)
# With FindCUDA, .cu files need special handling
if(IS_JETSON AND CUDA_FOUND)
# Use CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE
cuda_add_library(myplugins SHARED ${PLUGIN_SRCS})
cuda_add_executable(yolov5_det ${PROJECT_SOURCE_DIR}/yolov5_det.cpp ${SRCS})
#cuda_add_executable(yolov5_seg ${PROJECT_SOURCE_DIR}/yolov5_seg.cpp ${SRCS})
else()
# Original handling
add_library(myplugins SHARED ${PLUGIN_SRCS})
add_executable(yolov5_det yolov5_det.cpp ${SRCS})
#add_executable(yolov5_seg yolov5_seg.cpp ${SRCS})
endif()
target_link_libraries(myplugins nvinfer cudart)
# Link the OpenCV libraries into yolov5_det
target_link_libraries(yolov5_det nvinfer cudart myplugins ${OpenCV_LIBS})
#target_link_libraries(yolov5_seg nvinfer cudart myplugins ${OpenCV_LIBS})
cd [PATH-TO-TENSORRTX]/yolov5/
mkdir build
cd build
cp [PATH-TO-ultralytics-yolov5]/yolov5s.wts .
cmake ..
make
会生成yolov5_det可执行文件
./yolov5_det -s yolov5s.wts yolov5s.engine s
会生成yolov5s.engine模型
推理图片
cd ..
python3 yolov5_det_trt.py

3.3 tensorrt调用检测模型跑视频文件(支持batch)
"""
An example that uses TensorRT's Python api to make inferences on video.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
# Confidence and IoU thresholds passed to non_max_suppression by post_process.
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
# Flat output length per image and length of one detection row: infer() slices
# the output buffer in LEN_ALL_RESULT chunks, and post_process() reads element 0
# as the box count and reshapes the rest into rows of LEN_ONE_RESULT values.
# NOTE(review): 38001 = 1 + 1000 * 38; presumably this must match the engine's
# maximum box count - confirm against the tensorrtx config.
LEN_ALL_RESULT = 38001
LEN_ONE_RESULT = 38
# Apply the NumPy compatibility fix before importing other modules:
# restore the np.bool alias when the installed NumPy does not provide it.
import sys
if not hasattr(np, 'bool'):
np.bool = bool
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box (and an optional label) on img in place.
                 This function comes from the YOLOv5 project.
    param:
        x: a box like [x1, y1, x2, y2] in pixel coordinates
        img: an OpenCV image object, modified in place
        color: color to draw the rectangle, such as (0, 255, 0);
               green is used when omitted
        label: str, text drawn on a filled background above the box;
               nothing is drawn when it is empty or None
        line_thickness: int, line width; derived from the image size when omitted
    return:
        no return
    """
    # Line/font thickness scales with the image size unless given explicitly.
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    # Honor the caller's color. The previous code overwrote the parameter
    # unconditionally, so passing a color had no effect.
    if color is None:
        color = [0, 255, 0]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        # Filled background sized to the text, anchored at the box's top-left corner.
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)
        cv2.putText(
            img,
            label,
            (c1[0], c1[1] - 2),
            0,
            tl / 3,
            [225, 255, 255],
            thickness=tf,
            lineType=cv2.LINE_AA,
        )
class YoLov5TRT(object):
    """
    description: A YOLOv5 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the TensorRT engine and
                     allocate one host/device buffer pair per engine binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        raise:
            RuntimeError: when the engine cannot be deserialized
        """
        # Create a Context on this device; it stays on the context stack until
        # destroy() pops it.
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)
            # Deserialize the engine from file
            with open(engine_file_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            if engine is None:
                # Fail with a clear message instead of an AttributeError on None.
                raise RuntimeError(
                    "Failed to deserialize TensorRT engine: {}".format(engine_file_path)
                )
            context = engine.create_execution_context()
            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []
            for binding in engine:
                shape = engine.get_binding_shape(binding)
                print('bingding:', binding, shape)
                # Buffers are sized for the engine's maximum batch size.
                size = trt.volume(shape) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    # The last two dimensions of the input binding are (height, width).
                    self.input_w = shape[-1]
                    self.input_h = shape[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)
        except Exception:
            # Do not leave the freshly created context on the stack when
            # construction fails.
            self.ctx.pop()
            raise
        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
def infer(self, raw_image_generator):
    """
    description: Preprocess one batch of images, run the TensorRT engine and
                 draw the detections on the original images.
    param:
        raw_image_generator: iterable yielding at most batch_size BGR images;
                             a None item is replaced by a black frame
    return:
        batch_image_raw: list of the input images with boxes drawn in place
        use_time: seconds spent on preprocessing, inference and drawing
    """
    start = time.time()
    # Restore
    stream = self.stream
    context = self.context
    host_inputs = self.host_inputs
    cuda_inputs = self.cuda_inputs
    host_outputs = self.host_outputs
    cuda_outputs = self.cuda_outputs
    bindings = self.bindings
    batch_image_raw = []
    batch_origin_h = []
    batch_origin_w = []
    # Zero-initialised float32 staging buffer: slots the generator does not
    # fill hold zeros instead of uninitialised memory, and the preprocessed
    # images are float32 already so no float64 round trip is needed.
    batch_input_image = np.zeros(
        (self.batch_size, 3, self.input_h, self.input_w), dtype=np.float32
    )
    # Make self the active context, pushing it on top of the context stack.
    self.ctx.push()
    try:
        # Do image preprocess
        for i, image_raw in enumerate(raw_image_generator):
            if image_raw is None:
                # Use black image for empty frames
                image_raw = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(
            batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle
        )
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
    finally:
        # Always rebalance the context stack, even when a step above raises.
        self.ctx.pop()
    # The flat output holds LEN_ALL_RESULT values per image in the batch.
    output = host_outputs[0]
    # Post-process only the frames that were actually supplied.
    for i, image in enumerate(batch_image_raw):
        result_boxes, result_scores, result_classid = self.post_process(
            output[i * LEN_ALL_RESULT: (i + 1) * LEN_ALL_RESULT],
            batch_origin_h[i],
            batch_origin_w[i],
        )
        # Draw rectangles and labels on the original image.
        # `categories` is the module-level label list defined by the script.
        for box, score, class_id in zip(result_boxes, result_scores, result_classid):
            plot_one_box(
                box,
                image,
                label="{}:{:.2f}".format(categories[int(class_id)], score),
            )
    end = time.time()
    return batch_image_raw, end - start
def destroy(self):
    """
    description: Deactivate this instance's CUDA context by popping it off
                 the context stack.
    return:
        no return
    """
    cuda_context = self.ctx
    cuda_context.pop()
def get_video_frames(self, video_path, batch_size=None):
    """
    description: Generator that reads a video and yields its frames in batches.
    param:
        video_path: str, path of the video file
        batch_size: int, frames per batch; defaults to self.batch_size
    yield:
        a list of batch_size frames; the final partial batch is padded with
        None. If the video cannot be opened, a single None is yielded.
    raise:
        ValueError: when batch_size is smaller than 1
    """
    if batch_size is None:
        batch_size = self.batch_size
    if batch_size < 1:
        # A non-positive batch size would otherwise yield empty batches forever.
        raise ValueError("batch_size must be a positive integer")
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Error: Cannot open video file {video_path}")
            yield None
            return
        while True:
            frames = []
            for _ in range(batch_size):
                ret, frame = cap.read()
                if not ret:
                    if frames:  # Return the last partial batch
                        frames.extend([None] * (batch_size - len(frames)))
                        yield frames
                    return
                frames.append(frame)
            yield frames
    finally:
        # Release the capture on every exit path, including early close of
        # the generator by the consumer.
        cap.release()
def get_raw_image_zeros(self, image_path_batch=None):
    """
    description: Yield batch_size all-black uint8 frames of shape
                 (input_h, input_w, 3), used as warm-up data.
    param:
        image_path_batch: unused
    """
    yield from (
        np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
        for _ in range(self.batch_size)
    )
def preprocess_image(self, raw_bgr_image):
    """
    description: Convert a BGR image to RGB, resize it with its aspect ratio
                 kept, pad it with gray to the network input size, scale it
                 to [0, 1] and lay it out as NCHW.
    param:
        raw_bgr_image: a BGR image array (H, W, 3); None is replaced by a
                       black image of the network input size
    return:
        image: the processed float32 image, shape (1, 3, input_h, input_w)
        image_raw: the original image
        h: original height
        w: original width
    """
    if raw_bgr_image is None:
        # Substitute a black image
        raw_bgr_image = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)
    image_raw = raw_bgr_image
    h, w, _ = image_raw.shape
    net_w, net_h = self.input_w, self.input_h
    rgb = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
    # Scale factors along each axis; the smaller one limits the resize.
    r_w = net_w / w
    r_h = net_h / h
    if r_h > r_w:
        # Width is the limiting side: pad top and bottom.
        tw, th = net_w, int(r_w * h)
        left = right = 0
        top = int((net_h - th) / 2)
        bottom = net_h - th - top
    else:
        # Height is the limiting side: pad left and right.
        tw, th = int(r_h * w), net_h
        left = int((net_w - tw) / 2)
        right = net_w - tw - left
        top = bottom = 0
    resized = cv2.resize(rgb, (tw, th))
    # Pad the short side with (128,128,128)
    padded = cv2.copyMakeBorder(
        resized, top, bottom, left, right, cv2.BORDER_CONSTANT, None, (128, 128, 128)
    )
    # Normalize to [0,1]
    tensor = padded.astype(np.float32)
    tensor /= 255.0
    # HWC -> CHW -> NCHW, stored contiguously in C order
    tensor = np.ascontiguousarray(
        np.expand_dims(np.transpose(tensor, [2, 0, 1]), axis=0)
    )
    return tensor, image_raw, h, w
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Convert nx4 boxes from network-input [center_x, center_y, w, h]
                 to [x1, y1, x2, y2] in original-image coordinates, removing
                 the letterbox padding and scaling.
    param:
        origin_h: height of original image
        origin_w: width of original image
        x: A boxes numpy, each row is a box [center_x, center_y, w, h]
    return:
        y: A boxes numpy, each row is a box [x1, y1, x2, y2]
    """
    ratio_w = self.input_w / origin_w
    ratio_h = self.input_h / origin_h
    corners = np.zeros_like(x)
    center_x, center_y = x[:, 0], x[:, 1]
    half_w, half_h = x[:, 2] / 2, x[:, 3] / 2
    if ratio_h > ratio_w:
        # Image was padded top/bottom: shift y by the vertical padding.
        pad = (self.input_h - ratio_w * origin_h) / 2
        corners[:, 0] = center_x - half_w
        corners[:, 2] = center_x + half_w
        corners[:, 1] = center_y - half_h - pad
        corners[:, 3] = center_y + half_h - pad
        scale = ratio_w
    else:
        # Image was padded left/right: shift x by the horizontal padding.
        pad = (self.input_w - ratio_h * origin_w) / 2
        corners[:, 0] = center_x - half_w - pad
        corners[:, 2] = center_x + half_w - pad
        corners[:, 1] = center_y - half_h
        corners[:, 3] = center_y + half_h
        scale = ratio_h
    # Back to original-image pixels
    corners /= scale
    return corners
def post_process(self, output, origin_h, origin_w):
    """
    description: Decode the flat engine output of one image and apply NMS.
    param:
        output: A flat numpy array; element 0 is the number of boxes, followed
                by rows of LEN_ONE_RESULT values of which the first six are
                [cx, cy, w, h, conf, cls_id]
        origin_h: height of original image
        origin_w: width of original image
    return:
        result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
        result_scores: final scores, a numpy, each element is the score of the matching box
        result_classid: final class ids, a numpy, each element is the class id of the matching box
    """
    # The first value is the number of detected boxes
    box_count = int(output[0])
    # View the remainder as rows and keep only the detected ones,
    # restricted to the six box/score/class columns
    rows = np.reshape(output[1:], (-1, LEN_ONE_RESULT))
    candidates = rows[:box_count, :6]
    kept = self.non_max_suppression(
        candidates, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD
    )
    if not len(kept):
        return np.array([]), np.array([]), np.array([])
    return kept[:, :4], kept[:, 4], kept[:, 5]
def bbox_iou(self, box1, box2, x1y1x2y2=True):
    """
    description: Compute the IoU between boxes, using the inclusive-pixel
                 (+1) convention for widths and heights.
    param:
        box1: boxes as rows of (x1, y1, x2, y2) or (x, y, w, h)
        box2: boxes as rows of (x1, y1, x2, y2) or (x, y, w, h)
        x1y1x2y2: True when the rows are corner coordinates, False when they
                  are center/size
    return:
        iou: computed iou
    """
    if x1y1x2y2:
        # Rows already hold corner coordinates
        left_a, top_a, right_a, bottom_a = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
        left_b, top_b, right_b, bottom_b = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
    else:
        # Derive the corners from center and size
        left_a, right_a = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
        top_a, bottom_a = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
        left_b, right_b = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
        top_b, bottom_b = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
    # Extent of the intersection rectangle, clipped at zero when disjoint
    overlap_w = np.clip(
        np.minimum(right_a, right_b) - np.maximum(left_a, left_b) + 1, 0, None
    )
    overlap_h = np.clip(
        np.minimum(bottom_a, bottom_b) - np.maximum(top_a, top_b) + 1, 0, None
    )
    intersection = overlap_w * overlap_h
    area_a = (right_a - left_a + 1) * (bottom_a - top_a + 1)
    area_b = (right_b - left_b + 1) * (bottom_b - top_b + 1)
    union = area_a + area_b - intersection
    # The small epsilon guards against division by zero
    return intersection / (union + 1e-16)
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
    """
    description: Remove detections whose confidence is below 'conf_thres' and
                 perform per-class Non-Maximum Suppression on the rest.
    param:
        prediction: detections, rows of (cx, cy, w, h, conf, cls_id, ...) in
                    network-input coordinates; extra trailing columns are
                    carried through unchanged
        origin_h: original image height
        origin_w: original image width
        conf_thres: a confidence threshold to filter detections
        nms_thres: an iou threshold to filter detections
    return:
        boxes: output after nms, rows of (x1, y1, x2, y2, conf, cls_id, ...)
               in original-image coordinates; an empty array when nothing is kept
    """
    # Get the boxes that score >= conf_thres (boolean indexing makes a copy)
    boxes = prediction[prediction[:, 4] >= conf_thres]
    if boxes.shape[0] == 0:
        return np.array([])
    # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
    boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
    # Clip the coordinates to the image
    boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
    boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
    boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
    boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
    # Sort by confidence, highest first
    boxes = boxes[np.argsort(-boxes[:, 4])]
    # Perform non-maximum suppression
    keep_boxes = []
    while boxes.shape[0]:
        large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
        # Compare the class-id column (index 5) rather than the last column,
        # so rows that carry extra trailing values are still matched by class.
        label_match = boxes[0, 5] == boxes[:, 5]
        # Boxes with lower confidence, large IoU and matching label
        invalid = large_overlap & label_match
        # The top box is always consumed; otherwise a self-IoU that does not
        # exceed nms_thres (NaN coordinates, nms_thres >= 1) would loop forever.
        invalid[0] = True
        keep_boxes.append(boxes[0])
        boxes = boxes[~invalid]
    return np.stack(keep_boxes, 0) if keep_boxes else np.array([])
def process_video(yolov5_wrapper, video_path, output_path=None, show_video=False):
    """
    description: Run batched detection over a video file, optionally showing
                 and/or saving the annotated frames, and print throughput
                 statistics.
    param:
        yolov5_wrapper: YoLov5TRT instance (provides batch_size and infer())
        video_path: path to input video
        output_path: path to save output video (optional)
        show_video: whether to show video in real-time; pressing 'q' stops
                    the whole run
    return:
        no return
    """
    # Open video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Cannot open video file {video_path}")
        cap.release()
        return

    def frame_generator(cap, batch_size):
        """Yield lists of batch_size frames; the last batch is padded with None."""
        while True:
            frames = []
            for _ in range(batch_size):
                ret, frame = cap.read()
                if not ret:
                    if frames:  # Return the last partial batch
                        frames.extend([None] * (batch_size - len(frames)))
                        yield frames
                    return
                frames.append(frame)
            yield frames

    out_writer = None
    frame_count = 0
    total_time = 0
    try:
        # Get video properties
        source_fps = cap.get(cv2.CAP_PROP_FPS)
        fps = int(source_fps)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"Video Info: {width}x{height} @ {fps}fps, Total frames: {total_frames}")
        # Video writer for output
        if output_path:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            # Use the untruncated source rate so fractional rates such as
            # 29.97 fps keep their playback speed.
            out_writer = cv2.VideoWriter(output_path, fourcc, source_fps, (width, height))
        stop_requested = False
        for batch_frames in frame_generator(cap, yolov5_wrapper.batch_size):
            # Perform inference
            processed_frames, inference_time = yolov5_wrapper.infer(iter(batch_frames))
            total_time += inference_time
            # Display and save frames
            for source_frame, frame in zip(batch_frames, processed_frames):
                if source_frame is None:  # Skip padding frames
                    continue
                frame_count += 1
                if show_video:
                    cv2.imshow('YOLOv5 TensorRT Inference', frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        # Leave both loops; a bare break would only end this batch.
                        stop_requested = True
                        break
                if out_writer:
                    out_writer.write(frame)
            if stop_requested:
                break
            # Print progress
            if frame_count % 10 == 0 and total_time > 0:
                print(f"Processed {frame_count}/{total_frames} frames, "
                      f"FPS: {1.0/(total_time/frame_count):.2f}")
    finally:
        # Clean up on every exit path, including exceptions raised by infer().
        cap.release()
        if out_writer:
            out_writer.release()
        if show_video:
            cv2.destroyAllWindows()
    print(f"\nInference complete!")
    print(f"Total frames processed: {frame_count}")
    print(f"Total time: {total_time:.2f}s")
    # Guard the division: no time is accumulated when no batch was processed.
    average_fps = frame_count / total_time if total_time > 0 else 0.0
    print(f"Average FPS: {average_fps:.2f}")
if __name__ == "__main__":
    # Locations of the custom plugin library, the serialized engine and the
    # input video on the target device.
    PLUGIN_LIBRARY = "/home/dji/tensorrtx-yolov5-v7.0/yolov5/build_b4/libmyplugins.so"
    engine_file_path = "/home/dji/tensorrtx-yolov5-v7.0/yolov5/build_b4/yolov5s.engine"
    video_path = "/home/dji/video/test_person.mp4"
    # Set to a file name such as "output.mp4" to save the annotated video.
    output_path = None

    # The plugin library must be loaded before the engine is deserialized.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # COCO labels; infer() looks this list up as a module-level name.
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush",
    ]

    # Build the TensorRT wrapper
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    try:
        print('Batch size is', yolov5_wrapper.batch_size)
        # Warm up on black frames before measuring the real video
        print("Warming up...")
        for _ in range(10):
            warmup_frames = yolov5_wrapper.get_raw_image_zeros()
            batch_image_raw, use_time = yolov5_wrapper.infer(warmup_frames)
            print('Warm up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
        # Process video
        process_video(yolov5_wrapper, video_path, output_path, show_video=False)
    finally:
        # Pop the CUDA context whether or not processing succeeded
        yolov5_wrapper.destroy()
四、trt-分割
4.1 使用 .pt生成 .wts 文件
cd yolov5/
# yolov5s-seg.pt下载,wget无法使用就手动下载
wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s-seg.pt
cp [PATH-TO-TENSORRTX]/yolov5/gen_wts.py .
python gen_wts.py -w yolov5s-seg.pt -o yolov5s-seg.wts -t seg
# 将会生成一个'yolov5s-seg.wts'文件
4.2 构建 tensorrtx/yolov5 并运行
在3.2节的CMakeLists.txt文件中做如下修改(取消其中yolov5_seg相关行的注释,以便生成yolov5_seg可执行文件):

cd [PATH-TO-TENSORRTX]/yolov5/
mkdir build
cd build
cp [PATH-TO-ultralytics-yolov5]/yolov5s-seg.wts .
cmake ..
make
./yolov5_seg -s yolov5s-seg.wts yolov5s-seg.engine s
至此,会生成yolov5s-seg.engine模型
推理图片
cd ..
python3 yolov5_seg_trt.py

4.3 tensorrt调用分割模型跑视频文件(支持batch)
"""
An example that uses TensorRT's Python api to make inferences on video for YOLOv5 instance segmentation.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
# Confidence and IoU thresholds passed to non_max_suppression by post_process.
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
# Fix the NumPy compatibility issue before importing other modules:
# restore the np.bool alias when the installed NumPy does not provide it.
if not hasattr(np, 'bool'):
np.bool = bool
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box (and an optional label) on img in place.
                 This function comes from the YOLOv5 project.
    param:
        x: a box like [x1, y1, x2, y2] in pixel coordinates
        img: an OpenCV image object, modified in place
        color: color to draw the rectangle, such as (0, 255, 0);
               green is used when omitted
        label: str, text drawn on a filled background above the box;
               nothing is drawn when it is empty or None
        line_thickness: int, line width; derived from the image size when omitted
    return:
        no return
    """
    # Line/font thickness scales with the image size unless given explicitly.
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    # Honor the caller's color. The previous code overwrote the parameter
    # unconditionally, so passing a color had no effect.
    if color is None:
        color = [0, 255, 0]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        # Filled background sized to the text, anchored at the box's top-left corner.
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)
        cv2.putText(
            img,
            label,
            (c1[0], c1[1] - 2),
            0,
            tl / 3,
            [225, 255, 255],
            thickness=tf,
            lineType=cv2.LINE_AA,
        )
class YoLov5TRT(object):
    """
    description: A YOLOv5 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the TensorRT engine,
                     allocate one host/device buffer pair per engine binding
                     and derive the per-image output lengths.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        raise:
            RuntimeError: when the engine cannot be deserialized or exposes
                          fewer than two output bindings
        """
        # Create a Context on this device; it stays on the context stack until
        # destroy() pops it.
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)
            # Deserialize the engine from file
            with open(engine_file_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            if engine is None:
                # Fail with a clear message instead of an AttributeError on None.
                raise RuntimeError(
                    "Failed to deserialize TensorRT engine: {}".format(engine_file_path)
                )
            context = engine.create_execution_context()
            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []
            for binding in engine:
                shape = engine.get_binding_shape(binding)
                print('bingding:', binding, shape)
                # Buffers are sized for the engine's maximum batch size.
                size = trt.volume(shape) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    # The last two dimensions of the input binding are (height, width).
                    self.input_w = shape[-1]
                    self.input_h = shape[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)
            if len(host_outputs) < 2:
                # Both the detection output and the mask output are read below.
                raise RuntimeError(
                    "Segmentation engine must expose two outputs, found {}".format(
                        len(host_outputs)
                    )
                )
        except Exception:
            # Do not leave the freshly created context on the stack when
            # construction fails.
            self.ctx.pop()
            raise
        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        # Data length: total length of each output buffer across all batches
        self.det_output_total_length = host_outputs[0].shape[0]
        self.mask_output_total_length = host_outputs[1].shape[0]
        # Length per batch
        self.det_output_per_batch = self.det_output_total_length // self.batch_size
        self.mask_output_per_batch = self.mask_output_total_length // self.batch_size
        # Segmentation parameters: mask size is a quarter of the input size
        self.seg_w = int(self.input_w / 4)
        self.seg_h = int(self.input_h / 4)
        self.seg_c = 32  # Typically 32 for yolov5-seg
        # One detection row = 6 box/score/class values + seg_c mask coefficients
        self.det_row_output_length = self.seg_c + 6
        # Draw mask
        self.colors_obj = Colors()
        print(f"Batch size: {self.batch_size}")
        print(f"Detection output per batch: {self.det_output_per_batch}")
        print(f"Mask output per batch: {self.mask_output_per_batch}")
        print(f"Segmentation params: w={self.seg_w}, h={self.seg_h}, c={self.seg_c}")
        print(f"Detection row output length: {self.det_row_output_length}")
def infer(self, raw_image_generator):
    """
    description: Preprocess one batch of images, run the TensorRT engine and
                 draw the detected boxes on the original images.
    param:
        raw_image_generator: iterable yielding at most batch_size BGR images;
                             iteration stops at the first None item and the
                             remaining slots are padded with black frames
    return:
        processed_frames: list of batch_size images; real frames carry the
                          drawn boxes, padded slots are black images
        use_time: seconds spent on host-to-device copy, inference and
                  device-to-host copy
    """
    # Restore
    stream = self.stream
    context = self.context
    host_inputs = self.host_inputs
    cuda_inputs = self.cuda_inputs
    host_outputs = self.host_outputs
    cuda_outputs = self.cuda_outputs
    bindings = self.bindings
    batch_image_raw = []
    batch_origin_h = []
    batch_origin_w = []
    # Zero-initialised float32 staging buffer. A black frame preprocesses to
    # all zeros, so unfilled slots already hold the padded input.
    batch_input_image = np.zeros(
        (self.batch_size, 3, self.input_h, self.input_w), dtype=np.float32
    )
    valid_frames_count = 0
    # Make self the active context, pushing it on top of the context stack.
    self.ctx.push()
    try:
        # Do image preprocess
        for i, image_raw in enumerate(raw_image_generator):
            if image_raw is None:
                # No more real frames in this batch
                break
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
            valid_frames_count += 1
        # If not enough frames, pad the bookkeeping lists with black images
        for _ in range(valid_frames_count, self.batch_size):
            batch_image_raw.append(
                np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)
            )
            batch_origin_h.append(self.input_h)
            batch_origin_w.append(self.input_w)
        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(
            batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle
        )
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
    finally:
        # Always rebalance the context stack, even when a step above raises.
        self.ctx.pop()
    output_bbox = host_outputs[0]
    per_frame = self.det_output_per_batch
    # Do postprocess for each valid frame; padded frames pass through untouched
    processed_frames = []
    for i, frame in enumerate(batch_image_raw):
        if i < valid_frames_count:
            result_boxes, result_scores, result_classid, result_proto_coef = self.post_process(
                output_bbox[i * per_frame: (i + 1) * per_frame],
                batch_origin_h[i],
                batch_origin_w[i],
            )
            # Mask rendering (process_mask / draw_mask) is disabled here; the
            # proto masks of frame i live in host_outputs[1] at
            # [i * mask_output_per_batch : (i + 1) * mask_output_per_batch].
            # Draw rectangles and labels on the original image.
            # `categories` is the module-level label list defined by the script.
            for box, score, class_id in zip(result_boxes, result_scores, result_classid):
                plot_one_box(
                    box,
                    frame,
                    label="{}:{:.2f}".format(categories[int(class_id)], score),
                )
        processed_frames.append(frame)
    return processed_frames, end - start
def destroy(self):
    """
    description: Deactivate this instance's CUDA context by popping it off
                 the context stack.
    return:
        no return
    """
    cuda_context = self.ctx
    cuda_context.pop()
def get_video_frames(self, video_path, batch_size=None):
    """
    description: Generator that reads a video and yields its frames in batches.
    param:
        video_path: str, path of the video file
        batch_size: int, frames per batch; defaults to self.batch_size
    yield:
        a list of batch_size frames; the final partial batch is padded with
        None. If the video cannot be opened, a single None is yielded.
    raise:
        ValueError: when batch_size is smaller than 1
    """
    if batch_size is None:
        batch_size = self.batch_size
    if batch_size < 1:
        # A non-positive batch size would otherwise yield empty batches forever.
        raise ValueError("batch_size must be a positive integer")
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Error: Cannot open video file {video_path}")
            yield None
            return
        while True:
            frames = []
            for _ in range(batch_size):
                ret, frame = cap.read()
                if not ret:
                    if frames:  # Return the last partial batch
                        frames.extend([None] * (batch_size - len(frames)))
                        yield frames
                    return
                frames.append(frame)
            yield frames
    finally:
        # Release the capture on every exit path, including early close of
        # the generator by the consumer.
        cap.release()
def get_raw_image_zeros(self, image_path_batch=None):
    """
    description: Yield batch_size all-black uint8 frames of shape
                 (input_h, input_w, 3), used as warm-up data.
    param:
        image_path_batch: unused
    """
    yield from (
        np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
        for _ in range(self.batch_size)
    )
def preprocess_image(self, raw_bgr_image):
    """
    description: Convert a BGR image to RGB, resize it with its aspect ratio
                 kept, pad it with gray to the network input size, scale it
                 to [0, 1] and lay it out as NCHW.
    param:
        raw_bgr_image: a BGR image array (H, W, 3); None is replaced by a
                       black image of the network input size
    return:
        image: the processed float32 image, shape (1, 3, input_h, input_w)
        image_raw: the original image
        h: original height
        w: original width
    """
    if raw_bgr_image is None:
        # Substitute a black image
        raw_bgr_image = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)
    image_raw = raw_bgr_image
    h, w, _ = image_raw.shape
    net_w, net_h = self.input_w, self.input_h
    rgb = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
    # Scale factors along each axis; the smaller one limits the resize.
    r_w = net_w / w
    r_h = net_h / h
    if r_h > r_w:
        # Width is the limiting side: pad top and bottom.
        tw, th = net_w, int(r_w * h)
        left = right = 0
        top = int((net_h - th) / 2)
        bottom = net_h - th - top
    else:
        # Height is the limiting side: pad left and right.
        tw, th = int(r_h * w), net_h
        left = int((net_w - tw) / 2)
        right = net_w - tw - left
        top = bottom = 0
    resized = cv2.resize(rgb, (tw, th))
    # Pad the short side with (128,128,128)
    padded = cv2.copyMakeBorder(
        resized, top, bottom, left, right, cv2.BORDER_CONSTANT, None, (128, 128, 128)
    )
    # Normalize to [0,1]
    tensor = padded.astype(np.float32)
    tensor /= 255.0
    # HWC -> CHW -> NCHW, stored contiguously in C order
    tensor = np.ascontiguousarray(
        np.expand_dims(np.transpose(tensor, [2, 0, 1]), axis=0)
    )
    return tensor, image_raw, h, w
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Convert nx4 boxes from network-input [center_x, center_y, w, h]
                 to [x1, y1, x2, y2] in original-image coordinates, removing
                 the letterbox padding and scaling.
    param:
        origin_h: height of original image
        origin_w: width of original image
        x: A boxes numpy, each row is a box [center_x, center_y, w, h]
    return:
        y: A boxes numpy, each row is a box [x1, y1, x2, y2]
    """
    ratio_w = self.input_w / origin_w
    ratio_h = self.input_h / origin_h
    corners = np.zeros_like(x)
    center_x, center_y = x[:, 0], x[:, 1]
    half_w, half_h = x[:, 2] / 2, x[:, 3] / 2
    if ratio_h > ratio_w:
        # Image was padded top/bottom: shift y by the vertical padding.
        pad = (self.input_h - ratio_w * origin_h) / 2
        corners[:, 0] = center_x - half_w
        corners[:, 2] = center_x + half_w
        corners[:, 1] = center_y - half_h - pad
        corners[:, 3] = center_y + half_h - pad
        scale = ratio_w
    else:
        # Image was padded left/right: shift x by the horizontal padding.
        pad = (self.input_w - ratio_h * origin_w) / 2
        corners[:, 0] = center_x - half_w - pad
        corners[:, 2] = center_x + half_w - pad
        corners[:, 1] = center_y - half_h
        corners[:, 3] = center_y + half_h
        scale = ratio_h
    # Back to original-image pixels
    corners /= scale
    return corners
def post_process(self, output_boxes, origin_h, origin_w):
    """
    description: postprocess the raw detection buffer into boxes, scores, class ids and mask coefficients
    param:
        output_boxes: A flat numpy like [num_boxes, cx, cy, w, h, conf, cls_id, mask[32], cx, cy, w, h, conf, cls_id, mask[32] ...]
        origin_h:     height of original image
        origin_w:     width of original image
    return:
        result_boxes:      final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
        result_scores:     final scores, a numpy, each element is the score corresponding to a box
        result_classid:    final class ids, a numpy, each element is the class id corresponding to a box
        result_proto_coef: remaining columns of each kept row (mask coefficients)
        (four empty arrays are returned when nothing is detected or kept)
    """
    def _nothing():
        # Fresh empty arrays for every "no result" exit
        return np.array([]), np.array([]), np.array([]), np.array([])

    # First element holds the number of detected boxes
    detected = int(output_boxes[0])
    if detected == 0:
        return _nothing()
    # Number of complete rows that actually fit in the buffer after the counter
    row_len = self.det_row_output_length
    rows_available = (len(output_boxes) - 1) // row_len
    if rows_available == 0:
        return _nothing()
    # View the payload as a 2-D table and keep only the reported rows
    payload = output_boxes[1:1 + rows_available * row_len]
    pred = np.reshape(payload, (rows_available, row_len))[:detected, :]
    # Confidence filtering + NMS
    kept = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH,
                                    nms_thres=IOU_THRESHOLD)
    if not len(kept):
        return _nothing()
    return kept[:, :4], kept[:, 4], kept[:, 5], kept[:, 6:]
def bbox_iou(self, box1, box2, x1y1x2y2=True):
    """
    description: compute the IoU between boxes, using the inclusive-pixel (+1) convention
    param:
        box1: A 2-D boxes numpy, rows are (x1, y1, x2, y2) or (x, y, w, h)
        box2: A 2-D boxes numpy, rows are (x1, y1, x2, y2) or (x, y, w, h)
        x1y1x2y2: True when rows are corner format, False when they are center format
    return:
        iou: computed iou (broadcast over the rows of box1 and box2)
    """
    def _corners(b):
        # Return (x1, y1, x2, y2) columns regardless of the input format
        if x1y1x2y2:
            return b[:, 0], b[:, 1], b[:, 2], b[:, 3]
        left, right = b[:, 0] - b[:, 2] / 2, b[:, 0] + b[:, 2] / 2
        top, bottom = b[:, 1] - b[:, 3] / 2, b[:, 1] + b[:, 3] / 2
        return left, top, right, bottom

    ax1, ay1, ax2, ay2 = _corners(box1)
    bx1, by1, bx2, by2 = _corners(box2)
    # Extent of the intersection rectangle, clamped at zero when boxes do not overlap
    overlap_w = np.clip(np.minimum(ax2, bx2) - np.maximum(ax1, bx1) + 1, 0, None)
    overlap_h = np.clip(np.minimum(ay2, by2) - np.maximum(ay1, by1) + 1, 0, None)
    inter_area = overlap_w * overlap_h
    area_a = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    area_b = (bx2 - bx1 + 1) * (by2 - by1 + 1)
    # Small epsilon keeps the division defined for a zero union
    return inter_area / (area_a + area_b - inter_area + 1e-16)
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
    """
    description: Removes detections with lower object confidence score than 'conf_thres' and performs
    Non-Maximum Suppression (per class) to further filter detections.
    param:
        prediction: detections, each row is (center_x, center_y, w, h, conf, cls_id, extra columns...)
                    in network-input coordinates; extra columns are carried through unchanged
        origin_h: original image height
        origin_w: original image width
        conf_thres: a confidence threshold to filter detections
        nms_thres: a iou threshold to filter detections
    return:
        boxes: output after nms, each row is (x1, y1, x2, y2, conf, cls_id, extra columns...)
               in original-image coordinates, sorted by descending confidence;
               an empty array when nothing survives
    """
    if len(prediction) == 0:
        return np.array([])
    # Boolean indexing returns a copy, so the caller's array is not modified below
    boxes = prediction[prediction[:, 4] >= conf_thres]
    if len(boxes) == 0:
        return np.array([])
    # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
    boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
    # Clip the coordinates to the original image
    boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, origin_w - 1)
    boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, origin_h - 1)
    # Highest confidence first
    boxes = boxes[np.argsort(-boxes[:, 4])]
    # Perform non-maximum suppression
    keep_boxes = []
    while boxes.shape[0]:
        large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
        label_match = boxes[0, 5] == boxes[:, 5]
        # Boxes with lower confidence, large IoU and the same label as the head box
        invalid = large_overlap & label_match
        # The head box is consumed on every iteration. Relying on its self-IoU
        # exceeding the threshold would loop forever when nms_thres >= 1 or when
        # the box contains NaN coordinates (comparison is then False).
        invalid[0] = True
        keep_boxes.append(boxes[0])
        boxes = boxes[~invalid]
    return np.stack(keep_boxes, 0) if keep_boxes else np.array([])
def sigmoid(self, x):
    """
    description: element-wise logistic function 1 / (1 + exp(-x))
    param:
        x: a scalar or numpy array
    return:
        value(s) in the range [0, 1] with the same shape as x
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def scale_mask(self, mask, ih, iw):
    """
    description: resize a mask to the network input size, cut away the letterbox
                 padding and resize the remaining region to the original image size
    param:
        mask: a single 2-D mask
        ih:   rows of original image
        iw:   cols of original image
    return:
        the mask resized to (ih, iw)
    """
    net_w, net_h = self.input_w, self.input_h
    upsampled = cv2.resize(mask, (net_w, net_h))
    ratio_w = net_w / (iw * 1.0)
    ratio_h = net_h / (ih * 1.0)
    # Locate the un-padded image region inside the network-sized mask
    if ratio_h > ratio_w:
        region_w, region_h = net_w, int(ratio_w * ih)
        left, top = 0, int((net_h - region_h) / 2)
    else:
        region_w, region_h = int(ratio_h * iw), net_h
        left, top = int((net_w - region_w) / 2), 0
    region = upsampled[top:top + region_h, left:left + region_w]
    return cv2.resize(region, (iw, ih))
def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw):
    """
    description: Mask pred by yolov5 instance segmentation
    param:
        output_proto_mask: prototype mask e.g. (32, 160, 160) for 640x640 input
        result_proto_coef: prototype mask coefficients (n, 32), n represents n results
        result_boxes: boxes (n, 4) as [x1, y1, x2, y2] in original-image coordinates
        ih: rows of original image
        iw: cols of original image
    return:
        mask_result: (n, ih, iw), one binary mask per box in the same order as result_boxes.
                     A box that is empty after clamping to the image yields an all-zero mask,
                     so masks stay aligned with boxes/scores/class ids.
                     An empty array is returned when there are no coefficients or boxes.
    """
    if len(result_proto_coef) == 0 or len(result_boxes) == 0:
        return np.array([])
    # Reshape the mask output
    result_proto_masks = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w)
    c, mh, mw = result_proto_masks.shape
    # Linear combination of prototypes per detection, squashed to (0, 1)
    masks = self.sigmoid(
        result_proto_coef @ result_proto_masks.astype(np.float32).reshape(c, -1)
    ).reshape(-1, mh, mw)
    mask_result = []
    for mask, box in zip(masks, result_boxes):
        mask_s = np.zeros((ih, iw))
        # Clamp the box corners so they are valid indices into the image
        x1 = max(0, min(int(box[0]), iw - 1))
        y1 = max(0, min(int(box[1]), ih - 1))
        x2 = max(0, min(int(box[2]), iw - 1))
        y2 = max(0, min(int(box[3]), ih - 1))
        # Only non-degenerate boxes need the (expensive) resize; degenerate ones
        # keep their all-zero mask instead of being dropped from the result.
        if x2 > x1 and y2 > y1:
            crop_mask = self.scale_mask(mask, ih, iw)
            crop = np.where(crop_mask[y1:y2, x1:x2] >= 0.5, 1, 0)
            mask_s[y1:y2, x1:x2] = crop.astype(np.uint8)
        mask_result.append(mask_s)
    return np.array(mask_result) if mask_result else np.array([])
def draw_mask(self, masks, colors_, im_src, alpha=0.5):
    """
    description: Blend colored instance masks into an image, in place
    param:
        masks  : result masks, (n, h, w)
        colors_: one color per mask, (n, 3)
        im_src : original image, modified in place
        alpha  : blend weight of the mask color against the original image
    return:
        no return
    """
    if len(masks) == 0:
        return
    # (n, h, w) -> (h, w, n), binarized through uint8 before going to float
    layers = np.asarray(masks, dtype=np.uint8).transpose(1, 2, 0)
    layers = np.ascontiguousarray(layers).astype(np.float32)
    palette = np.asarray(colors_, dtype=np.float32)
    # 1 where at least one mask covers the pixel, 0 elsewhere
    coverage = np.clip(layers.sum(2, keepdims=True), 0, 1)
    # Per-pixel color: sum of the colors of all covering masks, capped at 255
    tint = np.clip(layers @ palette, 0, 255)
    im_src[:] = tint * alpha + im_src * (1 - coverage * alpha)
def process_video(yolov5_wrapper, video_path, output_path=None, show_video=False):
    """
    description: Process a video file for instance segmentation
    param:
        yolov5_wrapper: YoLov5TRT instance
        video_path: path to input video
        output_path: path to save output video (optional)
        show_video: whether to show video in real-time (press 'q' to stop)
    return:
        no return; progress and a final summary are printed
    """
    # The capture is only needed to read the stream properties; frames are
    # pulled through yolov5_wrapper.get_video_frames(video_path) below.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Cannot open video file {video_path}")
        return
    try:
        source_fps = cap.get(cv2.CAP_PROP_FPS)
        fps = int(source_fps)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        cap.release()
    print(f"Video Info: {width}x{height} @ {fps}fps, Total frames: {total_frames}")
    # Video writer for output; the un-truncated source rate keeps playback
    # speed correct for fractional frame rates such as 29.97.
    out_writer = None
    if output_path:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out_writer = cv2.VideoWriter(output_path, fourcc, source_fps, (width, height))
    frame_count = 0
    total_time = 0
    stop_requested = False
    try:
        # Process video batch by batch
        for batch_frames in yolov5_wrapper.get_video_frames(video_path):
            # An empty batch or a leading None marks the end of the video
            if len(batch_frames) == 0 or batch_frames[0] is None:
                break
            # Perform inference; the wrapper is fed a generator of frames
            processed_frames, inference_time = yolov5_wrapper.infer(
                frame for frame in batch_frames
            )
            total_time += inference_time
            # Display and save frames
            for frame, source_frame in zip(processed_frames, batch_frames):
                if source_frame is None:  # Padding slot of a partial batch
                    continue
                frame_count += 1
                if show_video:
                    cv2.imshow('YOLOv5 TensorRT Inference', frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        # Must also leave the outer batch loop, not just this one
                        stop_requested = True
                        break
                if out_writer is not None:
                    out_writer.write(frame)
                # Print progress
                if frame_count % 10 == 0:
                    avg_fps = frame_count / total_time if total_time > 0 else 0
                    print(f"Processed {frame_count}/{total_frames} frames, "
                          f"FPS: {avg_fps:.2f}")
            if stop_requested:
                break
    finally:
        # Release the writer and windows even when inference raises
        if out_writer is not None:
            out_writer.release()
        if show_video:
            cv2.destroyAllWindows()
    print("\nInference complete!")
    print(f"Total frames processed: {frame_count}")
    print(f"Total time: {total_time:.2f}s")
    if total_time > 0:
        print(f"Average FPS: {frame_count/total_time:.2f}")
class Colors:
    """Fixed 20-color palette; calling the instance maps an index to a color tuple."""

    _HEX_CODES = (
        'FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231',
        '48F90A', '92CC17', '3DDB86', '1A9334', '00D4BB',
        '2C99A8', '00C2FF', '344593', '6473FF', '0018EC',
        '8438FF', '520085', 'CB38FF', 'FF95C8', 'FF37C7',
    )

    def __init__(self):
        # palette holds (r, g, b) tuples; n is the palette size
        self.palette = []
        for code in self._HEX_CODES:
            self.palette.append(self.hex2rgb('#' + code))
        self.n = len(self.palette)

    def __call__(self, i, bgr=False):
        """Return the color for index i (wrapping around the palette), as BGR when bgr is True."""
        red, green, blue = self.palette[int(i) % self.n]
        if bgr:
            return (blue, green, red)
        return (red, green, blue)

    @staticmethod
    def hex2rgb(h):  # rgb order (PIL)
        """Convert a '#RRGGBB' string into an (r, g, b) tuple of ints."""
        return (int(h[1:3], 16), int(h[3:5], 16), int(h[5:7], 16))
if __name__ == "__main__":
    # Locations of the custom plugin library and the serialized engine
    PLUGIN_LIBRARY = "/home/dji/tensorrtx-yolov5-v7.0/yolov5/build_seg_b4/libmyplugins.so"
    engine_file_path = "/home/dji/tensorrtx-yolov5-v7.0/yolov5/build_seg_b4/yolov5s-seg.engine"
    video_path = "/home/dji/video/test_person.mp4"  # path of the input video
    output_path = None  # path of the output video; a falsy value means nothing is saved
    # Load the custom plugin library
    ctypes.CDLL(PLUGIN_LIBRARY)
    # COCO class labels
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane",
        "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird",
        "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack",
        "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat",
        "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
        "wine glass", "cup", "fork", "knife", "spoon",
        "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut",
        "cake", "chair", "couch", "potted plant", "bed",
        "dining table", "toilet", "tv", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven",
        "toaster", "sink", "refrigerator", "book", "clock",
        "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
    ]
    # Create the YoLov5TRT instance from the engine file
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    try:
        print('Batch size is', yolov5_wrapper.batch_size)
        # Ten warm-up inferences on all-zero images before the real run
        print("Warming up...")
        for warm_round in range(1, 11):
            warm_frames, elapsed = yolov5_wrapper.infer(yolov5_wrapper.get_raw_image_zeros())
            print(f'Warm up {warm_round} -> {warm_frames[0].shape}, time->{elapsed * 1000:.2f}ms')
        # Run the video through the model
        process_video(yolov5_wrapper, video_path, output_path, show_video=False)
    finally:
        # Always destroy the instance, even when processing fails
        yolov5_wrapper.destroy()
五、性能测试记录
| 模型 | batch1-推理耗时 | batch1-前处理+推理+后处理+检测框渲染耗时 | batch1-FPS | batch4-推理耗时 | batch4-前处理+推理+后处理+检测框渲染耗时 | batch4-FPS |
|---|---|---|---|---|---|---|
| yolov5s-det | 13.37ms | 29.15ms | 34.3 | 5.44ms | 19.00ms | 52.64 |
| yolov5s-seg | 15.86ms | 30.84ms | 32.4 | 6.61ms | 21.02ms | 47.57 |
总结
本文记录了在大疆妙算3上运行一些简单的目标检测与分割算法的过程,也给出了检测与分割分别在batch1与batch4批处理下的推理耗时、整体耗时以及FPS。
参考文档:
https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v7.0/yolov5
2026年1月15日14:56:11

2569

被折叠的 条评论
为什么被折叠?



