大疆妙算3适配yolov5检测与分割

最新推荐文章于 2026-03-12 09:44:57 发布

原创最新推荐文章于 2026-03-12 09:44:57 发布 · 1.3k 阅读

16 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

收录于

深度学习

Jetson

CLIP-GmP-ViT-L-14编码模型

图像识别

CLIP

`CLIP-GmP-ViT-L-14编码模型` 是一个图文双塔编码模型，适合做图文匹配、零样本分类和跨模态检索演示。本镜像已经完成 Web 部署，打开页面即可上传图片并测试图文表征能力。

文章目录

前言
一、妙算3是什么？
二、环境准备
- 1.查看设备信息
- 2.下载项目源码
三、trt-检测
四、trt-分割
五、性能测试记录
总结

前言

公司最近需要对大疆的妙算3适配目标检测算法进行预研，本文记录下适配的过程。

一、妙算3是什么？

大疆妙算3是大疆（DJI）推出的一款高性能、小型化的机载计算机，它的核心定位是为机器人、无人机及其他智能移动设备提供强大的边缘计算能力，可以理解为专为空中和地面移动平台设计的“超级大脑”。
本质上，妙算3是基于 NVIDIA Orin 开发者套件的一个产品，因此 NVIDIA Jetson 的整套开发方式在妙算3上同样适用。不同的是，Jetson 开放了 root 权限，而妙算3没有开放 root 权限，因此妙算3无法直接通过 apt-get install 安装所需的包与库。

二、环境准备

1.查看设备信息

jtop

在这里插入图片描述

2.下载项目源码

# Check out the v7.0 branch/tag of the ultralytics YOLOv5 repository
git clone -b v7.0 https://github.com/ultralytics/yolov5.git
# Check out the yolov5-v7.0 branch/tag of tensorrtx
git clone -b yolov5-v7.0 https://github.com/wang-xinyu/tensorrtx.git

三、trt-检测

3.1 使用 .pt生成 .wts 文件

cd yolov5/
# Download yolov5s.pt; if wget cannot be used, download the file manually
wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt
cp [PATH-TO-TENSORRTX]/yolov5/gen_wts.py .
python3 gen_wts.py -w yolov5s.pt -o yolov5s.wts
# This produces a 'yolov5s.wts' file; pip-install any missing Python packages as needed

3.2 构建 tensorrtx/yolov5 并运行

在 [PATH-TO-TENSORRTX]/yolov5/src/config.h 中可以修改类别数目 kNumClass 和批处理数 kBatchSize。
将 [PATH-TO-TENSORRTX]/yolov5/ 下的 CMakeLists.txt 替换为下面的内容：

cmake_minimum_required(VERSION 3.10)

project(yolov5)

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
# option() takes (<name> "<help text>" [value]). The original passed OFF as the
# help text and only ended up OFF through the implicit default.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)

# Default to Release for better performance, but keep a build type that was
# requested explicitly with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif()

# Treat any board that exposes a device-tree model string as a Jetson-class device.
if(EXISTS "/proc/device-tree/model")
    file(READ "/proc/device-tree/model" JETSON_MODEL)
    set(IS_JETSON TRUE)
    message(STATUS "Building on Jetson device: ${JETSON_MODEL}")
else()
    set(IS_JETSON FALSE)
endif()

# Locate OpenCV. REQUIRED already aborts configuration when it is missing,
# so no separate "not found" branch is needed.
find_package(OpenCV REQUIRED)
message(STATUS "Found OpenCV: ${OpenCV_VERSION}")
message(STATUS "OpenCV include dir: ${OpenCV_INCLUDE_DIRS}")
message(STATUS "OpenCV libraries: ${OpenCV_LIBS}")

if(IS_JETSON)
    # Use the legacy FindCUDA module on Jetson to avoid enable_language(CUDA) problems.
    find_package(CUDA REQUIRED)
    message(STATUS "Found CUDA: ${CUDA_VERSION}")
    message(STATUS "CUDA include dir: ${CUDA_INCLUDE_DIRS}")
    message(STATUS "CUDA libraries: ${CUDA_LIBRARIES}")

    # CUDA_NVCC_FLAGS is a semicolon-separated list, so append list items.
    list(APPEND CUDA_NVCC_FLAGS -std=c++11)

    # Pick the GPU architecture from the model string. "Orin" must be tested
    # before "Nano": an "Orin Nano" model string contains both words and would
    # otherwise be built for sm_53.
    if(JETSON_MODEL MATCHES "Orin")
        set(JETSON_CUDA_ARCH sm_87)
    elseif(JETSON_MODEL MATCHES "Xavier")
        set(JETSON_CUDA_ARCH sm_72)
    elseif(JETSON_MODEL MATCHES "TX2")
        set(JETSON_CUDA_ARCH sm_62)
    elseif(JETSON_MODEL MATCHES "Nano")
        set(JETSON_CUDA_ARCH sm_53)
    else()
        set(JETSON_CUDA_ARCH sm_53)
        message(WARNING
            "Unrecognized device model '${JETSON_MODEL}'; falling back to ${JETSON_CUDA_ARCH}. "
            "Adjust the architecture selection if this does not match the GPU.")
    endif()
    list(APPEND CUDA_NVCC_FLAGS -arch=${JETSON_CUDA_ARCH})

    # Use the toolkit that FindCUDA located instead of a hard-coded
    # /usr/local/cuda-11.4, so other CUDA versions work unchanged.
    include_directories(${CUDA_INCLUDE_DIRS})
    link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
else()
    # Outside Jetson the .cu sources are compiled through CMake's native CUDA
    # language support, which has to be enabled before the targets are created.
    enable_language(CUDA)
    include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
endif()

include_directories(${PROJECT_SOURCE_DIR}/src/)
include_directories(${PROJECT_SOURCE_DIR}/plugin/)
include_directories(${OpenCV_INCLUDE_DIRS})

file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
file(GLOB_RECURSE PLUGIN_SRCS ${PROJECT_SOURCE_DIR}/plugin/*.cu)

if(IS_JETSON)
    # FindCUDA needs its own wrappers so that the .cu files are handed to nvcc.
    cuda_add_library(myplugins SHARED ${PLUGIN_SRCS})
    cuda_add_executable(yolov5_det ${PROJECT_SOURCE_DIR}/yolov5_det.cpp ${SRCS})
    # Uncomment to also build the segmentation demo:
    #cuda_add_executable(yolov5_seg ${PROJECT_SOURCE_DIR}/yolov5_seg.cpp ${SRCS})
else()
    add_library(myplugins SHARED ${PLUGIN_SRCS})
    add_executable(yolov5_det yolov5_det.cpp ${SRCS})
    # Uncomment to also build the segmentation demo:
    #add_executable(yolov5_seg yolov5_seg.cpp ${SRCS})
endif()

# The plain (keyword-less) signature is kept on purpose: cuda_add_library() and
# cuda_add_executable() link the CUDA libraries with the plain signature by
# default, and CMake forbids mixing plain and keyword signatures on one target.
target_link_libraries(myplugins nvinfer cudart)
target_link_libraries(yolov5_det nvinfer cudart myplugins ${OpenCV_LIBS})
# Uncomment together with the yolov5_seg executable above:
#target_link_libraries(yolov5_seg nvinfer cudart myplugins ${OpenCV_LIBS})

cd [PATH-TO-TENSORRTX]/yolov5/
# -p keeps the step repeatable: it does not fail when build/ already exists
mkdir -p build
cd build
# The .wts file generated in step 3.1 is converted into an engine from inside build/
cp [PATH-TO-ultralytics-yolov5]/yolov5s.wts .
cmake ..
make

会生成yolov5_det可执行文件

# Produces yolov5s.engine from yolov5s.wts; the trailing 's' presumably selects
# the yolov5s model variant (confirm against the tensorrtx README)
./yolov5_det -s yolov5s.wts yolov5s.engine s

会生成yolov5s.engine模型
推理图片

# Back in [PATH-TO-TENSORRTX]/yolov5/: run the Python TensorRT demo.
# The script name needs its .py extension, otherwise python3 cannot find the file.
cd ..
python3 yolov5_det_trt.py

在这里插入图片描述

3.3 tensorrt调用检测模型跑视频文件（支持batch）

"""
An example that uses TensorRT's Python api to make inferences on video.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

# Minimum confidence a detection needs to survive filtering (passed to NMS as conf_thres)
CONF_THRESH = 0.5
# IoU above which a lower-scored box of the same class is suppressed (passed to NMS as nms_thres)
IOU_THRESHOLD = 0.4
# Number of values in one image's slice of the output buffer:
# 1 leading detection count + rows of LEN_ONE_RESULT values (38001 = 1 + 1000 * 38)
LEN_ALL_RESULT = 38001
# Values per detection row; post_process keeps only the first 6 (cx, cy, w, h, conf, class id)
LEN_ONE_RESULT = 38

# Compatibility shim: restore the np.bool alias when the installed NumPy lacks it.
# NOTE(review): tensorrt and pycuda are already imported above, so this only helps
# code that looks up np.bool at call time, not at import time.
import sys
if not hasattr(np, 'bool'):
    np.bool = bool

def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Plots one bounding box on image img,
                 this function comes from YoLov5 project.
    param:
        x:      a box likes [x1,y1,x2,y2]
        img:    a opencv image object, drawn on in place
        color:  color to draw rectangle, such as (0,255,0);
                defaults to green when omitted
        label:  str, optional caption drawn in a filled box above the top-left corner
        line_thickness: int, defaults to a value derived from the image size
    return:
        no return

    """
    # Line/font thickness: scale with the image size unless given explicitly.
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    # Honor a caller-supplied color; the previous code overwrote it with green.
    # "is None" (not "or") so that array-like colors are accepted as well.
    if color is None:
        color = [0, 255, 0]
    c1 = (int(x[0]), int(x[1]))
    c2 = (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if not label:
        return
    tf = max(tl - 1, 1)  # font thickness
    t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
    # Filled background for the caption, sitting on top of the box's top-left corner.
    c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
    cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)
    cv2.putText(
        img,
        label,
        (c1[0], c1[1] - 2),
        0,
        tl / 3,
        [225, 255, 255],
        thickness=tf,
        lineType=cv2.LINE_AA,
    )


class YoLov5TRT(object):
    """
    description: Wraps a serialized TensorRT YOLOv5 engine together with the
                 preprocessing and postprocessing needed for batched detection.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, deserialize the engine and
                     allocate one page-locked host buffer and one device buffer
                     per engine binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        raise:
            RuntimeError when the engine cannot be deserialized
        """
        # PyCUDA makes a newly created context current; destroy() pops it again.
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)

            # Deserialize the engine from file
            with open(engine_file_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            if engine is None:
                # Fail here with a clear message instead of an AttributeError below.
                raise RuntimeError(
                    "Failed to deserialize TensorRT engine: {}".format(engine_file_path)
                )
            context = engine.create_execution_context()

            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []

            for binding in engine:
                shape = engine.get_binding_shape(binding)
                print('bingding:', binding, shape)
                size = trt.volume(shape) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    # The last two dimensions of the input binding are (height, width).
                    self.input_w = shape[-1]
                    self.input_h = shape[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)
        except Exception:
            # Do not leave the context on the stack when construction fails.
            self.ctx.pop()
            raise

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    @staticmethod
    def _class_name(class_id):
        """
        description: Map a class id to a display name. Uses the module-level
                     `categories` list when the script defines one and the id
                     is in range, otherwise falls back to the numeric id.
        """
        names = globals().get("categories")
        if names is not None and 0 <= class_id < len(names):
            return names[class_id]
        return str(class_id)

    def infer(self, raw_image_generator):
        """
        description: Run detection on one batch and draw the results on the frames.
        param:
            raw_image_generator: iterable of BGR images (None entries are replaced
                                 by a black image); at most batch_size items
        return:
            batch_image_raw: list with one annotated image per item consumed
            elapsed:         float, seconds spent in this call (pre/postprocess included)
        """
        start = time.time()

        # Preprocess on the CPU before touching the CUDA context.
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        # Zero-filled so that unused slots of a partial batch hold defined data.
        batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Copy input image to host buffer
            np.copyto(self.host_inputs[0], batch_input_image.ravel())
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
            # Run inference.
            self.context.execute_async(
                batch_size=self.batch_size,
                bindings=self.bindings,
                stream_handle=self.stream.handle,
            )
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
            # Synchronize the stream
            self.stream.synchronize()
        finally:
            # Always deactivate the context, even when inference raised, so the
            # context stack stays balanced.
            self.ctx.pop()

        output = self.host_outputs[0]
        # Postprocess only the images that were actually supplied; a generator
        # that yields fewer than batch_size items must not cause an IndexError.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * LEN_ALL_RESULT: (i + 1) * LEN_ALL_RESULT],
                batch_origin_h[i],
                batch_origin_w[i],
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                plot_one_box(
                    result_boxes[j],
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        self._class_name(int(result_classid[j])), result_scores[j]
                    ),
                )
        end = time.time()
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the context that __init__ left on the context stack.
        """
        self.ctx.pop()

    def get_video_frames(self, video_path, batch_size=None):
        """
        description: Generator that yields lists of batch_size frames read from a
                     video. The last list is padded with None when the video ends
                     mid-batch. When the video cannot be opened, a single None
                     (not a list) is yielded and the generator stops.
        param:
            video_path: str, path of the video file
            batch_size: int, frames per yielded list; defaults to the engine batch size
        """
        if batch_size is None:
            batch_size = self.batch_size

        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                print(f"Error: Cannot open video file {video_path}")
                yield None
                return

            while True:
                frames = []
                for _ in range(batch_size):
                    ret, frame = cap.read()
                    if not ret:
                        if frames:  # Return the last partial batch
                            frames.extend([None] * (batch_size - len(frames)))
                            yield frames
                        return
                    frames.append(frame)
                yield frames
        finally:
            # Also runs when the consumer stops iterating early.
            cap.release()

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup: yields batch_size black images of the
                     engine's input size. image_path_batch is unused.
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: HxWx3 BGR image, or None to use a black image of the
                           engine's input size
        return:
            image:  the processed image, float32 with shape (1, 3, input_h, input_w)
            image_raw: the original image
            h: original height
            w: original width
        """
        if raw_bgr_image is None:
            # Create a black image
            raw_bgr_image = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)

        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: fill the width, pad top and bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: fill the height, pad left and right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] in network-input
                        coordinates to [x1, y1, x2, y2] in original-image
                        coordinates (xy1=top-left, xy2=bottom-right), undoing the
                        padding and scaling applied by preprocess_image.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Padding was added on top and bottom: shift y, then rescale by r_w.
            pad_y = (self.input_h - r_w * origin_h) / 2
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - pad_y
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - pad_y
            y /= r_w
        else:
            # Padding was added left and right: shift x, then rescale by r_h.
            pad_x = (self.input_w - r_h * origin_w) / 2
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - pad_x
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - pad_x
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy for one image: [num_boxes, row, row, ...] where
                        each row holds LEN_ONE_RESULT values and starts with
                        cx, cy, w, h, conf, cls_id (the remaining values are ignored here)
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray and keep only the valid rows
        pred = np.reshape(output[1:], (-1, LEN_ONE_RESULT))[:num, :]
        pred = pred[:, :6]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        if not len(boxes):
            return np.array([]), np.array([]), np.array([])
        return boxes[:, :4], boxes[:, 4], boxes[:, 5]

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (the +1 treats coordinates as inclusive pixel indices)
        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
                     np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon avoids a division by zero
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, each row is (center_x, center_y, w, h, conf, cls_id)
                        in network-input coordinates
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id),
                   or an empty array when nothing is kept
        """
        # Get the boxes that score >= conf_thres (boolean indexing returns a copy,
        # so the in-place edits below do not touch `prediction`)
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs, highest first
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Boxes with lower confidence scores, large IOUs and matching labels
            # (this always includes boxes[0] itself, so the loop terminates)
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


def process_video(yolov5_wrapper, video_path, output_path=None, show_video=False):
    """
    description: Process a video file for object detection
    param:
        yolov5_wrapper: YoLov5TRT instance (needs `batch_size` and `infer`)
        video_path: path to input video
        output_path: path to save output video (optional)
        show_video: whether to show video in real-time; pressing 'q' in the
                    window stops processing
    return:
        no return; progress and a summary are printed
    """
    # Open video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Cannot open video file {video_path}")
        return

    out_writer = None
    frame_count = 0
    total_time = 0
    batch_size = yolov5_wrapper.batch_size

    def read_batches():
        """Yield lists of batch_size frames; the last list is padded with None."""
        while True:
            frames = []
            for _ in range(batch_size):
                ret, frame = cap.read()
                if not ret:
                    if frames:  # Return the last partial batch
                        frames.extend([None] * (batch_size - len(frames)))
                        yield frames
                    return
                frames.append(frame)
            yield frames

    try:
        # Get video properties
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        print(f"Video Info: {width}x{height} @ {fps}fps, Total frames: {total_frames}")

        # Video writer for output
        if output_path:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        stop_requested = False
        for batch_frames in read_batches():
            # Perform inference on the whole batch
            processed_frames, inference_time = yolov5_wrapper.infer(iter(batch_frames))
            total_time += inference_time

            # Display and save frames
            for original, frame in zip(batch_frames, processed_frames):
                if original is None:  # Skip the padding of a partial batch
                    continue

                frame_count += 1

                # Display frame
                if show_video:
                    cv2.imshow('YOLOv5 TensorRT Inference', frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        # Remember the request so the outer loop stops as well;
                        # a bare break would only leave the current batch.
                        stop_requested = True
                        break

                # Save frame to output video
                if out_writer is not None:
                    out_writer.write(frame)

                # Print progress
                if frame_count % 10 == 0 and total_time > 0:
                    print(f"Processed {frame_count}/{total_frames} frames, "
                          f"FPS: {frame_count / total_time:.2f}")

            if stop_requested:
                break
    finally:
        # Clean up even when inference or display raised
        cap.release()
        if out_writer is not None:
            out_writer.release()
        if show_video:
            cv2.destroyAllWindows()

    print(f"\nInference complete!")
    print(f"Total frames processed: {frame_count}")
    print(f"Total time: {total_time:.2f}s")
    # An empty video leaves total_time at 0; avoid dividing by zero in that case.
    if total_time > 0:
        print(f"Average FPS: {frame_count/total_time:.2f}")
    else:
        print("Average FPS: n/a")


if __name__ == "__main__":
    # Defaults are the paths used in the original setup. Each one can be
    # overridden positionally without editing the script:
    #   python3 <script> [plugin.so] [engine] [video] [output.mp4]
    # Leaving out the output path disables saving the annotated video.
    default_args = [
        "/home/dji/tensorrtx-yolov5-v7.0/yolov5/build_b4/libmyplugins.so",
        "/home/dji/tensorrtx-yolov5-v7.0/yolov5/build_b4/yolov5s.engine",
        "/home/dji/video/test_person.mp4",
        None,
    ]
    cli_args = sys.argv[1:len(default_args) + 1]
    PLUGIN_LIBRARY, engine_file_path, video_path, output_path = (
        cli_args + default_args[len(cli_args):]
    )

    # Load the custom plugin library before the engine is deserialized
    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels (must stay a module-level name: YoLov5TRT.infer reads it)
    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
            "hair drier", "toothbrush"]

    # a YoLov5TRT instance
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    try:
        print('Batch size is', yolov5_wrapper.batch_size)

        # Warm up with black frames so the timed video run does not include
        # first-call overhead
        print("Warming up...")
        for _ in range(10):
            batch_image_raw, use_time = yolov5_wrapper.infer(yolov5_wrapper.get_raw_image_zeros())
            print('Warm up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))

        # Process video
        process_video(yolov5_wrapper, video_path, output_path, show_video=False)

    finally:
        # destroy the instance (pops the CUDA context created by the constructor)
        yolov5_wrapper.destroy()

四、trt-分割

4.1 使用 .pt生成 .wts 文件

cd yolov5/
# Download yolov5s-seg.pt; if wget cannot be used, download the file manually
wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s-seg.pt
cp [PATH-TO-TENSORRTX]/yolov5/gen_wts.py .
# Use python3 explicitly, matching the detection step in 3.1
python3 gen_wts.py -w yolov5s-seg.pt -o yolov5s-seg.wts -t seg
# This produces a 'yolov5s-seg.wts' file

4.2 构建 tensorrtx/yolov5 并运行

在 3.2 的 CMakeLists.txt 文件中做如下修改：
在这里插入图片描述

cd [PATH-TO-TENSORRTX]/yolov5/
# -p: build/ already exists after section 3.2, and a plain mkdir would fail
mkdir -p build
cd build
cp [PATH-TO-ultralytics-yolov5]/yolov5s-seg.wts .
cmake ..
make
# Produces yolov5s-seg.engine from yolov5s-seg.wts
./yolov5_seg -s yolov5s-seg.wts yolov5s-seg.engine s

至此，会生成 yolov5s-seg.engine 模型。
推理图片

# Go back from build/ to [PATH-TO-TENSORRTX]/yolov5/ and run the Python segmentation demo
cd ..
python3 yolov5_seg_trt.py

在这里插入图片描述

4.3 tensorrt调用分割模型跑视频文件（支持batch）

"""
An example that uses TensorRT's Python api to make inferences on video for YOLOv5 instance segmentation.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

# Detection confidence threshold
CONF_THRESH = 0.5
# IoU threshold for non-maximum suppression
IOU_THRESHOLD = 0.4

# Compatibility shim: restore the np.bool alias when the installed NumPy lacks it.
# NOTE(review): tensorrt and pycuda are already imported above, so this only helps
# code that looks up np.bool at call time, not at import time.
if not hasattr(np, 'bool'):
    np.bool = bool

def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Plots one bounding box on image img,
                 this function comes from YoLov5 project.
    param:
        x:      a box likes [x1,y1,x2,y2]
        img:    a opencv image object, drawn on in place
        color:  color to draw rectangle, such as (0,255,0);
                defaults to green when omitted
        label:  str, optional caption drawn in a filled box above the top-left corner
        line_thickness: int, defaults to a value derived from the image size
    return:
        no return
    """
    # Line/font thickness: scale with the image size unless given explicitly.
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    # Honor a caller-supplied color; the previous code overwrote it with green.
    # "is None" (not "or") so that array-like colors are accepted as well.
    if color is None:
        color = [0, 255, 0]
    c1 = (int(x[0]), int(x[1]))
    c2 = (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if not label:
        return
    tf = max(tl - 1, 1)  # font thickness
    t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
    # Filled background for the caption, sitting on top of the box's top-left corner.
    c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
    cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)
    cv2.putText(
        img,
        label,
        (c1[0], c1[1] - 2),
        0,
        tl / 3,
        [225, 255, 255],
        thickness=tf,
        lineType=cv2.LINE_AA,
    )


class YoLov5TRT(object):
    """
    description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Load a serialized TensorRT engine and allocate all I/O buffers.
        param:
            engine_file_path: str, path of the serialized engine file to deserialize
        """
        # Create a CUDA context on device 0; infer() pushes/pops it around GPU work
        # and destroy() pops it.
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            # Element count of one sample times the engine's maximum batch size.
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # Network input size is taken from the last two dims of the input binding.
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
        
        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

        # Data length
        # NOTE(review): assumes the engine's first output binding is the detection
        # buffer and the second is the prototype-mask buffer — confirm for your engine.
        self.det_output_total_length = host_outputs[0].shape[0]  # Total length for all batches
        self.mask_output_total_length = host_outputs[1].shape[0]  # Total length for all batches
        
        # Length per batch
        self.det_output_per_batch = self.det_output_total_length // self.batch_size
        self.mask_output_per_batch = self.mask_output_total_length // self.batch_size
        
        # Segmentation parameters
        # Prototype masks are a quarter of the network input size in each dimension.
        self.seg_w = int(self.input_w / 4)
        self.seg_h = int(self.input_h / 4)
        self.seg_c = 32  # Typically 32 for yolov5-seg
        # One detection row = cx, cy, w, h, conf, cls_id followed by seg_c mask coefficients.
        self.det_row_output_length = self.seg_c + 6
        
        # Draw mask
        self.colors_obj = Colors()
        
        print(f"Batch size: {self.batch_size}")
        print(f"Detection output per batch: {self.det_output_per_batch}")
        print(f"Mask output per batch: {self.mask_output_per_batch}")
        print(f"Segmentation params: w={self.seg_w}, h={self.seg_h}, c={self.seg_c}")
        print(f"Detection row output length: {self.det_row_output_length}")

    def infer(self, raw_image_generator):
        #start = time.time()
        threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        
        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        valid_frames_count = 0
        
        for i, image_raw in enumerate(raw_image_generator):
            if image_raw is None:
                # Skip if no frame
                break
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
            valid_frames_count += 1
        
        # If not enough frames, pad with zeros
        for i in range(valid_frames_count, self.batch_size):
            # Create a black image
            black_image = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(black_image)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        
        # Here we use the first row of output in that batch_size = 1
        output_bbox = host_outputs[0]
        output_proto_mask = host_outputs[1]
        
        # Do postprocess for each valid frame
        processed_frames = []
        for i in range(self.batch_size):
            if i >= valid_frames_count:
                # Skip padded frames
                processed_frames.append(batch_image_raw[i])
                continue
                
            # Get the output for this specific batch
            bbox_start = i * self.det_output_per_batch
            bbox_end = (i + 1) * self.det_output_per_batch
            mask_start = i * self.mask_output_per_batch
            mask_end = (i + 1) * self.mask_output_per_batch
            
            frame_output_bbox = output_bbox[bbox_start:bbox_end]
            frame_output_proto_mask = output_proto_mask[mask_start:mask_end]
            
            result_boxes, result_scores, result_classid, result_proto_coef = self.post_process(
                frame_output_bbox, batch_origin_h[i], batch_origin_w[i]
            )
            
            if result_proto_coef.shape[0] > 0:
                #result_masks = self.process_mask(frame_output_proto_mask, result_proto_coef, 
                #                                result_boxes, batch_origin_h[i], batch_origin_w[i])
                # Draw masks on the original image
                #self.draw_mask(result_masks, colors_=[self.colors_obj(x, True) for x in result_classid], 
                #              im_src=batch_image_raw[i])

                # Draw rectangles and labels on the original image
                for j in range(len(result_boxes)):
                    box = result_boxes[j]
                    plot_one_box(
                        box,
                        batch_image_raw[i],
                        label="{}:{:.2f}".format(
                            categories[int(result_classid[j])], result_scores[j]
                        ),
                    )
            
            processed_frames.append(batch_image_raw[i])
        
        #end = time.time()
        return processed_frames, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_video_frames(self, video_path, batch_size=None):
        """
        description: Generator to get frames from video
        """
        if batch_size is None:
            batch_size = self.batch_size
        
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"Error: Cannot open video file {video_path}")
            yield None
            return
            
        while True:
            frames = []
            for _ in range(batch_size):
                ret, frame = cap.read()
                if not ret:
                    if frames:  # Return the last partial batch
                        while len(frames) < batch_size:
                            frames.append(None)
                        yield frames
                    cap.release()
                    return
                frames.append(frame)
            yield frames

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            input_image_path: str, image path
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        if raw_bgr_image is None:
            # Create a black image
            raw_bgr_image = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)
            
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        return y

    def post_process(self, output_boxes, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A numpy likes [num_boxes, cx, cy, w, h, conf, cls_id, mask[32], cx, cy, w, h, conf, cls_id, mask[32] ...] 
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a numpy, each element is the score correspoing to box
            result_classid: finally classid, a numpy, each element is the classid correspoing to box
        """
        # Get the num of boxes detected
        num = int(output_boxes[0])
        if num == 0:
            return np.array([]), np.array([]), np.array([]), np.array([])
        
        # Calculate the actual data length
        data_length = len(output_boxes) - 1
        # Calculate number of rows
        num_rows = data_length // self.det_row_output_length
        
        if num_rows == 0:
            return np.array([]), np.array([]), np.array([]), np.array([])
            
        # Reshape to a two dimentional ndarray
        pred = np.reshape(output_boxes[1:1+num_rows*self.det_row_output_length], 
                         (num_rows, self.det_row_output_length))[:num, :]
        
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH,
                                         nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        result_proto_coef = boxes[:, 6:] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid, result_proto_coef

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))            
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area
        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
                     np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id, mask coefficients[32])
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        if len(prediction) == 0:
            return np.array([])
            
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        if len(boxes) == 0:
            return np.array([])
            
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def scale_mask(self, mask, ih, iw):
        mask = cv2.resize(mask, (self.input_w, self.input_h))
        r_w = self.input_w / (iw * 1.0)
        r_h = self.input_h / (ih * 1.0)
        if r_h > r_w:
            w = self.input_w
            h = int(r_w * ih)
            x = 0
            y = int((self.input_h - h) / 2)
        else:
            w = int(r_h * iw)
            h = self.input_h
            x = int((self.input_w - w) / 2)
            y = 0
        crop = mask[y:y+h, x:x+w]
        crop = cv2.resize(crop, (iw, ih))
        return crop

    def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw):
        """
        description: Mask pred by yolov5 instance segmentation ,
        param: 
            output_proto_mask: prototype mask e.g. (32, 160, 160) for 640x640 input
            result_proto_coef: prototype mask coefficients (n, 32), n represents n results
            result_boxes     :  
            ih: rows of original image
            iw: cols of original image
        return:
            mask_result: (n, ih, iw)
        """
        if len(result_proto_coef) == 0 or len(result_boxes) == 0:
            return np.array([])
            
        # Reshape the mask output
        result_proto_masks = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w)
        c, mh, mw = result_proto_masks.shape
        
        # Calculate masks
        masks = self.sigmoid((result_proto_coef @ result_proto_masks.astype(np.float32).reshape(c, -1))).reshape(-1, mh, mw)
        
        mask_result = []
        for mask, box in zip(masks, result_boxes):
            mask_s = np.zeros((ih, iw))
            crop_mask = self.scale_mask(mask, ih, iw)            
            x1 = int(box[0])
            y1 = int(box[1])
            x2 = int(box[2])
            y2 = int(box[3])
            
            # Ensure coordinates are within image bounds
            x1 = max(0, min(x1, iw-1))
            y1 = max(0, min(y1, ih-1))
            x2 = max(0, min(x2, iw-1))
            y2 = max(0, min(y2, ih-1))
            
            if x2 <= x1 or y2 <= y1:
                continue
                
            crop = crop_mask[y1:y2, x1:x2]
            crop = np.where(crop >= 0.5, 1, 0)
            crop = crop.astype(np.uint8)
            mask_s[y1:y2, x1:x2] = crop
            mask_result.append(mask_s)
        
        return np.array(mask_result) if mask_result else np.array([])

    def draw_mask(self, masks, colors_, im_src, alpha=0.5):
        """
        description: Draw mask on image ,
        param: 
            masks  : result_mask
            colors_: color to draw mask
            im_src : original image
            alpha  : scale between original  image and mask
        return:
            no return
        """
        if len(masks) == 0:
            return
        masks = np.asarray(masks, dtype=np.uint8)
        masks = np.ascontiguousarray(masks.transpose(1, 2, 0))
        masks = np.asarray(masks, dtype=np.float32)
        colors_ = np.asarray(colors_, dtype=np.float32)
        s = masks.sum(2, keepdims=True).clip(0, 1)
        masks = (masks @ colors_).clip(0, 255)
        im_src[:] = masks * alpha + im_src * (1 - s * alpha)


def process_video(yolov5_wrapper, video_path, output_path=None, show_video=False):
    """
    description: Run batched inference over a whole video, optionally displaying and/or
                 saving the annotated frames, and print throughput statistics.
    param:
        yolov5_wrapper: YoLov5TRT instance (needs get_video_frames() and infer())
        video_path: path to input video
        output_path: path to save output video (optional)
        show_video: whether to show video in real-time; pressing 'q' stops processing
    return:
        no return
    """
    # This capture is only used to read the metadata; frames are read by
    # yolov5_wrapper.get_video_frames, so the handle is released right away.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Cannot open video file {video_path}")
        return
    try:
        source_fps = cap.get(cv2.CAP_PROP_FPS)
        fps = int(source_fps)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        cap.release()

    print(f"Video Info: {width}x{height} @ {fps}fps, Total frames: {total_frames}")

    out_writer = None
    frame_count = 0
    total_time = 0
    stop_requested = False

    try:
        # Video writer for output
        if output_path:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            # Use the untruncated rate (e.g. 29.97) so the output keeps the source speed.
            out_writer = cv2.VideoWriter(output_path, fourcc, source_fps, (width, height))

        # Process the video batch by batch
        for batch_frames in yolov5_wrapper.get_video_frames(video_path):
            if batch_frames is None or batch_frames[0] is None:  # End of video
                break

            # Perform inference
            processed_frames, inference_time = yolov5_wrapper.infer(iter(batch_frames))
            total_time += inference_time

            # Display and save frames
            for i, frame in enumerate(processed_frames):
                if batch_frames[i] is None:  # Skip padding of the last partial batch
                    continue

                frame_count += 1

                # Display frame
                if show_video:
                    cv2.imshow('YOLOv5 TensorRT Inference', frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        # Remember the request: a bare break would only leave the
                        # inner loop and processing would continue with the next batch.
                        stop_requested = True
                        break

                # Save frame to output video
                if out_writer:
                    out_writer.write(frame)

                # Print progress
                if frame_count % 10 == 0:
                    avg_fps = frame_count / total_time if total_time > 0 else 0
                    print(f"Processed {frame_count}/{total_frames} frames, "
                          f"FPS: {avg_fps:.2f}")

            if stop_requested:
                break
    finally:
        # Clean up even when inference raises, so the output file is finalised
        if out_writer:
            out_writer.release()
        if show_video:
            cv2.destroyAllWindows()

    print(f"\nInference complete!")
    print(f"Total frames processed: {frame_count}")
    print(f"Total time: {total_time:.2f}s")
    if total_time > 0:
        print(f"Average FPS: {frame_count/total_time:.2f}")


class Colors:
    """Fixed 20-entry colour palette; calling an instance maps an index to a colour."""

    def __init__(self):
        palette_hex = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A',
                       '92CC17', '3DDB86', '1A9334', '00D4BB', '2C99A8', '00C2FF',
                       '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF',
                       'FF95C8', 'FF37C7')
        self.palette = []
        for code in palette_hex:
            self.palette.append(self.hex2rgb('#' + code))
        self.n = len(self.palette)

    def __call__(self, i, bgr=False):
        """Return the colour for index i (wrapping around), as RGB or, if bgr, as BGR."""
        rgb = self.palette[int(i) % self.n]
        if bgr:
            return rgb[::-1]
        return rgb

    @staticmethod
    def hex2rgb(h):  # rgb order (PIL)
        """Convert a '#RRGGBB' string into an (r, g, b) tuple of ints."""
        channels = []
        for start in (1, 3, 5):
            channels.append(int(h[start:start + 2], 16))
        return tuple(channels)


# Script entry point: load the plugin and engine, warm up, then process one video.
if __name__ == "__main__":
    # load custom plugin and engine
    # NOTE(review): these are absolute paths on the author's device — adjust them
    # to your own build directory and video location.
    PLUGIN_LIBRARY = "/home/dji/tensorrtx-yolov5-v7.0/yolov5/build_seg_b4/libmyplugins.so"
    engine_file_path = "/home/dji/tensorrtx-yolov5-v7.0/yolov5/build_seg_b4/yolov5s-seg.engine"
    video_path = "/home/dji/video/test_person.mp4"  # path to your video
    output_path = None  # output video path (None: do not save)

    # Load the plugin library before the engine is created below.
    # NOTE(review): presumably this registers the custom layers the engine needs
    # in order to deserialize — confirm.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels
    # `categories` is read as a module-level global by YoLov5TRT.infer when it
    # formats box labels, so it must be defined before inference starts.
    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
            "hair drier", "toothbrush"]

    # a YoLov5TRT instance
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    try:
        print('Batch size is', yolov5_wrapper.batch_size)
        
        # Warm up
        # Run a few inferences on black images so the timed video run below does
        # not include first-call overhead.
        print("Warming up...")
        for i in range(10):
            batch_image_raw, use_time = yolov5_wrapper.infer(yolov5_wrapper.get_raw_image_zeros())
            print(f'Warm up {i+1} -> {batch_image_raw[0].shape}, time->{use_time * 1000:.2f}ms')
        
        # Process video
        process_video(yolov5_wrapper, video_path, output_path, show_video=False)
        
    finally:
        # destroy the instance (pops the CUDA context even if processing failed)
        yolov5_wrapper.destroy()