Learning TensorRT by Reading Code

Code repository: https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v4.0/yolov5s

TensorRT 7.0 documentation: https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-700/tensorrt-api/c_api/namespacemembers_func.html

CUDA documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/modules.html#modules

What TensorRT does

  • Build phase
    • Model parsing / network construction
    • Graph optimization
    • Node elimination
    • Multi-precision support
    • Kernel/format auto-tuning
    • Plugin import
    • Device memory optimization
  • Runtime phase
    • Runtime environment
    • Serialization / deserialization

Basic TensorRT workflow

Versions

  1. GTX1080 / Ubuntu16.04 / cuda10.0 / cudnn7.6.5 / tensorrt7.0.0 / nvinfer7.0.0 / opencv3.3
  2. Yolov5 v4.0

TensorRTX is built against TensorRT 7.0, whose API differs considerably from the latest release (8.5.2 at the time of writing):
7.0: https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-700/tensorrt-api/c_api/index.html
latest (8.5.2): https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/index.html

yolov5.cpp

First, the macro constants:

#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
#define DEVICE 0 // GPU id
#define NMS_THRESH 0.4
#define CONF_THRESH 0.5
#define BATCH_SIZE 1

Here, USE_FP16 selects the build precision (set USE_INT8, USE_FP16 or USE_FP32); DEVICE is the GPU id, which defaults to 0 on a single-GPU machine; NMS_THRESH is the IoU threshold used by NMS; CONF_THRESH is the confidence threshold for filtering detections; BATCH_SIZE is the batch size the engine is built and run with.

// stuff we know about the network and the input/output blobs
static const int INPUT_H = Yolo::INPUT_H;
static const int INPUT_W = Yolo::INPUT_W;
static const int CLASS_NUM = Yolo::CLASS_NUM;
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
static Logger gLogger;

These const variables define a few constants. INPUT_H and INPUT_W are the input resolution; CLASS_NUM is the number of output classes; OUTPUT_SIZE is the number of floats in the output buffer: MAX_OUTPUT_BBOX_COUNT detections of sizeof(Yolo::Detection) / sizeof(float) floats each, plus one leading float in which the YOLO layer records how many detections it actually wrote.
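As a quick sanity check (a sketch that assumes the defaults shown later in yololayer.h, i.e. MAX_OUTPUT_BBOX_COUNT = 1000 and a Detection made of bbox[4], conf and class_id):

// With the defaults from yololayer.h:
//   sizeof(Yolo::Detection) / sizeof(float) == 6      (bbox[4] + conf + class_id)
//   OUTPUT_SIZE == 1000 * 6 + 1 == 6001 floats per image
// where element 0 of the output is used by the plugin to store the number of boxes written.
static_assert(sizeof(Yolo::Detection) == 6 * sizeof(float), "Detection is 6 floats");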

Let's start from main():

int main(int argc, char** argv) {
    cudaSetDevice(DEVICE); // select the GPU

    std::string wts_name = "";
    std::string engine_name = "";
    float gd = 0.0f, gw = 0.0f; // depth_multiple and width_multiple from yolov5*.yaml
    std::string img_dir;
    // check the command line arguments
    if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir)) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolov5 -s [.wts] [.engine] [s/m/l/x or c gd gw] // serialize model to plan file" << std::endl;
        std::cerr << "./yolov5 -d [.engine] ../samples // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    // i.e. build the network with the native API, then serialize it
    if (!wts_name.empty()) {
        IHostMemory* modelStream{ nullptr };
        APIToModel(BATCH_SIZE, &modelStream, gd, gw, wts_name);
        assert(modelStream != nullptr);
        std::ofstream p(engine_name, std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    }

Next, let's look at APIToModel():

/**
 * @brief create a model using the API directly
 */
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, float& gd, float& gw, std::string& wts_name) {
    // Create builder (the engine builder) and its configuration object
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

IBuilder is a standard TensorRT class that "Builds an engine from a network definition."; it requires #include <NvInfer.h>.

createInferBuilder() is the TensorRT factory function that will "Create an instance of an IBuilder class." (it still exists in 8.5.2, although much of the surrounding builder API has changed since 7.0). Its argument must be an ILogger, the "Application-implemented logging interface for the builder, engine and runtime". In TensorRTX the author derives the Logger class from ILogger (in logging.h), so the Logger instance gLogger is passed in.
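For reference, a minimal ILogger might look like the sketch below (tensorrtx ships a much more complete implementation in logging.h; the name SimpleLogger here is made up):

#include <NvInfer.h>
#include <iostream>

// Minimal TensorRT 7 logger sketch: print warnings and errors, drop everything else.
class SimpleLogger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) override {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

static SimpleLogger gSimpleLogger;
// nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gSimpleLogger);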

builder->createBuilderConfig() will "Create a builder configuration object.", i.e. it only carries engine-building settings and takes no part in defining the network structure.

ICudaEngine is the TensorRT inference engine, described as "An engine for executing inference on a built network, with functionally unsafe features." The author builds it in build_engine() by stacking layers implemented as wrappers around the native API.

Next, build_engine():

ICudaEngine* build_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
INetworkDefinition* network = builder->createNetworkV2(0U);

Its parameters include the Builder and the BuilderConfig, and its first line creates a network object with createNetworkV2(), which will "Create a network definition object." — the network object is what the layer structure is defined on. There is also an older createNetwork(), kept mainly for compatibility with earlier TensorRT versions; the main difference is that createNetworkV2() supports dynamic shapes and explicit batch dimensions.
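Note that tensorrtx passes 0U here, i.e. an implicit-batch network. As a sketch (not part of tensorrtx), an explicit-batch network would be created like this with the TensorRT 7 API:

// Explicit-batch network definition; tensorrtx itself sticks with createNetworkV2(0U).
const uint32_t explicitBatch =
    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
nvinfer1::INetworkDefinition* networkExplicit = builder->createNetworkV2(explicitBatch);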

// Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
/**
* @brief Add an input tensor to the network.
* @param name The name of the tensor.
* @param type The type of the data held in the tensor.
* @param dimensions The dimensions of the tensor.
*/
ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
assert(data);

std::map<std::string, Weights> weightMap = loadWeights(wts_name);
Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

This block defines the input tensor and loads the weights.

The documentation for network->addInput() says:

"The name of the input tensor is used to find the index into the buffer array for an engine built from the network. The volume of the dimensions must be less than 2^30 elements. For networks with an implicit batch dimension, this volume includes the batch dimension with its length set to the maximum batch size. For networks with all explicit dimensions and with wildcard dimensions, the volume is based on the maxima specified by an IOptimizationProfile. Dimensions are normally positive integers. The exception is that in networks with all explicit dimensions, -1 can be used as a wildcard for a dimension to be specified at runtime. Input tensors with such a wildcard must have a corresponding entry in the IOptimizationProfiles indicating the permitted extrema, and the input dimensions must be set by IExecutionContext::setBindingDimensions. Different IExecutionContext instances can have different dimensions. Wildcard dimensions are only supported for EngineCapability::kDEFAULT. They are not supported in safety contexts. DLA does not support wildcard dimensions.

Tensor dimensions are specified independent of format. For example, if a tensor is formatted in 'NHWC' or a vectorized format, the dimensions are still specified in the order {N, C, H, W}. For 2D images with a channel dimension, the last three dimensions are always {C, H, W}. For 3D images with a channel dimension, the last four dimensions are always {C, D, H, W}."
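The wildcard (-1) mechanism is not used here, since tensorrtx builds an implicit-batch network, but as a sketch (reusing the builder, config, INPUT_H and INPUT_W from this file) a dynamic batch dimension would be declared roughly like this:

// Sketch: dynamic batch dimension plus an optimization profile (requires an explicit-batch network).
nvinfer1::ITensor* dynInput = network->addInput("data", nvinfer1::DataType::kFLOAT,
                                                nvinfer1::Dims4{ -1, 3, INPUT_H, INPUT_W });
nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
profile->setDimensions("data", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{ 1, 3, INPUT_H, INPUT_W });
profile->setDimensions("data", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{ 4, 3, INPUT_H, INPUT_W });
profile->setDimensions("data", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{ 8, 3, INPUT_H, INPUT_W });
config->addOptimizationProfile(profile);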

TensorRTX loads the weights from a .wts file; after loading they are organized as a map from layer name (a string) to a Weights object.

Weights is also a standard NVIDIA class, described as "An array of weights used as a layer parameter."
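loadWeights() itself lives in common.hpp and is not shown here. As a rough sketch of what it does (assuming the .wts layout written by the repo's gen_wts.py: a tensor count on the first line, then one line per tensor with its name, element count, and the values as hex-encoded floats):

#include <NvInfer.h>
#include <cstdint>
#include <fstream>
#include <map>
#include <string>

// Sketch of a .wts loader; the real loadWeights() in common.hpp follows the same idea.
std::map<std::string, nvinfer1::Weights> loadWeightsSketch(const std::string& file) {
    std::map<std::string, nvinfer1::Weights> weightMap;
    std::ifstream input(file);
    int32_t count;
    input >> count;                         // number of weight tensors in the file
    while (count--) {
        nvinfer1::Weights wt{ nvinfer1::DataType::kFLOAT, nullptr, 0 };
        std::string name;
        uint32_t size;
        input >> name >> std::dec >> size;  // tensor name and element count
        uint32_t* val = new uint32_t[size]; // must stay alive until the engine is built
        for (uint32_t i = 0; i < size; ++i)
            input >> std::hex >> val[i];    // each value is the bit pattern of a float
        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}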

What follows is the stacking, one after another, of the network layers that the author implemented on top of the native API.

First, each layer is instantiated with its concrete weights. This section mirrors yolov5*.yaml:

/* ------ yolov5 backbone------ */
auto focus0 = focus(network, weightMap, *data, 3, get_width(64, gw), 3, "model.0");
auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(9, gd), true, 1, 0.5, "model.4");
auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, 9, 13, "model.8");

/* ------ yolov5 head ------ */
auto bottleneck_csp9 = C3(network, weightMap, *spp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.9");
auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");

// reinterpret_cast reinterprets the malloc'ed buffer's bit pattern as float*; malloc is the C allocator.
float* deval = reinterpret_cast<float*>(malloc(sizeof(float) * get_width(512, gw) * 2 * 2));
for (int i = 0; i < get_width(512, gw) * 2 * 2; i++) {
deval[i] = 1.0; // all-ones weights: a 2x2 grouped deconvolution with stride 2 acts as 2x nearest-neighbor upsampling (the nn.Upsample in the yaml)
}
Weights deconvwts11{ DataType::kFLOAT, deval, get_width(512, gw) * 2 * 2 };
IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), get_width(512, gw), DimsHW{ 2, 2 }, deconvwts11, emptywts);
deconv11->setStrideNd(DimsHW{ 2, 2 });
deconv11->setNbGroups(get_width(512, gw));
weightMap["deconv11"] = deconvwts11;

ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
auto cat12 = network->addConcatenation(inputTensors12, 2);
auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13");
auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");

Weights deconvwts15{ DataType::kFLOAT, deval, get_width(256, gw) * 2 * 2 };
IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), get_width(256, gw), DimsHW{ 2, 2 }, deconvwts15, emptywts);
deconv15->setStrideNd(DimsHW{ 2, 2 });
deconv15->setNbGroups(get_width(256, gw));
ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
auto cat16 = network->addConcatenation(inputTensors16, 2);

auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17");

// yolo layer 0
IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
auto cat19 = network->addConcatenation(inputTensors19, 2);
auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20");
//yolo layer 1
IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
auto cat22 = network->addConcatenation(inputTensors22, 2);
auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);

auto yolo = addYoLoLayer(network, weightMap, det0, det1, det2);
yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
network->markOutput(*yolo->getOutput(0));

For comparison, this is the YOLOv5 v4.0 network definition from the yaml:

# parameters
nc: 80 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple

# anchors
anchors:
- [10,13, 16,30, 33,23] # P3/8
- [30,61, 62,45, 59,119] # P4/16
- [116,90, 156,198, 373,326] # P5/32

# YOLOv5 backbone
backbone:
# [from, number, module, args]
[[-1, 1, Focus, [64, 3]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 9, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 1, SPP, [1024, [5, 9, 13]]],
[-1, 3, C3, [1024, False]], # 9
]

# YOLOv5 head
head:
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]], # 13

[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 17 (P3/8-small)

[-1, 1, Conv, [256, 3, 2]],
[[-1, 14], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 20 (P4/16-medium)

[-1, 1, Conv, [512, 3, 2]],
[[-1, 10], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [1024, False]], # 23 (P5/32-large)

[[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
]

Taking convBlock as an example, compare the TensorRTX implementation built on the native API with the YOLOv5 PyTorch module.

ILayer* convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) {
Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
int p = ksize / 2;
IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], emptywts);
assert(conv1);
conv1->setStrideNd(DimsHW{ s, s });
conv1->setPaddingNd(DimsHW{ p, p });
conv1->setNbGroups(g);
IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3);

// silu = x * sigmoid
auto sig = network->addActivation(*bn1->getOutput(0), ActivationType::kSIGMOID);
assert(sig);
auto ew = network->addElementWise(*bn1->getOutput(0), *sig->getOutput(0), ElementWiseOperation::kPROD);
assert(ew);
return ew;
}
# Yolov5 v4.0
class Conv(nn.Module):
    # Standard convolution
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        return self.act(self.conv(x))

Comparing the two, it is clear that PyTorch code cannot be translated into TensorRT line by line: many details that the PyTorch framework handles implicitly have to be spelled out. Still, any network implemented in PyTorch can be ported this way, translating one PyTorch module at a time and using existing translations as reference.
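One example of such a hidden detail: this API has no dedicated batch-norm layer, so addBatchNorm2d() (in common.hpp) folds nn.BatchNorm2d into an IScaleLayer. A sketch of the mapping, assuming per-channel arrays gamma, beta, mean, var, a channel count ch and epsilon eps (in the real code the weight buffers are heap-allocated and kept alive in weightMap until the engine is built):

// BatchNorm as a per-channel affine transform y = scale * x + shift, with
//   scale[c] = gamma[c] / sqrt(var[c] + eps)
//   shift[c] = beta[c] - mean[c] * scale[c]
// which is what an IScaleLayer in ScaleMode::kCHANNEL computes.
// (needs <vector> and <cmath>; network and input come from the surrounding build function)
std::vector<float> scale(ch), shift(ch), power(ch, 1.0f);
for (int c = 0; c < ch; ++c) {
    scale[c] = gamma[c] / sqrtf(var[c] + eps);
    shift[c] = beta[c] - mean[c] * scale[c];
}
nvinfer1::Weights scaleWts{ nvinfer1::DataType::kFLOAT, scale.data(), ch };
nvinfer1::Weights shiftWts{ nvinfer1::DataType::kFLOAT, shift.data(), ch };
nvinfer1::Weights powerWts{ nvinfer1::DataType::kFLOAT, power.data(), ch };
nvinfer1::IScaleLayer* bn = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shiftWts, scaleWts, powerWts);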

Note that in common.hpp and yololayer.cu the author implements a custom plugin:

// common.hpp
IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, IConvolutionLayer* det0, IConvolutionLayer* det1, IConvolutionLayer* det2)
{
auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
std::vector<float> anchors_yolo = getAnchors(weightMap);
PluginField pluginMultidata[4];
int NetData[4];
NetData[0] = Yolo::CLASS_NUM;
NetData[1] = Yolo::INPUT_W;
NetData[2] = Yolo::INPUT_H;
NetData[3] = Yolo::MAX_OUTPUT_BBOX_COUNT;
pluginMultidata[0].data = NetData;
pluginMultidata[0].length = 3;
pluginMultidata[0].name = "netdata";
pluginMultidata[0].type = PluginFieldType::kFLOAT32;
int scale[3] = { 8, 16, 32 };
int plugindata[3][8];
std::string names[3];
for (int k = 1; k < 4; k++)
{
plugindata[k - 1][0] = Yolo::INPUT_W / scale[k - 1];
plugindata[k - 1][1] = Yolo::INPUT_H / scale[k - 1];
for (int i = 2; i < 8; i++)
{
plugindata[k - 1][i] = int(anchors_yolo[(k - 1) * 6 + i - 2]);
}
pluginMultidata[k].data = plugindata[k - 1];
pluginMultidata[k].length = 8;
names[k - 1] = "yolodata" + std::to_string(k);
pluginMultidata[k].name = names[k - 1].c_str();
pluginMultidata[k].type = PluginFieldType::kFLOAT32;
}
PluginFieldCollection pluginData;
pluginData.nbFields = 4;
pluginData.fields = pluginMultidata;
IPluginV2 *pluginObj = creator->createPlugin("yololayer", &pluginData);
ITensor* inputTensors_yolo[] = { det2->getOutput(0), det1->getOutput(0), det0->getOutput(0) };
auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj);
return yolo;
}

// yololayer.h
namespace Yolo
{
static constexpr int CHECK_COUNT = 3;
static constexpr float IGNORE_THRESH = 0.1f;
struct YoloKernel
{
int width;
int height;
float anchors[CHECK_COUNT * 2];
};
static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
static constexpr int CLASS_NUM = 80;
static constexpr int INPUT_H = 608;
static constexpr int INPUT_W = 608;

static constexpr int LOCATIONS = 4;
struct alignas(float) Detection {
//center_x center_y w h
float bbox[LOCATIONS];
float conf; // bbox_conf * cls_conf
float class_id;
};
}

namespace nvinfer1
{
class YoloLayerPlugin : public IPluginV2IOExt
{
public:
YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel);
YoloLayerPlugin(const void* data, size_t length);
~YoloLayerPlugin();

int getNbOutputs() const override
{
return 1;
}

Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;

int initialize() override;

virtual void terminate() override {};

virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }

virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;

virtual size_t getSerializationSize() const override;

virtual void serialize(void* buffer) const override;

bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
}

const char* getPluginType() const override;

const char* getPluginVersion() const override;

void destroy() override;

IPluginV2IOExt* clone() const override;

void setPluginNamespace(const char* pluginNamespace) override;

const char* getPluginNamespace() const override;

DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;

bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;

bool canBroadcastInputAcrossBatch(int inputIndex) const override;

void attachToContext(
cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;

void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override;

void detachFromContext() override;

private:
void forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize = 1);
int mThreadCount = 256;
const char* mPluginNamespace;
int mKernelCount;
int mClassCount;
int mYoloV5NetWidth;
int mYoloV5NetHeight;
int mMaxOutObject;
std::vector<Yolo::YoloKernel> mYoloKernel;
void** mAnchor;
};

class YoloPluginCreator : public IPluginCreator
{
public:
YoloPluginCreator();

~YoloPluginCreator() override = default;

const char* getPluginName() const override;

const char* getPluginVersion() const override;

const PluginFieldCollection* getFieldNames() override;

IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override;

IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;

void setPluginNamespace(const char* libNamespace) override
{
mNamespace = libNamespace;
}

const char* getPluginNamespace() const override
{
return mNamespace.c_str();
}

private:
std::string mNamespace;
static PluginFieldCollection mFC;
static std::vector<PluginField> mPluginAttributes;
};
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
};

The plugin is implemented in CUDA. Which part of the model does it cover? It is the detection head's decode step: the YOLO layer that turns the three raw convolution outputs (det0/det1/det2) into boxes, i.e. what the Detect module does in yolov5. Going back to yolov5's common.py for comparison:
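Roughly, the CUDA kernel in yololayer.cu applies the standard YOLOv5 decode below for every grid cell and anchor, and appends any detection whose confidence exceeds Yolo::IGNORE_THRESH to the output buffer (a simplified sketch of the math, not the actual kernel code):

#include <cmath>

// Per-cell decode for anchor (anchor_w, anchor_h) at grid position (col, row);
// p points at the raw conv outputs tx, ty, tw, th, obj, cls0..clsN-1 for this cell/anchor.
inline float sigmoidf(float x) { return 1.0f / (1.0f + std::exp(-x)); }

void decodeCell(const float* p, float stride, float anchor_w, float anchor_h,
                int col, int row, float* box /* cx, cy, w, h */) {
    box[0] = (sigmoidf(p[0]) * 2.0f - 0.5f + col) * stride;    // center x
    box[1] = (sigmoidf(p[1]) * 2.0f - 0.5f + row) * stride;    // center y
    box[2] = std::pow(sigmoidf(p[2]) * 2.0f, 2.0f) * anchor_w; // width
    box[3] = std::pow(sigmoidf(p[3]) * 2.0f, 2.0f) * anchor_h; // height
    // confidence = sigmoid(obj) * max_i sigmoid(cls_i); kept only if it exceeds IGNORE_THRESH
}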

class DetectMultiBackend(nn.Module):
    # YOLOv5 MultiBackend class for python inference on various backends
    def __init__(self, weights='yolov5s.pt', device=torch.device('cpu'), dnn=False, data=None, fp16=False, fuse=True):
        # Usage:
        #   PyTorch: weights = *.pt
        #   TorchScript: *.torchscript
        #   ONNX Runtime: *.onnx
        #   ONNX OpenCV DNN: *.onnx --dnn
        #   OpenVINO: *.xml
        #   CoreML: *.mlmodel
        #   TensorRT: *.engine
        #   TensorFlow SavedModel: *_saved_model
        #   TensorFlow GraphDef: *.pb
        #   TensorFlow Lite: *.tflite
        #   TensorFlow Edge TPU: *_edgetpu.tflite
        #   PaddlePaddle: *_paddle_model
        from models.experimental import attempt_download, attempt_load  # scoped to avoid circular import

        super().__init__()
        w = str(weights[0] if isinstance(weights, list) else weights)
        pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, triton = self._model_type(w)
        fp16 &= pt or jit or onnx or engine  # FP16
        nhwc = coreml or saved_model or pb or tflite or edgetpu  # BHWC formats (vs torch BCWH)
        stride = 32  # default stride
        cuda = torch.cuda.is_available() and device.type != 'cpu'  # use CUDA
        if not (pt or triton):
            w = attempt_download(w)  # download if not local

        if pt:  # PyTorch
            model = attempt_load(weights if isinstance(weights, list) else w, device=device, inplace=True, fuse=fuse)
            stride = max(int(model.stride.max()), 32)  # model stride
            names = model.module.names if hasattr(model, 'module') else model.names  # get class names
            model.half() if fp16 else model.float()
            self.model = model  # explicitly assign for to(), cpu(), cuda(), half()
        ......
        elif engine:  # TensorRT
            LOGGER.info(f'Loading {w} for TensorRT inference...')
            import tensorrt as trt  # https://developer.nvidia.com/nvidia-tensorrt-download
            check_version(trt.__version__, '7.0.0', hard=True)  # require tensorrt>=7.0.0
            if device.type == 'cpu':
                device = torch.device('cuda:0')
            Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
            logger = trt.Logger(trt.Logger.INFO)
            with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
                model = runtime.deserialize_cuda_engine(f.read())
            context = model.create_execution_context()
            bindings = OrderedDict()
            output_names = []
            fp16 = False  # default updated below
            dynamic = False
            for i in range(model.num_bindings):
                name = model.get_binding_name(i)
                dtype = trt.nptype(model.get_binding_dtype(i))
                if model.binding_is_input(i):
                    if -1 in tuple(model.get_binding_shape(i)):  # dynamic
                        dynamic = True
                        context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2]))
                    if dtype == np.float16:
                        fp16 = True
                else:  # output
                    output_names.append(name)
                shape = tuple(context.get_binding_shape(i))
                im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
                bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr()))
            binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
            batch_size = bindings['images'].shape[0]  # if dynamic, this is instead max batch size

I can't quite follow this part; moving on.

    // Build engine
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB
#if defined(USE_FP16)
config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
assert(builder->platformHasFastInt8());
config->setFlag(BuilderFlag::kINT8);
Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME);
config->setInt8Calibrator(calibrator);
#endif

builder and config set a few engine parameters (max batch size, workspace size), and then the compute precision is set to FP16 or INT8 according to the macro; the INT8 path additionally needs a calibrator.

std::cout << "Building engine, please wait for a while..." << std::endl;
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
std::cout << "Build engine successfully!" << std::endl;

Build the engine.

    // Don't need the network any more
network->destroy();

// Release host memory
for (auto& mem : weightMap)
{
free((void*)(mem.second.values));
}

return engine;
}

Cleanup. That completes build_engine().

Back to APIToModel().

    // Serialize the engine
(*modelStream) = engine->serialize();

// Close everything down
engine->destroy();
builder->destroy();
config->destroy();
}

The engine is serialized, then everything is torn down.

Back to main():

// create a model using the API directly and serialize it to a stream
if (!wts_name.empty()) {
IHostMemory* modelStream{ nullptr };
APIToModel(BATCH_SIZE, &modelStream, gd, gw, wts_name);
assert(modelStream != nullptr);
std::ofstream p(engine_name, std::ios::binary);
if (!p) {
std::cerr << "could not open plan output file" << std::endl;
return -1;
}
p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
modelStream->destroy();
return 0;
}

We are still in this block: once the engine has been serialized, it is written out and saved as a .engine file.

So every time the network is built, it is first exported to a .engine file.

// deserialize the .engine and run inference
std::ifstream file(engine_name, std::ios::binary);
if (!file.good()) {
std::cerr << "read " << engine_name << " error!" << std::endl;
return -1;
}
char *trtModelStream = nullptr;
size_t size = 0;
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size);
file.close();

This block reads the .engine file into memory; further down it is deserialized back into an engine. If a .engine file already exists, there is no need to rebuild the network.

std::vector<std::string> file_names;
if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
std::cerr << "read_files_in_dir failed." << std::endl;
return -1;
}

The TensorRTX author expects the input to come from image files. When adapting TensorRTX, you can trace how file_names is used and replace it with whatever data source you need.
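For example, a minimal sketch of feeding camera frames instead of files (the cv::VideoCapture part is plain OpenCV, not tensorrtx; it reuses preprocess_img(), doInference() and the data/prob/buffers/context/stream that are set up below):

// Hypothetical adaptation: read frames from a camera and push them through the same pipeline.
cv::VideoCapture cap(0);
cv::Mat frame;
while (cap.read(frame)) {
    cv::Mat pr_img = preprocess_img(frame, INPUT_W, INPUT_H);  // letterbox resize
    int i = 0;
    for (int row = 0; row < INPUT_H; ++row) {
        uchar* uc_pixel = pr_img.data + row * pr_img.step;
        for (int col = 0; col < INPUT_W; ++col) {
            data[i] = (float)uc_pixel[2] / 255.0f;                          // R plane
            data[i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0f;      // G plane
            data[i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0f;  // B plane
            uc_pixel += 3;
            ++i;
        }
    }
    doInference(*context, stream, buffers, data, prob, 1);  // then run nms() on prob as below
}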

Up to this point we have an engine for inference; next we create the runtime and execution context and allocate the device memory.

// prepare input data ---------------------------
static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
//for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
// data[i] = 1.0;
static float prob[BATCH_SIZE * OUTPUT_SIZE];
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
delete[] trtModelStream;
assert(engine->getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
assert(inputIndex == 0);
assert(outputIndex == 1);
// Create GPU buffers on device
CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
CUDA_CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
// Create stream
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));

createInferRuntime() will "Create an instance of an IRuntime class." (the "logging class" wording in the docs refers to the ILogger argument it takes). The IRuntime is what deserializes the plan; ICudaEngine is the actual engine.

The API-based build ended by serializing the engine, and the bytes read from the .engine file are likewise in serialized form, so deserializeCudaEngine() is needed to turn them back into an ICudaEngine.

IExecutionContext is described as "Context for executing inference using an engine, with functionally unsafe features. Multiple execution contexts may exist for one ICudaEngine instance, allowing the same engine to be used for the execution of multiple batches simultaneously. If the engine supports dynamic shapes, each execution context in concurrent use must use a separate optimization profile." In practice the context holds the per-inference state and activation memory, while the engine holds the weights.

getNbBindings(): a "binding" is simply one of the engine's input or output tensors, and its binding index is the slot it occupies in the buffers[] array of device pointers passed to enqueue(). (The docs add that if the engine was built for K profiles, the first getNbBindings() / K bindings are used by profile 0, the next getNbBindings() / K by profile 1, and so on.) The code therefore looks up the input and output indices by name and allocates device buffers at those slots; intermediate tensors are managed by TensorRT itself, only the inputs and outputs need buffers of an agreed size for exchanging data with the host.

cudaMalloc() and cudaStreamCreate() belong to the CUDA runtime API, not TensorRT. cudaMalloc() "allocates size bytes of linear memory on the device and returns in *devPtr a pointer to the allocated memory. The allocated memory is suitably aligned for any kind of variable. The memory is not cleared. cudaMalloc() returns cudaErrorMemoryAllocation in case of failure." The device is the one selected by cudaSetDevice(DEVICE) on the first line of main(). cudaStreamCreate() "Creates a new asynchronous stream.", on which the host-to-device copies, the inference, and the device-to-host copies will be queued.

At this point all the preparation is done; what follows is the inference itself.

int fcount = 0;
for (int f = 0; f < (int)file_names.size(); f++) {
fcount++;
if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue;
for (int b = 0; b < fcount; b++) {
cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);
if (img.empty()) continue;
cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); // letterbox BGR to RGB
int i = 0;
for (int row = 0; row < INPUT_H; ++row) {
uchar* uc_pixel = pr_img.data + row * pr_img.step;
for (int col = 0; col < INPUT_W; ++col) {
data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;
data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
uc_pixel += 3;
++i;
}
}
}

// Run inference
auto start = std::chrono::system_clock::now();
doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
auto end = std::chrono::system_clock::now();
std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
for (int b = 0; b < fcount; b++) {
auto& res = batch_res[b];
nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH);
}
for (int b = 0; b < fcount; b++) {
auto& res = batch_res[b];
//std::cout << res.size() << std::endl;
cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);
for (size_t j = 0; j < res.size(); j++) {
cv::Rect r = get_rect(img, res[j].bbox);
cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
}
cv::imwrite("_" + file_names[f - fcount + 1 + b], img);
}
fcount = 0;
}

This is essentially the same flow as the PyTorch version: preprocessing, inference, NMS, timing the inference, and finally drawing the boxes on the images. The first part turns each image into input data: preprocess_img() does the letterbox resize to INPUT_W x INPUT_H, and the nested loop converts the BGR, HWC, uchar pixels into an RGB, CHW, float tensor scaled to [0, 1]. This is the main part to modify when adapting TensorRTX to another data source.

The inference entry point is doInference():

void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
}

Essentially it is plain CUDA: copy the input batch to the device asynchronously, enqueue the inference on the stream, copy the output back, and synchronize the stream to wait for the results.
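Note that enqueue() is the implicit-batch API, matching the createNetworkV2(0U) call used to build this network; if you build an explicit-batch engine instead, the equivalent call (same buffers and stream) would be:

// Explicit-batch engines take the batch size from the binding dimensions, not from an argument.
context.enqueueV2(buffers, stream, nullptr);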

// Release stream and buffers
cudaStreamDestroy(stream);
CUDA_CHECK(cudaFree(buffers[inputIndex]));
CUDA_CHECK(cudaFree(buffers[outputIndex]));
// Destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();

Finally the cleanup: destroy the stream, free the device buffers, then destroy the context, engine and runtime.