Author: 伯恩legacy @ Zhihu
Source: https://zhuanlan.zhihu.com/p/88318324
Editor: 计算机视觉联盟
I. Introduction

TensorRT is NVIDIA's framework for accelerating model inference, that is, for making a trained model run faster at test time. If your model takes 50 ms per image, TensorRT might bring that down to around 10 ms. The exact speed-up cannot be guaranteed, but in practice the gain is substantial. The annoying part of TensorRT is that some operations are not supported, or are supported only partially; when you hit one of those, you either write a plugin yourself or wait for an official update.
The mainstream frameworks for training deep learning models today are TensorFlow, PyTorch, MXNet, Caffe, and so on. This post only covers PyTorch. For TensorFlow, see "TensorRT部署深度学习模型" (https://zhuanlan.zhihu.com/p/84125533), which shows how to deploy with TensorRT in C++. The principle is the same in every case: a TensorFlow pb model has to be converted to a UFF model, a PyTorch pth model has to be converted to ONNX, and an MXNet model also has to be converted to ONNX; a Caffe model needs no conversion at all, because TensorRT can read Caffe models directly.
This tutorial walks through converting a PyTorch model to TensorRT in both Python and C++, so that readers who have just started with TensorRT can get up to speed quickly.
II. Installing TensorRT

Installing TensorRT is not difficult, and the latest version is recommended. Since I work on CentOS, I usually follow this guide:
Guide to installing TensorRT on CentOS: https://tbr8.org/how-to-install-tensorrt-on-centos/
After installation, check that import tensorrt succeeds in a Python session, and compile and run the official sampleMnist example; if both work, the installation is complete.
(Screenshot: tensorrt imported successfully in the Python environment)
(Screenshot: running the official MNIST sample)
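As a quick sanity check of the Python bindings, a minimal script along these lines (a sketch; the exact version string and whether a builder can be created depend on your installation) should run without errors:

# Minimal sanity check for the TensorRT Python bindings (sketch).
import tensorrt as trt

print("TensorRT version:", trt.__version__)    # e.g. 5.1.x
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)    # creating a logger should not raise
builder = trt.Builder(TRT_LOGGER)              # creating a builder exercises the CUDA/driver setup
print("Builder created:", builder is not None)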
III. Converting a PyTorch model to TensorRT in Python

In Python there are two routes from a PyTorch model to TensorRT: one is to convert the PyTorch pt model to ONNX first and then convert the ONNX model to TensorRT; the other is to convert the pt model to TensorRT directly.
First, let's convert the pt model to an ONNX model. This needs the onnx package, which you can install with pip install onnx. Taking ResNet50 as the example, the code is as follows:
import torchvision
import torch
from torch.autograd import Variable
import onnx

print(torch.__version__)

input_name = ['input']
output_name = ['output']
input = Variable(torch.randn(1, 3, 224, 224)).cuda()
model = torchvision.models.resnet50(pretrained=True).cuda()
torch.onnx.export(model, input, 'resnet50.onnx', input_names=input_name, output_names=output_name, verbose=True)
The code above takes the pretrained resnet50 model from torchvision and converts the pt model into resnet50.onnx, declaring 'input' as the ONNX input name and 'output' as the output name, with a 3-channel 224x224 input image. The batch size here is 1, but you could just as well use 3, 4, 5, and so on. Running this script produces a file named resnet50.onnx.
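If you do want a batch size other than 1, the sketch below shows the two usual options; the output file names are just illustrative, the dynamic_axes route needs a reasonably recent PyTorch, and the TensorRT build code later in this post assumes a fixed input shape.

# Sketch: exporting with a different, or dynamic, batch size.
import torch
import torchvision

model = torchvision.models.resnet50(pretrained=True).cuda().eval()

# Fixed batch of 4: just change the first dimension of the dummy input.
dummy = torch.randn(4, 3, 224, 224).cuda()
torch.onnx.export(model, dummy, 'resnet50_b4.onnx',
                  input_names=['input'], output_names=['output'])

# Dynamic batch dimension: mark axis 0 of the input and output as variable.
dummy = torch.randn(1, 3, 224, 224).cuda()
torch.onnx.export(model, dummy, 'resnet50_dynamic.onnx',
                  input_names=['input'], output_names=['output'],
                  dynamic_axes={'input': {0: 'batch'}, 'output': {0: 'batch'}})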
It is a good idea to check the generated ONNX model, with the following code:
test = onnx.load('resnet50.onnx')
onnx.checker.check_model(test)
print("==> Passed")
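Beyond the checker, it can be handy to print a readable summary of the exported graph; a small sketch, still using resnet50.onnx:

# Sketch: print a human-readable summary of the exported graph.
import onnx

model = onnx.load('resnet50.onnx')
print(onnx.helper.printable_graph(model.graph))   # lists the graph inputs, outputs and nodes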
Next, let's compare the outputs of the PyTorch model and the TensorRT engine:
import pycuda.autoinit
import numpy as np
import pycuda.driver as cuda
import tensorrt as trt
import torch
import os
import time
from PIL import Image
import cv2
import torchvision

filename = 'test.jpg'
max_batch_size = 1
onnx_model_path = 'resnet50.onnx'

TRT_LOGGER = trt.Logger()  # This logger is required to build an engine


def get_img_np_nchw(filename):
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_cv = cv2.resize(image_cv, (224, 224))
    miu = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img_np = np.array(image_cv, dtype=float) / 255.
    r = (img_np[:, :, 0] - miu[0]) / std[0]
    g = (img_np[:, :, 1] - miu[1]) / std[1]
    b = (img_np[:, :, 2] - miu[2]) / std[2]
    img_np_t = np.array([r, g, b])
    img_np_nchw = np.expand_dims(img_np_t, axis=0)
    return img_np_nchw


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mom means the cpu memory and device means the GPU memory
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, int8_mode=False, save_engine=False):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""

    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network() as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:

            builder.max_workspace_size = 1 << 30  # Your workspace size
            builder.max_batch_size = max_batch_size
            # pdb.set_trace()
            builder.fp16_mode = fp16_mode  # Default: False
            builder.int8_mode = int8_mode  # Default: False
            if int8_mode:
                # To be updated
                raise NotImplementedError

            # Parse model file
            if not os.path.exists(onnx_file_path):
                quit('ONNX file {} not found'.format(onnx_file_path))

            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
                print('Beginning ONNX file parsing')
                parser.parse(model.read())

            print('Completed parsing of ONNX file')
            print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))

            engine = builder.build_cuda_engine(network)
            print("Completed creating Engine")

            if save_engine:
                with open(engine_file_path, "wb") as f:
                    f.write(engine.serialize())
            return engine

    if os.path.exists(engine_file_path):
        # If a serialized engine exists, load it instead of building a new one.
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine(max_batch_size, save_engine)


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs


img_np_nchw = get_img_np_nchw(filename)
img_np_nchw = img_np_nchw.astype(dtype=np.float32)

# These two modes are dependent on hardwares
fp16_mode = False
int8_mode = False
trt_engine_path = './model_fp16_{}_int8_{}.trt'.format(fp16_mode, int8_mode)
# Build an engine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode)
# Create the context for this engine
context = engine.create_execution_context()
# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # input, output: host; bindings

# Do inference
shape_of_output = (max_batch_size, 1000)
# Load data to the buffer
inputs[0].host = img_np_nchw.reshape(-1)
# inputs[1].host = ... for multiple input
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # numpy data
t2 = time.time()
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)

print('TensorRT ok')

model = torchvision.models.resnet50(pretrained=True).cuda()
resnet_model = model.eval()

input_for_torch = torch.from_numpy(img_np_nchw).cuda()
t3 = time.time()
feat_2 = resnet_model(input_for_torch)
t4 = time.time()
feat_2 = feat_2.cpu().data.numpy()
print('Pytorch ok!')

mse = np.mean((feat - feat_2)**2)
print("Inference time with the TensorRT engine: {}".format(t2 - t1))
print("Inference time with the PyTorch model: {}".format(t4 - t3))
print('MSE Error = {}'.format(mse))

print('All completed!')
The output is as follows:
TensorRT ok
Pytorch ok!
Inference time with the TensorRT engine: 0.0037250518798828125
Inference time with the PyTorch model: 0.3574800491333008
MSE Error = 3.297184357139993e-12
It looks odd that the PyTorch ResNet50 forward pass needs over 350 ms here, but I have not found anything wrong with the code. In any case, the TensorRT inference result is almost identical to the PyTorch forward result. The code comes from https://github.com/RizhaoCai/PyTorch_ONNX_TensorRT.
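A likely reason the PyTorch number is so large is that this single forward pass also pays one-off costs (CUDA context creation, cuDNN kernel selection) and no torch.cuda.synchronize() is issued around the timed call, even though GPU execution is asynchronous. A fairer comparison would warm up first and average over many runs; here is a sketch that reuses resnet_model and input_for_torch from the script above:

# Sketch: fairer PyTorch timing with warm-up and explicit synchronization.
import time
import torch

with torch.no_grad():
    for _ in range(10):                       # warm-up: one-off CUDA/cuDNN costs happen here
        resnet_model(input_for_torch)
    torch.cuda.synchronize()                  # wait until the warm-up kernels have finished

    t_start = time.time()
    for _ in range(100):
        resnet_model(input_for_torch)
    torch.cuda.synchronize()                  # wait for the timed kernels before stopping the clock
    print("Average PyTorch forward time: {:.4f} s".format((time.time() - t_start) / 100))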
Next, let's convert a PyTorch model to TensorRT directly in Python. The reference code comes from NVIDIA-AI-IOT/torch2trt (https://github.com/NVIDIA-AI-IOT/torch2trt). The project is simple, easy to follow, and of high quality, and it is not hard to install; I was able to run it myself without problems. Its core usage is sketched below.
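Going by the torch2trt README, the core usage is only a few lines; the following is a minimal sketch (it assumes torch2trt is installed and a CUDA GPU is available):

# Minimal torch2trt usage, following the project's README (sketch).
import torch
import torchvision
from torch2trt import torch2trt

model = torchvision.models.resnet50(pretrained=True).cuda().eval()
x = torch.randn(1, 3, 224, 224).cuda()       # the example input fixes the engine's input shape

model_trt = torch2trt(model, [x])            # trace the model and build a TensorRT engine

y = model(x)
y_trt = model_trt(x)
print(torch.max(torch.abs(y - y_trt)))       # the difference should be tiny

According to the same README, the converted module can be saved with torch.save(model_trt.state_dict(), ...) and reloaded later through torch2trt's TRTModule.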
For your own PyTorch model, you only need to swap your model in place of the resnet50 above. Note that during conversion you will often see errors like "output tensor has no attribute _trt"; this means your model contains operations that torch2trt has not implemented yet, and you have to implement the missing converter yourself.
IV. Converting a PyTorch model to TensorRT in C++

In C++, we take the sampleOnnxMNIST example from TensorRT 5.1.5.0 as the starting point: read an image with OpenCV, then have TensorRT run doInference and output a (1, 1000) feature vector. The code is shown below; replace the contents of sampleOnnxMNIST with it, compile, and run.
// NOTE: the header names inside angle brackets were stripped when the article was reposted;
// the list below is a reconstruction of what this sample needs.
#include <algorithm>
#include <assert.h>
#include <chrono>
#include <cmath>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <sstream>
#include <sys/stat.h>
#include <time.h>

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "logger.h"
#include "common.h"
#include "image.hpp"

#define DebugP(x) std::cout << "Line" << __LINE__ << " " << #x << "=" << x << std::endl

using namespace nvinfer1;

static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_C = 3;
static const int OUTPUT_SIZE = 1000;

const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";

const std::string gSampleName = "TensorRT.sample_onnx_image";

samplesCommon::Args gArgs;

bool onnxToTRTModel(const std::string& modelFile, // name of the onnx model
                    unsigned int maxBatchSize,    // batch size - NB must be at least as large as the batch we want to run with
                    IHostMemory*& trtModelStream) // output buffer for the TensorRT model
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
    assert(builder != nullptr);
    nvinfer1::INetworkDefinition* network = builder->createNetwork();

    auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());

    //Optional - uncomment below lines to view network layer information
    //config->setPrintLayerInfo(true);
    //parser->reportParsingInfo();

    if (!parser->parseFromFile(locateFile(modelFile, gArgs.dataDirs).c_str(),
                               static_cast<int>(gLogger.getReportableSeverity())))
    {
        gLogError << "Failure while parsing ONNX file" << std::endl;
        return false;
    }

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    //builder->setMaxWorkspaceSize(1 << 20);
    builder->setMaxWorkspaceSize(10 << 20);
    builder->setFp16Mode(gArgs.runInFp16);
    builder->setInt8Mode(gArgs.runInInt8);

    if (gArgs.runInInt8)
    {
        samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
    }

    samplesCommon::enableDLA(builder, gArgs.useDLACore);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    // we can destroy the parser
    parser->destroy();

    // serialize the engine, then close everything down
    trtModelStream = engine->serialize();
    engine->destroy();
    network->destroy();
    builder->destroy();

    return true;
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings()
    // of these, but in this case we know that there is exactly one input and one output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    DebugP(inputIndex);
    DebugP(outputIndex);

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

//!
//! \brief This function prints the help information for running this sample
//!
void printHelpInfo()
{
    std::cout << "Usage: ./sample_onnx_mnist [-h or --help] [-d or --datadir=<path to data directory>] [--useDLACore=<int>]\n";
    std::cout << "--help          Display help information\n";
    std::cout << "--datadir       Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use (data/samples/mnist/, data/mnist/)" << std::endl;
    std::cout << "--useDLACore=N  Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform." << std::endl;
    std::cout << "--int8          Run in Int8 mode.\n";
    std::cout << "--fp16          Run in FP16 mode." << std::endl;
}

int main(int argc, char** argv)
{
    bool argsOK = samplesCommon::parseArgs(gArgs, argc, argv);
    if (gArgs.help)
    {
        printHelpInfo();
        return EXIT_SUCCESS;
    }
    if (!argsOK)
    {
        gLogError << "Invalid arguments" << std::endl;
        printHelpInfo();
        return EXIT_FAILURE;
    }
    if (gArgs.dataDirs.empty())
    {
        gArgs.dataDirs = std::vector<std::string>{"data/samples/mnist/", "data/mnist/"};
    }

    auto sampleTest = gLogger.defineTest(gSampleName, argc, const_cast<const char**>(argv));

    gLogger.reportTestStart(sampleTest);

    // create a TensorRT model from the onnx model and serialize it to a stream
    IHostMemory* trtModelStream{nullptr};

    if (!onnxToTRTModel("resnet50.onnx", 1, trtModelStream))
        gLogger.reportFail(sampleTest);

    assert(trtModelStream != nullptr);
    std::cout << "Successfully parsed ONNX file!!!!" << std::endl;

    std::cout << "Start reading the input image!!!!" << std::endl;

    cv::Mat image = cv::imread(locateFile("test.jpg", gArgs.dataDirs), cv::IMREAD_COLOR);
    if (image.empty())
    {
        std::cout << "The input image is empty!!! Please check....." << std::endl;
        return EXIT_FAILURE; // (reconstructed: the statement that followed was lost in the repost)
    }

    // [Reconstructed: the original pre-processing, which used helpers from "image.hpp", was lost in the
    //  repost. Any BGR->RGB, resized, mean/std-normalised CHW float buffer of size 3*224*224 works here.]
    cv::Mat resized;
    cv::resize(image, resized, cv::Size(INPUT_W, INPUT_H));
    std::vector<float> inputData(INPUT_C * INPUT_H * INPUT_W);
    const float kMean[3] = {0.485f, 0.456f, 0.406f};
    const float kStd[3] = {0.229f, 0.224f, 0.225f};
    for (int c = 0; c < INPUT_C; ++c)
        for (int h = 0; h < INPUT_H; ++h)
            for (int w = 0; w < INPUT_W; ++w)
            {
                // OpenCV stores pixels as BGR, so channel (2 - c) gives RGB order.
                float pixel = resized.at<cv::Vec3b>(h, w)[2 - c] / 255.0f;
                inputData[c * INPUT_H * INPUT_W + h * INPUT_W + w] = (pixel - kMean[c]) / kStd[c];
            }
    float* data = inputData.data();

    // deserialize the engine (the runtime-creation line was also lost and is reconstructed here)
    IRuntime* runtime = createInferRuntime(gLogger.getTRTLogger());
    assert(runtime != nullptr);
    if (gArgs.useDLACore >= 0)
    {
        runtime->setDLACore(gArgs.useDLACore);
    }

    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
    assert(engine != nullptr);
    trtModelStream->destroy();
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);

    float prob[OUTPUT_SIZE];
    typedef std::chrono::high_resolution_clock Time;
    typedef std::chrono::duration<double, std::milli> ms; // (the article is truncated from this point on)

    // [Reconstructed ending: time one doInference() call, report, and release everything.]
    auto t0 = Time::now();
    doInference(*context, data, prob, 1);
    auto t1 = Time::now();
    std::cout << "Inference time with the TensorRT engine: " << ms(t1 - t0).count() << " ms" << std::endl;

    context->destroy();
    engine->destroy();
    runtime->destroy();

    gLogger.reportPass(sampleTest);
    return EXIT_SUCCESS;
}