TensorRT Pitfall Diary | Accelerating Inference by Converting Python/PyTorch Models to ONNX

Editor's note (极市/Jishi)
We previously shared a post-mortem on deploying a PyTorch model converted to ONNX. Today we share a TensorRT pitfall diary recording the problems hit while converting Python/PyTorch models to ONNX.
A quick word on why the PyTorch-to-ONNX conversion matters: after training a deep learning model in PyTorch, deploying it with TensorRT or OpenVINO requires first converting the PyTorch model to ONNX and then performing any follow-up conversion. Whether the target is TensorRT, OpenVINO, or OpenCV/ONNX Runtime, the PyTorch-to-ONNX step is unavoidable. This article walks through the pitfalls encountered during that conversion.

Environment
Ubuntu 16.04
python 3.6
onnx 1.6
pytorch 1.5
pycuda 2019.1.2
torchvision 0.1.8
It is recommended to read the official guide carefully and set up the environment first: https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#import_onnx_python
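A quick sanity check of the installed versions can save debugging time later. A minimal sketch (assuming the packages listed above are already installed):

import torch
import torchvision
import onnx
import tensorrt as trt

# Print the versions actually picked up by Python, to confirm they match the list above.
print("torch:", torch.__version__)
print("torchvision:", torchvision.__version__)
print("onnx:", onnx.__version__)
print("tensorrt:", trt.__version__)
print("CUDA available:", torch.cuda.is_available())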
Steps
1. Convert the PyTorch model to an ONNX model
Here I used a PyTorch model generated from Darknet.
import torch
from torch.autograd import Variable
import onnx

input_name = ['input']
output_name = ['output']
input = Variable(torch.randn(1, 3, 544, 544)).cuda()
model = x.model.cuda()  # x.model is my trained model
# model = torch.load('', map_location="cuda:0")
torch.onnx.export(model, input, 'model.onnx', input_names=input_name, output_names=output_name, verbose=True)
Note: with
# model = x.model.cuda()
# if .cuda() is not added, i.e.
model = x.model
the export fails, because the dummy input lives on the GPU while the model weights stay on the CPU:
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same
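The mismatch can also be resolved the other way around, by keeping both the model and the dummy input on the CPU. A minimal sketch of that alternative (x.model is the author's Darknet-generated model, as above):

import torch

# Assumption: export everything on the CPU so the devices of input and weights match.
model = x.model.cpu().eval()
dummy_input = torch.randn(1, 3, 544, 544)  # same shape as the GPU export above
torch.onnx.export(model, dummy_input, 'model.onnx',
                  input_names=['input'], output_names=['output'], verbose=True)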
2. Check the ONNX model
model = onnx.load("model.onnx")
onnx.checker.check_model(model)
print("==> Passed")
3. Compare TensorRT inference on the ONNX model against the original PyTorch model
import pycuda.autoinit
import numpy as np
import pycuda.driver as cuda
import tensorrt as trt
import torch
import os
import time
from PIL import Image
import cv2
import torchvision

filename = '000000.jpg'
max_batch_size = 1
onnx_model_path = 'yolo.onnx'
TRT_LOGGER = trt.Logger()  # This logger is required to build an engine

def get_img_np_nchw(filename):
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_cv = cv2.resize(image_cv, (1920, 1080))
    miu = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img_np = np.array(image_cv, dtype=float) / 255.
    r = (img_np[:, :, 0] - miu[0]) / std[0]
    g = (img_np[:, :, 1] - miu[1]) / std[1]
    b = (img_np[:, :, 2] - miu[2]) / std[2]
    img_np_t = np.array([r, g, b])
    img_np_nchw = np.expand_dims(img_np_t, axis=0)
    return img_np_nchw
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mem means the CPU memory and device_mem means the GPU memory."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, int8_mode=False, save_engine=False,
               ):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""
    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(EXPLICIT_BATCH) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
            builder.max_workspace_size = 1 << 30  # Your workspace size
            builder.max_batch_size = max_batch_size
            # pdb.set_trace()
            builder.fp16_mode = fp16_mode  # Default: False
            builder.int8_mode = int8_mode  # Default: False
            if int8_mode:
                # To be updated
                raise NotImplementedError
            # Parse model file
            if not os.path.exists(onnx_file_path):
                quit('ONNX file {} not found'.format(onnx_file_path))
            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
                print('Beginning ONNX file parsing')
                # Parse only once: a second model.read() would return an empty string.
                if not parser.parse(model.read()):
                    for error in range(parser.num_errors):
                        print(parser.get_error(error))
                    print("===========Parsing fail!!!!=================")
                else:
                    print('Completed parsing of ONNX file')
            print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
            engine = builder.build_cuda_engine(network)
            print("Completed creating Engine")
            if save_engine:
                with open(engine_file_path, "wb") as f:
                    f.write(engine.serialize())
            return engine

    if os.path.exists(engine_file_path):
        # If a serialized engine exists, load it instead of building a new one.
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine(max_batch_size, save_engine)
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs
img_np_nchw = get_img_np_nchw(filename)
img_np_nchw = img_np_nchw.astype(dtype=np.float32)

# These two modes depend on the hardware
fp16_mode = False
int8_mode = False
trt_engine_path = './model_fp16_{}_int8_{}.trt'.format(fp16_mode, int8_mode)
# Build an engine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode)
# Create the context for this engine
context = engine.create_execution_context()
# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # input, output: host # bindings

# Do inference
shape_of_output = (max_batch_size, 1000)
# Load data to the buffer
inputs[0].host = img_np_nchw.reshape(-1)
# inputs[1].host = ... for multiple input
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # numpy data
t2 = time.time()
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
print('TensorRT ok')

# Replace "model" with your own model; here torchvision's resnet50 is used, which is downloaded from the internet
model = torchvision.models.resnet50(pretrained=True).cuda()
resnet_model = model.eval()
input_for_torch = torch.from_numpy(img_np_nchw).cuda()
t3 = time.time()
feat_2 = resnet_model(input_for_torch)
t4 = time.time()
feat_2 = feat_2.cpu().data.numpy()
print('Pytorch ok!')

mse = np.mean((feat - feat_2)**2)
print("Inference time with the TensorRT engine: {}".format(t2-t1))
print("Inference time with the PyTorch model: {}".format(t4-t3))
print('MSE Error = {}'.format(mse))
print('All completed!')
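One caveat on the timing in the script above: a single timed run can be dominated by one-off costs (first-run kernel selection, cuDNN autotuning, data transfer setup). For a fairer speed comparison it may be better to warm up both paths and average over several runs; a minimal sketch built on the objects already defined above:

n_warmup, n_runs = 10, 100

# Warm up and time the TensorRT path.
for _ in range(n_warmup):
    do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
t0 = time.time()
for _ in range(n_runs):
    do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
print("TensorRT avg: {:.4f} s".format((time.time() - t0) / n_runs))

# Warm up and time the PyTorch path.
with torch.no_grad():
    for _ in range(n_warmup):
        resnet_model(input_for_torch)
    torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(n_runs):
        resnet_model(input_for_torch)
    torch.cuda.synchronize()
    print("PyTorch avg: {:.4f} s".format((time.time() - t0) / n_runs))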
Error:
In node -1 (importModel): INVALID_VALUE: Assertion failed: !_importer_ctx.network()->hasImplicitBatchDimension() && "This version of the ONNX parser only supports TensorRT INetworkDefinitions with an explicit batch dimension. Please ensure the network was created using the EXPLICIT_BATCH NetworkDefinitionCreationFlag."
Fix: create the network with the EXPLICIT_BATCH flag:
    def build_engine(max_batch_size, save_engine):

        EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(EXPLICIT_BATCH) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
Error:
Traceback (most recent call last):
  line 126, in 
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument
Fix:
def get_img_np_nchw(filename):
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_cv = cv2.resize(image_cv, (1920, 1080))

The input image must be resized to the size of the model's input. Change the resize to:
def get_img_np_nchw(filename):
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_cv = cv2.resize(image_cv, (544, 544))
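Hardcoding the size is easy to get wrong. One alternative is to read the expected input size directly from the engine itself; a minimal sketch using the same binding API as the code above, with a hypothetical helper resize_to_engine_input and the assumption that binding 0 is the single explicit-batch input with shape (N, C, H, W):

import cv2

def resize_to_engine_input(image_bgr, engine):
    # Assumption: binding 0 is the only input; its shape is (N, C, H, W).
    n, c, h, w = engine.get_binding_shape(0)
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    return cv2.resize(image_rgb, (int(w), int(h)))  # cv2.resize takes (width, height)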
Error:
line 139, in postprocess_the_outputs
    h_outputs = h_outputs.reshape(*shape_of_output)
ValueError: cannot reshape array of size 5780 into shape (1,1000)
Fix:
# shape_of_output = (max_batch_size, 1000)
# change this to your own model's output shape
shape_of_output = (1, 20, 17, 17)
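As with the input size, the output shape can also be read from the engine instead of being hardcoded; a minimal sketch using the same binding API as the code above:

# Derive shape_of_output from the engine's (first) output binding rather than hardcoding it.
for i in range(engine.num_bindings):
    if not engine.binding_is_input(i):
        shape_of_output = tuple(engine.get_binding_shape(i))  # e.g. (1, 20, 17, 17) here
        break
print("Output shape from engine:", shape_of_output)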
公眾號后臺回復(fù)“transformer”獲取最新Transformer綜述論文下載~

#?CV技術(shù)社群邀請函?#

備注:姓名-學(xué)校/公司-研究方向-城市(如:小極-北大-目標(biāo)檢測-深圳)
即可申請加入極市目標(biāo)檢測/圖像分割/工業(yè)檢測/人臉/醫(yī)學(xué)影像/3D/SLAM/自動駕駛/超分辨率/姿態(tài)估計/ReID/GAN/圖像增強(qiáng)/OCR/視頻理解等技術(shù)交流群
每月大咖直播分享、真實項目需求對接、求職內(nèi)推、算法競賽、干貨資訊匯總、與?10000+來自港科大、北大、清華、中科院、CMU、騰訊、百度等名校名企視覺開發(fā)者互動交流~

