Inference Using the TensorRT Backend

0x01 TensorFlow 2.0

1.1. Convert

Keras HDF5 -> SavedModel (.pb)

# V2 behaviour is disabled by default in Jetpack 4.4.DP.
import tensorflow.compat.v2 as tf
from tensorflow.keras.models import load_model

model = load_model('./model/fer2013_mini_XCEPTION.102-0.66.hdf5')
model.save('./model/tf_savedmodel', save_format='tf')
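Before converting, it can be worth checking which serving signature the SavedModel exposes, since the same key is used again at inference time. A small optional check (not part of the original note):

import tensorflow.compat.v2 as tf

loaded = tf.saved_model.load('./model/tf_savedmodel')
print(list(loaded.signatures.keys()))  # typically ['serving_default']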

SavedModel (.pb) -> TF-TRT SavedModel

from tensorflow.python.compiler.tensorrt import trt_convert as trt

params = trt.DEFAULT_TRT_CONVERSION_PARAMS
# _replace() returns a new namedtuple, so the result has to be assigned back.
params = params._replace(precision_mode=trt.TrtPrecisionMode.INT8)
converter = trt.TrtGraphConverterV2(input_saved_model_dir='./model/tf_savedmodel',
                                    conversion_params=params)
converter.convert()
converter.save('./model/trt_int8')
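Note that for real INT8 precision, TF-TRT normally also expects a calibration input function passed to convert(). A minimal sketch, assuming the 64x64x1 grayscale input of the fer2013 mini_XCEPTION model (the shape and the number of calibration batches are assumptions):

import numpy as np

def calibration_input_fn():
    # Representative data; real images from the training set give better calibration.
    for _ in range(10):
        yield (np.random.random((1, 64, 64, 1)).astype(np.float32),)

converter.convert(calibration_input_fn=calibration_input_fn)
converter.save('./model/trt_int8')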

1.2. Inference

if use_trt:
    saved_model_loaded = tf.saved_model.load('./model/trt_int8', tags=[trt.tag_constants.SERVING])
    graph_func = saved_model_loaded.signatures[trt.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    self.emotion_classifier = graph_func
else:
    self.emotion_classifier = tf.keras.models.load_model('xxx.hdf5', compile=False)
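The two branches are then called slightly differently: the TF-TRT concrete function takes a tensor and returns a dict of output tensors, while the Keras model keeps predict(). A rough sketch, assuming a 64x64x1 input and taking the first output of the signature (shape and output key are assumptions):

import numpy as np
import tensorflow.compat.v2 as tf

face = np.random.random((1, 64, 64, 1)).astype(np.float32)  # assumed input shape
if use_trt:
    outputs = self.emotion_classifier(tf.constant(face))
    probs = list(outputs.values())[0].numpy()  # output key depends on the exported model
else:
    probs = self.emotion_classifier.predict(face)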

1.3. v2 Behaviour

It looks like the current TensorFlow build for JetPack 4.4 was compiled with the --config=v1 flag, since V2 behaviour is disabled by default.
The workaround is:

import tensorflow.compat.v2 as tf
import tensorflow.compat.v2.keras as keras
tf.enable_v2_behavior()
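A quick way to confirm the switch took effect (a minimal, self-contained check):

import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()

print(tf.executing_eagerly())  # True once v2 behaviour is active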

0x02 Others (PyTorch / MXNet / Caffe)

2.1. Convert Model to ONNX

pass
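The original leaves this step blank. For PyTorch, a minimal export sketch could look like the following (the model, input shape, and file name are placeholders, not taken from the original):

import torch
import torchvision

# Placeholder network; swap in the model that should run under TensorRT.
model = torchvision.models.resnet50(pretrained=True).eval()
dummy_input = torch.randn(1, 3, 224, 224)  # assumed NCHW input shape

torch.onnx.export(
    model, dummy_input, 'resnet50.onnx',
    input_names=['input'], output_names=['output'],
    opset_version=11)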

2.2. Build TensorRT Engine from ONNX Model

import tensorrt as trt
import common  # GiB() and the other helpers from section 2.4

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
batch_size = 1

def build_engine_onnx(model_file):
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = common.GiB(1)
        builder.max_batch_size = batch_size
        # Load the ONNX model and parse it in order to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            if not parser.parse(model.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                return None
        return builder.build_cuda_engine(network)

engine = build_engine_onnx('resnet100.onnx')
engine_file_path = './arcface_trt.engine'
with open(engine_file_path, "wb") as f:
    f.write(engine.serialize())
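If parsing fails, it is often worth sanity-checking the exported graph with the onnx package first (an optional step; the file name follows the example above):

import onnx

onnx_model = onnx.load('resnet100.onnx')
onnx.checker.check_model(onnx_model)   # raises if the graph is malformed
print(onnx_model.graph.input[0].name)  # name of the expected input binding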

2.3. Inference from TRT Engine

import numpy as np

# Thin wrapper that keeps the engine, buffers and execution context together.
class TrtInference(object):
    def build(self, engine_file):
        with open(engine_file, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(self.engine)
        self.context = self.engine.create_execution_context()

    def run(self, objects_frame):
        allocate_place = np.prod(objects_frame.shape)
        self.inputs[0].host[:allocate_place] = objects_frame.flatten(order='C').astype(np.float32)
        trt_outputs = do_inference(
            self.context, bindings=self.bindings,
            inputs=self.inputs, outputs=self.outputs, stream=self.stream)
        return trt_outputs
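A rough usage sketch of the wrapper above, assuming the ArcFace engine built in 2.2 and its usual 1x3x112x112 input layout (the shape is an assumption):

model = TrtInference()
model.build('./arcface_trt.engine')
face = np.random.random((1, 3, 112, 112)).astype(np.float32)
embedding = model.run(face)[0]  # flat host buffer; reshape as needed
print(embedding.shape)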

2.4. PyCUDA Helpers (common)

# common.py -- shared TensorRT/PyCUDA helpers.
import pycuda.autoinit  # noqa: F401  (creates the CUDA context)
import pycuda.driver as cuda
import tensorrt as trt

try:
    # Sometimes python2 does not understand FileNotFoundError
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

def GiB(val):
    return val * 1 << 30  # 1 << 10 << 10 << 10, 1024*1024*1024

# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)  # page-locked memory (Direct Memory Access, DMA)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference. (Engines built with an explicit batch dimension, as in 2.2, should call
    # context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) instead.)
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]