Inference using TensorRT Backend.

0x01 TensorFlow 2.0

1.1. Convert

Keras HDF5 -> SavedModel (.pb)

# V2 behaviour is disabled by default in JetPack 4.4 DP (see 1.3 below).
import tensorflow.compat.v2 as tf
from tensorflow.keras.models import load_model

tf.enable_v2_behavior()

model = load_model('./model/fer2013_mini_XCEPTION.102-0.66.hdf5')
model.save('./model/tf_savedmodel', save_format='tf')

SavedModel (.pb) -> TF-TRT optimized SavedModel

from tensorflow.python.compiler.tensorrt import trt_convert as trt

params = trt.DEFAULT_TRT_CONVERSION_PARAMS
# _replace returns a new namedtuple; reassign it, otherwise the precision mode is silently ignored.
params = params._replace(precision_mode=trt.TrtPrecisionMode.INT8)
converter = trt.TrtGraphConverterV2(input_saved_model_dir='./model/tf_savedmodel', conversion_params=params)
converter.convert()
converter.save('./model/trt_int8')
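
One caveat on INT8: TF-TRT typically requires calibration data for this precision mode. Recent TF-TRT versions let convert() take a calibration_input_fn that yields representative input batches. A minimal sketch replacing the last two lines above; the 64x64x1 grayscale shape, batch size, and random data are assumptions for the fer2013 model, and real preprocessed frames should be used in practice:

import numpy as np
import tensorflow as tf

# Hypothetical calibration generator; yield a few real preprocessed frames in practice.
def calibration_input_fn():
    for _ in range(10):
        yield (tf.constant(np.random.rand(8, 64, 64, 1).astype(np.float32)),)

converter.convert(calibration_input_fn=calibration_input_fn)
converter.save('./model/trt_int8')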

1.2. Inference

from tensorflow.python.framework import convert_to_constants

if use_trt:
    saved_model_loaded = tf.saved_model.load('./model/trt_int8', tags=[trt.tag_constants.SERVING])
    graph_func = saved_model_loaded.signatures[trt.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    # Freeze the signature into a ConcreteFunction that can be called directly on an input tensor.
    frozen_func = convert_to_constants.convert_variables_to_constants_v2(graph_func)
    self.emotion_classifier = frozen_func
else:
    self.emotion_classifier = tf.keras.models.load_model('xxx.hdf5', compile=False)
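
A hedged usage sketch for the classifier loaded above: the frozen ConcreteFunction returns a list of output tensors, while the Keras model exposes predict. The (1, 64, 64, 1) grayscale input shape is an assumption for fer2013_mini_XCEPTION:

import numpy as np
import tensorflow as tf

gray_face = np.zeros((1, 64, 64, 1), dtype=np.float32)  # hypothetical preprocessed face crop

if use_trt:
    emotion_prediction = self.emotion_classifier(tf.constant(gray_face))[0].numpy()
else:
    emotion_prediction = self.emotion_classifier.predict(gray_face)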

1.3. v2 Behaviour

It looks like the current TensorFlow build for JetPack 4.4 was compiled with the --config=v1 flag, since V2 behaviour is disabled by default.
The workaround is:

import tensorflow.compat.v2 as tf
import tensorflow.compat.v2.keras as keras
tf.enable_v2_behavior()
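
To confirm the workaround took effect, check that eager execution (the core of V2 behaviour) is actually on:

print(tf.__version__)
print(tf.executing_eagerly())   # True once V2 behaviour is enabled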

0x02 Others (PyTorch / MXNet / Caffe)

2.1. Convert model to ONNX

pass
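
The original leaves this step as a placeholder. For the PyTorch case, a minimal sketch using torch.onnx.export; the torchvision model, input shape, file name, and opset below are illustrative assumptions (MXNet and Caffe have their own ONNX export paths):

import torch
import torchvision

# Hypothetical example model; swap in your own network and checkpoint.
model = torchvision.models.resnet50(pretrained=True).eval()

# The dummy input fixes the (batch, channel, height, width) shape baked into the ONNX graph.
dummy_input = torch.randn(1, 3, 224, 224)

torch.onnx.export(
    model, dummy_input, 'resnet50.onnx',
    input_names=['input'], output_names=['output'],
    opset_version=11)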

2.2. Build TensorRT Engine from ONNX Model

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine_onnx(model_file):
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = common.GiB(1)
        builder.max_batch_size = batch_size   # batch_size is defined by the caller
        # Load the ONNX model and parse it in order to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            parser.parse(model.read())
        return builder.build_cuda_engine(network)

engine = build_engine_onnx('resnet100.onnx')
engine_file_path = './arcface_trt.engine'
with open(engine_file_path, "wb") as f:
    f.write(engine.serialize())
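
Note that builder.max_workspace_size and build_cuda_engine are deprecated in TensorRT 7 (the version JetPack 4.4 ships) and removed in TensorRT 8. A sketch of the same build routed through an IBuilderConfig, with parser errors surfaced instead of ignored; the function name is just illustrative:

def build_engine_onnx_v7(model_file):
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
            builder.create_builder_config() as config, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        config.max_workspace_size = common.GiB(1)
        with open(model_file, 'rb') as model:
            if not parser.parse(model.read()):
                # Print parser errors instead of silently returning a broken engine.
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                return None
        return builder.build_engine(network, config)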

2.3. Inference from TRT Engine

import numpy as np

# Thin wrapper that owns the engine, execution context, and I/O buffers.
class TrtModel(object):
    def build(self, engine_file):
        with open(engine_file, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(self.engine)
        self.context = self.engine.create_execution_context()

    def run(self, objects_frame):
        allocate_place = np.prod(objects_frame.shape)
        self.inputs[0].host[:allocate_place] = objects_frame.flatten(order='C').astype(np.float32)
        trt_outputs = do_inference(
            self.context, bindings=self.bindings,
            inputs=self.inputs, outputs=self.outputs, stream=self.stream)

        return trt_outputs
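
A hedged usage sketch for the wrapper above; the engine path comes from 2.2, while the (1, 3, 112, 112) input shape and random data are assumptions for the arcface model:

model = TrtModel()
model.build('./arcface_trt.engine')

face = np.random.rand(1, 3, 112, 112).astype(np.float32)   # hypothetical preprocessed face crop
embedding = model.run(face)[0]                              # host array of the first output binding
print(embedding.shape)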

2.4. pycuda

import pycuda.autoinit  # creates a CUDA context on import
import pycuda.driver as cuda
import tensorrt as trt

try:
    # Sometimes Python 2 does not understand FileNotFoundError.
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

def GiB(val):
    return val * 1 << 30   # 1 << 10 << 10 << 10, i.e. 1024 * 1024 * 1024

# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)      # page-locked memory for DMA transfers
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
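
Since the network in 2.2 is created with the EXPLICIT_BATCH flag, TensorRT 7+ expects execute_async_v2, which drops the batch_size argument because the batch dimension is already part of the bindings; a sketch of the matching variant:

# Variant of do_inference for explicit-batch engines (TensorRT 7+).
def do_inference_v2(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference; the batch size is baked into the network/bindings.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU and wait for the stream to finish.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    return [out.host for out in outputs]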