0x01 TensorFlow 2.0
1.1. Convert
Keras HDF5 -> .pb
# V2 behaviour is disabled by default in JetPack 4.4 DP.
import tensorflow.compat.v2 as tf
from tensorflow.keras.models import load_model

model = load_model('./model/fer2013_mini_XCEPTION.102-0.66.hdf5')
model.save('./model/tf_savedmodel', save_format='tf')
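To confirm the export, the SavedModel can be reloaded and its signatures listed; a quick check with no model-specific assumptions:

loaded = tf.saved_model.load('./model/tf_savedmodel')
print(list(loaded.signatures.keys()))  # typically ['serving_default']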
.pb -> trt.pb
from tensorflow.python.compiler.tensorrt import trt_convert as trt

# _replace() returns a new namedtuple, so the result must be assigned back.
params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(precision_mode=trt.TrtPrecisionMode.INT8)
converter = trt.TrtGraphConverterV2(input_saved_model_dir='./model/tf_savedmodel',
                                    conversion_params=params)
converter.convert()  # for INT8, convert() also expects calibration data; see below
converter.save('./model/trt_int8')
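For INT8 precision, TF-TRT also needs representative inputs for calibration: convert() takes a calibration_input_fn that yields input batches. A minimal sketch, assuming the mini_XCEPTION model takes 64x64x1 grayscale input (adjust the shape to your model; random data stands in here for real calibration images):

import numpy as np

def calibration_input_fn():
    # Yield a few representative batches for the INT8 calibrator.
    for _ in range(8):
        yield (tf.constant(np.random.random((1, 64, 64, 1)).astype(np.float32)),)

converter.convert(calibration_input_fn=calibration_input_fn)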
1.2. Inference
if use_trt:
    saved_model_loaded = tf.saved_model.load('./model/trt_int8', tags=[trt.tag_constants.SERVING])
    graph_func = saved_model_loaded.signatures[trt.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    self.emotion_classifier = graph_func
else:
    self.emotion_classifier = tf.keras.models.load_model('xxx.hdf5', compile=False)
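The loaded signature is a concrete function, so inference is a direct call. A hedged sketch, again assuming a 1x64x64x1 input; the result is a dict keyed by output tensor name, which depends on the exported model:

import numpy as np

face = np.random.random((1, 64, 64, 1)).astype(np.float32)  # placeholder input
result = graph_func(tf.constant(face))
probs = list(result.values())[0].numpy()  # first (here: only) output tensor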
1.3. v2 Behaviour
It looks like the current TensorFlow build for JetPack 4.4 was compiled with the --config=v1 flag, as V2 behaviour seems to be disabled by default. The workaround is:
import tensorflow.compat.v2 as tf
import tensorflow.compat.v2.keras as keras

tf.enable_v2_behavior()
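A quick sanity check, since eager execution is the tell-tale sign of V2 behaviour:

print(tf.executing_eagerly())  # True once enable_v2_behavior() has run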
0x02 Others (PyTorch / MXNet / Caffe)
2.1. Convert model to ONNX
pass
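The original leaves this step as a placeholder. As one illustration, a PyTorch model can be exported with torch.onnx.export; a minimal sketch, where the checkpoint path, the input shape (3x112x112, typical for ArcFace) and the tensor names are assumptions to adapt:

import torch

model = torch.load('./model/arcface.pth')  # hypothetical checkpoint path
model.eval()
dummy = torch.randn(1, 3, 112, 112)  # assumed input shape
torch.onnx.export(model, dummy, 'resnet100.onnx',
                  input_names=['data'], output_names=['fc1'],
                  opset_version=10)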
2.2. Build TensorRT Engine from ONNX Model
import tensorrt as trt
import common  # NVIDIA sample helpers (GiB, allocate_buffers, ...); see section 2.4

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine_onnx(model_file):
    # The ONNX parser requires an explicit-batch network.
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = common.GiB(1)
        # max_batch_size is ignored for explicit-batch networks, so it is omitted here.
        # Load the ONNX model and parse it in order to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            if not parser.parse(model.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                return None
        return builder.build_cuda_engine(network)

engine = build_engine_onnx('resnet100.onnx')
engine_file_path = './arcface_trt.engine'
with open(engine_file_path, "wb") as f:
    f.write(engine.serialize())
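Note that build_cuda_engine and builder.max_workspace_size belong to the TensorRT 6/7 API and were removed in TensorRT 8. A sketch of the equivalent TRT 8 flow, replacing the workspace setting and the build_cuda_engine call inside build_engine_onnx:

# TensorRT 8: build through a builder config instead of builder attributes.
config = builder.create_builder_config()
config.max_workspace_size = common.GiB(1)
serialized_engine = builder.build_serialized_network(network, config)
with open('./arcface_trt.engine', 'wb') as f:
    f.write(serialized_engine)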
2.3. Inference from TRT Engine
import numpy as np
# allocate_buffers and do_inference are the pycuda helpers from section 2.4.

def build(engine_file):
    with open(engine_file, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    context = engine.create_execution_context()
    return context, bindings, inputs, outputs, stream

def run(objects_frame, context, bindings, inputs, outputs, stream):
    # Copy the flattened input frame into the page-locked host buffer.
    allocate_place = np.prod(objects_frame.shape)
    inputs[0].host[:allocate_place] = objects_frame.flatten(order='C').astype(np.float32)
    trt_outputs = do_inference(
        context, bindings=bindings,
        inputs=inputs, outputs=outputs, stream=stream)
    return trt_outputs
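Putting the two together, a usage sketch; the 1x3x112x112 input shape is an assumption matching the ArcFace example above, and random data stands in for a preprocessed face crop:

context, bindings, inputs, outputs, stream = build('./arcface_trt.engine')
frame = np.random.random((1, 3, 112, 112)).astype(np.float32)  # placeholder input
embedding = run(frame, context, bindings, inputs, outputs, stream)[0]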
2.4. pycuda helpers (common.py)
import pycuda.autoinit  # creates a CUDA context on import
import pycuda.driver as cuda
import tensorrt as trt

try:
    # Python 2 does not have FileNotFoundError.
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

def GiB(val):
    return val * 1 << 30  # same as val * 1024 * 1024 * 1024

# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate page-locked host memory (enables DMA) and device memory.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer address to the engine bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
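One caveat: execute_async is part of the implicit-batch API, while the engine from section 2.2 is built with EXPLICIT_BATCH. For explicit-batch engines, NVIDIA's samples use a v2 variant along these lines:

# For engines built from an explicit-batch network.
def do_inference_v2(context, bindings, inputs, outputs, stream):
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    return [out.host for out in outputs]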