-
Notifications
You must be signed in to change notification settings - Fork 2
/
tune_relay_arm.py
496 lines (439 loc) · 19.5 KB
/
tune_relay_arm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
.. _tune_relay_arm:
Auto-tuning a Convolutional Network for ARM CPU
===============================================
**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, `Zhao Wu <https://github.com/FrozenGene>`_, `Eddie Yan <https://github.com/eqy>`_
Auto-tuning for a specific ARM device is critical for getting the best
performance. This is a tutorial about how to tune a whole convolutional
network.
The operator implementation for ARM CPU in TVM is written in template form.
The template has many tunable knobs (tile factor, vectorization, unrolling, etc).
We will tune all convolution and depthwise convolution operators
in the neural network. After tuning, we produce a log file which stores
the best knob values for all required operators. When the TVM compiler compiles
these operators, it will query this log file to get the best knob values.
We also released pre-tuned parameters for some arm devices. You can go to
`ARM CPU Benchmark <https://github.com/apache/tvm/wiki/Benchmark#arm-cpu>`_
to see the results.
Note that this tutorial will not run on Windows or recent versions of macOS. To
get it to run, you will need to wrap the body of this tutorial in a :code:`if
__name__ == "__main__":` block.
"""
######################################################################
# Install dependencies
# --------------------
# To use the autotvm package in tvm, we need to install some extra dependencies.
# (change "3" to "2" if you use python2):
#
# .. code-block:: bash
#
# pip3 install --user psutil xgboost tornado cloudpickle
#
# To make TVM run faster during tuning, it is recommended to use cython
# as FFI of TVM. In the root directory of TVM, execute
# (change "3" to "2" if you use python2):
#
# .. code-block:: bash
#
# pip3 install --user cython
# sudo make cython3
#
# Now return to python code. Import packages.
import os
import numpy as np
import tvm
from tvm import relay, autotvm
import tvm.relay.testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.utils import tempdir
import tvm.contrib.graph_executor as runtime
from util import load_test_image, download_model_zoo,parse_options, get_device_arch, get_device_attributes, get_device_type, get_tvm_target
#################################################################
# Define network
# --------------
# First we need to define the network in relay frontend API.
# We can load some pre-defined network from :code:`relay.testing`.
# We can also load models from MXNet, ONNX and TensorFlow.
def get_network(name, batch_size):
"""Get the symbol definition and random weight of a network"""
input_shape = (batch_size, 3, 224, 224)
output_shape = (batch_size, 1000)
if "resnet" in name:
n_layer = int(name.split("-")[1])
mod, params = relay.testing.resnet.get_workload(
num_layers=n_layer, batch_size=batch_size, dtype=dtype
)
elif "vgg" in name:
n_layer = int(name.split("-")[1])
mod, params = relay.testing.vgg.get_workload(
num_layers=n_layer, batch_size=batch_size, dtype=dtype
)
elif name =="inception-v3f":
model_dir = '/inception_v3_2018_04_27/'
model_name ='inception_v3.tflite'
input_tensor = "input"
input_sshape = (1, 299, 299, 3)
model_dir = download_model_zoo(model_dir, model_name)
tflite_model_file = os.path.join(model_dir, model_name)
tflite_model_buf = open(tflite_model_file, "rb").read()
# Get TFLite model from buffer
try:
import tflite
tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
except AttributeError:
import tflite.Model
tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
mod, params = relay.frontend.from_tflite(tflite_model,
shape_dict={input_tensor: input_sshape},
dtype_dict={input_tensor: "float32"})
desired_layouts = {'nn.conv2d': ['NCHW', 'default']}
seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),relay.transform.ConvertLayout(desired_layouts)])
with tvm.transform.PassContext(opt_level=3):
mod = seq(mod)
elif name == "mobilenet-v1f":
model_dir ="/mobilenet_v1_0.5_128/"
model_name = "mobilenet_v1_0.5_128.tflite"
input_tensor = "input"
input_sshape = (1, 128, 128, 3)
model_dir = download_model_zoo(model_dir, model_name)
tflite_model_file = os.path.join(model_dir, model_name)
tflite_model_buf = open(tflite_model_file, "rb").read()
# Get TFLite model from buffer
try:
import tflite
tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
except AttributeError:
import tflite.Model
tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
#mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size)
mod, params = relay.frontend.from_tflite(tflite_model,
shape_dict={input_tensor: input_sshape},
dtype_dict={input_tensor: "float32"})
desired_layouts = {'nn.conv2d': ['NCHW', 'default']}
seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),relay.transform.ConvertLayout(desired_layouts)])
with tvm.transform.PassContext(opt_level=3):
mod = seq(mod)
elif name == "mobilenet-v2f":
model_dir ="/mobilenet_v2_1.0_224/"
model_name = "mobilenet_v2_1.0_224.tflite"
input_tensor = "input"
input_sshape = (1, 224, 224, 3)
model_dir = download_model_zoo(model_dir, model_name)
tflite_model_file = os.path.join(model_dir, model_name)
tflite_model_buf = open(tflite_model_file, "rb").read()
# Get TFLite model from buffer
try:
import tflite
tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
except AttributeError:
import tflite.Model
tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
#mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size)
mod, params = relay.frontend.from_tflite(tflite_model,
shape_dict={input_tensor: input_sshape},
dtype_dict={input_tensor: "float32"})
desired_layouts = {'nn.conv2d': ['NCHW', 'default']}
seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),relay.transform.ConvertLayout(desired_layouts)])
with tvm.transform.PassContext(opt_level=3):
mod = seq(mod)
elif name == "squeezenet_v1.1":
mod, params = relay.testing.squeezenet.get_workload(
batch_size=batch_size, version="1.1", dtype=dtype
)
elif name == "inception_v3":
input_shape = (batch_size, 3, 299, 299)
mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
elif name == "mxnet":
# an example for mxnet model
from mxnet.gluon.model_zoo.vision import get_model
block = get_model("resnet18_v1", pretrained=True)
mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
net = mod["main"]
net = relay.Function(
net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
)
mod = tvm.IRModule.from_expr(net)
else:
raise ValueError("Unsupported network: " + name)
return mod, params, input_shape, output_shape
#################################################################
# Start RPC Tracker
# -----------------
# TVM uses RPC session to communicate with ARM boards.
# During tuning, the tuner will send the generated code to the board and
# measure the speed of code on the board.
#
# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices.
# The RPC Tracker is a centralized controller node. We can register all devices to
# the tracker. For example, if we have 10 phones, we can register all of them
# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
#
# To start an RPC tracker, run this command on the host machine. The tracker is
# required during the whole tuning process, so we need to open a new terminal for
# this command:
#
# .. code-block:: bash
#
# python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
#
# The expected output is
#
# .. code-block:: bash
#
# INFO:RPCTracker:bind to 0.0.0.0:9190
#################################################################
# Register Devices to RPC Tracker
# -----------------------------------
# Now we can register our devices to the tracker. The first step is to
# build the TVM runtime for the ARM devices.
#
# * For Linux:
# Follow this section :ref:`build-tvm-runtime-on-device` to build
# the TVM runtime on the device. Then register the device to tracker by
#
# .. code-block:: bash
#
# python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rk3399
#
# (replace :code:`[HOST_IP]` with the IP address of your host machine)
#
# * For Android:
# Follow this `readme page <https://github.com/apache/tvm/tree/main/apps/android_rpc>`_ to
# install the TVM RPC APK on the android device. Make sure you can pass the android rpc test.
# Then you have already registered your device. During tuning, you have to go to developer option
# and enable "Keep screen awake during changing" and charge your phone to make it stable.
#
# After registering devices, we can confirm it by querying rpc_tracker
#
# .. code-block:: bash
#
# python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
#
# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 3B and 2 rk3399,
# the output can be
#
# .. code-block:: bash
#
# Queue Status
# ----------------------------------
# key total free pending
# ----------------------------------
# mate10pro 2 2 0
# rk3399 2 2 0
# rpi3b 11 11 0
# ----------------------------------
#
# You can register multiple devices to the tracker to accelerate the measurement in tuning.
###########################################
# Set Tuning Options
# ------------------
# Before tuning, we should apply some configurations. Here I use an RK3399 board
# as example. In your setting, you should modify the target and device_key accordingly.
# set :code:`use_android` to True if you use android phone.
#### DEVICE CONFIG ####
# Replace "aarch64-linux-gnu" with the correct target of your board.
# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
target = tvm.target.Target("llvm -mcpu=cortex-a75 -mtriple=aarch64-linux-gnu")
# Also replace this with the device key in your tracker
device_key = "db845"
# Set this to True if you use android phone
use_android = False
#### TUNING OPTION ####
#network = "inception_v3"
network = "inception-v3f"
log_file = "%s.%s.log" % (device_key, network)
dtype = "float32"
tuning_option = {
"log_filename": log_file,
"tuner": "xgb",
"n_trial": 1500,
"early_stopping": 800,
"measure_option": autotvm.measure_option(
builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"),
runner=autotvm.RPCRunner(
device_key,
host="0.0.0.0",
port=9190,
number=5,
timeout=10,
),
),
}
####################################################################
#
# .. note:: How to set tuning options
#
# In general, the default values provided here work well.
# If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
# which makes the tuning run longer.
# If your device runs very slow or your conv2d operators have many GFLOPs, considering to
# set timeout larger.
#
# If your model has depthwise convolution, you could consider setting
# :code:`try_spatial_pack_depthwise` be :code:`True`, which perform better than default
# optimization in general. For example, on ARM CPU A53 2.0GHz, we find it could boost 1.6x
# performance of depthwise convolution on Mobilenet V1 model.
###################################################################
# Begin Tuning
# ------------
# Now we can extract tuning tasks from the network and begin tuning.
# Here, we provide a simple utility function to tune a list of tasks.
# This function is just an initial implementation which tunes them in sequential order.
# We will introduce a more sophisticated tuning scheduler in the future.
# You can skip the implementation of this function for this tutorial.
def tune_tasks(
tasks,
measure_option,
tuner="xgb",
n_trial=1000,
early_stopping=None,
log_filename="tuning.log",
use_transfer_learning=True,
):
# create tmp log file
tmp_log_file = log_filename + ".tmp"
if os.path.exists(tmp_log_file):
os.remove(tmp_log_file)
for i, tsk in enumerate(reversed(tasks)):
prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
# create tuner
if tuner == "xgb" or tuner == "xgb-rank":
tuner_obj = XGBTuner(tsk, loss_type="rank")
elif tuner == "xgb_knob":
tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob")
elif tuner == "ga":
tuner_obj = GATuner(tsk, pop_size=50)
elif tuner == "random":
tuner_obj = RandomTuner(tsk)
elif tuner == "gridsearch":
tuner_obj = GridSearchTuner(tsk)
else:
raise ValueError("Invalid tuner: " + tuner)
if use_transfer_learning:
if os.path.isfile(tmp_log_file):
tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
# do tuning
tsk_trial = min(n_trial, len(tsk.config_space))
tuner_obj.tune(
n_trial=tsk_trial,
early_stopping=early_stopping,
measure_option=measure_option,
callbacks=[
autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
autotvm.callback.log_to_file(tmp_log_file),
],
)
# pick best records to a cache file
autotvm.record.pick_best(tmp_log_file, log_filename)
os.remove(tmp_log_file)
########################################################################
# Finally, we launch tuning jobs and evaluate the end-to-end performance.
def tune_and_evaluate(tuning_opt):
# extract workloads from relay program
print("Extract tasks...")
mod, params, input_shape, _ = get_network(network, batch_size=1)
tasks = autotvm.task.extract_from_program(
mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
)
# run tuning tasks
print("Tuning...")
tune_tasks(tasks, **tuning_opt)
# compile kernels with history best records
with autotvm.apply_history_best(log_file):
print("Compile...")
with tvm.transform.PassContext(opt_level=3):
lib = relay.build_module.build(mod, target=target, params=params)
# export library
tmp = tempdir()
if use_android:
from tvm.contrib import ndk
filename = "net.so"
lib.export_library(tmp.relpath(filename), ndk.create_shared)
else:
filename = "net.tar"
lib.export_library(tmp.relpath(filename))
# upload module to device
print("Upload...")
remote = autotvm.measure.request_remote(device_key, "0.0.0.0", 9190, timeout=10000)
remote.upload(tmp.relpath(filename))
rlib = remote.load_module(filename)
# upload parameters to device
dev = remote.device(str(target), 0)
module = runtime.GraphModule(rlib["default"](dev))
data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
#module.set_input("data", data_tvm)
input_tensor = "input"
module.set_input(input_tensor, data_tvm)
# evaluate
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", dev, number=1, repeat=10)
prof_res = np.array(ftimer().results) * 1000 # convert to millisecond
print(
"Mean inference time (std dev): %.2f ms (%.2f ms)"
% (np.mean(prof_res), np.std(prof_res))
)
# We do not run the tuning in our webpage server since it takes too long.
# Uncomment the following line to run it by yourself.
tune_and_evaluate(tuning_option)
######################################################################
# Sample Output
# -------------
# The tuning needs to compile many programs and extract feature from them.
# So a high performance CPU is recommended.
# One sample output is listed below.
# It takes about 2 hours on a 32T AMD Ryzen Threadripper.
#
# .. code-block:: bash
#
# Extract tasks...
# Tuning...
# [Task 1/12] Current/Best: 22.37/ 52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done.
# [Task 2/12] Current/Best: 6.51/ 18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done.
# [Task 3/12] Current/Best: 4.67/ 24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done.
# [Task 4/12] Current/Best: 11.35/ 46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done.
# [Task 5/12] Current/Best: 1.01/ 19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done.
# [Task 6/12] Current/Best: 2.47/ 23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done.
# [Task 7/12] Current/Best: 14.57/ 33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done.
# [Task 8/12] Current/Best: 1.13/ 17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done.
# [Task 9/12] Current/Best: 14.45/ 22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done.
# [Task 10/12] Current/Best: 3.22/ 15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done.
# [Task 11/12] Current/Best: 11.03/ 32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done.
# [Task 12/12] Current/Best: 8.00/ 21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done.
# Compile...
# Upload...
# Evaluate inference time cost...
# Mean inference time (std dev): 162.59 ms (0.06 ms)
######################################################################
#
# .. note:: **Experiencing Difficulties?**
#
# The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
# then there must be something wrong.
#
# First, make sure you set the correct configuration of your device.
# Then, you can print debug information by adding these lines in the beginning
# of the script. It will print every measurement result, where you can find useful
# error messages.
#
# .. code-block:: python
#
# import logging
# logging.getLogger('autotvm').setLevel(logging.DEBUG)
#
# Finally, always feel free to ask our community for help on https://discuss.tvm.apache.org