apache · zhiics · Jul 21, 2020 · Jul 9, 2020 · Jul 6, 2020 · Jul 14, 2020
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -71,6 +71,8 @@ tvm_option(USE_TFLITE "Build with tflite support" OFF)
 tvm_option(USE_TENSORFLOW_PATH "TensorFlow root path when use TFLite" none)
 tvm_option(USE_COREML "Build with coreml support" OFF)
 tvm_option(USE_TARGET_ONNX "Build with ONNX Codegen support" OFF)
+tvm_option(USE_ARM_COMPUTE_LIB "Build with Arm Compute Library" OFF)
+tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library graph runtime" OFF)
 
 if(USE_CPP_RPC AND UNIX)
   message(FATAL_ERROR "USE_CPP_RPC is only supported with WIN32. Use the Makefile for non-Windows.")
@@ -331,6 +333,7 @@ include(cmake/modules/contrib/TFLite.cmake)
 include(cmake/modules/contrib/TF_TVMDSOOP.cmake)
 include(cmake/modules/contrib/CoreML.cmake)
 include(cmake/modules/contrib/ONNX.cmake)
+include(cmake/modules/contrib/ArmComputeLib.cmake)
 
 include(CheckCXXCompilerFlag)
 if(NOT MSVC)

diff --git a/cmake/config.cmake b/cmake/config.cmake
@@ -184,6 +184,18 @@ set(USE_SORT ON)
 # Whether use MKL-DNN (DNNL) codegen
 set(USE_DNNL_CODEGEN OFF)
 
+# Whether to use Arm Compute Library (ACL) codegen
+# We provide 2 separate flags since we cannot build the ACL runtime on x86.
+# This is useful for cases where you want to cross-compile a relay graph
+# on x86 then run on AArch.
+#
+# USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported
+#                       operators to Arm Compute Library. OFF/ON
+# USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME - Run Arm Compute Library annotated functions via the ACL
+#                                     runtime. OFF/ON/"path/to/ACL"
+set(USE_ARM_COMPUTE_LIB OFF)
+set(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME OFF)
+
 # Build ANTLR parser for Relay text format
 # Possible values:
 # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)

diff --git a/cmake/modules/contrib/ArmComputeLib.cmake b/cmake/modules/contrib/ArmComputeLib.cmake
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# We separate the codegen and runtime build since ACL can only be built
+# for AArch. In the world where we take the cross compilation approach,
+# which is common with arm devices, we need to be able to cross-compile
+# a relay graph on x86 for AArch and then run the graph on AArch.
+if(USE_ARM_COMPUTE_LIB)
+    file(GLOB ACL_RELAY_CONTRIB_SRC src/relay/backend/contrib/arm_compute_lib/*.cc)
+    file(GLOB ACL_RUNTIME_MODULE src/runtime/contrib/arm_compute_lib/acl_runtime.cc)
+    list(APPEND COMPILER_SRCS ${ACL_RELAY_CONTRIB_SRC})
+    list(APPEND COMPILER_SRCS ${ACL_RUNTIME_MODULE})
+    message(STATUS "Build with Arm Compute Library support...")
+endif()
+
+if(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME)
+    set(ACL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/acl)
+    # Detect custom ACL path.
+    if (NOT USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME STREQUAL "ON")
+        set(ACL_PATH ${USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME})
+    endif()
+
+    file(GLOB ACL_CONTRIB_SRC src/runtime/contrib/arm_compute_lib/*)
+
+    set(ACL_INCLUDE_DIRS ${ACL_PATH}/include ${ACL_PATH})
+    include_directories(${ACL_INCLUDE_DIRS})
+
+    find_library(EXTERN_ACL_COMPUTE_LIB
+            NAMES arm_compute libarm_compute
+            HINTS "${ACL_PATH}" "${ACL_PATH}/lib" "${ACL_PATH}/build"
+            )
+    find_library(EXTERN_ACL_COMPUTE_CORE_LIB
+            NAMES arm_compute_core libarm_compute_core
+            HINTS "${ACL_PATH}" "${ACL_PATH}/lib" "${ACL_PATH}/build"
+            )
+    find_library(EXTERN_ACL_COMPUTE_GRAPH_LIB
+            NAMES arm_compute_graph libarm_compute_graph
+            HINTS "${ACL_PATH}" "${ACL_PATH}/lib" "${ACL_PATH}/build"
+            )
+
+    list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_ACL_COMPUTE_LIB})
+    list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_ACL_COMPUTE_CORE_LIB})
+    list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_ACL_COMPUTE_GRAPH_LIB})
+    list(APPEND RUNTIME_SRCS ${ACL_CONTRIB_SRC})
+    message(STATUS "Build with Arm Compute Library graph runtime support: "
+            ${EXTERN_ACL_COMPUTE_LIB} ", \n"
+            ${EXTERN_ACL_COMPUTE_CORE_LIB} ", \n"
+            ${EXTERN_ACL_COMPUTE_GRAPH_LIB})
+
+    # Set flag to detect ACL graph runtime support.
+    add_definitions(-DTVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB)
+endif()
diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst
@@ -0,0 +1,138 @@
+..  Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+..    http://www.apache.org/licenses/LICENSE-2.0
+
+..  Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+Relay Arm|reg| Compute Library Integration
+==========================================
+
+Introduction
+------------
+
+Arm Compute Library (ACL) is an open source project that provides accelerated kernels for Arm CPU's
+and GPU's. Currently the integration offloads operators to ACL to use hand-crafted assembler
+routines in the library. By offloading select operators from a relay graph to ACL we can achieve
+a performance boost on such devices.
+
+Building with ACL support
+-------------------------
+
+The current implementation has two separate build options in cmake. The reason for this split is
+because ACL cannot be used on an x86 machine. However, we still want to be able compile an ACL
+runtime module on an x86 machine.
+
+* USE_ARM_COMPUTE_LIB=ON/OFF - Enabling this flag will add support for compiling an ACL runtime module.
+* USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME=ON/OFF/path-to-acl - Enabling this flag will allow the graph runtime to
+  compute the ACL offloaded functions.
+
+These flags can be used in different scenarios depending on your setup. For example, if you want
+to compile ACL on an x86 machine and then run the module on a remote Arm device via RPC, you will
+need to use USE_ARM_COMPUTE_LIB=ON on the x86 machine and USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME=ON on the remote
+AArch64 device.
+
+Usage
+-----
+
+.. note::
+
+    This section may not stay up-to-date with changes to the API.
+
+Create a relay graph. This may be a single operator or a whole graph. The intention is that any
+relay graph can be input. The ACL integration will only pick supported operators to be offloaded
+whilst the rest will be computed via TVM. (For this example we will use a single
+max_pool2d operator).
+
+.. code:: python
+
+    import tvm
+    from tvm import relay
+
+    data_type = "float32"
+    data_shape = (1, 14, 14, 512)
+    strides = (2, 2)
+    padding = (0, 0, 0, 0)
+    pool_size = (2, 2)
+    layout = "NHWC"
+    output_shape = (1, 7, 7, 512)
+
+    data = relay.var('data', shape=data_shape, dtype=data_type)
+    out = relay.nn.max_pool2d(data, pool_size=pool_size, strides=strides, layout=layout, padding=padding)
+    module = tvm.IRModule.from_expr(out)
+
+
+Annotate and partition the graph for ACL.
+
+..code:: python
+
+    from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib
+    partition_for_arm_compute_lib(module)
+
+
+Build the Relay graph.
+
+.. code:: python
+
+    target = "llvm -mtriple=aarch64-linux-gnu -mattr=+neon"
+    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
+        lib = relay.build(module, target=target)
+
+
+Export the module.
+
+.. code:: python
+
+    lib_path = '~/lib_acl.so'
+    cross_compile = 'aarch64-linux-gnu-c++'
+    lib.export_library(lib_path, cc=cross_compile)
+
+
+Run Inference. This must be on an Arm device. If compiling on x86 device and running on aarch64,
+consider using the RPC mechanism.
+
+.. code:: python
+
+    ctx = tvm.cpu(0)
+    loaded_lib = tvm.runtime.load_module('lib_acl.so')
+    gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx))
+    d_data = np.random.uniform(0, 1, data_shape).astype(data_type)
+    map_inputs = {'data': d_data}
+    gen_module.set_input(**map_inputs)
+    gen_module.run()
+
+
+More examples
+-------------
+The example above only shows a basic example of how ACL can be used for offloading a single
+Maxpool2D. If you would like to see more examples for each implemented operator and for
+networks refer to the tests: `tests/python/contrib/test_arm_compute_lib`. Here you can modify
+`infrastructure.py` to use the remote device you have setup.
+
+
+Adding a new operator
+---------------------
+Adding a new operator requires changes to a series of places. This section will give a hint on
+what needs to be changed and where, it will not however dive into the complexities for an
+individual operator. This is left to the developer.
+
+There are a series of files we need to make changes to:
+* `python/relay/op/contrib/arm_compute_lib.py` In this file we define the operators we wish to offload using the
+`op.register` decorator. This will mean the annotation pass recognizes this operator as ACL
+offloadable.
+* `src/relay/backend/contrib/arm_compute_lib/codegen_acl.h` Implement `Make[OpName]` method. This is where we
+declare how the operator should be represented by JSON. This will be used to create the ACL module.
+* `src/runtime/contrib/arm_compute_lib/acl_kernel.h` Implement `Create[OpName]Layer` method. This is where we
+define how the JSON representation can be used to create an ACL function. We simply define how to
+translate from the JSON representation to ACL API.
+* `tests/python/contrib/test_arm_compute_lib` Add unit tests for the given operator.
diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst
@@ -68,3 +68,4 @@ target device without relying on RPC. see the following resources on how to do s
    android
    integrate
    hls
+   arm_compute_lib
diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py
@@ -18,5 +18,6 @@
 """Contrib modules."""
 from .register import get_pattern_table, register_pattern_table
 
+from .arm_compute_lib import *
 from .dnnl import *
 from .coreml import *
diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -0,0 +1,119 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument
+"""Arm Compute Library supported operators."""
+import tvm
+from tvm.relay import transform
+from tvm.relay.build_module import bind_params_by_name
+
+from ...dataflow_pattern import wildcard, is_op, is_constant
+from .register import register_pattern_table
+
+
+def is_arm_compute_runtime_enabled():
+    """Check if the ACL graph runtime is present.
+
+    Returns
+    -------
+    ret: bool
+        True if present, False if not.
+    """
+    return tvm.get_global_func("relay.op.is_arm_compute_runtime_enabled")()
+
+
+def partition_for_arm_compute_lib(mod, params=None):
+    """Partition the graph greedily offloading supported
+    operators to Arm Compute Library.
+
+    Parameters
+    ----------
+    mod : Module
+        The module to run passes on.
+    params : Optional[Dict[str, NDArray]]
+        Constant input parameters.
+
+    Returns
+    -------
+    ret : annotated and partitioned module.
+    """
+    if params:
+        mod['main'] = bind_params_by_name(mod['main'], params)
+
+    seq = tvm.transform.Sequential([transform.MergeComposite(arm_compute_lib_pattern_table()),
+                                    transform.AnnotateTarget('arm_compute_lib'),
+                                    transform.PartitionGraph()])
+
+    return seq(mod)
+
+
+@register_pattern_table("arm_compute_lib")
+def arm_compute_lib_pattern_table():
+    """Get the ACL pattern table."""
+
+    def conv_pattern():
+        """Create a convolution pattern.
+
+        Returns
+        -------
+        pattern : dataflow_pattern.AltPattern
+            Denotes the convolution pattern.
+        """
+        pattern = is_op('nn.pad')(wildcard()) | wildcard()
+        pattern = is_op('nn.conv2d')(pattern, is_constant())
+        pattern = pattern.optional(lambda x: is_op('nn.bias_add')(x, is_constant()))
+        pattern = pattern.optional(is_op('nn.relu'))
+        return pattern
+
+    def check_conv(extract):
+        """Check conv pattern is supported by ACL."""
+        call = extract
+        while call.op.name != "nn.conv2d":
+            call = call.args[0]
+        return conv2d(call.attrs, call.args)
+
+    return [('arm_compute_lib.conv2d', conv_pattern(), check_conv)]
+
+
+def _register_external_op_helper(op_name, supported=True):
+    @tvm.ir.register_op_attr(op_name, "target.arm_compute_lib")
+    def _func_wrapper(attrs, args):
+        return supported
+
+    return _func_wrapper
+
+
+_register_external_op_helper("reshape")
+
+
+@tvm.ir.register_op_attr("nn.conv2d", "target.arm_compute_lib")
+def conv2d(attrs, args):
+    """Check if the external ACL codegen for conv2d should be used."""
+    if attrs.groups != 1:
+        return False
+    if attrs.data_layout != "NHWC":
+        return False
+
+    return True
+
+
+@tvm.ir.register_op_attr("nn.max_pool2d", "target.arm_compute_lib")
+def max_pool2d(attrs, args):
+    """Check if the external ACL codegen for maxpool2d should be used."""
+    if attrs.layout != "NHWC":
+        return False
+
+    return True
-Original file line number
+Diff line change
@@ Expand Up @@
        android
        integrate
        hls
+       arm_compute_lib