[CODEGEN/RUNTIME] Metal support, runtime improvement. (dmlc#111)
* [CODEGEN/RUNTIME] Metal support, runtime improvement.

* Fix case when no device is available
tqchen authored May 2, 2017
1 parent 9ba40dc commit 706f9b6
Showing 69 changed files with 1,933 additions and 617 deletions.
29 changes: 24 additions & 5 deletions Makefile
@@ -18,7 +18,9 @@ all: lib/libtvm.so lib/libtvm_runtime.so lib/libtvm.a
LIB_HALIDE_IR = HalideIR/lib/libHalideIR.a

SRC = $(wildcard src/*.cc src/*/*.cc src/*/*/*.cc)
METAL_SRC = $(wildcard src/runtime/metal/*.mm)
ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC))
METAL_OBJ = $(patsubst src/%.mm, build/%.o, $(METAL_SRC))
ALL_DEP = $(ALL_OBJ) $(LIB_HALIDE_IR)

RUNTIME_SRC = $(wildcard src/runtime/*.cc src/runtime/*/*.cc)
@@ -29,14 +31,15 @@ ALL_DEP = $(ALL_OBJ) $(LIB_HALIDE_IR)
export LDFLAGS = -pthread -lm
export CFLAGS = -std=c++11 -Wall -O2 -fno-rtti\
-Iinclude -Idlpack/include -Idmlc-core/include -IHalideIR/src -fPIC -DDMLC_ENABLE_RTTI=0
export OBJCFLAGS= -fobjc-arc

ifdef CUDA_PATH
NVCC=$(CUDA_PATH)/bin/nvcc
CFLAGS += -I$(CUDA_PATH)/include
LDFLAGS += -L$(CUDA_PATH)/lib64
endif

ifeq ($(USE_CUDA), 1)
ifeq ($(ENABLE_CUDA), 1)
CFLAGS += -DTVM_CUDA_RUNTIME=1
LDFLAGS += -lcuda -lcudart -lnvrtc
else
@@ -45,9 +48,10 @@ endif

FRAMEWORKS=

ifeq ($(USE_OPENCL), 1)
UNAME_S := $(shell uname -s)

ifeq ($(ENABLE_OPENCL), 1)
CFLAGS += -DTVM_OPENCL_RUNTIME=1
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S), Darwin)
FRAMEWORKS += -framework OpenCL
else
@@ -57,10 +61,20 @@ else
CFLAGS += -DTVM_OPENCL_RUNTIME=0
endif

ifeq ($(ENABLE_METAL), 1)
CFLAGS += -DTVM_METAL_RUNTIME=1
LDFLAGS += -lObjc
ALL_DEP += $(METAL_OBJ)
RUNTIME_DEP += $(METAL_OBJ)
FRAMEWORKS += -framework Metal -framework Foundation
else
CFLAGS += -DTVM_METAL_RUNTIME=0
endif

# llvm configuration
LLVM_CONFIG=llvm-config

ifeq ($(USE_LLVM), 1)
ifeq ($(ENABLE_LLVM), 1)
LLVM_VERSION=$(shell $(LLVM_CONFIG) --version| cut -b 1,3)
LLVM_INCLUDE=$(filter -I%, $(shell $(LLVM_CONFIG) --cxxflags))
LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags --libs --system-libs)
@@ -87,6 +101,11 @@ build/%.o: src/%.cc
$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
$(CXX) -c $(CFLAGS) -c $< -o $@

build/%.o: src/%.mm
@mkdir -p $(@D)
$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
$(CXX) $(OBJCFLAGS) -c $(CFLAGS) -c $< -o $@

lib/libtvm.so: $(ALL_DEP)
@mkdir -p $(@D)
$(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
@@ -105,7 +124,7 @@ LIBHALIDEIR:
+ cd HalideIR; make lib/libHalideIR.a ; cd $(ROOTDIR)

cpplint:
python2 dmlc-core/scripts/lint.py tvm cpp include src verilog
python dmlc-core/scripts/lint.py tvm cpp include src verilog

pylint:
pylint python/tvm --rcfile=$(ROOTDIR)/tests/lint/pylintrc
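A quick way to confirm which of the ENABLE_* switches (including the new ENABLE_METAL) actually made it into libtvm is to query the runtime from Python. A minimal sketch, assuming tvm.module.enabled (the same helper python/tvm/build.py uses below for the LLVM fallback) recognizes these target names:

    import tvm
    from tvm import module

    # Report which backends this libtvm build was compiled with.
    for target in ["llvm", "cuda", "opencl", "metal"]:
        print("%s enabled: %s" % (target, module.enabled(target)))
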
1 change: 1 addition & 0 deletions docs/api/python/ndarray.rst
@@ -12,4 +12,5 @@ tvm.ndarray
.. autofunction:: tvm.cpu
.. autofunction:: tvm.gpu
.. autofunction:: tvm.opencl
.. autofunction:: tvm.metal
.. autofunction:: tvm.ndarray.array
7 changes: 0 additions & 7 deletions include/tvm/codegen.h
@@ -31,13 +31,6 @@ using runtime::TVMRetValue;
*/
runtime::Module Build(const Array<LoweredFunc>& funcs,
const std::string& target);

/*!
* \param target The target to be queried.
* \return Whether target is enabled.
*/
bool TargetEnabled(const std::string& target);

} // namespace codegen
} // namespace tvm

4 changes: 3 additions & 1 deletion include/tvm/runtime/c_runtime_api.h
@@ -41,6 +41,8 @@ typedef int64_t tvm_index_t;

/*! \brief Extension device types in TVM */
typedef enum {
/*! \brief Metal buffer. */
kMetal = 8,
/*! \brief Simulated on board RAM */
kVPI = 9
} TVMDeviceExtType;
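The new kMetal = 8 entry extends the numeric device-type space that the Python bindings mirror in TVMContext.MASK2STR/STR2MASK further down in this commit, so the raw mask and the named constructors describe the same context. A small illustrative check (context construction does not require the device to be present):

    import tvm

    # Mask 8 is kMetal; the generic factory and the named helper should agree.
    assert tvm.context(8, 0) == tvm.metal(0)
    assert tvm.context("metal", 0) == tvm.metal(0)
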
@@ -360,7 +362,7 @@ TVM_DLL int TVMFuncGetGlobal(const char* name, TVMFunctionHandle* out);
TVM_DLL int TVMFuncListGlobalNames(int *out_size,
const char*** out_array);

// Array related apis for quick proptying
// Array related apis for quick proptyping
/*!
* \brief Allocate a nd-array's memory,
* including space of shape, of given spec.
7 changes: 7 additions & 0 deletions include/tvm/runtime/config.h
@@ -20,4 +20,11 @@
#define TVM_OPENCL_RUNTIME 0
#endif

/*!
*\brief whether to use metal runtime
*/
#ifndef TVM_METAL_RUNTIME
#define TVM_METAL_RUNTIME 0
#endif

#endif // TVM_RUNTIME_CONFIG_H_
4 changes: 2 additions & 2 deletions include/tvm/runtime/module.h
@@ -145,8 +145,8 @@ class ModuleNode {
namespace symbol {
/*! \brief Global variable to store module context. */
constexpr const char* tvm_module_ctx = "__tvm_module_ctx";
/*! \brief Local function to set the device during API entry. */
constexpr const char* tvm_entry_setdevice = "__tvm_entry_setdevice";
/*! \brief global function to set device */
constexpr const char* tvm_set_device = "__tvm_set_device";
/*! \brief Auxiliary counter to global barrier. */
constexpr const char* tvm_global_barrier_state = "__tvm_global_barrier_state";
/*! \brief Prepare the global barrier before kernels that uses global barrier. */
13 changes: 8 additions & 5 deletions make/config.mk
@@ -34,16 +34,19 @@ ADD_CFLAGS =
# matrix computation libraries for CPU/GPU
#---------------------------------------------

# whether use CUDA during compile
USE_CUDA = 1
# whether enable CUDA during compile
ENABLE_CUDA = 1

# whether use OpenCL during compile
USE_OPENCL = 0
# whether enable OpenCL during compile
ENABLE_OPENCL = 0

# whether enable Metal during compile
ENABLE_METAL = 0

# whether build with LLVM support
# This requires llvm-config to be in your PATH
# Requires LLVM version >= 4.0
USE_LLVM = 0
ENABLE_LLVM = 0

# add the path to CUDA library to link and compile flag
# if you have already add them to environment variable.
2 changes: 1 addition & 1 deletion python/tvm/__init__.py
@@ -16,7 +16,7 @@
from . import ir_builder

from . import ndarray as nd
from .ndarray import cpu, gpu, opencl, cl, vpi
from .ndarray import context, cpu, gpu, opencl, cl, metal, mtl, vpi

from ._ffi.function import Function
from ._ffi.base import TVMError, __version__
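With the new exports, a context can be obtained either from the per-device helpers or from the generic context() factory, and cl/mtl act as short aliases for opencl/metal. A brief sketch of the equivalences implied by the import line above and the STR2MASK table later in this commit:

    import tvm

    ctx = tvm.metal(0)                            # same device as tvm.mtl(0)
    assert ctx == tvm.context("metal", 0)
    assert tvm.cl(1) == tvm.opencl(1)
    assert tvm.context("cuda", 0) == tvm.gpu(0)   # "cuda" maps to the GPU mask
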
115 changes: 68 additions & 47 deletions python/tvm/_ffi/ndarray.py
@@ -4,7 +4,8 @@
from __future__ import absolute_import
import ctypes
import numpy as np
from .base import _LIB, check_call, c_array
from .base import _LIB, check_call, c_array, string_types
from .. import _api_internal

tvm_shape_index_t = ctypes.c_int64

@@ -63,22 +64,62 @@ def __eq__(self, other):
def __ne__(self, other):
return not self.__eq__(other)


class TVMContext(ctypes.Structure):
"""TVM context strucure."""
_fields_ = [("device_id", ctypes.c_int),
("device_type", ctypes.c_int)]

MASK2STR = {
1 : 'cpu',
2 : 'gpu',
4 : 'opencl',
8 : 'metal',
9 : 'vpi'
}
def __init__(self, device_id, device_type):
STR2MASK = {
'cpu': 1,
'gpu': 2,
'cuda': 2,
'cl': 4,
'opencl': 4,
'metal': 8,
'vpi': 9
}
def __init__(self, device_type, device_id):
super(TVMContext, self).__init__()
self.device_id = device_id
self.device_type = device_type

@property
def exist(self):
"""Whether this device exist."""
return _api_internal._GetDeviceAttr(
self.device_type, self.device_id, 0) != 0

@property
def max_threads_per_block(self):
"""Maximum number of threads on each block."""
return _api_internal._GetDeviceAttr(
self.device_type, self.device_id, 1)

@property
def warp_size(self):
"""Number of threads that executes in concurrent."""
return _api_internal._GetDeviceAttr(
self.device_type, self.device_id, 2)

def sync(self):
"""Synchronize until jobs finished at the context."""
check_call(_LIB.TVMSynchronize(self, None))

def __eq__(self, other):
return (isinstance(other, TVMContext) and
self.device_id == other.device_id and
self.device_type == other.device_type)

def __ne__(self, other):
return not self.__eq__(other)

def __repr__(self):
return "%s(%d)" % (
TVMContext.MASK2STR[self.device_type], self.device_id)
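TVMContext now exposes device introspection directly: exist reports whether the device can actually be reached (the "no device is available" case called out in the commit message), max_threads_per_block and warp_size return launch limits, and sync() replaces the module-level tvm.nd.sync(ctx) removed later in this diff. A usage sketch, assuming a build with at least one GPU-style runtime enabled:

    import tvm

    ctx = tvm.gpu(0)
    if ctx.exist:
        print("max threads per block: %d" % ctx.max_threads_per_block)
        print("warp size: %d" % ctx.warp_size)
        ctx.sync()   # block until all queued work on this device has finished
    else:
        print("no GPU device available, skipping")
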
@@ -97,48 +138,38 @@ class TVMArray(ctypes.Structure):

TVMArrayHandle = ctypes.POINTER(TVMArray)


def cpu(dev_id=0):
"""Construct a CPU device
def context(dev_type, dev_id=0):
"""Construct a TVM context with given device type and id.
Parameters
----------
dev_id : int, optional
The integer device id
"""
return TVMContext(dev_id, 1)

dev_type: int or str
The device type mask or name of the device.
def gpu(dev_id=0):
"""Construct a CPU device
Parameters
----------
dev_id : int, optional
The integer device id
"""
return TVMContext(dev_id, 2)
Returns
-------
ctx: TVMContext
The corresponding context.
def opencl(dev_id=0):
"""Construct a OpenCL device
Examples
--------
Context can be used to create reflection of context by
string representation of the device type.
Parameters
----------
dev_id : int, optional
The integer device id
"""
return TVMContext(dev_id, 4)
.. code-block:: python
def vpi(dev_id=0):
"""Construct a VPI simulated device
Parameters
----------
dev_id : int, optional
The integer device id
assert tvm.context("cpu", 1) == tvm.cpu(1)
assert tvm.context("gpu", 0) == tvm.gpu(0)
assert tvm.context("cuda", 0) == tvm.gpu(0)
"""
return TVMContext(dev_id, 9)
if isinstance(dev_type, string_types):
if not dev_type in TVMContext.STR2MASK:
raise ValueError("Unknown device type %s" % dev_type)
dev_type = TVMContext.STR2MASK[dev_type]
return TVMContext(dev_type, dev_id)
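With context() as the single factory, the per-device constructors that used to live here (and are now re-exported through python/tvm/__init__.py) can be expressed as thin wrappers over it. The sketch below is illustrative only; the actual definitions in this commit may differ:

    # Hypothetical wrappers; the real helpers come from python/tvm/ndarray.py
    # (see the __init__.py change above).
    def metal(dev_id=0):
        """Construct a Metal device context."""
        return context("metal", dev_id)

    def gpu(dev_id=0):
        """Construct a GPU (CUDA) device context."""
        return context("gpu", dev_id)
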


def numpyasarray(np_data):
@@ -154,10 +185,11 @@ def numpyasarray(np_data):
arr.dtype = TVMType(np.dtype(data.dtype).name)
arr.ndim = data.ndim
# CPU device
arr.ctx = cpu(0)
arr.ctx = context(1, 0)
return arr, shape

def empty(shape, dtype="float32", ctx=cpu(0)):

def empty(shape, dtype="float32", ctx=context(1, 0)):
"""Create an empty array given shape and device
Parameters
@@ -185,17 +217,6 @@ def empty(shape, dtype="float32", ctx=cpu(0)):
return _CLASS_NDARRAY(handle)


def sync(ctx):
"""Synchronize all the context
Parameters
----------
ctx : TVMContext
The context to be synced
"""
check_call(_LIB.TVMSynchronize(ctx, None))


class NDArrayBase(object):
"""A simple Device/CPU Array object in runtime."""
__slots__ = ["handle", "is_view"]
3 changes: 2 additions & 1 deletion python/tvm/build.py
@@ -10,6 +10,7 @@
from . import expr
from . import ir_pass
from . import collections
from . import module
from . import codegen


@@ -149,7 +150,7 @@ def build(sch,
fsplits[0] = ir_pass.LowerPackedCall(fsplits[0])
if len(fsplits) > 1:
if not target_host:
target_host = "llvm" if codegen.enabled("llvm") else "stackvm"
target_host = "llvm" if module.enabled("llvm") else "stackvm"
mhost = codegen.build_module(fsplits[0], target_host)
if target:
mdev = codegen.build_module(fsplits[1:], target)
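The host-side fallback now asks module.enabled rather than the removed codegen.enabled: when a device target is given without target_host, host glue is compiled with LLVM if it was built in, else with StackVM. The same check can be reproduced directly; a small sketch mirroring the line above:

    import tvm
    from tvm import module

    # Same decision build() makes when target_host is not specified.
    target_host = "llvm" if module.enabled("llvm") else "stackvm"
    print("host modules will be compiled with: %s" % target_host)
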
17 changes: 0 additions & 17 deletions python/tvm/codegen.py
@@ -19,21 +19,4 @@ def build_module(lowered_func, target):
"""
return _Build(lowered_func, target)


def enabled(target):
"""Whether target is enabled for codegen.
Parameters
----------
target : str
The target module type.
Returns
-------
enabled : boolean
Whether the target module is enabled.
"""
return _Enabled(target)


_init_api("tvm.codegen")