[CODEGEN/RUNTIME] Metal support, runtime improvement. (dmlc#111)
* [CODEGEN/RUNTIME] Metal support, runtime improvement.

* Fix case when no device is available
tqchen authored May 2, 2017
1 parent 9ba40dc commit 706f9b6
Showing 69 changed files with 1,933 additions and 617 deletions.
29 changes: 24 additions & 5 deletions Makefile
@@ -18,7 +18,9 @@ all: lib/libtvm.so lib/libtvm_runtime.so lib/libtvm.a
LIB_HALIDE_IR = HalideIR/lib/libHalideIR.a

SRC = $(wildcard src/*.cc src/*/*.cc src/*/*/*.cc)
METAL_SRC = $(wildcard src/runtime/metal/*.mm)
ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC))
METAL_OBJ = $(patsubst src/%.mm, build/%.o, $(METAL_SRC))
ALL_DEP = $(ALL_OBJ) $(LIB_HALIDE_IR)

RUNTIME_SRC = $(wildcard src/runtime/*.cc src/runtime/*/*.cc)
@@ -29,14 +31,15 @@ ALL_DEP = $(ALL_OBJ) $(LIB_HALIDE_IR)
export LDFLAGS = -pthread -lm
export CFLAGS = -std=c++11 -Wall -O2 -fno-rtti\
-Iinclude -Idlpack/include -Idmlc-core/include -IHalideIR/src -fPIC -DDMLC_ENABLE_RTTI=0
export OBJCFLAGS= -fobjc-arc

ifdef CUDA_PATH
NVCC=$(CUDA_PATH)/bin/nvcc
CFLAGS += -I$(CUDA_PATH)/include
LDFLAGS += -L$(CUDA_PATH)/lib64
endif

ifeq ($(USE_CUDA), 1)
ifeq ($(ENABLE_CUDA), 1)
CFLAGS += -DTVM_CUDA_RUNTIME=1
LDFLAGS += -lcuda -lcudart -lnvrtc
else
@@ -45,9 +48,10 @@ endif

FRAMEWORKS=

ifeq ($(USE_OPENCL), 1)
UNAME_S := $(shell uname -s)

ifeq ($(ENABLE_OPENCL), 1)
CFLAGS += -DTVM_OPENCL_RUNTIME=1
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S), Darwin)
FRAMEWORKS += -framework OpenCL
else
@@ -57,10 +61,20 @@ else
CFLAGS += -DTVM_OPENCL_RUNTIME=0
endif

ifeq ($(ENABLE_METAL), 1)
CFLAGS += -DTVM_METAL_RUNTIME=1
LDFLAGS += -lObjc
ALL_DEP += $(METAL_OBJ)
RUNTIME_DEP += $(METAL_OBJ)
FRAMEWORKS += -framework Metal -framework Foundation
else
CFLAGS += -DTVM_METAL_RUNTIME=0
endif

# llvm configuration
LLVM_CONFIG=llvm-config

ifeq ($(USE_LLVM), 1)
ifeq ($(ENABLE_LLVM), 1)
LLVM_VERSION=$(shell $(LLVM_CONFIG) --version| cut -b 1,3)
LLVM_INCLUDE=$(filter -I%, $(shell $(LLVM_CONFIG) --cxxflags))
LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags --libs --system-libs)
@@ -87,6 +101,11 @@ build/%.o: src/%.cc
$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
$(CXX) -c $(CFLAGS) -c $< -o $@

build/%.o: src/%.mm
@mkdir -p $(@D)
$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
$(CXX) $(OBJCFLAGS) -c $(CFLAGS) -c $< -o $@

lib/libtvm.so: $(ALL_DEP)
@mkdir -p $(@D)
$(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
@@ -105,7 +124,7 @@ LIBHALIDEIR:
+ cd HalideIR; make lib/libHalideIR.a ; cd $(ROOTDIR)

cpplint:
python2 dmlc-core/scripts/lint.py tvm cpp include src verilog
python dmlc-core/scripts/lint.py tvm cpp include src verilog

pylint:
pylint python/tvm --rcfile=$(ROOTDIR)/tests/lint/pylintrc
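A quick way to confirm which of the ENABLE_* switches (including the new ENABLE_METAL) actually made it into libtvm is to query the runtime from Python. A minimal sketch, assuming tvm.module.enabled (the same helper python/tvm/build.py uses below for the LLVM fallback) recognizes these target names:

    import tvm
    from tvm import module

    # Report which backends this libtvm build was compiled with.
    for target in ["llvm", "cuda", "opencl", "metal"]:
        print("%s enabled: %s" % (target, module.enabled(target)))
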
1 change: 1 addition & 0 deletions docs/api/python/ndarray.rst
@@ -12,4 +12,5 @@ tvm.ndarray
.. autofunction:: tvm.cpu
.. autofunction:: tvm.gpu
.. autofunction:: tvm.opencl
.. autofunction:: tvm.metal
.. autofunction:: tvm.ndarray.array
7 changes: 0 additions & 7 deletions include/tvm/codegen.h
@@ -31,13 +31,6 @@ using runtime::TVMRetValue;
*/
runtime::Module Build(const Array<LoweredFunc>& funcs,
const std::string& target);

/*!
* \param target The target to be queried.
* \return Whether target is enabled.
*/
bool TargetEnabled(const std::string& target);

} // namespace codegen
} // namespace tvm

4 changes: 3 additions & 1 deletion include/tvm/runtime/c_runtime_api.h
@@ -41,6 +41,8 @@ typedef int64_t tvm_index_t;

/*! \brief Extension device types in TVM */
typedef enum {
/*! \brief Metal buffer. */
kMetal = 8,
/*! \brief Simulated on board RAM */
kVPI = 9
} TVMDeviceExtType;
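The new kMetal = 8 entry extends the numeric device-type space that the Python bindings mirror in TVMContext.MASK2STR/STR2MASK further down in this commit, so the raw mask and the named constructors describe the same context. A small illustrative check (context construction does not require the device to be present):

    import tvm

    # Mask 8 is kMetal; the generic factory and the named helper should agree.
    assert tvm.context(8, 0) == tvm.metal(0)
    assert tvm.context("metal", 0) == tvm.metal(0)
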
@@ -360,7 +362,7 @@ TVM_DLL int TVMFuncGetGlobal(const char* name, TVMFunctionHandle* out);
TVM_DLL int TVMFuncListGlobalNames(int *out_size,
const char*** out_array);

// Array related apis for quick proptying
// Array related apis for quick proptyping
/*!
* \brief Allocate a nd-array's memory,
* including space of shape, of given spec.
7 changes: 7 additions & 0 deletions include/tvm/runtime/config.h
@@ -20,4 +20,11 @@
#define TVM_OPENCL_RUNTIME 0
#endif

/*!
*\brief whether to use metal runtime
*/
#ifndef TVM_METAL_RUNTIME
#define TVM_METAL_RUNTIME 0
#endif

#endif // TVM_RUNTIME_CONFIG_H_
4 changes: 2 additions & 2 deletions include/tvm/runtime/module.h
@@ -145,8 +145,8 @@ class ModuleNode {
namespace symbol {
/*! \brief Global variable to store module context. */
constexpr const char* tvm_module_ctx = "__tvm_module_ctx";
/*! \brief Local function to set the device during API entry. */
constexpr const char* tvm_entry_setdevice = "__tvm_entry_setdevice";
/*! \brief global function to set device */
constexpr const char* tvm_set_device = "__tvm_set_device";
/*! \brief Auxiliary counter to global barrier. */
constexpr const char* tvm_global_barrier_state = "__tvm_global_barrier_state";
/*! \brief Prepare the global barrier before kernels that uses global barrier. */
13 changes: 8 additions & 5 deletions make/config.mk
@@ -34,16 +34,19 @@ ADD_CFLAGS =
# matrix computation libraries for CPU/GPU
#---------------------------------------------

# whether use CUDA during compile
USE_CUDA = 1
# whether enable CUDA during compile
ENABLE_CUDA = 1

# whether use OpenCL during compile
USE_OPENCL = 0
# whether enable OpenCL during compile
ENABLE_OPENCL = 0

# whether enable Metal during compile
ENABLE_METAL = 0

# whether build with LLVM support
# This requires llvm-config to be in your PATH
# Requires LLVM version >= 4.0
USE_LLVM = 0
ENABLE_LLVM = 0

# add the path to CUDA library to link and compile flag
# if you have already add them to environment variable.
2 changes: 1 addition & 1 deletion python/tvm/__init__.py
@@ -16,7 +16,7 @@
from . import ir_builder

from . import ndarray as nd
from .ndarray import cpu, gpu, opencl, cl, vpi
from .ndarray import context, cpu, gpu, opencl, cl, metal, mtl, vpi

from ._ffi.function import Function
from ._ffi.base import TVMError, __version__
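With the new exports, a context can be obtained either from the per-device helpers or from the generic context() factory, and cl/mtl act as short aliases for opencl/metal. A brief sketch of the equivalences implied by the import line above and the STR2MASK table later in this commit:

    import tvm

    ctx = tvm.metal(0)                            # same device as tvm.mtl(0)
    assert ctx == tvm.context("metal", 0)
    assert tvm.cl(1) == tvm.opencl(1)
    assert tvm.context("cuda", 0) == tvm.gpu(0)   # "cuda" maps to the GPU mask
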
115 changes: 68 additions & 47 deletions python/tvm/_ffi/ndarray.py
@@ -4,7 +4,8 @@
from __future__ import absolute_import
import ctypes
import numpy as np
from .base import _LIB, check_call, c_array
from .base import _LIB, check_call, c_array, string_types
from .. import _api_internal

tvm_shape_index_t = ctypes.c_int64

@@ -63,22 +64,62 @@ def __eq__(self, other):
def __ne__(self, other):
return not self.__eq__(other)


class TVMContext(ctypes.Structure):
"""TVM context strucure."""
_fields_ = [("device_id", ctypes.c_int),
("device_type", ctypes.c_int)]

MASK2STR = {
1 : 'cpu',
2 : 'gpu',
4 : 'opencl',
8 : 'metal',
9 : 'vpi'
}
def __init__(self, device_id, device_type):
STR2MASK = {
'cpu': 1,
'gpu': 2,
'cuda': 2,
'cl': 4,
'opencl': 4,
'metal': 8,
'vpi': 9
}
def __init__(self, device_type, device_id):
super(TVMContext, self).__init__()
self.device_id = device_id
self.device_type = device_type

@property
def exist(self):
"""Whether this device exist."""
return _api_internal._GetDeviceAttr(
self.device_type, self.device_id, 0) != 0

@property
def max_threads_per_block(self):
"""Maximum number of threads on each block."""
return _api_internal._GetDeviceAttr(
self.device_type, self.device_id, 1)

@property
def warp_size(self):
"""Number of threads that executes in concurrent."""
return _api_internal._GetDeviceAttr(
self.device_type, self.device_id, 2)

def sync(self):
"""Synchronize until jobs finished at the context."""
check_call(_LIB.TVMSynchronize(self, None))

def __eq__(self, other):
return (isinstance(other, TVMContext) and
self.device_id == other.device_id and
self.device_type == other.device_type)

def __ne__(self, other):
return not self.__eq__(other)

def __repr__(self):
return "%s(%d)" % (
TVMContext.MASK2STR[self.device_type], self.device_id)
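TVMContext now exposes device introspection directly: exist reports whether the device can actually be reached (the "no device is available" case called out in the commit message), max_threads_per_block and warp_size return launch limits, and sync() replaces the module-level tvm.nd.sync(ctx) removed later in this diff. A usage sketch, assuming a build with at least one GPU-style runtime enabled:

    import tvm

    ctx = tvm.gpu(0)
    if ctx.exist:
        print("max threads per block: %d" % ctx.max_threads_per_block)
        print("warp size: %d" % ctx.warp_size)
        ctx.sync()   # block until all queued work on this device has finished
    else:
        print("no GPU device available, skipping")
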
@@ -97,48 +138,38 @@ class TVMArray(ctypes.Structure):

TVMArrayHandle = ctypes.POINTER(TVMArray)


def cpu(dev_id=0):
"""Construct a CPU device
def context(dev_type, dev_id=0):
"""Construct a TVM context with given device type and id.
Parameters
----------
dev_id : int, optional
The integer device id
"""
return TVMContext(dev_id, 1)

dev_type: int or str
The device type mask or name of the device.
def gpu(dev_id=0):
"""Construct a CPU device
Parameters
----------
dev_id : int, optional
The integer device id
"""
return TVMContext(dev_id, 2)
Returns
-------
ctx: TVMContext
The corresponding context.
def opencl(dev_id=0):
"""Construct a OpenCL device
Examples
--------
Context can be used to create reflection of context by
string representation of the device type.
Parameters
----------
dev_id : int, optional
The integer device id
"""
return TVMContext(dev_id, 4)
.. code-block:: python
def vpi(dev_id=0):
"""Construct a VPI simulated device
Parameters
----------
dev_id : int, optional
The integer device id
assert tvm.context("cpu", 1) == tvm.cpu(1)
assert tvm.context("gpu", 0) == tvm.gpu(0)
assert tvm.context("cuda", 0) == tvm.gpu(0)
"""
return TVMContext(dev_id, 9)
if isinstance(dev_type, string_types):
if not dev_type in TVMContext.STR2MASK:
raise ValueError("Unknown device type %s" % dev_type)
dev_type = TVMContext.STR2MASK[dev_type]
return TVMContext(dev_type, dev_id)
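With context() as the single factory, the per-device constructors that used to live here (and are now re-exported through python/tvm/__init__.py) can be expressed as thin wrappers over it. The sketch below is illustrative only; the actual definitions in this commit may differ:

    # Hypothetical wrappers; the real helpers come from python/tvm/ndarray.py
    # (see the __init__.py change above).
    def metal(dev_id=0):
        """Construct a Metal device context."""
        return context("metal", dev_id)

    def gpu(dev_id=0):
        """Construct a GPU (CUDA) device context."""
        return context("gpu", dev_id)
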


def numpyasarray(np_data):
@@ -154,10 +185,11 @@ def numpyasarray(np_data):
arr.dtype = TVMType(np.dtype(data.dtype).name)
arr.ndim = data.ndim
# CPU device
arr.ctx = cpu(0)
arr.ctx = context(1, 0)
return arr, shape

def empty(shape, dtype="float32", ctx=cpu(0)):

def empty(shape, dtype="float32", ctx=context(1, 0)):
"""Create an empty array given shape and device
Parameters
@@ -185,17 +217,6 @@ def empty(shape, dtype="float32", ctx=cpu(0)):
return _CLASS_NDARRAY(handle)


def sync(ctx):
"""Synchronize all the context
Parameters
----------
ctx : TVMContext
The context to be synced
"""
check_call(_LIB.TVMSynchronize(ctx, None))


class NDArrayBase(object):
"""A simple Device/CPU Array object in runtime."""
__slots__ = ["handle", "is_view"]
3 changes: 2 additions & 1 deletion python/tvm/build.py
@@ -10,6 +10,7 @@
from . import expr
from . import ir_pass
from . import collections
from . import module
from . import codegen


@@ -149,7 +150,7 @@ def build(sch,
fsplits[0] = ir_pass.LowerPackedCall(fsplits[0])
if len(fsplits) > 1:
if not target_host:
target_host = "llvm" if codegen.enabled("llvm") else "stackvm"
target_host = "llvm" if module.enabled("llvm") else "stackvm"
mhost = codegen.build_module(fsplits[0], target_host)
if target:
mdev = codegen.build_module(fsplits[1:], target)
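The host-side fallback now asks module.enabled rather than the removed codegen.enabled: when a device target is given without target_host, host glue is compiled with LLVM if it was built in, else with StackVM. The same check can be reproduced directly; a small sketch mirroring the line above:

    import tvm
    from tvm import module

    # Same decision build() makes when target_host is not specified.
    target_host = "llvm" if module.enabled("llvm") else "stackvm"
    print("host modules will be compiled with: %s" % target_host)
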
17 changes: 0 additions & 17 deletions python/tvm/codegen.py
@@ -19,21 +19,4 @@ def build_module(lowered_func, target):
"""
return _Build(lowered_func, target)


def enabled(target):
"""Whether target is enabled for codegen.
Parameters
----------
target : str
The target module type.
Returns
-------
enabled : boolean
Whether the target module is enabled.
"""
return _Enabled(target)


_init_api("tvm.codegen")