[CLML] Version compatibility and various test cases #13670

Changes from commits a48b96c, 32ce3bf, a85e777, 9469b05, 6950d4b.
The first file in the diff is the CLML runtime source (class CLMLRuntime):
```diff
@@ -153,13 +153,24 @@ class CLMLRuntime : public JSONRuntimeBase {
   ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result;

   for (cl_uint i = 0; i < numVersions; ++i) {
+#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 2
     if (majorVersions[i] == 2) {
-      LOG(WARNING) << "CLML Version Selected:" << majorVersions[i] << " : " << majorVersions[i];
       h_ClmlIntf = clGetMLInterfaceV2QCOM(0);
-      ICHECK(h_ClmlIntf != NULL) << "clGetMLInterfaceV2QCOM:" << result;
+      LOG(WARNING) << "CLML Target version:" << majorVersions[i];
       break;
     }
+#endif
+#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 3
+    if (majorVersions[i] == 3) {
+      h_ClmlIntf = clGetMLInterfaceV3QCOM(0);
+      LOG(WARNING) << "CLML Target version:" << majorVersions[i];
+      break;
+    }
+#endif
   }
+  ICHECK(h_ClmlIntf != NULL)
+      << "clGetMLInterfaceVxQCOM:" << result
+      << " Perhaps there is a mismatch between the CLML SDK version and the target-supported version";

   char* tune_flag;
   if ((tune_flag = getenv("CLML_IS_TUNNING_RUN")))
     this->is_tuning_run = std::stoi(tune_flag);
```

Review comment (on the new ICHECK): Let's print the target supported version.
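One hedged way to act on that suggestion (sketch only; `BuildVersionList` is a hypothetical helper, and `majorVersions`/`numVersions` are the arrays populated by the version query above):

```cpp
#include <CL/cl.h>  // for cl_uint
#include <sstream>
#include <string>

// Hypothetical helper: format the device-reported CLML major versions so the
// failure message can state what the target actually supports.
std::string BuildVersionList(const cl_uint* major_versions, cl_uint num_versions) {
  std::ostringstream os;
  for (cl_uint i = 0; i < num_versions; ++i) {
    if (i) os << ", ";
    os << major_versions[i];
  }
  return os.str();
}
```

The ICHECK above could then report both sides of the mismatch, e.g. `<< " SDK major version " << CL_QCOM_ML_OPS_H_MAJOR_VERSION << ", device supports [" << BuildVersionList(majorVersions, numVersions) << "]"`.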
```diff
@@ -400,7 +411,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
       this->layer_.func_outs.push_back(out);
     } else if ("add" == op_name || "subtract" == op_name || "multiply" == op_name ||
-               "minimum" == op_name || "maximum" == op_name) {
+               "minimum" == op_name || "maximum" == op_name || "divide" == op_name) {
       auto out = CreateBinaryLayer(&layer_, node);
       this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
       this->layer_.func_outs.push_back(out);
@@ -523,16 +534,15 @@ class CLMLRuntime : public JSONRuntimeBase {
   }

   cl_ml_tensor_qcom DeviceMakeCLMLTensor(
-      void* pClmlIntf, cl_context context, tensor_dims_t dims,
+      cl_context context, tensor_dims_t dims,
       cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
       cl_channel_type dtype = CL_FLOAT) {
     cl_ml_tensor_qcom tensor;
     cl_int result = CL_OUT_OF_RESOURCES;

     cl_ml_tensor_desc_qcom desc = {
         dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, {0}};
-    CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast<CLMLInterfaceV2QCOM*>(pClmlIntf);
-    result = clmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &tensor);
+    result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &tensor);
     ICHECK(tensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result;
     (void)result;
     return tensor;
@@ -544,9 +554,8 @@ class CLMLRuntime : public JSONRuntimeBase {
     cl_int result = CL_OUT_OF_HOST_MEMORY;
     cl_mem buffer = NULL;

-    CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast<CLMLInterfaceV2QCOM*>(pClmlIntf);
     result =
-        clmlIntf->clGetMLTensorMemorySizeQCOM(workspace->context, pTensorMemDesc->tensor, &size);
+        h_ClmlIntf->clGetMLTensorMemorySizeQCOM(workspace->context, pTensorMemDesc->tensor, &size);
     ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result;

     buffer = clCreateBuffer(workspace->context, CL_MEM_READ_WRITE, size, NULL, &result);
@@ -612,8 +621,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);

     auto tensor_dsc = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
-    tensor_dsc->tensor =
-        DeviceMakeCLMLTensor(h_ClmlIntf, workspace->context, dims, layout, cl_dtype);
+    tensor_dsc->tensor = DeviceMakeCLMLTensor(workspace->context, dims, layout, cl_dtype);
     return tensor_dsc;
   }

@@ -901,7 +909,6 @@ class CLMLRuntime : public JSONRuntimeBase {
     auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
                                              cl_dtype);
     auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
-    auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);

     std::vector<std::string> windows = node.GetAttr<std::vector<std::string>>("pool_size");
     std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
@@ -1103,7 +1110,6 @@ class CLMLRuntime : public JSONRuntimeBase {
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
     cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
     int inputSize = input_.size();
-    int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
     auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     cl_ml_tensor_qcom* concatInputs = new cl_ml_tensor_qcom[inputSize];
     for (int i = 0; i < inputSize; i++) {
@@ -1236,6 +1242,8 @@ class CLMLRuntime : public JSONRuntimeBase {
       binary_op = CL_TENSOR_OP_SUB_QCOM;
     else if (op_name == "multiply")
       binary_op = CL_TENSOR_OP_MUL_QCOM;
+    else if (op_name == "divide")
+      binary_op = CL_TENSOR_OP_DIV_QCOM;
     else if (op_name == "minimum")
       binary_op = CL_TENSOR_OP_MIN_QCOM;
     else if (op_name == "maximum")
@@ -1260,7 +1268,12 @@ class CLMLRuntime : public JSONRuntimeBase {

   CachedLayer layer_;
   // CLML Context
+#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 2
   CLMLInterfaceV2QCOM* h_ClmlIntf = NULL;
+#endif
+#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 3
+  CLMLInterfaceV3QCOM* h_ClmlIntf = NULL;
+#endif
   cl::OpenCLWorkspace* workspace = NULL;
   cl::OpenCLThreadEntry* tentry = NULL;
   cl_ml_tuningcache_qcom tuning_cache = NULL;
```
The second file in the diff is the CLML Python test infrastructure (tests/python/contrib/clml/):
```diff
@@ -39,9 +39,9 @@ class Device:
     Configuration for CLML tests.

     Check tests/python/contrib/clml/ for the presence of an test_config.json file.
-    This file can be used to override the default configuration here which will attempt to run the Arm
-    Compute Library runtime tests locally if the runtime is available. Changing the configuration
-    will allow these runtime tests to be offloaded to a remote Arm device via a tracker for example.
+    This file can be used to override the default configuration here which will attempt to run the
+    Open CLML runtime tests locally if the runtime is available. Changing the configuration
+    will allow these runtime tests to be offloaded to a remote Snapdragon device via a tracker for example.

     Notes
     -----
@@ -101,6 +101,25 @@ def _get_remote(cls):
         return device


+def get_cpu_op_count(mod):
+    """Traverse graph counting ops offloaded to TVM."""
+
+    class Counter(tvm.relay.ExprVisitor):
+        def __init__(self):
+            super().__init__()
+            self.count = 0
+
+        def visit_call(self, call):
+            if isinstance(call.op, tvm.ir.Op):
+                self.count += 1
+
+            super().visit_call(call)
+
+    c = Counter()
+    c.visit(mod["main"])
+    return c.count
+
+
 def skip_codegen_test():
     """Skip test if it requires the CLML codegen and it's not present."""
     if not tvm.get_global_func("relay.ext.clml", True):
@@ -130,7 +149,6 @@ def build_and_run(

     try:
         libm = build_module(mod, device.target, device.target_host, params, enable_clml, tune_log)
-
         clml_modules = extract_clml_modules(libm)
         for mod in clml_modules:
             source = mod.get_source("json")
```
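For context on how the new `get_cpu_op_count` helper behaves, a hedged usage sketch (the toy module below is illustrative, not from the PR):

```python
import tvm
from tvm import relay

# A tiny Relay module: a single relu call left on the CPU side.
x = relay.var("x", shape=(1, 16))
func = relay.Function([x], relay.nn.relu(x))
mod = tvm.IRModule.from_expr(func)

# get_cpu_op_count visits main and counts Call nodes whose op is a tvm.ir.Op,
# i.e. operators that were not offloaded to a CLML partition.
print(get_cpu_op_count(mod))  # -> 1
```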
```diff
@@ -155,9 +173,9 @@ def build_and_run(
     for _ in range(no_runs):
         gen_module.run()
         out.append([gen_module.get_output(i) for i in range(outputs)])
-    time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=1)
-    cost = time_f().mean
-    print("%g secs/iteration\n" % cost)
+    # time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=1)
+    # cost = time_f().mean
+    # print("%g secs/iteration\n" % cost)
     return out
```

Review comment (on lines +176 to +178): Why did you remove it?

Reply: This affects CI time, and also to get ideal numbers we need to set […]
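If the timings are still occasionally useful, one hedged alternative (sketch; the `CLML_PROFILING` variable name is invented here, not part of the PR) is to gate the measurement behind an opt-in environment variable rather than commenting it out:

```python
import os

# Opt-in profiling: plain CI runs skip the evaluator cost, while a developer
# can export CLML_PROFILING=1 to get per-iteration timings back.
if os.getenv("CLML_PROFILING", "0") == "1":
    time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=1)
    cost = time_f().mean
    print("%g secs/iteration\n" % cost)
```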
```diff
@@ -181,16 +199,34 @@ def extract_clml_modules(module):


 def verify_codegen(
-    module,
+    mod,
     known_good_codegen,
+    device,
+    params,
     num_clml_modules=1,
     tvm_ops=0,
-    target="llvm -mtriple=aarch64-linux-gnu",
 ):
     """Check clml codegen against a known good output."""
-    module = build_module(module, target, tvm_ops=tvm_ops, clml_partitions=num_clml_modules)
-    clml_modules = extract_clml_modules(module)
+    if isinstance(mod, tvm.relay.expr.Call):
+        mod = tvm.IRModule.from_expr(mod)
+    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
+        mod = clml.partition_for_clml(mod, params)
+        tvm_op_count = get_cpu_op_count(mod)
+        assert tvm_op_count == tvm_ops, "Got {} TVM operators, expected {}".format(
+            tvm_op_count, tvm_ops
+        )
+        partition_count = 0
+        for global_var in mod.get_global_vars():
+            if "clml" in global_var.name_hint:
+                partition_count += 1
+
+        assert (
+            num_clml_modules == partition_count
+        ), "Got {} Open CLML partitions, expected {}".format(partition_count, num_clml_modules)
+    relay.backend.te_compiler.get().clear()
+
+    module = relay.build(mod, target=device.target, target_host=device.target_host, params=params)
+    clml_modules = extract_clml_modules(module)
     assert len(clml_modules) == num_clml_modules, (
         f"The number of CLML modules produced ({len(clml_modules)}) does not "
         f"match the expected value ({num_clml_modules})."
```
Review comment: Is it possible that we build TVM with a newer version of CLML than our target device supports? Maybe it is better to restrict the CLML version, as was done for OpenCL (we use OpenCL 1.2)?

Reply: For codegen, the OpenCL 1.2 restriction is fine because we don't need any features from higher versions. CLML, however, adds support for new operators in newer versions, and supporting them is essential for performance improvements on new generations of hardware.