diff --git a/benchmark/python/cast_storage.py b/benchmark/python/cast_storage.py new file mode 100644 index 000000000000..7ae537398c42 --- /dev/null +++ b/benchmark/python/cast_storage.py @@ -0,0 +1,99 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import ctypes + +from mxnet.test_utils import * +import os +import time +import argparse + +from mxnet.base import check_call, _LIB + +parser = argparse.ArgumentParser(description="Benchmark cast storage operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +def measure_cost(repeat, f, *args, **kwargs): + start = time.time() + results = [] + for i in range(repeat): + (f(*args, **kwargs)).wait_to_read() + end = time.time() + diff = end - start + return diff / repeat + + +def run_cast_storage_synthetic(): + def dense_to_sparse(m, n, density, ctx, repeat, stype): + set_default_context(ctx) + data_shape = (m, n) + dns_data = rand_ndarray(data_shape, stype, density).tostype('default') + dns_data.wait_to_read() + + # do one warm up run, verify correctness + assert same(mx.nd.cast_storage(dns_data, stype).asnumpy(), dns_data.asnumpy()) + + # start benchmarking + cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype) + results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000) + print(results) + + check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads))) + + # params + # m number of rows + # n number of columns + # density density of the matrix + # num_repeat number of benchmark runs to average over + # contexts mx.cpu(), mx.gpu() + # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA + # benchmarks dns_to_csr, dns_to_rsp + m = [ 512, 512] + n = [50000, 100000] + density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01] + num_repeat = 10 + contexts = [mx.gpu()] + benchmarks = ["dns_to_csr", "dns_to_rsp"] + + # run benchmark + for b in benchmarks: + stype = '' + print("==================================================") + if b is "dns_to_csr": + stype = 'csr' + print(" cast_storage benchmark: dense to csr, size m x n ") + elif b is "dns_to_rsp": + stype = 'row_sparse' + print(" cast_storage benchmark: dense to rsp, size m x n ") + else: + print("invalid benchmark: %s" %b) + continue + print("==================================================") + headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)') + print(headline) + for i in range(len(n)): + for ctx in contexts: + for den in density: + dense_to_sparse(m[i], n[i], den, ctx, num_repeat, stype) + print("") + print("") + + +if __name__ == "__main__": + 
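    # A minimal warm-up sketch of the operator benchmarked below (shapes and the
    # demo_* names are illustrative only; assumes an MXNet build with sparse support):
    demo_dns = mx.nd.random_uniform(shape=(4, 1000))
    demo_csr = mx.nd.cast_storage(demo_dns, 'csr')          # dense -> csr
    demo_rsp = mx.nd.cast_storage(demo_dns, 'row_sparse')   # dense -> row_sparse
    assert same(demo_csr.tostype('default').asnumpy(), demo_dns.asnumpy())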
run_cast_storage_synthetic() diff --git a/benchmark/python/dot.py b/benchmark/python/dot.py new file mode 100644 index 000000000000..4fe3bcdcd9c1 --- /dev/null +++ b/benchmark/python/dot.py @@ -0,0 +1,280 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import ctypes + +from mxnet.test_utils import * +import scipy.sparse as sp +import os +import time +import argparse + +from mxnet.base import check_call, _LIB +from util import get_data, estimate_density + +parser = argparse.ArgumentParser(description="Benchmark sparse operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +# some data information +kdda = { + 'data_mini': 'kdda.t.mini', + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216830, + 'm': 200, + 'batch_size': [64] +} + +avazu = { + 'data_mini': 'avazu-app.t.mini', + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000000, + 'm': 500, + 'batch_size': [64, 128] +} + + +def measure_cost(repeat, f, *args, **kwargs): + mx.nd.waitall() + start = time.time() + for i in range(repeat): + f(*args, **kwargs) + mx.nd.waitall() + end = time.time() + diff = end - start + return diff / repeat + + +def test_dot_real(data_dict): + def get_iter(path, data_shape, batch_size): + data_train = mx.io.LibSVMIter(data_libsvm=path, + data_shape=data_shape, + batch_size=batch_size) + data_iter = iter(data_train) + return data_iter + + data_dir = os.path.join(os.getcwd(), 'data') + + path = os.path.join(data_dir, data_dict['data_name']) + if not os.path.exists(path): + get_data( + data_dir, + data_dict['data_name'], + data_dict['url'], + data_dict['data_origin_name'] + ) + assert os.path.exists(path) + + k = data_dict['feature_dim'] + m = data_dict['m'] + density = estimate_density(path, data_dict['feature_dim']) + + mini_path = os.path.join(data_dir, data_dict['data_mini']) + if not os.path.exists(mini_path): + os.system("head -n 2000 %r > %r" % (path, mini_path)) + assert os.path.exists(mini_path) + + print "Running Benchmarking on %r data" % data_dict['data_mini'] + for batch_size in data_dict['batch_size']: # iterator through different batch size of choice + print "batch_size is %d" % batch_size + # model + data_shape = (k, ) + train_iter = get_iter(mini_path, data_shape, batch_size) + weight = mx.nd.random_uniform(low=0, high=1, shape=(k, m)) + + csr_data = [] + dns_data = [] + num_batch = 0 + for batch in train_iter: + data = train_iter.getdata() + csr_data.append(data) + 
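            # each batch from LibSVMIter arrives in CSR storage; keep the CSR batch
            # as-is and also a dense copy so the sparse and dense dot timings below
            # compare the same data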
dns_data.append(data.tostype('default')) + num_batch += 1 + bag_of_data = [csr_data, dns_data] + num_repeat = 5 + costs = [] + for d in bag_of_data: + weight.wait_to_read() + cost = 0. + count = 0 + for d_batch in d: + d_batch.wait_to_read() + cost += measure_cost(True, num_repeat, mx.nd.dot, d_batch, weight) + count += 1 + costs.append(cost/count) + t_sparse = costs[0] + t_dense = costs[1] + ratio = t_dense / t_sparse + print('density(%)\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse') + fmt = "%0.4f\t\t%d\t%d\t%d\t%0.2f\t\t\t%0.4f\t%0.6f" + print(fmt % (density * 100, batch_size, m, k, ratio, t_dense, t_sparse)) + + +def test_dot_synthetic(): + """benchmark sparse mxnet dot and scipy dot operator with matrices of given density. + `t_sparse` is the runtime of the invoked sparse dot operator in ms, while `t_dense` is the + runtime of dot(dns, dns), with the same matrices except that they are in default storage type. + """ + # Benchmark MXNet's sparse dot operator + def bench_mx_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, lhs_den, rhs_den, trans_lhs, ctx, repeat): + set_default_context(ctx) + # Create matrix instances + lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den) + rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den) + lhs_dns = lhs_nd if lhs_stype == 'default' else lhs_nd.tostype('default') + rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.tostype('default') + # One warm up run, verify correctness + out = mx.nd.dot(lhs_nd, rhs_dns, trans_lhs) + out_expected = mx.nd.dot(lhs_dns, rhs_dns, trans_lhs) + assert_almost_equal(out.asnumpy(), out_expected.asnumpy(), rtol=1e-2, atol=1e-3) + # Start benchmarking + lhs_nd.wait_to_read() + rhs_nd.wait_to_read() + sparse_cost = measure_cost(repeat, mx.nd.dot, lhs_nd, rhs_nd, trans_lhs) + dense_cost = measure_cost(repeat, mx.nd.dot, lhs_dns, rhs_dns, trans_lhs) + speedup = dense_cost / sparse_cost + # Print results + m = lhs_shape[0] + k = lhs_shape[1] + n = rhs_shape[1] + results = '{:15.1f} {:15.1f} {:>10} {:8d} {:8d} {:8d} {:13.2f} {:13.2f} {:8.2f}'.format(lhs_den*100, rhs_den*100, str(ctx), m, k, n, sparse_cost*1000, dense_cost*1000, speedup) + print(results) + + # Benchmark Scipy's sparse dot operator + def bench_sp_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, lhs_den, rhs_den, trans_lhs, ctx, repeat): + set_default_context(ctx) + assert default_context().device_type is 'cpu' + assert lhs_stype is 'csr' + assert rhs_stype is 'default' + # Create matrix instances + lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den) + rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den) + lhs_nd.wait_to_read() + rhs_nd.wait_to_read() + lhs_dns_np = np.transpose(lhs_nd.asnumpy()) if trans_lhs else lhs_nd.asnumpy() + rhs_dns_np = rhs_nd.asnumpy() + lhs_csr_sp = sp.spmatrix.transpose(sp.csr_matrix(lhs_nd.asnumpy())) if trans_lhs else sp.csr_matrix(lhs_nd.asnumpy()) + # One warm up run + out = sp.spmatrix.dot(lhs_csr_sp, rhs_dns_np) + # Start benchmarking + sparse_cost = measure_cost(repeat, sp.spmatrix.dot, lhs_csr_sp, rhs_dns_np) + dense_cost = measure_cost(repeat, np.dot, lhs_dns_np, rhs_dns_np) + speedup = dense_cost / sparse_cost + # Print results + m = lhs_shape[0] + k = lhs_shape[1] + n = rhs_shape[1] + results = '{:15.1f} {:15.1f} {:>10} {:8d} {:8d} {:8d} {:13.2f} {:13.2f} {:8.2f}'.format(lhs_den*100, rhs_den*100, str(ctx), m, k, n, sparse_cost*1000, dense_cost*1000, speedup) + print(results) + + check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads))) + # TODO(haibin): make these runtime 
options + # params + # m, n, k rows and columns of lhs and rhs matrix + # forward pass: m x k * k x n = m x n + # backward pass: (m x k)^T * m x n = k x n + # density_lhs density of the left-hand side matrix + # density_rhs density of the right-hand side matrix, if applicable + # num_repeat number of benchmark runs to average over + # context mx.cpu(), mx.gpu() + # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA + # mx_benchmarks csr_dns, csr.T_dns, csr_rsp + # sp_benchmarks csr_dns, csr.T_dns + # note: scipy benchmarks are only conducted if context is mx.cpu() + m = 512 + k = [50000, 100000] + n = [64, 128] + density_lhs = [0.64, 0.32, 0.16, 0.08, 0.04, 0.02, 0.01] + density_rhs = [0.64, 0.32, 0.16, 0.08, 0.04, 0.02, 0.01] + num_repeat = 10 + context = mx.gpu() + mx_benchmarks = ["csr_dns", "csr.T_dns", "csr_rsp"] + sp_benchmarks = ["csr_dns", "csr.T_dns"] + + headline = '{:>15} {:>15} {:>10} {:>8} {:>8} {:>8} {:>13} {:>13} {:>8}'.format('lhs_density(%)', 'rhs_density(%)', 'context', 'm', 'k', 'n', 't_sparse(ms)', 't_dense(ms)', 'speedup') + if "csr_dns" in mx_benchmarks: + print("==================================================") + print(" mxnet sparse dot benchmark: dot(csr, dns) = dns ") + print(" (matrix multiplication: m x k * k x n = m x n) ") + print("==================================================") + print(headline) + transpose_lhs = False + for i in range(len(n)): + for d_lhs in density_lhs: + bench_mx_dot((m, k[i]), (k[i], n[i]), 'csr', 'default', d_lhs, 1, transpose_lhs, context, num_repeat) + print "" + + if "csr_dns" in sp_benchmarks and mx.cpu() == context: + print("==================================================") + print(" scipy sparse dot benchmark: dot(csr, dns) = dns ") + print(" (matrix multiplication: m x k * k x n = m x n) ") + print("==================================================") + print(headline) + transpose_lhs = False + for i in range(len(n)): + for d_lhs in density_lhs: + bench_sp_dot((m, k[i]), (k[i], n[i]), 'csr', 'default', d_lhs, 1, transpose_lhs, context, num_repeat) + print "" + + if "csr.T_dns" in mx_benchmarks: + print("==================================================") + print(" mxnet sparse dot benchmark: dot(csr.T, dns) = rsp") + print("(matrix multiplication: (m x k)^T * m x n = k x n)") + print("==================================================") + print(headline) + transpose_lhs = True + for i in range(len(n)): + for d_lhs in density_lhs: + bench_mx_dot((m, k[i]), (m, n[i]), 'csr', 'default', d_lhs, 1, transpose_lhs, context, num_repeat) + print "" + + if "csr.T_dns" in sp_benchmarks and mx.cpu() == context: + print("==================================================") + print(" scipy sparse dot benchmark: dot(csr.T, dns) = dns") + print("(matrix multiplication: (m x k)^T * m x n = k x n)") + print("==================================================") + print(headline) + transpose_lhs = True + for i in range(len(n)): + for d_lhs in density_lhs: + bench_sp_dot((m, k[i]), (m, n[i]), 'csr', 'default', d_lhs, 1, transpose_lhs, context, num_repeat) + print "" + + if "csr_rsp" in mx_benchmarks: + print("==================================================") + print(" mxnet sparse dot benchmark: dot(csr, rsp) = dns ") + print(" (matrix multiplication: m x k * k x n = m x n) ") + print("==================================================") + print(headline) + transpose_lhs = False + for i in range(len(n)): + for d_lhs in density_lhs: + for d_rhs in density_rhs: + bench_mx_dot((m, k[i]), (k[i], n[i]), 
'csr', 'row_sparse', d_lhs, d_rhs, transpose_lhs, context, num_repeat) + print "" + print "" + + +if __name__ == "__main__": + test_dot_synthetic() + test_dot_real(avazu) + test_dot_real(kdda) diff --git a/benchmark/python/sparse_end2end.py b/benchmark/python/sparse_end2end.py new file mode 100644 index 000000000000..62a3b77b8482 --- /dev/null +++ b/benchmark/python/sparse_end2end.py @@ -0,0 +1,226 @@ +from mxnet.test_utils import * +import time +import argparse +import os + +parser = argparse.ArgumentParser(description="Run sparse linear regression " \ + "with distributed kvstore", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--profiler', type=int, default=0, + help='whether to use profiler') +parser.add_argument('--num-epoch', type=int, default=1, + help='number of epochs to train') +parser.add_argument('--batch-size', type=int, default=512, + help='number of examples per batch') +parser.add_argument('--num-batch', type=int, default=99999999, + help='number of batches per epoch') +parser.add_argument('--dummy-iter', type=int, default=0, + help='whether to use dummy iterator to exclude io cost') +parser.add_argument('--kvstore', type=str, default='local', + help='what kvstore to use [local, dist_sync, etc]') +parser.add_argument('--log-level', type=str, default='debug', + help='logging level [debug, info, error]') +parser.add_argument('--dataset', type=str, default='avazu', + help='what test dataset to use') +parser.add_argument('--num-gpu', type=int, default=0, + help='number of gpus to use. 0 means using cpu(0);' + 'otherwise, use gpu(0),...,gpu(num_gpu-1)') +parser.add_argument('--output-dim', type=int, default=4, + help='number of columns of the forward output') + + +def get_libsvm_data(data_dir, data_name, url, data_origin_name): + if not os.path.isdir(data_dir): + os.system("mkdir " + data_dir) + os.chdir(data_dir) + if (not os.path.exists(data_name)): + import urllib + zippath = os.path.join(data_dir, data_origin_name) + urllib.urlretrieve(url, zippath) + os.system("bzip2 -d %r" % data_origin_name) + os.chdir("..") + + +class DummyIter(mx.io.DataIter): + "A dummy iterator that always return the same batch, used for speed testing" + def __init__(self, real_iter): + super(DummyIter, self).__init__() + self.real_iter = real_iter + self.provide_data = real_iter.provide_data + self.provide_label = real_iter.provide_label + self.batch_size = real_iter.batch_size + + for batch in real_iter: + self.the_batch = batch + break + + def __iter__(self): + return self + + def next(self): + return self.the_batch + +# testing dataset sources +avazu = { + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000000, +} + +kdda = { + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216830, +} + +datasets = { 'kdda' : kdda, 'avazu' : avazu } + + +def get_sym(feature_dim): + x = mx.symbol.Variable("data", stype='csr') + norm_init = mx.initializer.Normal(sigma=0.01) + w = mx.symbol.Variable("w", shape=(feature_dim, args.output_dim), init=norm_init, stype='row_sparse') + embed = mx.symbol.dot(x, w) + y = mx.symbol.Variable("softmax_label") + model = mx.symbol.SoftmaxOutput(data=embed, label=y, name="out") + return model + + +def row_sparse_pull(kv, key, data, slices, weight_array, priority): + # if have kvstore, need to pull 
corresponding rows of + # the weights to each context + # column indices (NDArray type) of the csr data + # used as the row_idx of the weight row-sparse matrix + row_indices = data.indices + if len(slices) == 1: + kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_indices) + else: # more than one slices, multi-GPU training. Need to retain weight rows according to data slices + # TODO(junwu): + # the following line blocks, may need to pre-compute + # and cache it outside the for loop + indptr = data.indptr.asnumpy() + row_idx_array = [] + for s in slices: + row_idx_array.append(row_indices[indptr[s.start]:indptr[s.stop]]) + kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_idx_array) + + +if __name__ == '__main__': + + # arg parser + args = parser.parse_args() + num_epoch = args.num_epoch + num_batch = args.num_batch + kvstore = args.kvstore + profiler = args.profiler > 0 + batch_size = args.batch_size if args.num_gpu == 0 else args.num_gpu * args.batch_size + dummy_iter = args.dummy_iter + dataset = args.dataset + log_level = args.log_level + contexts = mx.context.cpu(0) if args.num_gpu < 1\ + else [mx.context.gpu(i) for i in range(args.num_gpu)] + + # create kvstore when there are gpus + kv = mx.kvstore.create(kvstore) if args.num_gpu >= 1 else None + rank = kv.rank if kv is not None else 0 + num_worker = kv.num_workers if kv is not None else 1 + + # only print log for rank 0 worker + import logging + if rank != 0: + log_level = logging.ERROR + elif log_level == 'DEBUG': + log_level = logging.DEBUG + else: + log_level = logging.INFO + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=log_level, format=head) + + # dataset + assert(dataset in datasets), "unknown dataset " + dataset + metadata = datasets[dataset] + feature_dim = metadata['feature_dim'] + if logging: + logging.debug('preparing data ... 
') + data_dir = os.path.join(os.getcwd(), 'data') + path = os.path.join(data_dir, metadata['data_name']) + if not os.path.exists(path): + get_libsvm_data(data_dir, metadata['data_name'], metadata['url'], + metadata['data_origin_name']) + assert os.path.exists(path) + + # data iterator + train_data = mx.io.LibSVMIter(data_libsvm=path, data_shape=(feature_dim,), + batch_size=batch_size, num_parts=num_worker, + part_index=rank) + if dummy_iter: + train_data = DummyIter(train_data) + + # model + model = get_sym(feature_dim) + + # module + mod = mx.mod.Module(symbol=model, data_names=['data'], + label_names=['softmax_label'], context=contexts) + mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label) + mod.init_params(initializer=mx.init.Uniform(scale=.1)) + sgd = mx.optimizer.SGD(momentum=0.0, clip_gradient=5.0, + learning_rate=0.1, rescale_grad=1.0/batch_size/num_worker) + mod.init_optimizer(optimizer=sgd, kvstore=kv) + # use accuracy as the metric + metric = mx.metric.create('acc') + + index = mod._exec_group.param_names.index('w') + # weight_array bound to executors of the contexts + weight_array = mod._exec_group.param_arrays[index] + + # start profiler + if profiler: + device = 'cpu' + if args.num_gpu > 0: + device = 'gpu' + str(args.num_gpu) + name = 'profile_' + args.dataset + '_' + device + '_nworker' + str(num_worker)\ + + '_batchsize' + str(args.batch_size) + '_outdim' + str(args.output_dim) + '.json' + mx.profiler.profiler_set_config(mode='all', filename=name) + mx.profiler.profiler_set_state('run') + + logging.debug('start training ...') + start = time.time() + data_iter = iter(train_data) + for epoch in range(num_epoch): + nbatch = 0 + end_of_batch = False + data_iter.reset() + metric.reset() + next_batch = next(data_iter) + if kv is not None: + row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index) + while not end_of_batch: + nbatch += 1 + batch = next_batch + + mod.forward_backward(batch) + # update parameters + mod.update() + + try: + # pre fetch next batch + next_batch = next(data_iter) + if nbatch == num_batch: + raise StopIteration + if kv is not None: + row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index) + except StopIteration: + end_of_batch = True + # accumulate prediction accuracy + mod.update_metric(metric, batch.label) + logging.info('epoch %d, %s' % (epoch, metric.get())) + if epoch == 0: + print "num_batches = ", nbatch + if profiler: + mx.profiler.profiler_set_state('stop') + end = time.time() + time_cost = end - start + logging.info('num_worker = ' + str(num_worker) + ', time cost = ' + str(time_cost)) diff --git a/benchmark/python/sparse_op.py b/benchmark/python/sparse_op.py new file mode 100644 index 000000000000..0683aa84eacb --- /dev/null +++ b/benchmark/python/sparse_op.py @@ -0,0 +1,245 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import ctypes + +from mxnet.test_utils import * +import scipy.sparse as sp +import os +import time +import argparse + +from mxnet.base import check_call, _LIB +from util import get_data, estimate_density + +parser = argparse.ArgumentParser(description="Benchmark sparse operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +# some data information +kdda = { + 'data_mini': 'kdda.t.mini', + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216830, + 'm': 200, + 'batch_size': [64] +} + +avazu = { + 'data_mini': 'avazu-app.t.mini', + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000000, + 'm': 500, + 'batch_size': [64, 128] +} + + +def measure_cost(repeat, f, *args, **kwargs): + # start bench + start = time.time() + results = [] + for i in range(repeat): + results.append(f(*args, **kwargs)) + for result in results: + result.wait_to_read() + end = time.time() + diff = end - start + return diff / repeat + + +def test_dot_real(data_dict): + def get_iter(path, data_shape, batch_size): + data_train = mx.io.LibSVMIter(data_libsvm=path, + data_shape=data_shape, + batch_size=batch_size) + data_iter = iter(data_train) + return data_iter + + data_dir = os.path.join(os.getcwd(), 'data') + + path = os.path.join(data_dir, data_dict['data_name']) + if not os.path.exists(path): + get_data( + data_dir, + data_dict['data_name'], + data_dict['url'], + data_dict['data_origin_name'] + ) + assert os.path.exists(path) + + k = data_dict['feature_dim'] + m = data_dict['m'] + density = estimate_density(path, data_dict['feature_dim']) + + mini_path = os.path.join(data_dir, data_dict['data_mini']) + if not os.path.exists(mini_path): + os.system("head -n 2000 %r > %r" % (path, mini_path)) + assert os.path.exists(mini_path) + + print "Running Benchmarking on %r data" % data_dict['data_mini'] + for batch_size in data_dict['batch_size']: # iterator through different batch size of choice + print "batch_size is %d" % batch_size + # model + data_shape = (k, ) + train_iter = get_iter(mini_path, data_shape, batch_size) + weight = mx.nd.random_uniform(low=0, high=1, shape=(k, m)) + + csr_data = [] + dns_data = [] + num_batch = 0 + for batch in train_iter: + data = train_iter.getdata() + csr_data.append(data) + dns_data.append(data.tostype('default')) + num_batch += 1 + bag_of_data = [csr_data, dns_data] + num_repeat = 5 + costs = [] + for d in bag_of_data: + weight.wait_to_read() + cost = 0. 
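            # accumulate the averaged dot cost over every batch for this storage type;
            # bag_of_data is [csr_data, dns_data], so costs[0] is the sparse timing
            # and costs[1] the dense baseline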
+ count = 0 + for d_batch in d: + d_batch.wait_to_read() + cost += measure_cost(num_repeat, mx.nd.dot, d_batch, weight) + count += 1 + costs.append(cost/count) + t_sparse = costs[0] + t_dense = costs[1] + ratio = t_dense / t_sparse + print('density(%)\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse') + fmt = "%0.4f\t\t%d\t%d\t%d\t%0.2f\t\t\t%0.4f\t%0.6f" + print(fmt % (density * 100, batch_size, m, k, ratio, t_dense, t_sparse)) + + +def test_dot_synthetic(): + """benchmark mx.nd.dot(sparse_ndarray, dense_ndarray) with given density. + `t_sparse` is the time cost of dot(csr, dns), while `t_dense` is the time cost + of dot(dns, dns), with the same matrix except that it is in default storage type. + """ + def measure_cost_forward_baseline(repeat, dot, lhs, rhs): + start = time.time() + for i in range(repeat): + dot(lhs, rhs) + end = time.time() + diff = end - start + return diff / repeat + + def measure_cost_backward_baseline(repeat, dot, transpose, lhs, rhs): + start = time.time() + for i in range(repeat): + dot(transpose(lhs), rhs) + end = time.time() + diff = end - start + return diff / repeat + + def bench_dot_forward(m, k, n, density, ctx, repeat): + set_default_context(ctx) + dns = mx.nd.random_uniform(shape=(k, n)).copyto(ctx) + data_shape = (m, k) + csr_data = rand_ndarray(data_shape, 'csr', density) + dns_data = csr_data.tostype('default') + rhs_dns_np = dns.asnumpy() + lhs_csr_sp = sp.csr_matrix(dns_data.asnumpy()) # csr in scipy + lhs_dns_np = lhs_csr_sp.tostype('default') + + data = [dns_data, csr_data] + costs = [] + for d in data: + dns.wait_to_read() + d.wait_to_read() + cost = measure_cost(repeat, mx.nd.dot, d, dns) + costs.append(cost) + ratio = costs[0] / costs[1] + + costs_baseline = [] + cost = measure_cost_forward_baseline(repeat, np.dot, lhs_dns_np, rhs_dns_np) + costs_baseline.append(cost) + cost = measure_cost_forward_baseline(repeat, sp.spmatrix.dot, lhs_csr_sp, rhs_dns_np) + costs_baseline.append(cost) + ratio_baseline = costs_baseline[0] / costs_baseline[1] + fmt = "%0.1f\t\t%s\t%d\t%d\t%d\t%0.2f\t\t\t%0.2f\t%0.5f\t\t%0.2f\t\t\t\t%0.6f\t%0.5f" + print(fmt % (density * 100, str(ctx), n, m, k, ratio, costs[0], costs[1], + ratio_baseline, costs_baseline[0], costs_baseline[1])) + + def bench_dot_backward(m, k, n, density, ctx, repeat): + set_default_context(ctx) + dns = mx.nd.random_uniform(shape=(m, n)).copyto(ctx) + data_shape = (m, k) + csr_data = rand_ndarray(data_shape, 'csr', density) + dns_data = csr_data.tostype('default') + rhs_dns_np = dns.asnumpy() + lhs_csr_sp = sp.csr_matrix(dns_data.asnumpy()) + lhs_dns_np = lhs_csr_sp.tostype('default') + + data = [dns_data, csr_data] + costs = [] + for d in data: + dns.wait_to_read() + d.wait_to_read() + cost = measure_cost(repeat, mx.nd.dot, d, dns, transpose_a=True) + costs.append(cost) + ratio = costs[0] / costs[1] + + costs_baseline = [] + cost = measure_cost_backward_baseline(repeat, np.dot, np.transpose, lhs_dns_np, rhs_dns_np) + costs_baseline.append(cost) + cost = measure_cost_backward_baseline(repeat, sp.spmatrix.dot, sp.spmatrix.transpose, lhs_csr_sp, rhs_dns_np) + costs_baseline.append(cost) + ratio_baseline = costs_baseline[0] / costs_baseline[1] + fmt = "%0.1f\t\t%s\t%d\t%d\t%d\t%0.2f\t\t\t%0.2f\t%0.5f\t\t%0.2f\t\t\t\t%0.6f\t%0.5f" + print(fmt % (density * 100, str(ctx), n, m, k, ratio, costs[0], costs[1], + ratio_baseline, costs_baseline[0], costs_baseline[1])) + + print("A = sparse NDArray of shape(m, k)") + print("B = dense NDArray of shape(k, n)") + print("dot_forward\tdot(csr, dns)") + 
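    # the two access patterns timed below, in isolation (a sketch; A is the CSR
    # matrix of shape (m, k), B the dense matrix):
    #   forward:  mx.nd.dot(A, B)                     # (m, k) * (k, n) -> (m, n)
    #   backward: mx.nd.dot(A, B, transpose_a=True)   # (m, k)^T * (m, n) -> (k, n)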
print('density(%)\tcontext\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse' + '\tt_scipy_dense/t_scipy_sparse\tt_scipy_dense\tt_scipy_sparse') + + check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads))) + # TODO(haibin) make these runtime options + m = 512 + k = [50000, 100000] + n = [64, 128] + density = [1.00, 0.90, 0.70, 0.50, 0.30, 0.20, 0.10, 0.07, 0.05, 0.02, 0.01, 0.005, 0.001] + num_repeat = 10 + # contexts = [mx.cpu(), mx.gpu(0)] + contexts = [mx.cpu()] + for i in range(2): + for ctx in contexts: + for den in density: + bench_dot_forward(m, k[i], n[i], den, ctx, num_repeat) + + print("dot_backward\tdot(csr.T, dns)") + print('density(%)\tcontext\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse' + '\tt_scipy_dense/t_scipy_sparse\tt_scipy_dense\tt_scipy_sparse') + for i in range(2): + for ctx in contexts: + for den in density: + bench_dot_backward(m, k[i], n[i], den, ctx, num_repeat) + + +if __name__ == "__main__": + test_dot_real(avazu) + test_dot_real(kdda) + test_dot_synthetic() diff --git a/benchmark/python/util.py b/benchmark/python/util.py new file mode 100644 index 000000000000..947ff4a65037 --- /dev/null +++ b/benchmark/python/util.py @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import random + + +def get_data(data_dir, data_name, url, data_origin_name): + if not os.path.isdir(data_dir): + os.system("mkdir " + data_dir) + os.chdir(data_dir) + if (not os.path.exists(data_name)): + import urllib + zippath = os.path.join(data_dir, data_origin_name) + urllib.urlretrieve(url, zippath) + os.system("bzip2 -d %r" % data_origin_name) + os.chdir("..") + + +def estimate_density(DATA_PATH, feature_size): + """sample 10 times of a size of 1000 for estimating the density of the sparse dataset""" + if not os.path.exists(DATA_PATH): + raise Exception("Data is not there!") + density = [] + P = 0.01 + for _ in xrange(10): + num_non_zero = 0 + num_sample = 0 + with open(DATA_PATH) as f: + for line in f: + if (random.random() < P): + num_non_zero += len(line.split(" ")) - 1 + num_sample += 1 + density.append(num_non_zero * 1.0 / (feature_size * num_sample)) + return sum(density) / len(density) + diff --git a/docs/api/python/ndarray.md b/docs/api/python/ndarray.md index 5e9f7e1a1184..dc0e65dd0062 100644 --- a/docs/api/python/ndarray.md +++ b/docs/api/python/ndarray.md @@ -64,9 +64,21 @@ A detailed tutorial is available at ``` In the rest of this document, we first overview the methods provided by the -`ndarray.NDArray` class, and then list other routines provided by the -`ndarray` package. +`ndarray.NDArray` class and its subclasses, and then list other routines +provided by the `ndarray` package. +The `ndarray` package provides several classes: + +```eval_rst +.. 
autosummary:: + :nosignatures: + + NDArray + CSRNDArray + RowSparseNDArray +``` + +We summarize the interface for each class in the following sections. ## The `NDArray` class @@ -80,6 +92,7 @@ In the rest of this document, we first overview the methods provided by the NDArray.size NDArray.context NDArray.dtype + NDArray.stype ``` ### Array conversion @@ -94,6 +107,7 @@ In the rest of this document, we first overview the methods provided by the NDArray.asnumpy NDArray.asscalar NDArray.astype + NDArray.tostype ``` ### Array change shape @@ -171,6 +185,35 @@ In the rest of this document, we first overview the methods provided by the NDArray.wait_to_read ``` +## The `RowSparseNDArray` Class + +```eval_rst +.. autosummary:: + :nosignatures: + + RowSparseNDArray.copyto + RowSparseNDArray.tostype + RowSparseNDArray.__setitem__ + RowSparseNDArray.__getitem__ + RowSparseNDArray.data + RowSparseNDArray.indices +``` + +## The `CSRNDArray` Class + +```eval_rst +.. autosummary:: + :nosignatures: + + CSRNDArray.copyto + CSRNDArray.tostype + CSRNDArray.__setitem__ + CSRNDArray.__getitem__ + CSRNDArray.data + CSRNDArray.indices + CSRNDArray.indptr +``` + ## Array creation routines ```eval_rst @@ -499,8 +542,24 @@ The `contrib.ndarray` module contains many useful experimental APIs for new feat ```eval_rst + +.. autoclass:: mxnet.ndarray.NDArray + :members: + :special-members: + +.. autoclass:: mxnet.ndarray.CSRNDArray + :members: + :special-members: + +.. autoclass:: mxnet.ndarray.RowSparseNDArray + :members: + :special-members: + .. automodule:: mxnet.ndarray :members: + :imported-members: + :special-members: + :exclude-members: CachedOp, BaseSparseNDArray, NDArray, CSRNDArray, RowSparseNDArray .. automodule:: mxnet.random :members: diff --git a/example/sparse/get_data.py b/example/sparse/get_data.py new file mode 100644 index 000000000000..578cf2ce5226 --- /dev/null +++ b/example/sparse/get_data.py @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file +import os, gzip +import pickle as pickle +import sys + +def get_libsvm_data(data_dir, data_name, url, data_origin_name): + if not os.path.isdir(data_dir): + os.mkdir(data_dir) + os.chdir(data_dir) + if (not os.path.exists(data_name)): + import urllib + zippath = os.path.join(data_dir, data_origin_name) + urllib.urlretrieve(url, zippath) + os.system("bzip2 -d %r" % data_origin_name) + os.chdir("..") diff --git a/example/sparse/linear_classification.py b/example/sparse/linear_classification.py new file mode 100644 index 000000000000..567568c6eb80 --- /dev/null +++ b/example/sparse/linear_classification.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet.test_utils import * +from get_data import get_libsvm_data +import time +import argparse +import os + +parser = argparse.ArgumentParser(description="Run sparse linear classification " \ + "with distributed kvstore", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--profiler', type=int, default=0, + help='whether to use profiler') +parser.add_argument('--num-epoch', type=int, default=1, + help='number of epochs to train') +parser.add_argument('--batch-size', type=int, default=8192, + help='number of examples per batch') +parser.add_argument('--num-batch', type=int, default=99999999, + help='number of batches per epoch') +parser.add_argument('--dummy-iter', type=int, default=0, + help='whether to use dummy iterator to exclude io cost') +parser.add_argument('--kvstore', type=str, default='dist_sync', + help='what kvstore to use [local, dist_sync, etc]') +parser.add_argument('--log-level', type=str, default='DEBUG', + help='logging level [debug, info, error]') +parser.add_argument('--dataset', type=str, default='avazu', + help='what test dataset to use') + +class DummyIter(mx.io.DataIter): + "A dummy iterator that always return the same batch, used for speed testing" + def __init__(self, real_iter): + super(DummyIter, self).__init__() + self.real_iter = real_iter + self.provide_data = real_iter.provide_data + self.provide_label = real_iter.provide_label + self.batch_size = real_iter.batch_size + + for batch in real_iter: + self.the_batch = batch + break + + def __iter__(self): + return self + + def next(self): + return self.the_batch + +# testing dataset sources +avazu = { + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000000, +} + +kdda = { + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216830, +} + +datasets = { 'kdda' : kdda, 'avazu' : avazu } + +def linear_model(feature_dim): + x = mx.symbol.Variable("data", stype='csr') + norm_init = mx.initializer.Normal(sigma=0.01) + weight = mx.symbol.Variable("weight", shape=(feature_dim, 1), init=norm_init, stype='row_sparse') + bias = mx.symbol.Variable("bias", shape=(1,), init=norm_init) + dot = mx.symbol.dot(x, weight) + pred = mx.symbol.broadcast_add(dot, bias) + y = mx.symbol.Variable("softmax_label") + model = mx.symbol.SoftmaxOutput(data=pred, label=y, name="out") + return model + +if __name__ == '__main__': + # arg parser + args = parser.parse_args() + num_epoch = args.num_epoch + num_batch = args.num_batch + kvstore = args.kvstore + profiler = args.profiler > 0 + batch_size = args.batch_size + dummy_iter = args.dummy_iter + dataset = args.dataset + 
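    # Note on the sparse update path used further down: each iteration pulls only
    # the weight rows touched by the current CSR batch, i.e. row_ids are taken from
    # batch.data[0].indices and passed to kv.row_sparse_pull('weight', ...), so the
    # dense (feature_dim, 1) weight is never pulled in full on the worker.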
log_level = args.log_level + + # create kvstore + kv = mx.kvstore.create(kvstore) + rank = kv.rank + num_worker = kv.num_workers + + # only print log for rank 0 worker + import logging + if rank != 0: + log_level = logging.ERROR + elif log_level == 'DEBUG': + log_level = logging.DEBUG + else: + log_level = logging.INFO + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=log_level, format=head) + + # dataset + assert(dataset in datasets), "unknown dataset " + dataset + metadata = datasets[dataset] + feature_dim = metadata['feature_dim'] + if logging: + logging.debug('preparing data ... ') + data_dir = os.path.join(os.getcwd(), 'data') + path = os.path.join(data_dir, metadata['data_name']) + if not os.path.exists(path): + get_libsvm_data(data_dir, metadata['data_name'], metadata['url'], + metadata['data_origin_name']) + assert os.path.exists(path) + + # data iterator + train_data = mx.io.LibSVMIter(data_libsvm=path, data_shape=(feature_dim,), + batch_size=batch_size, num_parts=num_worker, + part_index=rank) + if dummy_iter: + train_data = DummyIter(train_data) + + # model + model = linear_model(feature_dim) + + # module + mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['softmax_label']) + mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label) + mod.init_params(initializer=mx.init.Uniform(scale=.1)) + sgd = mx.optimizer.SGD(momentum=0.0, clip_gradient=5.0, + learning_rate=0.1, rescale_grad=1.0/batch_size/num_worker) + mod.init_optimizer(optimizer=sgd, kvstore=kv) + # use accuracy as the metric + metric = mx.metric.create('Accuracy') + + # start profiler + if profiler: + name = 'profile_output_' + str(num_worker) + '.json' + mx.profiler.profiler_set_config(mode='all', filename=name) + mx.profiler.profiler_set_state('run') + + logging.debug('start training ...') + start = time.time() + data_iter = iter(train_data) + for epoch in range(num_epoch): + nbatch = 0 + data_iter.reset() + metric.reset() + for batch in data_iter: + nbatch += 1 + row_ids = batch.data[0].indices + # pull sparse weight + index = mod._exec_group.param_names.index('weight') + kv.row_sparse_pull('weight', mod._exec_group.param_arrays[index], + priority=-index, row_ids=[row_ids]) + mod.forward_backward(batch) + # update parameters + mod.update() + # accumulate prediction accuracy + mod.update_metric(metric, batch.label) + if nbatch == num_batch: + break + logging.info('epoch %d, %s' % (epoch, metric.get())) + if profiler: + mx.profiler.profiler_set_state('stop') + end = time.time() + time_cost = end - start + logging.info('num_worker = ' + str(num_worker) + ', time cost = ' + str(time_cost)) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 2289354e8a5e..a43f73fe45ab 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -276,6 +276,38 @@ MXNET_DLL int MXNDArrayCreateEx(const mx_uint *shape, int delay_alloc, int dtype, NDArrayHandle *out); + + +/*! 
+ * \brief create an empty sparse NDArray with specified shape and data type + * \param storage_type the storage type of the ndarray + * \param shape the pointer to the shape + * \param ndim the dimension of the shape + * \param dev_type device type, specify device we want to take + * \param dev_id the device id of the specific device + * \param delay_alloc whether to delay allocation until + * the narray is first mutated + * \param dtype data type of created array + * \param num_aux the number of aux data to support this ndarray + * \param aux_type data type of the aux data for the created array + * \param aux_ndims the dimension of the shapes of aux data + * \param aux_shape the shapes of aux data + * \param out the returning handle + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXNDArrayCreateSparseEx(int storage_type, + const mx_uint *shape, + mx_uint ndim, + int dev_type, + int dev_id, + int delay_alloc, + int dtype, + mx_uint num_aux, + int *aux_type, + mx_uint *aux_ndims, + const mx_uint *aux_shape, + NDArrayHandle *out); + /*! * \brief create a NDArray handle that is loaded from raw bytes. * \param buf the head of the raw bytes @@ -350,6 +382,17 @@ MXNET_DLL int MXNDArraySyncCopyFromCPU(NDArrayHandle handle, MXNET_DLL int MXNDArraySyncCopyToCPU(NDArrayHandle handle, void *data, size_t size); +/*! + * \brief Copy src.data() to dst.data() if i = -1, else dst.aux_data(i) if i >= 0 + * This function blocks. Do not use it in performance critical code. + * \param handle_dst handle of a dst ndarray whose data/aux_data has been allocated + * \param handle_src handle of a src ndarray which has default storage type + * \param i dst data blob indicator + */ +MXNET_DLL int MXNDArraySyncCopyFromNDArray(NDArrayHandle handle_dst, + const NDArrayHandle handle_src, + const int i); + /*! * \brief Wait until all the pending writes with respect NDArray are finished. * Always call this before read data out synchronizely. @@ -388,6 +431,7 @@ MXNET_DLL int MXNDArraySlice(NDArrayHandle handle, mx_uint slice_begin, mx_uint slice_end, NDArrayHandle *out); + /*! * \brief Index the NDArray along axis 0. * \param handle the handle to the NDArray @@ -398,6 +442,13 @@ MXNET_DLL int MXNDArraySlice(NDArrayHandle handle, MXNET_DLL int MXNDArrayAt(NDArrayHandle handle, mx_uint idx, NDArrayHandle *out); + +/*! + * \brief get the storage type of the array + */ +MXNET_DLL int MXNDArrayGetStorageType(NDArrayHandle handle, + int *out_storage_type); + /*! * \brief Reshape the NDArray. * \param handle the handle to the narray @@ -436,6 +487,34 @@ MXNET_DLL int MXNDArrayGetData(NDArrayHandle handle, */ MXNET_DLL int MXNDArrayGetDType(NDArrayHandle handle, int *out_dtype); + +/*! + * \brief get the type of the ith aux data in NDArray + * \param handle the handle to the narray + * \param i the index of the aux data + * \param out_type pointer holder to get type of aux data + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXNDArrayGetAuxType(NDArrayHandle handle, + mx_uint i, + int *out_type); + +/*! + * \brief Get a deep copy of the ith aux data blob + * in the form of an NDArray of default storage type. + * This function blocks. Do not use it in performance critical code. + */ +MXNET_DLL int MXNDArrayGetAuxNDArray(NDArrayHandle handle, + mx_uint i, + NDArrayHandle *out); + +/*! + * \brief Get a deep copy of the data blob + * in the form of an NDArray of default storage type. + * This function blocks. Do not use it in performance critical code. 
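 + * \param handle the handle to the source NDArray
 + * \param out pointer holder to the returned dense NDArray handle
 + * \return 0 when success, -1 when failure happens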
+ */ +MXNET_DLL int MXNDArrayGetDataNDArray(NDArrayHandle handle, + NDArrayHandle *out); /*! * \brief get the context of the NDArray * \param handle the handle to the narray @@ -581,6 +660,28 @@ MXNET_DLL int MXImperativeInvoke(AtomicSymbolCreator creator, int num_params, const char **param_keys, const char **param_vals); +/*! + * \brief invoke a nnvm op and imperative function + * \param creator the op + * \param num_inputs number of input NDArrays + * \param inputs input NDArrays + * \param num_outputs number of output NDArrays + * \param outputs output NDArrays + * \param num_params number of keyword parameters + * \param param_keys keys for keyword parameters + * \param param_vals values for keyword parameters + * \param out_stypes output ndarrays' stypes + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXImperativeInvokeEx(AtomicSymbolCreator creator, + int num_inputs, + NDArrayHandle *inputs, + int *num_outputs, + NDArrayHandle **outputs, + int num_params, + const char **param_keys, + const char **param_vals, + const int **out_stypes); /*! * \brief set whether to record operator for autograd * \param is_recording 1 when recording, 0 when not recording. @@ -666,6 +767,30 @@ MXNET_DLL int MXCreateCachedOp(SymbolHandle handle, * \brief free cached operator */ MXNET_DLL int MXFreeCachedOp(CachedOpHandle handle); +/*! + * \brief invoke cached operator + */ +MXNET_DLL int MXInvokeCachedOp(CachedOpHandle handle, + int num_inputs, + NDArrayHandle *inputs, + int *num_outputs, + NDArrayHandle **outputs); +/*! + * \brief invoke a cached op + * \param handle the handle to the cached op + * \param num_inputs number of input NDArrays + * \param inputs input NDArrays + * \param num_outputs number of output NDArrays + * \param outputs output NDArrays + * \param out_stypes output ndarrays' stypes + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXInvokeCachedOpEx(CachedOpHandle handle, + int num_inputs, + NDArrayHandle *inputs, + int *num_outputs, + NDArrayHandle **outputs, + const int** out_stypes); /*! * \brief invoke cached operator */ @@ -1017,20 +1142,20 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXSymbolInferShapePartial(SymbolHandle sym, - mx_uint num_args, - const char** keys, - const mx_uint *arg_ind_ptr, - const mx_uint *arg_shape_data, - mx_uint *in_shape_size, - const mx_uint **in_shape_ndim, - const mx_uint ***in_shape_data, - mx_uint *out_shape_size, - const mx_uint **out_shape_ndim, - const mx_uint ***out_shape_data, - mx_uint *aux_shape_size, - const mx_uint **aux_shape_ndim, - const mx_uint ***aux_shape_data, - int *complete); + mx_uint num_args, + const char** keys, + const mx_uint *arg_ind_ptr, + const mx_uint *arg_shape_data, + mx_uint *in_shape_size, + const mx_uint **in_shape_ndim, + const mx_uint ***in_shape_data, + mx_uint *out_shape_size, + const mx_uint **out_shape_ndim, + const mx_uint ***out_shape_data, + mx_uint *aux_shape_size, + const mx_uint **aux_shape_ndim, + const mx_uint ***aux_shape_data, + int *complete); /*! * \brief infer type of unknown input types given the known one. 
@@ -1061,6 +1186,10 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym, mx_uint *aux_type_size, const int **aux_type_data, int *complete); + + + + //-------------------------------------------- // Part 4: Executor interface //-------------------------------------------- @@ -1222,36 +1351,39 @@ MXNET_DLL int MXExecutorBindEX(SymbolHandle symbol_handle, ExecutorHandle *out); MXNET_DLL int MXExecutorSimpleBind(SymbolHandle symbol_handle, - int dev_type, - int dev_id, - const mx_uint num_g2c_keys, - const char** g2c_keys, - const int* g2c_dev_types, - const int* g2c_dev_ids, - const mx_uint provided_grad_req_list_len, - const char** provided_grad_req_names, - const char** provided_grad_req_types, - const mx_uint num_provided_arg_shapes, - const char** provided_arg_shape_names, - const mx_uint* provided_arg_shape_data, - const mx_uint* provided_arg_shape_idx, - const mx_uint num_provided_arg_dtypes, - const char** provided_arg_dtype_names, - const int* provided_arg_dtypes, - const mx_uint num_shared_arg_names, - const char** shared_arg_name_list, - int* shared_buffer_len, - const char** shared_buffer_name_list, - NDArrayHandle* shared_buffer_handle_list, - const char*** updated_shared_buffer_name_list, - NDArrayHandle** updated_shared_buffer_handle_list, - mx_uint* num_in_args, - NDArrayHandle** in_args, - NDArrayHandle** arg_grads, - mx_uint* num_aux_states, - NDArrayHandle** aux_states, - ExecutorHandle shared_exec_handle, - ExecutorHandle* out); + int dev_type, + int dev_id, + const mx_uint num_g2c_keys, + const char** g2c_keys, + const int* g2c_dev_types, + const int* g2c_dev_ids, + const mx_uint provided_grad_req_list_len, + const char** provided_grad_req_names, + const char** provided_grad_req_types, + const mx_uint num_provided_arg_shapes, + const char** provided_arg_shape_names, + const mx_uint* provided_arg_shape_data, + const mx_uint* provided_arg_shape_idx, + const mx_uint num_provided_arg_dtypes, + const char** provided_arg_dtype_names, + const int* provided_arg_dtypes, + const mx_uint num_provided_arg_stypes, + const char** provided_arg_stype_names, + const int* provided_arg_stypes, + const mx_uint num_shared_arg_names, + const char** shared_arg_name_list, + int* shared_buffer_len, + const char** shared_buffer_name_list, + NDArrayHandle* shared_buffer_handle_list, + const char*** updated_shared_buffer_name_list, + NDArrayHandle** updated_shared_buffer_handle_list, + mx_uint* num_in_args, + NDArrayHandle** in_args, + NDArrayHandle** arg_grads, + mx_uint* num_aux_states, + NDArrayHandle** aux_states, + ExecutorHandle shared_exec_handle, + ExecutorHandle* out); /*! * \brief set a call back to notify the completion of operation */ @@ -1468,6 +1600,26 @@ MXNET_DLL int MXKVStorePullEx(KVStoreHandle handle, const char** keys, NDArrayHandle* vals, int priority); + +/*! + * \brief pull a list of (key, value) pairs from the kvstore, where each key is a string. + * The NDArray pulled back will be in row_sparse storage with only the specified + * row_ids present based row_ids (others rows are zeros). + * \param handle handle to the kvstore + * \param num the number of key-value pairs + * \param keys the list of keys + * \param vals the list of values + * \param row_ids the list of row_id NDArrays + * \param priority the priority of the action + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXKVStorePullRowSparse(KVStoreHandle handle, + mx_uint num, + const char** keys, + NDArrayHandle* vals, + const NDArrayHandle* row_ids, + int priority); + /*! 
* \brief user-defined updater for the kvstore * It's this updater's responsibility to delete \a recv and \a local diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h index a74d3b07b5be..85d34778dd8c 100644 --- a/include/mxnet/executor.h +++ b/include/mxnet/executor.h @@ -133,6 +133,7 @@ class Executor { const std::vector& aux_state_ctxes, const std::unordered_map& arg_shape_map, const std::unordered_map& arg_dtype_map, + const std::unordered_map& arg_stype_map, const std::vector& grad_req_types, const std::unordered_set& param_names, std::vector* in_args, diff --git a/include/mxnet/graph_attr_types.h b/include/mxnet/graph_attr_types.h new file mode 100644 index 000000000000..3aba0119d8ca --- /dev/null +++ b/include/mxnet/graph_attr_types.h @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file graph_attr_types.h + * \brief Data structures that can appear in graph attributes. + */ +#ifndef MXNET_GRAPH_ATTR_TYPES_H_ +#define MXNET_GRAPH_ATTR_TYPES_H_ + +#include + +namespace mxnet { + +/*! + * \brief The result holder of storage type of each NodeEntry in the graph. + * \note Stored under graph.attrs["storage_type"], provided by Pass "InferStorageType" + * + * \code + * Graph g = ApplyPass(src_graph, "InferStorageType"); + * const StorageVector& stypes = g.GetAttr("storage_type"); + * // get shape by entry id + * int entry_type = stypes[g.indexed_graph().entry_id(my_entry)]; + * \endcode + * + * \sa FInferStorageType + */ +using StorageTypeVector = std::vector; + +} // namespace mxnet + +#endif // MXNET_GRAPH_ATTR_TYPES_H_ diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index d2924ecea1b5..9ea63b4cec79 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -25,6 +25,7 @@ #define MXNET_KVSTORE_H_ #include #include +#include #include #include #include @@ -173,6 +174,29 @@ class KVStore { const std::vector& values, int priority = 0) = 0; + /*! + * \brief pull a list of key-value pairs from the store. + * The NDArray pulled back will be in row_sparse storage with only the + * specified row_ids present (others rows are zeros). + * \param keys the list of keys + * \param values the list of buffers - row_id pairs + * \param priority the priority of the action. + */ + virtual void PullRowSparse(const std::vector& str_keys, + const std::vector>& val_rowids, + const int priority = 0) = 0; + + /*! + * \brief pull a list of key-value pairs from the store, where each key is a string. + * The NDArray pulled back will be in row_sparse storage with only the + * specified row_ids present (others rows are zeros). + * \param keys the list of keys in string format + * \param values the list of buffers - row_id pairs + * \param priority the priority of the action. 
+ */ + virtual void PullRowSparse(const std::vector& str_keys, + const std::vector>& val_rowids, + const int priority = 0) = 0; /** * \brief the prototype of user-defined updater diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index d7dff4098b27..56e36dffbf27 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -47,7 +47,6 @@ namespace mxnet { -// forward declaration namespace autograd { class AGNode; @@ -71,6 +70,23 @@ class AGNodeEntry { class AutogradRuntime; } // namespace autograd +// enum for storage types +namespace csr { +enum CSRAuxType {kIndPtr, kIdx}; +} + +namespace rowsparse { +enum RowSparseAuxType {kIdx}; +} + +enum NDArrayStorageType { + kUndefinedStorage = -1, // undefined storage + kDefaultStorage, // dense + kRowSparseStorage, // row sparse + kCSRStorage, // csr +}; + + /*! * \brief ndarray interface */ @@ -91,10 +107,55 @@ class NDArray { */ NDArray(const TShape &shape, Context ctx, bool delay_alloc = false, int dtype = mshadow::default_type_flag) - : ptr_(std::make_shared(shape.Size(), ctx, delay_alloc, dtype)), + : ptr_(std::make_shared(shape, ctx, delay_alloc, dtype)), shape_(shape), dtype_(dtype), entry_({nullptr, 0, 0}) { #if MKL_EXPERIMENTAL == 1 Mkl_mem_ = std::make_shared(); +#endif + } + /*! \brief constructor for NDArray with storage type + */ + NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx, + bool delay_alloc = true, int dtype = mshadow::default_type_flag, + std::vector aux_types = {}, std::vector aux_shapes = {}, + TShape storage_shape = TShape(mshadow::Shape1(0))) + : shape_(shape), dtype_(dtype), entry_({nullptr, 0, 0}) { + // Assign default aux types if not given + if (aux_types.size() == 0) { + if (stype == kRowSparseStorage) { + aux_types = {mshadow::kInt64}; + } else if (stype == kCSRStorage) { + aux_types = {mshadow::kInt64, mshadow::kInt64}; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + // Assign default shapes if not given + // unknown shapes are intialized as {0} such that Size() would return 0 + if (aux_shapes.size() == 0) { + if (stype == kRowSparseStorage) { + aux_shapes = {TShape(mshadow::Shape1(0))}; + } else if (stype == kCSRStorage) { + // aux shapes for indptr and indices + aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))}; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + if (storage_shape.Size() == 0) { + if (stype == kRowSparseStorage) { + storage_shape = shape; + storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; + } else if (stype == kCSRStorage) { + storage_shape = aux_shapes[csr::kIdx]; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + ptr_ = std::make_shared(stype, storage_shape, ctx, delay_alloc, + dtype, aux_types, aux_shapes); +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = std::make_shared(); #endif } /*! @@ -111,17 +172,82 @@ class NDArray { Mkl_mem_ = std::make_shared(); #endif } + /*! 
- * \return the shape of current NDArray + * \brief constructing a static NDArray of non-default storage that shares data with TBlob + * Use with caution: allocate ONLY ONE NDArray for each TBlob, + * make sure the memory region is available through out the life of NDArray + * \param stype the storage type of NDArray + * \param shape the shape of NDArray + * \param data the memory content of static data + * \param aux_data the memory content of static aux data + * \param dev_id the device id this tensor sits at + */ + NDArray(const NDArrayStorageType stype, const TShape &shape, + const TBlob &data, const std::vector &aux_data, int dev_id) + : ptr_(std::make_shared(stype, data, aux_data, dev_id)), shape_(shape), + dtype_(data.type_flag_), entry_({nullptr, 0, 0}) { +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = std::make_shared(); +#endif + } + + + /*! + * \return the shape of current NDArray. */ inline const TShape& shape() const { return shape_; } + /*! + * \return the shape of underlying chunk which stores the NDArray data/value. + * It is only intended for non-default storage. For row-sparse storage, it is the shape of + * the tensor which stores the non-zero values. + */ + inline const TShape &storage_shape() const { + CHECK(ptr_ != nullptr); + CHECK_NE(storage_type(), kDefaultStorage); + return ptr_->storage_shape; + } + + /*! + * \brief get the shape of aux_data(index) + * \param index the index of the aux data + * \return the shape of aux data at given index + */ + inline const TShape& aux_shape(size_t index) const { + CHECK(storage_type() != kDefaultStorage); + return ptr_->aux_shapes[index]; + } + + /* \return the shapes of all aux data */ + const std::vector& aux_shapes() const { + CHECK(storage_type() != kDefaultStorage); + return ptr_->aux_shapes; + } + + /*! returns the dtypes of all aux data */ + const std::vector& aux_types() const { + CHECK(storage_type() != kDefaultStorage); + return ptr_->aux_types; + } + + /*! + * \brief For a sparse operation on a csr matrix for example, + * the size of the column index array + * is an estimated value in the beginning for allocating enough capacity + * for the final result. After the operation is done, the exact size of + * the shape is known and need to be reset using this function. + */ + inline void set_aux_shape(size_t index, const TShape& shape) const { + ptr_->set_aux_shape(index, shape); + } + /*! * \return the data TBlob */ inline const TBlob& data() const { - CheckAndAlloc(); + if (storage_type() == kDefaultStorage) CheckAndAlloc(); SetTBlob(); return tblob_; } @@ -129,6 +255,26 @@ class NDArray { * \return the gradient ndarray. */ NDArray grad() const; + + /*! + * \return the aux TBlob + */ + inline TBlob aux_data(size_t i) const { + auto stype = storage_type(); + TBlob res; + auto shape = aux_shape(i); + auto type = aux_type(i); + MSHADOW_TYPE_SWITCH(type, DType, { + auto dptr = static_cast(ptr_->aux_handles[i].dptr); + CHECK(stype == kRowSparseStorage || stype == kCSRStorage) + << "Unexpected storage type: " << stype; + res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type); + }); +#if MKL_EXPERIMENTAL == 1 + res.Mkl_mem_ = Mkl_mem_; +#endif + return res; + } /*! 
* \return the context of NDArray, this function is only valid when the NDArray is not empty */ @@ -141,6 +287,15 @@ class NDArray { inline int dtype() const { return dtype_; } + inline int aux_type(size_t i) const { + CHECK(!is_none()); + return ptr_->aux_types[i]; + } + + inline NDArrayStorageType storage_type() const { + if (is_none()) return kUndefinedStorage; + return ptr_->storage_type; + } /*! \return whether this ndarray is not initialized */ inline bool is_none() const { return ptr_.get() == nullptr; @@ -149,6 +304,22 @@ class NDArray { bool fresh_out_grad() const; /*! \return updated grad state in entry_ */ void set_fresh_out_grad(bool state) const; + // returns true if a sparse ndarray's aux_data and storage are initialized + inline bool storage_initialized() const { + if (is_none()) return false; + auto stype = storage_type(); + CHECK_NE(stype, kDefaultStorage); + if (stype == kRowSparseStorage) { + CHECK_EQ(aux_shape(rowsparse::kIdx)[0], storage_shape()[0]); + return aux_shape(0).Size() != 0; + } else if (stype == kCSRStorage) { + CHECK_EQ(aux_shape(csr::kIdx)[0], storage_shape()[0]); + return aux_shape(0).Size() != 0; + } else { + LOG(FATAL) << "Unknown storage type"; + } + return true; + } /*! * \brief Block until all the pending write operations with respect * to current NDArray are finished, and read can be performed. @@ -179,6 +350,12 @@ class NDArray { * \param strm the output stream */ void Save(dmlc::Stream *strm) const; + /*! + * \brief load ndarrays before supporting sparse ndarrays + * \param strm the output stream + * \param magic the magic number used for version control + */ + bool LegacyLoad(dmlc::Stream *strm, const uint32_t magic); /*! * \brief load the content from binary stream * \param strm the output stream @@ -269,6 +446,12 @@ class NDArray { * \param size the size of the source array, in sizeof(DType) not raw btyes. */ void SyncCopyFromCPU(const void *data, size_t size) const; + + /*! + * \brief Copy from src.data()/aux_data(i) to this->data()/aux_data(j) + */ + void SyncCopyFromNDArray(const NDArray &src, int i = -1, int j = -1); + /*! * \brief Do a synchronize copy to a continugous CPU memory region. * @@ -282,17 +465,31 @@ class NDArray { void SyncCopyToCPU(void *data, size_t size) const; /*! * \brief Slice a NDArray - * \param begin begin index in first dim - * \param end end index in first dim + * \param begin begin index in first dim (inclusive) + * \param end end index in first dim (exclusive) * \return sliced NDArray */ NDArray Slice(index_t begin, index_t end) const; + /*! * \brief Index a NDArray * \param idx the index * \return idx-th sub array NDArray */ NDArray At(index_t idx) const; + + /*! + * \brief Generate a deep copy of aux_data(i) returned as + * a default storage type NDArray + */ + NDArray aux_ndarray(size_t i) const; + + /*! + * \brief Generate a deep copy of data() returned as a + * default storage type NDArray + */ + NDArray data_ndarray() const; + /*! * \brief Create a NDArray that shares memory with current one * The new array must have smaller memory size than the current array. @@ -301,6 +498,7 @@ class NDArray { * \return NDArray in new shape and type. 
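Note: the storage_type()/tostype surface above is easiest to see from Python; a small sketch using the cast_storage operator and the stype property introduced later in this diff (values chosen arbitrarily).

    import mxnet as mx

    dns = mx.nd.array([[1, 0, 2], [0, 0, 0], [0, 3, 0]])
    csr = dns.tostype('csr')                         # tostype() wraps cast_storage
    rsp = mx.nd.cast_storage(dns, stype='row_sparse')
    print(dns.stype, csr.stype, rsp.stype)           # default csr row_sparse
    # casting back to default storage yields a new dense array with equal values
    assert (rsp.tostype('default').asnumpy() == dns.asnumpy()).all()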
*/ inline NDArray AsArray(const TShape &shape, int dtype) const { + CHECK_EQ(storage_type(), kDefaultStorage) << "Not implemented yet"; CHECK_GE(shape_.Size() * mshadow::mshadow_sizeof(dtype_), shape.Size() * mshadow::mshadow_sizeof(dtype)) << "NDArray.AsArray: target memory size is bigger"; @@ -342,8 +540,42 @@ class NDArray { * This is an internal function used by system that normal user should not use */ inline void CheckAndAlloc() const { + CHECK_EQ(storage_type(), kDefaultStorage); ptr_->CheckAndAlloc(); } + + /*! + * \brief Allocate the space if the allocation has been delayed + * or the requested size is bigger than the available one. + * This function can only be called by ndarray of default + * storage type and effectively changes the ndarray's shape_. + * Note: This function is named as this to avoid overload conflict + * with CheckAndAlloc(const std::vector &aux_shapes), since + * TShape tmp = some_shape is equivalent to TShape tmp = {some_shape}. + */ + void ReshapeAndAlloc(const TShape& shape) { + CHECK_EQ(storage_type(), kDefaultStorage); + CHECK(!is_none()); + shape_ = shape; + ptr_->CheckAndAlloc(shape.Size() * mshadow::mshadow_sizeof(dtype_)); + } + + /* ! + * \brief Alloc memory for non-default storage + * aux_shape is only known at run time + */ + inline void CheckAndAlloc(const std::vector &aux_shapes) const { + CHECK_NE(storage_type(), kDefaultStorage); + ptr_->CheckAndAlloc(shape_, aux_shapes, dtype_); + } + inline void CheckAndAllocData(const TShape &storage_shape) const { + CHECK_NE(storage_type(), kDefaultStorage); + ptr_->CheckAndAllocData(storage_shape, dtype_); + } + inline void CheckAndAllocAuxData(size_t i, const TShape &aux_shape) const { + CHECK_NE(storage_type(), kDefaultStorage); + ptr_->CheckAndAllocAuxData(i, aux_shape); + } /*! * \brief Save list of ndarray into the Stream.x * \param fo The stream of output. @@ -366,44 +598,138 @@ class NDArray { private: friend class autograd::AutogradRuntime; /*! \brief the real data chunk that backs NDArray */ + // shandle is used to store the actual values in the NDArray + // aux_handles store the aux data(such as indices) if it's needed by non-default storage. struct Chunk { - /*! \brief storage handlefrom storage engine */ + /*! \brief storage handle from storage engine. + for non-default storage, shandle stores the data(value) array. + */ Storage::Handle shandle; + /*! \brief storage handles for aux data (e.g index) + for row_sparse, aux_handles[0] = indices + for csr, aux_handles[0] = indptr, aux_handles[1] = indices + */ + std::vector aux_handles; /*! \brief variable from engine */ Engine::VarHandle var; /*! * \brief if this is true, this means the data do not come * from Storage, and do not need to be freed */ + /*! \brief construct from static data */ bool static_data; - /*! \brief whether allocation is delayed */ + /*! \brief whether data allocation is delayed. This doesn't indicate whether aux data + allocation is delayed. */ bool delay_alloc; + // the type of the storage. The storage_type is never kUndefinedStorage once the chunk + // is constructed. + NDArrayStorageType storage_type = kDefaultStorage; + /*! \brief type of aux */ + std::vector aux_types; + // context of data + Context ctx; + // The shape of the chunk data. + // This might not be the same shape as the NDArray, since the storage may be sparse. + // The default value for storage_shape is {0} when an empty non-default NDArray is created. + TShape storage_shape; + // The shape of aux data. 
The default value for the shape depends on the type of storage. + // If aux_shapes[i].Size() is zero, aux data i is empty. + std::vector aux_shapes; + /*! \brief default cosntructor */ - Chunk() : static_data(true), delay_alloc(false) { - var = Engine::Get()->NewVariable(); + Chunk() : static_data(true), delay_alloc(false) {} + + /*! \brief construct a new chunk */ + Chunk(TShape shape, Context ctx_, bool delay_alloc_, int dtype) + : static_data(false), delay_alloc(true), ctx(ctx_) { + auto size = shape.Size(); + storage_shape = shape; + var = Engine::Get()->NewVariable(); + shandle.size = size * mshadow::mshadow_sizeof(dtype); + shandle.ctx = ctx_; + if (!delay_alloc_) this->CheckAndAlloc(); } - /*! \brief construct from static data */ + Chunk(const TBlob &data, int dev_id) - : static_data(true), - delay_alloc(false) { + : static_data(true), delay_alloc(false) { + CHECK(storage_type == kDefaultStorage); var = Engine::Get()->NewVariable(); if (data.dev_mask() == cpu::kDevMask) { - shandle.ctx = Context::CPU(); + ctx = Context::CPU(); } else { CHECK_EQ(data.dev_mask(), gpu::kDevMask); - shandle.ctx = Context::GPU(dev_id); + ctx = Context::GPU(dev_id); } + // init shandle + shandle.ctx = ctx; shandle.dptr = data.dptr_; shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_); + storage_shape = data.shape_; } - /*! \brief construct a new chunk */ - Chunk(uint64_t size, Context ctx, bool delay_alloc_, int dtype) - : static_data(false), delay_alloc(true) { + // Constructor for a non-default storage chunk + Chunk(NDArrayStorageType storage_type_, const TShape &storage_shape_, Context ctx_, + bool delay_alloc_, int dtype, const std::vector &aux_types_, + const std::vector &aux_shapes_) + : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_), + aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_), + aux_shapes(aux_shapes_) { + shandle.ctx = ctx; var = Engine::Get()->NewVariable(); - shandle.size = size * mshadow::mshadow_sizeof(dtype); + // aux_handles always reflect the correct number of aux data + for (size_t i = 0; i < aux_shapes.size(); i++) { + CheckAndAllocAuxData(i, aux_shapes[i]); + // this line is needed in case when aux_shapes[i].Size() = 0 + // aux_handles[i] will not be updated and take only default value. + aux_handles[i].ctx = ctx; + } + if (!delay_alloc) { + CheckAndAllocData(storage_shape, dtype); + } + } + + Chunk(const NDArrayStorageType storage_type_, const TBlob &data, + const std::vector &aux_data, int dev_id) + : static_data(true), delay_alloc(false), storage_type(storage_type_) { + using namespace mshadow; + CHECK_NE(storage_type, kDefaultStorage); + // init var + var = Engine::Get()->NewVariable(); + // init ctx + if (data.dev_mask() == cpu::kDevMask) { + ctx = Context::CPU(); + } else { + CHECK_EQ(data.dev_mask(), gpu::kDevMask); + ctx = Context::GPU(dev_id); + } + // init shandle shandle.ctx = ctx; - if (!delay_alloc_) this->CheckAndAlloc(); + shandle.dptr = data.dptr_; + shandle.size = data.shape_.Size() * mshadow_sizeof(data.type_flag_); + storage_shape = data.shape_; + // init aux handles + for (const auto &aux : aux_data) { + Storage::Handle aux_handle; + aux_handle.ctx = ctx; + aux_handle.dptr = aux.dptr_; + aux_handle.size = aux.shape_.Size() * mshadow_sizeof(aux.type_flag_); + aux_handles.push_back(aux_handle); + aux_types.emplace_back(aux.type_flag_); + aux_shapes.emplace_back(aux.shape_); + } + } + + /*! 
\brief set the shape for ith aux data, and update storage shape if necessary */ + inline void set_aux_shape(const size_t i, const TShape& shape) { + aux_shapes[i] = shape; + if (storage_shape.ndim() > 0) { + if (storage_type == kRowSparseStorage && i == rowsparse::kIdx) { + storage_shape[0] = shape[0]; + } else if (storage_type == kCSRStorage && i == csr::kIdx) { + storage_shape[0] = shape[0]; + } + } } + /*! \brief check if delay alloc is on, do alloc if not yet done */ inline void CheckAndAlloc(void) { if (delay_alloc) { @@ -411,22 +737,112 @@ class NDArray { delay_alloc = false; } } - /*! \brief destructor */ - ~Chunk() { - if (static_data || delay_alloc) { - Engine::Get()->DeleteVariable([](RunContext s) {}, shandle.ctx, var); + + /*! \brief Check and alloc memory for a dense ndarray */ + // size is the number of bytes + void CheckAndAlloc(uint64_t dbytes) { + CHECK_EQ(kDefaultStorage, storage_type); + if (delay_alloc) { + shandle = Storage::Get()->Alloc(dbytes, shandle.ctx); + delay_alloc = false; + } else if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, shandle.ctx); + } + } + + inline void CheckAndAlloc(const TShape &shape, const std::vector &aux_shapes, + int dtype) { + // calculate size, perform allocation + if (kRowSparseStorage == storage_type) { + // For row sparse, aux_shape indicates the number of rows to allocate + auto aux_shape = aux_shapes[rowsparse::kIdx]; + CheckAndAllocAuxData(rowsparse::kIdx, aux_shape); + TShape storage_shape(shape); + storage_shape[0] = aux_shape[0]; + CheckAndAllocData(storage_shape, dtype); + } else if (kCSRStorage == storage_type) { + CheckAndAllocAuxData(csr::kIndPtr, aux_shapes[csr::kIndPtr]); + CheckAndAllocAuxData(csr::kIdx, aux_shapes[csr::kIdx]); + CheckAndAllocData(aux_shapes[csr::kIdx], dtype); } else { - Storage::Handle h = this->shandle; - Engine::Get()->DeleteVariable([h](RunContext s) { - Storage::Get()->Free(h); - }, shandle.ctx, var); + LOG(FATAL) << "Storage type " << storage_type << " not implemented for CheckAndAlloc"; + } + } + // create storage handle for data based on shape and dtype, assuming ctx is set + // storage shape is also updated + // if data is already allocated, try reuse the storage. Otherwise, free the current one + // and allocate new storage + inline void CheckAndAllocData(const TShape &shape, int dtype) { + CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data"; + auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); + if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, ctx); } + // init shape + storage_shape = shape; + // delay_alloc is only set when data storage handle is present + delay_alloc = false; + } + // create storage handle for aux data based on shape + // this function assumes ctx, aux shapes and aux types are set + // aux shape is also updated + // if aux data is already allocated, try reuse the storage. 
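Note: a sketch of what the storage_shape/aux_shape bookkeeping above means for a row_sparse array. The input values are hypothetical; only rsp.shape and rsp.stype are observable from Python in this hunk, the internal shapes are restated in comments from the Chunk documentation above.

    import numpy as np
    import mxnet as mx

    # 4x3 dense matrix with two non-empty rows
    dns = mx.nd.array(np.array([[1., 2., 3.],
                                [0., 0., 0.],
                                [0., 0., 0.],
                                [4., 5., 6.]]))
    rsp = dns.tostype('row_sparse')
    # only the two stored rows go into the value chunk, so storage_shape is
    # (2, 3) and aux_shape(rowsparse::kIdx) is (2,), while the logical shape
    # reported to the user is unchanged
    print(rsp.shape, rsp.stype)    # (4, 3) row_sparse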
Otherwise, free the current one + // and allocate new storage + inline void CheckAndAllocAuxData(size_t i, const TShape &shape) { + CHECK_EQ(shape.ndim(), 1) << "shape must be 1D in CheckAndAllocAuxData"; + CHECK_NE(storage_type, kUndefinedStorage) + << "storage type cannot be kUndefinedStorage in CheckAndAllocAuxData"; + CHECK_NE(storage_type, kDefaultStorage) + << "storage type cannot be kDefaultStorage in CheckAndAllocAuxData"; + if (aux_handles.size() <= i) { + aux_handles.resize(i + 1); + } + size_t aux_bytes = shape.Size() * mshadow::mshadow_sizeof(aux_types[i]); + if (aux_handles[i].size < aux_bytes) { + // free storage if necessary and alloc again + if (aux_handles[i].size > 0) Storage::Get()->Free(aux_handles[i]); + // init aux storage + aux_handles[i] = Storage::Get()->Alloc(aux_bytes, ctx); + } + // init shape + set_aux_shape(i, shape); + } + /*! \brief destructor */ + ~Chunk() { + bool skip_free = static_data || delay_alloc; + Storage::Handle h = this->shandle; + std::vector aux_h = this->aux_handles; + Engine::Get()->DeleteVariable([h, aux_h, skip_free](RunContext s) { + if (skip_free == false) { + Storage::Get()->Free(h); + for (size_t i = 0; i < aux_h.size(); i++) { + if (aux_h[i].size > 0) Storage::Get()->Free(aux_h[i]); + } + } + }, shandle.ctx, var); } - }; + }; // struct Chunk void SetTBlob() const { - tblob_.dptr_ = static_cast(ptr_->shandle.dptr) + byte_offset_; - tblob_.shape_ = shape_; + CHECK(ptr_ != nullptr); + TShape shape = shape_; + char *dptr = static_cast(ptr_->shandle.dptr); + auto stype = storage_type(); + if (stype == kDefaultStorage) { + dptr += byte_offset_; + } else if (stype == kCSRStorage || stype == kRowSparseStorage) { + shape = storage_shape(); + } else { + LOG(FATAL) << "unknown storage type " << stype; + } + tblob_.dptr_ = dptr; + tblob_.shape_ = shape; tblob_.type_flag_ = dtype_; tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); #if MKL_EXPERIMENTAL == 1 @@ -438,7 +854,7 @@ class NDArray { std::shared_ptr Mkl_mem_; #endif /*! \brief internal data of NDArray */ - std::shared_ptr ptr_; + std::shared_ptr ptr_{nullptr}; /*! \brief shape of current NDArray */ TShape shape_; /*! \brief byte offset in chunk */ @@ -455,7 +871,12 @@ class NDArray { * this situation. */ mutable TBlob tblob_; -}; +}; // class NDArray + +/*! + * \return the number of aux data used for given storage type + */ +size_t num_aux_data(NDArrayStorageType stype); /*! * \brief issue an copy operation from one NDArray to another @@ -470,7 +891,6 @@ class NDArray { */ void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0); - /*! * \brief Perform elementwise sum over each data from source, store result into out. * \param source the ndarray we want to sum diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index 1bcae0d29348..f559a921c522 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -25,7 +25,6 @@ #ifndef MXNET_OP_ATTR_TYPES_H_ #define MXNET_OP_ATTR_TYPES_H_ - #include #include @@ -226,6 +225,23 @@ using FCompute = std::function& inputs, const std::vector& req, const std::vector& outputs)>; +/*! 
+ * \brief Resiger an NDArray compute function for simple stateless forward only operator + * + * \note Register under "FComputeEx" and "FComputeEx" + * Dispatched only when operators process non-default storage inputs or outputs + */ +using FComputeEx = std::function& inputs, + const std::vector& req, + const std::vector& outputs)>; + +using FInferStorageType = std::function* in_attrs, + std::vector* out_attrs)>; + } // namespace mxnet #endif // MXNET_OP_ATTR_TYPES_H_ diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h index bfb42de8771a..7e3af8eeca81 100644 --- a/include/mxnet/storage.h +++ b/include/mxnet/storage.h @@ -41,11 +41,11 @@ class Storage { /*! * \brief Pointer to the data. */ - void* dptr; + void* dptr{nullptr}; /*! * \brief Size of the storage. */ - size_t size; + size_t size{0}; /*! * \brief Context information about device and ID. */ diff --git a/perl-package/AI-MXNetCAPI/mxnet.i b/perl-package/AI-MXNetCAPI/mxnet.i index fd1a471bcf16..b4c1336de624 100644 --- a/perl-package/AI-MXNetCAPI/mxnet.i +++ b/perl-package/AI-MXNetCAPI/mxnet.i @@ -1203,6 +1203,12 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, const mx_uint num_provided_arg_dtypes, const char** in, // provided_arg_dtype_names, const int* in, // provided_arg_dtypes, + +//--------------- sparse related variables, ignored for now + const mx_uint num_provided_arg_stypes, + const char** provided_arg_stype_names, + const int* provided_arg_stypes, +//--------------- const mx_uint num_shared_arg_names, const char** in, // shared_arg_name_list, //------------ diff --git a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i index 640215fd7792..5d2fbd6880a1 100644 --- a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i +++ b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i @@ -820,6 +820,17 @@ } } +%typemap(in,numinputs=0) (const mx_uint num_provided_arg_stypes, const char** provided_arg_stype_names, + const int* provided_arg_stypes) + (mx_uint temp1, char* temp2, int temp3) +{ + $2 = &temp2; + $3 = &temp3; + $1 = 0; + *$2 = NULL; + *$3 = 0; +} + %typemap(in,numinputs=0) (mx_uint* num_aux_states, NDArrayHandle** aux_states) (mx_uint temp1, diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 3c3ce76a9284..72dc2b2fec8d 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -26,6 +26,7 @@ from . import base from . import contrib from . import ndarray +from . import ndarray as nd from . import name # use mx.sym as short for symbol from . import symbol as sym @@ -34,8 +35,6 @@ from . import io from . import recordio from . import operator -# use mx.nd as short for mx.ndarray -from . import ndarray as nd # use mx.rnd as short for mx.random from . import random as rnd from . 
import random diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py index 5a50f80498ec..c2e6fce40de8 100644 --- a/python/mxnet/_ctypes/ndarray.py +++ b/python/mxnet/_ctypes/ndarray.py @@ -32,10 +32,19 @@ from ..ndarray_doc import _build_doc +_STORAGE_TYPE_ID_TO_STR = { + -1 : 'undefined', + 0 : 'default', + 1 : 'row_sparse', + 2 : 'csr', +} + + class NDArrayBase(object): """Base data structure for ndarray""" __slots__ = ["handle", "writable"] # pylint: disable= no-member + def __init__(self, handle, writable=True): """initialize a new NDArray @@ -78,7 +87,11 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): output_vars = ctypes.POINTER(NDArrayHandle)() num_output = ctypes.c_int(0) - check_call(_LIB.MXImperativeInvoke( + # return output stypes to avoid the c_api call for checking + # a handle's stype in _ndarray_cls + out_stypes = ctypes.POINTER(ctypes.c_int)() + + check_call(_LIB.MXImperativeInvokeEx( ctypes.c_void_p(handle), ctypes.c_int(len(ndargs)), c_array(NDArrayHandle, [arr.handle for arr in ndargs]), @@ -86,14 +99,17 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): ctypes.byref(output_vars), ctypes.c_int(len(keys)), c_array(ctypes.c_char_p, [c_str(key) for key in keys]), - c_array(ctypes.c_char_p, [c_str(str(val)) for val in vals]))) + c_array(ctypes.c_char_p, [c_str(str(val)) for val in vals]), + ctypes.byref(out_stypes))) if original_output is not None: return original_output if num_output.value == 1: - return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle)) + return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle), + stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[0]]) else: - return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle)) + return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), + stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[i]]) for i in range(num_output.value)] @@ -128,17 +144,24 @@ def __call__(self, *args, **kwargs): "CachedOp.__call__ got unexpected keyword argument(s): " + \ ', '.join(kwargs.keys())) - check_call(_LIB.MXInvokeCachedOp( + # return output stypes to avoid the c_api call for checking + # a handle's stype in _ndarray_cls + out_stypes = ctypes.POINTER(ctypes.c_int)() + + check_call(_LIB.MXInvokeCachedOpEx( self.handle, ctypes.c_int(len(args)), c_array(NDArrayHandle, [arr.handle for arr in args]), ctypes.byref(num_output), - ctypes.byref(output_vars))) + ctypes.byref(output_vars), + ctypes.byref(out_stypes))) if original_output is not None: return original_output if num_output.value == 1: - return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle)) + return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle), + stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[0]]) else: - return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle)) + return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), + stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[i]]) for i in range(num_output.value)] diff --git a/python/mxnet/base.py b/python/mxnet/base.py index aad0580e7d07..d446355da0b5 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -72,6 +72,20 @@ def __str__(self): msg += ' is not implemented for Symbol and only available in NDArray.' 
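Note: a quick sketch of the dispatch path described above. MXImperativeInvokeEx reports the output storage types so the frontend can pick the right wrapper class without an extra C API call per handle; the exact class name printed below is taken from the exports added later in this diff and is otherwise an assumption.

    import mxnet as mx

    a = mx.nd.array([[0, 1], [2, 0]])
    b = mx.nd.cast_storage(a, stype='csr')
    # out_stypes from MXImperativeInvokeEx tells _ndarray_cls how to wrap b
    print(b.stype)               # 'csr'
    print(type(b).__name__)      # expected: 'CSRNDArray'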
return msg +class NotSupportedForSparseNDArray(MXNetError): + def __init__(self, function, alias, *args): + super(NotSupportedForSparseNDArray, self).__init__() + self.function = function.__name__ + self.alias = alias + self.args = [str(type(a)) for a in args] + def __str__(self): + msg = 'Function {}'.format(self.function) + if self.alias: + msg += ' (namely operator "{}")'.format(self.alias) + if self.args: + msg += ' with arguments ({})'.format(', '.join(self.args)) + msg += ' is not supported for SparseNDArray and only available in NDArray.' + return msg class MXCallbackList(ctypes.Structure): """Structure that holds Callback information. Passed to CustomOpProp.""" diff --git a/python/mxnet/contrib/autograd.py b/python/mxnet/contrib/autograd.py index c7fb6e17803a..2d2500e7a217 100644 --- a/python/mxnet/contrib/autograd.py +++ b/python/mxnet/contrib/autograd.py @@ -24,6 +24,7 @@ import functools from ..base import _LIB, check_call, string_types from ..base import mx_uint, NDArrayHandle, c_array +# pylint: disable= unused-import from ..ndarray import NDArray, zeros_like from ..symbol import _GRAD_REQ_MAP diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index baff834bb33a..5cc94a5e80ac 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -27,6 +27,7 @@ from .base import mx_uint, NDArrayHandle, ExecutorHandle from .base import check_call, c_array, py_str from .ndarray import NDArray +from .ndarray import _ndarray_cls from . import ndarray as nd # those functions are not used here, we just import them to keep backward compatibility @@ -105,7 +106,9 @@ def _get_outputs(self): handles = ctypes.POINTER(NDArrayHandle)() check_call(_LIB.MXExecutorOutputs(self.handle, ctypes.byref(out_size), ctypes.byref(handles))) - return [NDArray(NDArrayHandle(handles[i])) for i in range(out_size.value)] + num_output = out_size.value + outputs = [_ndarray_cls(NDArrayHandle(handles[i])) for i in range(num_output)] + return outputs def forward(self, is_train=False, **kwargs): """Calculate the outputs specified by the bound symbol. diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py index 8ac1aebe72dd..f67b05de5de3 100644 --- a/python/mxnet/image/detection.py +++ b/python/mxnet/image/detection.py @@ -27,7 +27,7 @@ from ..base import numeric_types from .. import ndarray as nd -from .._ndarray_internal import _cvcopyMakeBorder as copyMakeBorder +from ..ndarray._internal import _cvcopyMakeBorder as copyMakeBorder from .. import io from .image import RandomOrderAug, ColorJitterAug, LightingAug, ColorNormalizeAug from .image import ResizeAug, ForceResizeAug, CastAug, HueJitterAug, RandomGrayAug diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py index 2e40019971ac..d99db214222c 100644 --- a/python/mxnet/image/image.py +++ b/python/mxnet/image/image.py @@ -34,9 +34,9 @@ from ..base import numeric_types from .. import ndarray as nd -from .. import _ndarray_internal as _internal -from .._ndarray_internal import _cvimresize as imresize -from .._ndarray_internal import _cvcopyMakeBorder as copyMakeBorder +from ..ndarray import _internal +from ..ndarray._internal import _cvimresize as imresize +from ..ndarray._internal import _cvcopyMakeBorder as copyMakeBorder from .. import io from .. 
import recordio diff --git a/python/mxnet/io.py b/python/mxnet/io.py index 0404e34ea36c..4e69a8a801cb 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -34,6 +34,7 @@ from .base import mx_real_t from .base import check_call, build_param_doc as _build_param_doc from .ndarray import NDArray +from .ndarray import _ndarray_cls from .ndarray import array from .ndarray import concatenate @@ -801,12 +802,12 @@ def iter_next(self): def getdata(self): hdl = NDArrayHandle() check_call(_LIB.MXDataIterGetData(self.handle, ctypes.byref(hdl))) - return NDArray(hdl, False) + return _ndarray_cls(hdl, False) def getlabel(self): hdl = NDArrayHandle() check_call(_LIB.MXDataIterGetLabel(self.handle, ctypes.byref(hdl))) - return NDArray(hdl, False) + return _ndarray_cls(hdl, False) def getindex(self): index_size = ctypes.c_uint64(0) diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index fd0091182aea..84759263007c 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -22,6 +22,7 @@ import ctypes import pickle from .ndarray import NDArray +from .ndarray import _ndarray_cls from .base import _LIB from .base import check_call, c_array, c_str, string_types, mx_uint, py_str from .base import NDArrayHandle, KVStoreHandle @@ -53,8 +54,8 @@ def _updater_wrapper(updater): """A wrapper for the user-defined handle.""" def updater_handle(key, lhs_handle, rhs_handle, _): """ ctypes function """ - lhs = NDArray(NDArrayHandle(lhs_handle)) - rhs = NDArray(NDArrayHandle(rhs_handle)) + lhs = _ndarray_cls(NDArrayHandle(lhs_handle)) + rhs = _ndarray_cls(NDArrayHandle(rhs_handle)) updater(key, lhs, rhs) return updater_handle @@ -186,6 +187,8 @@ def pull(self, key, out=None, priority=0): The returned values are gauranteed to be the latest values in the store. + For row_sparse values, please use `row_sparse_pull` instead. + Parameters ---------- key : int or list of int @@ -231,11 +234,89 @@ def pull(self, key, out=None, priority=0): [ 2. 2. 2.]] """ assert(out is not None) + if not isinstance(out, (list, tuple)): + out = [out] + for val in out: + if not isinstance(val, (list, tuple)): + assert(val.stype == 'default') + else: + for v in val: + assert(v.stype == 'default') ckeys, cvals = _ctype_key_value(key, out) check_call(_LIB.MXKVStorePullEx( self.handle, mx_uint(len(ckeys)), ckeys, cvals, ctypes.c_int(priority))) + def row_sparse_pull(self, key, out=None, priority=0, row_ids=None): + """ Pulls a single row_sparse value or a sequence of row_sparse values from the store + with specified row_ids. + + `row_sparse_pull` is executed asynchronously after all previous + `push`/`pull`/`row_sparse_pull` calls for the same input key(s) are finished. + + The returned values are guaranteed to be the latest values in the store. + + Parameters + ---------- + key : str or list of str + Keys. + + out: NDArray or list of NDArray or list of list of NDArray + Values corresponding to the keys. The stype is expected to be row_sparse + + priority : int, optional + The priority of the pull operation. + Higher priority pull operations are likely to be executed before + other pull actions. + + row_ids : NDArray or list of NDArray + The row_ids for which to pull for each value. The row_ids doesn't have to be unique + or sorted. + + Examples + -------- + >>> shape = (3, 3) + >>> kv.init('3', mx.nd.ones(shape).tostype('row_sparse')) + >>> a = mx.nd.zeros(shape, stype='row_sparse') + >>> row_ids = mx.nd.array([0, 2], dtype='int64') + >>> kv.row_sparse_pull('3', out=a, row_ids=row_ids) + >>> print a.asnumpy() + [[ 1. 1. 
1.] + [ 0. 0. 0.] + [ 1. 1. 1.]] + >>> duplicate_row_ids = mx.nd.array([2, 2], dtype='int64') + >>> kv.row_sparse_pull('3', out=a, row_ids=duplicate_row_ids) + >>> print a.asnumpy() + [[ 0. 0. 0.] + [ 0. 0. 0.] + [ 1. 1. 1.]] + >>> unsorted_row_ids = mx.nd.array([1, 0], dtype='int64') + >>> kv.row_sparse_pull('3', out=a, row_ids=unsorted_row_ids) + >>> print a.asnumpy() + [[ 1. 1. 1.] + [ 1. 1. 1.] + [ 0. 0. 0.]] + """ + assert(out is not None) + assert(row_ids is not None) + if isinstance(row_ids, NDArray): + row_ids = [row_ids] + if not isinstance(out, (list, tuple)): + out = [out] + for val in out: + if not isinstance(val, (list, tuple)): + assert(val.stype == 'row_sparse') + else: + for v in val: + assert(v.stype == 'row_sparse') + ckeys, cvals = _ctype_key_value(key, out) + _, crow_ids = _ctype_key_value(key, row_ids) + assert(len(crow_ids) == len(cvals)), "number of row_ids doesn't match number of values" + + check_call(_LIB.MXKVStorePullRowSparse( + self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority))) + + def set_optimizer(self, optimizer): """ Registers an optimizer with the kvstore. diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 01b3fa50e18f..38bb15484e7b 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -93,15 +93,29 @@ def _create_kvstore(kvstore, num_device, arg_params): return (kv, update_on_kvstore) -def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, - update_on_kvstore): +def _contains_non_default_storage(params): + if isinstance(params, (list, tuple)): + for param in params: + if param.stype != 'default': + return True + elif isinstance(params, NDArray): + return param.stype != 'default' + else: + return False + +def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, update_on_kvstore): """Initialize kvstore""" for idx, param_on_devs in enumerate(param_arrays): name = param_names[idx] kvstore.init(name, arg_params[name]) if update_on_kvstore: - kvstore.pull(name, param_on_devs, priority=-idx) + if _contains_non_default_storage(param_on_devs): + # skip pulling row_sparse weights + warnings.warn('Detected non-default weight in kvstore to pull. Please make ' \ + 'sure to pull it with row_ids explicitly', RuntimeWarning) + else: + kvstore.pull(name, param_on_devs, priority=-idx) def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): """Perform update of param_arrays from grad_arrays on kvstore.""" @@ -113,25 +127,36 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): # push gradient, priority is negative index kvstore.push(name, grad_list, priority=-index) # pull back the weights - kvstore.pull(name, arg_list, priority=-index) + if _contains_non_default_storage(arg_list): + # skip pulling row_sparse weights + warnings.warn('Detected non-default weight in kvstore to pull. 
Please make ' \ + 'sure to pull it with row_ids', RuntimeWarning) + else: + kvstore.pull(name, arg_list, priority=-index) def _update_params(param_arrays, grad_arrays, updater, num_device, kvstore=None, param_names=None): """Perform update of param_arrays from grad_arrays not on kvstore.""" - for index, pair in enumerate(zip(param_arrays, grad_arrays)): + for i, pair in enumerate(zip(param_arrays, grad_arrays)): arg_list, grad_list = pair if grad_list[0] is None: continue + index = i if kvstore: name = param_names[index] # push gradient, priority is negative index kvstore.push(name, grad_list, priority=-index) # pull back the sum gradients, to the same locations. - kvstore.pull(name, grad_list, priority=-index) + if _contains_non_default_storage(grad_list): + # skip pulling row_sparse weights + warnings.warn('Detected non-default weight in kvstore to pull. Please make ' \ + 'sure to pull it with row_ids', RuntimeWarning) + else: + kvstore.pull(name, grad_list, priority=-index) for k, p in enumerate(zip(arg_list, grad_list)): # faked an index here, to make optimizer create diff # state for the same index but on diff devs, TODO(mli) - # use a better solution latter + # use a better solution later w, g = p updater(index*num_device+k, g, w) diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py index 3123462f9c7c..bae166e3ffd8 100644 --- a/python/mxnet/module/base_module.py +++ b/python/mxnet/module/base_module.py @@ -957,7 +957,8 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, def init_optimizer(self, kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), force_init=False): - """Installs and initializes optimizers. + """Installs and initializes optimizers, as well as initialize kvstore for + distributed training Parameters ---------- diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index 058edd57eb3d..d55b2117ebd3 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -25,7 +25,6 @@ import warnings from .. import context as ctx -from .. import ndarray as nd from .. import optimizer as opt from .executor_group import DataParallelExecutorGroup @@ -33,6 +32,7 @@ from ..model import load_checkpoint from ..initializer import Uniform, InitDesc from ..io import DataDesc +from ..ndarray import zeros from .base_module import BaseModule, _check_input_names, _parse_data_desc @@ -427,13 +427,13 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, else: assert self._arg_params is None and self._aux_params is None param_arrays = [ - nd.zeros(x[0].shape, dtype=x[0].dtype) + zeros(shape=x[0].shape, dtype=x[0].dtype, stype=x[0].stype) for x in self._exec_group.param_arrays ] self._arg_params = {name:arr for name, arr in zip(self._param_names, param_arrays)} aux_arrays = [ - nd.zeros(x[0].shape, dtype=x[0].dtype) + zeros(x[0].shape, dtype=x[0].dtype) for x in self._exec_group.aux_arrays ] self._aux_params = {name:arr for name, arr in zip(self._aux_names, aux_arrays)} @@ -441,7 +441,6 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, if shared_module is not None and shared_module.optimizer_initialized: self.borrow_optimizer(shared_module) - def reshape(self, data_shapes, label_shapes=None): """Reshapes the module for new input shapes. 
@@ -483,6 +482,7 @@ def init_optimizer(self, kvstore='local', optimizer='sgd', if self._params_dirty: self._sync_params_from_devices() + (kvstore, update_on_kvstore) = \ _create_kvstore(kvstore, len(self._context), self._arg_params) diff --git a/python/mxnet/ndarray/__init__.py b/python/mxnet/ndarray/__init__.py new file mode 100644 index 000000000000..016e25de382c --- /dev/null +++ b/python/mxnet/ndarray/__init__.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""NDArray API of MXNet.""" + +from . import _internal +from . import op +from .op import CachedOp +# pylint: disable=wildcard-import, redefined-builtin +from .ndarray import * +from .utils import load, save, zeros, empty, array +from .sparse_ndarray import _ndarray_cls, csr_matrix, row_sparse_array +from .sparse_ndarray import BaseSparseNDArray, RowSparseNDArray, CSRNDArray diff --git a/python/mxnet/_ndarray_internal.py b/python/mxnet/ndarray/_internal.py similarity index 100% rename from python/mxnet/_ndarray_internal.py rename to python/mxnet/ndarray/_internal.py diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray/ndarray.py similarity index 86% rename from python/mxnet/ndarray.py rename to python/mxnet/ndarray/ndarray.py index 42f0ff5e87cf..26d5cd453a5b 100644 --- a/python/mxnet/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -21,6 +21,7 @@ """NDArray API of MXNet.""" from __future__ import absolute_import from __future__ import division + try: from __builtin__ import slice as py_slice except ImportError: @@ -28,40 +29,25 @@ import ctypes import warnings - -import os as _os -import sys as _sys - import operator import numpy as np -from .base import _LIB, string_types, numeric_types, integer_types -from .base import c_array, py_str, c_str, mx_real_t, _Null # pylint: disable=unused-import -from .base import mx_uint, NDArrayHandle, check_call, OpHandle -from .base import ctypes2buffer -from .context import Context -from . import _ndarray_internal as _internal -from .ndarray_doc import _build_doc - - -# Use different version of SymbolBase -# When possible, use cython to speedup part of computation. 
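Note: a small sketch of why module.bind() above now forwards stype when allocating host-side parameter buffers; the shape is made up for illustration and stands in for one entry of exec_group.param_arrays.

    import mxnet as mx

    exec_param = mx.nd.zeros((10, 4), stype='row_sparse')
    host_param = mx.nd.zeros(shape=exec_param.shape, dtype=exec_param.dtype,
                             stype=exec_param.stype)
    print(host_param.stype)      # 'row_sparse', matching the device-side array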
-# pylint: disable=unused-import -try: - if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: - from ._ctypes.ndarray import NDArrayBase, _set_ndarray_class - from ._ctypes.ndarray import CachedOp, _imperative_invoke - elif _sys.version_info >= (3, 0): - from ._cy3.ndarray import NDArrayBase, _set_ndarray_class, _imperative_invoke - from ._cy3.ndarray import CachedOp, _imperative_invoke - else: - from ._cy2.ndarray import NDArrayBase, _set_ndarray_class, _imperative_invoke - from ._cy2.ndarray import CachedOp, _imperative_invoke -except ImportError: - if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: - raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") - from ._ctypes.ndarray import NDArrayBase, _set_ndarray_class, _imperative_invoke - from ._ctypes.ndarray import CachedOp, _imperative_invoke -# pylint: enable=unused-import +from ..base import _LIB, numeric_types, integer_types +from ..base import c_array, mx_real_t +from ..base import mx_uint, NDArrayHandle, check_call +from ..base import ctypes2buffer +from ..context import Context +from . import _internal +from .op import NDArrayBase, _STORAGE_TYPE_ID_TO_STR +from . import broadcast_add, broadcast_mul, transpose, broadcast_not_equal, broadcast_power +from . import broadcast_sub, broadcast_div, broadcast_to, broadcast_equal, cast_storage +from . import broadcast_greater, broadcast_greater_equal, broadcast_lesser, broadcast_lesser_equal +from . import zeros_like, slice + +__all__ = ["NDArray", "concatenate", "_DTYPE_NP_TO_MX", "_DTYPE_MX_TO_NP", "_GRAD_REQ_MAP", \ + "ones", "add", "arange", "divide", "equal", "full", "greater", "greater_equal", \ + "imdecode", "lesser", "lesser_equal", "maximum", "minimum", "moveaxis", \ + "multiply", "negative", "not_equal", "onehot_encode", "power", "subtract", \ + "true_divide", "waitall", "_new_empty_handle"] # pylint: disable= no-member _DTYPE_NP_TO_MX = { @@ -74,7 +60,6 @@ np.int8 : 5, np.int64 : 6, } - _DTYPE_MX_TO_NP = { -1 : None, 0 : np.float32, @@ -85,7 +70,12 @@ 5 : np.int8, 6 : np.int64, } - +_STORAGE_TYPE_STR_TO_ID = { + 'undefined' : -1, + 'default' : 0, + 'row_sparse' : 1, + 'csr' : 2, +} _GRAD_REQ_MAP = { 'null': 0, 'write': 1, @@ -135,6 +125,11 @@ def waitall(): """ check_call(_LIB.MXNDArrayWaitAll()) +def _storage_type(handle): + storage_type = ctypes.c_int(0) + check_call(_LIB.MXNDArrayGetStorageType(handle, ctypes.byref(storage_type))) + return _STORAGE_TYPE_ID_TO_STR[storage_type.value] + class NDArray(NDArrayBase): """An array object representing a multidimensional, homogeneous array of fixed-size items. @@ -144,6 +139,7 @@ class NDArray(NDArrayBase): # make numpy functions return NDArray instead of numpy object array __array_priority__ = 1000.0 # pylint: disable= no-member, undefined-variable + def __repr__(self): """Returns a string representation of the array.""" shape_info = 'x'.join(['%d' % x for x in self.shape]) @@ -151,6 +147,9 @@ def __repr__(self): self.__class__.__name__, shape_info, self.context) + def __reduce__(self): + return NDArray, (None,), self.__getstate__() + def __add__(self, other): """x.__add__(y) <=> x+y <=> mx.nd.add(x, y) """ return add(self, other) @@ -742,7 +741,6 @@ def wait_to_read(self): """ check_call(_LIB.MXNDArrayWaitToRead(self.handle)) - @property def ndim(self): """Returns the number of dimensions of this array @@ -777,6 +775,7 @@ def shape(self): self.handle, ctypes.byref(ndim), ctypes.byref(pdata))) return tuple(pdata[:ndim.value]) + @property def size(self): """Number of elements in the array. 
@@ -841,6 +840,12 @@ def dtype(self): self.handle, ctypes.byref(mx_dtype))) return _DTYPE_MX_TO_NP[mx_dtype.value] + @property + def stype(self): + """Storage-type of the array. + """ + return _storage_type(self.handle) + @property # pylint: disable= invalid-name, undefined-variable def T(self): @@ -943,7 +948,7 @@ def astype(self, dtype): >>> y.dtype """ - res = empty(self.shape, ctx=self.context, dtype=dtype) + res = _empty_ndarray(self.shape, ctx=self.context, dtype=dtype) self.copyto(res) return res @@ -964,7 +969,7 @@ def copyto(self, other): Returns ------- - NDArray + NDArray, CSRNDArray, RowSparseNDArray The copied array. If ``other`` is an ``NDArray``, then the return value and ``other`` will point to the same ``NDArray``. @@ -1101,6 +1106,19 @@ def backward(self, out_grad=None, retain_graph=False, train_mode=True): ctypes.c_int(retain_graph), ctypes.c_int(train_mode))) + def tostype(self, stype): + """Return a copy of the array with chosen storage type. + + See Also + ---------- + :meth:`mxnet.ndarray.cast_storage`. + + Returns + ------- + NDArray, CSRNDArray or RowSparseNDArray + A copy of the array with the chosen storage stype + """ + return cast_storage(self, stype=stype) def onehot_encode(indices, out): """One-hot encoding indices into matrix out. @@ -1113,74 +1131,7 @@ def onehot_encode(indices, out): # pylint: enable= no-member, protected-access -def empty(shape, ctx=None, dtype=mx_real_t): - """Returns a new array of given shape and type, without initializing entries. - - Parameters - ---------- - shape : int or tuple of int - The shape of the empty array. - ctx : Context, optional - An optional device context (default is the current default context). - dtype : str or numpy.dtype, optional - An optional value type (default is `float32`). - - Returns - ------- - NDArray - A created array. - - Examples - -------- - >>> mx.nd.empty(1) - - >>> mx.nd.empty((1,2), mx.gpu(0)) - - >>> mx.nd.empty((1,2), mx.gpu(0), 'float16') - - """ - if isinstance(shape, integer_types): - shape = (shape, ) - if ctx is None: - ctx = Context.default_ctx - return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype)) - -def zeros(shape, ctx=None, dtype=mx_real_t, **kwargs): - """Returns a new array filled with all zeros, with the given shape and type. - - Parameters - ---------- - shape : int or tuple of int - The shape of the empty array. - ctx : Context, optional - An optional device context (default is the current default context). - dtype : str or numpy.dtype, optional - An optional value type (default is `float32`). - out : NDArray, optional - The output NDArray (default is `None`). - - Returns - ------- - NDArray - A created array - - Examples - -------- - >>> mx.nd.zeros(1).asnumpy() - array([ 0.], dtype=float32) - >>> mx.nd.zeros((1,2), mx.gpu(0)) - - >>> mx.nd.zeros((1,2), mx.gpu(0), 'float16').asnumpy() - array([[ 0., 0.]], dtype=float16) - """ - # pylint: disable= unused-argument - if ctx is None: - ctx = Context.default_ctx - # pylint: disable= no-member, protected-access - return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs) - # pylint: enable= no-member, protected-access - -def ones(shape, ctx=None, dtype=mx_real_t, **kwargs): +def ones(shape, ctx=None, dtype=None, **kwargs): """Returns a new array filled with all ones, with the given shape and type. 
Parameters @@ -1212,6 +1163,7 @@ def ones(shape, ctx=None, dtype=mx_real_t, **kwargs): # pylint: disable= unused-argument if ctx is None: ctx = Context.default_ctx + dtype = mx_real_t if dtype is None else dtype # pylint: disable= no-member, protected-access return _internal._ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs) # pylint: enable= no-member, protected-access @@ -1246,12 +1198,11 @@ def full(shape, val, ctx=None, dtype=mx_real_t, out=None): >>> mx.nd.full((1, 2), 2.0, dtype='float16').asnumpy() array([[ 2., 2.]], dtype=float16) """ - out = empty(shape, ctx, dtype) if out is None else out + out = _empty_ndarray(shape, ctx, dtype) if out is None else out out[:] = val return out - -def array(source_array, ctx=None, dtype=None): +def _array(source_array, ctx=None, dtype=None): """Creates an array from any object exposing the array interface. Parameters @@ -1269,18 +1220,6 @@ def array(source_array, ctx=None, dtype=None): ------- NDArray An `NDArray` with the same contents as the `source_array`. - - Examples - -------- - >>> import numpy as np - >>> mx.nd.array([1, 2, 3]) - - >>> mx.nd.array([[1, 2], [3, 4]]) - - >>> mx.nd.array(np.zeros((3, 2))) - - >>> mx.nd.array(np.zeros((3, 2)), mx.gpu(0)) - """ if isinstance(source_array, NDArray): dtype = source_array.dtype if dtype is None else dtype @@ -1291,11 +1230,10 @@ def array(source_array, ctx=None, dtype=None): source_array = np.array(source_array, dtype=dtype) except: raise TypeError('source_array must be array like object') - arr = empty(source_array.shape, ctx, dtype) + arr = _empty_ndarray(source_array.shape, ctx, dtype) arr[:] = source_array return arr - def moveaxis(tensor, source, destination): """Moves the `source` axis into the `destination` position while leaving the other axes in their original order @@ -2309,96 +2247,6 @@ def negative(arr): """ return multiply(arr, -1.0) - -def load(fname): - """Loads an array from file. - - See more details in ``save``. - - Parameters - ---------- - fname : str - The filename. - - Returns - ------- - list of NDArray or dict of str to NDArray - Loaded data. - """ - if not isinstance(fname, string_types): - raise TypeError('fname required to be a string') - out_size = mx_uint() - out_name_size = mx_uint() - handles = ctypes.POINTER(NDArrayHandle)() - names = ctypes.POINTER(ctypes.c_char_p)() - check_call(_LIB.MXNDArrayLoad(c_str(fname), - ctypes.byref(out_size), - ctypes.byref(handles), - ctypes.byref(out_name_size), - ctypes.byref(names))) - if out_name_size.value == 0: - return [NDArray(NDArrayHandle(handles[i])) for i in range(out_size.value)] - else: - assert out_name_size.value == out_size.value - return dict( - (py_str(names[i]), NDArray(NDArrayHandle(handles[i]))) for i in range(out_size.value)) - - -def save(fname, data): - """Saves a list of arrays or a dict of str->array to file. - - Examples of filenames: - - - ``/path/to/file`` - - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 supports) - - ``hdfs://path/to/file`` (if compiled with HDFS supports) - - Parameters - ---------- - fname : str - The filename. - data : ``NDArray``, list of ``NDArray` or dict of str to ``NDArray`` - The data to save. 
- - Examples - -------- - >>> x = mx.nd.zeros((2,3)) - >>> y = mx.nd.ones((1,4)) - >>> mx.nd.save('my_list', [x,y]) - >>> mx.nd.save('my_dict', {'x':x, 'y':y}) - >>> mx.nd.load('my_list') - [, ] - >>> mx.nd.load('my_dict') - {'y': , 'x': } - """ - if isinstance(data, NDArray): - data = [data] - handles = [] - if isinstance(data, dict): - keys = [] - for key, val in data.items(): - if not isinstance(key, string_types): - raise TypeError('save only accept dict str->NDArray or list of NDArray') - if not isinstance(val, NDArray): - raise TypeError('save only accept dict str->NDArray or list of NDArray') - keys.append(c_str(key)) - handles.append(val.handle) - keys = c_array(ctypes.c_char_p, keys) - elif isinstance(data, list): - for val in data: - if not isinstance(val, NDArray): - raise TypeError('save only accept dict str->NDArray or list of NDArray') - handles.append(val.handle) - keys = None - else: - raise ValueError("data needs to either be a NDArray, dict of str, NDArray pairs " - "or a list of NDarrays.") - check_call(_LIB.MXNDArraySave(c_str(fname), - mx_uint(len(handles)), - c_array(NDArrayHandle, handles), - keys)) - - def concatenate(arrays, axis=0, always_copy=True): """DEPRECATED, use ``concat`` instead @@ -2435,7 +2283,7 @@ def concatenate(arrays, axis=0, always_copy=True): assert shape_rest2 == arr.shape[axis+1:] assert dtype == arr.dtype ret_shape = shape_rest1 + (shape_axis,) + shape_rest2 - ret = empty(ret_shape, ctx=arrays[0].context, dtype=dtype) + ret = _empty_ndarray(ret_shape, ctx=arrays[0].context, dtype=dtype) idx = 0 begin = [0 for _ in ret_shape] @@ -2497,159 +2345,64 @@ def imdecode(str_img, clip_rect=(0, 0, 0, 0), out=None, index=0, channels=3, mea out=out) -# pylint: disable=too-many-locals, invalid-name -def _make_ndarray_function(handle, name): - """Create a NDArray function from the FunctionHandle.""" - real_name = ctypes.c_char_p() - desc = ctypes.c_char_p() - num_args = mx_uint() - arg_names = ctypes.POINTER(ctypes.c_char_p)() - arg_types = ctypes.POINTER(ctypes.c_char_p)() - arg_descs = ctypes.POINTER(ctypes.c_char_p)() - key_var_num_args = ctypes.c_char_p() - ret_type = ctypes.c_char_p() - - check_call(_LIB.MXSymbolGetAtomicSymbolInfo( - handle, ctypes.byref(real_name), ctypes.byref(desc), - ctypes.byref(num_args), - ctypes.byref(arg_names), - ctypes.byref(arg_types), - ctypes.byref(arg_descs), - ctypes.byref(key_var_num_args), - ctypes.byref(ret_type))) - narg = int(num_args.value) - arg_names = [py_str(arg_names[i]) for i in range(narg)] - arg_types = [py_str(arg_types[i]) for i in range(narg)] - func_name = name - key_var_num_args = py_str(key_var_num_args.value) - ret_type = py_str(ret_type.value) if ret_type.value is not None else '' - doc_str = _build_doc(func_name, - py_str(desc.value), - arg_names, - arg_types, - [py_str(arg_descs[i]) for i in range(narg)], - key_var_num_args, - ret_type) - - dtype_name = None - arr_name = None - ndsignature = [] - signature = [] - ndarg_names = [] - kwarg_names = [] - for i in range(narg): - name, atype = arg_names[i], arg_types[i] - if name == 'dtype': - dtype_name = name - signature.append('%s=_Null'%name) - elif atype.startswith('NDArray') or atype.startswith('Symbol'): - assert not arr_name, \ - "Op can only have one argument with variable " \ - "size and it must be the last argument." 
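Note: dense helpers such as concatenate() still allocate their result through _empty_ndarray, i.e. default storage only; the densify-then-concat workaround in this sketch is my own suggestion, not something this diff introduces.

    import mxnet as mx

    a = mx.nd.ones((2, 2)).tostype('row_sparse')
    b = mx.nd.ones((2, 2))
    c = mx.nd.concatenate([a.tostype('default'), b], axis=0)
    print(c.shape, c.stype)      # (4, 2) default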
- if atype.endswith('[]'): - ndsignature.append('*%s'%name) - arr_name = name - else: - ndsignature.append('%s=None'%name) - ndarg_names.append(name) - else: - signature.append('%s=_Null'%name) - kwarg_names.append(name) - signature.append('out=None') - signature.append('name=None') - signature.append('**kwargs') - signature = ndsignature + signature - - code = [] - if arr_name: - code.append(""" -def %s(*%s, **kwargs):"""%(func_name, arr_name)) - code.append(""" - ndargs = [] - for i in {}: - assert isinstance(i, NDArrayBase), \\ - "Positional arguments must have NDArray type, " \\ - "but got %s"%str(i) - ndargs.append(i)""".format(arr_name)) - if dtype_name is not None: - code.append(""" - if '%s' in kwargs: - kwargs['%s'] = np.dtype(kwargs['%s']).name"""%( - dtype_name, dtype_name, dtype_name)) - code.append(""" - _ = kwargs.pop('name', None) - out = kwargs.pop('out', None) - keys = list(kwargs.keys()) - vals = list(kwargs.values())""") - else: - code.append(""" -def %s(%s): - ndargs = [] - keys = list(kwargs.keys()) - vals = list(kwargs.values())"""%(func_name, ', '.join(signature))) - # NDArray args - for name in ndarg_names: # pylint: disable=redefined-argument-from-local - code.append(""" - if {name} is not None: - assert isinstance({name}, NDArrayBase), \\ - "Argument {name} must have NDArray type, but got %s"%str({name}) - ndargs.append({name})""".format(name=name)) - # kwargs - for name in kwarg_names: # pylint: disable=redefined-argument-from-local - code.append(""" - if %s is not _Null: - keys.append('%s') - vals.append(%s)"""%(name, name, name)) - # dtype - if dtype_name is not None: - code.append(""" - if %s is not _Null: - keys.append('%s') - vals.append(np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) - - code.append(""" - return _imperative_invoke(%d, ndargs, keys, vals, out)"""%( - handle.value)) - - local = {} - exec(''.join(code), None, local) # pylint: disable=exec-used - ndarray_function = local[func_name] - ndarray_function.__name__ = func_name - ndarray_function.__doc__ = doc_str - ndarray_function.__module__ = 'mxnet.ndarray' - return ndarray_function - - -# pylint: enable=too-many-locals, invalid-name -def _init_ndarray_module(ndarray_class, root_namespace): - """List and add all the ndarray functions to current module.""" - _set_ndarray_class(ndarray_class) - plist = ctypes.POINTER(ctypes.c_char_p)() - size = ctypes.c_uint() - - check_call(_LIB.MXListAllOpNames(ctypes.byref(size), - ctypes.byref(plist))) - op_names = [] - for i in range(size.value): - op_names.append(py_str(plist[i])) - - module_obj = _sys.modules["%s.ndarray" % root_namespace] - module_internal = _sys.modules["%s._ndarray_internal" % root_namespace] - module_contrib = _sys.modules["%s.contrib.ndarray" % root_namespace] - for name in op_names: - hdl = OpHandle() - check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) - function = _make_ndarray_function(hdl, name) - if function.__name__.startswith('_contrib_'): - function.__name__ = function.__name__[9:] - function.__module__ = 'mxnet.contrib.ndarray' - setattr(module_contrib, function.__name__, function) - elif function.__name__.startswith('_'): - setattr(module_internal, function.__name__, function) - else: - setattr(module_obj, function.__name__, function) +def _zeros_ndarray(shape, ctx=None, dtype=None, **kwargs): + """Returns a new array filled with all zeros, with the given shape and type. -_init_ndarray_module(NDArray, "mxnet") + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. 
+ ctx : Context, optional + An optional device context (default is the current default context). + dtype : str or numpy.dtype, optional + An optional value type (default is `float32`). + out : NDArray, optional + The output NDArray (default is `None`). -# from .base import add_fileline_to_docstring -# add_fileline_to_docstring(__name__) + Returns + ------- + NDArray + A created array + + Examples + -------- + >>> mx.nd.zeros(1).asnumpy() + array([ 0.], dtype=float32) + >>> mx.nd.zeros((1,2), mx.gpu(0)) + + >>> mx.nd.zeros((1,2), mx.gpu(0), 'float16').asnumpy() + array([[ 0., 0.]], dtype=float16) + """ + # pylint: disable= unused-argument + if ctx is None: + ctx = Context.default_ctx + dtype = mx_real_t if dtype is None else dtype + # pylint: disable= no-member, protected-access + return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs) + # pylint: enable= no-member, protected-access + +def _empty_ndarray(shape, ctx=None, dtype=None): + """Returns a new array of given shape and type, without initializing entries. + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + ctx : Context, optional + An optional device context (default is the current default context). + dtype : str or numpy.dtype, optional + An optional value type (default is `float32`). + + Returns + ------- + NDArray + A created array. + + """ + if isinstance(shape, int): + shape = (shape, ) + if ctx is None: + ctx = Context.default_ctx + if dtype is None: + dtype = mx_real_t + return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype)) diff --git a/python/mxnet/ndarray/op.py b/python/mxnet/ndarray/op.py new file mode 100644 index 000000000000..7580362c0cc1 --- /dev/null +++ b/python/mxnet/ndarray/op.py @@ -0,0 +1,205 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Register backend ops in mxnet.ndarray namespace""" + +import sys as _sys +import os as _os +import ctypes +import numpy as np # pylint: disable=unused-import + +from ..ndarray_doc import _build_doc + +# Use different version of SymbolBase +# When possible, use cython to speedup part of computation. 
+# pylint: disable=unused-import +try: + if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: + from .._ctypes.ndarray import NDArrayBase, _set_ndarray_class, _STORAGE_TYPE_ID_TO_STR + from .._ctypes.ndarray import CachedOp, _imperative_invoke + elif _sys.version_info >= (3, 0): + from .._cy3.ndarray import NDArrayBase, _set_ndarray_class,\ + _imperative_invoke, _STORAGE_TYPE_ID_TO_STR + from .._cy3.ndarray import CachedOp, _imperative_invoke + else: + from .._cy2.ndarray import NDArrayBase, _set_ndarray_class,\ + _imperative_invoke, _STORAGE_TYPE_ID_TO_STR + from .._cy2.ndarray import CachedOp, _imperative_invoke +except ImportError: + if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: + raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") + from .._ctypes.ndarray import NDArrayBase, _set_ndarray_class,\ + _imperative_invoke, _STORAGE_TYPE_ID_TO_STR + from .._ctypes.ndarray import CachedOp, _imperative_invoke + +from ..base import mx_uint, check_call, _LIB, py_str, OpHandle, c_str, _Null +# pylint: enable=unused-import + + +# pylint: disable=too-many-locals, invalid-name +def _make_ndarray_function(handle, name): + """Create a NDArray function from the FunctionHandle.""" + real_name = ctypes.c_char_p() + desc = ctypes.c_char_p() + num_args = mx_uint() + arg_names = ctypes.POINTER(ctypes.c_char_p)() + arg_types = ctypes.POINTER(ctypes.c_char_p)() + arg_descs = ctypes.POINTER(ctypes.c_char_p)() + key_var_num_args = ctypes.c_char_p() + ret_type = ctypes.c_char_p() + + check_call(_LIB.MXSymbolGetAtomicSymbolInfo( + handle, ctypes.byref(real_name), ctypes.byref(desc), + ctypes.byref(num_args), + ctypes.byref(arg_names), + ctypes.byref(arg_types), + ctypes.byref(arg_descs), + ctypes.byref(key_var_num_args), + ctypes.byref(ret_type))) + narg = int(num_args.value) + arg_names = [py_str(arg_names[i]) for i in range(narg)] + arg_types = [py_str(arg_types[i]) for i in range(narg)] + func_name = name + key_var_num_args = py_str(key_var_num_args.value) + ret_type = py_str(ret_type.value) if ret_type.value is not None else '' + doc_str = _build_doc(func_name, + py_str(desc.value), + arg_names, + arg_types, + [py_str(arg_descs[i]) for i in range(narg)], + key_var_num_args, + ret_type) + + dtype_name = None + arr_name = None + ndsignature = [] + signature = [] + ndarg_names = [] + kwarg_names = [] + for i in range(narg): + name, atype = arg_names[i], arg_types[i] + if name == 'dtype': + dtype_name = name + signature.append('%s=_Null'%name) + elif atype.startswith('NDArray') or atype.startswith('Symbol'): + assert not arr_name, \ + "Op can only have one argument with variable " \ + "size and it must be the last argument." 
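Editor's note: the string-built wrapper assembled by this function can be hard to picture from the template fragments alone. The following hand-written sketch shows roughly what the generator emits for a hypothetical two-input op; the name `elemwise_add` and the handle value `1234` are made up for illustration and are not produced verbatim by this patch.

    def elemwise_add(lhs=None, rhs=None, out=None, name=None, **kwargs):
        # generated body: collect NDArray inputs, then dispatch through the C API
        ndargs = []
        keys = list(kwargs.keys())
        vals = list(kwargs.values())
        if lhs is not None:
            assert isinstance(lhs, NDArrayBase), \
                "Argument lhs must have NDArray type, but got %s" % str(lhs)
            ndargs.append(lhs)
        if rhs is not None:
            assert isinstance(rhs, NDArrayBase), \
                "Argument rhs must have NDArray type, but got %s" % str(rhs)
            ndargs.append(rhs)
        # 1234 stands in for the op handle captured when the wrapper was generated
        return _imperative_invoke(1234, ndargs, keys, vals, out)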
+ if atype.endswith('[]'): + ndsignature.append('*%s'%name) + arr_name = name + else: + ndsignature.append('%s=None'%name) + ndarg_names.append(name) + else: + signature.append('%s=_Null'%name) + kwarg_names.append(name) + signature.append('out=None') + signature.append('name=None') + signature.append('**kwargs') + signature = ndsignature + signature + + code = [] + if arr_name: + code.append(""" +def %s(*%s, **kwargs):"""%(func_name, arr_name)) + code.append(""" + ndargs = [] + for i in {}: + assert isinstance(i, NDArrayBase), \\ + "Positional arguments must have NDArray type, " \\ + "but got %s"%str(i) + ndargs.append(i)""".format(arr_name)) + if dtype_name is not None: + code.append(""" + if '%s' in kwargs: + kwargs['%s'] = np.dtype(kwargs['%s']).name"""%( + dtype_name, dtype_name, dtype_name)) + code.append(""" + _ = kwargs.pop('name', None) + out = kwargs.pop('out', None) + keys = list(kwargs.keys()) + vals = list(kwargs.values())""") + else: + code.append(""" +def %s(%s): + ndargs = [] + keys = list(kwargs.keys()) + vals = list(kwargs.values())"""%(func_name, ', '.join(signature))) + # NDArray args + for name in ndarg_names: # pylint: disable=redefined-argument-from-local + code.append(""" + if {name} is not None: + assert isinstance({name}, NDArrayBase), \\ + "Argument {name} must have NDArray type, but got %s"%str({name}) + ndargs.append({name})""".format(name=name)) + # kwargs + for name in kwarg_names: # pylint: disable=redefined-argument-from-local + code.append(""" + if %s is not _Null: + keys.append('%s') + vals.append(%s)"""%(name, name, name)) + # dtype + if dtype_name is not None: + code.append(""" + if %s is not _Null: + keys.append('%s') + vals.append(np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) + + code.append(""" + return _imperative_invoke(%d, ndargs, keys, vals, out)"""%( + handle.value)) + + local = {} + exec(''.join(code), None, local) # pylint: disable=exec-used + ndarray_function = local[func_name] + ndarray_function.__name__ = func_name + ndarray_function.__doc__ = doc_str + ndarray_function.__module__ = 'mxnet.ndarray' + return ndarray_function + + +# pylint: enable=too-many-locals, invalid-name +def _init_ndarray_module(root_namespace): + """List and add all the ndarray functions to current module.""" + plist = ctypes.POINTER(ctypes.c_char_p)() + size = ctypes.c_uint() + + check_call(_LIB.MXListAllOpNames(ctypes.byref(size), + ctypes.byref(plist))) + op_names = [] + for i in range(size.value): + op_names.append(py_str(plist[i])) + + module_obj = _sys.modules["%s.ndarray" % root_namespace] + module_internal = _sys.modules["%s.ndarray._internal" % root_namespace] + module_contrib = _sys.modules["%s.contrib.ndarray" % root_namespace] + for name in op_names: + hdl = OpHandle() + check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) + function = _make_ndarray_function(hdl, name) + if function.__name__.startswith('_contrib_'): + function.__name__ = function.__name__[9:] + function.__module__ = 'mxnet.contrib.ndarray' + setattr(module_contrib, function.__name__, function) + elif function.__name__.startswith('_'): + setattr(module_internal, function.__name__, function) + else: + setattr(module_obj, function.__name__, function) + +# register backend operators in mx.nd +_init_ndarray_module("mxnet") diff --git a/python/mxnet/ndarray/sparse_ndarray.py b/python/mxnet/ndarray/sparse_ndarray.py new file mode 100644 index 000000000000..4259fe170121 --- /dev/null +++ b/python/mxnet/ndarray/sparse_ndarray.py @@ -0,0 +1,906 @@ +# Licensed to the Apache 
Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +"""SparseNDArray API of mxnet.""" +from __future__ import absolute_import +from __future__ import division +try: + from __builtin__ import slice as py_slice +except ImportError: + from builtins import slice as py_slice + +import ctypes +import warnings + +import os as _os +import sys as _sys + +# import operator +import numpy as np +from ..base import NotSupportedForSparseNDArray +from ..base import _LIB, numeric_types +from ..base import c_array, mx_real_t +from ..base import mx_uint, NDArrayHandle, check_call +from ..context import Context +from . import _internal +from .ndarray import _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP +from .ndarray import _STORAGE_TYPE_STR_TO_ID +from .ndarray import NDArray, _storage_type, _zeros_ndarray, _array +from . import cast_storage +from . import slice as nd_slice + +# Use different verison of SymbolBase +# When possible, use cython to speedup part of computation. +# pylint: disable=unused-import +try: + if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: + from .._ctypes.ndarray import NDArrayBase, _set_ndarray_class + elif _sys.version_info >= (3, 0): + from .._cy3.ndarray import NDArrayBase, _set_ndarray_class + else: + from .._cy2.ndarray import NDArrayBase, _set_ndarray_class +except ImportError: + if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: + raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") + from .._ctypes.ndarray import NDArrayBase, _set_ndarray_class + +# pylint: enable=unused-import +_STORAGE_AUX_TYPES = { + 'row_sparse': [np.int64], + 'csr': [np.int64, np.int64] +} + + +def _new_alloc_handle(stype, shape, ctx, delay_alloc, dtype, aux_types, aux_shapes=None): + """Return a new handle with specified storage type, shape, dtype and context. + + Empty handle is only used to hold results + + Returns + ------- + handle + A new empty ndarray handle + """ + hdl = NDArrayHandle() + aux_type_ids = [int(_DTYPE_NP_TO_MX[np.dtype(aux_t).type]) for aux_t in aux_types] + aux_shapes = [(0,) for aux_t in aux_types] if aux_shapes is None else aux_shapes + aux_shape_lens = [len(aux_shape) for aux_shape in aux_shapes] + aux_shapes = sum(aux_shapes, ()) + num_aux = mx_uint(len(aux_types)) + check_call(_LIB.MXNDArrayCreateSparseEx( + ctypes.c_int(int(_STORAGE_TYPE_STR_TO_ID[stype])), + c_array(mx_uint, shape), + mx_uint(len(shape)), + ctypes.c_int(ctx.device_typeid), + ctypes.c_int(ctx.device_id), + ctypes.c_int(int(delay_alloc)), + ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])), + num_aux, + c_array(ctypes.c_int, aux_type_ids), + c_array(mx_uint, aux_shape_lens), + c_array(mx_uint, aux_shapes), + ctypes.byref(hdl))) + return hdl + +class BaseSparseNDArray(NDArray): + """The base class of an NDArray stored in a sparse storage format. 
+ + See CSRNDArray and RowSparseNDArray for more details. + """ + + def __iadd__(self, other): + raise NotImplementedError() + + def __isub__(self, other): + raise NotImplementedError() + + def __imul__(self, other): + raise NotImplementedError() + + def __idiv__(self, other): + raise NotImplementedError() + + def __itruediv__(self, other): + raise NotImplementedError() + + def _sync_copyfrom(self, source_array): + raise NotImplementedError() + + def _at(self, idx): + raise NotSupportedForSparseNDArray(self._at, '[idx]', idx) + + def _slice(self, start, stop): + raise NotSupportedForSparseNDArray(self._slice, None, start, stop) + + def reshape(self, shape): + raise NotSupportedForSparseNDArray(self.reshape, None, shape) + + def _aux_type(self, i): + """Data-type of the array's ith aux data. + + Returns + ------- + numpy.dtype + This BaseSparseNDArray's aux data type. + """ + aux_type = ctypes.c_int() + check_call(_LIB.MXNDArrayGetAuxType(self.handle, i, ctypes.byref(aux_type))) + return _DTYPE_MX_TO_NP[aux_type.value] + + @property + def _num_aux(self): + """The number of aux data used to help store the sparse ndarray. + """ + return len(_STORAGE_AUX_TYPES[self.stype]) + + @property + def _aux_types(self): + """The data types of the aux data for the BaseSparseNDArray. + """ + aux_types = [] + num_aux = self._num_aux + for i in range(num_aux): + aux_types.append(self._aux_type(i)) + return aux_types + + def asnumpy(self): + """Return a dense ``numpy.ndarray`` object with value copied from this array + """ + return self.tostype('default').asnumpy() + + def astype(self, dtype): + """Returns a copy of the array after casting to a specified type. + Parameters + ---------- + dtype : numpy.dtype or str + The type of the returned array. + Examples + -------- + >>> x = mx.nd.zeros('row_sparse', (2,3), dtype='float32') + >>> y = x.astype('int32') + >>> y.dtype + + """ + res = _zeros_sparse_ndarray(shape=self.shape, ctx=self.context, + dtype=dtype, stype=self.stype) + self.copyto(res) + return res + + def copyto(self, other): + """Copies the value of this array to another array. + + Parameters + ---------- + other : NDArray or CSRNDArray or RowSparseNDArray or Context + The destination array or context. + + Returns + ------- + NDArray or CSRNDArray or RowSparseNDArray + The copied array. + """ + if isinstance(other, NDArray): + if other.handle is self.handle: + warnings.warn('You are attempting to copy an array to itself', RuntimeWarning) + return + return _internal._copyto(self, out=other) + elif isinstance(other, Context): + hret = _ndarray_cls(_new_alloc_handle(self.stype, self.shape, other, + True, self.dtype, self._aux_types)) + return _internal._copyto(self, out=hret) + else: + raise TypeError('copyto does not support type ' + str(type(other))) + + def _data(self): + """A deep copy NDArray of the data array associated with the BaseSparseNDArray. + + This function blocks. Do not use it in performance critical code. + """ + self.wait_to_read() + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayGetDataNDArray(self.handle, ctypes.byref(hdl))) + return NDArray(hdl) + + + def _aux_data(self, i): + """ Get a deep copy NDArray of the i-th aux data array associated with the + BaseSparseNDArray. + + This function blocks. Do not use it in performance critical code. 
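Editor's note: a minimal sketch of how the value array and the aux arrays line up for the csr storage type (illustrative only; it relies on the `_data`/`_aux_data` helpers defined here and on `tostype`, and the values in the comments are approximate):

    import mxnet as mx
    a = mx.nd.array([[0, 1], [2, 0]]).tostype('csr')
    a._data().asnumpy()       # non-zero values, e.g. [ 1.  2.]
    a._aux_data(0).asnumpy()  # indptr  (int64), e.g. [0 1 2]
    a._aux_data(1).asnumpy()  # column indices (int64), e.g. [1 0]
    a._aux_types              # [int64, int64] for 'csr', per _STORAGE_AUX_TYPES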
+ """ + self.wait_to_read() + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayGetAuxNDArray(self.handle, i, ctypes.byref(hdl))) + return NDArray(hdl) + + +# pylint: disable=abstract-method +class CSRNDArray(BaseSparseNDArray): + """A sparse representation of 2D NDArray in the standard CSR format. + + A CSRNDArray represents an NDArray as three separate arrays: `data`, + `indptr` and `indices`. It uses the standard CSR representation where the column indices for + row i are stored in indices[indptr[i]:indptr[i+1]] and their corresponding values are stored + in values[indptr[i]:indptr[i+1]]. + + Example + ------- + >>> a = mx.nd.array([[0, 1, 0], [2, 0, 0], [0, 0, 0], [0, 0, 3]]) + >>> a = a.tostype('csr') + >>> a.indices.asnumpy() + array([1, 0, 2]) + >>> a.indptr.asnumpy() + array([0, 1, 2, 2, 3]) + >>> a.data.asnumpy() + array([ 1., 2., 3.], dtype=float32) + """ + + def __reduce__(self): + return CSRNDArray, (None,), super(CSRNDArray, self).__getstate__() + + def __iadd__(self, other): + (self + other).copyto(self) + return self + + def __isub__(self, other): + (self - other).copyto(self) + return self + + def __imul__(self, other): + (self * other).copyto(self) + return self + + def __idiv__(self, other): + (self / other).copyto(self) + return self + + def __itruediv__(self, other): + (self / other).copyto(self) + return self + + def __getitem__(self, key): + """x.__getitem__(i) <=> x[i] + + Returns a sliced view of this array. + + Parameters + ---------- + key : slice + Indexing key. + + Examples + -------- + >>> indptr = np.array([0, 2, 3, 6]) + >>> indices = np.array([0, 2, 2, 0, 1, 2]) + >>> data = np.array([1, 2, 3, 4, 5, 6]) + >>> a = mx.nd.csr_matrix(data, indptr, indices, (3, 3)) + >>> a.asnumpy() + array([[1, 0, 2], + [0, 0, 3], + [4, 5, 6]]) + >>> a[1:2].asnumpy() + array([[0, 0, 3]], dtype=float32) + """ + if isinstance(key, int): + raise ValueError("__getitem__ with int key is not implemented for CSRNDArray") + if isinstance(key, py_slice): + if key.step is not None: + raise ValueError('CSRNDArray only supports continuous slicing on axis 0') + if key.start is not None or key.stop is not None: + begin = key.start if key.start else 0 + end = key.stop if key.stop else self.shape[0] + return nd_slice(self, begin=begin, end=end) + else: + return self + if isinstance(key, tuple): + raise ValueError('Multi-dimension indexing is not supported') + + def __setitem__(self, key, value): + """x.__setitem__(i, y) <=> x[i]=y + + Set self[key] to value. Only slice key [:] is supported. + + Parameters + ---------- + key : slice + The indexing key. + value : NDArray or CSRNDArray or numpy.ndarray + The value to set. 
+ + Examples + -------- + >>> src = mx.nd.zeros((3,3), stype='csr') + >>> src.asnumpy() + array([[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]], dtype=float32) + >>> # assign CSRNDArray with same storage type + >>> x = mx.nd.ones('row_sparse', (3,3)).tostype('csr') + >>> x[:] = src + >>> x.asnumpy() + array([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]], dtype=float32) + >>> # assign NDArray to CSRNDArray + >>> x[:] = mx.nd.ones((3,3)) * 2 + >>> x.asnumpy() + array([[ 2., 2., 2.], + [ 2., 2., 2.], + [ 2., 2., 2.]], dtype=float32) + """ + if not self.writable: + raise ValueError('Failed to assign to a readonly CSRNDArray') + if isinstance(key, py_slice): + if key.step is not None or key.start is not None or key.stop is not None: + raise ValueError('Assignment with slice for CSRNDArray is not ' \ + 'implmented yet.') + if isinstance(value, NDArray): + # avoid copying to itself + if value.handle is not self.handle: + value.copyto(self) + elif isinstance(value, numeric_types): + raise ValueError("Assigning numeric types to CSRNDArray is " \ + "not implemented yet.") + elif isinstance(value, (np.ndarray, np.generic)): + # TODO(haibin/anisub) check scipy.sparse and use _sync_copy_from to + # avoid the temporary copy + warnings.warn('Assigning non-NDArray object to CSRNDArray is not efficient', + RuntimeWarning) + tmp = _array(value) + tmp.copyto(self) + else: + raise TypeError('type %s not supported' % str(type(value))) + else: + assert(isinstance(key, (int, tuple))) + raise Exception('CSRNDArray only supports [:] for assignment') + + @property + def indices(self): + """A deep copy NDArray of the indices array of the CSRNDArray. + This generates a deep copy of the column indices of the current `csr` matrix. + + Returns + ------- + NDArray + This CSRNDArray's indices array. + """ + return self._aux_data(1) + + @property + def indptr(self): + """A deep copy NDArray of the indptr array of the CSRNDArray. + This generates a deep copy of the `indptr` of the current `csr` matrix. + + Returns + ------- + NDArray + This CSRNDArray's indptr array. + """ + return self._aux_data(0) + + @property + def data(self): + """A deep copy NDArray of the data array of the CSRNDArray. + This generates a deep copy of the `data` of the current `csr` matrix. + + Returns + ------- + NDArray + This CSRNDArray's data array. + """ + return self._data() + + def tostype(self, stype): + """Return a copy of the array with chosen storage type. + + Returns + ------- + NDArray or CSRNDArray + A copy of the array with the chosen storage stype + """ + if stype == 'row_sparse': + raise ValueError("cast_storage from csr to row_sparse is not supported") + return cast_storage(self, stype=stype) + + def copyto(self, other): + """Copies the value of this array to another array. + + If ``other`` is a ``NDArray`` or ``CSRNDArray`` object, then ``other.shape`` and + ``self.shape`` should be the same. This function copies the value from + ``self`` to ``other``. + + If ``other`` is a context, a new ``CSRNDArray`` will be first created on + the target context, and the value of ``self`` is copied. + + Parameters + ---------- + other : NDArray or CSRNDArray or Context + The destination array or context. + + Returns + ------- + NDArray or CSRNDArray + The copied array. If ``other`` is an ``NDArray`` or ``CSRNDArray``, then the return + value and ``other`` will point to the same ``NDArray`` or ``CSRNDArray``. 
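Editor's note: an end-to-end usage sketch for the CSRNDArray methods documented in this class (illustrative; it assumes the `csr_matrix` constructor defined later in this file, and the values in the comments are approximate):

    import mxnet as mx
    # diag(1, 2, 3) in CSR form: data, indptr, indices, shape
    a = mx.nd.csr_matrix([1, 2, 3], [0, 1, 2, 3], [0, 1, 2], (3, 3))
    a.data.asnumpy()          # [ 1.  2.  3.]
    a.indptr.asnumpy()        # [0 1 2 3]
    a.indices.asnumpy()       # [0 1 2]
    b = a[1:3]                # contiguous slicing along axis 0 stays csr
    d = a.tostype('default')  # cast back to dense storage
    c = a.copyto(mx.cpu(0))   # a Context target allocates a fresh CSRNDArray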
+ """ + if isinstance(other, Context): + return super(CSRNDArray, self).copyto(other) + elif isinstance(other, NDArray): + stype = other.stype + if stype == 'default' or stype == 'csr': + return super(CSRNDArray, self).copyto(other) + else: + raise TypeError('copyto does not support destination NDArray stype ' + str(stype)) + else: + raise TypeError('copyto does not support type ' + str(type(other))) + +# pylint: disable=abstract-method +class RowSparseNDArray(BaseSparseNDArray): + """A sparse representation of a set of NDArray row slices at given indices. + + A RowSparseNDArray represents a multidimensional NDArray using two separate arrays: `data` and + `indices`. + + - data: an NDArray of any dtype with shape [D0, D1, ..., Dn]. + - indices: a 1-D int64 NDArray with shape [D0]. + + The `indices` stores the indices of the row slices with non-zeros, + while the values are stored in `data`. The corresponding NDArray ``dense`` + represented by RowSparseNDArray ``rsp`` has + + ``dense[rsp.indices[i], :, :, :, ...] = rsp.data[i, :, :, :, ...]`` + + >>> dense.asnumpy() + array([[ 1., 2., 3.], + [ 0., 0., 0.], + [ 4., 0., 5.], + [ 0., 0., 0.], + [ 0., 0., 0.]], dtype=float32) + >>> rsp = dense.tostype('row_sparse') + >>> rsp.indices.asnumpy() + array([0, 2], dtype=int64) + >>> rsp.data.asnumpy() + array([[ 1., 2., 3.], + [ 4., 0., 5.]], dtype=float32) + + A RowSparseNDArray is typically used to represent non-zero row-slices of a large NDArray + of shape [LARGE0, D1, .. , Dn] where LARGE0 >> D0 and most row slices are zeros. + + The indices are expected to be sorted in ascending order. + + RowSparseNDArray is used principally in the definition of gradients for operations + that have sparse gradients (e.g. sparse dot and sparse embedding). + """ + def __reduce__(self): + return RowSparseNDArray, (None,), super(RowSparseNDArray, self).__getstate__() + + def __iadd__(self, other): + (self + other).copyto(self) + return self + + def __isub__(self, other): + (self - other).copyto(self) + return self + + def __imul__(self, other): + (self * other).copyto(self) + return self + + def __idiv__(self, other): + (self / other).copyto(self) + return self + + def __itruediv__(self, other): + (self / other).copyto(self) + return self + + def __getitem__(self, key): + """x.__getitem__(i) <=> x[i] + + Returns a sliced view of this array. + + Parameters + ---------- + key : slice + Indexing key. + + Examples + -------- + >>> x = mx.nd.zeros((2, 3), stype='row_sparse') + >>> x[:].asnumpy() + array([[ 0., 0., 0.], + [ 0., 0., 0.]], dtype=float32) + """ + if isinstance(key, int): + raise Exception("__getitem__ with int key is not implemented for RowSparseNDArray yet") + if isinstance(key, py_slice): + if key.step is not None or key.start is not None or key.stop is not None: + raise Exception('RowSparseNDArray only supports [:] for __getitem__') + else: + return self + if isinstance(key, tuple): + raise ValueError('Multi-dimension indexing is not supported') + + def __setitem__(self, key, value): + """x.__setitem__(i, y) <=> x[i]=y + + Set self[key] to value. Only slice key [:] is supported. + + Parameters + ---------- + key : slice + The indexing key. + value : NDArray or numpy.ndarray + The value to set. 
+ + Examples + -------- + >>> src = mx.nd.row_sparse([[1, 0, 2], [4, 5, 6]], [0, 2], (3,3)) + >>> src.asnumpy() + array([[ 1., 0., 2.], + [ 0., 0., 0.], + [ 4., 5., 6.]], dtype=float32) + >>> # assign RowSparseNDArray with same storage type + >>> x = mx.nd.zeros('row_sparse', (3,3)) + >>> x[:] = src + >>> x.asnumpy() + array([[ 1., 0., 2.], + [ 0., 0., 0.], + [ 4., 5., 6.]], dtype=float32) + >>> # assign NDArray to RowSparseNDArray + >>> x[:] = mx.nd.ones((3,3)) + >>> x.asnumpy() + array([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]], dtype=float32) + """ + if not self.writable: + raise ValueError('Failed to assign to a readonly RowSparseNDArray') + if isinstance(key, py_slice): + if key.step is not None or key.start is not None or key.stop is not None: + raise ValueError('Assignment with slice for RowSparseNDArray ' \ + 'is not implmented yet.') + if isinstance(value, NDArray): + # avoid copying to itself + if value.handle is not self.handle: + value.copyto(self) + elif isinstance(value, numeric_types): + raise ValueError("Assigning numeric types to RowSparseNDArray " \ + "is not implemented yet.") + elif isinstance(value, (np.ndarray, np.generic)): + warnings.warn('Assigning non-NDArray object to RowSparseNDArray is not efficient', + RuntimeWarning) + tmp = _array(value) + tmp.copyto(self) + else: + raise TypeError('type %s not supported' % str(type(value))) + else: + assert(isinstance(key, (int, tuple))) + raise TypeError('RowSparseNDArray only supports [:] for assignment') + + @property + def indices(self): + """A deep copy NDArray of the indices array of the RowSparseNDArray. + This generates a deep copy of the row indices of the current `row_sparse` matrix. + + Returns + ------- + NDArray + This RowSparseNDArray's indices array. + """ + return self._aux_data(0) + + @property + def data(self): + """A deep copy NDArray of the data array of the RowSparseNDArray. + This generates a deep copy of the `data` of the current `row_sparse` matrix. + + Returns + ------- + NDArray + This RowSparseNDArray's data array. + """ + return self._data() + + def tostype(self, stype): + """Return a copy of the array with chosen storage type. + + Returns + ------- + NDArray or RowSparseNDArray + A copy of the array with the chosen storage stype + """ + if stype == 'csr': + raise ValueError("cast_storage from row_sparse to csr is not supported") + return cast_storage(self, stype=stype) + + def copyto(self, other): + """Copies the value of this array to another array. + + If ``other`` is a ``NDArray`` or ``RowSparseNDArray`` object, then ``other.shape`` + and ``self.shape`` should be the same. This function copies the value from + ``self`` to ``other``. + + If ``other`` is a context, a new ``RowSparseNDArray`` will be first created on + the target context, and the value of ``self`` is copied. + + Parameters + ---------- + other : NDArray or RowSparseNDArray or Context + The destination array or context. + + Returns + ------- + NDArray or RowSparseNDArray + The copied array. If ``other`` is an ``NDArray`` or ``RowSparseNDArray``, then the + return value and ``other`` will point to the same ``NDArray`` or ``RowSparseNDArray``. 
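Editor's note: a matching sketch for RowSparseNDArray (illustrative; it assumes the `row_sparse_array` constructor defined later in this file, and the values in the comments are approximate):

    import mxnet as mx
    # two non-zero row slices stored at rows 1 and 4 of a 6x2 array
    rsp = mx.nd.row_sparse_array([[1, 2], [3, 4]], [1, 4], (6, 2))
    rsp.indices.asnumpy()         # [1 4]
    rsp.data.asnumpy()            # [[ 1.  2.] [ 3.  4.]]
    dns = rsp.tostype('default')  # dense 6x2; rsp.tostype('csr') raises ValueError
    rsp[:] = mx.nd.ones((6, 2))   # only whole-array assignment is supported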
+ """ + if isinstance(other, Context): + return super(RowSparseNDArray, self).copyto(other) + elif isinstance(other, NDArray): + stype = other.stype + if stype == 'default' or stype == 'row_sparse': + return super(RowSparseNDArray, self).copyto(other) + else: + raise TypeError('copyto does not support destination NDArray stype ' + str(stype)) + else: + raise TypeError('copyto does not support type ' + str(type(other))) + +def _prepare_src_array(src, dtype, default_dtype): + """Prepare `src` and its dtype so that they can be used to construct NDArray. + `src` is converted to a `np.ndarray` if it's neither an `NDArray` nor an `np.ndarray`. + """ + if isinstance(src, NDArray): + dtype = src.dtype if dtype is None else dtype + else: + dtype = default_dtype if dtype is None else dtype + if not isinstance(src, np.ndarray): + try: + src = np.array(src, dtype=dtype) + except: + raise TypeError('values must be array like object') + return src, dtype + + +def csr_matrix(data, indptr, indices, shape, ctx=None, dtype=None, indptr_type=None, + indices_type=None): + """Creates a 2D array with compressed sparse row(CSR) format. + + Parameters + ---------- + data: array_like + An object exposing the array interface, with shape [nnz], where D0 is the number of + non-zero entries. + indptr: array_like + An object exposing the array interface, with shape [D0 + 1]. The first element in indptr + should always be zero. + indices: array_like + An object exposing the array interface, with shape [nnz]. + ctx: Context, optional + Device context (default is the current default context). + dtype: str or numpy.dtype, optional + The data type of the output array. The default dtype is ``values.dtype`` + if `values` is an `NDArray`, `float32` otherwise. + indptr_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indptr.dtype`` + if `indptr` is an `NDArray`, `int64` otherwise. + indices_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indices.dtype`` + if `indicies` is an `NDArray`, `int64` otherwise. + + Returns + ------- + CSRNDArray + A `CSRNDArray` with the `csr` storage representation. + + Example + ------- + >>> import mxnet as mx + >>> a = mx.nd.csr_matrix([1, 2, 3], [0, 1, 2, 2, 3], [1, 0, 2], (4, 3)) + >>> a.asnumpy() + array([[ 0., 1., 0.], + [ 2., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 3.]], dtype=float32) + """ + storage_type = 'csr' + # context + if ctx is None: + ctx = Context.default_ctx + # prepare src array and types + data, dtype = _prepare_src_array(data, dtype, mx_real_t) + indptr, indptr_type = _prepare_src_array(indptr, indptr_type, + _STORAGE_AUX_TYPES[storage_type][0]) + indices, indices_type = _prepare_src_array(indices, indices_type, + _STORAGE_AUX_TYPES[storage_type][1]) + # verify types + assert('int64' in str(indptr_type)), "expected int64 for indptr" + assert('int64' in str(indices_type)), "expected int64 for indices" + # verify shapes + aux_shapes = [indptr.shape, indices.shape] + assert(data.ndim == 1) + assert(indptr.ndim == 1) + assert(indices.ndim == 1) + assert(len(shape) == 2) + result = CSRNDArray(_new_alloc_handle(storage_type, shape, ctx, False, dtype, + [indptr_type, indices_type], aux_shapes)) + # TODO(junwu): Convert data, indptr, and indices to mxnet NDArrays + # if they are not for now. 
In the future, we should provide a c-api + # to accept np.ndarray types to copy from to result.data and aux_data + if not isinstance(data, NDArray): + data = _array(data, ctx, dtype) + if not isinstance(indptr, NDArray): + indptr = _array(indptr, ctx, indptr_type) + if not isinstance(indices, NDArray): + indices = _array(indices, ctx, indices_type) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, data.handle, ctypes.c_int(-1))) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indptr.handle, ctypes.c_int(0))) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indices.handle, ctypes.c_int(1))) + return result + + +def row_sparse_array(data, indices, shape, ctx=None, dtype=None, indices_type=None): + """Creates a multidimensional row sparse array with a set of tensor slices at given indices. + + Parameters + ---------- + data: array_like + An object exposing the array interface, with shape [D0, D1, .. DK], where D0 is + the number of rows with non-zeros entries. + indices: array_like + An object exposing the array interface, with shape [D0]. + ctx : Context, optional + Device context (default is the current default context). + dtype : str or numpy.dtype, optional + The data type of the output array. The default dtype is ``data.dtype`` + if `data` is an `NDArray`, `float32` otherwise. + indices_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indices.dtype`` + if `indicies` is an `NDArray`, `int64` otherwise. + + Returns + ------- + RowSparseNDArray + An `RowSparseNDArray` with the `row_sparse` storage representation. + + Example + ------- + >>> a = mx.nd.row_sparse_array([[1, 2], [3, 4]], [1, 4], (6, 2)) + >>> a.asnumpy() + array([[ 0., 0.], + [ 1., 2.], + [ 0., 0.], + [ 0., 0.], + [ 3., 4.], + [ 0., 0.]], dtype=float32) + """ + storage_type = 'row_sparse' + # context + if ctx is None: + ctx = Context.default_ctx + # prepare src array and types + data, dtype = _prepare_src_array(data, dtype, mx_real_t) + indices, indices_type = _prepare_src_array(indices, indices_type, + _STORAGE_AUX_TYPES[storage_type][0]) + # verify types + assert('int64' in str(indices_type)), "expected int64 for indices" + # verify shapes + assert(data.ndim == len(shape)) + assert(indices.ndim == 1) + result = RowSparseNDArray(_new_alloc_handle(storage_type, shape, ctx, False, dtype, + [indices_type], [indices.shape])) + + # TODO(junwu): Convert data, indptr, and indices to mxnet NDArrays + # if they are not for now. 
In the future, we should provide a c-api + # to accept np.ndarray types to copy from to result.data and aux_data + if not isinstance(data, NDArray): + data = _array(data, ctx, dtype) + if not isinstance(indices, NDArray): + indices = _array(indices, ctx, indices_type) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, data.handle, ctypes.c_int(-1))) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indices.handle, ctypes.c_int(0))) + return result + +def _ndarray_cls(handle, writable=True, stype=None): + if stype is None: + stype = _storage_type(handle) + if stype == 'default': + return NDArray(handle, writable=writable) + elif stype == 'csr': + return CSRNDArray(handle, writable=writable) + elif stype == 'row_sparse': + return RowSparseNDArray(handle, writable=writable) + else: + raise Exception("unknown storage type") + + +_set_ndarray_class(_ndarray_cls) + + +def _zeros_sparse_ndarray(stype, shape, ctx=None, dtype=None, aux_types=None, **kwargs): + """Return a new array of given shape and type, filled with zeros. + + Parameters + ---------- + stype: string + The storage type of the empty array, such as 'row_sparse', 'csr', etc + shape : int or tuple of int + The shape of the empty array + ctx : Context, optional + An optional device context (default is the current default context) + dtype : str or numpy.dtype, optional + An optional value type (default is `float32`) + aux_types: list of numpy.dtype, optional + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray + (default values depends on the storage type) + + Returns + ------- + RowSparseNDArray or CSRNDArray + A created array + Examples + -------- + >>> mx.nd.zeros((1,2), mx.cpu(), stype='csr') + + >>> mx.nd.zeros((1,2), mx.cpu(), 'float16', stype='row_sparse').asnumpy() + array([[ 0., 0.]], dtype=float16) + """ + if stype == 'default': + return _zeros_ndarray(shape, ctx=ctx, dtype=dtype, **kwargs) + if ctx is None: + ctx = Context.default_ctx + dtype = mx_real_t if dtype is None else dtype + if aux_types is None: + if stype == 'row_sparse' or stype == 'csr': + aux_types = _STORAGE_AUX_TYPES[stype] + else: + raise Exception("unknown storage type") + assert(len(aux_types) == len(_STORAGE_AUX_TYPES[stype])) + out = _ndarray_cls(_new_alloc_handle(stype, shape, ctx, True, dtype, aux_types)) + return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, out=out, **kwargs) + +def _empty_sparse_ndarray(stype, shape, ctx=None, dtype=None, aux_types=None): + """Returns a new array of given shape and type, without initializing entries. + """ + if isinstance(shape, int): + shape = (shape, ) + if ctx is None: + ctx = Context.default_ctx + if dtype is None: + dtype = mx_real_t + assert(stype is not None) + if stype == 'csr' or stype == 'row_sparse': + return _zeros_sparse_ndarray(stype, shape, ctx=ctx, dtype=dtype, aux_types=aux_types) + else: + raise Exception("unknown stype : " + str(stype)) + +def _sparse_array(source_array, ctx=None, dtype=None, aux_types=None): + """Creates a sparse array from any object exposing the array interface. 
+ """ + if isinstance(source_array, NDArray): + assert(source_array.stype != 'default'), \ + "Please use `cast_storage` to create BaseSparseNDArray from an NDArray" + dtype = source_array.dtype if dtype is None else dtype + aux_types = source_array._aux_types if aux_types is None else aux_types + else: + # TODO(haibin/anisub) support creation from scipy object when `_sync_copy_from` is ready + raise NotImplementedError('creating BaseSparseNDArray from ' \ + ' a non-NDArray object is not implemented.') + arr = _empty_sparse_ndarray(source_array.stype, source_array.shape, ctx, dtype, aux_types) + arr[:] = source_array + return arr diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py new file mode 100644 index 000000000000..fa2cb5840f7e --- /dev/null +++ b/python/mxnet/ndarray/utils.py @@ -0,0 +1,232 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +"""Utility functions for NDArray and BaseSparseNDArray.""" +import ctypes + +from ..base import _LIB, check_call, py_str, c_str, string_types, mx_uint, NDArrayHandle, c_array +from .ndarray import NDArray, _zeros_ndarray, _empty_ndarray, _array +from .sparse_ndarray import _zeros_sparse_ndarray, _empty_sparse_ndarray, _sparse_array +from .sparse_ndarray import _ndarray_cls + + +def zeros(shape, ctx=None, dtype=None, stype=None, aux_types=None, **kwargs): + """Return a new array of given shape and type, filled with zeros. + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array + ctx : Context, optional + An optional device context (default is the current default context) + dtype : str or numpy.dtype, optional + An optional value type (default is `float32`) + stype: string, optional + The storage type of the empty array, such as 'row_sparse', 'csr', etc. + aux_types: list of numpy.dtype, optional + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray + (default values depend on the storage type) + + Returns + ------- + NDArray, CSRNDArray or RowSparseNDArray + A created array + Examples + -------- + >>> mx.nd.zeros((1,2), mx.cpu(), stype='csr') + + >>> mx.nd.zeros((1,2), mx.cpu(), 'float16', stype='row_sparse').asnumpy() + array([[ 0., 0.]], dtype=float16) + """ + + if stype is None or stype == 'default': + return _zeros_ndarray(shape, ctx, dtype, **kwargs) + else: + return _zeros_sparse_ndarray(stype, shape, ctx, dtype, aux_types, **kwargs) + +def empty(shape, ctx=None, dtype=None, stype=None, aux_types=None): + """Returns a new array of given shape and type, without initializing entries. + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + ctx : Context, optional + An optional device context (default is the current default context). 
+ dtype : str or numpy.dtype, optional + An optional value type (default is `float32`). + stype : str, optional + An optional storage type (default is `default`). + aux_types: list of numpy.dtype, optional + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray + (default values depend on the storage type) + + Returns + ------- + NDArray, CSRNDArray or RowSparseNDArray + A created array. + + Examples + -------- + >>> mx.nd.empty(1) + + >>> mx.nd.empty((1,2), mx.gpu(0)) + + >>> mx.nd.empty((1,2), mx.gpu(0), 'float16') + + >>> mx.nd.empty((1,2), stype='csr') + + """ + if stype is None or stype == 'default': + return _empty_ndarray(shape, ctx, dtype) + else: + return _empty_sparse_ndarray(stype, shape, ctx, dtype, aux_types) + +def array(source_array, ctx=None, dtype=None, aux_types=None): + """Creates an array from any object exposing the array interface. + + Parameters + ---------- + source_array : array_like + An object exposing the array interface, an object whose `__array__` + method returns an array, or any (nested) sequence. + ctx : Context, optional + Device context (default is the current default context). + dtype : str or numpy.dtype, optional + The data type of the output array. The default dtype is ``source_array.dtype`` + if `source_array` is an `NDArray`, `float32` otherwise. + aux_types: list of numpy.dtype, optional + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray + (default values depend on the storage type) + + Returns + ------- + NDArray, RowSparseNDArray or CSRNDArray + An array with the same contents as the `source_array`. + + Examples + -------- + >>> import numpy as np + >>> mx.nd.array([1, 2, 3]) + + >>> mx.nd.array([[1, 2], [3, 4]]) + + >>> mx.nd.array(np.zeros((3, 2))) + + >>> mx.nd.array(np.zeros((3, 2)), mx.gpu(0)) + + >>> mx.nd.array(mx.nd.zeros((3, 2), stype='row_sparse')) + + """ + # TODO(haibin/anisub) Check if input is scipy.sparse object with `scipy.sparse.issparse` + if isinstance(source_array, NDArray) and source_array.stype != 'default': + return _sparse_array(source_array, ctx=ctx, dtype=dtype, aux_types=aux_types) + else: + return _array(source_array, ctx=ctx, dtype=dtype) + +def load(fname): + """Loads an array from file. + + See more details in ``save``. + + Parameters + ---------- + fname : str + The filename. + + Returns + ------- + list of NDArray, RowSparseNDArray or CSRNDArray, or \ + dict of str to NDArray, RowSparseNDArray or CSRNDArray + Loaded data. + """ + if not isinstance(fname, string_types): + raise TypeError('fname required to be a string') + out_size = mx_uint() + out_name_size = mx_uint() + handles = ctypes.POINTER(NDArrayHandle)() + names = ctypes.POINTER(ctypes.c_char_p)() + check_call(_LIB.MXNDArrayLoad(c_str(fname), + ctypes.byref(out_size), + ctypes.byref(handles), + ctypes.byref(out_name_size), + ctypes.byref(names))) + if out_name_size.value == 0: + return [_ndarray_cls(NDArrayHandle(handles[i])) for i in range(out_size.value)] + else: + assert out_name_size.value == out_size.value + return dict( + (py_str(names[i]), _ndarray_cls(NDArrayHandle(handles[i]))) + for i in range(out_size.value)) + + +def save(fname, data): + """Saves a list of arrays or a dict of str->array to file. + + Examples of filenames: + + - ``/path/to/file`` + - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 supports) + - ``hdfs://path/to/file`` (if compiled with HDFS supports) + + Parameters + ---------- + fname : str + The filename. 
+ data : NDArray, RowSparseNDArray or CSRNDArray, \ + or list of NDArray, RowSparseNDArray or CSRNDArray, \ + or dict of str to NDArray, RowSparseNDArray or CSRNDArray + The data to save. + + Examples + -------- + >>> x = mx.nd.zeros((2,3)) + >>> y = mx.nd.ones((1,4)) + >>> mx.nd.save('my_list', [x,y]) + >>> mx.nd.save('my_dict', {'x':x, 'y':y}) + >>> mx.nd.load('my_list') + [<NDArray 2x3 @cpu(0)>, <NDArray 1x4 @cpu(0)>] + >>> mx.nd.load('my_dict') + {'y': <NDArray 1x4 @cpu(0)>, 'x': <NDArray 2x3 @cpu(0)>} + """ + if isinstance(data, NDArray): + data = [data] + handles = [] + if isinstance(data, dict): + keys = [] + for key, val in data.items(): + if not isinstance(key, string_types): + raise TypeError('save only accept dict str->NDArray or list of NDArray') + if not isinstance(val, NDArray): + raise TypeError('save only accept dict str->NDArray or list of NDArray') + keys.append(c_str(key)) + handles.append(val.handle) + keys = c_array(ctypes.c_char_p, keys) + elif isinstance(data, list): + for val in data: + if not isinstance(val, NDArray): + raise TypeError('save only accept dict str->NDArray or list of NDArray') + handles.append(val.handle) + keys = None + else: + raise ValueError("data needs to either be a NDArray, dict of str, NDArray pairs " + "or a list of NDArrays.") + check_call(_LIB.MXNDArraySave(c_str(fname), + mx_uint(len(handles)), + c_array(NDArrayHandle, handles), + keys)) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 1ef9cc845036..e7e283f88e43 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -339,8 +339,8 @@ class SGD(Optimizer): state = momentum * state + lr * rescale_grad * clip(grad, clip_gradient) + wd * weight weight = weight - state - For details of the update algorithm see :class:`~mxnet.ndarray.sgd_update` and - :class:`~mxnet.ndarray.sgd_mom_update`. + Sparse updating is supported. For details of the update algorithm see + :class:`~mxnet.ndarray.sgd_update` and :class:`~mxnet.ndarray.sgd_mom_update`. This optimizer accepts the following parameters in addition to those accepted by :class:`.Optimizer`.
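Editor's note: the optimizer hunks in this file only change how auxiliary state is allocated, so that it matches the weight's storage type. A hedged sketch of the resulting behaviour (the shapes are made up; whether the update runs a specialized sparse kernel depends on the registered sgd operators):

    import mxnet as mx
    opt = mx.optimizer.SGD(learning_rate=0.1, momentum=0.9)
    weight = mx.nd.zeros((5, 3), stype='row_sparse')
    grad = mx.nd.zeros((5, 3), stype='row_sparse')
    state = opt.create_state(0, weight)   # momentum buffer is now row_sparse too
    opt.update(0, weight, grad, state)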
@@ -367,7 +367,8 @@ def create_state(self, index, weight): if self.multi_precision and weight.dtype == numpy.float16: weight_master_copy = array(weight, ctx=weight.context, dtype=numpy.float32) if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=numpy.float32) + momentum = zeros(weight.shape, weight.context, dtype=numpy.float32, + stype=weight.stype) return (momentum, weight_master_copy) if weight.dtype == numpy.float16 and not self.multi_precision: warnings.warn("Accumulating with float16 in optimizer can lead to " @@ -375,7 +376,7 @@ def create_state(self, index, weight): "Consider using multi_precision=True option of the " "SGD optimizer") if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype) + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) return momentum def update(self, index, weight, grad, state): @@ -563,8 +564,10 @@ def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, self.epsilon = epsilon def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean - zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance + return (zeros(weight.shape, weight.context, dtype=weight.dtype, + stype=weight.stype), # mean + zeros(weight.shape, weight.context, dtype=weight.dtype, + stype=weight.stype)) # variance def update(self, index, weight, grad, state): assert(isinstance(weight, NDArray)) @@ -669,11 +672,11 @@ def __init__(self, learning_rate=0.001, gamma1=0.9, gamma2=0.9, def create_state(self, index, weight): if self.centered: return ( - zeros(weight.shape, weight.context), # n - zeros(weight.shape, weight.context), # g - zeros(weight.shape, weight.context)) # delta + zeros(weight.shape, weight.context, stype=weight.stype), # n + zeros(weight.shape, weight.context, stype=weight.stype), # g + zeros(weight.shape, weight.context, stype=weight.stype)) # delta else: - return (zeros(weight.shape, weight.context), ) # n + return (zeros(weight.shape, weight.context, stype=weight.stype),) # n def update(self, index, weight, grad, state): assert(isinstance(weight, NDArray)) diff --git a/python/mxnet/random.py b/python/mxnet/random.py index 29b250d980ce..14bfc2731bd6 100644 --- a/python/mxnet/random.py +++ b/python/mxnet/random.py @@ -22,13 +22,13 @@ import ctypes from .base import _LIB, check_call -from ._ndarray_internal import _sample_uniform as uniform -from ._ndarray_internal import _sample_normal as normal -from ._ndarray_internal import _sample_gamma as gamma -from ._ndarray_internal import _sample_exponential as exponential -from ._ndarray_internal import _sample_poisson as poisson -from ._ndarray_internal import _sample_negbinomial as negative_binomial -from ._ndarray_internal import _sample_gennegbinomial as generalized_negative_binomial +from .ndarray._internal import _sample_uniform as uniform +from .ndarray._internal import _sample_normal as normal +from .ndarray._internal import _sample_gamma as gamma +from .ndarray._internal import _sample_exponential as exponential +from .ndarray._internal import _sample_poisson as poisson +from .ndarray._internal import _sample_negbinomial as negative_binomial +from .ndarray._internal import _sample_gennegbinomial as generalized_negative_binomial def seed(seed_state): """Seeds the random number generators in MXNet. 
diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py index 14cb3811deeb..2ee41884d700 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol.py @@ -40,6 +40,8 @@ from .context import Context from .ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP, _GRAD_REQ_MAP from .name import NameManager # pylint: disable=unused-import +from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID +from .ndarray.sparse_ndarray import _ndarray_cls from .executor import Executor from . import _symbol_internal as _internal from .attribute import AttrScope @@ -1263,8 +1265,9 @@ def _get_ndarray_inputs(arg_key, args, arg_names, allow_missing): raise TypeError('Only accept list of NDArrays or dict of str to NDArray') return c_array(NDArrayHandle, arg_handles), arg_arrays - def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, - shared_arg_names=None, shared_exec=None, shared_buffer=None, **kwargs): + def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None, + group2ctx=None, shared_arg_names=None, shared_exec=None, + shared_buffer=None, **kwargs): """Bind current symbol to get an executor, allocate all the arguments needed. Allows specifying data types. @@ -1306,6 +1309,9 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, type_dict : Dict of str->numpy.dtype Input type dictionary, name->dtype + stype_dict : Dict of str->str + Input storage type dictionary, name->storage_type + group2ctx : Dict of string to mx.Context The dict mapping the `ctx_group` attribute to the context assignment. @@ -1320,7 +1326,8 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, shared_buffer : Dict of string to `NDArray` The dict mapping argument names to the `NDArray` that can be reused for initializing the current executor. This buffer will be checked for reuse if one argument name - of the current executor is not found in `shared_arg_names`. + of the current executor is not found in `shared_arg_names`. The `NDArray`s are + expected have default storage type. kwargs : Dict of str->shape Input shape dictionary, name->shape @@ -1330,6 +1337,7 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, executor : mxnet.Executor The generated executor """ + # data types num_provided_arg_types = 0 provided_arg_type_names = ctypes.POINTER(ctypes.c_char_p)() # provided type argument names provided_arg_type_data = ctypes.POINTER(mx_uint)() # provided types @@ -1345,6 +1353,22 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, provided_arg_type_names = c_array(ctypes.c_char_p, provided_arg_type_names) provided_arg_type_data = c_array(ctypes.c_int, provided_arg_type_data) + # storage types + num_provided_arg_stypes = 0 + # provided storage type argument names + provided_arg_stype_names = ctypes.POINTER(ctypes.c_char_p)() + provided_arg_stype_data = ctypes.POINTER(mx_uint)() # provided storage types + if stype_dict is not None: + provided_arg_stype_names = [] + provided_arg_stype_data = [] + for k, v in stype_dict.items(): + if v in _STORAGE_TYPE_STR_TO_ID: + provided_arg_stype_names.append(c_str(k)) + provided_arg_stype_data.append(ctypes.c_int(_STORAGE_TYPE_STR_TO_ID[v])) + num_provided_arg_stypes = mx_uint(len(provided_arg_stype_names)) + provided_arg_stype_names = c_array(ctypes.c_char_p, provided_arg_stype_names) + provided_arg_stype_data = c_array(ctypes.c_int, provided_arg_stype_data) + provided_arg_shape_data = [] # shape data # argument shape index in sdata, # e.g. 
[sdata[indptr[0]], sdata[indptr[1]]) is the shape of the first arg @@ -1418,6 +1442,8 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, shared_buffer_names = [] shared_buffer_handles = [] for k, v in shared_buffer.items(): + assert(v.stype == 'default'), \ + "shared_buffer is expected to only contain NDArrays with default storage" shared_buffer_names.append(c_str(k)) shared_buffer_handles.append(v.handle) shared_buffer_names = c_array(ctypes.c_char_p, shared_buffer_names) @@ -1457,6 +1483,9 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, num_provided_arg_types, provided_arg_type_names, provided_arg_type_data, + num_provided_arg_stypes, + provided_arg_stype_names, + provided_arg_stype_data, mx_uint(len(shared_arg_name_list)), c_array(ctypes.c_char_p, shared_arg_name_list), ctypes.byref(shared_buffer_len), @@ -1486,11 +1515,12 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, shared_buffer[k] = v # create in_args, arg_grads, and aux_states for the current executor - arg_arrays = [NDArray(NDArrayHandle(in_arg_handles[i])) for i in range(num_in_args.value)] - grad_arrays = [NDArray(NDArrayHandle(arg_grad_handles[i])) + arg_arrays = [_ndarray_cls(NDArrayHandle(in_arg_handles[i])) \ + for i in range(num_in_args.value)] + grad_arrays = [_ndarray_cls(NDArrayHandle(arg_grad_handles[i])) if arg_grad_handles[i] is not None else None for i in range(num_in_args.value)] - aux_arrays = [NDArray(NDArrayHandle(aux_state_handles[i])) + aux_arrays = [_ndarray_cls(NDArrayHandle(aux_state_handles[i])) for i in range(num_aux_states.value)] executor = Executor(exe_handle, self, ctx, grad_req, group2ctx) @@ -1767,7 +1797,8 @@ def detach(self): def backward(self): raise NotImplementedForSymbol(self.backward, None) -def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, init=None, **kwargs): +def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, + init=None, stype=None, **kwargs): """Creates a symbolic variable with specified name. Example usage: @@ -1794,6 +1825,8 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, ini The dtype for input variable. If not specified, this value will be inferred. init : initializer (mxnet.init.*) Initializer for this variable to (optionally) override the default initializer. + stype : str + The storage type of the variable. kwargs : Additional attribute variables Additional attributes must start and end with double underscores. 
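Editor's note: a brief sketch of how the new `stype` argument of `var` and the `stype_dict` argument of `simple_bind` might be used together (names and shapes are made up; whether a particular operator provides kernels for these storage types is outside the scope of this hunk):

    import mxnet as mx
    data = mx.sym.var('data', stype='csr')   # storage type recorded as a variable attribute
    weight = mx.sym.var('weight')
    out = mx.sym.dot(data, weight)
    exe = out.simple_bind(mx.cpu(), data=(32, 100), weight=(100, 10),
                          stype_dict={'data': 'csr'})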
@@ -1821,6 +1854,8 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, ini if not isinstance(init, string_types): init = init.dumps() attr['__init__'] = init + if stype is not None: + attr['__storage_type__'] = str(_STORAGE_TYPE_STR_TO_ID[stype]) for k, v in kwargs.items(): if k.startswith('__') and k.endswith('__'): attr[k] = str(v) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index c5587f8d80a8..3eeb51a443c8 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -29,17 +29,20 @@ import errno import logging from contextlib import contextmanager +import scipy.sparse as sp import numpy as np import numpy.testing as npt -import mxnet as mx -from .context import Context -from .ndarray import array -from .symbol import Symbol +import numpy.random as rnd try: import requests except ImportError: # in rare cases requests may be not installed pass +import mxnet as mx +from .context import Context +from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID +from .ndarray import array +from .symbol import Symbol _rng = np.random.RandomState(1234) @@ -85,6 +88,182 @@ def random_arrays(*shapes): return arrays +def random_sample(population, k): + """Return a k length list of the elements chosen from the population sequence.""" + assert 0 <= k <= len(population) + population_copy = population[:] + np.random.shuffle(population_copy) + return population_copy[0:k] + + +def _validate_csr_generation_inputs(num_rows, num_cols, density, + distribution="uniform"): + """Validates inputs for csr generation helper functions + """ + total_nnz = int(num_rows * num_cols * density) + if density < 0 or density > 1: + raise ValueError("density has to be between 0 and 1") + + if num_rows <= 0 or num_cols <= 0: + raise ValueError("num_rows or num_cols should be greater than 0") + + if distribution == "powerlaw": + if total_nnz < 2 * num_rows: + raise ValueError("not supported for this density: %s" + " for this shape (%s, %s)" + " Please keep :" + " num_rows * num_cols * density >= 2 * num_rows" + % (density, num_rows, num_cols)) + + +def _get_uniform_dataset_csr(num_rows, num_cols, density=0.1, dtype=None): + """Returns CSRNDArray with uniform distribution + This generates a csr matrix with totalnnz unique randomly chosen numbers + from num_rows*num_cols and arranges them in the 2d array in the + following way: row_index = (random_number_generated / num_rows) + col_index = random_number_generated - row_index * num_cols + """ + _validate_csr_generation_inputs(num_rows, num_cols, density, + distribution="uniform") + csr = sp.rand(num_rows, num_cols, density, dtype=dtype, format="csr") + result = mx.nd.csr_matrix(csr.data, csr.indptr, csr.indices, + (num_rows, num_cols), dtype=dtype) + return result + + +def _get_powerlaw_dataset_csr(num_rows, num_cols, density=0.1, dtype=None): + """Returns CSRNDArray with powerlaw distribution + with exponentially increasing number of non zeros in each row. + Not supported for cases where total_nnz < 2*num_rows. This is because + the algorithm first tries to ensure that there are rows with no zeros by + putting non zeros at beginning of each row. 
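Editor's note: for the uniform case, `_get_uniform_dataset_csr` above is essentially a thin wrapper over scipy; a hedged sketch of the equivalent direct construction (the shape and density are made up):

    import mxnet as mx
    import scipy.sparse as sp
    csr = sp.rand(10, 50, density=0.2, format='csr', dtype='float32')
    arr = mx.nd.csr_matrix(csr.data, csr.indptr, csr.indices, (10, 50))
    assert arr.stype == 'csr'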
+ """ + + _validate_csr_generation_inputs(num_rows, num_cols, density, + distribution="powerlaw") + + total_nnz = int(num_rows * num_cols * density) + + unused_nnz = total_nnz + output_arr = np.zeros((num_rows, num_cols), dtype=dtype) + # Start with ones on each row so that no row is empty + for row in range(num_rows): + output_arr[row][0] = 1 + rnd.uniform(0.001, 2) + unused_nnz = unused_nnz - 1 + if unused_nnz <= 0: + return mx.nd.array(output_arr).tostype("csr") + + # Populate rest of matrix with 2^i items in ith row. + # if we have used all total nnz return the sparse matrix + # else if we reached max column size then fill up full columns until we use all nnz + col_max = 2 + for row in range(num_rows): + col_limit = min(num_cols, col_max) + # In case col_limit reached assign same value to all elements, which is much faster + if col_limit == num_cols and unused_nnz > col_limit: + output_arr[row] = 1 + rnd.uniform(0.001, 2) + unused_nnz = unused_nnz - col_limit + 1 + if unused_nnz <= 0: + return mx.nd.array(output_arr).tostype("csr") + else: + continue + for col_index in range(1, col_limit): + output_arr[row][col_index] = 1 + rnd.uniform(0.001, 2) + unused_nnz = unused_nnz - 1 + if unused_nnz <= 0: + return mx.nd.array(output_arr).tostype("csr") + col_max = col_max * 2 + + if unused_nnz > 0: + #return mx.nd.array(sp.random(num_rows, num_cols, density).toarray()).tostype("csr") + raise ValueError("not supported for this density: %s" + " for this shape (%s,%s)" % (density, num_rows, num_cols)) + else: + return mx.nd.array(output_arr).tostype("csr") + + +def rand_sparse_ndarray(shape, stype, density=None, distribution="uniform", dtype=None): + """Generate a random sparse ndarray. Returns the ndarray, value(np) and indices(np) + Parameters + ---------- + shape: list or tuple + stype: str, valid values: "csr" or "row_sparse" + density, optional: float, should be between 0 and 1 + distribution, optional: str, valid values: "uniform" or "powerlaw" + dtype, optional: numpy.dtype, default value is None + Returns + ------- + Result of type CSRNDArray or RowSparseNDArray + Examples + -------- + Below is an example of the powerlaw distribution with csr as the stype. + It calculates the nnz using the shape and density. + It fills up the ndarray with exponentially increasing number of elements. + If there are enough unused_nnzs, n+1th row will have twice more nnzs compared to nth row. + else, remaining unused_nnzs will be used in n+1th row + If number of cols is too small and we have already reached column size it will fill up + all following columns in all followings rows until we reach the required density. 
+ + >>> csr_arr, _ = rand_sparse_ndarray(shape=(5, 16), stype="csr", + density=0.50, distribution="powerlaw") + >>> indptr = csr_arr.indptr.asnumpy() + >>> indices = csr_arr.indices.asnumpy() + >>> data = csr_arr.data.asnumpy() + >>> row2nnz = len(data[indptr[1]:indptr[2]]) + >>> row3nnz = len(data[indptr[2]:indptr[3]]) + >>> assert(row3nnz == 2*row2nnz) + >>> row4nnz = len(data[indptr[3]:indptr[4]]) + >>> assert(row4nnz == 2*row3nnz) + """ + density = rnd.rand() if density is None else density + dtype = default_dtype() if dtype is None else dtype + if stype == 'row_sparse': + assert (distribution == "uniform"), \ + "Distribution %s not supported for row_sparse" % (distribution) + # sample index + idx_sample = rnd.rand(shape[0]) + indices = np.argwhere(idx_sample < density).flatten() + if indices.shape[0] == 0: + result = mx.nd.zeros(shape, stype='row_sparse', dtype=dtype) + return result, (np.array([], dtype=dtype), np.array([], dtype='int64')) + # generate random values + val = rnd.rand(indices.shape[0], *shape[1:]).astype(dtype) + arr = mx.nd.row_sparse_array(val, indices, shape, indices_type=np.int64, dtype=dtype) + return arr, (val, indices) + elif stype == 'csr': + assert len(shape) == 2 + if distribution == "uniform": + csr = _get_uniform_dataset_csr(shape[0], shape[1], density, dtype=dtype) + return csr, (csr.indptr, csr.indices, csr.data) + elif distribution == "powerlaw": + csr = _get_powerlaw_dataset_csr(shape[0], shape[1], density, dtype=dtype) + return csr, (csr.indptr, csr.indices, csr.data) + else: + assert(False), "Distribution not supported: %s" % (distribution) + else: + assert(False), "unknown storage type" + + +def rand_ndarray(shape, stype, density=None, dtype=None): + if stype == 'default': + arr = mx.nd.array(random_arrays(shape), dtype=dtype) + else: + arr, _ = rand_sparse_ndarray(shape, stype, density=density, dtype=dtype) + return arr + + +def rand_shape_2d(dim0=10, dim1=10): + return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1) + + +def rand_shape_3d(dim0=10, dim1=10, dim2=10): + return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1), rnd.randint(1, dim2 + 1) + + +def rand_shape_nd(n, dim=10): + return rnd.randint(1, dim+1, size=n) + + def np_reduce(dat, axis, keepdims, numpy_reduce_func): """Compatible reduce for old version of NumPy. @@ -316,7 +495,8 @@ def _parse_location(sym, location, ctx): % (str(set(sym.list_arguments())), str(set(location.keys())))) else: location = {k: v for k, v in zip(sym.list_arguments(), location)} - location = {k: mx.nd.array(v, ctx=ctx) for k, v in location.items()} + location = {k: mx.nd.array(v, ctx=ctx) if isinstance(v, np.ndarray) \ + else v for k, v in location.items()} return location @@ -437,7 +617,8 @@ def numeric_grad(executor, location, aux_states=None, eps=1e-4, use_forward_trai def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rtol=1e-2, - atol=None, grad_nodes=None, use_forward_train=True, ctx=None): + atol=None, grad_nodes=None, use_forward_train=True, ctx=None, + grad_stype_dict=None): """Verify an operation by checking backward pass via finite difference method. Based on Theano's `theano.gradient.verify_grad` [1] @@ -454,7 +635,7 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto - if type is dict of str -> numpy.ndarray maps the name of arguments to the corresponding numpy.ndarray. 
*In either case, value of all the arguments must be provided.* - aux_states : ist or tuple or dict, optional + aux_states : list or tuple or dict, optional The auxiliary states required when generating the executor for the symbol. numeric_eps : float, optional Delta for the finite difference method that approximates the gradient. @@ -466,6 +647,8 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto Whether to use is_train=True when computing the finite-difference. ctx : Context, optional Check the gradient computation on the specified device. + grad_stype_dict : dict of str->str, optional + Storage type dictionary for gradient ndarrays. References --------- ..[1] https://github.com/Theano/Theano/blob/master/theano/gradient.py @@ -489,7 +672,7 @@ def random_projection(shape): location_npy = {k:v.asnumpy() for k, v in location.items()} aux_states = _parse_aux_states(sym=sym, aux_states=aux_states, ctx=ctx) if aux_states is not None: - aux_states_npy = {k:v.asnumpy() for k, v in aux_states.items()} + aux_states_npy = {k: v.asnumpy() for k, v in aux_states.items()} else: aux_states_npy = None if grad_nodes is None: @@ -516,6 +699,14 @@ def random_projection(shape): + [("__random_proj", _rng.normal(0, 0.01, size=out_shape[0]))]) args_grad = {k: mx.nd.array(v, ctx=ctx) for k, v in args_grad_npy.items()} + if grad_stype_dict is not None: + assert isinstance(grad_stype_dict, dict), "grad_stype_dict must be a dict" + for k, v in grad_stype_dict.items(): + if k in args_grad and v in _STORAGE_TYPE_STR_TO_ID and v != 'default': + # create an uninitialized sparse ndarray for executor + # if the symbolic grad is expected to be zero, it should not be initialized at all + args_grad[k] = mx.nd.zeros(args_grad[k].shape, args_grad[k].context, + args_grad[k].dtype, v) executor = out.bind(ctx, grad_req=grad_req, args=location, args_grad=args_grad, aux_states=aux_states) @@ -607,15 +798,15 @@ def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None, g[:] = 0 executor.forward(is_train=False) - outputs = [x.asnumpy() for x in executor.outputs] + outputs = [x.asnumpy() for x in executor.outputs] for output_name, expect, output in zip(sym.list_outputs(), expected, outputs): assert_almost_equal(expect, output, rtol, atol, ("EXPECTED_%s"%output_name, "FORWARD_%s"%output_name)) def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=None, - aux_states=None, grad_req='write', ctx=None): + aux_states=None, grad_req='write', ctx=None, grad_stypes=None): """Compares a symbol's backward results with the expected ones. Prints error messages if the backward results are not the same as the expected results. @@ -651,6 +842,8 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol= Gradient requirements. 'write', 'add' or 'null'. ctx : Context, optional Running context. 
+ grad_stypes: dict of str->str + dictionary of mapping argument name to stype for the gradient Example ------- @@ -676,14 +869,24 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol= if isinstance(expected, (list, tuple)): expected = {k:v for k, v in zip(sym.list_arguments(), expected)} args_grad_npy = {k:_rng.normal(size=v.shape) for k, v in expected.items()} - args_grad_data = {k: mx.nd.array(v, ctx=ctx) for k, v in args_grad_npy.items()} + args_grad_data = {} + for k, v in args_grad_npy.items(): + nd = mx.nd.array(v, ctx=ctx) + if grad_stypes is not None and k in grad_stypes: + out = nd.tostype(grad_stypes[k]) + args_grad_data[k] = out + else: + args_grad_data[k] = nd + if isinstance(grad_req, str): grad_req = {k:grad_req for k in sym.list_arguments()} elif isinstance(grad_req, (list, tuple)): grad_req = {k:v for k, v in zip(sym.list_arguments(), grad_req)} - executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, aux_states=aux_states) + executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, + aux_states=aux_states, grad_req=grad_req) executor.forward(is_train=True) + if isinstance(out_grads, (tuple, list)): out_grads = [mx.nd.array(v, ctx=ctx) for v in out_grads] elif isinstance(out_grads, (dict)): diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 93458d21ac5a..0fe3fe3e302e 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -172,6 +172,39 @@ int MXNDArrayCreateEx(const mx_uint *shape, API_END(); } +int MXNDArrayCreateSparseEx(int storage_type, + const mx_uint *shape, + mx_uint ndim, + int dev_type, + int dev_id, + int delay_alloc, + int dtype, + mx_uint num_aux, + int *aux_type, + mx_uint *aux_ndims, + const mx_uint *aux_shape, + NDArrayHandle *out) { + API_BEGIN(); + std::vector aux_types; + std::vector aux_shapes; + auto shape_start = aux_shape; + for (size_t i = 0; i < num_aux; i++) { + // types + aux_types.push_back(aux_type[i]); + // shapes + aux_shapes.emplace_back(shape_start, shape_start + aux_ndims[i]); + shape_start += aux_ndims[i]; + } + *out = new NDArray( + NDArrayStorageType(storage_type), + TShape(shape, shape + ndim), + Context::Create(static_cast(dev_type), dev_id), + delay_alloc != 0, + dtype, aux_types, aux_shapes); + API_END(); +} + + int MXNDArrayLoadFromRawBytes(const void *buf, size_t size, NDArrayHandle *out) { @@ -215,6 +248,23 @@ int MXNDArraySyncCopyToCPU(NDArrayHandle handle, API_END(); } +/*! + * \brief Copy src.data() to dst.data() if i = -1, else dst.aux_data(i) if i >= 0 + * This function blocks. Do not use it in performance critical code. 
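Returning briefly to the Python test utilities above, a hedged sketch of how the new sparse-gradient options (grad_stype_dict for check_numeric_gradient, grad_stypes for check_symbolic_backward) might be exercised. The operator and shapes are illustrative only; whether a sparse gradient is actually produced depends on the operator's registered sparse implementation, and non-supporting operators fall back to dense storage.
>>> x = mx.sym.var('x')
>>> y = mx.sym.sum(mx.sym.square(x))                     # gradient w.r.t. x is 2*x
>>> loc = [np.random.normal(size=(4, 5))]
>>> check_numeric_gradient(y, loc, grad_stype_dict={'x': 'row_sparse'})
>>> check_symbolic_backward(y, loc, out_grads=[mx.nd.ones((1,))],
...                         expected=[2 * loc[0]], grad_stypes={'x': 'row_sparse'})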
+ * \param handle_dst handle of a dst ndarray whose data/aux_data has been allocated + * \param handle_src handle of a src ndarray which has default storage type + * \param i dst data blob indicator + */ +int MXNDArraySyncCopyFromNDArray(NDArrayHandle handle_dst, + const NDArrayHandle handle_src, + const int i) { + API_BEGIN(); + NDArray* dst = static_cast(handle_dst); + NDArray* src = static_cast(handle_src); + dst->SyncCopyFromNDArray(*src, -1, i); + API_END(); +} + int MXNDArrayWaitToRead(NDArrayHandle handle) { API_BEGIN(); static_cast(handle)->WaitToRead(); @@ -351,6 +401,18 @@ MXNET_DLL int MXNDArrayReshape(NDArrayHandle handle, API_END_HANDLE_ERROR(delete ptr); } +int MXNDArrayGetStorageType(NDArrayHandle handle, + int *out_storage_type) { + API_BEGIN(); + NDArray *arr = static_cast(handle); + if (!arr->is_none()) { + *out_storage_type = arr->storage_type(); + } else { + *out_storage_type = kUndefinedStorage; + } + API_END(); +} + int MXNDArrayGetShape(NDArrayHandle handle, mx_uint *out_dim, const mx_uint **out_pdata) { @@ -400,6 +462,42 @@ int MXNDArrayGetDType(NDArrayHandle handle, API_END(); } +int MXNDArrayGetAuxType(NDArrayHandle handle, + mx_uint i, + int *out_type) { + API_BEGIN(); + NDArray *arr = static_cast(handle); + *out_type = arr->aux_type(i); + API_END(); +} + +/*! + * \brief Get a deep copy of the ith aux data blob + * in the form of an NDArray of default storage type. + * This function blocks. Do not use it in performance critical code. + */ +int MXNDArrayGetAuxNDArray(NDArrayHandle handle, + mx_uint i, + NDArrayHandle *out) { + API_BEGIN(); + NDArray *arr = static_cast(handle); + *out = new NDArray(arr->aux_ndarray(i)); + API_END(); +} + +/*! + * \brief Get a deep copy of the data blob + * in the form of an NDArray of default storage type. + * This function blocks. Do not use it in performance critical code. + */ +int MXNDArrayGetDataNDArray(NDArrayHandle handle, + NDArrayHandle *out) { + API_BEGIN(); + NDArray *arr = static_cast(handle); + *out = new NDArray(arr->data_ndarray()); + API_END(); +} + int MXNDArrayGetContext(NDArrayHandle handle, int *out_dev_type, int *out_dev_id) { @@ -735,6 +833,24 @@ int MXKVStorePullEx(KVStoreHandle handle, API_END(); } +int MXKVStorePullRowSparse(KVStoreHandle handle, + mx_uint num, + const char** keys, + NDArrayHandle* vals, + const NDArrayHandle* row_ids, + int priority) { + API_BEGIN(); + std::vector v_keys(num); + std::vector> v_val_rowids(num); + for (mx_uint i = 0; i < num; ++i) { + v_keys[i] = keys[i]; + v_val_rowids[i] = std::make_pair(static_cast(vals[i]), + *static_cast(row_ids[i])); + } + static_cast(handle)->PullRowSparse(v_keys, v_val_rowids, priority); + API_END(); +} + int MXKVStoreSetUpdater(KVStoreHandle handle, MXKVStoreUpdater updater, void* updater_handle) { diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h index 846b53973b07..fee3f03f6db0 100644 --- a/src/c_api/c_api_common.h +++ b/src/c_api/c_api_common.h @@ -76,6 +76,8 @@ struct MXAPIThreadLocalEntry { std::vector arg_shapes, out_shapes, aux_shapes; /*! \brief result holder for returning type flags */ std::vector arg_types, out_types, aux_types; + /*! \brief result holder for returning storage types */ + std::vector arg_storage_types, out_storage_types, aux_storage_types; /*! \brief result holder for returning shape dimensions */ std::vector arg_shape_ndim, out_shape_ndim, aux_shape_ndim; /*! 
\brief result holder for returning shape pointer */ diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc index a4c48e426879..631c1a7d93eb 100644 --- a/src/c_api/c_api_executor.cc +++ b/src/c_api/c_api_executor.cc @@ -198,6 +198,9 @@ int MXExecutorBindEX(SymbolHandle symbol_handle, * \param num_provided_arg_dtypes number of user provided in_arg and axu_state dtypes * \param provided_arg_dtype_names argument name list of provided dtypes * \param provided_arg_dtypes data of provided dtypes + * \param num_provided_arg_stypes number of user provided in_arg and axu_state storage types + * \param provided_arg_stype_names argument name list of provided storage types + * \param provided_arg_stypes data of provided storage types * \param num_shared_arg_names number of parameter names passed from _bind_ith_exec * \param shared_arg_name_list parameter name list passed from _bind_ith_exec * \param shared_buffer_len number of shared data arrays passed from _bind_ith_exec @@ -230,6 +233,9 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, const mx_uint num_provided_arg_dtypes, const char** provided_arg_dtype_names, const int* provided_arg_dtypes, + const mx_uint num_provided_arg_stypes, + const char** provided_arg_stype_names, + const int* provided_arg_stypes, const mx_uint num_shared_arg_names, const char** shared_arg_name_list, int* shared_buffer_len, @@ -254,7 +260,7 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, // attr_dict for setting up type_dict and arg/aux ctx std::unordered_map> attr_dict; - if (nullptr == provided_arg_dtypes || nullptr != g2c_keys) { + if (nullptr == provided_arg_dtypes || nullptr != g2c_keys || nullptr == provided_arg_stypes) { std::vector> attrs = sym->ListAttrsRecursive(); attr_dict.reserve(attrs.size()); @@ -280,6 +286,23 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, } } + // setup arg_stype_map + std::unordered_map arg_stype_map; + if (nullptr == provided_arg_stypes) { // use attr_dict + for (const auto& arg_name : in_arg_names) { + const auto it = attr_dict.find(arg_name); + if (it == attr_dict.end() || !it->second.count("__storage_type__")) { + arg_stype_map[arg_name] = kDefaultStorage; + } + } + } else { // use user input type_dict + // create stype map for in_args and aux_states + arg_stype_map.reserve(num_provided_arg_stypes); + for (mx_uint i = 0; i < num_provided_arg_stypes; ++i) { + arg_stype_map[provided_arg_stype_names[i]] = provided_arg_stypes[i]; + } + } + // create default ctx Context ctx = Context::Create(static_cast(dev_type), dev_id); // create ctx map @@ -420,9 +443,10 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, std::vector aux_state_vec; *out = Executor::SimpleBind(*sym, ctx, ctx_map, in_arg_ctx_vec, arg_grad_ctx_vec, - aux_state_ctx_vec, arg_shape_map, arg_dtype_map, grad_req_type_vec, - shared_arg_name_set, &in_arg_vec, &arg_grad_vec, &aux_state_vec, - use_shared_buffer? &shared_buffer_map : nullptr, + aux_state_ctx_vec, arg_shape_map, arg_dtype_map, arg_stype_map, + grad_req_type_vec, shared_arg_name_set, &in_arg_vec, + &arg_grad_vec, &aux_state_vec, + use_shared_buffer ? &shared_buffer_map : nullptr, reinterpret_cast(shared_exec_handle)); // copy ndarray ptrs to ret->handles so that front end diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index 3202f55abea7..d392baf45d3e 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -18,7 +18,8 @@ */ /*! 
- * \file c_api_symbolic.cc + * Copyright (c) 2016 by Contributors + * \file c_api_ndarray.cc * \brief C API of mxnet */ @@ -150,14 +151,17 @@ void SetContext(Context* p_ctx, #endif // MXNET_USE_CUDA } +// Set the shape, dtype and storage type void SetShapeType(const nnvm::Op* op, const nnvm::NodeAttrs& attrs, const Context& ctx, const std::vector& ndinputs, - std::vector* p_ndoutputs) { + std::vector* p_ndoutputs, + int* dispatch_stype) { std::vector& ndoutputs = *p_ndoutputs; static auto& infershape = nnvm::Op::GetAttr("FInferShape"); static auto& infertype = nnvm::Op::GetAttr("FInferType"); + static auto& inferstorage = nnvm::Op::GetAttr("FInferStorageType"); MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); // infer shape std::vector& in_shapes = ret->arg_shapes; @@ -193,9 +197,35 @@ void SetShapeType(const nnvm::Op* op, CHECK(infertype[op](attrs, &in_types, &out_types)); CHECK_EQ(out_types.size(), ndoutputs.size()); + // infer storage type + auto& in_storage_types = ret->arg_storage_types; + auto& out_storage_types = ret->out_storage_types; + in_storage_types.clear(); + out_storage_types.clear(); + for (auto& i : ndinputs) { + in_storage_types.push_back(i.storage_type()); + } + for (auto& i : ndoutputs) { + out_storage_types.push_back(i.storage_type()); + } + if (inferstorage.count(op)) { + CHECK(inferstorage[op](attrs, ctx, &in_storage_types, &out_storage_types)); + CHECK_EQ(out_storage_types.size(), ndoutputs.size()); + } + + bool contains_non_default = common::ContainsNonDefaultStorage(in_storage_types); + contains_non_default |= common::ContainsNonDefaultStorage(out_storage_types); + int kNonDefaultStorage = -2; + *dispatch_stype = contains_non_default ? kNonDefaultStorage : kDefaultStorage; for (size_t i = 0; i < ndoutputs.size(); ++i) { + NDArrayStorageType storage_type = static_cast(out_storage_types[i]); if (ndoutputs[i].is_none()) { - ndoutputs[i] = NDArray(out_shapes[i], ctx, true, out_types[i]); + // if failed to infer the storage type, assume the output storage is dense + if (storage_type == kDefaultStorage || out_storage_types[i] == kUndefinedStorage) { + ndoutputs[i] = NDArray(out_shapes[i], ctx, true, out_types[i]); + } else { + ndoutputs[i] = NDArray(storage_type, out_shapes[i], ctx, true, out_types[i]); + } } else { CHECK_EQ(ndoutputs[i].shape(), out_shapes[i]) << i << "th output has invalid shape. 
" @@ -212,7 +242,7 @@ void SetShapeType(const nnvm::Op* op, void SetDependency(std::vector *p_read_vars, std::vector *p_write_vars, std::vector *p_requested, - std::vector *p_auxidx, + std::vector *p_mutate_idx, const nnvm::Op* op, const nnvm::NodeAttrs& attrs, const Context& ctx, @@ -224,7 +254,7 @@ void SetDependency(std::vector *p_read_vars, std::vector& read_vars = *p_read_vars; std::vector& write_vars = *p_write_vars; std::vector& requested = *p_requested; - std::vector& auxidx = *p_auxidx; + std::vector& mutate_idx = *p_mutate_idx; if (tmp_resource.count(op)) { int ntmp = 0; @@ -250,15 +280,30 @@ void SetDependency(std::vector *p_read_vars, write_vars.push_back(i.var()); } if (mutate.count(op)) { - auxidx = mutate[op](attrs); - std::sort(auxidx.begin(), auxidx.end()); - for (auto & i : auxidx) { + mutate_idx = mutate[op](attrs); + std::sort(mutate_idx.begin(), mutate_idx.end()); + for (auto & i : mutate_idx) { write_vars.push_back(ndinputs[i].var()); } } Engine::Get()->DeduplicateVarHandle(&read_vars, &write_vars); } +inline void SetWriteInplaceReq(const std::vector &ndinputs, + const std::vector &ndoutputs, + std::vector *req) { + std::unordered_set in_vars; + for (auto &nd : ndinputs) { + in_vars.insert(nd.var()); + } + for (size_t i = 0; i < ndoutputs.size(); i++) { + // output NDArray shares the memory with the input NDArray + if (in_vars.find(ndoutputs[i].var()) != in_vars.end()) { + req->at(i) = kWriteInplace; + } + } +} + void PushFCompute(const FCompute& fn, const nnvm::Op* op, const nnvm::NodeAttrs& attrs, @@ -267,24 +312,75 @@ void PushFCompute(const FCompute& fn, const std::vector& write_vars, const std::vector& requested, const std::vector& ndinputs, - const std::vector& ndoutputs) { + const std::vector& ndoutputs, + const std::vector& mutate_idx) { + using namespace common; bool is_train = AutogradRuntime::Get()->IsTraining(); Engine::Get()->PushAsync( - [ctx, attrs, fn, ndinputs, ndoutputs, requested, is_train]( + [ctx, attrs, fn, ndinputs, ndoutputs, requested, is_train, mutate_idx]( RunContext rctx, engine::CallbackOnComplete on_complete) { std::vector input_blobs, output_blobs; - for (auto& i : ndinputs) { - input_blobs.push_back(i.data()); - } - for (auto& i : ndoutputs) { - output_blobs.push_back(i.data()); + // pre-fcompute and post-fcompute storage fallback src NDArrays and dst NDArrays + std::vector pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src; + // mapping from index in input_blobs to index in pre_temp_dst + std::unordered_map in_temp_idx_map; + // populate input blobs and output blobs + SetupDefaultBlobs(ndinputs, &input_blobs, &pre_temp_src, &pre_temp_dst, &in_temp_idx_map); + SetupDefaultBlobs(ndoutputs, &output_blobs, &post_temp_dst, &post_temp_src); + // add mutable inputs to post temp list + for (const auto idx : mutate_idx) { + auto map_iter = in_temp_idx_map.find(idx); + if (map_iter != in_temp_idx_map.end()) { + post_temp_src.push_back(pre_temp_dst[map_iter->second]); + post_temp_dst.push_back(ndinputs[idx]); + } } OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested}; std::vector req(output_blobs.size(), kWriteTo); - fn(attrs, opctx, input_blobs, req, output_blobs); + if (ctx.dev_mask() == gpu::kDevMask) { +#if MXNET_USE_CUDA + CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx); + fn(attrs, opctx, input_blobs, req, output_blobs); + // cast to original storage type, if necessary + CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx); + rctx.get_stream()->Wait(); +#else + LOG(FATAL) << 
MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } else { + CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx); + fn(attrs, opctx, input_blobs, req, output_blobs); + // cast to original storage type, if necessary + CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx); + } + on_complete(); + }, ctx, read_vars, write_vars, FnProperty::kNormal, + 0, PROFILER_MESSAGE(op->name.c_str())); +} + +void PushFComputeEx(const FComputeEx& fn, + const nnvm::Op* op, + const nnvm::NodeAttrs& attrs, + const Context& ctx, + const std::vector& read_vars, + const std::vector& write_vars, + const std::vector& requested, + const std::vector& ndinputs, + const std::vector& ndoutputs) { + Engine::Get()->PushAsync( + [ctx, attrs, fn, ndinputs, ndoutputs, requested]( + RunContext rctx, + engine::CallbackOnComplete on_complete) { + std::vector input_blobs, output_blobs; + OpContext opctx{false, rctx, + engine::CallbackOnComplete(), + requested}; + std::vector req(ndoutputs.size(), kWriteTo); + SetWriteInplaceReq(ndinputs, ndoutputs, &req); + fn(attrs, opctx, ndinputs, req, ndoutputs); if (ctx.dev_mask() == gpu::kDevMask) { rctx.get_stream()->Wait(); } @@ -301,7 +397,9 @@ void PushOperator(const OpStatePtr& state, const std::vector& write_vars, const std::vector& requested, const std::vector& ndinputs, - const std::vector& ndoutputs) { + const std::vector& ndoutputs, + const std::vector& mutate_idx) { + using namespace common; static auto& fexec_type = nnvm::Op::GetAttr("FExecType"); bool is_train = AutogradRuntime::Get()->IsTraining(); @@ -314,15 +412,40 @@ void PushOperator(const OpStatePtr& state, if (fcompute != nullptr) { CHECK(exec_type == ExecType::kSync || exec_type == ExecType::kAsync); Engine::Get()->PushAsync( - [state, fcompute, ndinputs, ndoutputs, requested, is_train, exec_type]( + [state, fcompute, ndinputs, ndoutputs, requested, is_train, exec_type, mutate_idx]( RunContext rctx, engine::CallbackOnComplete on_complete) { OpContext opctx{is_train, rctx, on_complete, requested}; + std::vector input_blobs, output_blobs; - for (const auto& i : ndinputs) input_blobs.push_back(i.data()); - for (const auto& i : ndoutputs) output_blobs.push_back(i.data()); + // pre-fcompute and post-fcompute storage fallback src NDArrays and dst NDArrays + std::vector pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src; + // mapping from index in input_blobs to index in pre_temp_dst + std::unordered_map in_temp_idx_map; + // populate input blobs and output blobs + SetupDefaultBlobs(ndinputs, &input_blobs, &pre_temp_src, &pre_temp_dst, &in_temp_idx_map); + SetupDefaultBlobs(ndoutputs, &output_blobs, &post_temp_dst, &post_temp_src); + // add mutable inputs to post temp list + for (const auto idx : mutate_idx) { + if (in_temp_idx_map.find(idx) != in_temp_idx_map.end()) { + post_temp_src.push_back(pre_temp_dst[in_temp_idx_map[idx]]); + post_temp_dst.push_back(ndinputs[idx]); + } + } std::vector req(output_blobs.size(), kWriteTo); - fcompute(state, opctx, input_blobs, req, output_blobs); + if (rctx.get_ctx().dev_mask() == gpu::kDevMask) { +#if MXNET_USE_CUDA + CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx); + fcompute(state, opctx, input_blobs, req, output_blobs); + CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx); +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } else { + CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx); + fcompute(state, opctx, input_blobs, req, output_blobs); + CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx); + } if (exec_type == ExecType::kSync) { if 
(rctx.get_ctx().dev_mask() == gpu::kDevMask) { rctx.get_stream()->Wait(); @@ -342,6 +465,7 @@ void PushOperator(const OpStatePtr& state, engine::CallbackOnComplete on_complete) { OpContext opctx{is_train, rctx, on_complete, requested}; std::vector req(ndoutputs.size(), kWriteTo); + SetWriteInplaceReq(ndinputs, ndoutputs, &req); fcompute_ex(state, opctx, ndinputs, req, ndoutputs); if (exec_type == ExecType::kSync) { if (rctx.get_ctx().dev_mask() == gpu::kDevMask) { @@ -363,8 +487,6 @@ void ImperativeInvokeImpl(const Context& default_ctx, const nnvm::NodeAttrs& attrs, std::vector* p_ndinputs, std::vector* p_ndoutputs) { - static auto& fcpu = nnvm::Op::GetAttr("FCompute"); - static auto& fgpu = nnvm::Op::GetAttr("FCompute"); static auto& ndfunc = nnvm::Op::GetAttr("FNDArrayFunction"); static auto& createop = nnvm::Op::GetAttr("FCreateOpState"); MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); @@ -379,29 +501,32 @@ void ImperativeInvokeImpl(const Context& default_ctx, } else { // TODO(piiswrong): infer ctx Context ctx; + int stype; SetContext(&ctx, attrs, ndinputs, ndoutputs, default_ctx); - SetShapeType(op, attrs, ctx, ndinputs, &ndoutputs); + SetShapeType(op, attrs, ctx, ndinputs, &ndoutputs, &stype); std::vector read_vars, write_vars; std::vector requested; - std::vector auxidx; - SetDependency(&read_vars, &write_vars, &requested, &auxidx, + std::vector mutate_idx; + SetDependency(&read_vars, &write_vars, &requested, &mutate_idx, op, attrs, ctx, ndinputs, ndoutputs); - FCompute fn; - if (ctx.dev_mask() == cpu::kDevMask && fcpu.count(op)) { - fn = fcpu[op]; - } else if (ctx.dev_mask() == gpu::kDevMask && fgpu.count(op)) { - fn = fgpu[op]; - } - - if (fn) { + FCompute fn = common::GetFCompute(op, "FCompute", ctx); + FComputeEx fn_ex = common::GetFCompute(op, "FComputeEx", ctx); + if (fn_ex && stype != kDefaultStorage) { if (AutogradRuntime::Get()->IsRecording()) { AutogradRuntime::Get()->RecordImperativeFCompute(op, attrs, &ndinputs, &ndoutputs); } - PushFCompute(fn, op, attrs, ctx, read_vars, write_vars, + PushFComputeEx(fn_ex, op, attrs, ctx, read_vars, write_vars, requested, ndinputs, ndoutputs); + } else if (fn) { + if (AutogradRuntime::Get()->IsRecording()) { + AutogradRuntime::Get()->RecordImperativeFCompute(op, + attrs, &ndinputs, &ndoutputs); + } + PushFCompute(fn, op, attrs, ctx, read_vars, write_vars, + requested, ndinputs, ndoutputs, mutate_idx); } else if (createop.count(op)) { auto state = createop[op](attrs, ctx, ret->arg_shapes, ret->arg_types); @@ -411,7 +536,7 @@ void ImperativeInvokeImpl(const Context& default_ctx, } write_vars.push_back(state.get_var()); PushOperator(state, op, attrs, ctx, read_vars, write_vars, - requested, ndinputs, ndoutputs); + requested, ndinputs, ndoutputs, mutate_idx); } else { LOG(FATAL) << "Operator " << op->name << " is not implemented for " @@ -461,6 +586,28 @@ int MXImperativeInvoke(AtomicSymbolCreator creator, API_END(); } +int MXImperativeInvokeEx(AtomicSymbolCreator creator, + int num_inputs, + NDArrayHandle *inputs, + int *num_outputs, + NDArrayHandle **outputs, + int num_params, + const char **param_keys, + const char **param_vals, + const int **out_stypes) { // outputs storage types + API_BEGIN(); + MXImperativeInvoke(creator, num_inputs, inputs, num_outputs, outputs, + num_params, param_keys, param_vals); + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + NDArray** output_nds = reinterpret_cast(*outputs); + ret->out_types.resize(*num_outputs); + for (int i = 0; i < *num_outputs; ++i) { + ret->out_types[i] = 
output_nds[i]->storage_type(); + } + *out_stypes = dmlc::BeginPtr(ret->out_types); + API_END(); +} + int MXCreateCachedOp(SymbolHandle handle, CachedOpHandle *out) { nnvm::Symbol* sym = static_cast(handle); @@ -540,6 +687,24 @@ int MXInvokeCachedOp(CachedOpHandle handle, API_END(); } +int MXInvokeCachedOpEx(CachedOpHandle handle, + int num_inputs, + NDArrayHandle *inputs, + int *num_outputs, + NDArrayHandle **outputs, + const int **out_stypes) { // outputs storage types + API_BEGIN(); + MXInvokeCachedOp(handle, num_inputs, inputs, num_outputs, outputs); + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + NDArray** output_nds = reinterpret_cast(*outputs); + ret->out_types.resize(*num_outputs); + for (int i = 0; i < *num_outputs; ++i) { + ret->out_types[i] = output_nds[i]->storage_type(); + } + *out_stypes = dmlc::BeginPtr(ret->out_types); + API_END(); +} + int MXAutogradIsTraining(bool* curr) { API_BEGIN(); *curr = AutogradRuntime::Get()->IsTraining(); diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index e2c29b888ada..d526aea0d35f 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -29,6 +29,7 @@ #include #include "./c_api_common.h" #include "../operator/operator_common.h" +#include "../executor/exec_pass.h" namespace mxnet { namespace op { @@ -459,7 +460,7 @@ int MXSymbolInferShape(SymbolHandle sym, } try { - g = nnvm::pass::InferShape(std::move(g), arg_shapes, "__shape__"); + g = mxnet::exec::InferShape(std::move(g), arg_shapes, "__shape__"); } catch (const mxnet::op::InferShapeError &err) { throw dmlc::Error(err.msg); } @@ -544,7 +545,7 @@ int MXSymbolInferType(SymbolHandle sym, mxnet::MatchArguments(g.indexed_graph(), kwargs, &arg_types, "InferType"); } - g = nnvm::pass::InferType(std::move(g), arg_types, "__dtype__"); + g = mxnet::exec::InferType(std::move(g), arg_types, "__dtype__"); // copy back CopyAttr(g.indexed_graph(), g.GetAttr("dtype"), &(ret->arg_types), &(ret->out_types), &(ret->aux_types)); diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index 5ca01492800e..dda4fda1ed8f 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -32,6 +32,7 @@ #include #include "./c_api_common.h" #include "../operator/operator_common.h" +#include "../executor/exec_pass.h" using namespace mxnet; @@ -194,7 +195,7 @@ int MXPredCreatePartialOut(const char* symbol_json_str, } } nnvm::Graph g; g.outputs = sym.outputs; - g = nnvm::pass::InferShape(std::move(g), in_shapes, "__shape__"); + g = mxnet::exec::InferShape(std::move(g), in_shapes, "__shape__"); bool infer_complete = (g.GetAttr("shape_num_unknown_nodes") == 0); CHECK(infer_complete) << "The shape information of is not enough to get the shapes"; diff --git a/src/common/utils.cc b/src/common/utils.cc new file mode 100644 index 000000000000..125e4e5dc7d7 --- /dev/null +++ b/src/common/utils.cc @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file utils.cc + * \brief cpu implementation of util functions + */ + +#include "./utils.h" +#include "../operator/tensor/cast_storage-inl.h" + +namespace mxnet { +namespace common { + +template<> +void CastStorageDispatch(const OpContext& ctx, + const NDArray& input, + const NDArray& output) { + mxnet::op::CastStorageComputeImpl(ctx, input, output); +} + +} // namespace common +} // namespace mxnet diff --git a/src/common/utils.cu b/src/common/utils.cu new file mode 100644 index 000000000000..093480a98907 --- /dev/null +++ b/src/common/utils.cu @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file utils.cu + * \brief gpu implementation of util functions + */ + +#include "./utils.h" +#include "../operator/tensor/cast_storage-inl.h" + +namespace mxnet { +namespace common { + +template<> +void CastStorageDispatch(const OpContext& ctx, + const NDArray& input, + const NDArray& output) { + mxnet::op::CastStorageComputeImpl(ctx, input, output); +} + +} // namespace common +} // namespace mxnet diff --git a/src/common/utils.h b/src/common/utils.h index 85e30970f1a0..92631a9b5c34 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -24,7 +24,14 @@ #ifndef MXNET_COMMON_UTILS_H_ #define MXNET_COMMON_UTILS_H_ -#if DMLC_USE_CXX11 +#include +#include +#include +#include +#include +#include +#include + #include #include #include @@ -33,15 +40,100 @@ #include #include #include -#endif // DMLC_USE_CXX11 - -#include -#include +#include namespace mxnet { namespace common { -#if DMLC_USE_CXX11 +template +void CastStorageDispatch(const OpContext& ctx, const NDArray& input, const NDArray& output); + +/* + * \brief setup default-storage tblobs from source NDArrays. If any source NDArray has non-default + * storage, it creates a temp NDArray with default storage and uses the temp tblob. The + * function also records the indices of non-default source NDArrays and the indices of + * their corresponding temporary NDArrays in the temp array. + * \param src list of source NDArray + * \param blobs list of tblobs to return + * \param temp_src list of source NDArrays which requires temporary default storage representation + * \param temp_dst list of temporary destination NDArrays for default storage representation + * \param idx_map mapping from indices in source NDArrays to indices in temp_dst. 
When not set, + indices are not recorded + * \return true if any source NDArray need to cast storage + */ +inline bool SetupDefaultBlobs(const std::vector& src, + std::vector *blobs, + std::vector *temp_src, + std::vector *temp_dst, + std::unordered_map *idx_map = nullptr) { + bool require_cast = false; + for (size_t i = 0; i < src.size(); i++) { + auto& nd = src[i]; + if (nd.storage_type() != kDefaultStorage) { + if (idx_map != nullptr) { + (*idx_map)[i] = temp_dst->size(); + } + NDArray temp(nd.shape(), nd.ctx(), false, nd.dtype()); + temp_src->emplace_back(nd); + temp_dst->emplace_back(temp); + blobs->emplace_back(temp.data()); + require_cast = true; + } else { + blobs->push_back(nd.data()); + } + } + return require_cast; +} + +/* + * \brief cast the NDArrays in `src` and store the result in NDArrays in `dst`. + * This is only used for storage fallback in executor. + * When storage_fallback is false, and `MXNET_EXEC_STORAGE_FALLBACK` == 0, + * storage fallback is disallowed. + * \param src list of source NDArray to cast + * \param dst list of destionation NDArray which hold the result of cast_storage operation + * \param ctx operator context for cast_storage operation + * \param storage_fallback whether storage_fallback is allowed. When set to false, + * its value depends on `MXNET_EXEC_STORAGE_FALLBACK`. + */ +template +inline void CastNonDefaultStorage(const std::vector& src, + const std::vector& dst, + const OpContext& ctx, + bool storage_fallback = false) { + CHECK_GE(dst.size(), src.size()); + if (src.size() == 0) return; + if (storage_fallback == false) { + storage_fallback = dmlc::GetEnv("MXNET_EXEC_STORAGE_FALLBACK", true); + } + if (storage_fallback == false) { + LOG(FATAL) << "Storage type conversion detected during execution. " + << "You are probably executing an operator which " + << "doesn't support NDArray inputs with non-default storage."; + } + for (size_t i = 0; i < src.size(); i++) { + CastStorageDispatch(ctx, src[i], dst[i]); + } +} + +// Check if any storage type is not default storage +inline bool ContainsNonDefaultStorage(const StorageTypeVector& vstorage) { + for (const auto& i : vstorage) { + if (i != kUndefinedStorage && i != kDefaultStorage) return true; + } + return false; +} + +// Check if any NDArray in the list has default storage +inline bool ContainsDefaultStorage(const std::vector& ndarrays) { + for (const auto &nd : ndarrays) { + if (nd.storage_type() == kDefaultStorage) { + return true; + } + } + return false; +} + // heuristic to dermine number of threads per GPU inline int GetNumThreadPerGPU() { // This is resource efficient option. @@ -56,6 +148,67 @@ inline int GetExecNumMatchColor() { return std::min(num_match_color, GetNumThreadPerGPU()); } +template +V ParallelAccumulate(const T* a, const int n, V start) { + V sum = start; +#pragma omp parallel for reduction(+:sum) + for (int i = 0; i < n; ++i) { + sum += a[i]; + } + return sum; +} + +/*! + * \brief + * Helper function for ParallelSort. + * DO NOT call this function directly. + * Use the interface ParallelSort instead. + * Ref: https://github.com/dmlc/difacto/blob/master/src/common/parallel_sort.h + */ +template +void ParallelSortHelper(RandomIt first, size_t len, + size_t grainsize, const Compare& comp) { + if (len < grainsize) { + std::sort(first, first+len, comp); + } else { + std::thread thr(ParallelSortHelper, first, len/2, grainsize, comp); + ParallelSortHelper(first+len/2, len - len/2, grainsize, comp); + thr.join(); + std::inplace_merge(first, first+len/2, first+len, comp); + } +} + +/*! 
+ * \brief + * Sort the elements in the range [first, last) into the ascending order defined by + * the comparator comp. + * If the length of the range [first, last) is greater than a certain threshold, + * the range will be recursively divided into two and assign two threads + * to sort each half range. + * Ref: https://github.com/dmlc/difacto/blob/master/src/common/parallel_sort.h + */ +template +void ParallelSort(RandomIt first, RandomIt last, size_t num_threads, Compare comp) { + const auto num = std::distance(first, last); + size_t grainsize = std::max(num / num_threads + 5, static_cast(1024*16)); + ParallelSortHelper(first, num, grainsize, comp); +} + +/*! + * \brief + * Sort the elements in the range [first, last) into ascending order. + * The elements are compared using the default < operator. + * If the length of the range [first, last) is greater than a certain threshold, + * the range will be recursively divided into two and assign two threads + * to sort each half range. + * Ref: https://github.com/dmlc/difacto/blob/master/src/common/parallel_sort.h + */ +template +void ParallelSort(RandomIt first, RandomIt last, size_t num_threads) { + ParallelSort(first, last, num_threads, + std::less::value_type>()); +} + /*! * \brief Random Engine */ @@ -159,8 +312,6 @@ FCompType GetFCompute(const nnvm::Op* op, const std::string& name, } } -#endif // DMLC_USE_CXX11 - } // namespace common } // namespace mxnet #endif // MXNET_COMMON_UTILS_H_ diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 046460b85900..ed8cbac68ae0 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "../common/utils.h" #include "./exec_pass.h" @@ -40,29 +41,87 @@ const OperatorProperty* OpPropGetOpProperty(const NodeAttrs& attrs); namespace exec { -// forward executor -class StatefulComputeExecutor : public OpExecutor { +// abstract OpExecutor which provides storage fallback procedure on +// non-default inputs and outputs +// FComputeExecutor and FStatefulComputeExecutor inherit from this class +class StorageFallbackOpExecutor : public OpExecutor { public: - void Run(RunContext rctx) override { + explicit StorageFallbackOpExecutor(const std::vector &mutate_idx) + : mutate_idx_(mutate_idx) {} + + void Setup() override { + using namespace common; + in_data_.clear(); out_data_.clear(); + pre_temp_src_.clear(); pre_temp_dst_.clear(); + post_temp_src_.clear(); post_temp_dst_.clear(); + in_temp_idx_map_.clear(); + SetupDefaultBlobs(in_array, &in_data_, &pre_temp_src_, &pre_temp_dst_, &in_temp_idx_map_); + SetupDefaultBlobs(out_array, &out_data_, &post_temp_dst_, &post_temp_src_); + for (const auto idx : mutate_idx_) { + auto map_iter = in_temp_idx_map_.find(idx); + if (map_iter != in_temp_idx_map_.end()) { + post_temp_src_.push_back(pre_temp_dst_[map_iter->second]); + post_temp_dst_.push_back(in_array[idx]); + } + } + } + + protected: + // storage fallback before fcompute is launched + void PreFCompute(bool is_gpu) { + using namespace common; + if (is_gpu) { +#if MXNET_USE_CUDA + CastNonDefaultStorage(pre_temp_src_, pre_temp_dst_, op_ctx); +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } else { + CastNonDefaultStorage(pre_temp_src_, pre_temp_dst_, op_ctx); + } + } + + // storage fallback after fcompute is completed + void PostFCompute(bool is_gpu) { + using namespace common; + if (is_gpu) { +#if MXNET_USE_CUDA + CastNonDefaultStorage(post_temp_src_, 
post_temp_dst_, op_ctx); +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } else { + CastNonDefaultStorage(post_temp_src_, post_temp_dst_, op_ctx); + } + } + + // default storage tensor blobs for fcompute + std::vector in_data_, out_data_; + // source NDArray for cast storage + std::vector pre_temp_src_, post_temp_src_; + // destination NDArray for cast storage + std::vector pre_temp_dst_, post_temp_dst_; + // mapping from index in input_blobs to index in pre_temp_dst + std::unordered_map in_temp_idx_map_; + // indices of mutatable inputs + std::vector mutate_idx_; +}; + + +// stateful compute executor +class StatefulComputeExecutor : public StorageFallbackOpExecutor { + public: + void Run(RunContext rctx, bool is_gpu) override { op_ctx.run_ctx = rctx; + PreFCompute(is_gpu); fcompute_(state_, op_ctx, in_data_, req, out_data_); + PostFCompute(is_gpu); #if MKL_EXPERIMENTAL == 1 mkl_tblobs_prv_to_cpu(in_data_); mkl_tblobs_prv_to_cpu(out_data_); #endif } - void Setup() override { - in_data_.clear(); - for (size_t i = 0; i < in_array.size(); ++i) { - in_data_.push_back(in_array[i].data()); - } - out_data_.clear(); - for (size_t i = 0; i < out_array.size(); ++i) { - out_data_.push_back(out_array[i].data()); - } - } - ExecType exec_type() const override { return exec_type_; } @@ -73,22 +132,23 @@ class StatefulComputeExecutor : public OpExecutor { explicit StatefulComputeExecutor(const OpStatePtr& state, const FStatefulCompute& fcompute, - ExecType exec_type) - : state_(state), fcompute_(fcompute), exec_type_(exec_type) {} + ExecType exec_type, + const std::vector &mutate_idx) + : StorageFallbackOpExecutor(mutate_idx), + state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: friend Graph AttachOpExecs(Graph g); OpStatePtr state_; FStatefulCompute fcompute_; ExecType exec_type_; - std::vector in_data_, out_data_; }; -// forward executor +// stateful compute_ex executor class StatefulComputeExExecutor : public OpExecutor { public: - void Run(RunContext rctx) override { + void Run(RunContext rctx, bool is_gpu) override { op_ctx.run_ctx = rctx; fcompute_(state_, op_ctx, in_array, req, out_array); } @@ -116,42 +176,60 @@ class StatefulComputeExExecutor : public OpExecutor { }; -// fcompute executor executor -class FComputeExecutor : public OpExecutor { +// fcompute executor +class FComputeExecutor : public StorageFallbackOpExecutor { public: - void Run(RunContext rctx) override { + void Run(RunContext rctx, bool is_gpu) override { + using namespace common; op_ctx.run_ctx = rctx; + PreFCompute(is_gpu); fcompute_(attrs_, op_ctx, in_data_, req, out_data_); + PostFCompute(is_gpu); #if MKL_EXPERIMENTAL == 1 mkl_tblobs_prv_to_cpu(in_data_); mkl_tblobs_prv_to_cpu(out_data_); #endif } - void Setup() override { - in_data_.resize(in_array.size()); - out_data_.resize(out_array.size()); - auto get_blob = [](const NDArray& nd) { - return nd.data(); - }; - std::transform(in_array.begin(), in_array.end(), in_data_.begin(), get_blob); - std::transform(out_array.begin(), out_array.end(), out_data_.begin(), get_blob); + ExecType exec_type() const override { + return exec_type_; } + explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, + ExecType exec_type, const std::vector &mutate_idx) + : StorageFallbackOpExecutor(mutate_idx), + attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { + } + + private: + NodeAttrs attrs_; + FCompute fcompute_; + ExecType exec_type_; +}; + +// fcompute_ex executor +class FComputeExExecutor : public OpExecutor { + public: + void 
Run(RunContext rctx, bool is_gpu) override { + op_ctx.run_ctx = rctx; + fcompute_(attrs_, op_ctx, in_array, req, out_array); + } + + void Setup() override {} + ExecType exec_type() const override { return exec_type_; } - explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, - ExecType exec_type) + explicit FComputeExExecutor(const NodeAttrs& attrs, FComputeEx fcompute, + ExecType exec_type) : attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { } private: NodeAttrs attrs_; - FCompute fcompute_; + FComputeEx fcompute_; ExecType exec_type_; - std::vector in_data_, out_data_; }; // pass to attach operator executors @@ -170,6 +248,8 @@ Graph AttachOpExecs(Graph g) { const auto& vctx = g.GetAttr("context"); const auto& saved_states = g.GetAttr< std::unordered_map >("saved_states"); + const auto& dispatch_stypes = g.GetAttr("dispatch_stypes"); + // get the graph const auto& idx = g.indexed_graph(); @@ -207,7 +287,8 @@ Graph AttachOpExecs(Graph g) { FStatefulCompute fcompute = common::GetFCompute( op, "FStatefulCompute", vctx[i]); if (fcompute != nullptr) { - ret[i] = std::make_shared(state, fcompute, exec_type); + ret[i] = std::make_shared(state, fcompute, + exec_type, mutate_index); } else { FStatefulComputeEx fcompute_ex = common::GetFCompute( op, "FStatefulComputeEx", vctx[i]); @@ -226,7 +307,7 @@ Graph AttachOpExecs(Graph g) { if (fcompute != nullptr) { ret[i] = std::make_shared( dynamic_cast(ret[fwd_id].get())->state_, - fcompute, exec_type); + fcompute, exec_type, mutate_index); } else { FStatefulComputeEx fcompute_ex = common::GetFCompute( op, "FStatefulComputeEx", vctx[i]); @@ -239,11 +320,15 @@ Graph AttachOpExecs(Graph g) { } } else { FCompute fcompute = common::GetFCompute(op, "FCompute", vctx[i]); - if (fcompute != nullptr) { + FComputeEx fcomp_ex = common::GetFCompute(op, "FComputeEx", vctx[i]); + if (fcomp_ex != nullptr && dispatch_stypes[i] != kDefaultStorage) { + ret[i] = std::make_shared( + inode.source->attrs, fcomp_ex, exec_type); + } else if (fcompute != nullptr) { ret[i] = std::make_shared( - inode.source->attrs, fcompute, exec_type); + inode.source->attrs, fcompute, exec_type, mutate_index); } else { - LOG(FATAL) << "FCompute not registered " << op->name; + LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name; } } } diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 0eda71d98214..326262147b9f 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -27,9 +27,12 @@ #include #include #include +#include #include +#include #include #include +#include namespace mxnet { namespace exec { @@ -37,6 +40,12 @@ namespace exec { /*! \brief reuse graph definition */ using nnvm::Graph; +const int kBadStorageID = -1; +const int kExternalStorageID = -2; +const int kDynamicStorageID = -3; + +const int kNonDefaultStorage = -2; + /*! * \brief executor to execute an operator * This is a graph executor dependent interface @@ -44,7 +53,7 @@ using nnvm::Graph; */ class OpExecutor { public: - /*! \brief input arrays */ + /*! \brief input data arrays, which may be either input or aux */ std::vector in_array; /*! \brief output data arrays */ std::vector out_array; @@ -65,7 +74,7 @@ class OpExecutor { * This function call do not synchronize the stream. * \param rctx The runtime context passed in by environment. */ - virtual void Run(RunContext rctx) = 0; + virtual void Run(RunContext rctx, bool is_gpu) = 0; /*! \return the execution type */ virtual ExecType exec_type() const = 0; /*! 
\return return engine variable for operator states */ @@ -123,6 +132,45 @@ Graph AttachOpResources(Graph g); */ Graph DetectInplaceAddTo(Graph g); +/*! + * \brief Infer shapes in the graph given the information. + * \param graph The input graph. + * \param shape_inputs The shapes of input symbols to the graph. + * \param shape_attr_key The key to the node attribute that can indicate shape. This is + * the place where manual hint for shapes could be injected. + * \return A graph with new attribute "shape" containing inferred shape of each NodeEntry. + * The index of ShapeVector is given by graph.indexed_graph().entry_id. + */ +Graph InferShape(Graph graph, + nnvm::ShapeVector shape_inputs, + const std::string& shape_attr_key = ""); + +/*! + * \brief Infer types in the graph given the information. + * \param graph The input graph. + * \param dtype_inputs The types of input symbols to the graph. + * \param dtype_attr_key The key to the node attribute that can indicate types. This is + * the place where manual hint for types could be injected. + * \return A graph with new attribute "dtype" containing inferred type of each NodeEntry. + * The index of ShapeVector is given by graph.indexed_graph().entry_id. + */ +Graph InferType(Graph graph, + nnvm::DTypeVector dtype_inputs, + const std::string& dtype_attr_key = ""); + +/*! + * \brief Infer storage types in the graph given the information. + * \param graph The input graph. + * \param storage_type_inputs The storage types of input symbols to the graph. + * \param storage_type_attr_key The key to the node attribute that can indicate storage types. + This is the place where manual hint for types could be injected. + * \return A graph with new attribute "storage_type" containing inferred type of each NodeEntry. + * The index of StorageTypeVector is given by graph.indexed_graph().entry_id. + */ +Graph InferStorageType(Graph graph, + StorageTypeVector storage_type_inputs, + const std::string& storage_type_attr_key = ""); + } // namespace exec } // namespace mxnet diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 6dc8cf39970e..9c4398343b1c 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -30,9 +30,15 @@ #include "./exec_pass.h" #include "./graph_executor.h" #include "../engine/profiler.h" +#include "../common/utils.h" namespace mxnet { namespace exec { + +GraphExecutor::GraphExecutor() { + log_verbose_ = dmlc::GetEnv("MXNET_EXEC_VERBOSE_LOGGING", false); +} + GraphExecutor::~GraphExecutor() { for (auto& n : op_nodes_) { if (n.cached_opr != nullptr) { @@ -47,6 +53,30 @@ GraphExecutor::~GraphExecutor() { } } +inline NDArray InitZeros(const NDArrayStorageType stype, const TShape &shape, + const Context &ctx, const int dtype) { + // NDArray with default storage + if (stype == kDefaultStorage) { + NDArray ret(shape, ctx, false, dtype); + ret = 0; + return ret; + } + // NDArray with non-default storage. Storage allocation is always delayed. + return NDArray(stype, shape, ctx, true, dtype); +} + +inline void EmplaceBackZeros(const NDArrayStorageType stype, const TShape &shape, + const Context &ctx, const int dtype, + std::vector *vec) { + // NDArray with default storage + if (stype == kDefaultStorage) { + vec->emplace_back(shape, ctx, false, dtype); + vec->back() = 0; + } else { + // NDArray with non-default storage. Storage allocation is always delayed. 
+ vec->emplace_back(stype, shape, ctx, true, dtype); + } +} void GraphExecutor::Forward(bool is_train) { RunOps(is_train, 0, num_forward_nodes_); } @@ -438,6 +468,29 @@ void HandleInferTypeError(const size_t num_forward_inputs, << oss.str(); } +void HandleInferStorageTypeError(const size_t num_forward_inputs, + const nnvm::IndexedGraph& idx, + const StorageTypeVector& inferred_stypes) { + int cnt = 10; + std::ostringstream oss; + for (size_t i = 0; i < num_forward_inputs; ++i) { + const uint32_t nid = idx.input_nodes().at(i); + const uint32_t eid = idx.entry_id(nid, 0); + const int inferred_stype = inferred_stypes[eid]; + if (inferred_stype == -1) { + const std::string& arg_name = idx[nid].source->attrs.name; + oss << arg_name << ": " << inferred_stype << ", "; + if (--cnt == 0) { + oss << "..."; + break; + } + } + } + LOG(FATAL) << "InferStoragetType pass cannot decide storage type for the following arguments " + "(-1 means unknown stype). Please consider providing them as inputs:\n" + << oss.str(); +} + /*! * \brief GraphExecutor initializer for regular bind flow in which * input arguments and gradients are provided by users. This initializer @@ -475,21 +528,25 @@ void GraphExecutor::Init(nnvm::Symbol symbol, data_entry_.resize(idx.num_node_entries()); nnvm::ShapeVector arg_shapes; nnvm::DTypeVector arg_dtypes; + StorageTypeVector arg_stypes; for (size_t i = 0; i < num_forward_inputs_; ++i) { const uint32_t nid = idx.input_nodes().at(i); const std::string& arg_name = idx[nid].source->attrs.name; + size_t eid = idx.entry_id(nid, 0); if (mutable_nodes.count(nid)) { CHECK_LT(aux_top, aux_states.size()); - data_entry_[idx.entry_id(nid, 0)] = aux_states[aux_top]; + data_entry_[eid] = aux_states[aux_top]; arg_shapes.push_back(aux_states[aux_top].shape()); arg_dtypes.push_back(aux_states[aux_top].dtype()); + arg_stypes.push_back(aux_states[aux_top].storage_type()); aux_state_map_.emplace(arg_name, aux_states[aux_top]); ++aux_top; } else { CHECK_LT(arg_top, in_args.size()); - data_entry_[idx.entry_id(nid, 0)] = in_args[arg_top]; + data_entry_[eid] = in_args[arg_top]; arg_shapes.push_back(in_args[arg_top].shape()); arg_dtypes.push_back(in_args[arg_top].dtype()); + arg_stypes.push_back(in_args[arg_top].storage_type()); in_arg_map_.emplace(arg_name, in_args[arg_top]); if (kNullOp != grad_req_types[arg_top]) { grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_store[arg_top]); @@ -497,23 +554,33 @@ void GraphExecutor::Init(nnvm::Symbol symbol, } ++arg_top; } + if (log_verbose_) { + LOG(INFO) << "\tassign data entry\t" << eid << " as stype " + << data_entry_[eid].storage_type() << " (input)"; + } } // expand arg_shapes and arg_dtypes to contain backward inputs arg_shapes.resize(idx.input_nodes().size(), TShape()); - g = nnvm::pass::InferShape(g, arg_shapes, "__shape__"); + g = InferShape(std::move(g), arg_shapes, "__shape__"); if (g.GetAttr("shape_num_unknown_nodes") != 0U) { HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), g.GetAttr("shape")); } arg_dtypes.resize(idx.input_nodes().size(), -1); - g = nnvm::pass::InferType(g, arg_dtypes, "__dtype__"); + g = InferType(std::move(g), arg_dtypes, "__dtype__"); if (g.GetAttr("dtype_num_unknown_nodes") != 0U) { HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), g.GetAttr("dtype")); } + g = InferStorageType(std::move(g), arg_stypes, "__storage_type__"); + if (g.GetAttr("storage_type_num_unknown_nodes") != 0U) { + HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), + g.GetAttr("storage_type")); + } + // 
Initialize the rest attributes of the graph. // This function can be called by regular bind // operation flow as well. @@ -529,6 +596,7 @@ void GraphExecutor::Init(nnvm::Symbol symbol, void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, const nnvm::ShapeVector& inferred_shapes, const nnvm::DTypeVector& inferred_dtypes, + const StorageTypeVector& inferred_stypes, const std::vector& in_arg_ctxes, const std::vector& arg_grad_ctxes, const std::vector& aux_state_ctxes, @@ -546,22 +614,37 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, const uint32_t eid = idx.entry_id(nid, 0); const TShape& inferred_shape = inferred_shapes[eid]; const int inferred_dtype = inferred_dtypes[eid]; + const NDArrayStorageType inferred_stype = (NDArrayStorageType) inferred_stypes[eid]; const std::string& arg_name = idx[nid].source->attrs.name; if (mutable_nodes.count(nid)) { // aux_states - aux_state_vec->emplace_back(inferred_shape, aux_state_ctxes[aux_top], false, inferred_dtype); - aux_state_vec->back() = 0; + EmplaceBackZeros(inferred_stype, inferred_shape, aux_state_ctxes[aux_top], + inferred_dtype, aux_state_vec); data_entry_[eid] = aux_state_vec->back(); aux_state_map_.emplace(arg_name, aux_state_vec->back()); ++aux_top; + if (log_verbose_) { + LOG(INFO) << "\tassign aux entry\t" << eid << "\t as stype " << inferred_stype; + } } else { // in_args - in_arg_vec->emplace_back(inferred_shape, in_arg_ctxes[arg_top], false, inferred_dtype); - in_arg_vec->back() = 0; + EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top], + inferred_dtype, in_arg_vec); data_entry_[eid] = in_arg_vec->back(); + if (log_verbose_) { + LOG(INFO) << "\tassign data entry\t" << eid << "\tas stype " << inferred_stype; + } + // Get the storage type for grad if (kNullOp == grad_req_types[arg_top]) { arg_grad_vec->emplace_back(); } else { - arg_grad_vec->emplace_back(inferred_shape, arg_grad_ctxes[arg_top], false, inferred_dtype); - arg_grad_vec->back() = 0; + // Init based on storage type + auto grad_oid = grad_store_.size() + num_forward_outputs_; + auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); + auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; + EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top], + inferred_dtype, arg_grad_vec); + if (log_verbose_) { + LOG(INFO) << "\tassign grad entry\t" << grad_eid << "\tas stype " << grad_stype; + } grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); arg_grad_map_.emplace(arg_name, arg_grad_vec->back()); } @@ -573,33 +656,40 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, /*! * \brief If the requested ndarray's shape size is less than - * the corresponding shared_data_array's shape size, reuse - * the memory allocation; otherwise, create a zero ndarray. + * the corresponding shared_data_array's shape size and the + * storage type is default storage, reuse the memory allocation + * in shared_buffer; otherwise, create a zero ndarray. 
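 * \note Only arguments with default (dense) storage are cached in and reused
 *       from shared_buffer; for sparse storage types a fresh zero NDArray is
 *       created on every call. The early-return guard at the top of the body
 *       compares dest_arg_dtype with kDefaultStorage, but per this contract it
 *       is presumably intended to branch on dest_arg_stype.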
*/ NDArray ReshapeOrCreate(const std::string& name, const TShape& dest_arg_shape, const int dest_arg_dtype, + const NDArrayStorageType dest_arg_stype, const Context& ctx, std::unordered_map* shared_buffer) { + if (dest_arg_dtype != kDefaultStorage) { + return InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); + } auto it = shared_buffer->find(name); if (it != shared_buffer->end()) { if (it->second.shape().Size() >= dest_arg_shape.Size()) { // memory can be reused CHECK_EQ(it->second.dtype(), dest_arg_dtype) << "Requested arg array's dtype does not match the reusable ndarray"; + CHECK_EQ(it->second.storage_type(), kDefaultStorage) + << "shared_buffer should only contain NDArrays with default storage type."; return it->second.Reshape(dest_arg_shape); } else { LOG(WARNING) << "Bucketing: data " << name << " has a shape " << dest_arg_shape << ", which is larger than already allocated shape " << it->second.shape() << ". Need to re-allocate. Consider putting default bucket key to be " << "the bucket taking the largest input for better memory sharing."; - it->second = NDArray(dest_arg_shape, ctx, false, dest_arg_dtype); - it->second = 0; + // the NDArrays in shared_buffer are guaranteed to be of default storage + it->second = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); return it->second; } // arg_array.shape().Size() >= arg_shape.Size() } else { - auto p = shared_buffer->emplace(name, NDArray(dest_arg_shape, ctx, false, dest_arg_dtype)); - p.first->second = 0; - return p.first->second; + auto ret = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); + shared_buffer->emplace(name, ret); + return ret; } // if (it != shared_buffer->end()) } @@ -612,6 +702,7 @@ NDArray ReshapeOrCreate(const std::string& name, void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, const nnvm::ShapeVector& inferred_shapes, const nnvm::DTypeVector& inferred_dtypes, + const StorageTypeVector& inferred_stypes, const std::vector& in_arg_ctxes, const std::vector& arg_grad_ctxes, const std::vector& aux_state_ctxes, @@ -631,9 +722,12 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, const uint32_t eid = idx.entry_id(nid, 0); const TShape& inferred_shape = inferred_shapes[eid]; const int inferred_dtype = inferred_dtypes[eid]; + const NDArrayStorageType inferred_stype = (NDArrayStorageType) inferred_stypes[eid]; const std::string& arg_name = idx[nid].source->attrs.name; - if (mutable_nodes.count(nid)) { // aux_states - if (nullptr != shared_exec) { + // aux_states + if (mutable_nodes.count(nid)) { + if (nullptr != shared_exec && inferred_stype == kDefaultStorage && + shared_exec->aux_state_map().at(arg_name).storage_type() == kDefaultStorage) { const NDArray& aux_nd = shared_exec->aux_state_map().at(arg_name); CHECK_EQ(inferred_shape, aux_nd.shape()) << "Inferred shape does not match shared_exec.aux_array's shape." 
@@ -647,16 +741,18 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, << arg_name << " for the current executor"; aux_state_vec->emplace_back(aux_nd); } else { - aux_state_vec->emplace_back(inferred_shape, aux_state_ctxes[aux_top], - false, inferred_dtype); - aux_state_vec->back() = 0; + EmplaceBackZeros(inferred_stype, inferred_shape, aux_state_ctxes[aux_top], + inferred_dtype, aux_state_vec); } // if (has_shared_exec) data_entry_[eid] = aux_state_vec->back(); aux_state_map_.emplace(arg_name, aux_state_vec->back()); ++aux_top; - } else { // in_args + } else { // in_args and grad for in_args if (shared_arg_names.count(arg_name)) { // model parameter - if (nullptr != shared_exec) { + // model parameter + if (nullptr != shared_exec && inferred_stype == kDefaultStorage && + shared_exec->in_arg_map().at(arg_name).storage_type() == kDefaultStorage) { + // try to reuse memory from shared_exec const NDArray& in_arg_nd = shared_exec->in_arg_map().at(arg_name); CHECK_EQ(inferred_shape, in_arg_nd.shape()) << "Inferred shape does not match shared_exec.arg_array's shape" @@ -669,33 +765,43 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, " be resued for creating NDArray of the argument" << arg_name << " for the current executor"; in_arg_vec->emplace_back(in_arg_nd); - if (kNullOp == grad_req_types[arg_top]) { - arg_grad_vec->emplace_back(); - } else { + } else { + // doesn't have shared_exec, or non-default storage + EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top], + inferred_dtype, in_arg_vec); + } + // gradient for model parameter + if (kNullOp == grad_req_types[arg_top]) { + arg_grad_vec->emplace_back(); + } else { + auto grad_oid = grad_store_.size() + num_forward_outputs_; + auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); + auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; + if (nullptr != shared_exec && grad_stype == kDefaultStorage && + shared_exec->arg_grad_map().at(arg_name).storage_type() == kDefaultStorage) { + // try to reuse memory from shared_exec arg_grad_vec->emplace_back(shared_exec->arg_grad_map().at(arg_name)); - grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); - } // if (kNullOp == grad_req_types[arg_top]) - } else { // !has shared_exec - in_arg_vec->emplace_back(inferred_shape, in_arg_ctxes[arg_top], false, inferred_dtype); - in_arg_vec->back() = 0; - if (kNullOp == grad_req_types[arg_top]) { - arg_grad_vec->emplace_back(); } else { - arg_grad_vec->emplace_back(inferred_shape, arg_grad_ctxes[arg_top], - false, inferred_dtype); - arg_grad_vec->back() = 0; - grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); - } // if (kNullOp == grad_req_types[arg_top]) - } // if (has_shared_exec) + EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top], + inferred_dtype, arg_grad_vec); + } + grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); + } } else { // !shared_arg_names.count(arg_name) + // model parameter in_arg_vec->emplace_back(ReshapeOrCreate(arg_name, inferred_shape, inferred_dtype, - in_arg_ctxes[arg_top], shared_buffer)); + inferred_stype, in_arg_ctxes[arg_top], + shared_buffer)); + // gradient for model parameter if (kNullOp == grad_req_types[arg_top]) { arg_grad_vec->emplace_back(); } else { + auto grad_oid = grad_store_.size() + num_forward_outputs_; + auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); + auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; arg_grad_vec->emplace_back(ReshapeOrCreate("grad 
of " + arg_name, inferred_shape, - inferred_dtype, arg_grad_ctxes[arg_top], - shared_buffer)); + inferred_dtype, grad_stype, + arg_grad_ctxes[arg_top], shared_buffer)); grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); } // if (kNullOp == grad_req_types[arg_top]) } // if (shared_arg_names.count(arg_name)) @@ -718,14 +824,35 @@ void GraphExecutor::FinishInitGraph(nnvm::Symbol symbol, Executor* shared_exec, const nnvm::NodeEntryMap& feed_dict) { const auto& idx = g.indexed_graph(); + // dispatch based on stype per operator + const auto& vstorage_type = g.GetAttr("storage_type"); + StorageTypeVector dispatch_stypes(idx.num_nodes(), kUndefinedStorage); + for (size_t nid = 0; nid < idx.num_nodes(); nid++) { + const auto& inode = idx[nid]; + auto num_outputs = inode.source->num_outputs(); + auto num_inputs = inode.inputs.size(); + StorageTypeVector vs(num_inputs + num_outputs, kUndefinedStorage); + for (size_t i = 0; i < num_inputs; i++) { + auto e = inode.inputs[i]; + vs[i] = vstorage_type[idx.entry_id(e)]; + CHECK_NE(vs[i], kUndefinedStorage); + } + for (uint32_t i = 0; i < num_outputs; ++i) { + uint32_t eid = idx.entry_id(nid, i); + vs[i + num_inputs] = vstorage_type[eid]; + } + bool contains_non_default = common::ContainsNonDefaultStorage(vs); + dispatch_stypes[nid] = contains_non_default ? kNonDefaultStorage : kDefaultStorage; + } + g.attrs["dispatch_stypes"] = std::make_shared(std::move(dispatch_stypes)); + + // data entries for output gradients for (size_t j = num_forward_outputs_; j < idx.outputs().size(); ++j) { data_entry_[idx.entry_id(idx.outputs()[j])] = grad_store_[j - num_forward_outputs_].second; } { // memory allocator - const int kBadStorageID = -1; - const int kExternalStorageID = -2; nnvm::StorageVector arg_storage_id(idx.num_node_entries(), kBadStorageID); for (size_t j = num_forward_outputs_; j < idx.outputs().size(); ++j) { arg_storage_id[idx.entry_id(idx.outputs()[j])] = kExternalStorageID; @@ -735,6 +862,9 @@ void GraphExecutor::FinishInitGraph(nnvm::Symbol symbol, data_entry_[eid] = kv.second; arg_storage_id[eid] = kExternalStorageID; } + for (size_t i = 0; i < idx.num_node_entries(); i++) { + if (vstorage_type[i] != kDefaultStorage) arg_storage_id[i] = kDynamicStorageID; + } g.attrs["storage"] = std::make_shared(std::move(arg_storage_id)); g = nnvm::ApplyPass(g, "PlanMemory"); } @@ -792,6 +922,7 @@ void GraphExecutor::Init(nnvm::Symbol symbol, const std::vector& aux_state_ctxes, const std::unordered_map& arg_shape_map, const std::unordered_map& arg_dtype_map, + const std::unordered_map& arg_stype_map, const std::vector& grad_req_types, const std::unordered_set& shared_arg_names, std::vector* in_arg_vec, @@ -811,6 +942,7 @@ void GraphExecutor::Init(nnvm::Symbol symbol, const nnvm::IndexedGraph& idx = g.indexed_graph(); nnvm::ShapeVector arg_shapes(idx.input_nodes().size(), TShape()); nnvm::DTypeVector arg_dtypes(idx.input_nodes().size(), -1); + StorageTypeVector arg_stypes(idx.input_nodes().size(), kUndefinedStorage); for (size_t i = 0; i < num_forward_inputs_; ++i) { const uint32_t nid = idx.input_nodes().at(i); const std::string& name = idx[nid].source->attrs.name; @@ -822,29 +954,41 @@ void GraphExecutor::Init(nnvm::Symbol symbol, if (arg_dtype_map.end() != it2) { arg_dtypes[i] = it2->second; } + auto it3 = arg_stype_map.find(name); + if (arg_stype_map.end() != it3) { + arg_stypes[i] = it3->second; + } } - g = nnvm::pass::InferShape(g, arg_shapes, "__shape__"); + g = InferShape(std::move(g), arg_shapes, "__shape__"); if 
(g.GetAttr("shape_num_unknown_nodes") != 0U) { HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), g.GetAttr("shape")); } - g = nnvm::pass::InferType(g, arg_dtypes, "__dtype__"); + g = InferType(std::move(g), arg_dtypes, "__dtype__"); if (g.GetAttr("dtype_num_unknown_nodes") != 0U) { HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), g.GetAttr("dtype")); } + g = InferStorageType(std::move(g), arg_stypes, "__storage_type__"); + if (g.GetAttr("storage_type_num_unknown_nodes") != 0U) { + HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), + g.GetAttr("storage_type")); + } + // Create in_args, arg_grads, and aux_states using // the inferred shapes and dtypes. if (nullptr == shared_buffer) { // regular simple bind InitArguments(idx, g.GetAttr("shape"), g.GetAttr("dtype"), + g.GetAttr("storage_type"), in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, grad_req_types, in_arg_vec, arg_grad_vec, aux_state_vec); } else { // simple bind using shared data arrays and shared_exec InitArguments(idx, g.GetAttr("shape"), g.GetAttr("dtype"), + g.GetAttr("storage_type"), in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, grad_req_types, shared_arg_names, shared_exec, shared_buffer, in_arg_vec, arg_grad_vec, aux_state_vec); @@ -905,20 +1049,29 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { const auto& vdtype = graph_.GetAttr("dtype"); const auto& vshape = graph_.GetAttr("shape"); const auto& vstorage = graph_.GetAttr("storage_id"); + const auto& vstorage_type = graph_.GetAttr("storage_type"); const auto& vctx = graph_.GetAttr("context"); CHECK_EQ(idx.num_node_entries(), vshape.size()); CHECK_EQ(idx.num_node_entries(), vdtype.size()); CHECK_EQ(idx.num_node_entries(), vstorage.size()); CHECK_EQ(data_entry_.size(), vshape.size()); std::vector data_context(idx.num_node_entries()); + std::vector data_storage_type(idx.num_node_entries(), kUndefinedStorage); for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { for (uint32_t i = 0; i < idx[nid].source->num_outputs(); ++i) { - data_context[idx.entry_id(nid, i)] = vctx[nid]; + auto eid = idx.entry_id(nid, i); + data_context[eid] = vctx[nid]; + CHECK_NE(vstorage_type[nid], kUndefinedStorage); + data_storage_type[eid] = (NDArrayStorageType) vstorage_type[nid]; } } // information about the pool - using PoolEntry = std::pair; + struct PoolEntry { + Context ctx; + size_t bytes; + NDArrayStorageType stype; + }; std::vector pool_info; // assign array to head gradient @@ -926,26 +1079,36 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { uint32_t nid = idx.input_nodes().at(i); uint32_t oid = head_grad_map_.at(idx[nid].source); uint32_t eid = idx.entry_id(idx.outputs()[oid]); + NDArrayStorageType stype = (NDArrayStorageType) vstorage_type[eid]; CHECK_NE(vshape[eid].ndim(), 0U); CHECK_NE(vdtype[eid], -1); - data_entry_[idx.entry_id(nid, 0)] = - NDArray(vshape[eid], data_context[eid], false, vdtype[eid]); + auto data_eid = idx.entry_id(nid, 0); + // initialize based on storage_type + if (stype != kDefaultStorage) { + data_entry_[data_eid] = NDArray(stype, vshape[eid], data_context[eid], true, vdtype[eid]); + } else { + data_entry_[data_eid] = NDArray(vshape[eid], data_context[eid], false, vdtype[eid]); + } + if (log_verbose_) { + LOG(INFO) << "\tinit head_g entry\t" << data_eid << "\tas stype " << stype; + } } // get maximum bytes in each pool for (size_t i = 0; i < vshape.size(); ++i) { if (!data_entry_[i].is_none()) continue; size_t bytes = vshape[i].Size() * mshadow::mshadow_sizeof(vdtype[i]); int 
storage_id = vstorage[i]; + // skip pool allocation for kBadStorageID, kExternalStorageID and kDynamicStorageID if (storage_id < 0) continue; size_t sid = static_cast(storage_id); if (sid >= pool_info.size()) { - pool_info.resize(sid + 1, PoolEntry{Context::CPU(), size_t(0)}); + pool_info.resize(sid + 1, PoolEntry{Context::CPU(), size_t(0), kUndefinedStorage}); } PoolEntry& info = pool_info[sid]; - if (info.second == 0) { - info = PoolEntry{data_context[i], bytes}; + if (info.bytes == 0) { + info = PoolEntry{data_context[i], bytes, data_storage_type[i]}; } else { - info.second = std::max(info.second, bytes); + info.bytes = std::max(info.bytes, bytes); } } // construct the re-use pool, if needed @@ -966,13 +1129,14 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { sorted_pool_index.push_back(i); } auto pool_comparator = [&pool_info](int lhs, int rhs){ - return pool_info[lhs].second > pool_info[rhs].second; + return pool_info[lhs].bytes > pool_info[rhs].bytes; }; std::sort(sorted_pool_index.begin(), sorted_pool_index.end(), pool_comparator); for (size_t i : sorted_pool_index) { - const Context& ctx = pool_info[i].first; - size_t bytes = pool_info[i].second; + const Context& ctx = pool_info[i].ctx; + size_t bytes = pool_info[i].bytes; + NDArrayStorageType storage_type = pool_info[i].stype; bool allocated = false; for (auto it = free_pool.lower_bound(bytes); it != free_pool.end(); ++it) { if (it->second.ctx() == ctx && it->first >= bytes) { @@ -987,7 +1151,9 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { CHECK_LE(nword, std::numeric_limits::max()); // allocate float arrays TShape shape{static_cast(nword)}; - NDArray nd(shape, ctx); + // TODO(junwu): adding delay_alloc=true to create nd + // is a temporary solution. + NDArray nd(shape, ctx, true); data_pool_[i] = nd; // put the new allocated arrays to shared pool if (shared_pool != nullptr) { @@ -997,15 +1163,22 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { } CHECK_EQ(data_pool_.size(), pool_info.size()); // assign the data entries - for (size_t i = 0; i < data_entry_.size(); ++i) { // avoid pre-allocated arrays if (!data_entry_[i].is_none()) continue; // assign allocated array by storage id int storage_id = vstorage[i]; - CHECK_GE(storage_id, 0) << "Do not support runtime shape op yet"; - const NDArray& src = data_pool_.at(storage_id); - data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); + auto storage_type = (NDArrayStorageType) vstorage_type[i]; + if (storage_type == kDefaultStorage) { + CHECK_GE(storage_id, 0) << "Do not support runtime shape op yet"; + const NDArray& src = data_pool_.at(storage_id); + data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); + } else { + data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i]); + } + if (log_verbose_) { + LOG(INFO) << "\tinit data entry\t" << i << "\tas stype " << storage_type; + } } } @@ -1020,11 +1193,28 @@ void GraphExecutor::InitCachedOps() { const auto& vctx = graph_.GetAttr("context"); const auto& addto_entry = graph_.GetAttr >("addto_entry"); const auto& skip_plus_node = graph_.GetAttr >("skip_plus_node"); + const auto& vstorage_type = graph_.GetAttr("storage_type"); op_nodes_.resize(idx.num_nodes()); // setup the array and requirements. 
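  // The logging block at the top of the loop below is gated on log_verbose_,
  // which the GraphExecutor constructor reads from the MXNET_EXEC_VERBOSE_LOGGING
  // environment variable. With the flag set, each node is printed together with
  // the storage type of every input/output entry, roughly of the form
  //
  //   node 3 dot
  //       input 5 stype: 1
  //       output 7 stype: 0
  //
  // (node ids, entry ids and the numeric stype values are illustrative only).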
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { const auto& inode = idx[nid]; + if (log_verbose_) { + if (inode.source->is_variable()) { + LOG(INFO) << "node " << nid << " var"; + } else { + LOG(INFO) << "node " << nid << " " << inode.source->attrs.op->name; + auto exec = op_execs[nid]; + for (const auto& e : inode.inputs) { + auto eid = idx.entry_id(e); + LOG(INFO) << "\t\tinput " << eid << " stype: " << vstorage_type[eid]; + } + for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) { + uint32_t eid = idx.entry_id(nid, index); + LOG(INFO) << "\t\toutput " << eid << " stype: " << vstorage_type[eid]; + } + } + } if (inode.source->is_variable()) continue; #if MXNET_USE_PROFILER op_nodes_[nid].opr_name = inode.source->op()->name.c_str(); @@ -1104,7 +1294,7 @@ void GraphExecutor::InitCachedOps() { if (is_async) { exec->op_ctx.async_on_complete = on_complete; } - exec->Run(ctx); + exec->Run(ctx, is_gpu); // call on complete only if it is async op if (!is_async) { if (is_gpu) { @@ -1265,7 +1455,8 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { CHECK_EQ(opnode.exec->out_array.size(), 1U); CopyFromTo(opnode.exec->in_array[0], &(opnode.exec->out_array[0])); } else if (opnode.exec->exec_type() == ExecType::kLocal) { - opnode.exec->Run(RunContext{opnode.ctx, nullptr}); + bool is_gpu = opnode.ctx.dev_mask() == gpu::kDevMask; + opnode.exec->Run(RunContext{opnode.ctx, nullptr}, is_gpu); } else if (opnode.cached_opr != nullptr) { #if MXNET_USE_PROFILER bool profiling = engine::Profiler::Get()->GetState() == engine::Profiler::kRunning; @@ -1335,7 +1526,7 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, RunContext ctx, Engine::CallbackOnComplete on_complete) { // Run all opr in the sub-graph for (auto &exec : exec_list) { - exec->Run(ctx); + exec->Run(ctx, is_gpu); } if (is_gpu) { #if MXNET_USE_CUDA @@ -1370,6 +1561,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol, const std::vector& aux_state_ctxes, const std::unordered_map& arg_shape_map, const std::unordered_map& arg_dtype_map, + const std::unordered_map& arg_stype_map, const std::vector& grad_req_types, const std::unordered_set& shared_arg_names, std::vector* in_args, @@ -1380,7 +1572,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol, auto exec = new exec::GraphExecutor(); exec->Init(symbol, default_ctx, group2ctx, in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, - arg_shape_map, arg_dtype_map, + arg_shape_map, arg_dtype_map, arg_stype_map, grad_req_types, shared_arg_names, in_args, arg_grads, aux_states, shared_buffer, shared_exec); diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h index dc50bef002ab..48222f05fae2 100644 --- a/src/executor/graph_executor.h +++ b/src/executor/graph_executor.h @@ -59,6 +59,7 @@ class GraphExecutor : public Executor { friend class autograd::AutogradRuntime; using Executor::MonitorCallback; + GraphExecutor(); virtual ~GraphExecutor(); void Forward(bool is_train) override; void PartialForward(bool is_train, int step, int *step_left) override; @@ -96,6 +97,7 @@ class GraphExecutor : public Executor { const std::vector& aux_state_ctxes, const std::unordered_map& arg_shape_map, const std::unordered_map& arg_dtype_map, + const std::unordered_map& arg_stype_map, const std::vector& grad_req_types, const std::unordered_set& shared_arg_names, std::vector* in_arg_vec, @@ -141,6 +143,7 @@ class GraphExecutor : public Executor { void InitArguments(const nnvm::IndexedGraph& idx, const nnvm::ShapeVector& 
inferred_shapes, const nnvm::DTypeVector& inferred_dtypes, + const StorageTypeVector& inferred_stypes, const std::vector& in_arg_ctxes, const std::vector& arg_grad_ctxes, const std::vector& aux_state_ctxes, @@ -153,6 +156,7 @@ class GraphExecutor : public Executor { void InitArguments(const nnvm::IndexedGraph& idx, const nnvm::ShapeVector& inferred_shapes, const nnvm::DTypeVector& inferred_dtypes, + const StorageTypeVector& inferred_stypes, const std::vector& in_arg_ctxes, const std::vector& arg_grad_ctxes, const std::vector& aux_state_ctxes, @@ -201,7 +205,8 @@ class GraphExecutor : public Executor { std::vector op_nodes_; // internal data entry of each node std::vector data_entry_; - // internal data pool of allocated entries + // internal data pool of allocated entries. + // these allocated entries can be used for static memory sharing between executors. std::vector data_pool_; // output arrays std::vector output_arrays_; @@ -233,6 +238,8 @@ class GraphExecutor : public Executor { bool prefer_bulk_execution_; // cached segment operator std::vector cached_seg_opr_; + // verbose logging + bool log_verbose_ = false; }; } // namespace exec diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc new file mode 100644 index 000000000000..144c3713e205 --- /dev/null +++ b/src/executor/infer_graph_attr_pass.cc @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file infer_graph_attr_pass.cc + * \brief infer graph shape, dtype, and storage type + */ + +#include +#include +#include "./exec_pass.h" + +namespace mxnet { +namespace exec { + +template +bool ApplyOpInferAttr(const nnvm::Graph& g, + const FInfer& finfer, + const NodeAttrs& attrs, + const uint32_t nid, + std::vector* in_attrs, + std::vector* out_attrs) { + return finfer(attrs, in_attrs, out_attrs); +} + +template<> +bool ApplyOpInferAttr(const nnvm::Graph& g, + const FInferStorageType& finfer, + const NodeAttrs& attrs, + const uint32_t nid, + std::vector* in_attrs, + std::vector* out_attrs) { + const ContextVector& ctxes = g.GetAttr("context"); + return finfer(attrs, ctxes[nid], in_attrs, out_attrs); +} + +/*!\brief + * This is a duplicate of the InferAttr function in nnvm with minor modification + * to support inferring storage type whose function signature is different from + * shape/type inference functions'. The nnvm InferAttr will be deprecated + * in the future. Please use interfaces InferShape, InferType, and InferStorageType + * to call this function. 
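 * The pass alternates forward and backward sweeps over the indexed graph,
 * re-applying each operator's inference function until the number of entries
 * still considered unknown (as decided by fis_none) stops decreasing. The
 * inferred values are written to the graph attribute named attr_name, and the
 * count of entries left unknown is stored under unknown_name so that callers
 * can detect incomplete inference.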
+ */ +template +nnvm::Graph InferAttr(nnvm::Graph &&ret, + const AttrType empty_val, + const char* infer_name, + const char* input_name, + const char* attr_key_name, + const char* attr_name, + const char* unknown_name, + IsNone fis_none, + FDefault fdefault, + bool backward_identity_assign) { + using nnvm::IndexedGraph; + using nnvm::Op; + using AttrVector = std::vector; + using dmlc::any; + + const IndexedGraph& idx = ret.indexed_graph(); + static auto& finfer_shape = + Op::GetAttr(infer_name); + static auto& is_backward = + Op::GetAttr("TIsBackward"); + // gradient function, used to get node correspondence. + static auto& fgrad = + Op::GetAttr("FGradient"); + // reshape shape vector + AttrVector rshape; + if (ret.attrs.count(attr_name) != 0) { + rshape = ret.MoveCopyAttr(attr_name); + } else { + rshape.resize(idx.num_node_entries(), empty_val); + } + + if (ret.attrs.count(input_name) != 0) { + const AttrVector& shape_args = ret.GetAttr(input_name); + CHECK_LE(shape_args.size(), idx.input_nodes().size()) + << "More provided " << attr_name << "s than number of arguments."; + for (size_t i = 0; i < shape_args.size(); ++i) { + rshape[idx.entry_id(idx.input_nodes()[i], 0)] = shape_args[i]; + } + // erase the provided arguments + ret.attrs.erase(input_name); + } + + // get the shape hints + std::string shape_hints_key = std::string(attr_name) + "_hints"; + if (ret.attrs.count(shape_hints_key)) { + nnvm::NodeEntryMap shape_hints = + ret.GetAttr>(shape_hints_key); + for (const auto& kv : shape_hints) { + nnvm::NodeEntry e = kv.first; + if (idx.exist(e.node.get())) { + rshape[idx.entry_id(kv.first)] = kv.second; + } + } + } + + std::string shape_attr_key; + if (ret.attrs.count(attr_key_name) != 0) { + shape_attr_key = ret.GetAttr(attr_key_name); + // erase the provided arguments + ret.attrs.erase(attr_key_name); + } + // Temp space for shape inference. + std::vector ishape, oshape; + + // inference step function for nid + auto infer_step = [&](uint32_t nid, bool last_iter) { + const auto& inode = idx[nid]; + const uint32_t num_inputs = inode.inputs.size(); + const uint32_t num_outputs = inode.source->num_outputs(); + if (inode.source->is_variable()) { + // Variable node. No operator. Only one output entry. + CHECK(inode.source->op() == nullptr); + CHECK_EQ(num_outputs, 1U); + const uint32_t out_ent_id = idx.entry_id(nid, 0); + if (shape_attr_key.length() != 0 && fis_none(rshape[out_ent_id])) { + auto it = inode.source->attrs.dict.find(shape_attr_key); + if (it != inode.source->attrs.dict.end()) { + std::istringstream is(it->second); + CHECK(is >> rshape[out_ent_id]) << "Invalid attribute"; + } + } + } else if (is_backward.get(inode.source->op(), false) && + inode.control_deps.size() && backward_identity_assign) { + CHECK_GE(inode.control_deps.size(), 1U) + << "BackwardOp need to have control_deps to its forward op"; + const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; + nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; + CHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; + // use gradient function to find out the correspondence. 
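  // The forward op's registered FGradient is called with placeholder output
  // gradient entries; the NodeEntry list it returns identifies, for each
  // forward input, the corresponding output of this backward node. Those
  // backward entries then simply copy the already-inferred attribute of the
  // matching forward entry instead of running an inference function.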
+ std::vector ograd(fwd_ptr->num_outputs()); + for (size_t i = 0; i < ograd.size(); ++i) { + ograd[i].index = static_cast(i); + } + // input gradient list + auto igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); + const nnvm::Node* igrad_node = nullptr; + // Input gradient assignement + for (size_t i = 0; i < igrad.size(); ++i) { + if (igrad[i].node->op() == inode.source->op()) { + uint32_t eid = idx.entry_id(nid, igrad[i].index); + if (fis_none(rshape[eid])) { + rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; + } else { + CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) + << "Backward shape inconsistent with the forward shape"; + } + if (igrad_node == nullptr) { + igrad_node = igrad[i].node.get(); + } else { + CHECK(igrad_node == igrad[i].node.get()); + } + } + } + // out grad entries + CHECK(igrad_node != nullptr) + << "Cannot find matching backward op for " << inode.source->attrs.name; + for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { + const nnvm::NodeEntry& e = igrad_node->inputs[i]; + if (e.node == nullptr) { + uint32_t eid = idx.entry_id(inode.inputs[i]); + if (fis_none(rshape[eid])) { + rshape[eid] = rshape[idx.entry_id(inode.control_deps[0], e.index)]; + } + } + } + } else { + bool forward_known = true; + // Forward operator inference. + ishape.resize(num_inputs, empty_val); + for (uint32_t i = 0; i < ishape.size(); ++i) { + ishape[i] = rshape[idx.entry_id(inode.inputs[i])]; + if (fis_none(ishape[i])) forward_known = false; + } + oshape.resize(num_outputs, empty_val); + for (uint32_t i = 0; i < oshape.size(); ++i) { + oshape[i] = rshape[idx.entry_id(nid, i)]; + if (fis_none(oshape[i])) forward_known = false; + } + auto finfer = finfer_shape.get(inode.source->op(), fdefault); + if (!forward_known) { + if (finfer != nullptr) { + // Call inference function of the operator. + try { + forward_known = ApplyOpInferAttr(ret, finfer, inode.source->attrs, + nid, &ishape, &oshape); + } catch (const std::exception& e) { + throw dmlc::Error("Error in operator " + inode.source->attrs.name + ": " + e.what()); + } + } else { + CHECK(!last_iter) + << "Attribute " << infer_name + << " is not registed by op " << inode.source->op()->name + << " we are not able to complete the inference because of this"; + } + } + // Save to the result map. + for (uint32_t i = 0; i < num_inputs; ++i) { + rshape[idx.entry_id(inode.inputs[i])] = ishape[i]; + } + for (uint32_t i = 0; i < num_outputs; ++i) { + rshape[idx.entry_id(nid, i)] = oshape[i]; + } + } + }; + + size_t last_num_unknown; + size_t num_unknown = rshape.size(); + int i = 0; + do { + if (i % 2 == 0) { + for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { + infer_step(nid, false); + } + } else { + // backward inference + for (uint32_t i = idx.num_nodes(); i != 0; --i) { + infer_step(i - 1, false); + } + } + last_num_unknown = num_unknown; + num_unknown = 0; + for (size_t j = 0; j < idx.num_node_entries(); ++j) { + if (fis_none(rshape[j])) { + ++num_unknown; + } + } + ++i; + } while (num_unknown > 0 && last_num_unknown > num_unknown); + // set the shapes + ret.attrs[attr_name] = std::make_shared(std::move(rshape)); + // number of nodes who knows the shape. 
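  // More precisely, num_unknown counts the node entries whose attribute is
  // still undetermined after the fixed-point iteration above; callers treat a
  // non-zero "*_num_unknown_nodes" value as an inference failure, e.g. (as in
  // GraphExecutor::Init earlier in this patch):
  //
  //   g = InferStorageType(std::move(g), arg_stypes, "__storage_type__");
  //   if (g.GetAttr<size_t>("storage_type_num_unknown_nodes") != 0U) {
  //     HandleInferStorageTypeError(...);  // report which inputs are unknown
  //   }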
+ ret.attrs[unknown_name] = std::make_shared(num_unknown); + return ret; +} + +// inference fucntion for same type +inline bool SameType(const nnvm::NodeAttrs& attrs, + std::vector *iattr, + std::vector *oattr) { + int def_v = -1; + for (int v : *oattr) { + if (v != -1) { + def_v = v; break; + } + } + if (def_v == -1) { + for (int v : *iattr) { + if (v != -1) { + def_v = v; break; + } + } + } + if (def_v == -1) return false; + for (int& v : *oattr) { + v = def_v; + } + for (int& v : *iattr) { + v = def_v; + } + return true; +} + +// assigning default type N to both input and output attrs with value -1 +template +inline bool DefaultType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *iattr, + std::vector *oattr) { + // TODO(junwu): check whether need to use ctx + for (int& v : *oattr) { + if (v == none) v = default_val; + } + for (int& v : *iattr) { + if (v == none) v = default_val; + } + return true; +} + +nnvm::Graph InferShape(nnvm::Graph graph, + nnvm::ShapeVector shape_inputs, + const std::string& shape_attr_key) { + using dmlc::any; + if (shape_inputs.size() != 0) { + graph.attrs["shape_inputs"] = std::make_shared(std::move(shape_inputs)); + } + if (shape_attr_key.length() != 0) { + graph.attrs["shape_attr_key"] = std::make_shared(std::move(shape_attr_key)); + } + return InferAttr( + std::move(graph), nnvm::TShape(), + "FInferShape", "shape_inputs", "shape_attr_key", + "shape", "shape_num_unknown_nodes", + [](const nnvm::TShape& s) { return s.ndim() == 0 || s.Size() == 0; }, + nullptr, true); +} + +nnvm::Graph InferType(nnvm::Graph graph, + nnvm::DTypeVector dtype_inputs, + const std::string& dtype_attr_key) { + using dmlc::any; + if (dtype_inputs.size() != 0) { + graph.attrs["dtype_inputs"] = std::make_shared(std::move(dtype_inputs)); + } + if (dtype_attr_key.length() != 0) { + graph.attrs["dtype_attr_key"] = std::make_shared(std::move(dtype_attr_key)); + } + return InferAttr( + std::move(graph), -1, + "FInferType", "dtype_inputs", "dtype_attr_key", + "dtype", "dtype_num_unknown_nodes", + [](const int t) { return t == -1; }, + SameType, true); +} + +nnvm::Graph InferStorageType(nnvm::Graph graph, + StorageTypeVector storage_type_inputs, + const std::string& storage_type_attr_key) { + using dmlc::any; + if (storage_type_inputs.size() != 0) { + graph.attrs["storage_type_inputs"] = std::make_shared(std::move(storage_type_inputs)); + } + if (storage_type_attr_key.length() != 0) { + graph.attrs["storage_type_attr_key"] = std::make_shared(std::move(storage_type_attr_key)); + } + // for storage type, the backward attr is not necessarily the same as it's correspondence + const int kDefaultStorage = 0; + return InferAttr( + std::move(graph), -1, + "FInferStorageType", "storage_type_inputs", "storage_type_attr_key", + "storage_type", "storage_type_num_unknown_nodes", + [](const int t) { return t == -1; }, + DefaultType, false); +} + +} // namespace exec +} // namespace mxnet diff --git a/src/executor/inplace_addto_detect_pass.cc b/src/executor/inplace_addto_detect_pass.cc index 26a91e3f1b5e..9359d8863594 100644 --- a/src/executor/inplace_addto_detect_pass.cc +++ b/src/executor/inplace_addto_detect_pass.cc @@ -62,6 +62,8 @@ Graph DetectInplaceAddTo(Graph g) { uint32_t eid_rhs = idx.entry_id(inode.inputs[1]); if (ref_count[eid_rhs] != 1) continue; if (inode.inputs[0].node_id >= inode.inputs[1].node_id) continue; + // TODO(haibin) support inplace addto for Dynamic Storage + if (storage_id[eid_rhs] == kDynamicStorageID) continue; CHECK_NE(storage_id[eid_rhs], sid); 
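  // The assignment below aliases the rhs entry onto the lhs entry's storage id
  // so that the elementwise add can be executed as an in-place accumulation.
  // Entries tagged kDynamicStorageID (sparse outputs, see FinishInitGraph in
  // graph_executor.cc) were skipped above because their memory is allocated
  // dynamically rather than planned statically, so it cannot be shared here.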
storage_id[eid_rhs] = sid; addto_entry[eid_rhs] = 1; diff --git a/src/io/iter_batchloader.h b/src/io/iter_batchloader.h index c5ec10618080..ade7c1a53bd2 100644 --- a/src/io/iter_batchloader.h +++ b/src/io/iter_batchloader.h @@ -41,7 +41,7 @@ namespace io { class BatchLoader : public IIterator { public: explicit BatchLoader(IIterator *base): - base_(base), head_(1), num_overflow_(0) { + head_(1), num_overflow_(0), base_(base) { } virtual ~BatchLoader(void) { @@ -52,7 +52,7 @@ class BatchLoader : public IIterator { std::vector > kwargs_left; // init batch param, it could have similar param with kwargs_left = param_.InitAllowUnknown(kwargs); - // Init space for out_ + // Init space for out out_.inst_index = new unsigned[param_.batch_size]; out_.batch_size = param_.batch_size; out_.data.clear(); @@ -69,6 +69,7 @@ class BatchLoader : public IIterator { } head_ = 1; } + virtual bool Next(void) { out_.num_batch_padd = 0; out_.batch_size = param_.batch_size; @@ -128,23 +129,25 @@ class BatchLoader : public IIterator { return out_; } - private: + protected: /*! \brief batch parameters */ BatchParam param_; /*! \brief output data */ TBlobBatch out_; - /*! \brief base iterator */ - IIterator *base_; /*! \brief on first */ int head_; /*! \brief number of overflow instances that readed in round_batch mode */ int num_overflow_; + /*! \brief tensor to hold data */ + std::vector data_; + + private: + /*! \brief base iterator */ + IIterator *base_; /*! \brief data shape */ std::vector shape_; /*! \brief unit size */ std::vector unit_size_; - /*! \brief tensor to hold data */ - std::vector data_; // initialize the data holder by using from the first batch. inline void InitData(const DataInst& first_batch) { shape_.resize(first_batch.data.size()); diff --git a/src/io/iter_libsvm.cc b/src/io/iter_libsvm.cc new file mode 100644 index 000000000000..803d19e74481 --- /dev/null +++ b/src/io/iter_libsvm.cc @@ -0,0 +1,288 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file iter_libsvm.cc + * \brief define a LibSVM Reader to read in arrays + */ +#include +#include +#include +#include +#include +#include "./iter_sparse_prefetcher.h" +#include "./iter_sparse_batchloader.h" + +namespace mxnet { +namespace io { +// LibSVM parameters +struct LibSVMIterParam : public dmlc::Parameter { + /*! \brief path to data libsvm file */ + std::string data_libsvm; + /*! \brief data shape */ + TShape data_shape; + /*! \brief path to label libsvm file */ + std::string label_libsvm; + /*! \brief label shape */ + TShape label_shape; + /*! \brief partition the data into multiple parts */ + int num_parts; + /*! 
\brief the index of the part will read*/ + int part_index; + // declare parameters + DMLC_DECLARE_PARAMETER(LibSVMIterParam) { + DMLC_DECLARE_FIELD(data_libsvm) + .describe("The input LibSVM file or a directory path."); + DMLC_DECLARE_FIELD(data_shape) + .describe("The shape of one example."); + DMLC_DECLARE_FIELD(label_libsvm).set_default("NULL") + .describe("The input LibSVM file or a directory path. " + "If NULL, all labels will be read from ``data_libsvm``."); + index_t shape1[] = {1}; + DMLC_DECLARE_FIELD(label_shape).set_default(TShape(shape1, shape1 + 1)) + .describe("The shape of one label."); + DMLC_DECLARE_FIELD(num_parts).set_default(1) + .describe("partition the data into multiple parts"); + DMLC_DECLARE_FIELD(part_index).set_default(0) + .describe("the index of the part will read"); + } +}; + +class LibSVMIter: public SparseIIterator { + public: + LibSVMIter() {} + virtual ~LibSVMIter() {} + + // intialize iterator loads data in + virtual void Init(const std::vector >& kwargs) { + param_.InitAllowUnknown(kwargs); + CHECK_EQ(param_.data_shape.ndim(), 1) << "dimension of data_shape is expected to be 1"; + CHECK_GT(param_.num_parts, 0) << "number of parts should be positive"; + CHECK_GE(param_.part_index, 0) << "part index should be non-negative"; + data_parser_.reset(dmlc::Parser::Create(param_.data_libsvm.c_str(), + param_.part_index, + param_.num_parts, "libsvm")); + if (param_.label_libsvm != "NULL") { + label_parser_.reset(dmlc::Parser::Create(param_.label_libsvm.c_str(), + param_.part_index, + param_.num_parts, "libsvm")); + CHECK_GT(param_.label_shape.Size(), 1) + << "label_shape is not expected to be (1,) when param_.label_libsvm is set."; + } else { + CHECK_EQ(param_.label_shape.Size(), 1) + << "label_shape is expected to be (1,) when param_.label_libsvm is NULL"; + } + // both data and label are of CSRStorage in libsvm format + if (param_.label_shape.Size() > 1) { + out_.data.resize(6); + } else { + // only data is of CSRStorage in libsvm format. 
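    // Layout of out_.data as filled in by Next() below: slots 0-2 hold the
    // data row's values, column indices and an (empty) indptr placeholder;
    // slot 3 holds either a scalar label (4 slots in total) or, when the label
    // is itself CSR, the label's values, with its indices and indptr following
    // in slots 4-5 (6 slots in total).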
+ out_.data.resize(4); + } + } + + virtual void BeforeFirst() { + data_parser_->BeforeFirst(); + if (label_parser_.get() != nullptr) { + label_parser_->BeforeFirst(); + } + data_ptr_ = label_ptr_ = 0; + data_size_ = label_size_ = 0; + inst_counter_ = 0; + end_ = false; + } + + virtual bool Next() { + if (end_) return false; + while (data_ptr_ >= data_size_) { + if (!data_parser_->Next()) { + end_ = true; return false; + } + data_ptr_ = 0; + data_size_ = data_parser_->Value().size; + } + out_.index = inst_counter_++; + CHECK_LT(data_ptr_, data_size_); + const auto data_row = data_parser_->Value()[data_ptr_++]; + // data, indices and indptr + out_.data[0] = AsDataBlob(data_row); + out_.data[1] = AsIdxBlob(data_row); + out_.data[2] = AsIndPtrPlaceholder(data_row); + + if (label_parser_.get() != nullptr) { + while (label_ptr_ >= label_size_) { + CHECK(label_parser_->Next()) + << "Data LibSVM's row is smaller than the number of rows in label_libsvm"; + label_ptr_ = 0; + label_size_ = label_parser_->Value().size; + } + CHECK_LT(label_ptr_, label_size_); + const auto label_row = label_parser_->Value()[label_ptr_++]; + // data, indices and indptr + out_.data[3] = AsDataBlob(label_row); + out_.data[4] = AsIdxBlob(label_row); + out_.data[5] = AsIndPtrPlaceholder(label_row); + } else { + out_.data[3] = AsScalarLabelBlob(data_row); + } + return true; + } + + virtual const DataInst &Value(void) const { + return out_; + } + + virtual const NDArrayStorageType GetStorageType(bool is_data) const { + if (is_data) return kCSRStorage; + return param_.label_shape.Size() > 1 ? kCSRStorage : kDefaultStorage; + } + + virtual const TShape GetShape(bool is_data) const { + if (is_data) return param_.data_shape; + return param_.label_shape; + } + + private: + inline TBlob AsDataBlob(const dmlc::Row& row) { + const real_t* ptr = row.value; + TShape shape(mshadow::Shape1(row.length)); + return TBlob((real_t*) ptr, shape, cpu::kDevMask); // NOLINT(*) + } + + inline TBlob AsIdxBlob(const dmlc::Row& row) { + const uint64_t* ptr = row.index; + TShape shape(mshadow::Shape1(row.length)); + return TBlob((int64_t*) ptr, shape, cpu::kDevMask, mshadow::kInt64); // NOLINT(*) + } + + inline TBlob AsIndPtrPlaceholder(const dmlc::Row& row) { + return TBlob(nullptr, mshadow::Shape1(0), cpu::kDevMask, mshadow::kInt64); + } + + inline TBlob AsScalarLabelBlob(const dmlc::Row& row) { + const real_t* ptr = row.label; + return TBlob((real_t*) ptr, mshadow::Shape1(1), cpu::kDevMask); // NOLINT(*) + } + + LibSVMIterParam param_; + // output instance + DataInst out_; + // internal instance counter + unsigned inst_counter_{0}; + // at end + bool end_{false}; + // label parser + size_t label_ptr_{0}, label_size_{0}; + size_t data_ptr_{0}, data_size_{0}; + std::unique_ptr > label_parser_; + std::unique_ptr > data_parser_; +}; + + +DMLC_REGISTER_PARAMETER(LibSVMIterParam); + +MXNET_REGISTER_IO_ITER(LibSVMIter) +.describe(R"code(Returns the LibSVM file iterator. This iterator is experimental and +should be used with care. + +The input data is similar to libsvm file format, except that the indices are expected to be +zero-based instead of one-based. Details of the libsvm format are available at +`https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/` + +In this function, the `data_shape` parameter is used to set the shape of each line of the data. +The dimension of both `data_shape` and `label_shape` are expected to be 1. + +When `label_libsvm` is set to ``NULL``, both data and label are read from the same file specified +by `data_libsvm`. 
Otherwise, data is read from `data_libsvm` and label from `label_libsvm`, +in this case, if `data_libsvm` contains label, it will ignored. + +The `LibSVMIter` only support `round_batch` parameter set to ``True`` for now. So, if `batch_size` +is 3 and there are 4 total rows in libsvm file, 2 more examples +are consumed at the first round. If `reset` function is called after first round, +the call is ignored and remaining examples are returned in the second round. + +If ``data_libsvm = 'data/'`` is set, then all the files in this directory will be read. + +Examples:: + + // Contents of libsvm file ``data.t``. + 1.0 0:0.5 2:1.2 + -2.0 + -3.0 0:0.6 1:2.4 2:1.2 + 4 2:-1.2 + + // Creates a `LibSVMIter` with `batch_size`=3. + LibSVMIter = mx.io.LibSVMIter(data_libsvm = 'data.t', data_shape = (3,), + batch_size = 3) + + // The first batch (data and label) + [[ 0.5 0. 1.2 ] + [ 0. 0. 0. ] + [ 0.6 2.4 1.2 ]] + + [ 1. -2. -3.] + + // The second batch (data and label) + [[ 0. 0. -1.2 ] + [ 0.5 0. 1.2 ] + [ 0. 0. 0. ]] + + [ 4. 1. -2.] + + // Contents of libsvm file ``label.t`` + 1.0 + -2.0 0:0.125 + -3.0 2:1.2 + 4 1:1.0 2:-1.2 + + // Creates a `LibSVMIter` with specified label file + LibSVMIter = mx.io.LibSVMIter(data_libsvm = 'data.t', data_shape = (3,), + label_libsvm = 'label.t', label_shape = (3,), batch_size = 3) + + // Two batches of data read from the above iterator are as follows(data and label): + // The first batch + [[ 0.5 0. 1.2 ] + [ 0. 0. 0. ] + [ 0.6 2.4 1.2 ]] + + [[ 0. 0. 0. ] + [ 0.125 0. 0. ] + [ 0. 0. 1.2 ]] + + // The second batch + [[ 0. 0. -1.2 ] + [ 0.5 0. 1.2 ] + [ 0. 0. 0. ]] + + [[ 0. 1. -1.2 ] + [ 0. 0. 0. ] + [ 0.125 0. 0. ]] + +)code" ADD_FILELINE) +.add_arguments(LibSVMIterParam::__FIELDS__()) +.add_arguments(BatchParam::__FIELDS__()) +.add_arguments(PrefetcherParam::__FIELDS__()) +.set_body([]() { + return new SparsePrefetcherIter( + new SparseBatchLoader( + new LibSVMIter())); + }); + +} // namespace io +} // namespace mxnet diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h index 89960c71a12f..a743b5132821 100644 --- a/src/io/iter_prefetcher.h +++ b/src/io/iter_prefetcher.h @@ -46,8 +46,7 @@ namespace io { class PrefetcherIter : public IIterator { public: explicit PrefetcherIter(IIterator* base) - : loader_(base), out_(nullptr) { - } + : loader_(base), out_(nullptr) {} ~PrefetcherIter() { while (recycle_queue_.size() != 0) { @@ -56,21 +55,24 @@ class PrefetcherIter : public IIterator { delete batch; } delete out_; - iter_.Destroy(); + iter.Destroy(); } - virtual void Init(const std::vector >& kwargs) { + void InitParams(const std::vector >& kwargs) { std::vector > kwargs_left; // init image rec param kwargs_left = param_.InitAllowUnknown(kwargs); - // use the kwarg to init batch loader - loader_->Init(kwargs); // maximum prefetch threaded iter internal size const int kMaxPrefetchBuffer = 16; // init thread iter - iter_.set_max_capacity(kMaxPrefetchBuffer); + iter.set_max_capacity(kMaxPrefetchBuffer); + } - iter_.Init([this](DataBatch **dptr) { + virtual void Init(const std::vector >& kwargs) { + InitParams(kwargs); + // use the kwarg to init batch loader + loader_->Init(kwargs); + iter.Init([this](DataBatch **dptr) { if (!loader_->Next()) return false; const TBlobBatch& batch = loader_->Value(); if (*dptr == nullptr) { @@ -109,7 +111,7 @@ class PrefetcherIter : public IIterator { } virtual void BeforeFirst(void) { - iter_.BeforeFirst(); + iter.BeforeFirst(); } virtual bool Next(void) { @@ -124,9 +126,9 @@ class PrefetcherIter : public IIterator { 
arr.WaitToWrite(); } recycle_queue_.pop(); - iter_.Recycle(&old_batch); + iter.Recycle(&old_batch); } - return iter_.Next(&out_); + return iter.Next(&out_); } virtual const DataBatch &Value(void) const { return *out_; @@ -135,16 +137,16 @@ class PrefetcherIter : public IIterator { protected: /*! \brief prefetcher parameters */ PrefetcherParam param_; - /*! \brief internal batch loader */ - std::unique_ptr > loader_; + /*! \brief backend thread */ + dmlc::ThreadedIter iter; private: + /*! \brief internal batch loader */ + std::unique_ptr > loader_; /*! \brief output data */ DataBatch *out_; /*! \brief queue to be recycled */ std::queue recycle_queue_; - /*! \brief backend thread */ - dmlc::ThreadedIter iter_; }; } // namespace io } // namespace mxnet diff --git a/src/io/iter_sparse.h b/src/io/iter_sparse.h new file mode 100644 index 000000000000..beaf5c682998 --- /dev/null +++ b/src/io/iter_sparse.h @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file iter_sparse.h + * \brief mxnet sparse data iterator + */ +#ifndef MXNET_IO_ITER_SPARSE_H_ +#define MXNET_IO_ITER_SPARSE_H_ + +#include +#include + +namespace mxnet { +/*! + * \brief iterator type + * \param DType data type + */ +template +class SparseIIterator : public IIterator { + public: + /*! \brief storage type of the data or label */ + virtual const NDArrayStorageType GetStorageType(bool is_data) const = 0; + /*! \brief shape of the data or label */ + virtual const TShape GetShape(bool is_data) const = 0; +}; // class SparseIIterator + +} // namespace mxnet +#endif // MXNET_IO_ITER_SPARSE_H_ diff --git a/src/io/iter_sparse_batchloader.h b/src/io/iter_sparse_batchloader.h new file mode 100644 index 000000000000..d5c9bd2f4578 --- /dev/null +++ b/src/io/iter_sparse_batchloader.h @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file iter_sparse_batchloader.h + * \brief define a batch adapter to create sparse tblob batch + */ +#ifndef MXNET_IO_ITER_SPARSE_BATCHLOADER_H_ +#define MXNET_IO_ITER_SPARSE_BATCHLOADER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "./inst_vector.h" +#include "./image_iter_common.h" +#include "./iter_batchloader.h" +#include "./iter_sparse.h" + +namespace mxnet { +namespace io { + +/*! \brief create a batch iterator from single instance iterator */ +class SparseBatchLoader : public BatchLoader, public SparseIIterator { + public: + explicit SparseBatchLoader(SparseIIterator *base): + BatchLoader(base), sparse_base_(base) { + } + + virtual ~SparseBatchLoader(void) {} + + inline void Init(const std::vector >& kwargs) { + BatchLoader::Init(kwargs); + data_stype_ = sparse_base_->GetStorageType(true); + label_stype_ = sparse_base_->GetStorageType(false); + if (param_.round_batch == 0) { + LOG(FATAL) << "sparse batch loader doesn't support round_batch == false yet"; + } + } + + virtual void BeforeFirst(void) { + BatchLoader::BeforeFirst(); + } + + virtual bool Next(void) { + out_.num_batch_padd = 0; + out_.batch_size = param_.batch_size; + this->head_ = 0; + // if overflown from previous round, directly return false, until before first is called + if (num_overflow_ != 0) return false; + index_t top = 0; + inst_cache_.clear(); + while (sparse_base_->Next()) { + inst_cache_.emplace_back(sparse_base_->Value()); + if (inst_cache_.size() >= param_.batch_size) break; + } + // no more data instance + if (inst_cache_.size() == 0) { + return false; + } + if (inst_cache_.size() < param_.batch_size) { + CHECK_GT(param_.round_batch, 0); + num_overflow_ = 0; + sparse_base_->BeforeFirst(); + for (; inst_cache_.size() < param_.batch_size; ++num_overflow_) { + CHECK(sparse_base_->Next()) << "number of input must be bigger than batch size"; + inst_cache_.emplace_back(sparse_base_->Value()); + } + } + out_.num_batch_padd = num_overflow_; + CHECK_EQ(inst_cache_.size(), param_.batch_size); + this->InitDataFromBatch(); + for (size_t j = 0; j < inst_cache_.size(); j++) { + const auto& d = inst_cache_[j]; + out_.inst_index[top] = d.index; + // TODO(haibin) double check the type? + int64_t unit_size = 0; + for (size_t i = 0; i < d.data.size(); ++i) { + // indptr tensor + if (IsIndPtr(i)) { + auto indptr = data_[i].get(); + if (j == 0) indptr[0] = 0; + indptr[j + 1] = indptr[j] + unit_size; + offsets_[i] = j; + } else { + // indices and values tensor + unit_size = d.data[i].shape_.Size(); + MSHADOW_TYPE_SWITCH(data_[i].type_flag_, DType, { + const auto begin = offsets_[i]; + const auto end = offsets_[i] + unit_size; + mshadow::Copy(data_[i].get().Slice(begin, end), + d.data[i].get_with_shape(mshadow::Shape1(unit_size))); + }); + offsets_[i] += unit_size; + } + } + } + return true; + } + + virtual const TBlobBatch &Value(void) const { + return BatchLoader::Value(); + } + + virtual const NDArrayStorageType GetStorageType(bool is_data) const { + return sparse_base_->GetStorageType(is_data); + } + + virtual const TShape GetShape(bool is_data) const { + TShape inst_shape = sparse_base_->GetShape(is_data); + std::vector shape_vec; + shape_vec.push_back(param_.batch_size); + for (index_t dim = 0; dim < inst_shape.ndim(); ++dim) { + shape_vec.push_back(inst_shape[dim]); + } + return TShape(shape_vec.begin(), shape_vec.end()); + } + + private: + /*! \brief base sparse iterator */ + SparseIIterator *sparse_base_; + /*! \brief data instances */ + std::vector inst_cache_; + /*! 
\brief data storage type */ + NDArrayStorageType data_stype_; + /*! \brief data label type */ + NDArrayStorageType label_stype_; + /*! \brief tensor offset for slicing */ + std::vector offsets_; + + // check whether ith position is the indptr tensor for a CSR tensor + inline bool IsIndPtr(size_t i) { + auto data_num_aux = num_aux_data(data_stype_); + auto label_num_aux = num_aux_data(label_stype_); + auto label_indptr_offset = data_num_aux + 1 + label_num_aux; + // data indptr + if (i == data_num_aux && data_stype_ == kCSRStorage) { + return true; + } + // label indptr + if (i == label_indptr_offset && label_stype_ == kCSRStorage && data_stype_ == kCSRStorage) { + return true; + } + return false; + } + + // initialize the data holder by using from the batch + inline void InitDataFromBatch() { + CHECK(data_stype_ == kCSRStorage || label_stype_ == kCSRStorage); + CHECK_GT(inst_cache_.size(), 0); + out_.data.clear(); + data_.clear(); + offsets_.clear(); + + size_t total_size = inst_cache_[0].data.size(); + data_.resize(total_size); + offsets_.resize(total_size, 0); + std::vector vec_sizes(total_size, 0); + // accumulate the memory required for a batch + for (size_t i = 0; i < total_size; ++i) { + size_t size = 0; + // vec_size for indptr + if (IsIndPtr(i)) { + size = param_.batch_size + 1; + } else { + for (const auto &d : inst_cache_) size += d.data[i].shape_.Size(); + } + vec_sizes[i] = size; + } + + CHECK_EQ(vec_sizes[0], vec_sizes[1]); + for (size_t i = 0; i < total_size; ++i) { + int src_type_flag = inst_cache_[0].data[i].type_flag_; + // init object attributes + TShape dst_shape(mshadow::Shape1(vec_sizes[i])); + data_[i].resize(mshadow::Shape1(vec_sizes[i]), src_type_flag); + CHECK(data_[i].dptr_ != nullptr); + out_.data.push_back(TBlob(data_[i].dptr_, dst_shape, cpu::kDevMask, src_type_flag)); + } + } +}; // class BatchLoader +} // namespace io +} // namespace mxnet +#endif // MXNET_IO_ITER_SPARSE_BATCHLOADER_H_ diff --git a/src/io/iter_sparse_prefetcher.h b/src/io/iter_sparse_prefetcher.h new file mode 100644 index 000000000000..3908f9bd3826 --- /dev/null +++ b/src/io/iter_sparse_prefetcher.h @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file iter_sparse_prefetcher.h + * \brief define a prefetcher using threaditer to keep k batch fetched + */ +#ifndef MXNET_IO_ITER_SPARSE_PREFETCHER_H_ +#define MXNET_IO_ITER_SPARSE_PREFETCHER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "./inst_vector.h" +#include "./image_iter_common.h" +#include "./iter_prefetcher.h" +#include "./iter_sparse.h" + +namespace mxnet { +namespace io { +// iterator on sparse data +class SparsePrefetcherIter : public PrefetcherIter { + public: + explicit SparsePrefetcherIter(SparseIIterator* base) + : PrefetcherIter(base), sparse_loader_(base) {} + + ~SparsePrefetcherIter() {} + + virtual void Init(const std::vector >& kwargs) { + PrefetcherIter::InitParams(kwargs); + // use the kwarg to init batch loader + sparse_loader_->Init(kwargs); + iter.Init([this](DataBatch **dptr) { + if (!sparse_loader_->Next()) return false; + const TBlobBatch& batch = sparse_loader_->Value(); + if (*dptr == nullptr) { + // allocate databatch + *dptr = new DataBatch(); + (*dptr)->num_batch_padd = batch.num_batch_padd; + // (*dptr)->data.at(0) => data + // (*dptr)->data.at(1) => label + (*dptr)->data.resize(2); + (*dptr)->index.resize(batch.batch_size); + size_t data_iter = 0; + for (size_t i = 0; i < (*dptr)->data.size(); ++i) { + bool is_data = i == 0; + auto stype = this->GetStorageType(is_data); + auto dtype = param_.dtype ? param_.dtype.value() : batch.data[data_iter].type_flag_; + if (stype == kDefaultStorage) { + (*dptr)->data.at(i) = NDArray(batch.data[data_iter].shape_, + Context::CPU(), false, dtype); + } else { + (*dptr)->data.at(i) = NDArray(stype, this->GetShape(is_data), + Context::CPU(), false, dtype); + } + data_iter += num_aux_data(stype) + 1; + } + } + // copy data over + size_t data_iter = 0; + for (size_t i = 0; i < (*dptr)->data.size(); ++i) { + auto& nd = ((*dptr)->data)[i]; + auto stype = nd.storage_type(); + auto& data_i = ((*dptr)->data)[i]; + if (stype == kDefaultStorage) { + CopyFromTo(data_i.data(), batch.data[data_iter]); + } else if (stype == kCSRStorage) { + auto& values = batch.data[data_iter]; + auto& indices = batch.data[data_iter + 1]; + auto& indptr = batch.data[data_iter + 2]; + // allocate memory + CHECK_EQ(indices.shape_.Size(), values.shape_.Size()); + nd.CheckAndAllocAuxData(csr::kIdx, indices.shape_); + nd.CheckAndAllocData(values.shape_); + nd.CheckAndAllocAuxData(csr::kIndPtr, indptr.shape_); + // copy values, indices and indptr + CopyFromTo(data_i.data(), values); + CopyFromTo(data_i.aux_data(csr::kIdx), indices); + CopyFromTo(data_i.aux_data(csr::kIndPtr), indptr); + } else { + LOG(FATAL) << "Storage type not implemented: " << stype; + } + data_iter += num_aux_data(stype) + 1; + (*dptr)->num_batch_padd = batch.num_batch_padd; + } + if (batch.inst_index) { + std::copy(batch.inst_index, + batch.inst_index + batch.batch_size, + (*dptr)->index.begin()); + } + return true; + }, + [this]() { sparse_loader_->BeforeFirst(); }); + } + + virtual void BeforeFirst(void) { + PrefetcherIter::BeforeFirst(); + } + + virtual bool Next(void) { + return PrefetcherIter::Next(); + } + virtual const DataBatch &Value(void) const { + return PrefetcherIter::Value(); + } + + virtual const NDArrayStorageType GetStorageType(bool is_data) const { + return sparse_loader_->GetStorageType(is_data); + } + + virtual const TShape GetShape(bool is_data) const { + return sparse_loader_->GetShape(is_data); + } + + private: + /*! 
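// --- Editor's illustration (not part of the patch): the three-array CSR layout
// (values, indices, indptr) that SparsePrefetcherIter copies into the output
// NDArray above via CheckAndAllocData/CheckAndAllocAuxData, and how one element
// is looked up in it. CsrMatrix/CsrAt are hypothetical STL-only names.
#include <cstddef>
#include <cstdint>
#include <vector>

struct CsrMatrix {
  std::vector<int64_t> indptr;   // length num_rows + 1
  std::vector<int64_t> indices;  // column id of each stored value
  std::vector<float>   values;   // the stored non-zero values
};

inline float CsrAt(const CsrMatrix& m, size_t row, int64_t col) {
  // the non-zeros of `row` live in values[indptr[row] .. indptr[row + 1])
  for (int64_t k = m.indptr[row]; k < m.indptr[row + 1]; ++k) {
    if (m.indices[k] == col) return m.values[k];
  }
  return 0.0f;  // column not stored => implicit zero
}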
\brief internal sparse batch loader */ + SparseIIterator* sparse_loader_; + + inline void CopyFromTo(TBlob dst, const TBlob src) { + MSHADOW_TYPE_SWITCH(src.type_flag_, DType, { + mshadow::Copy(dst.FlatTo1D(), src.FlatTo1D()); + }); + } +}; +} // namespace io +} // namespace mxnet +#endif // MXNET_IO_ITER_SPARSE_PREFETCHER_H_ diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index ade9c95feda7..cd0d3ab02825 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -21,13 +21,17 @@ */ #ifndef MXNET_KVSTORE_COMM_H_ #define MXNET_KVSTORE_COMM_H_ +#include #include #include #include #include #include #include +#include #include "mxnet/ndarray.h" +#include "../ndarray/ndarray_function.h" +#include "../operator/tensor/sparse_retain-inl.h" namespace mxnet { namespace kvstore { /** @@ -40,9 +44,10 @@ class Comm { } virtual ~Comm() { } /** - * \brief init key with the data shape + * \brief init key with the data shape and storage shape */ - virtual void Init(int key, const TShape& shape, int dtype = mshadow::kFloat32) = 0; + virtual void Init(int key, const NDArrayStorageType stype, + const TShape& shape, int dtype = mshadow::kFloat32) = 0; /** * \brief returns src[0] + .. + src[src.size()-1] */ @@ -55,6 +60,18 @@ class Comm { int key, const NDArray& src, const std::vector dst, int priority) = 0; + /** + * \brief broadcast src to dst[i] with target row_ids for every i + * \param dst a list of destination row_sparse NDArray and its target row_ids to broadcast, + where the row_ids are expected to be unique and sorted + * \param use_copy if set to true, directly copy src to dst[i] without looking up the + provided row_ids + */ + virtual void BroadcastRowSparse(int key, const NDArray& src, + const std::vector>& dst, + const bool use_copy, + const int priority) = 0; + /** * \brief return a pinned contex */ @@ -75,43 +92,85 @@ class CommCPU : public Comm { CommCPU() { nthread_reduction_ = dmlc::GetEnv("MXNET_KVSTORE_REDUCTION_NTHREADS", 4); bigarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_BIGARRAY_BOUND", 1000 * 1000); + // TODO(junwu) delete the following data member, now for benchmark only + is_serial_push_ = dmlc::GetEnv("MXNET_KVSTORE_SERIAL_PUSH", 0); } virtual ~CommCPU() { } - void Init(int key, const TShape& shape, int type = mshadow::kFloat32) override { - merge_buf_[key].merged = NDArray(shape, pinned_ctx_, false, type); + void Init(int key, const NDArrayStorageType stype, const TShape& shape, + int type = mshadow::kFloat32) override { + if (stype == kDefaultStorage) { + merge_buf_[key].merged = NDArray(shape, pinned_ctx_, false, type); + } else { + merge_buf_[key].merged = NDArray(stype, shape, pinned_ctx_, true, type); + } } const NDArray& Reduce(int key, const std::vector& src, int priority) override { + auto& buf = merge_buf_[key]; // avoid extra copy for single device, but it may bring problems for // abnormal usage of kvstore if (src.size() == 1) { - return src[0]; + if (src[0].storage_type() == kDefaultStorage) { + return src[0]; + } else { // if sparse and only one GPU, always update weight on CPU + CopyFromTo(src[0], &buf.merged, priority); + return buf.merged; + } } - std::vector const_vars(src.size() - 1); - std::vector reduce(src.size()); - auto& buf = merge_buf_[key]; - CopyFromTo(src[0], &buf.merged, priority); - reduce[0] = buf.merged; - if (buf.copy_buf.empty()) { - buf.copy_buf.resize(src.size()-1); - for (size_t j = 0; j < src.size() - 1; ++j) { - buf.copy_buf[j] = NDArray( - src[0].shape(), pinned_ctx_, false, src[0].dtype()); + if (buf.merged.storage_type() == 
kDefaultStorage) { + std::vector const_vars(src.size() - 1); + std::vector reduce(src.size()); + CopyFromTo(src[0], &buf.merged, priority); + reduce[0] = buf.merged; + + if (buf.copy_buf.empty()) { + buf.copy_buf.resize(src.size()-1); + for (size_t j = 0; j < src.size() - 1; ++j) { + // allocate NDArray basd on storage type + buf.copy_buf[j] = NDArray( + src[0].shape(), pinned_ctx_, false, src[0].dtype()); + } } - } - for (size_t i = 1; i < src.size(); ++i) { - CopyFromTo(src[i], &(buf.copy_buf[i-1]), priority); - reduce[i] = buf.copy_buf[i-1]; - const_vars[i-1] = reduce[i].var(); - } + for (size_t i = 1; i < src.size(); ++i) { + CopyFromTo(src[i], &(buf.copy_buf[i-1]), priority); + reduce[i] = buf.copy_buf[i-1]; + const_vars[i-1] = reduce[i].var(); + } + + Engine::Get()->PushSync([reduce, this](RunContext rctx) { + ReduceSumCPU(reduce); + }, Context::CPU(), const_vars, {reduce[0].var()}, + FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce")); - Engine::Get()->PushSync([reduce, this](RunContext rctx) { - ReduceSumCPU(reduce); - }, Context::CPU(), const_vars, {reduce[0].var()}, - FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce")); + } else { + // buf.merged is a sparse ndarray. + std::vector const_vars(src.size()); + std::vector reduce(src.size()); + + if (buf.copy_buf.empty()) { + buf.copy_buf.resize(src.size()); + for (size_t j = 0; j < src.size(); ++j) { + buf.copy_buf[j] = NDArray( + src[0].storage_type(), src[0].shape(), pinned_ctx_, true, src[0].dtype()); + } + } + for (size_t i = 0; i < src.size(); ++i) { + CopyFromTo(src[i], &(buf.copy_buf[i]), priority); + reduce[i] = buf.copy_buf[i]; + const_vars[i] = reduce[i].var(); + } + auto result = buf.merged; + Engine::Get()->PushSync([reduce, result, this](RunContext rctx) { + NDArray out = result; + is_serial_push_? + ReduceSumCPUExSerial(reduce, &out) + : mxnet::ndarray::ElementwiseSum(rctx.get_stream(), reduce, &out); + }, Context::CPU(), const_vars, {result.var()}, + FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce")); + } return buf.merged; } @@ -129,7 +188,113 @@ class CommCPU : public Comm { } } + void BroadcastRowSparse(int key, const NDArray& src, + const std::vector>& dst, + const bool use_copy, + const int priority) override { + using namespace mshadow; + CHECK_EQ(src.storage_type(), kRowSparseStorage) + << "BroadcastRowSparse expects row-sparse src NDArray"; + CHECK_EQ(src.ctx().dev_mask(), Context::kCPU) + << "BroadcastRowSparse with src on gpu context not supported"; + for (size_t i = 0; i < dst.size(); ++i) { + NDArray* out = dst[i].first; + NDArray row_id = dst[i].second; + if (use_copy) { + CopyFromTo(src, out, priority); + } else { + CHECK_EQ(out->storage_type(), kRowSparseStorage) + << "BroadcastRowSparse expects row_sparse dst NDArray"; + CHECK_EQ(row_id.ctx().dev_mask(), Context::kCPU) + << "BroadcastRowSparse with row_indices on gpu context not supported"; + // retain according to unique indices + const bool use_sparse_retain = (src.shape()[0] != src.storage_shape()[0]) + || (row_id.dtype() != out->aux_type(rowsparse::kIdx)) + || (out->ctx().dev_mask() != Context::kGPU); + if (use_sparse_retain) { // use sparse_retain op + const bool is_to_gpu = out->ctx().dev_mask() == Context::kGPU; + NDArray out_cpu = is_to_gpu? 
NDArray(kRowSparseStorage, src.shape(), + src.ctx(), true, src.dtype(), src.aux_types()) : *out; + Engine::Get()->PushSync([=](RunContext rctx) { + const TBlob& indices = row_id.data(); + NDArray temp = out_cpu; // get rid of const qualifier + op::SparseRetainOpForwardRspImpl(rctx.get_stream(), + src, indices, kWriteTo, + &temp); + }, Context::CPU(), {src.var(), row_id.var()}, {out_cpu.var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreSparseRetain")); + if (is_to_gpu) { + CopyFromTo(out_cpu, out, priority); + } + } else { // direct copy rows + Engine::Get()->PushSync([=](RunContext rctx) { + CopyRetainedRowsToGPU(rctx.get_stream(), rctx.get_stream(), + src, row_id, out); + }, out->ctx(), {src.var(), row_id.var()}, {out->var()}, + FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("KVStoreCopyRetainedRowsToGPU")); + } + } + } + } + private: + /*! + * \brief When src is a rsp with full rows, + * simply copy retained rows directly from cpu to gpu + * without invoking sparse_retain op. + */ + void CopyRetainedRowsToGPU(mshadow::Stream* cpu_stream, + mshadow::Stream* gpu_stream, + const NDArray& src, + const NDArray& indices, + NDArray* dst) { +#if MXNET_USE_CUDA == 1 + CHECK_EQ(src.storage_type(), kRowSparseStorage) + << "CopyRetainedRowsToGPU expects row-sparse src NDArray"; + CHECK_EQ(src.ctx().dev_mask(), Context::kCPU) + << "CopyRetainedRowsToGPU with src on gpu context not supported"; + CHECK_EQ(src.storage_shape()[0], src.shape()[0]) + << "CopyRetainedRowsToGPU only supports src rsp with full rows"; + CHECK_EQ(indices.storage_type(), kDefaultStorage); + CHECK_EQ(indices.ctx().dev_mask(), Context::kCPU); + CHECK_EQ(dst->storage_type(), kRowSparseStorage); + CHECK_EQ(dst->ctx().dev_mask(), Context::kGPU); + CHECK_EQ(indices.dtype(), dst->aux_type(rowsparse::kIdx)) + << "CopyRetainedRowsToGPU only supports same data type for idx array and dst aux_data(0)"; + if (!src.storage_initialized() || indices.data().Size() == 0U) { + op::FillZerosRspImpl(gpu_stream, dst); + return; + } + using namespace mshadow; + + const TBlob& src_data = src.data(); + const TBlob& idx_data = indices.data(); + const size_t row_length = src.shape().ProdShape(1, src.shape().ndim()); + const size_t num_rows_retained = idx_data.Size(); + dst->CheckAndAlloc({Shape1(num_rows_retained)}); + TBlob dst_data = dst->data(); + TBlob dst_idx_data = dst->aux_data(rowsparse::kIdx); + MSHADOW_TYPE_SWITCH(src.dtype(), DType, { + MSHADOW_IDX_TYPE_SWITCH(indices.dtype(), IType, { + // copy idx array + Tensor dst_idx_tensor = dst_idx_data.FlatTo1D(gpu_stream); + const Tensor idx_tensor = idx_data.FlatTo1D(cpu_stream); + Copy(dst_idx_tensor, idx_tensor, gpu_stream); + // copy src data + const Tensor src_data_tensor = src_data.get_with_shape( + Shape2(src_data.shape_[0], row_length), cpu_stream); + Tensor dst_data_tensor = dst_data.get_with_shape( + Shape2(dst_data.shape_[0], row_length), gpu_stream); + for (size_t i = 0; i < num_rows_retained; ++i) { + Copy(dst_data_tensor[i], src_data_tensor[idx_tensor[i]], gpu_stream); + } + }) + }) +#else + LOG(FATAL) << "GPU not enabled"; +#endif + } + // reduce sum into val[0] inline void ReduceSumCPU(const std::vector &in_data) { MSHADOW_TYPE_SWITCH(in_data[0].dtype(), DType, { @@ -144,6 +309,78 @@ class CommCPU : public Comm { }); } + // serial implementation of reduce sum for row sparse NDArray. 
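// --- Editor's illustration (not part of the patch): the idea behind the serial
// row-sparse reduce-sum defined just below — take the union of the inputs'
// sorted, deduplicated row ids, then accumulate matching rows into the output.
// Rsp/ReduceSumRsp are hypothetical STL-only stand-ins for the mshadow-based code,
// shown here for two inputs only.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <vector>

struct Rsp {
  std::vector<int64_t> row_ids;          // sorted, unique
  std::vector<std::vector<float>> rows;  // rows[i] belongs to row_ids[i]
};

inline Rsp ReduceSumRsp(const Rsp& a, const Rsp& b, size_t row_len) {
  Rsp out;
  // 1) union of row ids (both inputs are sorted and deduplicated)
  std::set_union(a.row_ids.begin(), a.row_ids.end(),
                 b.row_ids.begin(), b.row_ids.end(),
                 std::back_inserter(out.row_ids));
  out.rows.assign(out.row_ids.size(), std::vector<float>(row_len, 0.0f));
  // 2) accumulate each input's rows at the matching output slot
  auto add_input = [&](const Rsp& in) {
    size_t cursor = 0;                   // advances monotonically: ids are sorted
    for (size_t i = 0; i < out.row_ids.size(); ++i) {
      if (cursor < in.row_ids.size() && in.row_ids[cursor] == out.row_ids[i]) {
        for (size_t j = 0; j < row_len; ++j) out.rows[i][j] += in.rows[cursor][j];
        ++cursor;
      }
    }
  };
  add_input(a);
  add_input(b);
  return out;
}
// The real implementation walks N inputs with one cursor each, so the merge stays
// linear in the total number of stored rows because every id list is pre-sorted.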
+ inline void ReduceSumCPUExSerial(const std::vector &in, NDArray *out) { + using namespace rowsparse; + using namespace mshadow; + auto stype = out->storage_type(); + CHECK_EQ(stype, kRowSparseStorage) << "Unexpected storage type " << stype; + size_t total_num_rows = 0; + size_t num_in = in.size(); + // skip the ones with empty indices and values + std::vector skip(num_in, false); + // the values tensor of the inputs + MSHADOW_TYPE_SWITCH(out->dtype(), DType, { + MSHADOW_IDX_TYPE_SWITCH(out->aux_type(kIdx), IType, { + std::vector> in_vals(num_in); + std::vector> in_indices(num_in); + // offset to the values tensor of all inputs + std::vector offsets(num_in, 0); + std::vector num_rows(num_in, 0); + for (size_t i = 0; i < num_in; i++) { + if (!in[i].storage_initialized()) { + skip[i] = true; + continue; + } + auto size = in[i].aux_shape(kIdx).Size(); + num_rows[i] = size; + total_num_rows += size; + in_vals[i] = in[i].data().FlatTo2D(); + in_indices[i] = in[i].aux_data(kIdx).FlatTo1D(); + } + std::vector indices; + indices.reserve(total_num_rows); + // gather indices from all inputs + for (size_t i = 0; i < num_in; i++) { + for (size_t j = 0; j < num_rows[i]; j++) { + indices.emplace_back(in_indices[i][j]); + } + } + CHECK_EQ(indices.size(), total_num_rows); + // dedup indices + std::sort(indices.begin(), indices.end()); + indices.resize(std::unique(indices.begin(), indices.end()) - indices.begin()); + // the one left are unique non-zero rows + size_t nnr = indices.size(); + // allocate memory for output + out->CheckAndAlloc({Shape1(nnr)}); + auto idx_data = out->aux_data(kIdx).FlatTo1D(); + auto val_data = out->data().FlatTo2D(); + + for (size_t i = 0; i < nnr; i++) { + // copy indices back + idx_data[i] = indices[i]; + bool zeros = true; + for (size_t j = 0; j < num_in; j++) { + if (skip[j]) continue; + size_t offset = offsets[j]; + if (offset < num_rows[j]) { + if (indices[i] == in_indices[j][offset]) { + if (zeros) { + Copy(val_data[i], in_vals[j][offset], nullptr); + zeros = false; + } else { + val_data[i] += in_vals[j][offset]; + } + offsets[j] += 1; + } + } + } + } + }); + }); + } + template inline static void ReduceSumCPU( const std::vector &dptr, size_t offset, index_t size) { @@ -209,6 +446,7 @@ class CommCPU : public Comm { std::unordered_map merge_buf_; size_t bigarray_bound_; int nthread_reduction_; + bool is_serial_push_; }; /** @@ -227,8 +465,13 @@ class CommDevice : public Comm { virtual ~CommDevice() { } - void Init(int key, const TShape& shape, int dtype = mshadow::kFloat32) override { - sorted_key_attrs_.push_back(std::make_tuple(key, shape, dtype)); + void Init(int key, const NDArrayStorageType stype, const TShape& shape, + int dtype = mshadow::kFloat32) override { + if (stype == kDefaultStorage) { + sorted_key_attrs_.push_back(std::make_tuple(key, shape, dtype)); + } else { + LOG(FATAL) << "storage type " << stype << " not implemented for device yet"; + } } const NDArray& Reduce(int key, const std::vector& src, @@ -296,6 +539,13 @@ class CommDevice : public Comm { } } + void BroadcastRowSparse(int key, const NDArray& src, + const std::vector>& dst, + const bool use_copy, + const int priority) override { + LOG(FATAL) << "Not implemented yet"; + } + private: void EnableP2P(const std::vector& devs) { #if MXNET_USE_CUDA diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 52c7c132cb5c..e8efdc14fdc0 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -25,6 +25,8 @@ #define MXNET_KVSTORE_KVSTORE_DIST_H_ #include #include +#include 
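// --- Editor's illustration (not part of the patch): the row-retention step that
// CommCPU::BroadcastRowSparse above performs, either through the sparse_retain op
// or the CopyRetainedRowsToGPU fast path; the dist kvstore below relies on it when
// broadcasting pulled row-sparse weights. RspMatrix/RetainRows are hypothetical
// STL-only stand-ins, and src is assumed to store all rows, as the fast path requires.
#include <cstddef>
#include <cstdint>
#include <vector>

struct RspMatrix {                      // row-sparse: ids of stored rows + their data
  std::vector<int64_t> row_ids;         // sorted, unique
  std::vector<std::vector<float>> rows; // one dense row per stored id
};

inline RspMatrix RetainRows(const RspMatrix& src, const std::vector<int64_t>& wanted) {
  RspMatrix dst;
  dst.row_ids = wanted;                 // the requested ids become the output's kIdx
  dst.rows.reserve(wanted.size());
  for (int64_t rid : wanted) {
    dst.rows.push_back(src.rows[static_cast<size_t>(rid)]);  // gather row rid
  }
  return dst;
}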
+#include #include "./kvstore_local.h" #include "mxnet/engine.h" #include "ps/ps.h" @@ -60,6 +62,7 @@ class KVStoreDist : public KVStoreLocal { } } bigarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_BIGARRAY_BOUND", 1000 * 1000); + log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); } virtual ~KVStoreDist() { @@ -81,7 +84,7 @@ class KVStoreDist : public KVStoreLocal { const std::vector& values) override { CheckUnique(keys); for (size_t i = 0; i < keys.size(); ++i) { - comm_->Init(keys[i], values[i].shape(), values[i].dtype()); + comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } if (get_rank() == 0) { Push_(keys, values, 0, false); @@ -115,17 +118,19 @@ class KVStoreDist : public KVStoreLocal { // use the same array for merging to guarantee that pull always happens // after the previous push on this key auto& recv_buf = comm_buf_[key]; + const auto storage_type = grouped_vals[i][0]->storage_type(); + CHECK_EQ(storage_type, kDefaultStorage) + << "Expected stype of value to be kDefaultStorage"; if (recv_buf.is_none()) { // it may happen for the first time a no-rank-0 worker pull the weight. - recv_buf = NDArray( - grouped_vals[i][0]->shape(), pinned_ctx_, false, grouped_vals[i][0]->dtype()); + recv_buf = NDArray(grouped_vals[i][0]->shape(), pinned_ctx_, + true, grouped_vals[i][0]->dtype()); } #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(recv_buf.data()); #endif real_t* data = static_cast(recv_buf.data().dptr_); size_t size = recv_buf.shape().Size(); - auto pull_from_servers = [this, key, data, size]( RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys @@ -134,7 +139,7 @@ class KVStoreDist : public KVStoreLocal { // issue pull, false means no delete auto vals = new ps::SArray(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPull( - pskv.keys, vals, &pskv.lens, 0, [vals, cb](){ delete vals; cb(); }); + pskv.keys, vals, &pskv.lens, kDefaultPushPull, [vals, cb](){ delete vals; cb(); }); }; CHECK_NOTNULL(Engine::Get())->PushAsync( @@ -144,12 +149,55 @@ class KVStoreDist : public KVStoreLocal { {recv_buf.var()}, FnProperty::kNormal, priority, - PROFILER_MESSAGE("KVStoreDistPull")); + PROFILER_MESSAGE("KVStoreDistDefaultPull")); comm_->Broadcast(key, recv_buf, grouped_vals[i], priority); } } + void PullRowSparse(const std::vector& keys, + const std::vector>& val_rowids, + const int priority = 0) { + std::vector uniq_keys; + std::vector>> grouped_val_rowids; + GroupKVPairs(keys, val_rowids, &uniq_keys, &grouped_val_rowids); + + for (size_t i = 0; i < uniq_keys.size(); ++i) { + int key = uniq_keys[i]; + // use the same array for merging to guarantee that pull always happens + // after the previous push on this key + auto& recv_buf = comm_buf_[key]; + auto& grouped_val_rowid = grouped_val_rowids[i]; + const auto storage_type = grouped_val_rowid[0].first->storage_type(); + CHECK_EQ(storage_type, kRowSparseStorage) + << "expected kRowSparseStorage, but got " << storage_type; + if (recv_buf.is_none()) { + // it may happen for the first time a no-rank-0 worker pull the weight. 
+ recv_buf = NDArray(storage_type, grouped_val_rowid[0].first->shape(), + pinned_ctx_, true, grouped_val_rowid[0].first->dtype()); + } + auto &target_val_rowids = grouped_val_rowids[i]; + const size_t num_vals = target_val_rowids.size(); + size_t num_rows = 0; + // TODO(haibin) refactor this for loop + for (size_t i = 0; i < num_vals; i++) { + auto &row_id = target_val_rowids[i].second; + NDArray indices = row_id.Copy(pinned_ctx_); + Unique(&indices, priority); + target_val_rowids[i].second = indices; + num_rows += indices.shape().Size(); + } + if (num_vals > 1) { + // TODO(haibin) aggregate over all unique indices + LOG(FATAL) << "RowSparsePull with multiple values is not implemented yet"; + } else { + auto& indices = target_val_rowids[0].second; + PullRowSparse_(key, &recv_buf, indices, priority); + comm_->BroadcastRowSparse(key, recv_buf, grouped_val_rowid, num_vals == 1, priority); + } + } + } + void set_updater(const Updater& updater) override { CHECK(updater) << "invalid updater"; if (IsServerNode()) { @@ -222,41 +270,130 @@ class KVStoreDist : public KVStoreLocal { NDArray merged = do_merge ? comm_->Reduce(key, vals, priority) : vals[0]; auto& send_buf = comm_buf_[key]; + const auto storage_type = merged.storage_type(); if (merged.ctx().dev_mask() == cpu::kDevMask) { + // make sure the previous push/pull is completed + send_buf.WaitToWrite(); send_buf = merged; // avoid memory copy } else { if (send_buf.is_none()) { - send_buf = NDArray(merged.shape(), pinned_ctx_, false, merged.dtype()); + if (storage_type == kDefaultStorage) { + send_buf = NDArray(merged.shape(), pinned_ctx_, false, merged.dtype()); + } else { + send_buf = NDArray(storage_type, merged.shape(), pinned_ctx_, true, merged.dtype()); + } } CopyFromTo(merged, &send_buf); } // push to servers - send_buf.WaitToRead(); - size_t size = send_buf.shape().Size(); + if (storage_type == kDefaultStorage) { + send_buf.WaitToRead(); + size_t size = send_buf.shape().Size(); +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(send_buf.data()); +#endif + real_t* data = static_cast(send_buf.data().dptr_); + auto push_to_servers = + [this, key, data, size](RunContext rctx, Engine::CallbackOnComplete cb) { + // convert to ps keys + PSKV& pskv = EncodeKey(key, size); + // do push. 
false means no delete + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush( + pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); + }; + Engine::Get()->PushAsync( + push_to_servers, + pinned_ctx_, + {send_buf.var()}, + {}, + FnProperty::kNormal, + priority, + PROFILER_MESSAGE("KVStoreDistDefaultPush")); + } else if (storage_type == kRowSparseStorage) { + PushRowSparse(key, send_buf, priority); + } else { + LOG(FATAL) << "unknown storage type"; + } + } + } + + // pull row sparse weight into `recv_buf` based on indices given by `indices` + void PullRowSparse_(int key, NDArray *recv_buf, const NDArray& indices, int priority) { + using namespace rowsparse; + auto pull_from_servers = [this, key, recv_buf, indices] + (RunContext rctx, Engine::CallbackOnComplete cb) { + // allocate memory for the buffer + size_t num_rows = indices.shape().Size(); + recv_buf->CheckAndAlloc({mshadow::Shape1(num_rows)}); +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(recv_buf->data()); +#endif + real_t* data = static_cast(recv_buf->data().dptr_); + auto indices_data = indices.data(); + const auto offsets = indices_data.dptr(); + const auto unit_len = recv_buf->shape().ProdShape(1, recv_buf->shape().ndim()); + const int64_t size = num_rows * unit_len; + // convert to ps keys in row sparse format + PSKV& pskv = EncodeRowSparseKey(key, size, num_rows, offsets, + unit_len, recv_buf->shape()[0]); + if (this->log_verbose_) { + LOG(INFO) << "worker " << get_rank() << " pull lens: " << pskv.lens << " keys: " + << pskv.keys << " size: " << size; + } + auto vals = new ps::SArray(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPull(pskv.keys, vals, &pskv.lens, kRowSparsePushPull, + [vals, cb]() { delete vals; cb(); }); + // copy indices to recv_buf + mshadow::Copy(recv_buf->aux_data(kIdx).FlatTo1D(), + indices_data.FlatTo1D()); + }; + CHECK_NOTNULL(Engine::Get())->PushAsync( + pull_from_servers, + pinned_ctx_, + {indices.var()}, + {recv_buf->var()}, + FnProperty::kNormal, + priority, + PROFILER_MESSAGE("KVStoreDistRowSparsePull")); + } + + // push row sparse gradient + void PushRowSparse(int key, const NDArray &send_buf, int priority) { + using namespace rowsparse; + auto push_to_servers = [this, key, &send_buf] + (RunContext rctx, Engine::CallbackOnComplete cb) { #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(send_buf.data()); #endif real_t* data = static_cast(send_buf.data().dptr_); - auto push_to_servers = - [this, key, data, size](RunContext rctx, Engine::CallbackOnComplete cb) { - // convert to ps keys - PSKV& pskv = EncodeKey(key, size); - - // do push. false means no delete - ps::SArray vals(data, size, false); - CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); - }; - Engine::Get()->PushAsync( - push_to_servers, - pinned_ctx_, - {send_buf.var()}, - {}, - FnProperty::kNormal, - priority, - PROFILER_MESSAGE("KVStoreDistPush")); - } + bool init = send_buf.storage_initialized(); + const int64_t num_rows = init ? send_buf.aux_shape(kIdx)[0] : 0; + const auto offsets = init ? 
send_buf.aux_data(kIdx).dptr() : nullptr; + const auto unit_len = send_buf.shape().ProdShape(1, send_buf.shape().ndim()); + const int64_t size = num_rows * unit_len; + + // convert to ps keys in row sparse format + PSKV& pskv = EncodeRowSparseKey(key, size, num_rows, offsets, + unit_len, send_buf.shape()[0]); + if (this->log_verbose_) { + LOG(INFO) << "worker " << get_rank() << " push lens: " << pskv.lens << " keys: " + << pskv.keys << " size: " << size; + } + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush(pskv.keys, vals, pskv.lens, kRowSparsePushPull, [cb]() { + cb(); + }); + }; + Engine::Get()->PushAsync( + push_to_servers, + pinned_ctx_, + {send_buf.var()}, + {}, + FnProperty::kNormal, + priority, + PROFILER_MESSAGE("KVStoreDistRowSparsePush")); } /** @@ -284,7 +421,7 @@ class KVStoreDist : public KVStoreLocal { std::unordered_map ps_kv_; /** - * \brief serizelize EncodeKey + * \brief serizelize EncodeRowSparseKey and EncodeKey */ std::mutex mu_; @@ -331,6 +468,64 @@ class KVStoreDist : public KVStoreLocal { return pskv; } + // TODO(haibin) this encoding method for row sparse keys doesn't allow cross-layer batching + inline PSKV& EncodeRowSparseKey(const int key, const int64_t size, const int64_t num_rows, + const int64_t *offsets, const size_t unit_len, + const int64_t total_num_rows) { + using namespace common; + mu_.lock(); + PSKV& pskv = ps_kv_[key]; + mu_.unlock(); + pskv.keys.clear(); + pskv.lens.clear(); + // TODO(haibin) cache this information + auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); + int num_servers = krs.size(); + CHECK_GT(num_servers, 0); + + if (total_num_rows * unit_len >= bigarray_bound_) { + pskv.size = 0; + int64_t start_row = 0; + // parition it to all servers + for (int i = 0; i < num_servers; ++i) { + // calculate partition ranges + int64_t part_num_rows = + llround(static_cast(total_num_rows) / num_servers * (i + 1)) - + llround(static_cast(total_num_rows) / num_servers * i); + auto end_row = start_row + part_num_rows; + auto lb = std::lower_bound(offsets, offsets + num_rows, start_row); + auto ub = std::upper_bound(offsets, offsets + num_rows, end_row - 1); + ps::Key master_key = krs[i].begin() + key; + pskv.keys.push_back(master_key); + pskv.lens.push_back(0); + for (auto offset = lb; offset < ub; offset++) { + ps::Key ps_key = krs[i].begin() + key + (*offset - start_row); + CHECK_LT(ps_key, krs[i].end()); + pskv.keys.push_back(ps_key); + pskv.lens.push_back(unit_len); + pskv.size += unit_len; + } + start_row = end_row; + } + CHECK_EQ(static_cast(pskv.size), size); + } else { + // send it to a single random picked server + int server = (key * 9973) % num_servers; + ps::Key master_key = krs[server].begin() + key; + pskv.keys.push_back(master_key); + pskv.lens.push_back(0); + for (int64_t i = 0; i < num_rows; i++) { + ps::Key ps_key = krs[server].begin() + key + offsets[i]; + CHECK_LT(ps_key, krs[server].end()); + pskv.keys.push_back(ps_key); + pskv.lens.push_back(unit_len); + } + pskv.size = size; + } + return pskv; + } + + /** * \brief for worker to push and pull data */ @@ -345,6 +540,7 @@ class KVStoreDist : public KVStoreLocal { size_t bigarray_bound_; /// \brief send & recver buffer std::unordered_map comm_buf_; + bool log_verbose_; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 4e9f887173c5..2ad90ae15d10 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -33,10 +33,13 @@ #include #include "ps/ps.h" #include 
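// --- Editor's illustration (not part of the patch): how EncodeRowSparseKey above
// splits the stored row ids across the servers for large arrays — each server owns
// an equal slice of the full row range, and lower_bound/upper_bound pick out the
// stored rows that fall into that slice. PartitionRows is a hypothetical STL-only
// helper; the real code additionally emits a ps::Key and length per retained row.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// offsets: sorted ids of the rows actually stored; total_num_rows: full row count.
inline std::vector<std::vector<int64_t>> PartitionRows(
    const std::vector<int64_t>& offsets, int64_t total_num_rows, int num_servers) {
  std::vector<std::vector<int64_t>> per_server(num_servers);
  int64_t start_row = 0;
  for (int i = 0; i < num_servers; ++i) {
    // same rounding as the patch: server i owns rows [start_row, end_row)
    int64_t end_row =
        std::llround(static_cast<double>(total_num_rows) / num_servers * (i + 1));
    auto lb = std::lower_bound(offsets.begin(), offsets.end(), start_row);
    auto ub = std::upper_bound(offsets.begin(), offsets.end(), end_row - 1);
    per_server[i].assign(lb, ub);       // stored rows belonging to this server
    start_row = end_row;
  }
  return per_server;
}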
"mxnet/kvstore.h" +#include "../operator/tensor/elemwise_binary_op.h" namespace mxnet { namespace kvstore { +static const int kRowSparsePushPull = 1; +static const int kDefaultPushPull = 0; static const int kStopServer = -1; static const int kSyncMode = -2; @@ -110,8 +113,9 @@ class KVStoreDistServer { static_cast(ps_server_)->set_request_handle( std::bind(&KVStoreDistServer::CommandHandle, this, _1, _2)); ps_server_->set_request_handle( - std::bind(&KVStoreDistServer::DataHandle, this, _1, _2, _3)); + std::bind(&KVStoreDistServer::DataHandleEx, this, _1, _2, _3)); sync_mode_ = false; + log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); } ~KVStoreDistServer() { @@ -136,6 +140,11 @@ class KVStoreDistServer { } private: + struct MergeBuf { + std::vector request; + NDArray array; + }; + void CommandHandle(const ps::SimpleData& recved, ps::SimpleApp* app) { if (recved.head == kStopServer) { exec_.Stop(); @@ -151,9 +160,200 @@ class KVStoreDistServer { app->Response(recved); } - void DataHandle(const ps::KVMeta& req_meta, - const ps::KVPairs& req_data, - ps::KVServer* server) { + void DataHandleEx(const ps::KVMeta& req_meta, + const ps::KVPairs& req_data, + ps::KVServer* server) { + if (req_meta.cmd == kRowSparsePushPull) { + DataHandleRowSparse(req_meta, req_data, server); + } else { + DataHandleDefault(req_meta, req_data, server); + } + return; + } + + inline void ApplyUpdates(const int key, MergeBuf *merged, NDArray *stored, + ps::KVServer* server) { + if (merged->request.size() == (size_t) ps::NumWorkers()) { + // let the main thread to execute updater_, which is necessary for python + if (updater_) { + exec_.Exec([this, key, merged, stored](){ + CHECK(updater_); + updater_(key, merged->array, stored); + }); + } else { + // if no updater, just copy + CopyFromTo(merged->array, stored); + } + if (log_verbose_) { + LOG(INFO) << "sync response to " << merged->request.size() << " workers"; + } + for (const auto& req : merged->request) { + server->Response(req); + } + merged->request.clear(); + stored->WaitToRead(); + } else { + merged->array.WaitToRead(); + } + } + + void DecodeRowIds(const ps::SArray &keys, int64_t *indices, + const int64_t master_key, const int64_t num_rows) { + indices[0] = 0; + for (int64_t i = 1; i <= num_rows; i++) { + int key = DecodeKey(keys[i]); + auto row_id = key - master_key; + indices[i - 1] = row_id; + } + } + + void DataHandleRowSparse(const ps::KVMeta& req_meta, + const ps::KVPairs& req_data, + ps::KVServer* server) { + int master_key = DecodeKey(req_data.keys[0]); + auto num_rows = req_data.keys.size() - 1; + auto& stored = store_[master_key]; + if (req_meta.push) { + CHECK_GT(req_data.lens.size(), 0) << "req_data.lens cannot be empty"; + CHECK_EQ(req_data.lens[0], 0); + real_t* data = req_data.vals.data(); + if (stored.is_none()) { + if (log_verbose_) LOG(INFO) << "initial push: " << master_key; + // initialization + CHECK_GT(num_rows, 0) << "init with empty data is not supported"; + auto unit_len = req_data.lens[1]; + CHECK_GT(unit_len, 0); + size_t ds[] = {num_rows, (size_t) unit_len}; + TShape dshape(ds, ds + 2); + CHECK_EQ(req_data.vals.size(), num_rows * unit_len); + TBlob recv_blob(data, dshape, cpu::kDevMask); // NOLINT(*) + NDArray recved = NDArray(recv_blob, 0); + // TODO(haibin) temporarily initialized as dense NDArray. We need inplace operator + // support for rowsparse ndarrays. 
And after that `stored` should be initialized as + // RowSparse NDArray + stored = NDArray(kRowSparseStorage, dshape, Context()); + CopyFromTo(recved, &stored, 0); + stored.WaitToRead(); + server->Response(req_meta); + return; + } + // synced push + if (sync_mode_) { + if (log_verbose_) LOG(INFO) << "sync push: " << master_key << " " << req_data.keys; + auto& merged = merge_buf_[master_key]; + if (merged.array.is_none()) { + merged.array = NDArray(kRowSparseStorage, stored.shape(), Context()); + } + if (num_rows == 0) { + // reset to zeros + if (merged.request.size() == 0) { + merged.array = NDArray(kRowSparseStorage, stored.shape(), Context()); + } else { + // nothing to aggregate + } + merged.request.push_back(req_meta); + ApplyUpdates(master_key, &merged, &stored, server); + return; + } + auto unit_len = req_data.lens[1]; + CHECK_GT(unit_len, 0); + // indices + std::vector indices(num_rows); + DecodeRowIds(req_data.keys, indices.data(), master_key, num_rows); + // data + TBlob idx_blob(indices.data(), mshadow::Shape1(num_rows), cpu::kDevMask); + size_t ds[] = {(size_t) num_rows, (size_t) unit_len}; + TShape dshape(ds, ds + 2); + TBlob recv_blob(data, dshape, cpu::kDevMask); // NOLINT(*) + // row_sparse NDArray + NDArray recved(kRowSparseStorage, stored.shape(), recv_blob, {idx_blob}, 0); + + if (merged.request.size() == 0) { + CopyFromTo(recved, &merged.array, 0); + } else { + NDArray out(kRowSparseStorage, stored.shape(), Context()); + std::vector const_vars; + const_vars.push_back(recved.var()); + const_vars.push_back(merged.array.var()); + // accumulate row_sparse gradients + // TODO(haibin) override + operator for row_sparse NDArray + // instead of calling BinaryComputeRspRsp directly + using namespace mshadow; + Engine::Get()->PushSync([recved, merged, out](RunContext ctx) { + std::vector inputs, outputs; + inputs.push_back(recved); + inputs.push_back(merged.array); + outputs.push_back(out); + op::BinaryComputeRspRspImpl({}, {}, inputs, {kWriteTo}, outputs); + }, recved.ctx(), const_vars, {out.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + CopyFromTo(out, &merged.array, 0); + } + merged.request.push_back(req_meta); + ApplyUpdates(master_key, &merged, &stored, server); + } else { + // async push + if (log_verbose_) LOG(INFO) << "async push: " << master_key; + if (num_rows == 0) { + server->Response(req_meta); + return; + } + auto unit_len = req_data.lens[1]; + CHECK_GT(unit_len, 0); + // indices + std::vector indices(num_rows); + DecodeRowIds(req_data.keys, indices.data(), master_key, num_rows); + TBlob idx_blob(indices.data(), mshadow::Shape1(num_rows), cpu::kDevMask); + size_t ds[] = {(size_t) num_rows, (size_t) unit_len}; + TShape dshape(ds, ds + 2); + TBlob recv_blob(data, dshape, cpu::kDevMask); // NOLINT(*) + NDArray recved(kRowSparseStorage, stored.shape(), recv_blob, {idx_blob}, 0); + exec_.Exec([this, master_key, &recved, &stored](){ + CHECK(updater_); + updater_(master_key, recved, &stored); + }); + server->Response(req_meta); + stored.WaitToRead(); + } + } else { + // pull + if (log_verbose_) LOG(INFO) << "pull: " << master_key; + ps::KVPairs response; + if (num_rows == 0) { + std::vector lens(req_data.keys.size(), 0); + response.keys = req_data.keys; + response.lens.CopyFrom(lens.begin(), lens.end()); + server->Response(req_meta, response); + return; + } + CHECK(!stored.is_none()) << "init " << master_key << " first"; + auto shape = stored.shape(); + auto unit_len = shape.ProdShape(1, shape.ndim()); + const float* data = stored.data().dptr(); + auto len 
= unit_len * num_rows; + // concat values + response.vals.resize(len); + for (size_t i = 1; i <= num_rows; i++) { + int key = DecodeKey(req_data.keys[i]); + int64_t row_id = key - master_key; + const auto src = data + row_id * unit_len; + auto begin = (i - 1) * unit_len; + auto end = i * unit_len; + response.vals.segment(begin, end).CopyFrom(src, unit_len); + } + // setup response + response.keys = req_data.keys; + std::vector lens(req_data.keys.size(), unit_len); + lens[0] = 0; + response.lens.CopyFrom(lens.begin(), lens.end()); + server->Response(req_meta, response); + } + } + + void DataHandleDefault(const ps::KVMeta& req_meta, + const ps::KVPairs &req_data, + ps::KVServer* server) { + CHECK_EQ(req_meta.cmd, kDefaultPushPull); // do some check CHECK_EQ(req_data.keys.size(), (size_t)1); if (req_meta.push) { @@ -185,35 +385,13 @@ class KVStoreDistServer { if (merged.array.is_none()) { merged.array = NDArray(dshape, Context()); } - if (merged.request.size() == 0) { CopyFromTo(recved, &merged.array, 0); } else { merged.array += recved; } - merged.request.push_back(req_meta); - - if (merged.request.size() == (size_t)ps::NumWorkers()) { - // let the main thread to execute updater_, which is necessary for - // python - if (updater_) { - exec_.Exec([this, key, &merged, &stored](){ - CHECK(updater_); - updater_(key, merged.array, &stored); - }); - } else { - // if no updater, just copy - CopyFromTo(merged.array, &stored); - } - for (const auto& req : merged.request) { - server->Response(req); - } - merged.request.clear(); - stored.WaitToRead(); - } else { - merged.array.WaitToRead(); - } + ApplyUpdates(key, &merged, &stored, server); } else { // async push exec_.Exec([this, key, &recved, &stored](){ @@ -227,7 +405,7 @@ class KVStoreDistServer { // pull ps::KVPairs response; CHECK(!stored.is_none()) << "init " << key << " first"; - int len = stored.shape()[0]; + auto len = stored.shape().Size(); response.keys = req_data.keys; response.lens = {len}; // TODO(mli) try to remove this CopyFrom @@ -249,16 +427,13 @@ class KVStoreDistServer { KVStore::Updater updater_; std::unordered_map store_; - - struct MergeBuf { - std::vector request; - NDArray array; - }; std::unordered_map merge_buf_; Executor exec_; - ps::KVServer* ps_server_; + + // whether to LOG verbose information + bool log_verbose_; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 536a89b46e13..d8c399edf017 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -62,7 +62,7 @@ class KVStoreLocal : public KVStore { CHECK(local_.find(keys[i]) == local_.end()) << "duplicate init of key " << keys[i]; local_[keys[i]] = values[i].Copy(pinned_ctx_); - comm_->Init(keys[i], values[i].shape(), values[i].dtype()); + comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } } @@ -100,7 +100,11 @@ class KVStoreLocal : public KVStore { } updater_(key, merged, &local); } else { - local = merged; + if (merged.storage_type() != local.storage_type()) { + local = merged.Copy(local.ctx()); + } else { + local = merged; + } } } } @@ -120,6 +124,30 @@ class KVStoreLocal : public KVStore { } } + void PullRowSparse(const std::vector& keys, + const std::vector>& val_rowids, + int priority = 0) override { + std::vector uniq_keys; + std::vector>> grouped_val_rowids; + GroupKVPairs(keys, val_rowids, &uniq_keys, &grouped_val_rowids); + for (size_t i = 0; i < uniq_keys.size(); ++i) { + int key = uniq_keys[i]; + const NDArray& local = local_[key]; + 
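// --- Editor's illustration (not part of the patch): the sort-and-deduplicate step
// that the row-sparse pull paths apply to the requested row ids through the Unique()
// helper defined later in kvstore_local.h (common::ParallelSort followed by
// std::unique on the NDArray's buffer). UniqueRowIds is a hypothetical STL-only
// stand-in working on a plain vector.
#include <algorithm>
#include <cstdint>
#include <vector>

inline std::vector<int64_t> UniqueRowIds(std::vector<int64_t> ids) {
  std::sort(ids.begin(), ids.end());                           // ascending order
  ids.erase(std::unique(ids.begin(), ids.end()), ids.end());   // drop duplicates
  return ids;                                                  // sorted, unique ids
}
// Example: {5, 1, 5, 3, 1} becomes {1, 3, 5}, which is the form BroadcastRowSparse
// expects for its target row_ids.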
CHECK(!local.is_none()) << "key " << key << " has not been inited"; + CHECK_EQ(local.storage_type(), kRowSparseStorage) + << "PullRowSparse expects row_sparse src NDArray"; + auto &target_val_rowids = grouped_val_rowids[i]; + const size_t num_vals = target_val_rowids.size(); + for (size_t i = 0; i < num_vals; i++) { + auto &row_id = target_val_rowids[i].second; + NDArray indices = row_id.Copy(pinned_ctx_); + Unique(&indices, priority); + target_val_rowids[i].second = indices; + } + comm_->BroadcastRowSparse(key, local, grouped_val_rowids[i], false, priority); + } + } + void Push(const std::vector& str_keys, const std::vector& values, int priority) override { @@ -136,6 +164,14 @@ class KVStoreLocal : public KVStore { Pull(keys, values, priority); } + void PullRowSparse(const std::vector& str_keys, + const std::vector>& val_rowids, + const int priority = 0) override { + std::vector keys(str_keys.size()); + LookupKeys(str_keys, &keys); + PullRowSparse(keys, val_rowids, priority); + } + protected: /** * \brief group values on keys @@ -178,6 +214,28 @@ class KVStoreLocal : public KVStore { } } + /** + * \brief sort and get unique values. Output is expected to be on cpu_pinned context + */ + void Unique(NDArray *out, int priority = 0) { + CHECK_EQ(out->ctx().dev_mask(), pinned_ctx_.dev_mask()) + << "Unique expects input with `pinned_ctx_`"; + Engine::Get()->PushSync([out](RunContext rctx) { + NDArray *output = out; + CHECK_EQ(out->shape().ndim(), 1) << "Unique expects 1D inputs"; + const auto size = out->shape()[0]; + auto out_data = output->data(); + MSHADOW_IDX_TYPE_SWITCH(out_data.type_flag_, IType, { + auto dptr = output->data().dptr(); + common::ParallelSort(dptr, dptr + size, omp_get_max_threads()); + auto num_unique_idx = std::unique(dptr, dptr + size) - dptr; + *output = output->Reshape(mshadow::Shape1(num_unique_idx)); + }); + }, pinned_ctx_, {}, {out->var()}, + FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreUnique")); + out->WaitToRead(); + } + /// reducer and broadcaster Comm* comm_; /// pinned context diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 8e71df729b73..0d2968626d79 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -30,6 +30,9 @@ #include #include #include "./ndarray_function.h" +#include "../common/utils.h" +#include "../operator/tensor/matrix_op-inl.h" +#include "../operator/tensor/init_op.h" #include "./autograd.h" #if MXNET_USE_OPENCV @@ -52,6 +55,8 @@ NDArray NDArray::grad() const { NDArray NDArray::Reshape(const TShape &shape) const { using namespace autograd; + CHECK(storage_type() == kDefaultStorage) << "Reshape for storage type " << + storage_type() << " is not implemented yet"; if (AutogradRuntime::Get()->IsTraining()) { CHECK_GE(shape_.Size(), shape.Size()) << "NDArray.Reshape: target shape must have must have the same size as " @@ -82,13 +87,15 @@ NDArray NDArray::Reshape(const TShape &shape) const { } } - NDArray NDArray::Slice(index_t begin, index_t end) const { using namespace autograd; - NDArray ret = *this; + using namespace mshadow; CHECK(!is_none()) << "NDArray is not initialized"; CHECK_LT(begin, end) << "Invalid slicing range [" << begin << ", " << end << ")"; CHECK_GE(shape_[0], end) << "Slice end index out of range"; + CHECK_EQ(storage_type(), kDefaultStorage); + NDArray ret = *this; + auto stype = storage_type(); size_t length = shape_.ProdShape(1, shape_.ndim()); MSHADOW_TYPE_SWITCH(ret.dtype(), DType, { ret.byte_offset_ += begin * length * sizeof(DType); @@ -115,8 +122,9 @@ NDArray 
NDArray::Slice(index_t begin, index_t end) const { } } - NDArray NDArray::At(index_t idx) const { + CHECK(storage_type() == kDefaultStorage) << "Storage type " + << storage_type() << " doesn't support At()"; NDArray ret = this->Slice(idx, idx+1); if (shape_.ndim() > 1) { return ret.Reshape(TShape(shape_.data()+1, shape_.data()+shape_.ndim())); @@ -125,6 +133,24 @@ NDArray NDArray::At(index_t idx) const { } } +/*! + * \brief Return deep copy of the current ndarry's aux_data(i) + * as an NDArray of default storage type. This function blocks. + */ +NDArray NDArray::aux_ndarray(size_t i) const { + CHECK_NE(storage_type(), kDefaultStorage); + CHECK(i < ptr_->aux_shapes.size()); + // create a delay_alloc default ndarray as output + NDArray ret(TShape(), ctx(), true, aux_type(i)); + ret.SyncCopyFromNDArray(*this, i); + return ret; +} + +NDArray NDArray::data_ndarray() const { + NDArray ret(TShape(), ctx(), true, dtype_); + ret.SyncCopyFromNDArray(*this); + return ret; +} bool NDArray::fresh_out_grad() const { if (entry_.ag_node != nullptr) return entry_.ag_node->fresh_out_grad; @@ -239,11 +265,11 @@ void BinaryOp(const NDArray &lhs, // redirect everything to mshadow operations switch (lhs.ctx().dev_mask()) { case cpu::kDevMask: { - Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) { - TBlob tmp = ret.data(); - ndarray::Eval(lhs.data(), rhs.data(), &tmp, ctx); - }, lhs.ctx(), const_vars, {ret.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) { + TBlob tmp = ret.data(); + ndarray::Eval(lhs.data(), rhs.data(), &tmp, ctx); + }, lhs.ctx(), const_vars, {ret.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); break; } #if MXNET_USE_CUDA @@ -269,6 +295,7 @@ void SetValueOp(const real_t &rhs, NDArray *out) { switch (ret.ctx().dev_mask()) { case cpu::kDevMask: { Engine::Get()->PushSync([rhs, ret](RunContext ctx) { + CHECK(ret.storage_type() == kDefaultStorage); TBlob tmp = ret.data(); ndarray::Eval(rhs, &tmp, ctx); }, ret.ctx(), {}, {ret.var()}, @@ -340,6 +367,134 @@ void ScalarOp(const NDArray &lhs, } } +size_t num_aux_data(NDArrayStorageType stype) { + size_t num = 0; + switch (stype) { + case kDefaultStorage: num = 0; break; + case kCSRStorage: num = 2; break; + case kRowSparseStorage: num = 1; break; + default: LOG(FATAL) << "Unknown storage type" << stype; break; + } + return num; +} + +// Make a copy of a CSR NDArray +template +inline void CopyFromToCsrImpl(const NDArray from, NDArray *to, RunContext ctx) { + using namespace mshadow; + CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; + // if source storage is not initialized, fill destination with zeros + auto s = ctx.get_stream(); + if (!from.storage_initialized()) { + op::FillZerosCsrImpl(s, to); + return; + } + // Allocate storage + to->CheckAndAllocAuxData(csr::kIndPtr, from.aux_shape(csr::kIndPtr)); + to->CheckAndAllocAuxData(csr::kIdx, from.aux_shape(csr::kIdx)); + to->CheckAndAllocData(from.aux_shape(csr::kIdx)); + TBlob val = to->data(); + TBlob indptr = to->aux_data(csr::kIndPtr); + TBlob idx = to->aux_data(csr::kIdx); + ndarray::Copy(from.data(), &val, + from.ctx(), to->ctx(), ctx); + ndarray::Copy(from.aux_data(csr::kIndPtr), &indptr, + from.ctx(), to->ctx(), ctx); + ndarray::Copy(from.aux_data(csr::kIdx), &idx, + from.ctx(), to->ctx(), ctx); +} + +// Make a copy of a row-sparse NDArray +template +inline void CopyFromToRspImpl(const NDArray from, NDArray *to, RunContext ctx) { + using namespace mshadow; + 
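// --- Editor's illustration (not part of the patch): the row-sparse layout handled
// by CopyFromToRspImpl here — a kIdx aux array of stored row ids plus a compact
// block of value rows — and how it expands back to the dense shape. DensifyRsp is
// a hypothetical STL-only helper.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

inline std::vector<float> DensifyRsp(const std::vector<int64_t>& row_ids,
                                     const std::vector<float>& data,  // row_ids.size() * row_len values
                                     size_t num_rows, size_t row_len) {
  std::vector<float> dense(num_rows * row_len, 0.0f);     // unlisted rows stay zero
  for (size_t i = 0; i < row_ids.size(); ++i) {
    const size_t r = static_cast<size_t>(row_ids[i]);
    // compact row i holds the dense row row_ids[i]
    std::copy(data.begin() + i * row_len, data.begin() + (i + 1) * row_len,
              dense.begin() + r * row_len);
  }
  return dense;
}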
CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; + // if source is zeros, fill destination with zeros, too + auto s = ctx.get_stream(); + if (!from.storage_initialized()) { + op::FillZerosRspImpl(s, to); + return; + } + auto aux_shape = from.aux_shape(rowsparse::kIdx); + to->CheckAndAlloc({aux_shape}); + TBlob val = to->data(); + TBlob idx = to->aux_data(rowsparse::kIdx); + ndarray::Copy(from.data(), &val, + from.ctx(), to->ctx(), ctx); + ndarray::Copy(from.aux_data(rowsparse::kIdx), &idx, + from.ctx(), to->ctx(), ctx); +} + +// Make a copy of a dense NDArray +template +inline void CopyFromToDnsImpl(const NDArray from, NDArray *to, RunContext ctx) { + using namespace mshadow; + CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; + TBlob tmp = to->data(); + ndarray::Copy(from.data(), &tmp, + from.ctx(), to->ctx(), ctx); +} + +// Make a copy of an NDArray based on storage type +template +void CopyFromToImpl(const NDArray from, NDArray *to, RunContext rctx) { + using namespace std; + using namespace mshadow; + // if storage type doesn't match, cast the storage first + auto from_stype = from.storage_type(); + auto to_stype = to->storage_type(); + CHECK(from_stype == kDefaultStorage + || to_stype == kDefaultStorage + || from_stype == to_stype) + << "Copying ndarray of stype = " << from_stype + << " to stype = " << to_stype << " is not supported"; + const auto from_ctx = from.ctx(); + const auto to_ctx = to->ctx(); + auto s = rctx.get_stream(); + bool is_train = mxnet::autograd::AutogradRuntime::Get()->IsTraining(); + std::vector requested; + if (is_same::value && from_stype != to_stype) { + requested.push_back(ResourceManager::Get()->Request(from_ctx, + ResourceRequest(ResourceRequest::kTempSpace))); + } + OpContext opctx{is_train, + rctx, + engine::CallbackOnComplete(), + requested}; + if (from_ctx == to_ctx && from_stype != to_stype) { + // same ctx, different stypes, use cast op directly without copying + common::CastStorageDispatch(opctx, from, *to); + } else { + NDArray casted_nd; // an intermediate result before copying from to to + if (from_stype == to_stype) { + casted_nd = from; // same stype, no need to cast from + } else { // different stypes on different ctx needs an temporary casted_nd + TShape shape = from.shape(); + if (to_stype == kDefaultStorage) { + casted_nd = NDArray(shape, from_ctx); + } else { + casted_nd = NDArray(to_stype, shape, from_ctx); + } + // convert from_nd to the same stype as to_nd + common::CastStorageDispatch(opctx, from, casted_nd); + } + + if (to_stype == kDefaultStorage) { + CopyFromToDnsImpl(casted_nd, to, rctx); + } else if (to_stype == kRowSparseStorage) { + CopyFromToRspImpl(casted_nd, to, rctx); + } else if (to_stype == kCSRStorage) { + CopyFromToCsrImpl(casted_nd, to, rctx); + } else { + LOG(FATAL) << "unknown storage type" << to_stype; + } + } + if (is_same::value || is_same::value) { + // Wait GPU kernel to complete + rctx.get_stream()->Wait(); + } +} + void CopyFromTo(const NDArray &from, NDArray *to, int priority) { if (from.var() == to->var()) { // skip to copy to itself @@ -354,44 +509,33 @@ void CopyFromTo(const NDArray &from, NDArray *to, int priority) { NDArray ret = *to; int a = from.ctx().dev_mask(); int b = to->ctx().dev_mask(); - std::vector const_vars; if (from.var() != ret.var()) const_vars.push_back(from.var()); if (a == cpu::kDevMask && b == cpu::kDevMask) { Engine::Get()->PushSync([from, ret](RunContext ctx) { - TBlob tmp = ret.data(); - 
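// --- Editor's illustration (not part of the patch): the dispatch order implemented
// by CopyFromToImpl above, reduced to plain enums — matching storage types copy
// directly, mismatched types on a shared context cast in place, and mismatched
// types across contexts cast on the source context before the cross-device copy.
// PlanCopy is a hypothetical helper that only names the chosen path; it copies nothing.
#include <string>

enum class Stype { kDefault, kRowSparse, kCSR };

inline std::string PlanCopy(Stype from, Stype to, bool same_ctx) {
  if (from == to) return "copy directly using the destination's storage type";
  if (same_ctx)   return "cast storage in place on the shared context";
  // different context and different storage type
  return "cast on the source context into a temporary, then cross-device copy";
}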
ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("CopyCPU2CPU")); } else { #if MXNET_USE_CUDA if (a == cpu::kDevMask && b == gpu::kDevMask) { Engine::Get()->PushSync([from, ret](RunContext ctx) { - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx); }, ret.ctx(), const_vars, {ret.var()}, FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("CopyCPU2GPU")); } else if (a == gpu::kDevMask && b == cpu::kDevMask) { Engine::Get()->PushSync([from, ret](RunContext ctx) { - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2CPU")); } else if (a == gpu::kDevMask && b == gpu::kDevMask) { Engine::Get()->PushSync([from, ret](RunContext ctx) { - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx); }, from.ctx(), const_vars, {ret.var()}, from.dtype() != ret.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2GPU")); @@ -665,34 +809,76 @@ NDArray &NDArray::operator/=(const real_t &src) { /* magic number for ndarray version 1, with int64_t TShape */ static const uint32_t NDARRAY_V1_MAGIC = 0xF993fac8; +/* magic number for ndarray version 2, with storage type */ +static const uint32_t NDARRAY_V2_MAGIC = 0xF993fac9; + void NDArray::Save(dmlc::Stream *strm) const { - strm->Write(NDARRAY_V1_MAGIC); + // write magic number to mark this version + // for storage type + strm->Write(NDARRAY_V2_MAGIC); + + // save storage type + int32_t stype = storage_type(); + strm->Write(&stype, sizeof(stype)); + + const int32_t nad = num_aux_data(storage_type()); + // save storage shape if ndarray is sparse + if (nad > 0) { + storage_shape().Save(strm); + } + + // save shape shape_.Save(strm); if (is_none()) return; + // save context Context ctx = this->ctx(); ctx.Save(strm); TBlob save_data; - NDArray temp; + NDArray nd_cpu; // a copy of *this on cpu if (ctx.dev_mask() != cpu::kDevMask) { - temp = this->Copy(Context::CPU()); - temp.WaitToRead(); - save_data = temp.data(); + nd_cpu = this->Copy(Context::CPU()); + nd_cpu.WaitToRead(); + save_data = nd_cpu.data(); } else { this->WaitToRead(); save_data = this->data(); + nd_cpu = *this; } + // save type flag int32_t type_flag = save_data.type_flag_; strm->Write(&type_flag, sizeof(type_flag)); + + // save aux_types and aux_shapes + if (nad > 0) { + for (int i = 0; i < nad; ++i) { + int32_t aux_type_flag = aux_type(i); + strm->Write(&aux_type_flag, sizeof(aux_type_flag)); + aux_shape(i).Save(strm); + } + } + + // save data CHECK(save_data.CheckContiguous()); size_t type_size = mshadow::mshadow_sizeof(type_flag); - strm->Write(save_data.dptr_, type_size * shape_.Size()); + // save data could be values of sparse tensors + // must use save_data.shape_ instead of this->shape_ + strm->Write(save_data.dptr_, type_size * save_data.shape_.Size()); + + // save aux data + if (nad > 0) { + for (int i = 0; i < nad; ++i) { + TBlob save_data = 
nd_cpu.aux_data(i); + // save aux_data + CHECK(save_data.CheckContiguous()); + size_t aux_type_size = mshadow::mshadow_sizeof(aux_type(i)); + strm->Write(save_data.dptr_, aux_type_size * save_data.Size()); + } + } } -bool LegacyTShapeLoad(dmlc::Stream *strm, TShape *shape) { - uint32_t magic; - if (strm->Read(&magic, sizeof(uint32_t)) != sizeof(uint32_t)) return false; +bool LegacyTShapeLoad(dmlc::Stream *strm, TShape *shape, const uint32_t magic) { switch (magic) { case NDARRAY_V1_MAGIC: return shape->Load(strm); @@ -708,10 +894,10 @@ bool LegacyTShapeLoad(dmlc::Stream *strm, TShape *shape) { } } -bool NDArray::Load(dmlc::Stream *strm) { +bool NDArray::LegacyLoad(dmlc::Stream *strm, const uint32_t magic) { // load shape TShape shape; - if (!LegacyTShapeLoad(strm, &shape)) return false; + if (!LegacyTShapeLoad(strm, &shape, magic)) return false; if (shape.ndim() == 0) { *this = NDArray(); return true; } @@ -739,6 +925,88 @@ bool NDArray::Load(dmlc::Stream *strm) { } } +bool NDArray::Load(dmlc::Stream *strm) { + uint32_t magic; + if (strm->Read(&magic, sizeof(uint32_t)) != sizeof(uint32_t)) return false; + if (magic != NDARRAY_V2_MAGIC) { + return LegacyLoad(strm, magic); + } + + // load storage type + int32_t stype; + if (strm->Read(&stype, sizeof(stype)) != sizeof(stype)) return false; + const int32_t nad = num_aux_data(static_cast(stype)); + + // load storage shape + TShape sshape; + if (nad > 0) { + if (!sshape.Load(strm)) return false; + } + + // load shape + TShape shape; + if (!shape.Load(strm)) return false; + if (shape.ndim() == 0) { + *this = NDArray(); return true; + } + + // load context + Context ctx; + if (!ctx.Load(strm)) return false; + + // load type flag + int32_t type_flag; + if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag)) return false; + + // load aux_types and aux_shapes + std::vector aux_types; + std::vector aux_shapes; + if (nad > 0) { + aux_types.resize(nad); + aux_shapes.resize(nad); + for (int i = 0; i < nad; ++i) { + // load aux_type(i) + if (strm->Read(&aux_types[i], sizeof(aux_types[i])) != sizeof(aux_types[i])) return false; + // load aux_shapes(i) + if (!aux_shapes[i].Load(strm)) return false; + } + } + + // load data into CPU + NDArray temp; + if (0 == nad) { + temp = NDArray(shape, Context::CPU(), false, type_flag); + } else { + temp = NDArray(static_cast(stype), shape, + Context::CPU(), false, type_flag, + aux_types, aux_shapes, sshape); + } + // load data + TBlob load_data = temp.data(); + size_t type_size = mshadow::mshadow_sizeof(type_flag); + size_t nread = type_size * load_data.Size(); + if (strm->Read(load_data.dptr_, nread) != nread) return false; + + // load aux_data + if (nad > 0) { + for (int i = 0; i < nad; ++i) { + load_data = temp.aux_data(i); + type_size = mshadow::mshadow_sizeof(load_data.type_flag_); + nread = type_size * load_data.Size(); + if (strm->Read(load_data.dptr_, nread) != nread) return false; + } + } + + if (ctx.dev_mask() == cpu::kDevMask) { + *this = std::move(temp); return true; + } else { +#if MXNET_USE_CUDA + *this = temp.Copy(ctx); return true; +#else + *this = std::move(temp); return true; +#endif + } +} const uint64_t kMXAPINDArrayListMagic = 0x112; @@ -771,7 +1039,16 @@ void NDArray::Load(dmlc::Stream* fi, } NDArray NDArray::Copy(Context ctx) const { - NDArray ret(shape(), ctx, true, dtype_); + NDArray ret; + if (kDefaultStorage == storage_type()) { + ret = NDArray(shape(), ctx, true, dtype_); + } else if (kUndefinedStorage != storage_type()) { + ret = NDArray(storage_type(), shape(), ctx, true, 
dtype_, + ptr_->aux_types, ptr_->aux_shapes, storage_shape()); + } else { + LOG(FATAL) << "NDArray::Copy cannot copy undefined storage-type ndarray to ctx.dev_type=" + << ctx.dev_type << ", ctx.dev_id=" << ctx.dev_id; + } CopyFromTo(*this, &ret); return ret; } @@ -804,6 +1081,101 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const { } } +/*! + * \brief Copy src.data()/aux_data(i) to dst->data()/aux_data(j). + */ +void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) { + if (i >= 0) { + CHECK_NE(src.storage_type(), kDefaultStorage); + } else { + CHECK(!src.is_none()) << "src dense ndarray must have been initialized"; + } + if (j >= 0) { + CHECK_NE(storage_type(), kDefaultStorage); + } else { + CHECK(!this->is_none()) << "dst dense ndarray must have been initialized"; + } + + if (src.var() == var()) { + // skip to copy to itself + LOG(WARNING) << "SyncCopyFromNDArray does not support copying to self"; + return; + } + const int src_dev_mask = src.ctx().dev_mask(); + const int dst_dev_mask = ctx().dev_mask(); + std::vector const_vars; + const_vars.push_back(src.var()); + + // get or create a dst tblob for copying src to it + // if dst is a dense format and has not been allocated, allocate memory for it + // else if dst is not initialized, allocate corresponding data blob for it + auto get_dst_data = [&](const TShape& src_shape) { + if (this->storage_type() == kDefaultStorage) { + this->ReshapeAndAlloc(src_shape); + } else if (!this->storage_initialized()) { + if (j < 0) { + this->CheckAndAllocData(src_shape); + } else { + this->CheckAndAllocAuxData(j, src_shape); + } + } + TBlob dst_data = (j >= 0? this->aux_data(j) : this->data()); + CHECK_LE(src_shape.Size(), dst_data.shape_.Size()); + return dst_data; + }; + + if (src_dev_mask == cpu::kDevMask && dst_dev_mask == cpu::kDevMask) { + Engine::Get()->PushSync([&](RunContext rctx) { + const TBlob src_data = (i >= 0? src.aux_data(i) : src.data()); + TBlob dst_data = get_dst_data(src_data.shape_); + ndarray::Copy(src_data, &dst_data, src.ctx(), this->ctx(), rctx); + }, this->ctx(), const_vars, {this->var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayCPU2CPU")); + } else { +#if MXNET_USE_CUDA + if (src_dev_mask == cpu::kDevMask && dst_dev_mask == gpu::kDevMask) { + Engine::Get()->PushSync([&](RunContext rctx) { + const TBlob src_data = (i >= 0? src.aux_data(i) : src.data()); + TBlob dst_data = get_dst_data(src_data.shape_); + ndarray::Copy(src_data, &dst_data, src.ctx(), this->ctx(), rctx); + rctx.get_stream()->Wait(); + }, this->ctx(), const_vars, {this->var()}, + FnProperty::kCopyToGPU, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayCPU2GPU")); + } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == cpu::kDevMask) { + Engine::Get()->PushSync([&](RunContext rctx) { + const TBlob src_data = (i >= 0? src.aux_data(i) : src.data()); + TBlob dst_data = get_dst_data(src_data.shape_); + ndarray::Copy(src_data, &dst_data, src.ctx(), this->ctx(), rctx); + rctx.get_stream()->Wait(); + }, this->ctx(), const_vars, {this->var()}, + FnProperty::kCopyFromGPU, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayGPU2CPU")); + } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == gpu::kDevMask) { + Engine::Get()->PushSync([&](RunContext rctx) { + const TBlob src_data = (i >= 0? 
src.aux_data(i) : src.data()); + TBlob dst_data = get_dst_data(src_data.shape_); + ndarray::Copy(src_data, &dst_data, src.ctx(), this->ctx(), rctx); + rctx.get_stream()->Wait(); + }, this->ctx(), const_vars, {this->var()}, + src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU, + 0, PROFILER_MESSAGE("SyncCopyFromNDArrayGPU2GPU")); + } else { + LOG(FATAL) << "unknown device mask"; + } +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } + // The copy operation was pushed to engine to execute. + // Need to wait here for it being completed. + // The reason for pushing the copy operation to engine + // is because when copying data from a sparse tensor + // to the current one, that sparse ndarray's storage_shape/aux_shape + // may not be ready or changed and we need to ensure + // thread safty for reading the correct shape info to allocate + // memory for the current ndarray. + WaitToRead(); +} + void NDArray::SyncCopyToCPU(void *data, size_t size) const { TShape dshape = this->shape(); CHECK_EQ(dshape.Size(), size) diff --git a/src/ndarray/ndarray_function-inl.h b/src/ndarray/ndarray_function-inl.h index 2be55f50f934..b284e0378647 100644 --- a/src/ndarray/ndarray_function-inl.h +++ b/src/ndarray/ndarray_function-inl.h @@ -30,27 +30,28 @@ // macro to help specialize evaluation function #ifndef DECL_TERNARY -#define DECL_TERNARY(XPU, OP, FUN) \ - template<> \ - void Eval(const TBlob &lhs, const TBlob &mhs, \ - const TBlob &rhs, TBlob *ret, RunContext ctx) { \ - FUN(lhs, mhs, rhs, ret, ctx); \ +#define DECL_TERNARY(XPU, OP, FUN) \ + template<> \ + void Eval(const TBlob &lhs, const TBlob &mhs, \ + const TBlob &rhs, TBlob *ret, RunContext ctx) { \ + FUN(lhs, mhs, rhs, ret, ctx); \ } #endif #ifndef DECL_BINARY -#define DECL_BINARY(XPU, OP, FUN) \ - template<> \ +#define DECL_BINARY(XPU, OP, FUN) \ + template<> \ void Eval(const TBlob &lhs, const TBlob &rhs, TBlob *ret, RunContext ctx) { \ - FUN(lhs, rhs, ret, ctx); \ + FUN(lhs, rhs, ret, ctx); \ } #endif #ifndef DECL_SCALAR -#define DECL_SCALAR(XPU, OP, FUN, REVERSE) \ - template<> \ - void Eval(const TBlob &lhs, const real_t &rhs, TBlob *ret, RunContext ctx) { \ - FUN(lhs, rhs, ret, ctx); \ +#define DECL_SCALAR(XPU, OP, FUN, REVERSE) \ + template<> \ + void Eval(const TBlob &lhs, const real_t &rhs, \ + TBlob *ret, RunContext ctx) { \ + FUN(lhs, rhs, ret, ctx); \ } #endif @@ -62,10 +63,11 @@ namespace mxnet { namespace ndarray { + // true implementation template -inline void EvalBinary_(const TBlob &lhs, const TBlob &rhs, - TBlob *ret, RunContext ctx) { +void EvalBinary_(const TBlob &lhs, const TBlob &rhs, + TBlob *ret, RunContext ctx) { using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); CHECK_EQ(ret->type_flag_, lhs.type_flag_) @@ -79,10 +81,9 @@ inline void EvalBinary_(const TBlob &lhs, const TBlob &rhs, }); } - template -inline void EvalOneHot_(const TBlob &index, const TBlob &rhs, - TBlob *ret, RunContext ctx) { +void EvalOneHot_(const TBlob &index, const TBlob &rhs, + TBlob *ret, RunContext ctx) { LOG(INFO) << "The operator onehot_encode is deprecated; use one_hot instead."; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); @@ -99,8 +100,8 @@ inline void EvalOneHot_(const TBlob &index, const TBlob &rhs, } template -inline void EvalMatChooseRowElem_(const TBlob &lhs, const TBlob &rhs, - TBlob *ret, RunContext ctx) { +void EvalMatChooseRowElem_(const TBlob &lhs, const TBlob &rhs, + TBlob *ret, RunContext ctx) { using namespace mshadow::expr; mshadow::Stream *s = 
ctx.get_stream(); // TODO(eric): support mixed type choose, i.e. int index and float rhs. @@ -116,8 +117,8 @@ inline void EvalMatChooseRowElem_(const TBlob &lhs, const TBlob &rhs, } template -inline void EvalMatFillRowElem_(const TBlob &lhs, const TBlob &mhs, const TBlob &rhs, - TBlob *ret, RunContext ctx) { +void EvalMatFillRowElem_(const TBlob &lhs, const TBlob &mhs, const TBlob &rhs, + TBlob *ret, RunContext ctx) { using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); ret->get(s) @@ -127,8 +128,8 @@ inline void EvalMatFillRowElem_(const TBlob &lhs, const TBlob &mhs, const TBlob } template -inline void EvalScalar_(const TBlob &lhs, const real_t &rhs, - TBlob *ret, RunContext ctx) { +void EvalScalar_(const TBlob &lhs, const real_t &rhs, + TBlob *ret, RunContext ctx) { using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); CHECK_EQ(ret->type_flag_, lhs.type_flag_) @@ -148,7 +149,7 @@ inline void EvalScalar_(const TBlob &lhs, const real_t &rhs, template<> void EvalClip(const TBlob &src, const real_t &a_min, const real_t &a_max, - TBlob *ret, RunContext ctx) { + TBlob *ret, RunContext ctx) { typedef DEVICE xpu; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); @@ -163,12 +164,11 @@ void EvalClip(const TBlob &src, const real_t &a_min, const real_t &a_max } template<> -void EvalRandom( - const real_t &a, - const real_t &b, - const Resource &resource, - TBlob *ret, - RunContext ctx) { +void EvalRandom(const real_t &a, + const real_t &b, + const Resource &resource, + TBlob *ret, + RunContext ctx) { typedef DEVICE xpu; mshadow::Stream *s = ctx.get_stream(); switch (ret->type_flag_) { @@ -444,6 +444,7 @@ DECL_SCALAR(DEVICE, Plus, EvalScalar_, true) DECL_SCALAR(DEVICE, Minus, EvalScalar_, true) DECL_SCALAR(DEVICE, Mul, EvalScalar_, true) DECL_SCALAR(DEVICE, Div, EvalScalar_, true) + // for reverse seq DECL_SCALAR(DEVICE, Plus, EvalScalar_, false) DECL_SCALAR(DEVICE, Minus, EvalScalar_, false) diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index e4af86d2c824..5cea7942efa6 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -25,6 +25,7 @@ // this will be invoked by gcc and compile CPU version #include "./ndarray_function.h" #include "./ndarray_function-inl.h" +#include "../common/utils.h" namespace mxnet { namespace ndarray { @@ -44,5 +45,138 @@ void Copy(const TBlob &from, TBlob *to, } }) } + +template +void ElementwiseSumRspImpl(mshadow::Stream* s, + const std::vector& nds, + const std::vector& uniq_row_idx, + NDArray* out, + const int nthreads = 4) { +#pragma omp parallel num_threads(nthreads) + { + const size_t nnr = uniq_row_idx.size(); + const int num_threads = omp_get_num_threads(); + size_t row_block_len = (nnr + num_threads - 1) / num_threads; + const size_t row_block_start = omp_get_thread_num() * row_block_len; + if (row_block_start < nnr) { + const size_t row_block_end = std::min(row_block_start+row_block_len, nnr); + + const size_t row_length = out->data().shape_.ProdShape(1, out->data().shape_.ndim()); + auto out_values = out->data().get_with_shape( + mshadow::Shape2(out->storage_shape()[0], row_length), s); + auto out_indices = out->aux_data(rowsparse::kIdx).FlatTo1D(); + for (size_t i = row_block_start; i < row_block_end; ++i) { + out_indices[i] = uniq_row_idx[i]; + } + for (const auto& nd : nds) { + if (nd.storage_initialized()) { + const auto nd_indices = nd.aux_data(rowsparse::kIdx).FlatTo1D(); + const auto nd_values = nd.data().get_with_shape( + 
mshadow::Shape2(nd.storage_shape()[0], row_length), s); + const auto nd_num_rows = nd.aux_shape(rowsparse::kIdx).Size(); + const IType* nd_indices_start = &nd_indices[0]; + const IType* nd_indices_end = nd_indices_start + nd_num_rows; + const IType* row_idx_ptr = std::lower_bound(nd_indices_start, nd_indices_end, + out_indices[row_block_start]); + // skip this nd if all of its row indices are smaller than out_indices[row_block_start] + // or current row block is not covered by [*row_idx_ptr, nd_indices_end). + if (nd_indices_end == row_idx_ptr || *row_idx_ptr > out_indices[row_block_end-1]) { + continue; + } + for (size_t irow = row_block_start; + irow < row_block_end && row_idx_ptr != nd_indices_end;) { + if (out_indices[irow] == *row_idx_ptr) { + auto out_value_cur_row = out_values[irow]; + const auto offset = row_idx_ptr - nd_indices_start; + auto nd_value_cur_row = nd_values[offset]; + for (size_t j = 0; j < nd_value_cur_row.shape_[0]; ++j) { + out_value_cur_row[j] += nd_value_cur_row[j]; + } + ++irow; + ++row_idx_ptr; + } else if (out_indices[irow] < *row_idx_ptr) { + ++irow; + } else { + ++row_idx_ptr; + } + } + } + } + } + } +} + +/*! + * \brief Given a vector of ndarrays, generate a index vector containing + * all the unique row indices of the ndarrays. + */ +template +void GetUniqueRspRowIdx(const std::vector& nds, + std::vector* uniq_row_idx) { + using namespace rowsparse; + size_t total_num_rows = 0; + for (const auto& nd : nds) { + CHECK_EQ(nd.storage_type(), kRowSparseStorage); + if (nd.storage_initialized()) { + total_num_rows += nd.aux_shape(kIdx).Size(); + } + } + + uniq_row_idx->resize(total_num_rows); + int nthreads = omp_get_max_threads(); + int offset = 0; + for (const auto& nd : nds) { + if (nd.storage_initialized()) { + const IType* nd_row_idx = nd.aux_data(kIdx).dptr(); + const int num_rows = nd.aux_shape(kIdx).Size(); +#pragma omp parallel for num_threads(nthreads) + for (int i = 0; i < num_rows; ++i) { + (*uniq_row_idx)[offset+i] = nd_row_idx[i]; + } + offset += num_rows; + } + } + + common::ParallelSort(uniq_row_idx->begin(), uniq_row_idx->end(), nthreads); + auto it = std::unique(uniq_row_idx->begin(), uniq_row_idx->end()); + uniq_row_idx->resize(it - uniq_row_idx->begin()); +} + +void ElementwiseSumRsp(mshadow::Stream* s, const std::vector& nds, NDArray* out) { + if (nds.empty()) return; + using namespace rowsparse; + CHECK_EQ(out->storage_type(), kRowSparseStorage) + << "Expected row sparse storage type (" + << out->storage_type() << " given)"; + + MSHADOW_TYPE_SWITCH(out->dtype(), DType, { + MSHADOW_IDX_TYPE_SWITCH(out->aux_type(kIdx), IType, { + std::vector uniq_row_idx; + GetUniqueRspRowIdx(nds, &uniq_row_idx); + out->CheckAndAlloc({mshadow::Shape1(uniq_row_idx.size())}); + out->data().FlatTo2D() = static_cast(0); + ElementwiseSumRspImpl(s, nds, uniq_row_idx, out, omp_get_max_threads()); + }); + }); +} + +/*! + * \brief Parallel cpu impl of elemwise sum for sparse tensors. + * Currently only support row sparse sum. 
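+ *
+ * For example, summing two row_sparse arrays whose row indices are {0, 2} and
+ * {2, 3} first gathers the unique row set {0, 2, 3} (GetUniqueRspRowIdx), then
+ * allocates and zero-fills an output with three value rows, and finally lets
+ * ElementwiseSumRspImpl accumulate every matching input row into the output
+ * row carrying the same index.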
+ */ +template<> +void ElementwiseSum(mshadow::Stream* s, + const std::vector& nds, + NDArray* out) { + if (nds.empty()) return; + + if (nds[0].storage_type() == kRowSparseStorage) { + ElementwiseSumRsp(s, nds, out); + } else { + LOG(FATAL) << "ElementwiseSum has not been implemented for storage_type = << " + << nds[0].storage_type(); + } +} + } // namespace ndarray } // namespace mxnet diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index b1ed58db3e74..65c59185f691 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "../operator/mshadow_op.h" @@ -168,6 +169,14 @@ void ElementwiseSum(const std::vector source, TBlob *out, RunContext ctx); +/*! + * \brief Interface for parallel impl of elemwise sum for sparse matrices + */ +template +void ElementwiseSum(mshadow::Stream* s, + const std::vector& nds, + NDArray* out); + // broadcasting template void EvalBroadcast(TBlob const& src, TBlob* ret, int size, RunContext ctx); diff --git a/src/nnvm/legacy_op_util.cc b/src/nnvm/legacy_op_util.cc index 2bba5f1c3655..6e601780080b 100644 --- a/src/nnvm/legacy_op_util.cc +++ b/src/nnvm/legacy_op_util.cc @@ -60,19 +60,20 @@ class OperatorState { opr_ = opr; fwd_init_ = bwd_init_ = false; - in_data_.resize(prop->ListArguments().size()); + in_data_fwd_.resize(prop->ListArguments().size()); + in_data_bwd_.resize(prop->ListArguments().size()); out_data_.resize(prop->NumOutputs()); aux_data_.resize(prop->ListAuxiliaryStates().size()); - in_grad_.resize(in_data_.size()); + in_grad_.resize(in_data_fwd_.size()); out_grad_.resize(prop->NumVisibleOutputs()); std::vector out_grad_ptr(out_grad_.size()); for (size_t i = 0; i < out_grad_.size(); ++i) { out_grad_ptr[i] = &out_grad_[i]; } - std::vector in_data_ptr(in_data_.size()); - for (size_t i = 0; i < in_data_.size(); ++i) { - in_data_ptr[i] = &in_data_[i]; + std::vector in_data_ptr(in_data_fwd_.size()); + for (size_t i = 0; i < in_data_fwd_.size(); ++i) { + in_data_ptr[i] = &in_data_bwd_[i]; } std::vector out_data_ptr(out_data_.size()); for (size_t i = 0; i < out_data_.size(); ++i) { @@ -89,16 +90,19 @@ class OperatorState { const std::vector& req, const std::vector& outputs) { if (!fwd_init_) { - CHECK_EQ(inputs.size(), in_data_.size() + aux_data_.size()); + CHECK_EQ(inputs.size(), in_data_fwd_.size() + aux_data_.size()); CHECK_EQ(outputs.size(), out_data_.size()); - for (size_t i = 0; i < in_data_.size(); ++i) in_data_[i] = inputs[i]; + // in_data_bwd_ has the same tblobs as the ones in in_data_fwd_, except that the ones + // referred by arg_data_ptr_ will be overriden + for (size_t i = 0; i < in_data_fwd_.size(); ++i) in_data_fwd_[i] = inputs[i]; + for (size_t i = 0; i < in_data_fwd_.size(); ++i) in_data_bwd_[i] = inputs[i]; for (size_t i = 0; i < aux_data_.size(); ++i) { - aux_data_[i] = inputs[i + in_data_.size()]; + aux_data_[i] = inputs[i + in_data_fwd_.size()]; } for (size_t i = 0; i < out_data_.size(); ++i) out_data_[i] = outputs[i]; fwd_init_ = true; } - opr_->Forward(ctx, in_data_, req, out_data_, aux_data_); + opr_->Forward(ctx, in_data_fwd_, req, out_data_, aux_data_); } void Backward(const OpContext &ctx, @@ -108,6 +112,8 @@ class OperatorState { if (!bwd_init_) { CHECK(fwd_init_); CHECK_EQ(arg_data_ptr_.size() + aux_data_.size(), inputs.size()); + // override tblobs pointed by arg_data_ptr_ since they might not contain + // initialized data during forward pass. 
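+      // The in_data pointers inside arg_data_ptr_ refer to entries of in_data_bwd_
+      // (set up in the constructor), so this rebinding only affects the blobs used by
+      // Backward(); in_data_fwd_, which is passed to Forward(), keeps the blobs
+      // captured when the forward executor was set up.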
for (size_t i = 0; i < arg_data_ptr_.size(); ++i) { *arg_data_ptr_[i] = inputs[i]; } @@ -118,13 +124,19 @@ class OperatorState { for (size_t i = 0; i < outputs.size(); ++i) in_grad_[i] = outputs[i]; bwd_init_ = true; } - opr_->Backward(ctx, out_grad_, in_data_, out_data_, req, in_grad_, aux_data_); + opr_->Backward(ctx, out_grad_, in_data_bwd_, out_data_, req, in_grad_, aux_data_); } private: Operator *opr_; bool fwd_init_, bwd_init_; - std::vector in_data_, aux_data_, out_data_, in_grad_, out_grad_; + // input data blobs for forward and backward + // in_data_fwd_ and in_data_bwd_ will hold different tblobs when StorageFallbackOpExecutor + // performs storage fallback on a non-default input NDArray. The one in in_data_fwd_ is + // generated when setting up forward executor, while the one in in_data_bwd_ is generated + // when setting up backward executor. + std::vector in_data_fwd_, in_data_bwd_; + std::vector aux_data_, out_data_, in_grad_, out_grad_; std::vector arg_data_ptr_; }; diff --git a/src/operator/batch_norm.cc b/src/operator/batch_norm.cc index 86f47dd6163f..866b7fe619cb 100644 --- a/src/operator/batch_norm.cc +++ b/src/operator/batch_norm.cc @@ -230,7 +230,7 @@ void BatchNormOp::DoBackward(mshadow::Stream *, #pragma omp parallel for for (int channel = 0; channel < static_cast(channelCount); ++channel) { const AccReal *weight = weights.dptr(); - const AccReal w = weight ? weight[channel] : AccReal(1); + const AccReal w = !param_.fix_gamma ? weight[channel] : AccReal(1); AccReal mean, invstd; if (is_train_and_not_global_stats) { mean = saveMeanDataPtr[channel]; diff --git a/src/operator/batch_norm.cu b/src/operator/batch_norm.cu index 64f7d9373823..9a8b576a16ee 100644 --- a/src/operator/batch_norm.cu +++ b/src/operator/batch_norm.cu @@ -283,7 +283,7 @@ __global__ void BatchNormalizationUpdateOutputKernel( } // Write normalized and update the output - const AccReal gamma = weight.numElements() > 0 + const AccReal gamma = ((flags & FIX_GAMMA_FLAG) == 0 && weight.numElements() > 0) ? ScalarConvert::to(weight[plane]) : ScalarConvert::to(1); const AccReal beta = bias.numElements() > 0 ? ScalarConvert::to(bias[plane]) @@ -332,7 +332,7 @@ static __global__ void BatchNormalizationBackwardKernel( invstd = VARIANCE_TO_INVSTD(tensors.runningVar[plane], eps); } - const AccReal weightVal = tensors.weight.numElements() > 0 ? + const AccReal weightVal = ((flags & FIX_GAMMA_FLAG) == 0 && tensors.weight.numElements() > 0) ? 
ScalarConvert::to(tensors.weight[plane]) : AccReal(1); const AccReal norm = AccReal(1) / N; diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h index 9b398f947e30..f60bb590a2e6 100644 --- a/src/operator/elemwise_op_common.h +++ b/src/operator/elemwise_op_common.h @@ -80,6 +80,42 @@ inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs, return true; } +// Only inferring output storage types from input for now +template +inline bool ElemwiseStorageAttr(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + auto deduce = [&](std::vector *vec, const char *name, AttrType& result, + bool fallback) { + auto &v = *vec; + for (size_t i = 0; i < vec->size(); ++i) { + if (v[i] == kUndefinedStorage) { + // if input type is unknown, assume it's default storage + CHECK(assign(&v[i], kDefaultStorage)); + } else if (assign(&result, v[i]) == false && fallback) { + result = kDefaultStorage; + } + } + }; + AttrType dattr = kUndefinedStorage; + deduce(in_attrs, "input", dattr, enable_fallback); + if (reverse_infer) { + LOG(FATAL) << "not implemented yet"; + } + auto write = [&](std::vector *vec, const char *name) { + for (size_t i = 0; i < vec->size(); ++i) { + CHECK(assign(&(*vec)[i], dattr)) + << "Incompatible attr in node " << attrs.name << " at " << i << "-th " + << name << ": " << "expected " << dattr << ", got " << (*vec)[i]; + } + }; + if (is_none(dattr)) dattr = kDefaultStorage; + write(out_attrs, "output"); + return true; +} + template inline bool ElemwiseShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, @@ -108,6 +144,18 @@ inline bool ElemwiseType(const nnvm::NodeAttrs& attrs, attrs, in_attrs, out_attrs, -1); } +template +inline bool ElemwiseStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + // TODO(junwu): add ctx info into storage inference logic + CHECK_EQ(in_attrs->size(), static_cast(n_in)) << " in operator " << attrs.name; + CHECK_EQ(out_attrs->size(), static_cast(n_out)) << " in operator " << attrs.name; + return ElemwiseStorageAttr( + attrs, in_attrs, out_attrs); +} + // Transfer gradient and input to FGradient function struct ElemwiseGradUseIn { const char *op_name; diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 0af7d026d9d5..3162ab6b7b16 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -25,8 +25,12 @@ #ifndef MXNET_OPERATOR_MXNET_OP_H_ #define MXNET_OPERATOR_MXNET_OP_H_ +#include #include #include +#ifdef __CUDACC__ +#include "../common/cuda_utils.h" +#endif // __CUDACC__ namespace mxnet { namespace op { @@ -40,6 +44,8 @@ const float PI = 3.14159265358979323846; using std::isnan; #endif +template +int get_num_threads(const int N); #ifdef __CUDACC__ #define CUDA_KERNEL_LOOP(i, n) \ @@ -47,6 +53,13 @@ using std::isnan; i < (n); \ i += blockDim.x * gridDim.x) +inline cudaDeviceProp cuda_get_device_prop() { + int device; + CUDA_CALL(cudaGetDevice(&device)); + cudaDeviceProp deviceProp; + CUDA_CALL(cudaGetDeviceProperties(&deviceProp, device)); + return deviceProp; +} /*! 
* \brief Get the number of blocks for cuda kernel given N @@ -55,8 +68,18 @@ inline int cuda_get_num_blocks(const int N) { using namespace mshadow::cuda; return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); } + +template<> +inline int get_num_threads(const int N) { + using namespace mshadow::cuda; + return kBaseThreadNum * cuda_get_num_blocks(N); +} #endif // __CUDACC__ +template<> +inline int get_num_threads(const int N) { + return omp_get_max_threads(); +} /*! \brief operator request type switch */ #define MXNET_ASSIGN_REQ_SWITCH(req, ReqType, ...) \ @@ -216,7 +239,6 @@ __global__ void mxnet_generic_kernel(int N, Args... args) { } } - template struct Kernel { template diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index 2d46bd3230ce..dc53e1a7d232 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -29,12 +29,15 @@ #include #include #include +#include +#include #include #include #include #include #include #include "../common/cuda_utils.h" +#include "../common/utils.h" namespace mxnet { namespace op { @@ -125,6 +128,19 @@ inline std::string type_string(const int& x) { return "unknown"; } +/*! \brief get string representation of storage_type */ +inline std::string stype_string(const int& x) { + switch (x) { + case kDefaultStorage: + return "default"; + case kCSRStorage: + return "csr"; + case kRowSparseStorage: + return "row_sparse"; + } + return "unknown"; +} + /*! * \brief Assign x to y. Checks for compatiblity when y is not empty. * Allow missing dim in both x and y (as 0). @@ -201,6 +217,24 @@ inline bool type_assign(int *y, const int& x) { } \ } +/*! + * \brief macro assign type to out if out is unknown (-1) otherwise check consistency + * Use macro so we can see the error file more clearly + * \param type_array the storage type array to store the result + * \param index the index of in the array + * \param type the inferred storage type + */ +#define STORAGE_TYPE_ASSIGN_CHECK(type_array, index, type) \ + { \ + if (!type_assign(&(type_array)[index], type)) { \ + std::ostringstream os; \ + os << "Storage type inconsistent, Provided=" \ + << stype_string((type_array)[index]) << ',' \ + << " inferred storage type=" << stype_string(type); \ + throw ::mxnet::op::InferTypeError(os.str(), index); \ + } \ + } + // helper macro to implement bind dispatch #if MXNET_USE_CUDA #define DO_BIND_DISPATCH(Method, ...) \ @@ -333,6 +367,54 @@ inline void ParamParser(nnvm::NodeAttrs* attrs) { attrs->parsed = std::move(param); } +/*! \brief Perform storage fallback to invoke fcompute. 
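+ *
+ * Inputs held in a non-default (sparse) storage type are cast into temporary
+ * dense NDArrays first (SetupDefaultBlobs + CastNonDefaultStorage), the dense
+ * fcompute is then invoked on the resulting TBlobs, and finally the temporaries
+ * backing non-default outputs (and the mutable inputs listed in mutate_idx) are
+ * cast back into their original storage types.
+ *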
+ * \param attrs attributes of the operator + * \param ctx operator context + * \param inputs inputs of fcompute + * \param req req of fcompute + * \param outputs outputs of fcompute + * \param fcompute + * \param fname name of the operator + * \param mutate_idx the indices of mutable inputs + */ +template +void FCompExFallback(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs, + FCompute fcompute, + const std::string& fname, + std::vector mutate_idx = {}) { + using namespace mxnet::common; + std::vector in_blobs, out_blobs; + std::vector pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src; + // mapping from index in input_blobs to index in pre_temp_dst + std::unordered_map in_temp_idx_map; + SetupDefaultBlobs(inputs, &in_blobs, &pre_temp_src, &pre_temp_dst, &in_temp_idx_map); + SetupDefaultBlobs(outputs, &out_blobs, &post_temp_dst, &post_temp_src); + for (const auto idx : mutate_idx) { + auto map_iter = in_temp_idx_map.find(idx); + if (map_iter != in_temp_idx_map.end()) { + post_temp_src.push_back(pre_temp_dst[map_iter->second]); + post_temp_dst.push_back(inputs[idx]); + } + } + CastNonDefaultStorage(pre_temp_src, pre_temp_dst, ctx, true); + fcompute(attrs, ctx, in_blobs, req, out_blobs); + CastNonDefaultStorage(post_temp_src, post_temp_dst, ctx, true); +} + +#define CHECK_RSP_ALL_ROWS_NON_ZERO(rsp, func, param) \ + { \ + CHECK(rsp.storage_shape()[0] == rsp.shape()[0]) << func \ + << " for RowSparse " << param << " is only implemented for " \ + << "RowSparse " << param << " with all rows containing non-zeros. " \ + << "Expects " << param << ".values.shape[0] (" << rsp.storage_shape()[0] \ + << ") == " << param << ".shape[0] (" << rsp.shape()[0] << ")."; \ + } + + } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_OPERATOR_COMMON_H_ diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 70759b15251a..28707aae4ce8 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -36,6 +36,7 @@ #include "./mshadow_op.h" #include "./elemwise_op_common.h" #include "mxnet_op.h" +#include "./tensor/init_op.h" namespace mxnet { namespace op { @@ -102,6 +103,167 @@ inline void SGDUpdate(const nnvm::NodeAttrs& attrs, }); } +/*! 
\brief kernel for sparse sgd + */ +template +struct SGDDnsRspKernel { + // DType is the output data type + // IType is row sparse idx type + // i is the ith row in row sparse gradient + template + MSHADOW_XINLINE static void Map(int i, const index_t row_length, DType* out, const DType* weight, + const IType* grad_idx, const DType *grad_val, + const DType clip_gradient, const DType lr, + const DType wd, const DType rescale_grad) { + for (index_t j = 0; j < row_length; j++) { + index_t data_i = grad_idx[i] * row_length + j; + index_t grad_i = i * row_length + j; + if (clip_gradient >= 0.0f) { + KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - + (lr) * mshadow_op::clip::Map(rescale_grad * grad_val[grad_i], clip_gradient)); + } else { + KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - + (lr * rescale_grad) * grad_val[grad_i]); + } + } + } +}; + +template +inline void SGDUpdateDnsRspImpl(const SGDParam& param, + const OpContext &ctx, + const TBlob& weight, + const NDArray& grad, + const OpReqType& req, + TBlob *out) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mshadow_op; + using namespace mxnet_op; + Stream* s = ctx.get_stream(); + CHECK_EQ(grad.storage_type(), kRowSparseStorage); + // if gradients are zeros, no weights are updated + if (!grad.storage_initialized() || req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse sgd_mom_update"; + CHECK_GT(weight.shape_.Size(), 0); + + MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(rowsparse::kIdx), IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + DType* weight_data = weight.dptr(); + IType* grad_idx = grad.aux_data(rowsparse::kIdx).dptr(); + DType* grad_val = grad.data().dptr(); + index_t num_rows = grad.aux_shape(rowsparse::kIdx)[0]; + auto row_length = weight.shape_.ProdShape(1, weight.ndim()); + Kernel, xpu>::Launch(s, num_rows, row_length, + out->dptr(), weight_data, grad_idx, grad_val, + static_cast(param.clip_gradient), + static_cast(param.lr), static_cast(param.wd), + static_cast(param.rescale_grad)); + }); + }); + }); +} + +/*! 
\brief kernel for sparse sgd + */ +template +struct SGDRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, const index_t num_cols, DType* out, const DType* weight, + const DType *grad, const DType clip_gradient, const DType lr, + const DType wd, const DType rescale_grad) { + bool contains_non_zeros = false; + index_t j = 0; + index_t offset = i * num_cols; + for (; j < num_cols; ++j) { + if (grad[offset + j] != 0) { + contains_non_zeros = true; + break; + } + } + if (!contains_non_zeros) return; + const DType rate = 1.f - lr * wd; + for (index_t j = 0; j < num_cols; j++) { + auto index = offset + j; + if (clip_gradient >= 0.0f) { + KERNEL_ASSIGN(out[index], req, rate * weight[index] - + lr * mshadow_op::clip::Map(rescale_grad * grad[index], clip_gradient)); + } else { + KERNEL_ASSIGN(out[index], req, rate * weight[index] - + lr * rescale_grad * grad[index]); + } + } + } +}; + +template +inline void SGDUpdateRspDnsImpl(const SGDParam& param, + const OpContext &ctx, + const NDArray& weight, + const TBlob& grad, + const OpReqType req, + NDArray *out) { + using namespace mshadow; + using namespace mxnet_op; + using namespace rowsparse; + CHECK_RSP_ALL_ROWS_NON_ZERO(weight, "SGDUpdate", "weights"); + CHECK_EQ(weight.storage_type(), kRowSparseStorage); + if (req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse sgd_update"; + CHECK(weight.storage_initialized()); + Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(weight.dtype(), DType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + DType* weight_data = weight.data().dptr(); + DType* grad_data = grad.dptr(); + index_t num_rows = weight.aux_shape(kIdx)[0]; + auto num_cols = weight.shape().ProdShape(1, weight.shape().ndim()); + Kernel, xpu>::Launch(s, num_rows, num_cols, + out->data().dptr(), weight_data, grad_data, + static_cast(param.clip_gradient), + static_cast(param.lr), static_cast(param.wd), + static_cast(param.rescale_grad)); + }); + }); +} + +template +inline void SGDUpdateRspRspImpl(const SGDParam& param, + const OpContext& ctx, + const NDArray& weight, + const NDArray& grad, + const OpReqType& req, + NDArray *out) { + CHECK_RSP_ALL_ROWS_NON_ZERO(weight, "SGDUpdate", "weights"); + // reuse dns rsp implementation when storage_shape == shape + TBlob out_blob = out->data(); + SGDUpdateDnsRspImpl(param, ctx, weight.data(), grad, req, &out_blob); +} + +template +inline void SGDUpdateEx(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mshadow_op; + const SGDParam& param = nnvm::get(attrs.parsed); + auto weight_stype = inputs[0].storage_type(); + auto grad_stype = inputs[1].storage_type(); + if (weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage) { + NDArray out = outputs[0]; + SGDUpdateRspRspImpl(param, ctx, inputs[0], inputs[1], req[0], &out); + } else if (weight_stype == kRowSparseStorage && grad_stype == kDefaultStorage) { + NDArray out = outputs[0]; + SGDUpdateRspDnsImpl(param, ctx, inputs[0], inputs[1].data(), req[0], &out); + } else { + FCompExFallback(attrs, ctx, inputs, req, outputs, SGDUpdate, "SGDUpdate"); + } +} + struct SGDMomParam : public dmlc::Parameter { float lr; float momentum; @@ -275,6 +437,196 @@ inline void MP_SGDMomUpdate(const nnvm::NodeAttrs& attrs, }); } +template +struct SGDMomDnsRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, index_t row_length, 
DType* out_data, + DType* mom_data, const DType* weight_data, const IType* grad_idx, + const DType* grad_data, const DType clip_gradient, const DType momentum, + const DType lr, const DType wd, const DType rescale_grad) { + const DType rate = lr * wd; + for (index_t j = 0; j < row_length; j++) { + index_t data_i = grad_idx[i] * row_length + j; + index_t grad_i = i * row_length + j; + if (clip_gradient >= 0.0f) { + mom_data[data_i] = momentum * mom_data[data_i] + - rate * weight_data[data_i] + - lr * + mshadow_op::clip::Map(rescale_grad * grad_data[grad_i], + clip_gradient); + } else { + mom_data[data_i] = momentum * mom_data[data_i] + - rate * weight_data[data_i] + - lr * rescale_grad * grad_data[grad_i]; + } + KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]); + } + } +}; + +template +inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param, + const OpContext& ctx, + const TBlob& weight, + const NDArray& grad, + const TBlob& mom, + const OpReqType& req, + TBlob *out) { + using namespace mxnet_op; + using namespace rowsparse; + Stream* s = ctx.get_stream(); + if (!grad.storage_initialized() || req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse sgd_mom_update"; + CHECK_GT(weight.shape_.Size(), 0); + CHECK_GT(mom.shape_.Size(), 0); + + MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(kIdx), IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + DType* weight_data = weight.dptr(); + IType* grad_idx = grad.aux_data(kIdx).dptr(); + DType* grad_val = grad.data().dptr(); + DType* mom_data = mom.dptr(); + DType* out_data = out->dptr(); + index_t num_rows = grad.aux_shape(kIdx)[0]; + auto row_length = weight.shape_.ProdShape(1, weight.ndim()); + Kernel, xpu>::Launch(s, num_rows, row_length, + out_data, mom_data, weight_data, grad_idx, grad_val, + static_cast(param.clip_gradient), static_cast(param.momentum), + static_cast(param.lr), static_cast(param.wd), + static_cast(param.rescale_grad)); + }); + }); + }); +} + +template +struct SGDMomRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, index_t num_cols, DType* out, DType* mom, + const DType* weight, const DType *grad, + const DType clip_gradient, const DType momentum, + const DType lr, const DType wd, const DType rescale_grad) { + bool contains_non_zeros = false; + index_t j = 0; + index_t offset = i * num_cols; + for (; j < num_cols; ++j) { + if (grad[offset + j] != 0) { + contains_non_zeros = true; + break; + } + } + if (!contains_non_zeros) return; + const DType rate = lr * wd; + for (index_t j = 0; j < num_cols; j++) { + auto index = offset + j; + if (clip_gradient >= 0.0f) { + mom[index] = momentum * mom[index] - rate * weight[index] + - lr * mshadow_op::clip::Map(rescale_grad * grad[index], clip_gradient); + } else { + mom[index] = momentum * mom[index] - rate * weight[index] + - lr * rescale_grad * grad[index]; + } + KERNEL_ASSIGN(out[index], req, weight[index] + mom[index]); + } + } +}; + +template +inline void SGDMomUpdateRspDnsImpl(const SGDMomParam& param, + const OpContext &ctx, + const NDArray& weight, + const TBlob& grad, + const NDArray& mom, + const OpReqType req, + NDArray *out) { + using namespace mshadow; + using namespace mxnet_op; + using namespace rowsparse; + CHECK_RSP_ALL_ROWS_NON_ZERO(weight, "SGDMomUpdate", "weights"); + Stream* s = ctx.get_stream(); + CHECK_EQ(weight.storage_type(), kRowSparseStorage); + if (req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is 
expected for sparse sgd_mom_update"; + CHECK(weight.storage_initialized()); + // fill mom with zero values if not initialized yet + if (!mom.storage_initialized()) { + NDArray mom_zeros = mom; + FillDnsZerosRspImpl(s, &mom_zeros); + } + MSHADOW_REAL_TYPE_SWITCH(weight.dtype(), DType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + DType* weight_data = weight.data().dptr(); + DType* grad_data = grad.dptr(); + DType* mom_data = mom.data().dptr(); + index_t num_rows = weight.aux_shape(kIdx)[0]; + auto num_cols = weight.shape().ProdShape(1, weight.shape().ndim()); + Kernel, xpu>::Launch(s, num_rows, num_cols, + out->data().dptr(), mom_data, weight_data, grad_data, + static_cast(param.clip_gradient), static_cast(param.momentum), + static_cast(param.lr), static_cast(param.wd), + static_cast(param.rescale_grad)); + }); + }); +} + + +template +inline void SGDMomUpdateRspRspRspImpl(const SGDMomParam& param, + const OpContext& ctx, + const NDArray& weight, + const NDArray& grad, + const NDArray& mom, + const OpReqType& req, + NDArray *out) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mxnet_op; + using namespace rowsparse; + CHECK_RSP_ALL_ROWS_NON_ZERO(weight, "SGDMomUpdate", "weights"); + Stream* s = ctx.get_stream(); + // fill mom with zero values in order to reuse the sgd mom dns impl + if (!mom.storage_initialized()) { + NDArray mom_zeros = mom; + FillDnsZerosRspImpl(s, &mom_zeros); + } + TBlob out_blob = out->data(); + // reuse dns rsp implementation when storage_shape == shape + SGDMomUpdateDnsRspDnsImpl(param, ctx, weight.data(), grad, + mom.data(), req, &out_blob); +} + +template +inline void SGDMomUpdateEx(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + const SGDMomParam& param = nnvm::get(attrs.parsed); + auto &weight = inputs[0]; + auto &grad = inputs[1]; + auto &mom = inputs[2]; + auto weight_stype = weight.storage_type(); + auto grad_stype = grad.storage_type(); + auto mom_stype = mom.storage_type(); + CHECK_EQ(weight_stype, mom_stype) << "Inconsistent storage type detected between mom.stype = " + << mom_stype << " and weight.stype = " << weight_stype; + if (weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage && + mom_stype == kRowSparseStorage) { + NDArray out = outputs[0]; + SGDMomUpdateRspRspRspImpl(param, ctx, weight, grad, mom, req[0], &out); + } else if (weight_stype == kRowSparseStorage && grad_stype == kDefaultStorage && + mom_stype == kRowSparseStorage) { + NDArray out = outputs[0]; + SGDMomUpdateRspDnsImpl(param, ctx, weight, grad.data(), mom, req[0], &out); + } else { + // inputs[2] is a mutable input + FCompExFallback(attrs, ctx, inputs, req, outputs, + SGDMomUpdate, "SGDMomUpdate", {2}); + } +} + struct AdamParam : public dmlc::Parameter { float lr; float beta1; @@ -348,6 +700,147 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs, }); } +/*! + * Note: this kernel performs sparse adam update. For each row-slice in row_sparse + * gradient, it finds the corresponding elements in weight, mean and var and performs + * the update. 
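+ * For every element of a row listed in grad.indices the update is the usual Adam
+ * step (clip_gradient, when non-negative, is applied to grad_rescaled first):
+ *   grad_rescaled = rescale_grad * grad + wd * weight
+ *   mean = beta1 * mean + (1 - beta1) * grad_rescaled
+ *   var  = beta2 * var  + (1 - beta2) * grad_rescaled^2
+ *   out  = weight - lr * mean / (sqrt(var) + epsilon)
+ *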
+ * The kernel assumes dense weight/mean/var, and row_sparse gradient + */ +template +struct AdamDnsRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, + DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, + const DType* grad_data, const DType clip_gradient, const DType beta1, const DType beta2, + const DType lr, const DType wd, const DType epsilon, const DType rescale_grad) { + using nnvm::dim_t; + using namespace mshadow_op; + const dim_t row_offset = grad_idx[i] * row_length; + for (dim_t j = 0; j < row_length; j++) { + // index in data/mean/var + const dim_t data_i = row_offset + j; + // index in grad + const dim_t grad_i = i * row_length + j; + const DType grad_rescaled = grad_data[grad_i] * rescale_grad + weight_data[data_i] * wd; + if (clip_gradient >= 0.0f) { + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * + clip::Map(grad_rescaled, clip_gradient); + var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * square::Map( + clip::Map(grad_rescaled, clip_gradient)); + } else { + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; + var_data[data_i] = beta2 * var_data[data_i] + + (1.f - beta2) * grad_rescaled * grad_rescaled; + } + KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] - lr * mean_data[data_i] / + (square_root::Map(var_data[data_i]) + epsilon)); + } + } +}; + + +template +inline void AdamUpdateDnsRspDnsImpl(const AdamParam& param, + const OpContext& ctx, + const TBlob& weight, + const NDArray& grad, + const TBlob& mean, + const TBlob& var, + const OpReqType& req, + TBlob *out) { + using namespace mxnet_op; + using namespace rowsparse; + Stream* s = ctx.get_stream(); + if (!grad.storage_initialized() || req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse adam_update"; + CHECK_GT(weight.shape_.Size(), 0); + CHECK_GT(mean.shape_.Size(), 0); + CHECK_GT(var.shape_.Size(), 0); + + MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(kIdx), IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + const DType* weight_data = weight.dptr(); + const IType* grad_idx = grad.aux_data(kIdx).dptr(); + const DType* grad_val = grad.data().dptr(); + DType* mean_data = mean.dptr(); + DType* var_data = var.dptr(); + DType* out_data = out->dptr(); + nnvm::dim_t num_rows = grad.aux_shape(kIdx)[0]; + const auto row_length = weight.shape_.ProdShape(1, weight.ndim()); + Kernel, xpu>::Launch(s, num_rows, row_length, + out_data, mean_data, var_data, weight_data, grad_idx, grad_val, + static_cast(param.clip_gradient), static_cast(param.beta1), + static_cast(param.beta2), static_cast(param.lr), + static_cast(param.wd), static_cast(param.epsilon), + static_cast(param.rescale_grad)); + }); + }); + }); +} + +template +inline void AdamUpdateRspRspRspImpl(const AdamParam& param, + const OpContext& ctx, + const NDArray& weight, + const NDArray& grad, + const NDArray& mean, + const NDArray& var, + const OpReqType& req, + NDArray *out) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mxnet_op; + using namespace rowsparse; + CHECK_RSP_ALL_ROWS_NON_ZERO(weight, "AdamUpdate", "weights"); + Stream* s = ctx.get_stream(); + // fill mean and variance with zero values in order to reuse the sgd mom dns impl + if (!mean.storage_initialized()) { + NDArray mean_zeros = mean; + FillDnsZerosRspImpl(s, &mean_zeros); + } + if (!var.storage_initialized()) { + NDArray 
var_zeros = var; + FillDnsZerosRspImpl(s, &var_zeros); + } + TBlob out_blob = out->data(); + // reuse dns rsp implementation when storage_shape == shape + AdamUpdateDnsRspDnsImpl(param, ctx, weight.data(), grad, mean.data(), + var.data(), req, &out_blob); +} + + +template +inline void AdamUpdateEx(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const AdamParam& param = nnvm::get(attrs.parsed); + mshadow::Stream* s = ctx.get_stream(); + const auto weight_stype = inputs[0].storage_type(); + const auto grad_stype = inputs[1].storage_type(); + const auto mean_stype = inputs[2].storage_type(); + const auto var_stype = inputs[3].storage_type(); + + const auto out_stype = outputs[0].storage_type(); + CHECK_EQ(mean_stype, weight_stype) << "Inconsistent storage type detected between " + << " mean.stype = " << mean_stype << " and weight.stype = " << weight_stype; + CHECK_EQ(var_stype, weight_stype) << "Inconsistent storage type detected between " + << " var.stype = " << var_stype << " and weight.stype = " << weight_stype; + if (weight_stype == kRowSparseStorage && mean_stype == kRowSparseStorage && + var_stype == kRowSparseStorage && grad_stype == kRowSparseStorage && + out_stype == kRowSparseStorage) { + NDArray out = outputs[0]; + AdamUpdateRspRspRspImpl(param, ctx, inputs[0], inputs[1], inputs[2], + inputs[3], req[0], &out); + } else { + LOG(FATAL) << "Unexpected storage types: weight.stype = " << weight_stype + << ", var.stype = " << var_stype << ", mean.stype = " << mean_stype + << ", grad.stype = " << grad_stype; + } +} + // This RMSProp code follows the version in // http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) // by Alex Graves, 2013. diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index b26c333edaef..9b2b088c5095 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -40,6 +40,9 @@ It updates the weights using:: weight = weight - learning_rate * gradient +If weight is stored with `row_sparse` storage type, +only the row slices whose indices appear in grad.indices are updated. + )code" ADD_FILELINE) .set_num_inputs(2) .set_num_outputs(1) @@ -47,6 +50,7 @@ It updates the weights using:: .set_attr("FInferShape", ElemwiseShape<2, 1>) .set_attr("FInferType", ElemwiseType<2, 1>) .set_attr("FCompute", SGDUpdate) +.set_attr("FComputeEx", SGDUpdateEx) .add_argument("weight", "NDArray-or-Symbol", "Weight") .add_argument("grad", "NDArray-or-Symbol", "Gradient") .add_arguments(SGDParam::__FIELDS__()); @@ -70,6 +74,9 @@ It updates the weights using:: Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. +If weights are stored with `row_sparse` storage type, +only the row slices whose indices appear in grad.indices are updated (for both weight and momentum). 
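+
+For example, with ``row_sparse`` weight, momentum and gradient, only the rows listed
+in grad.indices are touched, and the update is roughly (ignoring ``clip_gradient``)::
+
+  for row in grad.indices:
+      mom[row] = momentum * mom[row] - lr * (rescale_grad * grad[row] + wd * weight[row])
+      weight[row] = weight[row] + mom[row]
+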
+ )code" ADD_FILELINE) .set_num_inputs(3) .set_num_outputs(1) @@ -81,6 +88,7 @@ Where the parameter ``momentum`` is the decay rate of momentum estimates at each return std::vector{2}; }) .set_attr("FCompute", SGDMomUpdate) +.set_attr("FComputeEx", SGDMomUpdateEx) .add_argument("weight", "NDArray-or-Symbol", "Weight") .add_argument("grad", "NDArray-or-Symbol", "Gradient") .add_argument("mom", "NDArray-or-Symbol", "Momentum") @@ -152,6 +160,7 @@ It updates the weights using:: return std::vector{2, 3}; }) .set_attr("FCompute", AdamUpdate) +.set_attr("FComputeEx", AdamUpdateEx) .add_argument("weight", "NDArray-or-Symbol", "Weight") .add_argument("grad", "NDArray-or-Symbol", "Gradient") .add_argument("mean", "NDArray-or-Symbol", "Moving mean") diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 0e74e303dbc9..fe45f4be8c66 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -28,10 +28,12 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(sgd_update) -.set_attr("FCompute", SGDUpdate); +.set_attr("FCompute", SGDUpdate) +.set_attr("FComputeEx", SGDUpdateEx); NNVM_REGISTER_OP(sgd_mom_update) -.set_attr("FCompute", SGDMomUpdate); +.set_attr("FCompute", SGDMomUpdate) +.set_attr("FComputeEx", SGDMomUpdateEx); NNVM_REGISTER_OP(mp_sgd_update) .set_attr("FCompute", MP_SGDUpdate); @@ -40,7 +42,8 @@ NNVM_REGISTER_OP(mp_sgd_mom_update) .set_attr("FCompute", MP_SGDMomUpdate); NNVM_REGISTER_OP(adam_update) -.set_attr("FCompute", AdamUpdate); +.set_attr("FCompute", AdamUpdate) +.set_attr("FComputeEx", AdamUpdateEx); NNVM_REGISTER_OP(rmsprop_update) .set_attr("FCompute", RMSPropUpdate); diff --git a/src/operator/random/sample_op.cc b/src/operator/random/sample_op.cc index 8d87d2b99d14..363163cbc697 100644 --- a/src/operator/random/sample_op.cc +++ b/src/operator/random/sample_op.cc @@ -61,7 +61,8 @@ Example:: [ 0.54488319, 0.84725171]] )code" ADD_FILELINE) -.set_attr("FCompute", SampleUniform_); +.set_attr("FCompute", SampleUniform_) +.set_attr("FComputeEx", SampleUniformEx_); // Add "normal" alias for backward compatibility MXNET_OPERATOR_REGISTER_SAMPLE(random_normal, SampleNormalParam) @@ -78,7 +79,8 @@ Example:: random_normal(loc=0, scale=1, shape=(2,2)) = [[ 1.89171135, -1.16881478], [-1.23474145, 1.55807114]] )code" ADD_FILELINE) -.set_attr("FCompute", SampleNormal_); +.set_attr("FCompute", SampleNormal_) +.set_attr("FComputeEx", SampleNormalEx_); MXNET_OPERATOR_REGISTER_SAMPLE(random_gamma, SampleGammaParam) .add_alias("_sample_gamma") @@ -91,7 +93,8 @@ Example:: random_gamma(alpha=9, beta=0.5, shape=(2,2)) = [[ 7.10486984, 3.37695289], [ 3.91697288, 3.65933681]] )code" ADD_FILELINE) -.set_attr("FCompute", SampleGamma_); +.set_attr("FCompute", SampleGamma_) +.set_attr("FComputeEx", SampleGammaEx_); MXNET_OPERATOR_REGISTER_SAMPLE(random_exponential, SampleExponentialParam) .add_alias("_sample_exponential") diff --git a/src/operator/random/sample_op.cu b/src/operator/random/sample_op.cu index 0d4b2e5a8270..7bdb9faf334e 100644 --- a/src/operator/random/sample_op.cu +++ b/src/operator/random/sample_op.cu @@ -28,21 +28,20 @@ namespace op { // GPU versions of uniform and normal distribution. 
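+// The FComputeEx variants (SampleUniformEx_, SampleNormalEx_) registered below reuse
+// these dense implementations through SampleComputeEx_: for a row_sparse output they
+// allocate a row index array covering every row (PopulateFullIdxRspImpl) and then
+// sample directly into the value blob.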
template<> -void SampleUniform_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SampleUniformDnsImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const OpReqType& req, + TBlob* output) { using namespace mxnet::op; using namespace mshadow::expr; typedef gpu xpu; mshadow::Stream *s = ctx.get_stream(); const SampleUniformParam& param = nnvm::get(attrs.parsed); mshadow::Random *prnd = ctx.requested[0].get_random(s); - if (outputs[0].type_flag_ != mshadow::kFloat32) { - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + if (output->type_flag_ != mshadow::kFloat32) { + MSHADOW_REAL_TYPE_SWITCH(output->type_flag_, DType, { // Not float32: use workspace and copy to output - mshadow::Tensor out = outputs[0].FlatTo2D(s); + mshadow::Tensor out = output->FlatTo2D(s); mshadow::Tensor workspace = ctx.requested[1].get_space_typed (mshadow::Shape1(out.shape_.Size()), s); @@ -51,27 +50,36 @@ void SampleUniform_(const nnvm::NodeAttrs& attrs, }); } else { // float32: write directly into output - mshadow::Tensor out = outputs[0].FlatTo2D(s); + mshadow::Tensor out = output->FlatTo2D(s); prnd->SampleUniform(&out, param.low, param.high); } } template<> -void SampleNormal_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SampleUniform_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TBlob out = outputs[0]; + SampleUniformDnsImpl(attrs, ctx, req[0], &out); +} + +template<> +void SampleNormalDnsImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const OpReqType& req, + TBlob* output) { using namespace mxnet::op; using namespace mshadow::expr; typedef gpu xpu; mshadow::Stream *s = ctx.get_stream(); const SampleNormalParam& param = nnvm::get(attrs.parsed); mshadow::Random *prnd = ctx.requested[0].get_random(s); - if (outputs[0].type_flag_ != mshadow::kFloat32) { - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + if (output->type_flag_ != mshadow::kFloat32) { + MSHADOW_REAL_TYPE_SWITCH(output->type_flag_, DType, { // Not float32: use workspace and copy to output - mshadow::Tensor out = outputs[0].FlatTo2D(s); + mshadow::Tensor out = output->FlatTo2D(s); mshadow::Tensor workspace = ctx.requested[1].get_space_typed (mshadow::Shape1(out.shape_.Size()), s); @@ -80,16 +88,28 @@ void SampleNormal_(const nnvm::NodeAttrs& attrs, }); } else { // float32: write directly into output - mshadow::Tensor out = outputs[0].FlatTo2D(s); + mshadow::Tensor out = output->FlatTo2D(s); prnd->SampleGaussian(&out, param.loc, param.scale); } } +template<> +void SampleNormal_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TBlob out = outputs[0]; + SampleNormalDnsImpl(attrs, ctx, req[0], &out); +} + NNVM_REGISTER_OP(random_uniform) -.set_attr("FCompute", SampleUniform_); +.set_attr("FCompute", SampleUniform_) +.set_attr("FComputeEx", SampleUniformEx_); NNVM_REGISTER_OP(random_normal) -.set_attr("FCompute", SampleNormal_); +.set_attr("FCompute", SampleNormal_) +.set_attr("FComputeEx", SampleNormalEx_); } // namespace op } // namespace mxnet diff --git a/src/operator/random/sample_op.h b/src/operator/random/sample_op.h index a1a6a2345b1b..0cd3f6bc2efb 100644 --- a/src/operator/random/sample_op.h +++ 
b/src/operator/random/sample_op.h @@ -232,29 +232,75 @@ struct SampleGenNegBinomialParam : public dmlc::Parameter; + template -void SampleUniform_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SampleComputeEx_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs, + FSampleCompute fcomp) { + NDArray output = outputs[0]; + mshadow::Stream *s = ctx.get_stream(); + if (output.storage_type() == kRowSparseStorage) { + // indices + nnvm::dim_t nnr = output.shape()[0]; + output.CheckAndAlloc({mshadow::Shape1(nnr)}); + PopulateFullIdxRspImpl(s, &output); + // data + TBlob out_blob = output.data(); + fcomp(attrs, ctx, req[0], &out_blob); + } else { + LOG(FATAL) << "Unexpected storage type for SampleComputeEx_: " + << output.storage_type(); + } +} + +template +void SampleUniformDnsImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const OpReqType& req, + TBlob* output) { using namespace mxnet::op; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); const SampleUniformParam& param = nnvm::get(attrs.parsed); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_REAL_TYPE_SWITCH(output->type_flag_, DType, { mshadow::Random *prnd = ctx.requested[0].get_random(s); - mshadow::Tensor out = outputs[0].FlatTo2D(s); + mshadow::Tensor out = output->FlatTo2D(s); prnd->SampleUniform(&out, param.low, param.high); }); } template -void SampleNormal_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SampleUniform_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TBlob out = outputs[0]; + SampleUniformDnsImpl(attrs, ctx, req[0], &out); +} + + +template +void SampleUniformEx_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + SampleComputeEx_(attrs, ctx, inputs, req, outputs, SampleUniformDnsImpl); +} + +template +void SampleNormalDnsImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const OpReqType& req, + TBlob* outputs) { using namespace mxnet::op; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); @@ -268,11 +314,29 @@ void SampleNormal_(const nnvm::NodeAttrs& attrs, } template -void SampleGamma_(const nnvm::NodeAttrs& attrs, +void SampleNormal_(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + TBlob out = outputs[0]; + SampleNormalDnsImpl(attrs, ctx, req[0], &out); +} + +template +void SampleNormalEx_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + SampleComputeEx_(attrs, ctx, inputs, req, outputs, SampleNormalDnsImpl); +} + +template +void SampleGammaDnsImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const OpReqType& req, + TBlob* outputs) { using namespace mxnet::op; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); @@ -286,6 +350,25 @@ void SampleGamma_(const nnvm::NodeAttrs& attrs, }); } +template +void SampleGamma_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TBlob out = 
outputs[0]; + SampleGammaDnsImpl(attrs, ctx, req[0], &out); +} + +template +void SampleGammaEx_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + SampleComputeEx_(attrs, ctx, inputs, req, outputs, SampleGammaDnsImpl); +} + template void SampleExponential_(const nnvm::NodeAttrs& attrs, const OpContext& ctx, diff --git a/src/operator/tensor/cast_storage-inl.cuh b/src/operator/tensor/cast_storage-inl.cuh new file mode 100644 index 000000000000..afef53e979ea --- /dev/null +++ b/src/operator/tensor/cast_storage-inl.cuh @@ -0,0 +1,589 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file cast_storage-inl.cuh + * \brief implementation of cast_storage op on GPU + */ +#ifndef MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_CUH_ +#define MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_CUH_ + +#include +#include +#include +#include +#include "./util/tensor_util-inl.cuh" + +namespace mxnet { +namespace op { + +/*! + * \brief GPU Kernel for filling the value array of the rsp tensor. + * Parallelized by rsp tensor elements: 1 thread/element + */ +struct CastDnsRspValsKernel { + /*! + * \brief + * \param tid global thread id + * \param rsp_val value array of rsp tensor to store data + * \param row_idx indices of non-zero rows + * \param dns dense matrix data + * \param nnr number of non-zero rows + * \param row_length number of elements per row + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* rsp_val, + const RType* row_idx, + const DType* dns, + const nnvm::dim_t nnr, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + if (tid < nnr*row_length) { + const dim_t row_id = tid / row_length; + const dim_t row_el = tid % row_length; + const dim_t dns_idx = row_idx[row_id] * row_length + row_el; + rsp_val[tid] = dns[dns_idx]; + } + } +}; + +template +inline mshadow::Tensor AllocateTempDataForCast(const OpContext& op_ctx, + const mshadow::Shape& shape) { + Resource rsc = ResourceManager::Get()->Request(op_ctx.run_ctx.ctx, + ResourceRequest(ResourceRequest::kTempSpace)); + mshadow::Stream *stream = op_ctx.run_ctx.get_stream(); + return rsc.get_space_typed(shape, stream); +}; + +/*! + * \brief GPU implementation of casting a dns tensor to rsp type. 
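+ *
+ * The conversion runs in three steps:
+ *   1. mark every non-zero row of the dense input in a temporary row_flg array
+ *      (a thread, warp, or block kernel is chosen based on the row length),
+ *   2. run an inclusive prefix sum over row_flg with cub::DeviceScan, so that
+ *      row_flg[num_rows-1] gives the number of non-zero rows (nnr) and each
+ *      marked row knows its output position,
+ *   3. allocate the row index and value arrays for nnr rows and gather the
+ *      non-zero rows of the dense input into them.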
+ */ +inline void CastStorageDnsRspImpl(const OpContext& ctx, + const gpu& gpu_dev, + const TBlob& dns, + NDArray* rsp) { + CHECK(rsp != nullptr); + CHECK_EQ(rsp->storage_type(), kRowSparseStorage); + CHECK_EQ(dns.shape_, rsp->shape()); + using mshadow::Shape1; + using mxnet_op::Kernel; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(rsp->aux_type(rowsparse::kIdx), RType, { // row idx type + const dim_t num_rows = dns.shape_[0]; + const dim_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim()); + const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + const dim_t threads_per_block = mshadow::cuda::kBaseThreadNum; + const dim_t min_num_warps = 512; + dim_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "CastStorageDnsRspImpl GPU kernels expect warpSize=32"; + } + // Determine temporary device storage requirements + dim_t* row_flg = NULL; + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg, + row_flg, + num_rows, + mshadow::Stream::GetStream(s)); + + // Allocate temp storage for marking non-zero rows and for cub's prefix sum + auto workspace = AllocateTempDataForCast(ctx, Shape1(num_rows*sizeof(dim_t) + + temp_storage_bytes)); + row_flg = reinterpret_cast(workspace.dptr_); + d_temp_storage = workspace.dptr_ + num_rows*sizeof(dim_t); + + // Mark non-zero rows as 'one' in row_flg + // Different kernel versions are optimized for different matrix instances + // (1) 'Thread kernel' (one thread computing one row) + // (2) 'Warp kernel' (one warp computing one row) + // (3) 'Block kernel' (one thread block computing one row) + const int kernel_version = 0; + switch (kernel_version) { + case 1: + num_threads = num_rows; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + break; + case 2: + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + break; + case 3: + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + break; + default: + if (row_length < threads_per_warp) { + num_threads = num_rows; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + } else if (row_length < threads_per_block || num_rows > min_num_warps) { + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + } else { + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + } + break; + } + // Compute non-zero row indices through inclusive prefix sum + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg, + row_flg, + num_rows, + mshadow::Stream::GetStream(s)); + + // Get total number of non-zero rows from device + dim_t nnr = 0; + CUDA_CALL(cudaMemcpy(&nnr, &row_flg[num_rows-1], sizeof(dim_t), cudaMemcpyDeviceToHost)); + + // Allocate rsp tensor row index array and fill + rsp->CheckAndAllocAuxData(rowsparse::kIdx, Shape1(nnr)); + if (0 == nnr) return; + RType* row_idx = rsp->aux_data(rowsparse::kIdx).dptr(); + num_threads = num_rows; + Kernel::Launch(s, num_threads, + row_idx, row_flg, num_rows); + + // Construct shape of rsp tensor data, allocate, and fill + auto storage_shape = dns.shape_; + 
storage_shape[0] = nnr; + rsp->CheckAndAllocData(storage_shape); + num_threads = nnr * row_length; + Kernel::Launch(s, num_threads, + rsp->data().dptr(), row_idx, dns.dptr(), nnr, row_length); + }); + }); +} + +/*! + * \brief Thread kernel for initializing the indptr in a csr matrix. + * Parallelized by matrix rows: 1 thread/row + */ +struct CastDnsCsrIndPtrThreadKernel { + /*! + * \brief + * \param tid global thread id + * \param indptr index pointer array of the csr matrix + * \param dns dense matrix + * \param num_rows number of rows of the dense matrix + * \param num_cols number of columns of the dense matrix + */ + template + __device__ __forceinline__ static void Map(int tid, + IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + if (tid == 0) { + indptr[tid] = 0; + } + if (tid < num_rows) { + dim_t nnz = 0; + const dim_t offset = tid * num_cols; + for (dim_t j = 0; j < num_cols; ++j) { + if (dns[offset+j] != 0) { + nnz++; + } + } + indptr[tid+1] = nnz; + } + } +}; + +/*! + * \brief Thread kernel for initializing the col_idx and value array of the csr matrix. + * Parallelized by matrix rows: 1 thread/row + */ +struct CastDnsCsrColIdxAndValsThreadKernel { + /*! + * \brief + * \param tid global thread id + * \param val data array of the csr matrix + * \param col_idx column index array of the csr matrix + * \param indptr index pointer array of the csr matrix + * \param dns dense matrix + * \param num_rows number of rows of the dense matrix + * \param num_cols number of columns of the dense matrix + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* val, + CType* col_idx, + const IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + if (tid < num_rows) { + const dim_t offset = tid * num_cols; + dim_t k = indptr[tid]; + for (dim_t j = 0; j < num_cols; ++j) { + if (dns[offset+j] != 0) { + val[k] = dns[offset+j]; + col_idx[k] = j; + ++k; + } + } + } + } +}; + +/*! + * \brief Warp kernel for initializing the indptr in a csr matrix. + * Parallelized by matrix rows: 1 warp/row + */ +struct CastDnsCsrIndPtrWarpKernel { + template + __device__ __forceinline__ static void Map(int tid, + IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + typedef cub::WarpReduce WarpReduce; + const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; + __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block]; + + if (tid == 0) { + indptr[tid] = 0; + } + const dim_t warp_id = tid / 32; // global warp id + const dim_t warp_lane = threadIdx.x / 32; // local warp id within thread block + const dim_t lane = tid & (32-1); // local thread id within warp + if (warp_id < num_rows) { + dim_t lane_nnz = 0; + const dim_t offset = warp_id * num_cols; + for (dim_t j = lane; j < num_cols; j+=32) { + if (dns[offset+j] != 0) { + lane_nnz++; + } + } + dim_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(lane_nnz); + if (lane == 0) { + indptr[warp_id+1] = aggr; + } + } + } +}; + +/*! + * \brief Warp kernel for initializing the col_idx and value array of the csr matrix. 
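+ * Each warp runs an exclusive prefix scan (cub::WarpScan) over its lanes' per-iteration non-zero counts + * to derive the write offsets of the column indices and values within its row.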
+ * Parallelized by matrix rows: 1 warp/row + */ +struct CastDnsCsrColIdxAndValsWarpKernel { + template + __device__ __forceinline__ static void Map(int tid, + DType* val, + CType* col_idx, + const IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + typedef cub::WarpScan WarpScan; + const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; + __shared__ typename WarpScan::TempStorage temp_storage[warps_per_block]; + __shared__ volatile dim_t warp_nnz[warps_per_block]; + + const dim_t warp_id = tid / 32; // global warp id + const dim_t warp_lane = threadIdx.x / 32; // local warp id within thread block + const dim_t lane = tid & (32-1); // local thread id within warp + if (warp_id < num_rows) { + const dim_t offset = warp_id * num_cols; + dim_t k = indptr[warp_id]; + dim_t nnz; + for (dim_t j = lane; j < num_cols+lane; j+=32) { + nnz = 0; + if (j < num_cols) { + if (dns[offset+j] != 0) { + nnz++; + } + } + if (lane == 31) { + warp_nnz[warp_lane] = nnz; + } + // Compute index each thread has to write to + WarpScan(temp_storage[warp_lane]).ExclusiveSum(nnz, nnz); + if (j < num_cols) { + if (dns[offset+j] != 0) { + val[k+nnz] = dns[offset+j]; + col_idx[k+nnz] = j; + } + } + if (lane == 31) { + warp_nnz[warp_lane] += nnz; + } + __syncwarp(); + k += warp_nnz[warp_lane]; + } + } + } +}; + +/*! + * \brief Block kernel for initializing the indptr in a csr matrix. + * Parallelized by matrix rows: 1 threadBlock/row + */ +struct CastDnsCsrIndPtrBlockKernel { + template + __device__ __forceinline__ static void Map(int tid, + IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using mshadow::cuda::kBaseThreadNum; + using nnvm::dim_t; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + if (tid == 0) { + indptr[tid] = 0; + } + if (blockIdx.x < num_rows) { + dim_t lane_nnz = 0; + const dim_t offset = blockIdx.x * num_cols; + for (dim_t j = threadIdx.x; j < num_cols; j+=kBaseThreadNum) { + if (dns[offset+j] != 0) { + lane_nnz++; + } + } + dim_t aggr = BlockReduce(temp_storage).Sum(lane_nnz); + if (threadIdx.x == 0) { + indptr[blockIdx.x+1] = aggr; + } + } + } +}; + +/*! + * \brief Block kernel for initializing the col_idx and value array of the csr matrix. + * Parallelized by matrix rows: 1 threadBlock/row + */ +struct CastDnsCsrColIdxAndValsBlockKernel { + template + __device__ __forceinline__ static void Map(int tid, + DType* val, + CType* col_idx, + const IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using mshadow::cuda::kBaseThreadNum; + using nnvm::dim_t; + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + __shared__ volatile dim_t block_nnz; + + if (blockIdx.x < num_rows) { + const dim_t offset = blockIdx.x * num_cols; + dim_t k = indptr[blockIdx.x]; + dim_t nnz; + for (dim_t j = threadIdx.x; j < num_cols+threadIdx.x; j+=kBaseThreadNum) { + nnz = 0; + if (j < num_cols) { + if (dns[offset+j] != 0) { + nnz++; + } + } + if (threadIdx.x == kBaseThreadNum-1) { + block_nnz = nnz; + } + // Compute index each thread has to write to + BlockScan(temp_storage).ExclusiveSum(nnz, nnz); + if (j < num_cols) { + if (dns[offset+j] != 0) { + val[k+nnz] = dns[offset+j]; + col_idx[k+nnz] = j; + } + } + if (threadIdx.x == kBaseThreadNum-1) { + block_nnz += nnz; + } + __syncthreads(); + k += block_nnz; + } + } + } +}; + +/*! 
+ * \brief GPU implementation of casting a dense matrix to csr type. + */ +inline void CastStorageDnsCsrImpl(const OpContext& ctx, + const gpu& gpu_dev, + const TBlob& dns, + NDArray* csr) { + CHECK(csr != nullptr); + CHECK_EQ(csr->storage_type(), kCSRStorage); + CHECK_EQ(dns.shape_.ndim(), 2); + CHECK_EQ(dns.shape_, csr->shape()); + using mshadow::Shape1; + using mxnet_op::Kernel; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIndPtr), IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIdx), CType, { // col_idx type + const dim_t num_rows = dns.shape_[0]; + const dim_t num_cols = dns.shape_[1]; + const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + const dim_t threads_per_block = mshadow::cuda::kBaseThreadNum; + const dim_t min_num_warps = 512; + dim_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "CastStorageDnsCsrImpl GPU kernels expect warpSize=32"; + } + csr->CheckAndAllocAuxData(csr::kIndPtr, Shape1(num_rows+1)); + IType* indptr = csr->aux_data(csr::kIndPtr).dptr(); + DType* dns_data = dns.dptr(); + + // Different kernel versions are optimized for different matrix instances + // (1) 'Thread kernel' (one thread computing one row) + // (2) 'Warp kernel' (one warp computing one row) + // (3) 'Block kernel' (one thread block computing one row) + const int kernel_version = 0; + switch (kernel_version) { + case 1: + num_threads = num_rows; + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + break; + case 2: + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + break; + case 3: + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + break; + default: + if (num_cols < threads_per_warp) { + num_threads = num_rows; + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + } else if (num_cols < threads_per_block || num_rows > min_num_warps) { + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + } else { + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + } + break; + } + + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + indptr, + indptr, + num_rows+1, + mshadow::Stream::GetStream(s)); + + // Allocate temporary storage + auto workspace = AllocateTempDataForCast(ctx, Shape1(temp_storage_bytes)); + + d_temp_storage = workspace.dptr_; + + // Compute indptr through inclusive prefix sum + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + indptr, + indptr, + num_rows+1, + mshadow::Stream::GetStream(s)); + + // Receive total number of nnz values from device + IType nnz = 0; + CUDA_CALL(cudaMemcpy(&nnz, &(indptr[num_rows]), sizeof(IType), cudaMemcpyDeviceToHost)); + + // Allocate column index array and data array of the csr matrix + csr->CheckAndAllocAuxData(csr::kIdx, Shape1(static_cast(nnz))); + csr->CheckAndAllocData(Shape1(static_cast(nnz))); + + // Compute and fill column index array and data array of the csr matrix + switch (kernel_version) { + case 1: + num_threads = num_rows; + Kernel::Launch(s, num_threads, 
+ csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + break; + case 2: + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + break; + case 3: + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + break; + default: + if (num_cols < threads_per_warp) { + num_threads = num_rows; + Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + } else if (num_cols < threads_per_block || num_rows > min_num_warps) { + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + } else { + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + } + break; + } + }); + }); + }); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_CUH_ diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h new file mode 100644 index 000000000000..acb30a9eff2b --- /dev/null +++ b/src/operator/tensor/cast_storage-inl.h @@ -0,0 +1,392 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file cast_storage-inl.h + * \brief cast_storage implementation for dense and sparse tensors + */ +#ifndef MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_ +#define MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_ + +#include +#include +#include +#include "../mxnet_op.h" +#include "../operator_common.h" +#ifdef __CUDACC__ +#include "./cast_storage-inl.cuh" +#endif // __CUDACC__ + + +namespace mxnet { +namespace op { + +/*! + * \brief CPU Kernel for marking row_idx of a RSP tensor per row. + */ +struct MarkRspRowIdx { + // i represents the row index of the tensor data + template + MSHADOW_CINLINE static void Map(int i, + RType* row_idx, + const DType* data, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + dim_t j = 0; + dim_t offset = i * row_length; + for (; j < row_length; ++j) { + if (data[offset+j] != 0) { + break; + } + } + if (row_length == j) { + row_idx[i] = 0; // mark as zero for zero row + } else { + row_idx[i] = 1; // mark as one for non-zero row + } + } +}; + +/*! + * \brief CPU implementation of casting a dns tensor to rsp type. 
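+ * Non-zero rows are marked with MarkRspRowIdx, the marks are accumulated to obtain the number of + * non-zero rows (nnr), and the row indices are then compacted in place while the corresponding + * dense rows are copied into the rsp data blob.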
+ */ +inline void CastStorageDnsRspImpl(const OpContext& ctx, + const cpu& cpu_dev, + const TBlob& dns, + NDArray* rsp) { + using namespace rowsparse; + using namespace mshadow; + using nnvm::dim_t; + CHECK(rsp != nullptr); + CHECK_EQ(rsp->storage_type(), kRowSparseStorage); + CHECK_EQ(dns.shape_, rsp->shape()); + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(rsp->aux_type(kIdx), RType, { // row idx type + const dim_t num_rows = dns.shape_[0]; + const dim_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim()); + rsp->CheckAndAllocAuxData(kIdx, Shape1(num_rows)); + TBlob row_idx_blob = rsp->aux_data(kIdx); + RType* row_idx = row_idx_blob.dptr(); + dim_t num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + row_idx, dns.dptr(), row_length); + dim_t nnr = 0; + nnr = common::ParallelAccumulate(row_idx, num_rows, nnr); + rsp->set_aux_shape(kIdx, Shape1(nnr)); + if (0 == nnr) return; + auto storage_shape = dns.shape_; + storage_shape[0] = nnr; + rsp->CheckAndAllocData(storage_shape); + auto dns_data = dns.get_with_shape(Shape2(num_rows, row_length), s); + auto rsp_data = rsp->data().get_with_shape(Shape2(nnr, row_length), s); + dim_t idx = 0; + for (dim_t i = 0; i < num_rows; ++i) { + if (row_idx[i] > 0) { + row_idx[idx] = i; + Copy(rsp_data[idx], dns_data[i], s); + ++idx; + } + } + }); + }); +} + +// TODO(haibin) Use memcopy instead will be much faster than assigning each individual element +struct CastStorageRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, + const nnvm::dim_t row_length, + const IType* idx, + const DType *data, + DType* dns) { + using nnvm::dim_t; + IType rid = idx[i]; + dim_t dns_offset = rid * row_length; + dim_t rsp_offset = i * row_length; + for (dim_t col = 0; col < row_length; col++) { + dns[dns_offset + col] = data[rsp_offset + col]; + } + } +}; + +/*! + * \brief This function assumes that the memory for dns has been allocated already + * since the shape is known at binding stage. + */ +template +void CastStorageRspDnsImpl(const OpContext& ctx, + const NDArray& rsp, + TBlob* dns) { + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(rsp.storage_type(), kRowSparseStorage); + using nnvm::dim_t; + MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(rsp.aux_type(rowsparse::kIdx), IType, { + // assign zeros + mxnet_op::Kernel::Launch(s, dns->Size(), dns->dptr()); + if (rsp.storage_initialized()) { + // copy over row by row + auto in_idx = rsp.aux_data(rowsparse::kIdx).FlatTo1D(s).dptr_; + auto in_data = rsp.data().dptr(); + auto out_data = dns->dptr(); + auto shape = rsp.shape(); + const dim_t num_rows = rsp.aux_shape(rowsparse::kIdx).Size(); + const dim_t row_length = shape.ProdShape(1, shape.ndim()); + const dim_t num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + row_length, in_idx, in_data, out_data); + } + }); + }); +} + +/*! + * \brief CPU kernel for initializing the indptr in a csr matrix. + */ +struct FillCsrIndPtr { + /*! 
+ * \brief + * \param i the i-th row of the dns tensor + * \param indptr the indptr of the csr tensor + * \param dns the dns tensor + * \param num_rows number of rows of the dns tensor + * \param num_cols number of columns of the dns tensor + */ + template + MSHADOW_CINLINE static void Map(int i, + IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + indptr[i+1] = 0; + const dim_t offset = i * num_cols; + for (dim_t j = 0; j < num_cols; ++j) { + if (dns[offset+j] != 0) { + ++indptr[i+1]; + } + } + } +}; + +/*! + * \brief CPU kernel for initializing the col_idx and value array of the csr matrix. + */ +struct FillCsrColIdxAndVals { + /*! + * \brief + * \param i the i-th row of the dns tensor + * \param val value array of the csr tensor + * \param col_idx column idx array of the csr tensor + * \param indptr indptr array of the csr tensor + * \param dns dns tensor + * \param num_rows number of rows of the dns tensor + * \param num_cols number of columns of the dns tensor + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* val, + CType* col_idx, + const IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + const dim_t offset = i * num_cols; + IType k = indptr[i]; + for (dim_t j = 0; j < num_cols; ++j) { + if (dns[offset+j] != 0) { + val[k] = dns[offset+j]; + col_idx[k] = j; + ++k; + } + } + } +}; + +/*! + * \brief CPU implementation of casting a dns matrix to csr type. + */ +inline void CastStorageDnsCsrImpl(const OpContext& ctx, + const cpu& cpu_dev, + const TBlob& dns, + NDArray* csr) { + CHECK(csr != nullptr); + CHECK_EQ(csr->storage_type(), kCSRStorage); + CHECK_EQ(dns.shape_.ndim(), 2); + CHECK_EQ(dns.shape_, csr->shape()); + using mshadow::Shape1; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIndPtr), IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIdx), CType, { // col idx type + const dim_t num_rows = dns.shape_[0]; + const dim_t num_cols = dns.shape_[1]; + csr->CheckAndAllocAuxData(csr::kIndPtr, mshadow::Shape1(num_rows+1)); + IType* indptr = csr->aux_data(csr::kIndPtr).dptr(); + DType* dns_data = dns.dptr(); + dim_t num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + // single thread to accumulate indptr + // indptr[num_rows] indicates the number of non-zero elements + indptr[0] = 0; + for (dim_t i = 0; i < num_rows; ++i) { + indptr[i+1] += indptr[i]; + } + // allocate column idx array and value array + csr->CheckAndAllocAuxData(csr::kIdx, Shape1(static_cast(indptr[num_rows]))); + csr->CheckAndAllocData(Shape1(static_cast(indptr[num_rows]))); + // fill col_idx and value arrays of the csr + mxnet_op::Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + }); + }); + }); +} + +/*! + * \brief This is the kernel for copying csr.data to its corresponding dns matrix. + */ +struct CopyCsrDataToDns { + /*! 
+ * \brief + * \param i the i-th row of the dns tensor + * \param dns_data data blob of the dns tensor + * \param col_idx column idx array of the csr tensor + * \param indptr indptr array of the csr tensor + * \param csr_data data blob of the csr tensor + * \param num_cols number of columns of the dns tensor + */ + template + MSHADOW_XINLINE static void Map(int i, + DType* dns_data, + const CType* col_idx, + const IType* indptr, + const DType* csr_data, + const nnvm::dim_t num_cols) { + const nnvm::dim_t offset = i * num_cols; + for (IType j = indptr[i]; j < indptr[i+1]; ++j) { + dns_data[offset+col_idx[j]] = csr_data[j]; + } + } +}; + +/*! + * \brief Casts a csr matrix to dns format. + */ +template +void CastStorageCsrDnsImpl(const OpContext& ctx, + const NDArray& csr, + TBlob* dns) { + CHECK(dns != nullptr); + CHECK_EQ(csr.storage_type(), kCSRStorage); + CHECK_EQ(dns->shape_.ndim(), 2); + CHECK_EQ(dns->shape_, csr.shape()); + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(csr.aux_type(csr::kIndPtr), IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(csr.aux_type(csr::kIdx), CType, { // col idx type + const dim_t num_rows = dns->shape_[0]; + const dim_t num_cols = dns->shape_[1]; + DType* dns_data = dns->dptr(); + dim_t num_threads = dns->shape_.Size(); + mxnet_op::Kernel::Launch(s, num_threads, dns_data); + if (!csr.storage_initialized()) return; + const IType* indptr = csr.aux_data(csr::kIndPtr).dptr(); + const CType* col_idx = csr.aux_data(csr::kIdx).dptr(); + const DType* csr_data = csr.data().dptr(); + num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + dns_data, col_idx, indptr, csr_data, num_cols); + }); + }); + }); +} + +template +void CastStorageComputeImpl(const OpContext& ctx, + const NDArray& input, + const NDArray& output) { + const auto src_stype = input.storage_type(); + const auto dst_stype = output.storage_type(); + if (src_stype == kRowSparseStorage && dst_stype == kDefaultStorage) { + TBlob ret = output.data(); + CastStorageRspDnsImpl(ctx, input, &ret); + } else if (src_stype == kDefaultStorage && dst_stype == kRowSparseStorage) { + NDArray ret = output; // get rid of the const qualifer + CastStorageDnsRspImpl(ctx, xpu(), input.data(), &ret); + } else if (src_stype == kDefaultStorage && dst_stype == kCSRStorage) { + NDArray ret = output; // get rid of the const qualifer + CastStorageDnsCsrImpl(ctx, xpu(), input.data(), &ret); + } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { + TBlob ret = output.data(); + CastStorageCsrDnsImpl(ctx, input, &ret); + } else { + LOG(FATAL) << "Not implemented"; + } +} + +struct CastStorageParam : public dmlc::Parameter { + int stype; + DMLC_DECLARE_PARAMETER(CastStorageParam) { + DMLC_DECLARE_FIELD(stype) + .add_enum("default", kDefaultStorage) + .add_enum("row_sparse", kRowSparseStorage) + .add_enum("csr", kCSRStorage) + .describe("Output storage type."); + } +}; + +inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + CHECK_NE(in_attrs->at(0), kUndefinedStorage) + << "src ndarray's storage type must be specified"; + const CastStorageParam& param = nnvm::get(attrs.parsed); + CHECK_NE(param.stype, kUndefinedStorage) + << "dst ndarray's storage type must be specified"; + TYPE_ASSIGN_CHECK(*out_attrs, 0, param.stype); + return true; +} + +template 
+void CastStorageComputeEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1); + CHECK_EQ(outputs.size(), 1); + if (req[0] == kNullOp) return; + CHECK_EQ(req[0], kWriteTo) << "CastStorageComputeEx expects req[0] == kWriteTo"; + CastStorageComputeImpl(ctx, inputs[0], outputs[0]); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_ diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc new file mode 100644 index 000000000000..0ad063cd0ed5 --- /dev/null +++ b/src/operator/tensor/cast_storage.cc @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file cast_storage.cc + * \brief CPU Implementation of cast_storage operator. + */ + +#include "./cast_storage-inl.h" +#include "../elemwise_op_common.h" +#include "../tensor/elemwise_unary_op.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(CastStorageParam); +NNVM_REGISTER_OP(cast_storage) +.describe(R"code(Casts tensor storage type to the new type. + +When an NDArray with default storage type is cast to csr or row_sparse storage, +the result is compact, which means: + +- for csr, zero values will not be retained +- for row_sparse, row slices of all zeros will not be retained + +The storage type of ``cast_storage`` output depends on stype parameter: + +- cast_storage(csr, 'default') = default +- cast_storage(row_sparse, 'default') = default +- cast_storage(default, 'csr') = csr +- cast_storage(default, 'row_sparse') = row_sparse + +Example:: + + dense = [[ 0., 1., 0.], + [ 2., 0., 3.], + [ 0., 0., 0.], + [ 0., 0., 0.]] + + # cast to row_sparse storage type + rsp = cast_storage(dense, 'row_sparse') + rsp.indices = [0, 1] + rsp.values = [[ 0., 1., 0.], + [ 2., 0., 3.]] + + # cast to csr storage type + csr = cast_storage(dense, 'csr') + csr.indices = [1, 0, 2] + csr.values = [ 1., 2., 3.]
+ csr.indptr = [0, 1, 3, 3, 3] + +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FInferStorageType", CastStorageInferStorageType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", CastStorageComputeEx) +.set_attr("FGradient", ElemwiseGradUseNone{"_copy"}) +.add_argument("data", "NDArray-or-Symbol", "The input.") +.add_arguments(CastStorageParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/cast_storage.cu b/src/operator/tensor/cast_storage.cu new file mode 100644 index 000000000000..1be5f79ae297 --- /dev/null +++ b/src/operator/tensor/cast_storage.cu @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file cast_storage.cu + * \brief GPU Implementation of cast_storage operator. + */ +#include "./cast_storage-inl.h" +#include "../tensor/elemwise_unary_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(cast_storage) +.set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", CastStorageComputeEx); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/dot-inl.cuh b/src/operator/tensor/dot-inl.cuh new file mode 100644 index 000000000000..41c3faaf419f --- /dev/null +++ b/src/operator/tensor/dot-inl.cuh @@ -0,0 +1,883 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file dot-inl.cuh + * \brief implementation of matrix dot op on GPU + */ +#ifndef MXNET_OPERATOR_TENSOR_DOT_INL_CUH_ +#define MXNET_OPERATOR_TENSOR_DOT_INL_CUH_ + +#include +#include +#include "./util/tensor_util-inl.cuh" + +namespace mxnet { +namespace op { + +/*! 
+ * \brief GPU scalar kernel of dot(csr, dns1) = dns2 + * Parallelization by output matrix elements: 1 thread/element + */ +template +struct DotCsrDnsDnsScalarKernel { + /*! + * \brief This function represents performing an inner product between a row of lhs + * and a column of rhs and then assigning the value to out[tid]. + * \param tid global thread id + * \param out output matrix data + * \param data_l csr matrix data + * \param indptr_l csr matrix row index pointer + * \param col_idx_l csr matrix column indices + * \param data_r dns1 matrix data of rhs + * \param num_cols_r dns1 matrix number of columns + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + const nnvm::dim_t irow = tid / num_cols_r; // row id of the lhs + const nnvm::dim_t icol = tid % num_cols_r; // col id of the rhs + DType sum = 0; + for (IType j = indptr_l[irow]; j < indptr_l[irow+1]; ++j) { + const CType cur_col = col_idx_l[j]; // corresponding row id of the rhs + sum += data_l[j] * data_r[cur_col*num_cols_r+icol]; + } + KERNEL_ASSIGN(out[tid], req, sum); + } +}; + +/*! + * \brief GPU vector kernel of dot(csr, dns1) = dns2 + * Parallelization by output matrix elements: 1 warp/element + */ +template +struct DotCsrDnsDnsVectorKernel { + /*! + * \brief see DotCsrDnsDnsScalarKernel Map for documentation. + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + __shared__ volatile DType vals[mshadow::cuda::kBaseThreadNum]; + const dim_t warp_id = tid / 32; // global warp id + const dim_t lane = tid & (32-1); // local thread id within warp + const dim_t irow = warp_id / num_cols_r; // lhs row that this warp computes + const dim_t kcol = warp_id % num_cols_r; // rhs column that this warp computes + + // Range of nnz elements in this row + const dim_t low = static_cast(indptr_l[irow]); + const dim_t high = static_cast(indptr_l[irow+1]); + + // Compute running sum per thread + DType sum = 0; + for (dim_t j = low+lane; j < high; j+=32) { + sum += data_l[j] * data_r[col_idx_l[j]*num_cols_r + kcol]; + } + vals[threadIdx.x] = sum; __syncwarp(); + + // Parallel reduction in shared memory + if (lane < 16) {vals[threadIdx.x] += vals[threadIdx.x+16];} __syncwarp(); + if (lane < 8) {vals[threadIdx.x] += vals[threadIdx.x+ 8];} __syncwarp(); + if (lane < 4) {vals[threadIdx.x] += vals[threadIdx.x+ 4];} __syncwarp(); + if (lane < 2) {vals[threadIdx.x] += vals[threadIdx.x+ 2];} __syncwarp(); + if (lane < 1) {vals[threadIdx.x] += vals[threadIdx.x+ 1];} __syncwarp(); + + if (lane == 0) { + KERNEL_ASSIGN(out[irow*num_cols_r+kcol], req, vals[threadIdx.x]); + } + } +}; + +/*! + * \brief GPU scalar kernel of dot(csr.T, dns1) = dns2 + * Parallelization by output matrix elements: 1 thread/element + */ +template +struct DotCsrTransDnsDnsScalarKernel { + /*! + * \brief This function represents performing an inner product between a column of lhs + * and a column of rhs and then assigning the value to out[tid]. 
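+ * Because the csr storage is row-major, each thread visits every lhs row and binary-searches that + * row's column indices for its own output row before accumulating the matching product.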
+ * \param tid global thread id + * \param out output matrix + * \param data_l csr matrix data + * \param indptr_l csr matrix row index pointer + * \param col_idx_l csr matrix column indices + * \param data_r dns1 matrix data of rhs + * \param num_rows_l csr matrix number of rows (= number of columns of csr.T) + * \param num_cols_r dns1 matrix number of columns + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_rows_l, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + const dim_t irow = tid / num_cols_r; // col id of the lhs + const dim_t icol = tid % num_cols_r; // col id of the rhs + DType sum = 0; + + // Each thread scans each column with binary search to find nnz elements in its row + for (dim_t k = 0; k < num_rows_l; ++k) { + const dim_t low = static_cast(indptr_l[k]); + const dim_t high = static_cast(indptr_l[k+1]); + if (low == high || irow < col_idx_l[low] || irow > col_idx_l[high-1]) continue; + dim_t j = high, l = low, r = high - 1; + while (l <= r) { + dim_t m = l + (r - l) / 2; + if (col_idx_l[m] == irow) { + j = m; break; + } + if (col_idx_l[m] < irow) { + l = m + 1; + } else { + r = m - 1; + } + } + if (j < high) { + sum += data_l[j] * data_r[k*num_cols_r+icol]; + } + } + KERNEL_ASSIGN(out[tid], req, sum); + } +}; + +/*! + * \brief GPU warp kernel of dot(csr.T, dns1) = dns2 + * Parallelization by columns: 1 warp computes one lhs column for one rhs column + */ +struct DotCsrTransDnsDnsWarpKernel { + /*! + * \brief see DotCsrTransDnsDnsScalarKernel Map for documentation. + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + const dim_t warp_id = tid / 32; // global warp id + const dim_t lane = tid & (32-1); // local thread id within warp + const dim_t icol = warp_id / num_cols_r; // lhs column that this warp computes + const dim_t kcol = warp_id % num_cols_r; // rhs column that this warp computes + + // Compute range of nnz elements in this column + const dim_t low = static_cast(indptr_l[icol]); + const dim_t high = static_cast(indptr_l[icol+1]); + + // Iterate through the nnz elements in this column + for (dim_t j = low+lane; j < high; j+=32) { + const dim_t irow = static_cast(col_idx_l[j]); + const DType val = data_l[j]*data_r[icol*num_cols_r+kcol]; + atomicAdd(static_cast(&(out[irow*num_cols_r+kcol])), val); + } + } +}; + +/*! + * \brief GPU thread block kernel of dot(csr.T, dns1) = dns2 + * Parallelization by columns: 1 thread block computes one lhs column for all rhs columns + */ +struct DotCsrTransDnsDnsThreadBlockKernel { + /*! + * \brief see DotCsrTransDnsDnsScalarKernel Map for documentation. 
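+ * Each thread block owns one lhs column (blockIdx.x); its warps stride across the rhs columns + * and scatter the partial products into the output with atomicAdd.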
+ */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + const dim_t warps_per_block = blockDim.x / 32; // number of warps in this thread block + const dim_t warp_id = tid / 32; // global warp id + const dim_t lane = tid & (32-1); // local thread id within warp + const dim_t icol = blockIdx.x; // lhs column that this thread block computes + const dim_t kcol = warp_id % warps_per_block; // rhs column where warp starts computing (offset) + + // Compute range of nnz elements in this lhs column + const dim_t low = static_cast(indptr_l[icol]); + const dim_t high = static_cast(indptr_l[icol+1]); + + // Iterate through the nnz elements in this lhs column + for (dim_t j = low+lane; j < high; j+=32) { + const dim_t irow = static_cast(col_idx_l[j]); + const DType datum_l = data_l[j]; + // Iterate over rhs columns that this warp computes + for (dim_t k = kcol; k < num_cols_r; k+=warps_per_block) { + const DType val = datum_l*data_r[icol*num_cols_r+k]; + atomicAdd(static_cast(&(out[irow*num_cols_r+k])), val); + } + } + } +}; + +/*! + * \brief GPU warp block kernel of dot(csr.T, dns1) = dns2 + * Parallelization by columns: 1 warp computes one lhs column for all rhs columns + */ +struct DotCsrTransDnsDnsWarpBlockKernel { + /*! + * \brief see DotCsrTransDnsDnsScalarKernel Map for documentation. + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + const dim_t warp_id = tid / 32; // global warp id + const dim_t lane = tid & (32-1); // local thread id within warp + const dim_t icol = warp_id; // lhs column that this warp computes + + // Compute range of nnz elements in this column + const dim_t low = static_cast(indptr_l[icol]); + const dim_t high = static_cast(indptr_l[icol+1]); + + // Iterate through the nnz elements in lhs column + for (dim_t j = low+lane; j < high; j+=32) { + const dim_t irow = static_cast(col_idx_l[j]); + const DType datum_l = data_l[j]; + // Iterate over all rhs columns + for (dim_t k = 0; k < num_cols_r; k++) { + const DType val = datum_l*data_r[icol*num_cols_r+k]; + atomicAdd(static_cast(&(out[irow*num_cols_r+k])), val); + } + } + } +}; + +/*! + * \brief GPU warp kernel of dot(csr.T, dns) = rsp + * Parallelization by columns: 1 warp computes one lhs column for one rhs column + */ +struct DotCsrTransDnsRspWarpKernel { + /*! 
+ * \brief + * \param tid global thread id + * \param out output rsp matrix data + * \param row_flg_sum_out inclusive prefix sum array over 0/1 marked row flag array + * \param data_l csr matrix data + * \param indptr_l csr matrix row index pointer + * \param col_idx_l csr matrix column indices + * \param data_r dns matrix data + * \param num_cols_r dns matrix number of columns + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const nnvm::dim_t* row_flg_sum_out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + const dim_t warp_id = tid / 32; // global warp id + const dim_t lane = tid & (32-1); // local thread id within warp + const dim_t icol = warp_id / num_cols_r; // lhs column that this warp computes + const dim_t kcol = warp_id % num_cols_r; // rhs column that this warp computes + + // Compute range of nnz elements in this column + const dim_t low = static_cast(indptr_l[icol]); + const dim_t high = static_cast(indptr_l[icol+1]); + + // Iterate through the nnz elements in this column + for (dim_t j = low+lane; j < high; j+=32) { + const dim_t irow = static_cast(col_idx_l[j]); + const dim_t rsp_row = row_flg_sum_out[irow]-1; + const DType val = data_l[j]*data_r[icol*num_cols_r+kcol]; + atomicAdd(static_cast(&(out[rsp_row*num_cols_r+kcol])), val); + } + } +}; + +/*! + * \brief GPU Kernel of dot(csr.T, rsp1) = rsp2 + * Parallelization by rows: 1 thread/row + * TODO: write a faster kernel optimized for GPU + */ +struct DotCsrTransRspRspByRowsKernel { + /*! + * \brief + * \param tid global thread id + * \param out output rsp matrix data + * \param row_idx_out output rsp matrix non-zero row indices + * \param data_l csr matrix data + * \param indptr_l csr matrix row index pointer + * \param col_idx_l csr matrix column indices + * \param data_r rsp1 matrix data + * \param row_idx_r rsp1 matrix non-zero row indices + * \param num_cols_r rsp1 matrix number of cols + * \param nnr_r rsp1 matrix number of non-zero rows + * \param nnr_out output rsp matrix number of non-zero rows + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const RType* row_idx_out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const RType* row_idx_r, + const nnvm::dim_t num_cols_r, + const nnvm::dim_t nnr_r, + const nnvm::dim_t nnr_out) { + using nnvm::dim_t; + // This thread computes non-zero row 'tid' of the output matrix + // The actual row id corresponding to the lhs row is row_idx_out[tid] + if (tid < nnr_out) { + const dim_t offset_out = tid * num_cols_r; + // Iterate over rhs matrix rows (or, equivalently, lhs columns worthy taking a look at) + for (dim_t i = 0; i < nnr_r; i++) { + const RType j = row_idx_r[i]; // j is the actual rhs row id (= lhs column id) + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_r = i * num_cols_r; + // Iterate over lhs column j to find possible non-zero value in this row + // TODO: remove sequential search, this is a bottleneck + for (IType k = indptr_l[j]; k < indptr_l[j+1]; k++) { + const CType col_idx = col_idx_l[k]; + if (col_idx == row_idx_out[tid]) { + for (dim_t l = 0; l < num_cols_r; l++) { + out[offset_out+l] += data_l[k] * data_r[offset_r+l]; + } + } else if (col_idx > row_idx_out[tid]) { + break; + } + } + } + } + } +}; + +/*! 
+ * \brief GPU Kernel of dot(csr, rsp) = dns + * Parallelization by output elements: 1 thread/element + */ +struct DotCsrRspDnsScalarKernel { + /*! + * \brief + * \param tid global thread id + * \param out output dns matrix data + * \param data_l csr matrix data + * \param indptr_l csr matrix row index pointer + * \param col_idx_l csr matrix column indices + * \param data_r rsp matrix data + * \param row_idx_r rsp matrix non-zero row indices + * \param row_flg_r rsp matrix auxiliary array holding storage indices of non-zero rows + * \param nnr_r rsp matrix number of non-zero rows + * \param num_rows output dns matrix number of rows + * \param num_cols output dns matrix number of columns + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const RType* row_idx_r, + const RType* row_flg_r, + const nnvm::dim_t nnr_r, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + if (tid < num_rows*num_cols) { + const dim_t i = static_cast(tid) / num_cols; // i = row this thread computes + const dim_t k = static_cast(tid) % num_cols; // k = col this thread computes + // Compute inner product of i-th row and k-th col + DType sum = 0; + for (IType j = indptr_l[i]; j < indptr_l[i+1]; j++) { + const dim_t csr_col = col_idx_l[j]; + const dim_t rsp_row_idx = row_flg_r[csr_col]; + if (rsp_row_idx > 0) { + sum += data_l[j] * data_r[(rsp_row_idx-1)*num_cols+k]; + } + } + if (sum != 0) { + out[i*num_cols+k] += sum; + } + } + } +}; + +/*! + * \brief GPU Impl of dot(csr, dns1) = dns2 and dot(csr.T, dns1) = dns2 + */ +inline void DotCsrDnsDnsImpl(const OpContext& ctx, + const gpu& gpu_dev, + const NDArray& lhs, + const TBlob& rhs, + const OpReqType req, + const bool trans_lhs, + TBlob* ret) { + if (kNullOp == req) return; + CHECK_EQ(lhs.storage_type(), kCSRStorage); + if (!lhs.storage_initialized()) return; + + using mshadow::cuda::kBaseThreadNum; + using mxnet_op::Kernel; + using mxnet_op::set_zero; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + + const dim_t num_rows_l = lhs.shape()[0]; + const dim_t num_cols_r = rhs.shape_[1]; + const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + const dim_t threads_per_block = kBaseThreadNum; + dim_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "DotCsrDnsDnsImpl GPU kernels expect warpSize=32"; + } + + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob& data_r = rhs; + const TBlob data_out = *ret; + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + if (kWriteTo == req) { + num_threads = data_out.Size(); + Kernel::Launch(s, num_threads, data_out.dptr()); + } + if (trans_lhs) { + // Different kernel versions are optimized for different matrix instances + // TODO: switch between kernel versions depending on input + // (1) 'Scalar kernel' (one thread computing one output element ) + // (2) 'Warp kernel' (one warp computing one lhs column for one rhs column ) + // (3) 'Thread block kernel' (one thread block computing one lhs column for all rhs columns) + // (4) 'Warp block kernel' (one warp computing one lhs column for all rhs columns) + const int 
kernel_version = 0; + switch (kernel_version) { + case 1: + num_threads = data_out.Size(); + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_rows_l, num_cols_r); + }); + break; + case 2: + num_threads = threads_per_warp * num_rows_l * num_cols_r; + Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + break; + case 3: + num_threads = threads_per_block * num_rows_l; + Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + break; + case 4: + num_threads = threads_per_warp * num_rows_l; + Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + break; + default: + num_threads = threads_per_warp * num_rows_l * num_cols_r; + Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + break; + } + } else { + // Different kernel versions are optimized for different matrix instances + // (1) 'Scalar kernel' (one thread computing one output element) + // (2) 'Vector kernel' (one warp computing one output element) + const int kernel_version = 0; + switch (kernel_version) { + case 1: + num_threads = data_out.Size(); + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + }); + break; + case 2: + num_threads = threads_per_warp * num_rows_l * num_cols_r; + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + }); + break; + default: + if (num_cols_r > 4) { + num_threads = data_out.Size(); + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + }); + } else { + num_threads = threads_per_warp * num_rows_l * num_cols_r; + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + }); + } + break; + } + } + }); + }); + }); +} + +/*! 
+ * \brief GPU Impl of dot(csr, dns) = rsp and dot(csr.T, dns) = rsp + */ +inline void DotCsrDnsRspImpl(const OpContext& ctx, + const gpu& gpu_dev, + const NDArray& lhs, + const TBlob& rhs, + const OpReqType req, + const bool trans_lhs, + NDArray* ret) { + if (kNullOp == req) return; + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(ret->storage_type(), kRowSparseStorage); + CHECK_EQ(req, kWriteTo); + if (!lhs.storage_initialized()) return; + + using mshadow::Shape1; + using mxnet_op::Kernel; + using mxnet_op::set_zero; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob& data_r = rhs; + + const dim_t num_rows_l = lhs.shape()[0]; + const dim_t num_cols_l = lhs.shape()[1]; + const dim_t num_cols_r = rhs.shape_[1]; + const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + dim_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "DotCsrDnsRspImpl GPU kernels expect warpSize=32"; + } + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + if (trans_lhs) { + // Compute number of non-zero rows (nnr) of output matrix + // - alloc temp storage for row_flg array and for cub's prefix sum + // - mark non-zero columns of csr matrix in row_flg + // - compute inclusive prefix sum over marked array + // - copy last value (nnr_out) from device to host + dim_t* row_flg_out = NULL; + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg_out, + row_flg_out, + num_cols_l, + mshadow::Stream::GetStream(s)); + mshadow::Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(num_cols_l * sizeof(dim_t) + + temp_storage_bytes), s); + row_flg_out = reinterpret_cast(workspace.dptr_); + d_temp_storage = workspace.dptr_ + num_cols_l*sizeof(dim_t); + num_threads = num_cols_l; + Kernel::Launch(s, num_threads, row_flg_out); + num_threads = num_rows_l * threads_per_warp; + Kernel::Launch(s, num_threads, + row_flg_out, col_idx_l.dptr(), indptr_l.dptr(), + num_rows_l, num_cols_l); + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg_out, + row_flg_out, + num_cols_l, + mshadow::Stream::GetStream(s)); + dim_t nnr_out = 0; + CUDA_CALL(cudaMemcpy(&nnr_out, &row_flg_out[num_cols_l-1], sizeof(dim_t), + cudaMemcpyDeviceToHost)); + + // Allocate output matrix space + ret->CheckAndAlloc({Shape1(nnr_out)}); + const TBlob data_out_blob = ret->data(); + const TBlob row_idx_out_blob = ret->aux_data(rowsparse::kIdx); + MSHADOW_IDX_TYPE_SWITCH(row_idx_out_blob.type_flag_, RType, { // row idx type + DType* data_out = data_out_blob.dptr(); + RType* row_idx_out = row_idx_out_blob.dptr(); + num_threads = nnr_out * num_cols_r; + Kernel::Launch(s, num_threads, data_out); + num_threads = nnr_out; + Kernel::Launch(s, num_threads, row_idx_out); + + // Fill row_idx array of output matrix, using the row_flg values + num_threads = num_cols_l; + Kernel::Launch(s, num_threads, + row_idx_out, row_flg_out, num_cols_l); + + // Perform matrix-matrix multiply + num_threads = threads_per_warp * num_rows_l * num_cols_r; + Kernel::Launch(s, num_threads, + data_out, row_flg_out, + data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), + 
data_r.dptr(), num_cols_r); + }); + } else { + LOG(FATAL) << "DotCsrDnsRspImpl has not implemented dot(csr, dns) = rsp yet."; + } + }); + }); + }); +} + +/*! + * \brief GPU Impl of dot(csr, rsp1) = rsp2 and dot(csr.T, rsp1) = rsp2 + * TODO: Optimize for GPU; this is a baseline implementation providing + * the operator functionality, it is not yet fully optimized for GPU. + */ +inline void DotCsrRspRspImpl(const OpContext& ctx, + const gpu& gpu_dev, + const NDArray& lhs, + const NDArray& rhs, + const OpReqType req, + const bool trans_lhs, + NDArray* ret) { + if (kNullOp == req) return; + // Reuse dot(csr, dns) implementation if rhs rsp matrix is in fact dense + if (rhs.storage_shape()[0] == rhs.shape()[0]) { + DotCsrDnsRspImpl(ctx, gpu_dev, lhs, rhs.data(), req, trans_lhs, ret); + return; + } + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(rhs.storage_type(), kRowSparseStorage); + CHECK_EQ(ret->storage_type(), kRowSparseStorage); + if (!lhs.storage_initialized() || !rhs.storage_initialized()) return; + CHECK_EQ(req, kWriteTo); + + using mshadow::Shape1; + using mxnet_op::Kernel; + using mxnet_op::set_zero; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob data_r = rhs.data(); + const TBlob row_idx_r = rhs.aux_data(rowsparse::kIdx); + + const dim_t num_rows_l = lhs.shape()[0]; + const dim_t num_cols_l = lhs.shape()[1]; + const dim_t num_cols_r = rhs.shape()[1]; + const dim_t nnr_r = rhs.storage_shape()[0]; + const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + dim_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "DotCsrRspRspImpl GPU kernels expect warpSize=32"; + } + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type + if (trans_lhs) { + // Compute number of non-zero rows (nnr) of output matrix + // - alloc temp storage for row_flg array and for cub's prefix sum + // - mark non-zero columns of csr matrix in row_flg + // - compute inclusive prefix sum over marked array + // - copy last value (nnr_out) from device to host + dim_t* row_flg_out = NULL; + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg_out, + row_flg_out, + num_cols_l, + mshadow::Stream::GetStream(s)); + mshadow::Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(num_cols_l * sizeof(dim_t) + + temp_storage_bytes), s); + row_flg_out = reinterpret_cast(workspace.dptr_); + d_temp_storage = workspace.dptr_ + num_cols_l*sizeof(dim_t); + num_threads = num_cols_l; + Kernel::Launch(s, num_threads, row_flg_out); + num_threads = num_rows_l * threads_per_warp; + Kernel::Launch(s, num_threads, + row_flg_out, col_idx_l.dptr(), indptr_l.dptr(), + num_rows_l, num_cols_l); + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg_out, + row_flg_out, + num_cols_l, + mshadow::Stream::GetStream(s)); + dim_t nnr_out = 0; + CUDA_CALL(cudaMemcpy(&nnr_out, &row_flg_out[num_cols_l-1], sizeof(dim_t), + cudaMemcpyDeviceToHost)); + + // Allocate output matrix space + ret->CheckAndAlloc({mshadow::Shape1(nnr_out)}); + const TBlob 
data_out_blob = ret->data(); + const TBlob row_idx_out_blob = ret->aux_data(rowsparse::kIdx); + DType* data_out = data_out_blob.dptr(); + RType* row_idx_out = row_idx_out_blob.dptr(); + num_threads = nnr_out * num_cols_r; + Kernel::Launch(s, num_threads, data_out); + num_threads = nnr_out; + Kernel::Launch(s, num_threads, row_idx_out); + + // Fill row_idx array of output matrix, using the row_flg values + num_threads = num_cols_l; + Kernel::Launch(s, num_threads, + row_idx_out, row_flg_out, num_cols_l); + + // Perform matrix-matrix multiply + num_threads = nnr_out; + Kernel::Launch(s, num_threads, + data_out, row_idx_out, + data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), + data_r.dptr(), row_idx_r.dptr(), + num_cols_r, nnr_r, nnr_out); + } else { + LOG(FATAL) << "DotCsrRspRspImpl has not implemented dot(csr, rsp1) = rsp2 yet."; + } + }); + }); + }); + }); +} + +/*! + * \brief GPU Impl of dot(csr, rsp) = dns and dot(csr.T, rsp) = dns + */ +inline void DotCsrRspDnsImpl(const OpContext& ctx, + const gpu& gpu_dev, + const NDArray& lhs, + const NDArray& rhs, + const OpReqType req, + const bool trans_lhs, + TBlob* ret) { + // Reuse dot(csr, dns) implementation if rhs rsp matrix is in fact dense + if (rhs.storage_shape()[0] == rhs.shape()[0]) { + DotCsrDnsDnsImpl(ctx, gpu_dev, lhs, rhs.data(), req, trans_lhs, ret); + return; + } + if (kNullOp == req) return; + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(rhs.storage_type(), kRowSparseStorage); + + using mxnet_op::Kernel; + using mxnet_op::set_zero; + mshadow::Stream* s = ctx.get_stream(); + if (!lhs.storage_initialized() || !rhs.storage_initialized()) { + if (kWriteTo == req) { + MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { // data type + Kernel::Launch(s, ret->Size(), ret->dptr()); + }); + } + return; + } + + using nnvm::dim_t; + const dim_t num_rows = ret->shape_[0]; + const dim_t num_cols = ret->shape_[1]; + const dim_t nnr_r = rhs.storage_shape()[0]; + dim_t num_threads; + + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob data_r = rhs.data(); + const TBlob row_idx_r = rhs.aux_data(rowsparse::kIdx); + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type + if (kWriteTo == req) { + num_threads = num_rows*num_cols; + Kernel::Launch(s, num_threads, ret->dptr()); + } + if (trans_lhs) { + LOG(FATAL) << "DotCsrRspDnsImpl has not implemented dot(csr.T, rsp) = dns yet."; + } else { + // TODO: Consider implementing a vector kernel for SpMV (similar to DotCsrDnsDns) + // Alloc temp storage for row_flg array + RType* row_flg_r = ctx.requested[0] + .get_space_typed(mshadow::Shape1(rhs.shape()[0]), s).dptr_; + num_threads = rhs.shape()[0]; + Kernel::Launch(s, num_threads, row_flg_r); + // Set row_flg index array + num_threads = nnr_r; + Kernel::Launch(s, num_threads, + row_flg_r, row_idx_r.dptr(), nnr_r); + // Perform sparse matrix-matrix multiply + num_threads = num_rows*num_cols; + Kernel::Launch(s, num_threads, + ret->dptr(), + data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), + data_r.dptr(), row_idx_r.dptr(), row_flg_r, rhs.storage_shape()[0], + num_rows, num_cols); + } + }); + }); + }); + }); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_DOT_INL_CUH_ diff --git 
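Both rsp-producing GPU paths above size their output the same way: mark every csr column that holds at least one nonzero, run an inclusive prefix sum over the marks (the cub::DeviceScan::InclusiveSum calls), read the last prefix value back as nnr_out, and keep the marked positions as the compacted row indices. A NumPy/SciPy sketch of that bookkeeping, purely illustrative and using SciPy's csr layout as a stand-in for the NDArray aux data::

    import numpy as np
    import scipy.sparse as sp

    lhs = sp.random(5, 8, density=0.3, format='csr', dtype=np.float64)

    # mark the csr columns that hold at least one nonzero; these are the
    # candidate non-zero rows of dot(csr.T, rhs)
    row_flg = np.zeros(lhs.shape[1], dtype=np.int64)
    row_flg[lhs.indices] = 1

    prefix = np.cumsum(row_flg)             # inclusive prefix sum over the marks
    nnr_out = int(prefix[-1])               # rows kept in the rsp output
    row_idx_out = np.flatnonzero(row_flg)   # their row indices, already sorted

    assert row_idx_out.size == nnr_out

After the scan the same row_flg buffer is handed to the fill and dot kernels, so the prefix values themselves appear to double as per-row slots in the compacted output.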
a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h new file mode 100644 index 000000000000..7b7d82b01b91 --- /dev/null +++ b/src/operator/tensor/dot-inl.h @@ -0,0 +1,1007 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dot-inl.h + * \brief Function definition of matrix dot operator + */ + +#ifndef MXNET_OPERATOR_TENSOR_DOT_INL_H_ +#define MXNET_OPERATOR_TENSOR_DOT_INL_H_ + +#include +#include +#include +#include +#include +#include "../mshadow_op.h" +#include "../elemwise_op_common.h" +#include "../mxnet_op.h" +#ifdef __CUDACC__ +#include "./dot-inl.cuh" +#endif // __CUDACC__ + +namespace mxnet { +namespace op { + +struct DotParam : public dmlc::Parameter { + bool transpose_a; + bool transpose_b; + DMLC_DECLARE_PARAMETER(DotParam) { + DMLC_DECLARE_FIELD(transpose_a) + .describe("If true then transpose the first input before dot.") + .set_default(false); + DMLC_DECLARE_FIELD(transpose_b) + .describe("If true then transpose the second input before dot.") + .set_default(false); + } +}; + +template +void DotForward_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + const DotParam& param = nnvm::get(attrs.parsed); + Stream *s = ctx.get_stream(); + CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_) + << "Binary function only support input/output with the same type"; + CHECK_EQ(outputs[0].type_flag_, inputs[1].type_flag_) + << "Binary function only support input/output with the same type"; + CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) + << "dot only supports float32 and float64"; + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + if (inputs[0].ndim() == 1 && inputs[1].ndim() == 1) { + CHECK_NE(req[0], kAddTo) << "AddTo not yet suported"; + Tensor out = outputs[0].get(s); + VectorDot(out, + inputs[0].get(s), + inputs[1].get(s)); + } else { + int ma, na, mb, nb, m, n; + if (param.transpose_a) { + ma = inputs[0].size(0); + na = inputs[0].Size()/ma; + m = na; + } else { + na = inputs[0].size(inputs[0].ndim()-1); + ma = inputs[0].Size()/na; + m = ma; + } + if (param.transpose_b) { + nb = inputs[1].size(inputs[1].ndim()-1); + mb = inputs[1].Size()/nb; + n = mb; + } else { + mb = inputs[1].size(0); + nb = inputs[1].Size()/mb; + n = nb; + } + Tensor input0 = + inputs[0].get_with_shape(Shape2(ma, na), s); + Tensor input1 = + inputs[1].get_with_shape(Shape2(mb, nb), s); + Tensor out = + outputs[0].get_with_shape(Shape2(m, n), s); + if (param.transpose_a && param.transpose_b) { + ASSIGN_DISPATCH(out, req[0], dot(input0.T(), input1.T())); + } else if (!param.transpose_a && param.transpose_b) { + ASSIGN_DISPATCH(out, 
req[0], dot(input0, input1.T())); + } else if (param.transpose_a && !param.transpose_b) { + ASSIGN_DISPATCH(out, req[0], dot(input0.T(), input1)); + } else { + ASSIGN_DISPATCH(out, req[0], dot(input0, input1)); + } + } + }); +} + +template +void DotBackward_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + const DotParam& param = nnvm::get(attrs.parsed); + Stream *s = ctx.get_stream(); + CHECK_NE(req[0], kWriteInplace); + CHECK_NE(req[1], kWriteInplace); + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + if (inputs[1].ndim() == 1 && inputs[2].ndim() == 1) { + Tensor mout_grad = inputs[0].get(s); + Tensor mlhs_data = inputs[1].get(s); + Tensor mrhs_data = inputs[2].get(s); + Tensor mlhs_grad = outputs[0].get(s); + Tensor mrhs_grad = outputs[1].get(s); + ASSIGN_DISPATCH(mrhs_grad, req[1], + broadcast_scalar(mout_grad, mlhs_data.shape_) * mlhs_data); + ASSIGN_DISPATCH(mlhs_grad, req[0], + broadcast_scalar(mout_grad, mlhs_data.shape_) * mrhs_data); + } else { + int ma, na, mb, nb, m, n; + if (param.transpose_a) { + ma = outputs[0].size(0); + na = outputs[0].Size()/ma; + m = na; + } else { + na = outputs[0].size(outputs[0].ndim()-1); + ma = outputs[0].Size()/na; + m = ma; + } + if (param.transpose_b) { + nb = outputs[1].size(outputs[1].ndim()-1); + mb = outputs[1].Size()/nb; + n = mb; + } else { + mb = outputs[1].size(0); + nb = outputs[1].Size()/mb; + n = nb; + } + Tensor mout_grad = + inputs[0].get_with_shape(Shape2(m, n), s); + Tensor mlhs_data = + inputs[1].get_with_shape(Shape2(ma, na), s); + Tensor mrhs_data = + inputs[2].get_with_shape(Shape2(mb, nb), s); + Tensor mlhs_grad = + outputs[0].get_with_shape(Shape2(ma, na), s); + Tensor mrhs_grad = + outputs[1].get_with_shape(Shape2(mb, nb), s); + if (param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x.T, y.T) + // dy = dot(x, dz).T = dot(dz.T, x.T) + // dx = dot(dz, y).T = dot(y.T, dz.T) + ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mout_grad.T(), mlhs_data.T())); + ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mrhs_data.T(), mout_grad.T())); + } else if (!param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x, y.T) + // dy = dot(x.T, dz).T = dot(dz.T, x) + // dx = dot(dz, y) + ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mout_grad.T(), mlhs_data)); + ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mout_grad, mrhs_data)); + } else if (param.transpose_a && !param.transpose_b) { + // Gradient of z = dot(x.T, y) + // dy = dot(x, dz) + // dx = dot(dz, y.T).T = dot(y, dz.T) + ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mlhs_data, mout_grad)); + ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mrhs_data, mout_grad.T())); + } else { + // Gradient of z = dot(x, y) + // dy = dot(x.T, dz) + // dx = dot(dz, y.T) + ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mlhs_data.T(), mout_grad)); + ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mout_grad, mrhs_data.T())); + } + } + }); +} + +inline bool DotForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + const DotParam& param = nnvm::get(attrs.parsed); + // csr has many zero columns, so the result of dot(csr.T, matrix) should be rsp + // TODO(stefan/haibin/jun): check type_assign return value + if (param.transpose_a && kCSRStorage == (*in_attrs)[0]) { + type_assign(&((*out_attrs)[0]), kRowSparseStorage); + } else { + 
type_assign(&((*out_attrs)[0]), kDefaultStorage); + } + return true; +} + +inline bool DotBackwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3U); + CHECK_EQ(out_attrs->size(), 2U); + const DotParam& param = nnvm::get(attrs.parsed); + type_assign(&((*out_attrs)[0]), kDefaultStorage); + if (!param.transpose_a && kCSRStorage == (*in_attrs)[1]) { + type_assign(&((*out_attrs)[1]), kRowSparseStorage); + } else { + type_assign(&((*out_attrs)[1]), kDefaultStorage); + } + return true; +} + +/*! + * \brief CPU Kernel of dot(csr, dns1) = dns2 + * Parallelization by row blocks + */ +struct DotCsrDnsDnsByRowBlocks { + /*! + * \brief + * \param i the i-th thread + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t seg_len, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + const dim_t seg_start = i * seg_len; + if (seg_start >= num_rows) return; + const dim_t seg_end = std::min(seg_start + seg_len, num_rows); + for (dim_t j = seg_start; j < seg_end; ++j) { + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_out = j * num_cols; + for (IType k = indptr_l[j]; k < indptr_l[j+1]; ++k) { + const DType val = data_l[k]; + const dim_t offset_r = col_idx_l[k] * num_cols; + for (dim_t l = 0; l < num_cols; ++l) { + out[offset_out+l] += data_r[offset_r+l] * val; + } + } + } + } +}; + +/*! + * \brief CPU Kernel of dot(csr.T(), dns1) = dns2 + * Parallelization by row blocks + */ +struct DotCsrTransDnsDnsByRowBlocks { + /*! + * \brief + * \param i the i-th thread + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t seg_len, + const nnvm::dim_t num_rows_l, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + const dim_t seg_start = i * seg_len; + if (seg_start >= num_rows) return; + const dim_t seg_end = (i + 1) * seg_len; + for (dim_t j = 0; j < num_rows_l; ++j) { + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_r = j * num_cols; + for (IType k = indptr_l[j]; k < indptr_l[j+1]; ++k) { + const CType col_idx = col_idx_l[k]; + if (col_idx < seg_start || col_idx >= seg_end) continue; + const dim_t offset_out = col_idx * num_cols; + const DType val = data_l[k]; + for (dim_t l = 0; l < num_cols; ++l) { + out[offset_out+l] += data_r[offset_r+l] * val; + } + } + } + } +}; + +/*! + * \brief CPU Kernel of dot(csr.T(), dns) = rsp + * Parallelization by row blocks. + * This kernel fills up the row_idx array of the rsp + * with 1 for nonzero rows and 0 for zero rows. + * The matrix will be compacted after this kernel call. + */ +struct DotCsrTransDnsRspByRowBlocks { + /*! 
+ * \brief + * \param i the i-th thread + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* out, + RType* row_idx, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t seg_len, + const nnvm::dim_t num_rows_l, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + const dim_t seg_start = i * seg_len; + if (seg_start >= num_rows) return; + const dim_t seg_end = (i + 1) * seg_len; + for (dim_t j = 0; j < num_rows_l; ++j) { + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_r = j * num_cols; + for (IType k = indptr_l[j]; k < indptr_l[j+1]; ++k) { + const CType col_idx = col_idx_l[k]; + if (col_idx < seg_start || col_idx >= seg_end) continue; + const dim_t offset_out = col_idx * num_cols; + row_idx[col_idx] = 1; + const DType val = data_l[k]; + for (dim_t l = 0; l < num_cols; ++l) { + out[offset_out+l] += data_r[offset_r+l] * val; + } + } + } + } +}; + +/*! + * \brief CPU Kernel of dot(csr, rsp) = dns + * Parallelization by row blocks + */ +struct DotCsrRspDnsByRowBlocks { + /*! + * \brief + * \param i the i-th thread + * \param nnr_r storage_shape[0] of the rsp + * \param num_rows dns.shape[0] + * \param num_cols dns.shape[1] + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const RType* row_idx_r, + const nnvm::dim_t nnr_r, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols, + const nnvm::dim_t seg_len) { + using nnvm::dim_t; + const dim_t seg_start = i * seg_len; + if (seg_start >= num_rows) return; + const dim_t seg_end = std::min(seg_start + seg_len, num_rows); + for (dim_t j = seg_start; j < seg_end; ++j) { + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_out = j * num_cols; + // Use binary search to find the lower_bound of val in row_idx array + const RType* first = row_idx_r; + const RType* last = row_idx_r + nnr_r; + const CType val = col_idx_l[indptr_l[j]]; + const RType* it; + int count = last - first, step; + while (count > 0) { + it = first; + step = count / 2; + it += step; + if (*it < val) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + const RType* row_idx_ptr = first; + // end of binary search + if (row_idx_ptr == row_idx_r+nnr_r || *row_idx_ptr> col_idx_l[indptr_l[j+1]-1]) continue; + for (IType k = indptr_l[j]; k < indptr_l[j+1] && row_idx_ptr != row_idx_r+nnr_r;) { + if (col_idx_l[k] == *row_idx_ptr) { + const dim_t offset_r = (row_idx_ptr - row_idx_r) * num_cols; + for (dim_t l = 0; l < num_cols; ++l) { + out[offset_out+l] += data_l[k] * data_r[offset_r+l]; + } + ++k; + ++row_idx_ptr; + } else if (col_idx_l[k] < *row_idx_ptr) { + ++k; + } else { + ++row_idx_ptr; + } + } + } + } +}; + +/*! + * \brief CPU Kernel of dot(csr.T(), rsp1) = rsp2, with row_idx marked for non-zero rows + * Parallelization by row blocks + */ +struct DotCsrTransRspRspByRowBlocks { + /*! 
+ * \brief + * \param i the i-th thread + * \param num_rows_l number of rows of lhs matrix + * \param nnr_r number of non-zero rows of rhs matrix + * \param num_rows number of rows of out matrix + * \param num_cols number of cols of out matrix + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* out, + RType* row_idx_out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const RType* row_idx_r, + const nnvm::dim_t num_rows_l, + const nnvm::dim_t nnr_r, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols, + const nnvm::dim_t seg_len) { + using nnvm::dim_t; + const dim_t seg_start = i * seg_len; + if (seg_start >= num_rows) return; + const dim_t seg_end = (i + 1) * seg_len; + for (dim_t rid = 0; rid < nnr_r; ++rid) { + const RType j = row_idx_r[rid]; + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_r = rid * num_cols; + for (IType k = indptr_l[j]; k < indptr_l[j+1]; ++k) { + const CType col_idx = col_idx_l[k]; + if (col_idx < seg_start || col_idx >= seg_end) continue; + row_idx_out[col_idx] = 1; // mark nonzero row as 1 + const dim_t offset_out = col_idx * num_cols; + for (dim_t l = 0; l < num_cols; ++l) { + out[offset_out+l] += data_r[offset_r+l] * data_l[k]; + } + } + } + } +}; + +/*! + * \brief CPU Impl of dot(csr, dns1) = dns2 and dot(csr.T, dns1) = dns2 + */ +inline void DotCsrDnsDnsImpl(const OpContext& ctx, + const cpu& cpu_dev, + const NDArray& lhs, + const TBlob& rhs, + const OpReqType req, + const bool trans_lhs, + TBlob* ret) { + if (kNullOp == req) return; + CHECK_EQ(lhs.storage_type(), kCSRStorage); + if (!lhs.storage_initialized()) return; + + using nnvm::dim_t; + + mshadow::Stream* s = ctx.get_stream(); + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob& data_r = rhs; + const TBlob data_out = *ret; + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + dim_t num_threads; + if (kWriteTo == req) { + num_threads = data_out.Size(); + mxnet_op::Kernel::Launch( + s, num_threads, data_out.dptr()); + } + num_threads = mxnet_op::get_num_threads(data_out.shape_[0]); + dim_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads; + if (trans_lhs) { + mxnet_op::Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), seg_len, + lhs.shape()[0], data_out.shape_[0], data_out.shape_[1]); + } else { + mxnet_op::Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), seg_len, + data_out.shape_[0], data_out.shape_[1]); + } + }); + }); + }); +} + +/*! 
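DotCsrDnsDnsByRowBlocks above hands each thread a contiguous segment of output rows of length seg_len; inside a segment the work is a plain csr-row-times-dense accumulation. A single-threaded Python rendering of that partitioning, with SciPy's data/indices/indptr arrays standing in for the csr NDArray's aux data (a sketch, not the operator code)::

    import numpy as np
    import scipy.sparse as sp

    def dot_csr_dns(lhs, rhs, num_threads=4):
        num_rows, num_cols = lhs.shape[0], rhs.shape[1]
        out = np.zeros((num_rows, num_cols), dtype=rhs.dtype)
        seg_len = (num_rows + num_threads - 1) // num_threads
        for i in range(num_threads):                 # one iteration per "thread"
            start = i * seg_len
            stop = min(start + seg_len, num_rows)
            for j in range(start, stop):             # rows owned by this segment
                for k in range(lhs.indptr[j], lhs.indptr[j + 1]):
                    out[j, :] += lhs.data[k] * rhs[lhs.indices[k], :]
        return out

    lhs = sp.random(6, 5, density=0.4, format='csr')
    rhs = np.random.rand(5, 3)
    assert np.allclose(dot_csr_dns(lhs, rhs), lhs.dot(rhs))

The transposed kernel flips the roles: every thread scans the whole csr matrix but only writes output rows that fall inside its own column segment, which keeps the writes race-free.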
+ * \brief CPU Impl of dot(csr.T, dns) = rsp + */ +inline void DotCsrDnsRspImpl(const OpContext& ctx, + const cpu& cpu_dev, + const NDArray& lhs, + const TBlob& rhs, + const OpReqType req, + const bool trans_lhs, + NDArray* ret) { + if (kNullOp == req) return; + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(ret->storage_type(), kRowSparseStorage); + if (!lhs.storage_initialized()) return; + CHECK_EQ(req, kWriteTo); + + using mxnet_op::set_zero; + using nnvm::dim_t; + + mshadow::Stream* s = ctx.get_stream(); + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob& data_r = rhs; + + // pre-allocate spaces for ret using the dense dimension size + ret->CheckAndAlloc({mshadow::Shape1(lhs.shape()[1])}); + const TBlob data_out = ret->data(); + const TBlob row_idx_out = ret->aux_data(rowsparse::kIdx); + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + MSHADOW_IDX_TYPE_SWITCH(row_idx_out.type_flag_, RType, { // row idx type + dim_t num_threads = data_out.Size(); + mxnet_op::Kernel::Launch(s, num_threads, data_out.dptr()); + RType* row_idx = row_idx_out.dptr(); + num_threads = row_idx_out.Size(); + mxnet_op::Kernel::Launch(s, num_threads, row_idx); + num_threads = mxnet_op::get_num_threads(data_out.shape_[0]); + dim_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads; + if (trans_lhs) { + mxnet_op::Kernel::Launch(s, num_threads, + data_out.dptr(), row_idx, data_l.dptr(), + indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), + seg_len, lhs.shape()[0], data_out.shape_[0], data_out.shape_[1]); + dim_t nnr = 0; + nnr = mxnet::common::ParallelAccumulate(row_idx, ret->shape()[0], nnr); + ret->set_aux_shape(rowsparse::kIdx, mshadow::Shape1(nnr)); + if (0 == nnr) return; + mshadow::Tensor rsp_data = data_out.FlatTo2D(s); + dim_t idx = 0; + for (index_t i = 0; i < ret->shape()[0]; ++i) { + if (row_idx[i] > 0) { + row_idx[idx] = i; + mshadow::Copy(rsp_data[idx], rsp_data[i], s); + ++idx; + } + } + } else { + LOG(FATAL) << "DotCsrDnsRspImpl has not implemented dot(csr, dns)=rsp yet."; + } + }); + }); + }); + }); +} + +/*! 
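The tail of DotCsrDnsRspImpl reuses the row_idx buffer twice: the kernel first writes 1 into row_idx for every touched output row, ParallelAccumulate counts the ones to get nnr, and the final loop compacts both row_idx and the data rows in place so that the first nnr entries describe the rsp result. Isolated into a small NumPy sketch (assumed shapes, not the real blobs)::

    import numpy as np

    def compact_rsp(out, row_idx):
        # out: dense scratch of shape (num_rows, row_len); row_idx: 0/1 flags
        nnr = int(row_idx.sum())
        idx = 0
        for i in range(out.shape[0]):
            if row_idx[i] > 0:
                row_idx[idx] = i          # overwrite the flag with the real row id
                out[idx, :] = out[i, :]   # move the data row up
                idx += 1
        return out[:nnr], row_idx[:nnr]

    out = np.array([[0., 0.], [1., 2.], [0., 0.], [3., 4.]])
    flags = np.array([0, 1, 0, 1])
    data, rows = compact_rsp(out, flags)
    assert rows.tolist() == [1, 3]
    assert np.allclose(data, [[1., 2.], [3., 4.]])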
+ * \brief CPU Impl of dot(csr, rsp) = dns + */ +inline void DotCsrRspDnsImpl(const OpContext& ctx, + const cpu& cpu_dev, + const NDArray& lhs, + const NDArray& rhs, + const OpReqType req, + const bool trans_lhs, + TBlob* ret) { + if (kNullOp == req) return; + // reuse csr dns implementation when storage_shape == shape for rhs + if (rhs.storage_shape()[0] == rhs.shape()[0]) { // if rsp is actually dense + DotCsrDnsDnsImpl(ctx, cpu_dev, lhs, rhs.data(), req, trans_lhs, ret); + return; + } + + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(rhs.storage_type(), kRowSparseStorage); + mshadow::Stream* s = ctx.get_stream(); + if (!lhs.storage_initialized() || !rhs.storage_initialized()) { + if (kWriteTo == req) { + MSHADOW_SGL_DBL_TYPE_SWITCH(ret->type_flag_, DType, { // data type + mxnet_op::Kernel::Launch( + s, ret->Size(), ret->dptr()); + }); + } + return; + } + using nnvm::dim_t; + + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob data_r = rhs.data(); + const TBlob row_idx_r = rhs.aux_data(rowsparse::kIdx); + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type + dim_t num_threads; + if (kWriteTo == req) { + num_threads = ret->Size(); + mxnet_op::Kernel::Launch(s, num_threads, + ret->dptr()); + } + num_threads = mxnet_op::get_num_threads(ret->shape_[0]); + dim_t seg_len = (ret->shape_[0] + num_threads - 1) / num_threads; + if (trans_lhs) { + LOG(FATAL) << "DotCsrRspDnsImpl has not implemented dot(csr.T, rsp) = dns yet"; + } else { + mxnet_op::Kernel::Launch(s, num_threads, + ret->dptr(), data_l.dptr(), + indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), + row_idx_r.dptr(), rhs.storage_shape()[0], + ret->shape_[0], ret->shape_[1], seg_len); + } + }); + }); + }); + }); +} + +/*! 
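DotCsrRspDnsByRowBlocks only touches rhs rows that are actually stored: for each lhs row it runs a binary lower_bound search over the rsp's sorted row_idx for the row's first column index, then advances the column indices and the stored row indices in step. One output row of dot(csr, rsp), sketched with bisect and NumPy rather than the hand-rolled search in the kernel::

    import bisect
    import numpy as np

    def csr_row_times_rsp(data_l, col_idx_l, row_idx_r, data_r):
        # data_l/col_idx_l: values and (sorted) column indices of one csr row
        # row_idx_r/data_r: sorted row indices and stored rows of the rsp rhs
        acc = np.zeros(data_r.shape[1])
        p = bisect.bisect_left(row_idx_r, col_idx_l[0]) if col_idx_l else 0
        k = 0
        while k < len(col_idx_l) and p < len(row_idx_r):
            if col_idx_l[k] == row_idx_r[p]:
                acc += data_l[k] * data_r[p]
                k += 1
                p += 1
            elif col_idx_l[k] < row_idx_r[p]:
                k += 1
            else:
                p += 1
        return acc

    row_idx_r = [1, 3]                        # rsp rhs stores rows 1 and 3
    data_r = np.array([[1., 1.], [2., 2.]])
    print(csr_row_times_rsp([5., 7., 9.], [0, 1, 3], row_idx_r, data_r))
    # column 0 has no stored rhs row, so the result is 7*[1,1] + 9*[2,2] = [25, 25]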
+ * \brief CPU Impl of dot(csr.T, rsp1) = rsp2 + */ +inline void DotCsrRspRspImpl(const OpContext& ctx, + const cpu& cpu_dev, + const NDArray& lhs, + const NDArray& rhs, + const OpReqType req, + const bool trans_lhs, + NDArray* ret) { + if (kNullOp == req) return; + // reuse csr dns implementation when storage_shape == shape for rhs + if (rhs.storage_shape()[0] == rhs.shape()[0]) { // if rsp is actually dense + DotCsrDnsRspImpl(ctx, cpu_dev, lhs, rhs.data(), req, trans_lhs, ret); + return; + } + + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(rhs.storage_type(), kRowSparseStorage); + CHECK_EQ(ret->storage_type(), kRowSparseStorage); + if (!lhs.storage_initialized() || !rhs.storage_initialized()) return; + CHECK_EQ(req, kWriteTo); + + using mxnet_op::set_zero; + using nnvm::dim_t; + + mshadow::Stream* s = ctx.get_stream(); + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob data_r = rhs.data(); + const TBlob row_idx_r = rhs.aux_data(rowsparse::kIdx); + + // pre-allocate spaces for ret using the dense dimension size + if (ret->storage_type() == kRowSparseStorage) { + ret->CheckAndAlloc({mshadow::Shape1(lhs.shape()[1])}); + } + const TBlob data_out = ret->data(); + const TBlob row_idx_out = ret->aux_data(rowsparse::kIdx); + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type + dim_t num_threads = data_out.Size(); + mxnet_op::Kernel::Launch(s, num_threads, data_out.dptr()); + num_threads = mxnet_op::get_num_threads(data_out.shape_[0]); + dim_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads; + if (trans_lhs) { + RType* row_idx = row_idx_out.dptr(); + num_threads = row_idx_out.Size(); + mxnet_op::Kernel::Launch(s, num_threads, row_idx); + mxnet_op::Kernel::Launch(s, num_threads, + data_out.dptr(), row_idx, data_l.dptr(), + indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), + row_idx_r.dptr(), lhs.shape()[0], rhs.storage_shape()[0], + ret->shape()[0], ret->shape()[1], seg_len); + dim_t nnr = 0; + nnr = mxnet::common::ParallelAccumulate(row_idx, ret->shape()[0], nnr); + ret->set_aux_shape(rowsparse::kIdx, mshadow::Shape1(nnr)); + if (0 == nnr) return; + mshadow::Tensor rsp_data = data_out.FlatTo2D(s); + dim_t idx = 0; + for (index_t i = 0; i < ret->shape()[0]; ++i) { + if (row_idx[i] > 0) { + row_idx[idx] = i; + mshadow::Copy(rsp_data[idx], rsp_data[i], s); + ++idx; + } + } + } else { + LOG(FATAL) << "DotCsrRspRspImpl has not implemented dot(csr, rsp) = rsp2 yet"; + } + }); + }); + }); + }); +} + +inline bool DotShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const DotParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + TShape& lshape = (*in_attrs)[0]; + TShape& rshape = (*in_attrs)[1]; + if (lshape.ndim() == 1 && rshape.ndim() == 1) { + CHECK(!param.transpose_a && !param.transpose_b) << "Cannot transpose vectors"; + CHECK_EQ(lshape[0], rshape[0]) << "dot shape error: " << lshape << " X " << rshape; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape1(1)); + } else { + bool Ta = param.transpose_a, Tb = param.transpose_b; + TShape L[2], R[2]; + if (Ta) { + L[0] = mshadow::Shape1(lshape[0]); + L[1] = lshape.ndim() > 1 ? 
TShape(&lshape[1], &lshape[lshape.ndim()]) : TShape(1); + } else { + L[0] = lshape.ndim() > 1 ? TShape(&lshape[0], &lshape[lshape.ndim()-1]) : TShape(1); + L[1] = mshadow::Shape1(lshape[lshape.ndim()-1]); + } + if (Tb) { + R[0] = rshape.ndim() > 1 ? TShape(&rshape[0], &rshape[rshape.ndim()-1]) : TShape(1); + R[1] = mshadow::Shape1(rshape[rshape.ndim()-1]); + } else { + R[0] = mshadow::Shape1(rshape[0]); + R[1] = rshape.ndim() > 1 ? TShape(&rshape[1], &rshape[rshape.ndim()]) : TShape(1); + } + + if (L[!Ta].Size() != 0 && R[Tb].Size() != 0) { + CHECK_EQ(L[!Ta].Size(), R[Tb].Size()) + << "dot shape error: " << lshape << " X " << rshape; + } + std::vector buf; + if (lshape.ndim() > 1) buf.insert(buf.end(), &L[Ta][0], &L[Ta][L[Ta].ndim()]); + if (rshape.ndim() > 1) buf.insert(buf.end(), &R[!Tb][0], &R[!Tb][R[!Tb].ndim()]); + TShape oshape(buf.begin(), buf.end()); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); + } + return true; +} + +template +void DotForwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const DotParam& param = nnvm::get(attrs.parsed); + CHECK(!param.transpose_b) << "transposing rhs of the sparse dot op is not supported"; + CHECK_EQ(inputs[0].shape().ndim(), 2) << "sparse dot only supports 2 dimensional lhs"; + CHECK_EQ(inputs[1].shape().ndim(), 2) << "sparse dot only supports 2 dimensional rhs"; + auto lhs_stype = inputs[0].storage_type(); + auto rhs_stype = inputs[1].storage_type(); + auto out_stype = outputs[0].storage_type(); + if (lhs_stype == kCSRStorage && rhs_stype == kDefaultStorage && out_stype == kDefaultStorage) { + TBlob ret = outputs[0].data(); + DotCsrDnsDnsImpl(ctx, xpu(), inputs[0], inputs[1].data(), req[0], param.transpose_a, &ret); + } else if (lhs_stype == kCSRStorage && rhs_stype == kRowSparseStorage + && out_stype == kDefaultStorage) { + TBlob ret = outputs[0].data(); + DotCsrRspDnsImpl(ctx, xpu(), inputs[0], inputs[1], req[0], param.transpose_a, &ret); + } else if (lhs_stype == kCSRStorage && rhs_stype == kDefaultStorage + && out_stype == kRowSparseStorage) { + NDArray out = outputs[0]; + DotCsrDnsRspImpl(ctx, xpu(), inputs[0], inputs[1].data(), req[0], param.transpose_a, &out); + } else if (lhs_stype == kCSRStorage && rhs_stype == kRowSparseStorage + && out_stype == kRowSparseStorage) { + NDArray ret = outputs[0]; + DotCsrRspRspImpl(ctx, xpu(), inputs[0], inputs[1], req[0], param.transpose_a, &ret); + } else { + FCompExFallback(attrs, ctx, inputs, req, outputs, DotForward_, "DotForward_"); + } +} + +template +void DotBackwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 2U); + CHECK_EQ(req.size(), 2U); + CHECK_EQ(kNullOp, req[0]) + << "sparse dot does not support computing the gradient of the csr/lhs"; + CHECK_NE(req[1], kWriteInplace) << "DotBackwardEx does not support WriteInplace"; + + const DotParam& param = nnvm::get(attrs.parsed); + CHECK(!param.transpose_b) << "sparse dot only supports dot(A, X) and dot(A.T(), X)"; + CHECK_EQ(inputs[0].shape().ndim(), 2) << "sparse dot only supports 2 dimensional lhs"; + CHECK_EQ(inputs[1].shape().ndim(), 2) << "sparse dot only supports 2 dimensional rhs"; + const auto ograd_stype = inputs[0].storage_type(); + const auto lhs_stype = inputs[1].storage_type(); + const auto 
rhs_stype = inputs[2].storage_type(); + const auto grad_rhs_stype = outputs[1].storage_type(); + if (ograd_stype == kDefaultStorage // ograd dns format + && lhs_stype == kCSRStorage // csr input lhs of the op + && grad_rhs_stype == kDefaultStorage) { // grad(rhs) dns format + TBlob ret = outputs[1].data(); + DotCsrDnsDnsImpl(ctx, xpu(), inputs[1], inputs[0].data(), req[1], !param.transpose_a, &ret); + } else if (ograd_stype == kDefaultStorage + && lhs_stype == kCSRStorage + && grad_rhs_stype == kRowSparseStorage) { + NDArray ret = outputs[1]; + DotCsrDnsRspImpl(ctx, xpu(), inputs[1], inputs[0].data(), req[1], !param.transpose_a, &ret); + } else { + FCompExFallback(attrs, ctx, inputs, req, outputs, DotBackward_, "DotBackward_"); + } +} + +template +void BatchDotForward_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + const DotParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_) + << "Binary function only support input/output with the same type"; + CHECK_EQ(outputs[0].type_flag_, inputs[1].type_flag_) + << "Binary function only support input/output with the same type"; + CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) + << "dot only supports float32 and float64"; + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + mshadow::Tensor out = outputs[0].get(s); + mshadow::Tensor mlhs = inputs[0].get(s); + mshadow::Tensor mrhs = inputs[1].get(s); + mshadow::Tensor workspace = + ctx.requested[0].get_space_typed(mshadow::Shape1(3 * out.size(0)), s); + if (kNullOp != req[0]) { + if (param.transpose_a && param.transpose_b) { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); + } else if (!param.transpose_a && param.transpose_b) { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); + } else if (param.transpose_a && !param.transpose_b) { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); + } else { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? 
(DType)1.0f : (DType)0.0f, + workspace); + } + } + }); +} + +template +void BatchDotBackward_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + const DotParam& param = nnvm::get(attrs.parsed); + CHECK_NE(req[1], kWriteInplace); + CHECK_NE(req[0], kWriteInplace); + CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) + << "dot only supports float32 and float64"; + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + mshadow::Tensor mout_grad = inputs[0].get(s); + mshadow::Tensor mlhs_data = inputs[1].get(s); + mshadow::Tensor mrhs_data = inputs[2].get(s); + mshadow::Tensor mlhs_grad = outputs[0].get(s); + mshadow::Tensor mrhs_grad = outputs[1].get(s); + mshadow::Tensor workspace = + ctx.requested[0].get_space_typed( + mshadow::Shape2(2, 3 * mout_grad.size(0)), s); + mshadow::Tensor rhs_workspace = workspace[0]; + mshadow::Tensor lhs_workspace = workspace[1]; + if (param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x.T, y.T) + // dy = dot(x, dz).T = dot(dz.T, x.T) + // dx = dot(dz, y).T = dot(y.T, dz.T) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, + (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + lhs_workspace); + } + } else if (!param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x, y.T) + // dy = dot(x.T, dz).T = dot(dz.T, x) + // dx = dot(dz, y) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, + (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + lhs_workspace); + } + } else if (param.transpose_a && !param.transpose_b) { + // Gradient of z = dot(x.T, y) + // dy = dot(x, dz) + // dx = dot(dz, y.T).T = dot(y, dz.T) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + lhs_workspace); + } + } else { + // Gradient of z = dot(x, y) + // dy = dot(x.T, dz) + // dx = dot(dz, y.T) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, + (kAddTo == req[0]) ? 
(DType)1.0f : (DType)0.0f, + lhs_workspace); + } + } + }); +} + +inline bool BatchDotShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + const DotParam& param = nnvm::get(attrs.parsed); + TShape& lshape = (*in_attrs)[0]; + TShape& rshape = (*in_attrs)[1]; + if (lshape.ndim() == 3 && rshape.ndim() == 3) { + CHECK(lshape[0] == rshape[0]) + << "batch_dot shape error(batch_size must be equal): " << lshape << " X " << rshape + << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; + index_t out_m = param.transpose_a ? lshape[2] : lshape[1]; + index_t lshape_k = param.transpose_a ? lshape[1] : lshape[2]; + index_t out_n = param.transpose_b ? rshape[1] : rshape[2]; + index_t rshape_k = param.transpose_b ? rshape[2] : rshape[1]; + CHECK(lshape_k == rshape_k) + << "batch_dot shape error(shape mismatch): " << lshape << " X " << rshape + << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape3(lshape[0], out_m, out_n)); + } else { + LOG(FATAL) << "batch_dot currently only support 3D*3D array" + << lshape << " v.s. " << rshape; + } + return true; +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_DOT_INL_H_ diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc new file mode 100644 index 000000000000..c455702fc638 --- /dev/null +++ b/src/operator/tensor/dot.cc @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dot.cc + * \brief CPU Implementation of matrix dot + */ + +#include "./dot-inl.h" + +namespace mxnet { +namespace op { +DMLC_REGISTER_PARAMETER(DotParam); + +NNVM_REGISTER_OP(dot) +.describe(R"doc(Dot product of two arrays. + +``dot``'s behavior depends on the input array dimensions: + +- 1-D arrays: inner product of vectors +- 2-D arrays: matrix multiplication +- N-D arrays: a sum product over the last axis of the first input and the first + axis of the second input + + For example, given 3-D ``x`` with shape `(n,m,k)` and ``y`` with shape `(k,r,s)`, the + result array will have shape `(n,m,r,s)`. 
It is computed by:: + + dot(x,y)[i,j,a,b] = sum(x[i,j,:]*y[:,a,b]) + + Example:: + + x = reshape([0,1,2,3,4,5,6,7], shape=(2,2,2)) + y = reshape([7,6,5,4,3,2,1,0], shape=(2,2,2)) + dot(x,y)[0,0,1,1] = 0 + sum(x[0,0,:]*y[:,1,1]) = 0 + +The storage type of ``dot`` output depends on storage types of inputs and transpose options: + +- dot(csr, default) = default +- dot(csr.T, default) = row_sparse +- dot(csr, row_sparse) = default +- otherwise, ``dot`` generates output with default storage + +)doc" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"lhs", "rhs"}; + }) +.set_attr("FInferShape", DotShape) +.set_attr("FInferType", ElemwiseType<2, 1>) +.set_attr("FInferStorageType", DotForwardInferStorageType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", DotForward_) +.set_attr("FComputeEx", DotForwardEx) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_dot"}) +.add_argument("lhs", "NDArray-or-Symbol", "The first input") +.add_argument("rhs", "NDArray-or-Symbol", "The second input") +.add_arguments(DotParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_dot) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", DotBackwardInferStorageType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", DotBackward_) +.set_attr("FComputeEx", DotBackwardEx) +.add_arguments(DotParam::__FIELDS__()); + +NNVM_REGISTER_OP(batch_dot) +.describe(R"doc(Batchwise dot product. + +``batch_dot`` is used to compute dot product of ``x`` and ``y`` when ``x`` and +``y`` are data in batch, namely 3D arrays in shape of `(batch_size, :, :)`. + +For example, given ``x`` with shape `(batch_size, n, m)` and ``y`` with shape +`(batch_size, m, k)`, the result array will have shape `(batch_size, n, k)`, +which is computed by:: + + batch_dot(x,y)[i,:,:] = dot(x[i,:,:], y[i,:,:]) + +)doc" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"lhs", "rhs"}; + }) +.set_attr("FInferShape", BatchDotShape) +.set_attr("FInferType", ElemwiseType<2, 1>) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", BatchDotForward_) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_batch_dot"}) +.add_argument("lhs", "NDArray-or-Symbol", "The first input") +.add_argument("rhs", "NDArray-or-Symbol", "The second input") +.add_arguments(DotParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_batch_dot) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("TIsBackward", true) +.set_attr("FCompute", BatchDotBackward_); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/dot.cu b/src/operator/tensor/dot.cu new file mode 100644 index 000000000000..8ee2e2832fbb --- /dev/null +++ b/src/operator/tensor/dot.cu @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dot.cu + * \brief GPU Implementation of matrix dot + */ + +#include "./dot-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(dot) +.set_attr("FCompute", DotForward_) +.set_attr("FComputeEx", DotForwardEx); + +NNVM_REGISTER_OP(_backward_dot) +.set_attr("FCompute", DotBackward_) +.set_attr("FComputeEx", DotBackwardEx); + +NNVM_REGISTER_OP(batch_dot) +.set_attr("FCompute", BatchDotForward_); + +NNVM_REGISTER_OP(_backward_batch_dot) +.set_attr("FCompute", BatchDotBackward_); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc index c80d46a883ea..8c97849e20dc 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc @@ -123,6 +123,7 @@ Example:: .set_attr("FCompute", BinaryBroadcastCompute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mul"}); + NNVM_REGISTER_OP(_backward_broadcast_mul) .set_num_inputs(3) .set_num_outputs(2) diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 87b0d46a63c9..ddcad5e61ba0 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -28,10 +28,12 @@ #include #include #include +#include #include "../mxnet_op.h" #include "../mshadow_op.h" #include "../elemwise_op_common.h" -#include "../mxnet_op.h" +#include "./init_op.h" +#include "../../common/utils.h" namespace mxnet { namespace op { @@ -141,6 +143,120 @@ void BinaryBackwardUseNone_(const nnvm::NodeAttrs& attrs, } } +// TODO(haibin) This is a single-thread inefficient implementation +// This implementation only works on CPU +template +void BinaryComputeRspRspImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (req[0] == kNullOp) return; + CHECK(req[0] == kWriteTo) << "only kWriteTo is supported for rowsparse elemwise_add"; + using namespace rowsparse; + using namespace mshadow; + auto &lhs = inputs[0]; + auto &rhs = inputs[1]; + auto &output = outputs[0]; + + bool init_l = lhs.storage_initialized(); + bool init_r = rhs.storage_initialized(); + Stream *s = ctx.get_stream(); + // both inputs are zeros + if (!init_l && !init_r) { + NDArray out = output; + FillZerosRspImpl(s, &out); + return; + } + // Memory Estimation: This is (roughly) the number of result rows. 
We still + // need to subtract the number of common rows + unsigned int num_rows_l = lhs.aux_shape(kIdx)[0]; + unsigned int num_rows_r = rhs.aux_shape(kIdx)[0]; + unsigned int num_rows_total = num_rows_l + num_rows_r; + auto row_len = output.shape().ProdShape(1, output.shape().ndim()); + output.CheckAndAlloc({Shape1(num_rows_total)}); + CHECK_GT(row_len, 0); + MSHADOW_TYPE_SWITCH(output.dtype(), DType, { + MSHADOW_TYPE_SWITCH(lhs.aux_type(kIdx), IType, { + // Indices + auto indices_l = lhs.aux_data(kIdx).dptr(); + auto indices_r = rhs.aux_data(kIdx).dptr(); + auto indices_out = output.aux_data(kIdx).dptr(); + // Data + auto data_l = lhs.data().get_with_shape(Shape2(num_rows_l, row_len), s); + auto data_r = rhs.data().get_with_shape(Shape2(num_rows_r, row_len), s); + auto out = output.data().get_with_shape(Shape2(num_rows_total, row_len), s); + + // TODO(haibin) A more appropriate way: Copy to output, then apply ops + size_t iter_l = 0; + size_t iter_r = 0; + size_t iter_out = 0; + int32_t num_common_rows = 0; + while (iter_l < num_rows_l && iter_r < num_rows_r) { + auto idx_l = indices_l[iter_l]; + auto idx_r = indices_r[iter_r]; + if (idx_l == idx_r) { + // Same row + indices_out[iter_out] = idx_l; + Copy(out[iter_out], data_l[iter_l++], s); + out[iter_out] += data_r[iter_r++]; + num_common_rows++; + } else if (idx_l < idx_r) { + // Left only + indices_out[iter_out] = idx_l; + Copy(out[iter_out], data_l[iter_l++], s); + } else { + // Right only + indices_out[iter_out] = idx_r; + Copy(out[iter_out], data_r[iter_r++], s); + } + iter_out++; + } + // Copying over the rest of the rows + while (iter_l < num_rows_l) { + indices_out[iter_out] = indices_l[iter_l]; + Copy(out[iter_out++], data_l[iter_l++], s); + } + while (iter_r < num_rows_r) { + indices_out[iter_out] = indices_r[iter_r]; + Copy(out[iter_out++], data_r[iter_r++], s); + } + auto new_sshape = TShape(output.aux_shape(rowsparse::kIdx)); + CHECK_GT(new_sshape[0], num_common_rows); + new_sshape[0] -= num_common_rows; + output.set_aux_shape(rowsparse::kIdx, new_sshape); + }); + }); +} + +template +void BinaryComputeEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 2); + CHECK_EQ(outputs.size(), 1); + if (typeid(OP) == typeid(mshadow::op::plus)) { + // If any input is dense, fallback to FCompute + // TODO(haibin) implement dns + rsp in a separate kernel + if (common::ContainsDefaultStorage(inputs)) { + FCompExFallback(attrs, ctx, inputs, req, outputs, + BinaryCompute, "BinaryCompute"); + return; + } + CHECK_EQ(inputs[0].storage_type(), kRowSparseStorage) << "Sparse type not supported yet"; + CHECK_EQ(inputs[1].storage_type(), kRowSparseStorage) << "Sparse type not supported yet"; + BinaryComputeRspRspImpl(attrs, ctx, inputs, req, outputs); + return; + } else { + LOG(FATAL) << "Not implemented"; + } +} + template void BinaryBackwardUseNone(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -152,6 +268,55 @@ void BinaryBackwardUseNone(const nnvm::NodeAttrs& attrs, }); } +// Only implemented for _backward_add for now +template +void BinaryBackwardUseNoneRsp(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + CHECK_EQ(inputs[0].storage_type(), 
kRowSparseStorage); + CHECK_EQ(outputs[0].storage_type(), kRowSparseStorage); + CHECK_EQ(outputs[1].storage_type(), kRowSparseStorage); + CHECK(typeid(LOP) == typeid(mshadow_op::identity)); + CHECK(typeid(ROP) == typeid(mshadow_op::identity)); + TShape shape = inputs[0].aux_shape(rowsparse::kIdx); + outputs[0].CheckAndAlloc({shape}); + outputs[1].CheckAndAlloc({shape}); + MSHADOW_TYPE_SWITCH(outputs[0].dtype(), DType, { + MSHADOW_TYPE_SWITCH(outputs[0].aux_type(rowsparse::kIdx), IType, { + auto lgrad_idx = outputs[0].aux_data(rowsparse::kIdx).FlatTo1D(s); + auto rgrad_idx = outputs[1].aux_data(rowsparse::kIdx).FlatTo1D(s); + auto ograd_idx = inputs[0].aux_data(rowsparse::kIdx).FlatTo1D(s); + auto lgrad = outputs[0].data().FlatTo1D(s); + Tensor rgrad = outputs[1].data().FlatTo1D(s); + Tensor ograd = inputs[0].data().FlatTo1D(s); + ASSIGN_DISPATCH(lgrad, req[0], F(ograd)); + ASSIGN_DISPATCH(rgrad, req[1], F(ograd)); + ASSIGN_DISPATCH(lgrad_idx, req[0], F(ograd_idx)); + ASSIGN_DISPATCH(rgrad_idx, req[1], F(ograd_idx)); + }); + }); +} +// Only implemented for _backward_add for now +template +void BinaryBackwardUseNoneEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + auto stype = inputs[0].storage_type(); + CHECK_EQ(stype, kRowSparseStorage) << "Not implemented yet"; + BinaryBackwardUseNoneRsp(attrs, ctx, inputs, req, outputs); + // TODO(haibin) fallback for kDefaultStorage +} + template void BinaryBackwardUseNoneWithHalf2(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -232,7 +397,7 @@ void BinaryBackwardUseInWithHalf2(const nnvm::NodeAttrs& attrs, [](const NodeAttrs& attrs){ \ return std::vector >{{0, 0}, {1, 0}}; \ }) \ - .add_argument("lhs", "NDArray-or-Symbol", "first input") \ + .add_argument("lhs", "NDArray-or-Symbol", "first input") \ .add_argument("rhs", "NDArray-or-Symbol", "second input") } // namespace op diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index 65d4ca9aadd6..ed0b6fb96aa1 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -28,9 +28,18 @@ namespace mxnet { namespace op { MXNET_OPERATOR_REGISTER_BINARY(elemwise_add) .add_alias("_add").add_alias("_plus").add_alias("_Plus") -.describe("Adds arguments element-wise.") +.describe(R"code(Adds arguments element-wise. + +The storage type of ``elemwise_add`` output depends on storage types of inputs + +- elemwise_add(row_sparse, row_sparse) = row_sparse +- otherwise, ``elemwise_add`` generates output with default storage + +)code") .set_attr("FCompute", BinaryCompute) -.set_attr("FGradient", CloneGradient{"_backward_add"}); +.set_attr("FGradient", CloneGradient{"_backward_add"}) +.set_attr("FComputeEx", BinaryComputeEx) +.set_attr("FInferStorageType", ElemwiseStorageType<2, 1>); // specialized gradient add function to do add to optimization // this must differ from elemwise_add to prevent add to optimization in forward pass. 
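With this registration, elemwise_add(row_sparse, row_sparse) goes through BinaryComputeRspRspImpl above: the output is over-allocated to num_rows_l + num_rows_r stored rows, the two sorted row-index lists are merged with two pointers (summing rows present in both inputs), and the aux shape is shrunk at the end by the number of common rows. A compact Python model of that merge, with plain lists and arrays standing in for the NDArray pieces::

    import numpy as np

    def add_rsp_rsp(idx_l, data_l, idx_r, data_r):
        idx_out, rows_out = [], []
        i = j = 0
        while i < len(idx_l) and j < len(idx_r):
            if idx_l[i] == idx_r[j]:                  # common row: add
                idx_out.append(idx_l[i])
                rows_out.append(data_l[i] + data_r[j])
                i += 1
                j += 1
            elif idx_l[i] < idx_r[j]:                 # left-only row: copy
                idx_out.append(idx_l[i])
                rows_out.append(data_l[i])
                i += 1
            else:                                     # right-only row: copy
                idx_out.append(idx_r[j])
                rows_out.append(data_r[j])
                j += 1
        idx_out += list(idx_l[i:]) + list(idx_r[j:])  # trailing rows
        rows_out += list(data_l[i:]) + list(data_r[j:])
        return np.array(idx_out), np.array(rows_out)

    idx, rows = add_rsp_rsp([0, 2], np.array([[1.], [1.]]),
                            [2, 3], np.array([[2.], [3.]]))
    assert idx.tolist() == [0, 2, 3]                  # row 2 was common
    assert rows.ravel().tolist() == [1., 3., 3.]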
@@ -46,7 +55,10 @@ NNVM_REGISTER_OP(_backward_add) return std::vector >{{0, 0}, {0, 1}}; }) .set_attr("FCompute", BinaryBackwardUseNone); + mshadow_op::identity>) +.set_attr("FComputeEx", + BinaryBackwardUseNoneEx) +.set_attr("FInferStorageType", ElemwiseStorageType<1, 2>); MXNET_OPERATOR_REGISTER_BINARY(_sub) .add_alias("_minus").add_alias("_Minus") diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index 652be72f3fab..40757300c68d 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -22,6 +22,7 @@ * \brief elementwise sum operator */ #include "./elemwise_sum.h" +#include "../../ndarray/ndarray_function.h" namespace mxnet { namespace op { @@ -54,6 +55,53 @@ std::vector ElementWiseSumGrad( return ret; } +bool ElementWiseSumShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(out_attrs->size(), 1); + return ElemwiseAttr( + attrs, in_attrs, out_attrs, TShape()); +} + +bool ElementWiseSumType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(out_attrs->size(), 1); + return ElemwiseAttr( + attrs, in_attrs, out_attrs, -1); +} + +bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK(!in_attrs->empty()); + CHECK_EQ(out_attrs->size(), 1U); + return ElemwiseStorageAttr( + attrs, in_attrs, out_attrs); +} + +void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK(!inputs.empty()); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + if (req[0] == kNullOp) return; + CHECK_EQ(req[0], kWriteTo) << "ElementWiseSumComputeExCPU only supports req = kWriteTo"; + using namespace mshadow; + Stream* s = ctx.get_stream(); + NDArray out_nd = outputs[0]; + if (inputs[0].storage_type() == kRowSparseStorage) { + mxnet::ndarray::ElementwiseSum(s, inputs, &out_nd); + } else { + FCompExFallback(attrs, ctx, inputs, req, outputs, + ElementWiseSumCompute, "ElementWiseSumCompute"); + } +} + NNVM_REGISTER_OP(add_n) .add_alias("ElementWiseSum") .describe(R"doc(Adds all input arguments element-wise. @@ -62,6 +110,12 @@ NNVM_REGISTER_OP(add_n) add\_n(a_1, a_2, ..., a_n) = a_1 + a_2 + ... + a_n ``add_n`` is potentially more efficient than calling ``add`` by `n` times. + +The storage type of ``add_n`` output depends on storage types of inputs + +- add_n(row_sparse, row_sparse, ..) 
= row_sparse +- otherwise, ``add_n`` generates output with default storage + )doc" ADD_FILELINE) .set_attr_parser(ParamParser) .set_num_inputs([](const nnvm::NodeAttrs& attrs) { @@ -79,16 +133,16 @@ NNVM_REGISTER_OP(add_n) }) .set_attr("key_var_num_args", "num_args") .set_attr("FCompute", ElementWiseSumCompute) +.set_attr("FComputeEx", ElementWiseSumComputeExCPU) .set_attr( "FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) -.set_attr("FInferShape", ElemwiseShape<-1, 1>) -.set_attr("FInferType", ElemwiseType<-1, 1>) -.set_attr("FGradient", CloneGradient{"_backward_add_n"}) +.set_attr("FInferShape", ElementWiseSumShape) +.set_attr("FInferType", ElementWiseSumType) +.set_attr("FInferStorageType", ElementWiseSumForwardInferStorageType) +.set_attr("FGradient", ElementWiseSumGrad) .add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments"); - - } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op.cc b/src/operator/tensor/elemwise_unary_op.cc index defe72d3738c..e94b8bfb9fea 100644 --- a/src/operator/tensor/elemwise_unary_op.cc +++ b/src/operator/tensor/elemwise_unary_op.cc @@ -70,7 +70,9 @@ MXNET_OPERATOR_REGISTER_UNARY(_copy) [](const NodeAttrs& attrs){ return std::vector{true}; }) +.set_attr("FInferStorageType", ElemwiseStorageType<1, 1>) .set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", IdentityComputeEx) .set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); NNVM_REGISTER_OP(_backward_copy) @@ -85,7 +87,9 @@ NNVM_REGISTER_OP(_backward_copy) [](const NodeAttrs& attrs){ return std::vector{true}; }) -.set_attr("FCompute", IdentityCompute); +.set_attr("FInferStorageType", ElemwiseStorageType<1, 1>) +.set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", IdentityComputeEx); MXNET_OPERATOR_REGISTER_UNARY(BlockGrad) .add_alias("stop_gradient") @@ -162,7 +166,9 @@ NNVM_REGISTER_OP(_identity_with_attr_like_rhs) .set_attr("FIgnoreInputs", [](const NodeAttrs& attrs) { return std::vector(1, 1); }) .set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", IdentityLikeRhsComputeEx) .set_attr("FInferShape", ElemwiseShape<2, 1>) +.set_attr("FInferStorageType", IdentityAttrLikeRhsStorageType) .set_attr( "FGradient", [](const nnvm::NodePtr& n, const std::vector& ograds) { @@ -219,6 +225,7 @@ NNVM_REGISTER_OP(_backward_cast) }) .set_attr("FCompute", CastCompute); + // negative MXNET_OPERATOR_REGISTER_UNARY(negative) .MXNET_DESCRIBE("Numerical negative of the argument, element-wise.") diff --git a/src/operator/tensor/elemwise_unary_op.cu b/src/operator/tensor/elemwise_unary_op.cu index 4211ea305b4e..f5d711c01a29 100644 --- a/src/operator/tensor/elemwise_unary_op.cu +++ b/src/operator/tensor/elemwise_unary_op.cu @@ -40,7 +40,8 @@ NNVM_REGISTER_OP(_backward_sigmoid) // copy NNVM_REGISTER_OP(_copy) -.set_attr("FCompute", IdentityCompute); +.set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", IdentityComputeEx); NNVM_REGISTER_OP(_backward_copy) .set_attr("FCompute", IdentityCompute); @@ -53,7 +54,9 @@ NNVM_REGISTER_OP(make_loss) // identity output as first input, but attributes are constrainted to be like rhs NNVM_REGISTER_OP(_identity_with_attr_like_rhs) -.set_attr("FCompute", IdentityCompute); +.set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", IdentityLikeRhsComputeEx); + NNVM_REGISTER_OP(Cast) .set_attr("FCompute", CastCompute); diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index b6994844e0fe..16477b1973d3 100644 --- 
a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -31,15 +31,17 @@ #include "../mshadow_op.h" #include "../elemwise_op_common.h" #include "../special_functions-inl.h" +#include "./broadcast_reduce-inl.h" +#include "./init_op.h" namespace mxnet { namespace op { template void UnaryLaunch(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { using namespace mshadow; using namespace mxnet_op; Stream *s = ctx.get_stream(); @@ -95,6 +97,108 @@ void IdentityCompute(const nnvm::NodeAttrs& attrs, }); } +template +void IdentityComputeRspRspImpl(const nnvm::NodeAttrs& attrs, + mshadow::Stream *s, + const NDArray& input, + const OpReqType req, + NDArray* output) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace rowsparse; + if (req == kNullOp) return; + CHECK_EQ(req, kWriteTo) << "kWriteTo is expected for IdentityComputeRspRspImpl"; + if (!input.storage_initialized()) { + FillZerosRspImpl(s, output); + return; + } + TShape shape = input.aux_shape(kIdx); + output->CheckAndAlloc({shape}); + MSHADOW_TYPE_SWITCH(output->dtype(), DType, { + MSHADOW_TYPE_SWITCH(output->aux_type(kIdx), AuxType, { + auto out_d = output->data().FlatTo1D(s); + auto out_aux = output->aux_data(kIdx).FlatTo1D(s); + auto in_aux = input.aux_data(kIdx).FlatTo1D(s); + ASSIGN_DISPATCH(out_d, req, + F(input.data().FlatTo1D(s))); + ASSIGN_DISPATCH(out_aux, req, F(in_aux)); + }); + }); +} + +template +void IdentityComputeEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const auto in_stype = inputs[0].storage_type(); + const auto out_stype = outputs[0].storage_type(); + mshadow::Stream *s = ctx.get_stream(); + if (req[0] == kNullOp) return; + if (in_stype == out_stype) { + if (in_stype == kDefaultStorage) { // dense ndarray + IdentityCompute(attrs, ctx, {inputs[0].data()}, req, {outputs[0].data()}); + } else if (in_stype == kRowSparseStorage || in_stype == kCSRStorage) { // sparse ndarray + if (!inputs[0].storage_initialized()) { + FillComputeZerosEx(attrs, ctx, inputs, req, outputs); + return; + } + CHECK_NE(req[0], kAddTo) << "kAddTo is not supported for IdentityComputeEx"; + const size_t n = mxnet::num_aux_data(out_stype); + outputs[0].CheckAndAlloc(inputs[0].aux_shapes()); + IdentityCompute(attrs, ctx, {inputs[0].data()}, req, {outputs[0].data()}); + for (size_t i = 0; i < n; ++i) { + IdentityCompute(attrs, ctx, {inputs[0].aux_data(i)}, req, {outputs[0].aux_data(i)}); + } + } else { + LOG(FATAL) << "IdentityComputeEx does not support input stype = " << in_stype; + } + } else { + FCompExFallback(attrs, ctx, inputs, req, outputs, IdentityCompute, "IdentityCompute"); + } +} + +inline bool IdentityAttrLikeRhsStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + // TODO(junwu): add ctx info into storage inference logic + CHECK_EQ(in_attrs->size(), static_cast(2)) << " in operator " << attrs.name; + CHECK_EQ(out_attrs->size(), static_cast(1)) << " in operator " << attrs.name; + auto &in = *in_attrs; + auto &out = *out_attrs; + CHECK_NE(in[1], kUndefinedStorage) << "rhs storage type must be known"; + if (in[0] == 
kUndefinedStorage) STORAGE_TYPE_ASSIGN_CHECK(in, 0, in[1]); + if (out[0] == kUndefinedStorage) STORAGE_TYPE_ASSIGN_CHECK(out, 0, in[1]); + return true; +} + +template +void IdentityLikeRhsComputeEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(inputs.size(), 2); + CHECK_EQ(outputs.size(), 1); + Stream *s = ctx.get_stream(); + const auto in_stype = inputs[0].storage_type(); + const auto out_stype = outputs[0].storage_type(); + if (in_stype == out_stype) { + std::vector in{inputs[0]}; + IdentityComputeEx(attrs, ctx, in, req, outputs); + } else { + LOG(FATAL) << "IdentityLikeRhsComputeEx not implemented for in_stype = " << in_stype + << " out_stype = " << out_stype; + } +} + struct CastParam : public dmlc::Parameter { // use int for enumeration int dtype; @@ -186,4 +290,5 @@ struct relu_grad { } // namespace op } // namespace mxnet + #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_H_ diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index e5cb41088e22..8c5d4f5411f8 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -104,7 +104,6 @@ NNVM_REGISTER_OP(_backward_Embedding) .set_attr("TIsBackward", true) .set_attr("FCompute", EmbeddingOpBackward); - NNVM_REGISTER_OP(take) .describe(R"code(Takes elements from an input array along the given axis. diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index ef42b01fb5b6..a9ee408082d4 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -40,6 +40,9 @@ #include "../elemwise_op_common.h" #include "../mxnet_op.h" #include "./sort_op.h" +#include "./dot-inl.h" +#include "./init_op.h" +#include "./matrix_op-inl.h" namespace mxnet { namespace op { diff --git a/src/operator/tensor/init_op.cc b/src/operator/tensor/init_op.cc index 8dac22a64966..9f333d2d5efe 100644 --- a/src/operator/tensor/init_op.cc +++ b/src/operator/tensor/init_op.cc @@ -39,6 +39,7 @@ NNVM_REGISTER_OP(_zeros) .set_attr("FInferShape", InitShape) .set_attr("FInferType", InitType) .set_attr("FCompute", FillCompute) +.set_attr("FComputeEx", FillComputeZerosEx) .add_arguments(InitOpParam::__FIELDS__()); NNVM_REGISTER_OP(_ones) diff --git a/src/operator/tensor/init_op.cu b/src/operator/tensor/init_op.cu index 6e2b65cc8519..cbee203c2b31 100644 --- a/src/operator/tensor/init_op.cu +++ b/src/operator/tensor/init_op.cu @@ -27,7 +27,8 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_zeros) -.set_attr("FCompute", FillCompute); +.set_attr("FCompute", FillCompute) +.set_attr("FComputeEx", FillComputeZerosEx); NNVM_REGISTER_OP(_ones) .set_attr("FCompute", FillCompute); diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index bdc74d332491..0cd81d77133c 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -33,6 +33,8 @@ #include #include #include "../elemwise_op_common.h" +#include "../mxnet_op.h" + namespace mxnet { namespace op { @@ -129,7 +131,6 @@ inline bool InitType(const nnvm::NodeAttrs& attrs, return true; } - template void FillCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -145,6 +146,91 @@ void FillCompute(const nnvm::NodeAttrs& attrs, }); } +// Fill in the indices and values of a RowSparse NDArray to represent a zeros NDArray, +// instead of the usual compact representation. 
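For illustration, a minimal NumPy sketch (not MXNet code) of the two zero representations contrasted by the comment above: the compact form keeps an empty row-index array, while the "dense zeros" form enumerates every row index with explicit zero values. The shape and dtypes below are made-up assumptions.

import numpy as np

shape = (4, 3)

# compact zeros: no rows stored at all
compact_idx = np.zeros((0,), dtype=np.int64)
compact_data = np.zeros((0, shape[1]), dtype=np.float32)

# "dense" zeros: all row indices present, values filled with zeros
dense_idx = np.arange(shape[0], dtype=np.int64)      # [0, 1, 2, 3]
dense_data = np.zeros(shape, dtype=np.float32)

# both expand to the same dense array
def to_dense(idx, data, shape):
    out = np.zeros(shape, dtype=data.dtype)
    out[idx] = data
    return out

assert np.array_equal(to_dense(compact_idx, compact_data, shape),
                      to_dense(dense_idx, dense_data, shape))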
+template +inline void FillDnsZerosRspImpl(mshadow::Stream *s, NDArray *dst) { + using namespace rowsparse; + using namespace mshadow::expr; + using namespace mshadow; + using namespace mxnet_op; + CHECK_EQ(dst->storage_type(), kRowSparseStorage); + MSHADOW_REAL_TYPE_SWITCH(dst->dtype(), DType, { + MSHADOW_IDX_TYPE_SWITCH(dst->aux_type(kIdx), IType, { + auto num_rows = dst->shape()[0]; + dst->CheckAndAlloc({Shape1(num_rows)}); + auto idx = dst->aux_data(kIdx).FlatTo1D(s); + auto val = dst->data(); + Kernel::Launch(s, val.Size(), val.dptr()); + ASSIGN_DISPATCH(idx, kWriteTo, range(0, num_rows, 1, 1)); + }); + }); +} + +struct PopulateFullIdxRspKernel { + template + MSHADOW_XINLINE static void Map(int i, IType* out) { + KERNEL_ASSIGN(out[i], kWriteTo, i); + } +}; + +// Fill full indices NDArray with zeros by updating the aux shape. +template +void PopulateFullIdxRspImpl(mshadow::Stream *s, NDArray *dst) { + using namespace rowsparse; + CHECK_EQ(dst->storage_type(), kRowSparseStorage); + nnvm::dim_t nnr = dst->shape()[0]; + dst->CheckAndAllocAuxData(kIdx, mshadow::Shape1(nnr)); + MSHADOW_IDX_TYPE_SWITCH(dst->aux_type(kIdx), IType, { + IType* idx = dst->aux_data(kIdx).dptr(); + mxnet_op::Kernel::Launch(s, nnr, idx); + }); +} + +// Fill a rsp NDArray with zeros by updating the aux shape. +template +void FillZerosRspImpl(mshadow::Stream *s, NDArray *dst) { + if (!dst->storage_initialized()) return; + // reset the shapes if it's not zeros + auto storage_shape = dst->storage_shape(); + storage_shape[0] = 0; + dst->set_aux_shape(rowsparse::kIdx, TShape(mshadow::Shape1(0))); +} + +// Fill a CSR NDArray with zeros by updating the aux shape. +template +void FillZerosCsrImpl(mshadow::Stream *s, NDArray *dst) { + if (!dst->storage_initialized()) return; + // reset the shapes if it's not zeros + TShape new_shape(mshadow::Shape1(0)); + dst->set_aux_shape(csr::kIndPtr, new_shape); + dst->set_aux_shape(csr::kIdx, new_shape); +} + +template +void FillComputeZerosEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + CHECK_EQ(outputs.size(), 1); + auto stype = outputs[0].storage_type(); + if (req[0] == kNullOp) return; + CHECK_EQ(req[0], kWriteTo) << "kWriteTo is expected for FillComputeZerosEx"; + if (stype == kRowSparseStorage) { + NDArray nd(outputs[0]); + FillZerosRspImpl(s, &nd); + } else if (stype == kCSRStorage) { + NDArray nd(outputs[0]); + FillZerosCsrImpl(s, &nd); + } else { + // no fallback is required since the output doesn't depend on input + LOG(FATAL) << "storage type " << stype << " not implemented."; + } +} template void RangeCompute(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index af0de593c1be..4654b37ab2bc 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "../mshadow_op.h" #include "../elemwise_op_common.h" #include "../channel_op_common.h" @@ -368,364 +369,6 @@ inline bool ExpandDimShape(const nnvm::NodeAttrs& attrs, return true; } -struct DotParam : public dmlc::Parameter { - bool transpose_a; - bool transpose_b; - DMLC_DECLARE_PARAMETER(DotParam) { - DMLC_DECLARE_FIELD(transpose_a) - .describe("If true then transpose the first input before dot.") - .set_default(false); - DMLC_DECLARE_FIELD(transpose_b) - .describe("If true then 
transpose the second input before dot.") - .set_default(false); - } -}; - -template -void DotForward_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - const DotParam& param = nnvm::get(attrs.parsed); - Stream *s = ctx.get_stream(); - CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_) - << "Binary function only support input/output with the same type"; - CHECK_EQ(outputs[0].type_flag_, inputs[1].type_flag_) - << "Binary function only support input/output with the same type"; - CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) - << "dot only supports float32 and float64"; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - if (inputs[0].ndim() == 1 && inputs[1].ndim() == 1) { - CHECK_NE(req[0], kAddTo) << "AddTo not yet suported"; - Tensor out = outputs[0].get(s); - VectorDot(out, - inputs[0].get(s), - inputs[1].get(s)); - } else { - int ma, na, mb, nb, m, n; - if (param.transpose_a) { - ma = inputs[0].size(0); - na = inputs[0].Size()/ma; - m = na; - } else { - na = inputs[0].size(inputs[0].ndim()-1); - ma = inputs[0].Size()/na; - m = ma; - } - if (param.transpose_b) { - nb = inputs[1].size(inputs[1].ndim()-1); - mb = inputs[1].Size()/nb; - n = mb; - } else { - mb = inputs[1].size(0); - nb = inputs[1].Size()/mb; - n = nb; - } - Tensor input0 = - inputs[0].get_with_shape(Shape2(ma, na), s); - Tensor input1 = - inputs[1].get_with_shape(Shape2(mb, nb), s); - Tensor out = - outputs[0].get_with_shape(Shape2(m, n), s); - if (param.transpose_a && param.transpose_b) { - ASSIGN_DISPATCH(out, req[0], dot(input0.T(), input1.T())); - } else if (!param.transpose_a && param.transpose_b) { - ASSIGN_DISPATCH(out, req[0], dot(input0, input1.T())); - } else if (param.transpose_a && !param.transpose_b) { - ASSIGN_DISPATCH(out, req[0], dot(input0.T(), input1)); - } else { - ASSIGN_DISPATCH(out, req[0], dot(input0, input1)); - } - } - }); -} - -template -void DotBackward_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - const DotParam& param = nnvm::get(attrs.parsed); - Stream *s = ctx.get_stream(); - CHECK_NE(req[0], kWriteInplace); - CHECK_NE(req[1], kWriteInplace); - CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) - << "dot only supports float32 and float64"; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - if (inputs[1].ndim() == 1 && inputs[2].ndim() == 1) { - Tensor mout_grad = inputs[0].get(s); - Tensor mlhs_data = inputs[1].get(s); - Tensor mrhs_data = inputs[2].get(s); - Tensor mlhs_grad = outputs[0].get(s); - Tensor mrhs_grad = outputs[1].get(s); - ASSIGN_DISPATCH(mrhs_grad, req[1], - broadcast_scalar(mout_grad, mlhs_data.shape_) * mlhs_data); - ASSIGN_DISPATCH(mlhs_grad, req[0], - broadcast_scalar(mout_grad, mlhs_data.shape_) * mrhs_data); - } else { - int ma, na, mb, nb, m, n; - if (param.transpose_a) { - ma = outputs[0].size(0); - na = outputs[0].Size()/ma; - m = na; - } else { - na = outputs[0].size(outputs[0].ndim()-1); - ma = outputs[0].Size()/na; - m = ma; - } - if (param.transpose_b) { - nb = outputs[1].size(outputs[1].ndim()-1); - mb = outputs[1].Size()/nb; - n = mb; - } else { - mb = outputs[1].size(0); - nb = outputs[1].Size()/mb; - n = nb; - } - Tensor mout_grad = - inputs[0].get_with_shape(Shape2(m, n), s); - Tensor 
mlhs_data = - inputs[1].get_with_shape(Shape2(ma, na), s); - Tensor mrhs_data = - inputs[2].get_with_shape(Shape2(mb, nb), s); - Tensor mlhs_grad = - outputs[0].get_with_shape(Shape2(ma, na), s); - Tensor mrhs_grad = - outputs[1].get_with_shape(Shape2(mb, nb), s); - if (param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x.T, y.T) - // dy = dot(x, dz).T = dot(dz.T, x.T) - // dx = dot(dz, y).T = dot(y.T, dz.T) - ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mout_grad.T(), mlhs_data.T())); - ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mrhs_data.T(), mout_grad.T())); - } else if (!param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x, y.T) - // dy = dot(x.T, dz).T = dot(dz.T, x) - // dx = dot(dz, y) - ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mout_grad.T(), mlhs_data)); - ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mout_grad, mrhs_data)); - } else if (param.transpose_a && !param.transpose_b) { - // Gradient of z = dot(x.T, y) - // dy = dot(x, dz) - // dx = dot(dz, y.T).T = dot(y, dz.T) - ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mlhs_data, mout_grad)); - ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mrhs_data, mout_grad.T())); - } else { - // Gradient of z = dot(x, y) - // dy = dot(x.T, dz) - // dx = dot(dz, y.T) - ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mlhs_data.T(), mout_grad)); - ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mout_grad, mrhs_data.T())); - } - } - }); -} - -inline bool DotShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const DotParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(in_attrs->size(), 2U); - CHECK_EQ(out_attrs->size(), 1U); - TShape& lshape = (*in_attrs)[0]; - TShape& rshape = (*in_attrs)[1]; - if (lshape.ndim() == 1 && rshape.ndim() == 1) { - CHECK(!param.transpose_a && !param.transpose_b) << "Cannot transpose vectors"; - CHECK_EQ(lshape[0], rshape[0]) << "dot shape error: " << lshape << " X " << rshape; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape1(1)); - } else { - bool Ta = param.transpose_a, Tb = param.transpose_b; - TShape L[2], R[2]; - if (Ta) { - L[0] = mshadow::Shape1(lshape[0]); - L[1] = lshape.ndim() > 1 ? TShape(&lshape[1], &lshape[lshape.ndim()]) : TShape(1); - } else { - L[0] = lshape.ndim() > 1 ? TShape(&lshape[0], &lshape[lshape.ndim()-1]) : TShape(1); - L[1] = mshadow::Shape1(lshape[lshape.ndim()-1]); - } - if (Tb) { - R[0] = rshape.ndim() > 1 ? TShape(&rshape[0], &rshape[rshape.ndim()-1]) : TShape(1); - R[1] = mshadow::Shape1(rshape[rshape.ndim()-1]); - } else { - R[0] = mshadow::Shape1(rshape[0]); - R[1] = rshape.ndim() > 1 ? 
TShape(&rshape[1], &rshape[rshape.ndim()]) : TShape(1); - } - - if (L[!Ta].Size() != 0 && R[Tb].Size() != 0) { - CHECK_EQ(L[!Ta].Size(), R[Tb].Size()) - << "dot shape error: " << lshape << " X " << rshape; - } - std::vector buf; - if (lshape.ndim() > 1) buf.insert(buf.end(), &L[Ta][0], &L[Ta][L[Ta].ndim()]); - if (rshape.ndim() > 1) buf.insert(buf.end(), &R[!Tb][0], &R[!Tb][R[!Tb].ndim()]); - TShape oshape(buf.begin(), buf.end()); - SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - } - return true; -} - -template -void BatchDotForward_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - mshadow::Stream *s = ctx.get_stream(); - const DotParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_) - << "Binary function only support input/output with the same type"; - CHECK_EQ(outputs[0].type_flag_, inputs[1].type_flag_) - << "Binary function only support input/output with the same type"; - CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) - << "dot only supports float32 and float64"; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - mshadow::Tensor out = outputs[0].get(s); - mshadow::Tensor mlhs = inputs[0].get(s); - mshadow::Tensor mrhs = inputs[1].get(s); - mshadow::Tensor workspace = - ctx.requested[0].get_space_typed(mshadow::Shape1(3 * out.size(0)), s); - if (kNullOp != req[0]) { - if (param.transpose_a && param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else if (!param.transpose_a && param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else if (param.transpose_a && !param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } - } - }); -} - -template -void BatchDotBackward_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - mshadow::Stream *s = ctx.get_stream(); - const DotParam& param = nnvm::get(attrs.parsed); - CHECK_NE(req[1], kWriteInplace); - CHECK_NE(req[0], kWriteInplace); - CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) - << "dot only supports float32 and float64"; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - mshadow::Tensor mout_grad = inputs[0].get(s); - mshadow::Tensor mlhs_data = inputs[1].get(s); - mshadow::Tensor mrhs_data = inputs[2].get(s); - mshadow::Tensor mlhs_grad = outputs[0].get(s); - mshadow::Tensor mrhs_grad = outputs[1].get(s); - mshadow::Tensor workspace = - ctx.requested[0].get_space_typed( - mshadow::Shape2(2, 3 * mout_grad.size(0)), s); - mshadow::Tensor rhs_workspace = workspace[0]; - mshadow::Tensor lhs_workspace = workspace[1]; - if (param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x.T, y.T) - // dy = dot(x, dz).T = dot(dz.T, x.T) - // dx = dot(dz, y).T = dot(y.T, dz.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, - (kAddTo == req[1]) ? 
(DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } - } else if (!param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x, y.T) - // dy = dot(x.T, dz).T = dot(dz.T, x) - // dx = dot(dz, y) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } - } else if (param.transpose_a && !param.transpose_b) { - // Gradient of z = dot(x.T, y) - // dy = dot(x, dz) - // dx = dot(dz, y.T).T = dot(y, dz.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } - } else { - // Gradient of z = dot(x, y) - // dy = dot(x.T, dz) - // dx = dot(dz, y.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } - } - }); -} - -inline bool BatchDotShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 2U); - CHECK_EQ(out_attrs->size(), 1U); - const DotParam& param = nnvm::get(attrs.parsed); - TShape& lshape = (*in_attrs)[0]; - TShape& rshape = (*in_attrs)[1]; - if (lshape.ndim() == 3 && rshape.ndim() == 3) { - CHECK(lshape[0] == rshape[0]) - << "batch_dot shape error(batch_size must be equal): " << lshape << " X " << rshape - << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; - index_t out_m = param.transpose_a ? lshape[2] : lshape[1]; - index_t lshape_k = param.transpose_a ? lshape[1] : lshape[2]; - index_t out_n = param.transpose_b ? rshape[1] : rshape[2]; - index_t rshape_k = param.transpose_b ? rshape[2] : rshape[1]; - CHECK(lshape_k == rshape_k) - << "batch_dot shape error(shape mismatch): " << lshape << " X " << rshape - << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape3(lshape[0], out_m, out_n)); - } else { - LOG(FATAL) << "batch_dot currently only support 3D*3D array" - << lshape << " v.s. " << rshape; - } - return true; -} - struct SliceParam : public dmlc::Parameter { nnvm::Tuple > begin, end; DMLC_DECLARE_PARAMETER(SliceParam) { @@ -845,6 +488,96 @@ void Slice(const nnvm::NodeAttrs& attrs, }); } +// slice the indptr of a csr +struct SliceCsrIndPtr { + template + MSHADOW_XINLINE static void Map(int i, IType* out, const IType* in, const IType* base) { + KERNEL_ASSIGN(out[i], kWriteTo, in[i] - *base); + } +}; + +/* + * a wrapper to launch SliceCsrIndPtr kernel. + * slice [src[begin] .. 
src[end]) and store in dst[0, end - begin) + */ +template +void SliceCsrIndPtrImpl(const int begin, const int end, RunContext ctx, + const IType* src, IType* dst) { + using namespace mshadow; + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + int indptr_len = end - begin + 1; + Kernel::Launch(s, indptr_len, dst, src + begin, src + begin); +} + +/* + * Slice a CSR NDArray + * Only implemented for CPU + */ +template +void SliceCsrImpl(const SliceParam ¶m, const OpContext& ctx, + const NDArray &in, OpReqType req, const NDArray &out) { + using namespace mshadow; + using namespace mxnet_op; + using namespace csr; + CHECK((std::is_same::value)) << "Slice for CSR input only implemented for CPU"; + if (req == kNullOp) return; + CHECK_NE(req, kAddTo) << "kAddTo for Slice on CSR input is not supported"; + CHECK_NE(req, kWriteInplace) << "kWriteInplace for Slice on CSR input is not supported"; + Stream *s = ctx.get_stream(); + int begin = *param.begin[0]; + int end = *param.end[0]; + int indptr_len = end - begin + 1; + out.CheckAndAllocAuxData(kIndPtr, Shape1(indptr_len)); + if (!in.storage_initialized()) { + out.set_aux_shape(kIndPtr, Shape1(0)); + return; + } + // assume idx indptr share the same type + MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIndPtr), RType, { + MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIdx), IType, { + MSHADOW_TYPE_SWITCH(in.dtype(), DType, { + auto in_indptr = in.aux_data(kIndPtr).dptr(); + auto out_indptr = out.aux_data(kIndPtr).dptr(); + SliceCsrIndPtrImpl(begin, end, ctx.run_ctx, in_indptr, out_indptr); + + // retrieve nnz (CPU implementation) + int nnz = out_indptr[indptr_len - 1]; + // copy indices and values + out.CheckAndAllocAuxData(kIdx, Shape1(nnz)); + out.CheckAndAllocData(Shape1(nnz)); + auto in_idx = in.aux_data(kIdx).dptr(); + auto out_idx = out.aux_data(kIdx).dptr(); + auto in_data = in.data().dptr(); + auto out_data = out.data().dptr(); + int offset = in_indptr[begin]; + // this is also a CPU-only implementation + memcpy(out_idx, in_idx + offset, nnz * sizeof(IType)); + memcpy(out_data, in_data + offset, nnz * sizeof(DType)); + }); + }); + }); +} + +template +void SliceEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1); + CHECK_EQ(outputs.size(), 1); + const SliceParam& param = nnvm::get(attrs.parsed); + auto in_stype = inputs[0].storage_type(); + CHECK_NE(in_stype, kDefaultStorage) + << "SliceEx is not expected to execute for input with default storage type"; + if (in_stype == kCSRStorage) { + SliceCsrImpl(param, ctx, inputs[0], req[0], outputs[0]); + } else { + LOG(FATAL) << "Slice not implemented for storage type" << in_stype; + } +} + inline bool SliceAssignShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index e7e8f5548a1c..d409b9ec6056 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -34,7 +34,6 @@ DMLC_REGISTER_PARAMETER(ClipParam); DMLC_REGISTER_PARAMETER(SimpleCropAssignScalarParam); DMLC_REGISTER_PARAMETER(SliceParam); DMLC_REGISTER_PARAMETER(SliceAxisParam); -DMLC_REGISTER_PARAMETER(DotParam); DMLC_REGISTER_PARAMETER(RepeatParam); DMLC_REGISTER_PARAMETER(TileParam); DMLC_REGISTER_PARAMETER(ReverseParam); @@ -263,6 +262,9 @@ and ``end=(e_1, e_2, ... 
e_n)`` indices will result in an array with the shape The resulting array's *k*-th dimension contains elements from the *k*-th dimension of the input array with the open range ``[b_k, e_k)``. +For an input array of non-default storage type(e.g. `csr` or `row_sparse`), it only supports +slicing on the first dimension. + Example:: x = [[ 1., 2., 3., 4.], @@ -276,8 +278,10 @@ Example:: .set_attr_parser(ParamParser) .set_attr("FInferShape", SliceShape) .set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FInferStorageType", ElemwiseStorageType<1, 1>) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_slice"}) .set_attr("FCompute", Slice) +.set_attr("FComputeEx", SliceEx) .add_argument("data", "NDArray-or-Symbol", "Source input") .add_arguments(SliceParam::__FIELDS__()); @@ -370,94 +374,6 @@ NNVM_REGISTER_OP(_backward_slice_axis) .set_attr("TIsBackward", true) .set_attr("FCompute", SliceAxisGrad_); -NNVM_REGISTER_OP(dot) -.describe(R"doc(Dot product of two arrays. - -``dot``'s behavior depends on the input array dimensions: - -- 1-D arrays: inner product of vectors -- 2-D arrays: matrix multiplication -- N-D arrays: a sum product over the last axis of the first input and the first - axis of the second input - - For example, given 3-D ``x`` with shape `(n,m,k)` and ``y`` with shape `(k,r,s)`, the - result array will have shape `(n,m,r,s)`. It is computed by:: - - dot(x,y)[i,j,a,b] = sum(x[i,j,:]*y[:,a,b]) - - Example:: - - x = reshape([0,1,2,3,4,5,6,7], shape=(2,2,2)) - y = reshape([7,6,5,4,3,2,1,0], shape=(2,2,2)) - dot(x,y)[0,0,1,1] = 0 - sum(x[0,0,:]*y[:,1,1]) = 0 -)doc" ADD_FILELINE) -.set_num_inputs(2) -.set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return std::vector{"lhs", "rhs"}; - }) -.set_attr("FInferShape", DotShape) -.set_attr("FInferType", ElemwiseType<2, 1>) -.set_attr("FCompute", DotForward_) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_dot"}) -.add_argument("lhs", "NDArray-or-Symbol", "The first input") -.add_argument("rhs", "NDArray-or-Symbol", "The second input") -.add_arguments(DotParam::__FIELDS__()); - -NNVM_REGISTER_OP(_backward_dot) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr_parser(ParamParser) -.set_attr("TIsBackward", true) -.set_attr("FCompute", DotBackward_) -.add_arguments(DotParam::__FIELDS__()); - -NNVM_REGISTER_OP(batch_dot) -.describe(R"doc(Batchwise dot product. - -``batch_dot`` is used to compute dot product of ``x`` and ``y`` when ``x`` and -``y`` are data in batch, namely 3D arrays in shape of `(batch_size, :, :)`. 
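Referring back to the CSR slice path registered above: a small NumPy sketch (illustrative only, not the MXNet implementation) of the indptr rebasing it performs when slicing rows [begin, end) of a CSR matrix. The concrete matrix values are made up.

import numpy as np

# CSR pieces of a 4x4 matrix: values, column indices, row pointer
data    = np.array([1., 2., 3., 4., 5.], dtype=np.float32)
indices = np.array([0,  2,  1,  3,  0 ], dtype=np.int64)
indptr  = np.array([0,  2,  3,  4,  5 ], dtype=np.int64)

begin, end = 1, 3                                # keep rows 1 and 2
out_indptr  = indptr[begin:end + 1] - indptr[begin]   # rebase by indptr[begin]
nnz         = out_indptr[-1]                          # retained non-zeros
offset      = indptr[begin]
out_indices = indices[offset:offset + nnz]
out_data    = data[offset:offset + nnz]

print(out_indptr)   # [0 1 2]
print(out_indices)  # [1 3]
print(out_data)     # [3. 4.]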
- -For example, given ``x`` with shape `(batch_size, n, m)` and ``y`` with shape -`(batch_size, m, k)`, the result array will have shape `(batch_size, n, k)`, -which is computed by:: - - batch_dot(x,y)[i,:,:] = dot(x[i,:,:], y[i,:,:]) - -)doc" ADD_FILELINE) -.set_num_inputs(2) -.set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return std::vector{"lhs", "rhs"}; - }) -.set_attr("FInferShape", BatchDotShape) -.set_attr("FInferType", ElemwiseType<2, 1>) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("FCompute", BatchDotForward_) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_batch_dot"}) -.add_argument("lhs", "NDArray-or-Symbol", "The first input") -.add_argument("rhs", "NDArray-or-Symbol", "The second input") -.add_arguments(DotParam::__FIELDS__()); - -NNVM_REGISTER_OP(_backward_batch_dot) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr_parser(ParamParser) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("TIsBackward", true) -.set_attr("FCompute", BatchDotBackward_); - NNVM_REGISTER_OP(clip) .describe(R"code(Clips (limits) the values in an array. diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index ca40419a9367..3cf2a7a753d0 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -57,18 +57,6 @@ NNVM_REGISTER_OP(slice_axis) NNVM_REGISTER_OP(_backward_slice_axis) .set_attr("FCompute", SliceAxisGrad_); -NNVM_REGISTER_OP(dot) -.set_attr("FCompute", DotForward_); - -NNVM_REGISTER_OP(_backward_dot) -.set_attr("FCompute", DotBackward_); - -NNVM_REGISTER_OP(batch_dot) -.set_attr("FCompute", BatchDotForward_); - -NNVM_REGISTER_OP(_backward_batch_dot) -.set_attr("FCompute", BatchDotBackward_); - NNVM_REGISTER_OP(clip) .set_attr("FCompute", Clip); diff --git a/src/operator/tensor/sparse_retain-inl.h b/src/operator/tensor/sparse_retain-inl.h new file mode 100644 index 000000000000..5add57c83b24 --- /dev/null +++ b/src/operator/tensor/sparse_retain-inl.h @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file sparse_retain-inl.h + * \brief +*/ +#ifndef MXNET_OPERATOR_TENSOR_SPARSE_RETAIN_INL_H_ +#define MXNET_OPERATOR_TENSOR_SPARSE_RETAIN_INL_H_ + +#include +#include +#include +#include "./init_op.h" +#include "../mshadow_op.h" +#include "../elemwise_op_common.h" +#include "../mxnet_op.h" + +namespace mxnet { +namespace op { + +/*! 
+ * \brief sparse retain namespace + */ +namespace sr { +enum SparseRetainOpInputs {kArr, kIdx}; +enum SparseRetainOpOutputs {kOut}; +} // namespace sr + +inline bool SparseRetainOpShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U) + << "sparse_retain operator takes 2 arguments (" << in_attrs->size() << " given)"; + CHECK_EQ(out_attrs->size(), 1U); + + TShape tshape((*in_attrs)[sr::kArr]); + shape_assign(&tshape, (*out_attrs)[sr::kOut]); + SHAPE_ASSIGN_CHECK(*in_attrs, sr::kArr, tshape); + SHAPE_ASSIGN_CHECK(*out_attrs, sr::kOut, tshape); + return true; +} + +inline bool SparseRetainOpType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + CHECK_NE((*in_attrs)[sr::kIdx], -1) << "Index type must be set for sparse_retain operator"; + + TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[sr::kArr]); + TYPE_ASSIGN_CHECK(*in_attrs, 0, (*out_attrs)[sr::kOut]); + return (*in_attrs)[0] != -1; +} + +inline bool SparseRetainForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + type_assign(&(in_attrs->at(sr::kArr)), kRowSparseStorage); + type_assign(&(in_attrs->at(sr::kIdx)), kDefaultStorage); + type_assign(&(out_attrs->at(sr::kOut)), kRowSparseStorage); + return true; +} + +inline bool SparseRetainBackwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 2U); + + type_assign(&(in_attrs->at(sr::kOut)), kDefaultStorage); + type_assign(&(in_attrs->at(sr::kIdx)), kDefaultStorage); + type_assign(&(out_attrs->at(sr::kArr)), kRowSparseStorage); + type_assign(&(out_attrs->at(sr::kIdx)), kDefaultStorage); + return true; +} + +/*! + * \brief Each thread searches for a user input index in the input + * row sparse ndarray alternatively. This ensures each thread + * has the almost the same workload. The overhead is the binary + * search. If all the indices of the idx array are contained + * in the in_idx, one should use SparseRetainRspRowBlockKernel instead, + * where each thread only perform binary search once. + */ +struct SparseRetainRspThreadKernel { + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, RType* out_idx, + const DType* in_data, const RType* in_idx, + const IType* idx, const size_t nnr, + const size_t row_length) { + const RType irow = idx[i]; + int j = -1, left = 0, right = nnr - 1; + while (left <= right) { + int m = left + (right - left) / 2; + const auto in_idx_m = in_idx[m]; + if (in_idx_m == irow) { + j = m; + break; + } else if (in_idx_m < irow) { + left = m + 1; + } else { + right = m - 1; + } + } + out_idx[i] = idx[i]; + if (j >= 0) { + const size_t in_offset = j * row_length; + const size_t out_offset = i * row_length; + for (size_t k = 0; k < row_length; ++k) { + out_data[out_offset+k] = in_data[in_offset+k]; + } + } + } +}; + +/*! + * \brief This kernel should be invoked when the row indices + * to be retained are all in the input rsp. + * Each thread searches for a subarray of indices of + * the user-input idx array for retain. The first index + * in the subarray will be searched for using binary search. + * The rest of the indices will be searched for starting from + * the lower bound of the binary search. 
This kernel assumes + * that idx has been sorted in ascending order. + */ +struct SparseRetainRspRowBlockKernel { + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, RType* out_idx, + const DType* in_data, const RType* in_idx, + const IType* idx, const size_t num_indices, + const size_t nnr, const size_t row_length, + const size_t seg_len) { + const size_t seg_start = i * seg_len; + if (seg_start >= num_indices) return; + const size_t seg_end = (seg_start+seg_len < num_indices? seg_start+seg_len : num_indices); + for (size_t j = seg_start; j < seg_end; ++j) { + out_idx[j] = idx[j]; + } + // use binary search to find the lower bound of idx[seg_start] in in_idx + const RType* first = in_idx; + const RType* last = in_idx + nnr; + const auto val = idx[seg_start]; + const RType* it; + int count = last - first, step; + while (count > 0) { + it = first; + step = count / 2; + it += step; + if (*it < val) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + size_t cur_row_idx = first - in_idx; + // end of binary search + if (cur_row_idx == nnr || in_idx[cur_row_idx] > idx[seg_end-1]) { + return; + } + size_t cur_idx = seg_start; + while (cur_row_idx < nnr && cur_idx < seg_end) { + if (in_idx[cur_row_idx] == idx[cur_idx]) { + const size_t in_offset = cur_row_idx * row_length; + const size_t out_offset = cur_idx * row_length; + for (size_t k = 0; k < row_length; ++k) { + out_data[out_offset+k] = in_data[in_offset+k]; + } + ++cur_row_idx; + ++cur_idx; + } else if (in_idx[cur_row_idx] < idx[cur_idx]) { + ++cur_row_idx; + } else { + ++cur_idx; + } + } + } +}; + +/*! + * Copy input indices to output indices. + * Only used when input rsp is dense. + */ +struct SparseRetainCopyIndices { + template + MSHADOW_XINLINE static void Map(int i, RType* out_idx, IType* idx) { + out_idx[i] = idx[i]; + } +}; + +/*! + * Copy input retained rows to output rows. + * Only used when input rsp is dense. + * This kernel is only used when ctx is on GPU. + * So it's parallelized by out_rows' elements, + * instead of rows. + * For CPU ctx, we simply call mshadow::Copy. 
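A compact NumPy sketch (not the actual kernels) of the retain logic described above: every requested row index is emitted, and its row of values is copied only when that index is present in the input row_sparse array; np.searchsorted stands in for the hand-written binary search. The toy values match the sparse_retain operator documentation further below.

import numpy as np

in_idx  = np.array([0, 1, 3], dtype=np.int64)                    # rows stored in the input rsp
in_data = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
idx     = np.array([0, 3], dtype=np.int64)                       # rows to retain (sorted)

row_length = in_data.shape[1]
out_idx  = idx.copy()
out_data = np.zeros((idx.size, row_length), dtype=in_data.dtype)
for i, irow in enumerate(idx):
    j = np.searchsorted(in_idx, irow)                            # binary search into in_idx
    if j < in_idx.size and in_idx[j] == irow:
        out_data[i] = in_data[j]

print(out_idx)   # [0 3]
print(out_data)  # [[1. 2.] [5. 6.]]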
+ */ +struct SparseRetainCopyRetainedRowsFromDns { + template + MSHADOW_XINLINE static void Map(int i, DType* out_rows, const DType* in_rows, + const RType* in_row_idx, const IType* idx, + const size_t row_length) { + const size_t irow = i / row_length; + const size_t icol = i % row_length; + out_rows[i] = in_rows[static_cast(idx[irow]) * row_length + icol]; + } +}; + +template +void SparseRetainOpForwardRspImpl(mshadow::Stream *s, + const NDArray& input_nd, + const TBlob& idx_data, + const OpReqType req, + NDArray* output_nd) { + if (req == kNullOp) return; + CHECK_EQ(req, kWriteTo) << "SparseRetainOpForwardRspImpl only support req = kWriteTo now"; + CHECK_EQ(input_nd.storage_type(), kRowSparseStorage) + << "SparseRetainOpForwardRspImpl operator only takes row sparse NDArray as input"; + CHECK_EQ(output_nd->storage_type(), kRowSparseStorage) + << "SparseRetainOpForwardRspImpl operator only outputs row sparse NDArray"; + + if (!input_nd.storage_initialized() + || idx_data.Size() == 0U + || input_nd.shape()[0] == 0) { + FillZerosRspImpl(s, output_nd); + return; + } + + const TBlob input_data = input_nd.data(); + const TBlob input_idx = input_nd.aux_data(rowsparse::kIdx); + + output_nd->CheckAndAlloc({mshadow::Shape1(idx_data.Size())}); + TBlob output_data = output_nd->data(); + TBlob output_idx = output_nd->aux_data(rowsparse::kIdx); + const auto row_length = input_data.shape_.ProdShape(1, input_data.shape_.ndim()); + + using namespace mxnet_op; + MSHADOW_TYPE_SWITCH(output_data.type_flag_, DType, { // output data type + Kernel::Launch(s, output_data.Size(), output_data.dptr()); + MSHADOW_IDX_TYPE_SWITCH(output_idx.type_flag_, RType, { // row index data type + MSHADOW_TYPE_SWITCH(idx_data.type_flag_, IType, { // index array data type + if (input_idx.Size() == input_nd.shape()[0]) { // input rsp is dense + using namespace mshadow; + // copy indices + Tensor output_idx_tensor = output_idx.FlatTo1D(s); + const size_t num_rows_retained = output_idx.Size(); + if (output_idx.type_flag_ == idx_data.type_flag_) { // same type, use Copy + const Tensor idx_tensor = idx_data.FlatTo1D(s); + Copy(output_idx_tensor, idx_tensor, s); + } else { // different index types, use Kernel::Launch + Kernel::Launch(s, num_rows_retained, + output_idx.dptr(), idx_data.dptr()); + } + // copy data + if (std::is_same::value) { // For cpu, we can access output_idx_tensor[i] + const Tensor input_tensor = + input_data.get_with_shape(Shape2(input_data.shape_[0], row_length), s); + Tensor output_tensor = + output_data.get_with_shape(Shape2(output_data.shape_[0], row_length), + s); + for (size_t i = 0; i < num_rows_retained; ++i) { + Copy(output_tensor[i], input_tensor[output_idx_tensor[i]], s); + } + } else { // For gpu, have to kernel launch + Kernel::Launch(s, output_data.Size(), + output_data.dptr(), input_data.dptr(), input_idx.dptr(), + idx_data.dptr(), row_length); + } + } else { // input rsp is not dense + Kernel::Launch(s, idx_data.Size(), + output_data.dptr(), output_idx.dptr(), input_data.dptr(), + input_idx.dptr(), idx_data.dptr(), input_data.shape_[0], row_length); + } + }); + }); + }); +} + +template +void SparseRetainOpForwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + if (req[sr::kOut] == kNullOp) return; + CHECK_EQ(req[sr::kOut], kWriteTo) << "sparse_retain only supports req=\'write\'"; + 
CHECK_EQ(inputs[sr::kIdx].storage_type(), kDefaultStorage) + << "sparse_retain operator only takes default NDArray as its index array"; + if (inputs[sr::kArr].storage_type() == kRowSparseStorage) { + NDArray output_nd = outputs[sr::kOut]; + SparseRetainOpForwardRspImpl(ctx.get_stream(), inputs[sr::kArr], + inputs[sr::kIdx].data(), req[sr::kOut], &output_nd); + } else { + LOG(FATAL) << "sparse_retain op only supports row-sparse ndarrays as input"; + } +} + +template +struct SparseRetainRspGradKernel { + template + MSHADOW_XINLINE static void Map(int i, DType* in_grad, RType* in_grad_idx, + const DType* out_grad, const IType* idx, + const size_t row_length) { + const RType irow = idx[i]; + in_grad_idx[i] = irow; + const size_t out_offset = irow * row_length; + const size_t in_offset = i * row_length; + for (size_t j = 0; j < row_length; ++j) { + KERNEL_ASSIGN(in_grad[in_offset+j], req, out_grad[out_offset+j]); + } + } +}; + +template +void SparseRetainOpBackwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(req.size(), 2U); + CHECK_EQ(req[sr::kIdx], kNullOp); + if (req[sr::kArr] == kNullOp) return; + CHECK_EQ(req[sr::kArr], kWriteTo); + + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 2U) + << "sparse_retain does not support calculating gradients of indices"; + + CHECK_EQ(inputs[sr::kOut].storage_type(), kDefaultStorage) + << "sparse_retain backward only takes default NDArray as ograd"; + CHECK_EQ(inputs[sr::kIdx].storage_type(), kDefaultStorage) + << "sparse_retain backward only takes default NDArray as its index array"; + CHECK_EQ(outputs[sr::kArr].storage_type(), kRowSparseStorage) + << "sparse_retain backward only outputs row sparse NDArray as grad of input"; + + using namespace mxnet_op; + using namespace mshadow; + Stream *s = ctx.get_stream(); + const TBlob idx_data = inputs[sr::kIdx].data(); + if (idx_data.Size() == 0U) { + NDArray output = outputs[sr::kArr]; + FillZerosRspImpl(s, &output); + return; + } + + const TBlob out_grad_data = inputs[sr::kOut].data(); + + NDArray in_grad_nd = outputs[sr::kArr]; + in_grad_nd.CheckAndAlloc({mshadow::Shape1(idx_data.Size())}); + TBlob in_grad_data = in_grad_nd.data(); + TBlob in_grad_idx = in_grad_nd.aux_data(rowsparse::kIdx); + const auto row_length = out_grad_data.shape_.ProdShape(1, out_grad_data.shape_.ndim()); + + MSHADOW_TYPE_SWITCH(out_grad_data.type_flag_, DType, { // output data type + MSHADOW_IDX_TYPE_SWITCH(in_grad_idx.type_flag_, RType, { // row index data type + MSHADOW_TYPE_SWITCH(idx_data.type_flag_, IType, { // index array data type + MXNET_ASSIGN_REQ_SWITCH(req[sr::kArr], req_type, { + Kernel, xpu>::Launch( + s, in_grad_idx.Size(), in_grad_data.dptr(), in_grad_idx.dptr(), + out_grad_data.dptr(), idx_data.dptr(), row_length); + }); + }); + }); + }); +} + + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_SPARSE_RETAIN_INL_H_ diff --git a/src/operator/tensor/sparse_retain.cc b/src/operator/tensor/sparse_retain.cc new file mode 100644 index 000000000000..9b5b90e46835 --- /dev/null +++ b/src/operator/tensor/sparse_retain.cc @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file sparse_retain.cc + * \brief +*/ + +#include "./sparse_retain-inl.h" +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(sparse_retain) +.describe(R"code(pick rows specified by user input index array from a row sparse matrix +and save them in the output sparse matrix. + +Example:: + + data = [[1, 2], [3, 4], [5, 6]] + indices = [0, 1, 3] + shape = (4, 2) + rsp_in = row_sparse(data, indices) + to_retain = [0, 3] + rsp_out = sparse_retain(rsp_in, to_retain) + rsp_out.values = [[1, 2], [5, 6]] + rsp_out.indices = [0, 3] + +The storage type of ``sparse_retain`` output depends on storage types of inputs + +- sparse_retain(row_sparse, default) = row_sparse +- otherwise, ``sparse_retain`` is not supported + +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "indices"}; + }) +.set_attr("FInferShape", SparseRetainOpShape) +.set_attr("FInferType", SparseRetainOpType) +.set_attr("FInferStorageType", SparseRetainForwardInferStorageType) +.set_attr("FComputeEx", SparseRetainOpForwardEx) +.set_attr("FGradient", + [](const nnvm::NodePtr& n, const std::vector& ograds) { + return MakeNonlossGradNode("_backward_sparse_retain", n, ograds, + {n->inputs[sr::kIdx]}, n->attrs.dict); + }) +.add_argument("data", "NDArray-or-Symbol", "The input array for sparse_retain operator.") +.add_argument("indices", "NDArray-or-Symbol", "The index array of rows ids that will be retained."); + +NNVM_REGISTER_OP(_backward_sparse_retain) +.set_num_inputs(2) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", SparseRetainBackwardInferStorageType) +.set_attr("FComputeEx", SparseRetainOpBackwardEx); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/sparse_retain.cu b/src/operator/tensor/sparse_retain.cu new file mode 100644 index 000000000000..f2a5b15dada4 --- /dev/null +++ b/src/operator/tensor/sparse_retain.cu @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
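A NumPy sketch of the gradient mapping computed by the backward path above (assumption: a dense ograd, as the backward checks require, with made-up values): the gradient of the data input is a row_sparse array whose stored rows are the ograd rows at the retained indices.

import numpy as np

ograd = np.array([[.1, .2], [.3, .4], [.5, .6], [.7, .8]], dtype=np.float32)  # dense, shape (4, 2)
idx   = np.array([0, 3], dtype=np.int64)                                      # retained rows

in_grad_idx  = idx.copy()          # row indices of the row_sparse gradient
in_grad_data = ograd[idx]          # rows 0 and 3 of ograd

print(in_grad_idx)    # [0 3]
print(in_grad_data)   # [[0.1 0.2] [0.7 0.8]]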
+ * \file sparse_retain.cu + * \brief +*/ + +#include "./sparse_retain-inl.h" +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(sparse_retain) +.set_attr("FComputeEx", SparseRetainOpForwardEx); + +NNVM_REGISTER_OP(_backward_sparse_retain) +.set_attr("FComputeEx", SparseRetainOpBackwardEx); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/square_sum-inl.h b/src/operator/tensor/square_sum-inl.h new file mode 100644 index 000000000000..beb77c37b8d2 --- /dev/null +++ b/src/operator/tensor/square_sum-inl.h @@ -0,0 +1,456 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file square_sum-inl.h + * \brief This is a temporary solution for fusing operators + * square and sum together as a composite op for row sparse tensors. + * The purpose for fusing square and sum for row sparse tensors + * is that the gradient of the fused operator depends on the input + * ndarray and thus its gradient is a row-sparse ndarray too. + * This fused op will become deprecated after the functionality + * of fusing operators is finished in the future. + */ + +#ifndef MXNET_OPERATOR_TENSOR_SQUARE_SUM_INL_H_ +#define MXNET_OPERATOR_TENSOR_SQUARE_SUM_INL_H_ + +#include +#include +#include +#include "../mxnet_op.h" +#include "./broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +inline bool SquareSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + const ReduceAxesParam& param = nnvm::get(attrs.parsed); + if (in_attrs->at(0) == kRowSparseStorage) { // current impl + if (param.axis[0] == 1 && param.keepdims) { // sum per row and keep dims + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kRowSparseStorage); + } else { + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage); + } + } else { // fallback + type_assign(&((*in_attrs)[0]), kDefaultStorage); + type_assign(&((*out_attrs)[0]), kDefaultStorage); + } + return true; +} + +inline bool SquareSumBackwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + const ReduceAxesParam& param = nnvm::get(attrs.parsed); + if (in_attrs->at(0) == kDefaultStorage || in_attrs->at(0) == kRowSparseStorage) { + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kRowSparseStorage); + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kRowSparseStorage); + } else { // fallback + type_assign(&((*in_attrs)[0]), kDefaultStorage); + type_assign(&((*in_attrs)[1]), kDefaultStorage); + type_assign(&((*out_attrs)[0]), kDefaultStorage); + } + return true; +} + +/*! 
+ * \brief square sum of a rsp + * if axis = -1, same as mx.nd.sum(tensor*tensor) + * if axis = 0, same as mx.nd.sum(tensor*tensor, axis=0) + * if axis = 1, same as mx.nd.sum(tensor*tensor, axis=1) + * where tensor*tensor is elemwise multiplication of two ndarrays. + */ +template +struct SquareSumRspKernel; + +/*! + * \brief square sum of a rsp on axis=0 without keeping the dim + */ +template +struct SquareSumRspKernel { + /*! + * \param j the element index in out_data and column id of in_data + */ + template + MSHADOW_XINLINE static void Map(int j, DType* out_data, const DType* in_data, + const int64_t nnr, const int64_t num_cols) { + DType sum = 0; + for (int64_t i = 0; i < nnr; ++i) { + const DType val = in_data[i*num_cols+j]; + sum += val * val; + } + KERNEL_ASSIGN(out_data[j], req, sum); + } +}; + +/*! + * \brief square sum of a rsp on axis=1 without keeping the dim + */ +template +struct SquareSumRspKernel { + /*! + * \param i the i-th non-zero row of in_data + */ + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, const IType* in_row_idx, + const DType* in_data, const int64_t num_cols) { + DType sum = 0; + const int64_t offset = i * num_cols; + for (int64_t j = 0; j < num_cols; ++j) { + const DType val = in_data[offset+j]; + sum += val * val; + } + KERNEL_ASSIGN(out_data[in_row_idx[i]], req, sum); + } +}; + +/*! + * \brief square sum of a rsp on axis=1 keeping the dim + */ +template +struct SquareSumRspKernel { + /*! + * \param i the i-th non-zero row of in_data + */ + template + MSHADOW_XINLINE static void Map(int i, IType* out_row_idx, DType* out_data, + const IType* in_row_idx, const DType* in_data, + const int64_t num_cols) { + DType sum = 0; + out_row_idx[i] = in_row_idx[i]; + const int64_t offset = i * num_cols; + for (int64_t j = 0; j < num_cols; ++j) { + const DType val = in_data[offset+j]; + sum += val * val; + } + KERNEL_ASSIGN(out_data[i], req, sum); + } +}; + +template +struct SquareSumRspGradKernel; + +template +struct SquareSumRspGradKernel { + /*! + * \param i element index in in_grad and in_data + * \param in_grad_row_idx row_idx of the gradient of the op's input + * \param in_grad gradient of the op's input + * \param out_grad gradient of the op's output + * \param in_row_idx row idx of the op's input + * \param in_data op's input + */ + template + MSHADOW_XINLINE static void Map(int i, IType* in_grad_row_idx, DType* in_grad, + const DType* out_grad, const IType* in_row_idx, + const DType* in_data, const int64_t num_cols) { + const int64_t row = i / num_cols; + in_grad_row_idx[row] = in_row_idx[row]; + KERNEL_ASSIGN(in_grad[i], req, 2*in_data[i]*out_grad[i%num_cols]); + } +}; + +template +struct SquareSumRspGradKernel { + /*! + * \param i element index in in_grad and in_data + * \param in_grad_row_idx row_idx of the gradient of the op's input + * \param in_grad gradient of the op's input + * \param out_grad gradient of the op's output + * \param in_row_idx row idx of the op's input + * \param in_data op's input + */ + template + MSHADOW_XINLINE static void Map(int i, IType* in_grad_row_idx, DType* in_grad, + const DType* out_grad, const IType* in_row_idx, + const DType* in_data, const int64_t num_cols) { + const int64_t row = i / num_cols; + in_grad_row_idx[row] = in_row_idx[row]; + KERNEL_ASSIGN(in_grad[i], req, 2*in_data[i]*out_grad[in_row_idx[row]]); + } +}; + +/*! + * Note: This kernel assumes that the ograd and in_data + * are all rsp and have equal row_idx array, or + * in_data is a full rsp. 
+ */ +template +struct SquareSumRspGradKernel { + /*! + * \param i index of igrad.data() + * \param in_grad_row_idx row_idx of the gradient of the op's input + * \param in_grad gradient of the op's input + * \param out_grad_row_idx row_idx of the gradient of the op's output + * \param out_grad gradient of the op's output + * \param in_data op's input + */ + template + MSHADOW_XINLINE static void Map(int i, IType* in_grad_row_idx, DType* in_grad, + const IType* out_grad_row_idx, const DType* out_grad, + const DType* in_data, const int64_t num_cols) { + const int64_t row = i / num_cols; + in_grad_row_idx[row] = out_grad_row_idx[row]; + KERNEL_ASSIGN(in_grad[i], req, 2*in_data[i]*out_grad[row]); + } +}; + +template +void SquareSumRspImpl(const nnvm::NodeAttrs& attrs, + mshadow::Stream* s, + const NDArray& input, + const OpReqType req, + NDArray* output) { + if (req == kNullOp) return; + const ReduceAxesParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(param.axis.ndim(), 1U) << "_square_sum(row_sparse_matrix) only supports axis=0 or 1"; + CHECK(param.axis[0] == 0 || param.axis[0] == 1) + << "_square_sum(row_sparse_matrix) only supports axis=0 or 1"; + CHECK_EQ(input.storage_type(), kRowSparseStorage) + << "_square_sum op only supports row-sparse matrix as input"; + int64_t out_data_size = 0; + if (param.axis[0] == 0) { // axis = 0 + CHECK_EQ(output->storage_type(), kDefaultStorage); + out_data_size = input.storage_shape()[1]; + } else if (param.keepdims) { // axis = 1, keepdims = true + CHECK_EQ(output->storage_type(), kRowSparseStorage); + out_data_size = input.storage_shape()[0]; + } else { // axis = 1, keepdims = false + CHECK_EQ(output->storage_type(), kDefaultStorage); + out_data_size = input.shape()[0]; + } + CHECK_NE(req, kWriteInplace); + + using namespace mxnet_op; + if (!input.storage_initialized()) { + if (req == kWriteTo) { + if (output->storage_type() == kDefaultStorage) { + MSHADOW_TYPE_SWITCH(output->data().type_flag_, DType, { + Kernel::Launch(s, out_data_size, output->data().dptr()); + }) + } else if (output->storage_type() == kRowSparseStorage) { + FillZerosRspImpl(s, output); + } else { + LOG(FATAL) << "SquareSumRspImpl only supports row-sparse/dense output storage type"; + } + } + return; + } + + if (output->storage_type() == kRowSparseStorage) { + output->CheckAndAlloc({input.aux_shape(rowsparse::kIdx)}); + } + const TBlob& out_data = output->data(); + const int64_t nnr = input.storage_shape()[0]; + const int64_t num_cols = input.storage_shape()[1]; + const TBlob& in_data = input.data(); + if (0 == param.axis[0]) { // axis = 0, output is dense + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch(s, num_cols, + out_data.dptr(), input.data().dptr(), nnr, num_cols); + }) + }) + } else { // axis = 1 + const TBlob in_row_idx = input.aux_data(rowsparse::kIdx); + if (param.keepdims) { // output is rsp + const TBlob out_row_idx = output->aux_data(rowsparse::kIdx); + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(in_row_idx.type_flag_, IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch(s, nnr, + out_row_idx.dptr(), out_data.dptr(), in_row_idx.dptr(), + in_data.dptr(), num_cols); + }) + }) + }) + } else { // output is dense + if (req == kWriteTo) { + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + Kernel::Launch(s, out_data_size, out_data.dptr()); + }) + } + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(in_row_idx.type_flag_, IType, 
{ + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch(s, nnr, + out_data.dptr(), in_row_idx.dptr(), in_data.dptr(), num_cols); + }) + }) + }) + } + } +} + +template +void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs, + mshadow::Stream* s, + const NDArray& ograd, + const NDArray& input, + const OpReqType req, + NDArray* igrad) { + if (req == kNullOp) return; + const ReduceAxesParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(param.axis.ndim(), 1U) << "_square_sum(row_sparse_matrix) only supports axis=0/1"; + CHECK(param.axis[0] == 0 || param.axis[0] == 1) + << "_square_sum(row_sparse_matrix) only supports axis=0 or 1"; + CHECK(ograd.storage_type() == kDefaultStorage || ograd.storage_type() == kRowSparseStorage); + CHECK_EQ(input.storage_type(), kRowSparseStorage); + CHECK_EQ(igrad->storage_type(), kRowSparseStorage); + CHECK_EQ(req, kWriteTo); + if (!input.storage_initialized()) { + FillZerosRspImpl(s, igrad); + return; + } + + using namespace mxnet_op; + // TODO(junwu) change the input of CheckAndAlloc + // if we want to support differen row idx arrays + // for ograd and input when they are both row-sparse ndarrays + igrad->CheckAndAlloc({input.aux_shape(rowsparse::kIdx)}); + const int64_t num_cols = input.storage_shape()[1]; + const TBlob& igrad_data = igrad->data(); + const TBlob igrad_row_idx = igrad->aux_data(rowsparse::kIdx); + const TBlob& ograd_data = ograd.data(); + const TBlob& in_data = input.data(); + const TBlob in_row_idx = input.aux_data(rowsparse::kIdx); + if (ograd.storage_type() == kDefaultStorage) { + if (0 == param.axis[0]) { // forward is sum per column + MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch( + s, igrad_data.Size(), igrad_row_idx.dptr(), + igrad_data.dptr(), ograd_data.dptr(), + in_row_idx.dptr(), in_data.dptr(), num_cols); + }) + }) + }) + } else { // forward is sum per row + MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch( + s, igrad_data.Size(), igrad_row_idx.dptr(), + igrad_data.dptr(), ograd_data.dptr(), + in_row_idx.dptr(), in_data.dptr(), num_cols); + }) + }) + }) + } + } else if (ograd.storage_type() == kRowSparseStorage) { + CHECK_EQ(1, param.axis[0]) << "SquareSumRspGradImpl only supports axis = 1" + " when ograd_stype = kRowSparseStorage"; + CHECK_EQ(ograd.shape().ndim(), 2U); + const TBlob ograd_row_idx = ograd.aux_data(rowsparse::kIdx); + CHECK(ograd_row_idx.Size() == in_row_idx.Size() || in_row_idx.Size() == in_data.shape_[0]); + MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, { + if (std::is_same::value) { + const IType* first1 = ograd_row_idx.dptr(); + const IType* last1 = first1 + ograd_row_idx.Size(); + const IType* first2 = in_row_idx.dptr(); + // when ograd_row_idx and in_row_idx have the same size and input is not a full rsp + // ograd_row_idx and in_row_idx are expected to have the same elements + if (ograd_row_idx.Size() == in_row_idx.Size() && in_row_idx.Size() != in_data.shape_[0]) { + CHECK(std::equal(first1, last1, first2)) << "SquareSumRspGradImpl only supports" + " equal ograd_row_idx and input_row_idx" + " when ograd and input are both" + " row-sparse"; + } + } else { + LOG(FATAL) << "SquareSumRspGradImpl has not implemented GPU version when" + " ograd and input are both row-sparse"; + } + MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, { + 
MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch( + s, igrad_data.Size(), igrad_row_idx.dptr(), + igrad_data.dptr(), ograd_row_idx.dptr(), + ograd_data.dptr(), in_data.dptr(), num_cols); + }) + }) + }) + } else { + LOG(FATAL) << "SquareSumRspGradImpl only supports ograd_stype" + << " = kDefaultStorage/kRowSparseStorage"; + } +} + +template +void SquareSumOpForwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + mshadow::Stream* s = ctx.get_stream(); + const NDArrayStorageType istype = inputs[0].storage_type(); + if (istype == kRowSparseStorage) { + CHECK_EQ(inputs[0].shape().ndim(), 2U) << "_square_sum op only supports" + " 2D ndarray as input"; + NDArray output = outputs[0]; + SquareSumRspImpl(attrs, s, inputs[0], req[0], &output); + } else { + LOG(FATAL) << "_square_sum op only supports row-sparse ndarray" + " as input, while input stype = " + << istype; + } +} + +template +void SquareSumOpBackwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + mshadow::Stream* s = ctx.get_stream(); + const NDArrayStorageType ograd_stype = inputs[0].storage_type(); + const NDArrayStorageType input_stype = inputs[1].storage_type(); + if (input_stype == kRowSparseStorage + && (ograd_stype == kDefaultStorage || ograd_stype == kRowSparseStorage)) { + CHECK_EQ(inputs[1].shape().ndim(), 2U) << "_square_sum op only supports" + " 2D ndarray as input"; + NDArray output = outputs[0]; + SquareSumRspGradImpl(attrs, s, inputs[0], inputs[1], req[0], &output); + } else { + LOG(FATAL) << "_square_sum op backward only supports dense ndarray as ograd," + " row-sparse ndarray as input and row-sparse ndarray as igrad," + " while ograd_stype = " << ograd_stype + << " input_stype = " << input_stype; + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_SQUARE_SUM_INL_H_ diff --git a/src/operator/tensor/square_sum.cc b/src/operator/tensor/square_sum.cc new file mode 100644 index 000000000000..e4b49d7f7fcb --- /dev/null +++ b/src/operator/tensor/square_sum.cc @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file square_sum.cc + * \brief CPU Implementation of square_sum op. + */ +#include "./square_sum-inl.h" + +namespace mxnet { +namespace op { +MXNET_OPERATOR_REGISTER_REDUCE(_square_sum) +.describe(R"code(Computes the square sum of array elements over a given axis +for row-sparse matrix. 
This is a temporary solution for fusing ops square and +sum together for row-sparse matrix to save memory for storing gradients. +It will become deprecated once the functionality of fusing operators is finished +in the future. + +Example:: + + dns = mx.nd.array([[0, 0], [1, 2], [0, 0], [3, 4], [0, 0]]) + rsp = dns.tostype('row_sparse') + sum = mx.nd._internal._square_sum(rsp, axis=1) + sum = [0, 5, 0, 25, 0] +)code" ADD_FILELINE) +.set_attr("FInferStorageType", SquareSumForwardInferStorageType) +.set_attr("FComputeEx", SquareSumOpForwardEx) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_square_sum"}); + +MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_square_sum) +.set_num_inputs(2) +.set_attr("FInferStorageType", SquareSumBackwardInferStorageType) +.set_attr("FComputeEx", SquareSumOpBackwardEx); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/util/tensor_util-inl.cuh b/src/operator/tensor/util/tensor_util-inl.cuh new file mode 100644 index 000000000000..cf268e7ae9fc --- /dev/null +++ b/src/operator/tensor/util/tensor_util-inl.cuh @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file tensor_util-inl.cuh + * \brief commonly utilized tensor operator GPU kernels + */ +#ifndef MXNET_OPERATOR_TENSOR_UTIL_TENSOR_UTIL_INL_CUH_ +#define MXNET_OPERATOR_TENSOR_UTIL_TENSOR_UTIL_INL_CUH_ + +#include +#include +#include + +namespace mxnet { +namespace op { + +/*! + * \brief Thread kernel for marking non-zero rows of a tensor. + * Parallelized by tensor rows: 1 thread/row + */ +struct MarkRspRowThreadKernel { + /*! + * \brief + * \param tid global thread id + * \param row_flg row flag array to mark non-zero rows + * \param dns dense matrix data + * \param num_rows number of rows (size of first dimension of tensor) + * \param row_length number of elements per row + */ + template + __device__ __forceinline__ static void Map(int tid, + nnvm::dim_t* row_flg, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + if (tid < num_rows) { + dim_t j = 0; + dim_t offset = tid * row_length; + for (; j < row_length; ++j) { + if (dns[offset+j] != 0) { + break; + } + } + if (j < row_length) { + row_flg[tid] = 1; // mark as one for non-zero row + } else { + row_flg[tid] = 0; // mark as zero for zero row + } + } + } +}; + +/*! + * \brief Warp kernel for marking non-zero rows of a tensor. 
+ * Parallelized by tensor rows: 1 warp/row + */ +struct MarkRspRowWarpKernel { + template + __device__ __forceinline__ static void Map(int tid, + nnvm::dim_t* row_flg, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + typedef cub::WarpReduce WarpReduce; + const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; + __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block]; + + const dim_t warp_id = tid / 32; // global warp id + const dim_t warp_lane = threadIdx.x / 32; // local warp id within thread block + const dim_t lane = tid & (32-1); // local thread id within warp + + if (warp_id < num_rows) { + dim_t flg = 0; + dim_t offset = warp_id * row_length; + for (dim_t j = lane; j < row_length; j+=32) { + if (dns[offset+j] != 0) { + // avoid break: causes slower performance on sparse tensors (<20% density), + // due to thread divergence + flg++; + } + } + dim_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(flg); + if (lane == 0) { + if (aggr > 0) { + row_flg[warp_id] = 1; // mark as one for non-zero row + } else { + row_flg[warp_id] = 0; // mark as zero for zero row + } + } + } + } +}; + +/*! + * \brief Block kernel for marking non-zero rows of a tensor. + * Parallelized by tensor rows: 1 threadBlock/row + */ +struct MarkRspRowBlockKernel { + template + __device__ __forceinline__ static void Map(int tid, + nnvm::dim_t* row_flg, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + using mshadow::cuda::kBaseThreadNum; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + if (blockIdx.x < num_rows) { + dim_t flg = 0; + dim_t offset = blockIdx.x * row_length; + for (dim_t j = threadIdx.x; j < row_length; j+=kBaseThreadNum) { + if (dns[offset+j] != 0) { + // avoid break: causes slower performance on sparse tensors (<20% density), + // due to thread divergence + flg++; + } + } + dim_t aggr = BlockReduce(temp_storage).Sum(flg); + if (threadIdx.x == 0) { + if (aggr > 0) { + row_flg[blockIdx.x] = 1; // mark as one for non-zero row + } else { + row_flg[blockIdx.x] = 0; // mark as zero for zero row + } + } + } + } +}; + +/*! + * \brief GPU kernel to flag non-zero rows of an rsp tensor with indices. + * Parallelized by matrix rows: 1 thread/row + */ +struct SetRspRowFlgKernel { + /*! + * \brief + * \param tid global thread id + * \param row_flg array to flag storage indices of non-zero rows + * \param row_idx rsp matrix row index array storing indices of non-zero rows + * \param nnr rsp matrix number of non-zero rows (storage shape) + */ + template + __device__ __forceinline__ static void Map(int tid, + RType* row_flg, + const RType* row_idx, + const nnvm::dim_t nnr) { + if (tid < nnr) { + row_flg[row_idx[tid]] = tid+1; + } + } +}; + +/*! + * \brief GPU kernel for filling the row index array of an rsp tensor. + * Parallelized by tensor rows: 1 thread/row + */ +struct FillRspRowIdxKernel { + /*! + * \brief + * \param tid global thread id + * \param row_idx row index array to store indices of non-zero rows + * \param row_flg_sum inclusive prefix sum array over 0/1 marked row flag array + * \param num_rows rsp tensor number of rows (shape) + */ + template + __device__ __forceinline__ static void Map(int tid, + RType* row_idx, + const nnvm::dim_t* row_flg_sum, + const nnvm::dim_t num_rows) { + if (tid < num_rows) { + nnvm::dim_t prev = (tid == 0)? 
0 : row_flg_sum[tid-1]; + if (row_flg_sum[tid] > prev) { + row_idx[prev] = static_cast(tid); + } + } + } +}; + +/*! + * \brief GPU kernel for marking non-zero columns of a csr matrix. + * Parallelized by matrix rows: 1 warp/row + */ +struct MarkCsrColWarpKernel { + /*! + * \brief + * \param tid global thread id + * \param flg flg array to mark non-zero columns + * \param col_idx csr matrix column indices + * \param indptr csr matrix row index pointer + * \param num_rows csr matrix number of rows + * \param num_cols csr matrix number of columns + */ + template + __device__ __forceinline__ static void Map(int tid, + nnvm::dim_t* flg, + const CType* col_idx, + const IType* indptr, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + typedef unsigned long long int uint64_cu; + static_assert(sizeof(uint64_cu) == sizeof(nnvm::dim_t), "unexpected sizeof dim_t"); + + const nnvm::dim_t warp_id = tid / 32; // global warp id + const nnvm::dim_t lane = tid & (32-1); // local thread id within warp + + if (warp_id < num_rows) { + uint64_cu zero = 0; + uint64_cu one = 1; + for (IType j = indptr[warp_id]+lane; j < indptr[warp_id+1]; j+=32) { + atomicCAS(reinterpret_cast(flg+col_idx[j]), zero, one); + } + } + } +}; + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_UTIL_TENSOR_UTIL_INL_CUH_ diff --git a/tests/ci_build/install/ubuntu_install_python.sh b/tests/ci_build/install/ubuntu_install_python.sh index bb67e3401a89..db4e9c4e0c94 100755 --- a/tests/ci_build/install/ubuntu_install_python.sh +++ b/tests/ci_build/install/ubuntu_install_python.sh @@ -24,5 +24,5 @@ apt-get update && apt-get install -y python-dev python3-dev # the version of the pip shipped with ubuntu may be too lower, install a recent version here cd /tmp && wget https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && python2 get-pip.py -pip2 install nose pylint numpy nose-timer requests h5py -pip3 install nose pylint numpy nose-timer requests h5py +pip2 install nose pylint numpy nose-timer requests h5py scipy +pip3 install nose pylint numpy nose-timer requests h5py scipy diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc index 3fef28f79a0a..cd202ace1686 100644 --- a/tests/cpp/operator/batchnorm_test.cc +++ b/tests/cpp/operator/batchnorm_test.cc @@ -19,7 +19,7 @@ /*! 
* \file batchnorm_test.cc - * \brief operator unit test utility functions + * \brief batchnorm operator unit test utility functions * \author Chris Olivier */ @@ -892,8 +892,8 @@ TEST(BATCH_NORM, TestIterAll) { kwargs.push_back({ "cudnn_off", "True" }); } for (TShape shape : shapes) { - for (int g1 = 0; g1 < 2U; ++g1) { - for (int g2 = 0; g2 < 2U; ++g2) { + for (int g1 = 0; g1 < 2; ++g1) { + for (int g2 = 0; g2 < 2; ++g2) { for (int type : v2_types) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 3fbf9f910879..af1ecfc5036f 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -22,45 +22,155 @@ sys.path.insert(0, "../../python/") import mxnet as mx import numpy as np +import numpy.random as rnd import time -def check_diff_to_scalar(A, x): +def check_diff_to_scalar(A, x, rank=None): """ assert A == x""" - assert(np.sum(np.abs((A - x).asnumpy())) == 0), A.asnumpy() + assert(np.sum(np.abs((A - x).asnumpy())) == 0), (rank, A.asnumpy(), x) # setup -keys = [3, 5, 7] +keys = ['3', '5', '7'] +rsp_keys = ['9', '11', '13'] + rate = 2 -shape = (2, 2) -big_shape = (1200, 1200) # big than BIGARRAY_BOUND +shape = (2, 3) +big_shape = (1200, 1200) # bigger than BIGARRAY_BOUND -kv = mx.kv.create('dist_sync') +def init_kv(): + kv = mx.kv.create('dist_sync') + # init kv dns keys + kv.init(keys, [mx.nd.ones(shape)] * len(keys)) + kv.init('99', mx.nd.ones(big_shape)) + # init kv row_sparse keys + kv.init(rsp_keys, [mx.nd.ones(shape).tostype('row_sparse')] * len(rsp_keys)) + kv.init('100', mx.nd.ones(big_shape).tostype('row_sparse')) + # worker info + my_rank = kv.rank + nworker = kv.num_workers + # init updater on servers + kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) + return kv, my_rank, nworker -# init kv -kv.init(keys, [mx.nd.ones(shape)] * len(keys)) -kv.init(99, mx.nd.ones(big_shape)) -# init updater on servers -kv.set_optimizer(mx.optimizer.create('test', rate)) +def test_sync_push_pull(): + kv, my_rank, nworker = init_kv() + def check_default_keys(kv, my_rank, nworker): + nrepeat = 3 + for i in range(nrepeat): + kv.push('3', mx.nd.ones(shape)*(my_rank+1)) + kv.push('99', mx.nd.ones(big_shape)*(my_rank+1)) -my_rank = kv.rank -nworker = kv.num_workers + num = (nworker + 1) * nworker * rate / 2 * nrepeat + 1 + val = mx.nd.zeros(shape) + kv.pull('3', out=val) + check_diff_to_scalar(val, num) -def test_sync_push_pull(): - nrepeat = 3 - for i in range(nrepeat): - kv.push(3, mx.nd.ones(shape)*(my_rank+1)) - kv.push(99, mx.nd.ones(big_shape)*(my_rank+1)) - - num = (nworker + 1 ) * nworker * rate / 2 * nrepeat + 1 - val = mx.nd.zeros(shape) - kv.pull(3, out = val) - check_diff_to_scalar(val, num) - # print val.asnumpy() - - val2 = mx.nd.zeros(big_shape) - kv.pull(99, out = val2) - check_diff_to_scalar(val2, num) + val2 = mx.nd.zeros(big_shape) + kv.pull('99', out=val2) + check_diff_to_scalar(val2, num) + + def check_row_sparse_keys(kv, my_rank, nworker): + nrepeat = 3 + # prepare gradient + v = mx.nd.zeros(shape) + my_row = my_rank % shape[0] + v[my_row] = my_rank + 1 + # push + for i in range(nrepeat): + kv.push('9', v.tostype('row_sparse')) + # select a random subset of rows this worker is interested in + num_rows = shape[0] + row_ids_np = np.random.randint(num_rows, size=num_rows) + row_ids = mx.nd.array(row_ids_np, dtype='int64') + # perform pull + val = mx.nd.zeros(shape, stype='row_sparse') + kv.row_sparse_pull('9', out=val, row_ids=row_ids) + # prepare 
updated values + updated_val = mx.nd.ones(shape) + for rank in range(nworker): + row = rank % shape[0] + updated_val[row] += (rank + 1) * rate * nrepeat + # verify subset of updated values + expected = mx.nd.zeros(shape) + for row in row_ids_np: + expected[row] = updated_val[row] + check_diff_to_scalar(val, expected) + + def check_row_sparse_keys_with_zeros(kv, my_rank, nworker): + nrepeat = 3 + # prepare gradient + v = mx.nd.zeros(shape) + big_v = mx.nd.zeros(big_shape) + # push + for i in range(nrepeat): + kv.push('11', v.tostype('row_sparse')) + kv.push('100', big_v.tostype('row_sparse')) + + # pull a subset of rows this worker is interested in + all_row_ids = np.arange(shape[0]) + val = mx.nd.ones(shape).tostype('row_sparse') + big_val = mx.nd.ones(big_shape).tostype('row_sparse') + kv.row_sparse_pull('11', out=val, row_ids=mx.nd.array(all_row_ids, dtype='int64')) + big_num_rows = shape[0] + big_all_row_ids = np.arange(big_shape[0]) + kv.row_sparse_pull('100', out=big_val, row_ids=mx.nd.array(big_all_row_ids, dtype='int64')) + # verify results + check_diff_to_scalar(val, mx.nd.ones(shape)) + check_diff_to_scalar(big_val, mx.nd.ones(big_shape)) + + def check_big_row_sparse_keys(kv, my_rank, nworker): + mx.random.seed(123) + rnd.seed(123) + density = 0.3 + nrepeat = 3 + # prepare gradient + v = mx.nd.zeros(big_shape) + idx_sample = rnd.rand(big_shape[0]) + indices = np.argwhere(idx_sample < density).flatten() + # each worker chooses a subset of the indices to update + update_rows = [] + for rank in range(nworker): + rows = [] + i = 0 + step = (rank + 1) * 2 + while i < len(indices): + rows.append(indices[i]) + i += step + update_rows.append(np.array(rows)) + # rows to update for this worker + for row in update_rows[my_rank]: + v[row] = my_rank + 1 + # push + for i in range(nrepeat): + kv.push('100', v.tostype('row_sparse')) + + # select a random subset of rows this worker is interested in + mx.random.seed(my_rank) + rnd.seed(my_rank) + num_rows = big_shape[0] + row_ids_np = np.random.randint(num_rows, size=num_rows) + row_ids = mx.nd.array(row_ids_np, dtype='int64') + # perform pull + val = mx.nd.zeros(big_shape, stype='row_sparse') + kv.row_sparse_pull('100', out=val, row_ids=row_ids) + # prepare expected result + updated_val = mx.nd.ones(big_shape) + # apply updates from each worker + for rank in range(nworker): + for row in update_rows[rank]: + updated_val[row] += (rank + 1) * rate * nrepeat + + expected = mx.nd.zeros(big_shape) + for row in row_ids_np: + expected[row] = updated_val[row] + check_diff_to_scalar(val, expected, rank=my_rank) + + check_default_keys(kv, my_rank, nworker) + check_row_sparse_keys(kv, my_rank, nworker) + check_row_sparse_keys_with_zeros(kv, my_rank, nworker) + check_big_row_sparse_keys(kv, my_rank, nworker) + print('worker ' + str(my_rank) + ' is done') if __name__ == "__main__": test_sync_push_pull() diff --git a/tests/python/gpu/test_kvstore_gpu.py b/tests/python/gpu/test_kvstore_gpu.py new file mode 100644 index 000000000000..6d3ba989a84f --- /dev/null +++ b/tests/python/gpu/test_kvstore_gpu.py @@ -0,0 +1,51 @@ +# pylint: skip-file +import mxnet as mx +import numpy as np +from mxnet.test_utils import assert_almost_equal, default_context + +shape = (4, 4) +keys = [5, 7, 11] +str_keys = ['b', 'c', 'd'] + + +def init_kv_with_str(stype='default'): + """init kv """ + kv = mx.kv.create() + # single + kv.init('a', mx.nd.zeros(shape, stype=stype)) + # list + kv.init(str_keys, [mx.nd.zeros(shape=shape, stype=stype)] * len(keys)) + return kv + + +def 
test_row_sparse_pull(): + kv = init_kv_with_str('row_sparse') + kv.init('e', mx.nd.ones(shape).tostype('row_sparse')) + + def check_row_sparse_pull(kv, count, ctx=default_context()): + num_rows = shape[0] + vals = [] + row_ids = [] + all_row_ids = np.arange(num_rows) + for i in range(count): + vals.append(mx.nd.zeros(shape, ctx=ctx).tostype('row_sparse')) + row_id = np.random.randint(num_rows, size=num_rows) + row_ids.append(mx.nd.array(row_id, dtype='int64')) + row_ids_to_pull = row_ids[0] if len(row_ids) == 1 else row_ids + vals_to_pull = vals[0] if len(vals) == 1 else vals + + kv.row_sparse_pull('e', out=vals_to_pull, row_ids=row_ids_to_pull) + for val, row_id in zip(vals, row_ids): + retained = val.asnumpy() + excluded_row_ids = np.setdiff1d(all_row_ids, row_id.asnumpy()) + for row in range(num_rows): + expected_val = np.zeros_like(retained[row]) + expected_val += 0 if row in excluded_row_ids else 1 + assert_almost_equal(retained[row], expected_val) + + check_row_sparse_pull(kv, 1, mx.gpu(0)) + check_row_sparse_pull(kv, 4, mx.gpu(0)) + + +if __name__ == '__main__': + test_row_sparse_pull() diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index cd8e85ac9157..a2a1fe8e06b7 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -31,6 +31,9 @@ from test_gluon import * #from test_rnn import * from test_gluon_rnn import * +from test_sparse_operator import test_cast_storage_ex, test_sparse_dot +from test_sparse_operator import test_sparse_nd_zeros, test_sparse_retain +from test_sparse_ndarray import test_create_csr, test_create_row_sparse set_default_context(mx.gpu(0)) del test_support_vector_machine_l1_svm diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py index 30dd662ff1cc..37bb5626f765 100644 --- a/tests/python/unittest/test_autograd.py +++ b/tests/python/unittest/test_autograd.py @@ -106,29 +106,41 @@ def autograd_assert(*args, **kwargs): assert same(a.asnumpy(), b.asnumpy()) def test_unary_func(): - x = nd.uniform(shape=(4, 5)) - f_exp = lambda x: nd.exp(x) - f_exp_grad = lambda x: [nd.exp(x)] - autograd_assert(x, func=f_exp, grad_func=f_exp_grad) - f_half = lambda x: x/2 - f_half_grad = lambda x: [nd.ones(x.shape) * 0.5] - autograd_assert(x, func=f_half, grad_func=f_half_grad) - f_square = lambda x: x**2 - f_square_grad = lambda x: [2*x] - autograd_assert(x, func=f_square, grad_func=f_square_grad) + def check_unary_func(x): + f_exp = lambda x: nd.exp(x) + f_exp_grad = lambda x: [nd.exp(x)] + autograd_assert(x, func=f_exp, grad_func=f_exp_grad) + f_half = lambda x: x/2 + f_half_grad = lambda x: [nd.ones(x.shape) * 0.5] + autograd_assert(x, func=f_half, grad_func=f_half_grad) + f_square = lambda x: x**2 + f_square_grad = lambda x: [2*x] + autograd_assert(x, func=f_square, grad_func=f_square_grad) + uniform = nd.uniform(shape=(4, 5)) + stypes = ['row_sparse', 'csr', 'default'] + for stype in stypes: + check_unary_func(uniform.tostype(stype)) def test_binary_func(): - x = nd.uniform(shape=(4, 5)) - y = nd.uniform(shape=(4, 5)) - f_add = lambda x, y: x+y - f_add_grad = lambda x, y: [nd.ones(x.shape), nd.ones(y.shape)] - autograd_assert(x, y, func=f_add, grad_func=f_add_grad) - f_mul = lambda x, y: x*y - f_mul_grad = lambda x, y: [y, x] - autograd_assert(x, y, func=f_mul, grad_func=f_mul_grad) - f_compose = lambda x, y: x+x*y - f_compose_grad = lambda x, y: [nd.ones(x.shape) + y, x] - autograd_assert(x, y, func=f_compose, grad_func=f_compose_grad) + def 
check_binary_func(x, y): + f_add = lambda x, y: x+y + f_add_grad = lambda x, y: [nd.ones(x.shape), nd.ones(y.shape)] + autograd_assert(x, y, func=f_add, grad_func=f_add_grad) + f_mul = lambda x, y: x*y + f_mul_grad = lambda x, y: [y, x] + autograd_assert(x, y, func=f_mul, grad_func=f_mul_grad) + f_compose = lambda x, y: x+x*y + f_compose_grad = lambda x, y: [nd.ones(x.shape) + y, x] + autograd_assert(x, y, func=f_compose, grad_func=f_compose_grad) + uniform_x = nd.uniform(shape=(4, 5)) + uniform_y = nd.uniform(shape=(4, 5)) + stypes = ['row_sparse', 'csr', 'default'] + for stype_x in stypes: + for stype_y in stypes: + x = uniform_x.tostype(stype_x) + y = uniform_y.tostype(stype_y) + check_binary_func(x, y) + def test_operator_with_state(): def f_fc(a, b, weight, bias): @@ -255,14 +267,19 @@ def test_retain_grad(): def test_attach_grad(): - x = mx.nd.zeros((10,)) - assert x.grad is None - x.attach_grad() - with record(): - y = x * 2 - assert y.grad is None - y.backward() - assert (x.grad.asnumpy() == 2).all() + def check_attach_grad(x): + assert x.grad is None + x.attach_grad() + with record(): + y = x * 2 + assert y.grad is None + y.backward() + assert (x.grad.asnumpy() == 2).all() + zeros = mx.nd.zeros((10, 10)) + stypes = ['default', 'row_sparse', 'csr'] + for stype in stypes: + x = zeros.tostype(stype) + check_attach_grad(x) def test_is_train(): diff --git a/tests/python/unittest/test_infer_shape.py b/tests/python/unittest/test_infer_shape.py index d7f52e216659..ccf7ffe897df 100644 --- a/tests/python/unittest/test_infer_shape.py +++ b/tests/python/unittest/test_infer_shape.py @@ -129,6 +129,24 @@ def test_incomplete_infer_concat(): assert arg_shapes['b'] == (2, 5) assert arg_shapes['d'] == (2, 15) +def test_fc_infer_type(): + mx_real_t = mx.base.mx_real_t + data = mx.symbol.Variable('data') + out = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=1000) + + # infer type + data_type = mx_real_t + arg_types, out_types, aux_types = out.infer_type(data=data_type) + arg_type_dict = dict(zip(out.list_arguments(), arg_types)) + assert len(out_types) == 1 + assert out_types[0] == mx_real_t + true_types = { + 'fc1_bias' : mx_real_t, + 'fc1_weight' : mx_real_t } + for k, v in true_types.items(): + assert arg_type_dict[k] == v + + if __name__ == "__main__": test_mlp2_infer_shape() test_mlp2_infer_error() diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py index c0f2acd4ed47..a543463f3663 100644 --- a/tests/python/unittest/test_io.py +++ b/tests/python/unittest/test_io.py @@ -17,6 +17,7 @@ # pylint: skip-file import mxnet as mx +from mxnet.test_utils import * import numpy as np import os, gzip import pickle as pickle @@ -152,6 +153,109 @@ def test_NDArrayIter_h5py(): else: assert(labelcount[i] == 100) +def test_NDArrayIter_csr(): + import scipy.sparse as sp + # creating toy data + num_rows = rnd.randint(5, 15) + num_cols = rnd.randint(1, 20) + batch_size = rnd.randint(1, num_rows) + shape = (num_rows, num_cols) + csr, _ = rand_sparse_ndarray(shape, 'csr') + dns = csr.asnumpy() + + # make iterators + csr_iter = iter(mx.io.NDArrayIter(csr, csr, batch_size)) + begin = 0 + for batch in csr_iter: + expected = np.zeros((batch_size, num_cols)) + end = begin + batch_size + expected[:num_rows - begin] = dns[begin:end] + if end > num_rows: + expected[num_rows - begin:] = dns[0:end - num_rows] + assert_almost_equal(batch.data[0].asnumpy(), expected) + begin += batch_size + +def test_LibSVMIter(): + def get_data(data_dir, data_name, url, data_origin_name): + if not 
os.path.isdir(data_dir): + os.system("mkdir " + data_dir) + os.chdir(data_dir) + if (not os.path.exists(data_name)): + if sys.version_info[0] >= 3: + from urllib.request import urlretrieve + else: + from urllib import urlretrieve + zippath = os.path.join(data_dir, data_origin_name) + urlretrieve(url, zippath) + import bz2 + bz_file = bz2.BZ2File(data_origin_name, 'rb') + with open(data_name, 'wb') as fout: + try: + content = bz_file.read() + fout.write(content) + finally: + bz_file.close() + os.chdir("..") + + def check_libSVMIter_synthetic(): + cwd = os.getcwd() + data_path = os.path.join(cwd, 'data.t') + label_path = os.path.join(cwd, 'label.t') + with open(data_path, 'w') as fout: + fout.write('1.0 0:0.5 2:1.2\n') + fout.write('-2.0\n') + fout.write('-3.0 0:0.6 1:2.4 2:1.2\n') + fout.write('4 2:-1.2\n') + + with open(label_path, 'w') as fout: + fout.write('1.0\n') + fout.write('-2.0 0:0.125\n') + fout.write('-3.0 2:1.2\n') + fout.write('4 1:1.0 2:-1.2\n') + + data_dir = os.path.join(cwd, 'data') + data_train = mx.io.LibSVMIter(data_libsvm=data_path, label_libsvm=label_path, + data_shape=(3, ), label_shape=(3, ), batch_size=3) + + first = mx.nd.array([[ 0.5, 0., 1.2], [ 0., 0., 0.], [ 0.6, 2.4, 1.2]]) + second = mx.nd.array([[ 0., 0., -1.2], [ 0.5, 0., 1.2], [ 0., 0., 0.]]) + i = 0 + for batch in iter(data_train): + expected = first.asnumpy() if i == 0 else second.asnumpy() + assert_almost_equal(data_train.getdata().asnumpy(), expected) + i += 1 + + def check_libSVMIter_news_data(): + news_metadata = { + 'name': 'news20.t', + 'origin_name': 'news20.t.bz2', + 'url': "http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/news20.t.bz2", + 'feature_dim': 62060, + 'num_classes': 20, + 'num_examples': 3993, + } + num_parts = 3 + batch_size = 128 + num_examples = news_metadata['num_examples'] + data_dir = os.path.join(os.getcwd(), 'data') + get_data(data_dir, news_metadata['name'], news_metadata['url'], + news_metadata['origin_name']) + path = os.path.join(data_dir, news_metadata['name']) + data_train = mx.io.LibSVMIter(data_libsvm=path, data_shape=(news_metadata['feature_dim'],), + batch_size=batch_size, num_parts=num_parts, part_index=0) + num_batches = 0 + iterator = iter(data_train) + for batch in iterator: + # check the range of labels + assert(np.sum(batch.label[0].asnumpy() > 20) == 0) + assert(np.sum(batch.label[0].asnumpy() <= 0) == 0) + num_batches += 1 + import math + expected_num_batches = math.ceil(num_examples * 1.0 / batch_size / num_parts) + assert(num_batches == int(expected_num_batches)), (num_batches, expected_num_batches) + + check_libSVMIter_synthetic() + check_libSVMIter_news_data() if __name__ == "__main__": test_NDArrayIter() @@ -159,3 +263,5 @@ def test_NDArrayIter_h5py(): test_NDArrayIter_h5py() test_MNISTIter() test_Cifar10Rec() + test_LibSVMIter() + test_NDArrayIter_csr() diff --git a/tests/python/unittest/test_kvstore.py b/tests/python/unittest/test_kvstore.py index f1e10c757fad..c517da65de92 100644 --- a/tests/python/unittest/test_kvstore.py +++ b/tests/python/unittest/test_kvstore.py @@ -18,33 +18,35 @@ # pylint: skip-file import mxnet as mx import numpy as np +from mxnet.test_utils import rand_ndarray, assert_almost_equal shape = (4, 4) keys = [5, 7, 11] str_keys = ['b', 'c', 'd'] -def init_kv(): +def init_kv(stype='default'): """init kv """ kv = mx.kv.create() # single - kv.init(3, mx.nd.zeros(shape)) + kv.init(3, mx.nd.zeros(shape=shape, stype=stype)) # list - kv.init(keys, [mx.nd.zeros(shape)] * len(keys)) + kv.init(keys, [mx.nd.zeros(shape=shape, 
stype=stype)] * len(keys)) return kv -def init_kv_with_str(): +def init_kv_with_str(stype='default'): """init kv """ kv = mx.kv.create() # single - kv.init('a', mx.nd.zeros(shape)) + kv.init('a', mx.nd.zeros(shape, stype=stype)) # list - kv.init(str_keys, [mx.nd.zeros(shape)] * len(keys)) + kv.init(str_keys, [mx.nd.zeros(shape=shape, stype=stype)] * len(keys)) return kv def check_diff_to_scalar(A, x): """ assert A == x""" assert(np.sum(np.abs((A - x).asnumpy())) == 0) + def test_single_kv_pair(): """single key-value pair push & pull""" def check_single_kv_pair(kv, key): @@ -56,6 +58,34 @@ def check_single_kv_pair(kv, key): check_single_kv_pair(init_kv(), 3) check_single_kv_pair(init_kv_with_str(), 'a') +def test_row_sparse_pull(): + kv = init_kv_with_str('row_sparse') + kv.init('e', mx.nd.ones(shape).tostype('row_sparse')) + + def check_row_sparse_pull(kv, count): + num_rows = shape[0] + vals = [] + row_ids = [] + all_row_ids = np.arange(num_rows) + for i in range(count): + vals.append(mx.nd.zeros(shape).tostype('row_sparse')) + row_id = np.random.randint(num_rows, size=num_rows) + row_ids.append(mx.nd.array(row_id, dtype='int64')) + row_ids_to_pull = row_ids[0] if len(row_ids) == 1 else row_ids + vals_to_pull = vals[0] if len(vals) == 1 else vals + + kv.row_sparse_pull('e', out=vals_to_pull, row_ids=row_ids_to_pull) + for val, row_id in zip(vals, row_ids): + retained = val.asnumpy() + excluded_row_ids = np.setdiff1d(all_row_ids, row_id.asnumpy()) + for row in range(num_rows): + expected_val = np.zeros_like(retained[row]) + expected_val += 0 if row in excluded_row_ids else 1 + assert_almost_equal(retained[row], expected_val) + + check_row_sparse_pull(kv, 1) + check_row_sparse_pull(kv, 4) + def test_init(): """test init""" def check_init(kv, key): @@ -110,10 +140,50 @@ def check_aggregator(kv, key, key_list): check_aggregator(init_kv_with_str(), 'a', str_keys) +def test_sparse_aggregator(): + """aggregate sparse ndarray on muliple devices""" + + stype = 'row_sparse' + kv = init_kv_with_str(stype) + + # devices + num_devs = 4 + devs = [mx.Context('cpu', i) for i in range(num_devs)] + + # single + vals = [rand_ndarray(shape, stype).copyto(devs[i]) for i in range(num_devs)] + expected_sum = np.zeros(shape) + for v in vals: + expected_sum += v.asnumpy() + + # prepare row_ids + all_rows = mx.nd.array(np.arange(shape[0]), dtype='int64') + kv.push('a', vals) + kv.row_sparse_pull('a', out=vals, row_ids=[all_rows] * len(vals)) + result_sum = np.zeros(shape) + for v in vals: + result_sum += v.asnumpy() + assert_almost_equal(result_sum, expected_sum * num_devs) + + # list + vals = [[rand_ndarray(shape, stype).copyto(devs[i]) for i in range(num_devs)]] * len(keys) + expected_sum = np.zeros(shape) + for v in vals[0]: + expected_sum += v.asnumpy() + + kv.push(str_keys, vals) + kv.row_sparse_pull(str_keys, out=vals, row_ids=[[all_rows] * num_devs] * len(vals)) + for vv in vals: + result_sum = np.zeros(shape) + for v in vv: + result_sum += v.asnumpy() + assert_almost_equal(result_sum, expected_sum * num_devs) + def updater(key, recv, local): """use updater: +=""" local += recv + def test_updater(dev = 'cpu'): """updater""" @@ -152,7 +222,6 @@ def check_updater(kv, key, key_list): str_kv._set_updater(updater) check_updater(str_kv, 'a', str_keys) - def test_get_type(): kvtype = 'local_allreduce_cpu' kv = mx.kv.create(kvtype) @@ -163,5 +232,7 @@ def test_get_type(): test_get_type() test_single_kv_pair() test_list_kv_pair() + test_sparse_aggregator() test_aggregator() test_updater() + test_row_sparse_pull() 
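The row_sparse_pull checks above encode the contract these kvstore changes rely on: only the rows listed in row_ids are written into the output row_sparse array, and every other row stays zero. A minimal standalone sketch of that usage, assuming a local kvstore and an illustrative key name 'w' (neither the key nor the shape below comes from this patch):

import mxnet as mx
import numpy as np

shape = (4, 4)
kv = mx.kv.create()                                     # local kvstore
kv.init('w', mx.nd.ones(shape).tostype('row_sparse'))   # 'w' is an illustrative key

row_ids = mx.nd.array([0, 2], dtype='int64')            # rows this worker needs
out = mx.nd.zeros(shape, stype='row_sparse')
kv.row_sparse_pull('w', out=out, row_ids=row_ids)

dense = out.asnumpy()
assert np.all(dense[[0, 2]] == 1)    # requested rows are populated
assert np.all(dense[[1, 3]] == 0)    # unrequested rows remain zero

row_sparse_pull is there so a worker can fetch just the rows its current batch touches instead of the whole parameter, which is what the dist_sync_kvstore.py changes above exercise with randomly chosen row ids.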
diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index f522f29dae39..9e8ace563e0d 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -17,12 +17,15 @@ import mxnet as mx import mxnet.ndarray as nd +from mxnet.test_utils import * import numpy as np from functools import reduce from mxnet.module.executor_group import DataParallelExecutorGroup from common import assertRaises from collections import namedtuple +import numpy.random as rnd + def test_module_dtype(): dtype = np.float16 @@ -345,7 +348,6 @@ def mean_abs(x): break assert(mon_result_counts == [2, 2, 1, 6, 6, 4]) - def test_executor_group(): def get_rnn_sym(num_layers, num_words, num_hidden, num_embed, seq_len): stack = mx.rnn.SequentialRNNCell() @@ -458,6 +460,108 @@ def test_shared_exec_group(exec_grp_shared, exec_grp_created, shared_arg_names=N shared_arg_names=shared_arg_names, extra_args=extra_args) +def test_factorization_machine_module(): + """ Test factorization machine model with sparse operators """ + mx.random.seed(11) + rnd.seed(11) + + def fm(factor_size, feature_dim, init): + x = mx.symbol.Variable("data", stype='csr') + v = mx.symbol.Variable("v", shape=(feature_dim, factor_size), + init=init, stype='row_sparse') + + w1_weight = mx.symbol.var('w1_weight', shape=(feature_dim, 1), + init=init, stype='row_sparse') + w1_bias = mx.symbol.var('w1_bias', shape=(1)) + w1 = mx.symbol.broadcast_add(mx.symbol.dot(x, w1_weight), w1_bias) + + v_s = mx._symbol_internal._square_sum(data=v, axis=1, keepdims=True) + x_s = mx.symbol.square(data=x) + bd_sum = mx.sym.dot(x_s, v_s) + + w2 = mx.symbol.dot(x, v) + w2_squared = 0.5 * mx.symbol.square(data=w2) + + w_all = mx.symbol.Concat(w1, w2_squared, dim=1) + sum1 = mx.symbol.sum(data=w_all, axis=1, keepdims=True) + sum2 = 0.5 * mx.symbol.negative(bd_sum) + model = mx.sym.elemwise_add(sum1, sum2) + + y = mx.symbol.Variable("label") + model = mx.symbol.LinearRegressionOutput(data=model, label=y) + return model + + # model + ctx = default_context() + init = mx.initializer.Normal(sigma=0.01) + factor_size = 4 + feature_dim = 10000 + model = fm(factor_size, feature_dim, init) + + # data iter + num_batches = 5 + batch_size = 64 + num_samples = batch_size * num_batches + import scipy.sparse as sp + # generate some random scipy csr data + csr_sp = sp.rand(num_samples, feature_dim, density=0.1, format='csr') + csr_nd = mx.nd.csr_matrix(csr_sp.data, csr_sp.indptr, csr_sp.indices, + (num_samples, feature_dim)) + label = mx.nd.ones((num_samples,1)) + # the alternative is to use LibSVMIter + train_iter = mx.io.NDArrayIter(data=csr_nd, + label={'label':label}, + batch_size=batch_size) + # create module + mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) + # allocate memory by given the input data and lable shapes + mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) + # initialize parameters by uniform random numbers + mod.init_params(initializer=init) + # use Sparse SGD with learning rate 0.1 to train + sgd = mx.optimizer.SGD(momentum=0.1, clip_gradient=5.0, learning_rate=0.01, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=sgd) + # use accuracy as the metric + metric = mx.metric.create('MSE') + # train 10 epoch + for epoch in range(10): + train_iter.reset() + metric.reset() + for batch in train_iter: + mod.forward(batch, is_train=True) # compute predictions + mod.update_metric(metric, batch.label) # accumulate prediction accuracy + 
mod.backward() # compute gradients + mod.update() # update parameters + # print('Epoch %d, Training %s' % (epoch, metric.get())) + assert(metric.get()[1] < 0.02) + + +def test_module_initializer(): + def regression_model(m): + x = mx.symbol.var("data", stype='csr') + v = mx.symbol.var("v", shape=(m, 1), init=mx.init.Uniform(scale=.1), + stype='row_sparse') + model = mx.symbol.dot(lhs=x, rhs=v) + y = mx.symbol.Variable("label") + model = mx.symbol.LinearRegressionOutput(data=model, label=y, name="out") + return model + + n, m = 128, 100 + model = regression_model(m) + + data = mx.nd.zeros(shape=(n, m), stype='csr') + label = mx.nd.zeros((n, 1)) + iterator = mx.io.NDArrayIter(data=data, label={'label':label}, batch_size=n) + + # create module + mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) + mod.bind(data_shapes=iterator.provide_data, label_shapes=iterator.provide_label) + mod.init_params() + v = mod._arg_params['v'] + assert(v.stype == 'row_sparse') + assert(np.sum(v.asnumpy()) != 0) + def test_forward_reshape(): num_class=10 data1 = mx.sym.Variable('data1') diff --git a/tests/python/unittest/test_multi_device_exec.py b/tests/python/unittest/test_multi_device_exec.py index 6f8eb17ff34e..0a2739d9bb4e 100644 --- a/tests/python/unittest/test_multi_device_exec.py +++ b/tests/python/unittest/test_multi_device_exec.py @@ -16,6 +16,7 @@ # under the License. import os +import numpy as np import mxnet as mx def test_ctx_group(): @@ -49,5 +50,31 @@ def test_ctx_group(): else: assert arr.context == group2ctx['stage2'] +def test_ctx_group_sparse(): + with mx.AttrScope(ctx_group='stage1'): + lhs = mx.symbol.Variable('lhs', stype='csr') + rhs = mx.symbol.Variable('rhs', stype='row_sparse') + dot = mx.symbol.dot(lhs, rhs, name='dot') + + set_stage1 = set(dot.list_arguments()) + with mx.AttrScope(ctx_group='stage2'): + softmax = mx.symbol.SoftmaxOutput(data = dot, name = 'softmax') + + set_stage2 = set(softmax.list_arguments()) - set_stage1 + + group2ctx = { + 'stage1' : mx.cpu(1), + 'stage2' : mx.cpu(2) + } + texec = softmax.simple_bind(mx.cpu(0), group2ctx=group2ctx, + lhs=(32,200), rhs=(200, 5)) + + for arr, name in zip(texec.arg_arrays, softmax.list_arguments()): + if name in set_stage1: + assert arr.context == group2ctx['stage1'] + else: + assert arr.context == group2ctx['stage2'] + if __name__ == '__main__': test_ctx_group() + test_ctx_group_sparse() diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index eae364eeaecf..3e0ac66c168d 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -373,6 +373,7 @@ def test_dot(): assert_almost_equal(c, C.asnumpy()) + def test_reduce(): sample_num = 200 def test_reduce_inner(numpy_reduce_func, nd_reduce_func, multi_axes): diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 7d56b46e21a0..f27204b119bd 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -855,75 +855,88 @@ def test_nearest_upsampling(): check_nearest_upsampling_with_shape(shapes, scale, root_scale) def test_batchnorm_training(): - for shape in [(2, 3), (2, 3, 2, 2)]: - data_tmp = np.random.normal(-0.1, 0.1, size=shape) - s = shape[1], - gamma = np.ones(s) - beta = np.ones(s) - gamma[1] = 3 - beta[0] = 3 + def check_batchnorm_training(stype): + for shape in [(2, 3), (2, 3, 2, 2)]: + data_tmp = np.random.normal(-0.1, 0.1, size=shape) + s = shape[1], + gamma = np.ones(s) + beta = np.ones(s) + 
gamma[1] = 3 + beta[0] = 3 - rolling_mean = np.random.uniform(size=s) - rolling_std = np.random.uniform(size=s) + rolling_mean = np.random.uniform(size=s) + rolling_std = np.random.uniform(size=s) - data = mx.symbol.Variable('data') + data = mx.symbol.Variable('data', stype=stype) + in_location = [mx.nd.array(data_tmp).tostype(stype), mx.nd.array(gamma).tostype(stype), + mx.nd.array(beta).tostype(stype)] + mean_std = [mx.nd.array(rolling_mean).tostype(stype), mx.nd.array(rolling_std).tostype(stype)] - test = mx.symbol.BatchNorm_v1(data, fix_gamma=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm_v1(data, fix_gamma=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm(data, fix_gamma=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm(data, fix_gamma=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm_v1(data, fix_gamma=True, use_global_stats=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm_v1(data, fix_gamma=True, use_global_stats=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm_v1(data, fix_gamma=False) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm_v1(data, fix_gamma=False) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm(data, fix_gamma=False) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm(data, fix_gamma=False) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm_v1(data, fix_gamma=False, use_global_stats=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm_v1(data, fix_gamma=False, use_global_stats=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - # Test varying channel axis - dim = len(shape) - for chaxis in range(-dim, dim): - chaxis_true = chaxis - if chaxis < 0: - chaxis_true = dim + chaxis + # Test varying channel axis + dim = len(shape) + for chaxis in range(-dim, dim): + chaxis_true = chaxis + if chaxis < 0: + chaxis_true = dim + chaxis - shapex = shape + shapex = shape - channel_count = shapex[chaxis_true] - data_tmp = np.random.normal(-0.1, 0.1, size=shapex) + 
channel_count = shapex[chaxis_true] + data_tmp = np.random.normal(-0.1, 0.1, size=shapex) - gamma = np.ones(channel_count) - beta = np.ones(channel_count) - if channel_count > 1: - gamma[1] = 3 - beta[0] = 3 + gamma = np.ones(channel_count) + beta = np.ones(channel_count) + if channel_count > 1: + gamma[1] = 3 + beta[0] = 3 + + in_location = [mx.nd.array(data_tmp).tostype(stype), mx.nd.array(gamma).tostype(stype), + mx.nd.array(beta).tostype(stype)] + + xrolling_mean = np.random.uniform(size=channel_count) + xrolling_std = np.random.uniform(size=channel_count) + xmean_std = [mx.nd.array(xrolling_mean).tostype(stype), + mx.nd.array(xrolling_std).tostype(stype)] - xrolling_mean = np.random.uniform(size=channel_count) - xrolling_std = np.random.uniform(size=channel_count) + test = mx.symbol.BatchNorm(data, fix_gamma=True, axis=chaxis) + check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01) - test = mx.symbol.BatchNorm(data, fix_gamma=True, axis=chaxis) - check_numeric_gradient(test, [data_tmp, gamma, beta], [xrolling_mean, xrolling_std], numeric_eps=1e-2, rtol=0.2, atol=0.01) + test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True, axis=chaxis) + check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01) - test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True, axis=chaxis) - check_numeric_gradient(test, [data_tmp, gamma, beta], [xrolling_mean, xrolling_std], numeric_eps=1e-2, rtol=0.2, atol=0.01) + test = mx.symbol.BatchNorm(data, fix_gamma=False, axis=chaxis) + check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01) - test = mx.symbol.BatchNorm(data, fix_gamma=False, axis=chaxis) - check_numeric_gradient(test, [data_tmp, gamma, beta], [xrolling_mean, xrolling_std], numeric_eps=1e-2, rtol=0.2, atol=0.01) + test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True, axis=chaxis) + check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01) - test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True, axis=chaxis) - check_numeric_gradient(test, [data_tmp, gamma, beta], [xrolling_mean, xrolling_std], numeric_eps=1e-2, rtol=0.2, atol=0.01) + stypes = ['row_sparse', 'default'] + for stype in stypes: + check_batchnorm_training(stype) def test_convolution_grouping(): num_filter = 4 diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 3b3b92b372d8..055f6464f0ef 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -47,26 +47,43 @@ def test_lr_wd_mult(): assert not mx.test_utils.almost_equal(args1['fc2_weight'], args2['fc2_weight'], 1e-1) -def compare_optimizer(opt1, opt2, shape, dtype): - w1 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - g1 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - - w2 = w1.copyto(default_context()) - g2 = g1.copyto(default_context()) +def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='default'): + if w_stype == 'default': + w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + w1 = w2.copyto(default_context()) + elif w_stype == 'row_sparse' or w_stype == 'csr': + w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) + w1 = w2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + if g_stype == 'default': + g2 = mx.random.uniform(shape=shape, 
ctx=default_context(), dtype=dtype) + g1 = g2.copyto(default_context()) + elif g_stype == 'row_sparse' or g_stype == 'csr': + g2 = rand_ndarray(shape, g_stype, dtype=dtype) + g1 = g2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") state1 = opt1.create_state(0, w1) state2 = opt2.create_state(0, w2) if state1 is not None and state2 is not None: - for s1, s2, in zip(state1, state2): - if s1 is not None or s2 is not None: - assert(same(s1.asnumpy(), s2.asnumpy())) + if isinstance(state1, tuple): + for s1, s2, in zip(state1, state2): + if s1 is not None or s2 is not None: + assert(same(s1.asnumpy(), s2.asnumpy())) + else: + assert_almost_equal(state1.asnumpy(), state2.asnumpy()) opt1.update(0, w1, g1, state1) opt2.update(0, w2, g2, state2) if state1 is not None and state2 is not None: - for s1, s2, in zip(state1, state2): - if s1 is not None or s2 is not None: - assert_almost_equal(s1.asnumpy(), s2.asnumpy(), rtol=1e-4, atol=1e-5) + if isinstance(state1, tuple): + for s1, s2, in zip(state1, state2): + if s1 is not None or s2 is not None: + assert_almost_equal(s1.asnumpy(), s2.asnumpy(), rtol=1e-4, atol=1e-5) + else: + assert_almost_equal(state1.asnumpy(), state2.asnumpy()) assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=1e-4, atol=1e-5) # SGD @@ -186,18 +203,122 @@ def test_sgd(): not kwarg['multi_precision'])): continue compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) + # test operator fallback on cpu + if (default_context() == mx.cpu()): + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, + g_stype='row_sparse') + if dtype != np.float16: + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape[:2], + dtype, w_stype='csr', g_stype='csr') + +class PySparseSGD(mx.optimizer.Optimizer): + """python reference implemenation of sgd""" + def __init__(self, learning_rate=0.01, momentum=0.0, **kwargs): + super(PySparseSGD, self).__init__(learning_rate=learning_rate, **kwargs) + self.momentum = momentum + + def create_state(self, index, weight): + """Create additional optimizer state: momentum + + Parameters + ---------- + weight : NDArray + The weight data + + """ + if self.momentum == 0.0: + return None + else: + return mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) + + def update(self, index, weight, grad, state): + """Update the parameters. + + Parameters + ---------- + index : int + An unique integer key used to index the parameters + + weight : NDArray + weight ndarray + + grad : NDArray + grad ndarray + + state : NDArray or other objects returned by init_state + The auxiliary state used in optimization. 
+ """ + lr = self._get_lr(index) + wd = self._get_wd(index) + self._update_count(index) + num_rows = weight.shape[0] + if self.momentum == 0.0: + # Update on a per row basis, skip all-zero rows + for row in range(num_rows): + grad_row = grad[row].asnumpy() + all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) + if all_zeros: + continue + if self.clip_gradient is not None: + weight[row] = ((1 - lr*wd)*weight[row] - + lr*mx.nd.clip(grad[row]*self.rescale_grad, + -self.clip_gradient, self.clip_gradient)) + else: + weight[row] = (1 - lr*wd)*weight[row] - lr*self.rescale_grad*grad[row] + else: + mom = state + for row in range(num_rows): + grad_row = grad[row].asnumpy() + all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) + if all_zeros: + continue + if self.clip_gradient is not None: + mom[row] = (self.momentum*mom[row] - lr*wd*weight[row] - + lr*mx.nd.clip(grad[row]*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) + weight[row] += mom[row] + else: + mom[row] = self.momentum*mom[row] - lr*wd*weight[row] - lr*self.rescale_grad*grad[row] + weight[row] += mom[row] + +def test_sparse_sgd(): + mx.random.seed(0) + opt1 = PySparseSGD + opt2 = mx.optimizer.SGD + shape = (3, 4, 5) + mom_options = [{}, {'momentum': 0.9}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}] + for dtype in [np.float32]: + for mom_option in mom_options: + for cg_option in cg_options: + for rg_option in rg_options: + for wd_option in wd_options: + for mp_option in mp_options: + kwarg = {} + kwarg.update(mom_option) + kwarg.update(cg_option) + kwarg.update(rg_option) + kwarg.update(wd_option) + kwarg.update(mp_option) + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, + w_stype='row_sparse', g_stype='row_sparse') + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, + w_stype='row_sparse', g_stype='default') # ADAM class PyAdam(mx.optimizer.Optimizer): """python reference implemenation of adam""" def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - decay_factor=(1 - 1e-8), **kwargs): + decay_factor=(1 - 1e-8), sparse_update=False, **kwargs): super(PyAdam, self).__init__(learning_rate=learning_rate, **kwargs) self.beta1 = beta1 self.beta2 = beta2 self.epsilon = epsilon self.decay_factor = decay_factor + self.sparse_update = sparse_update def create_state(self, index, weight): """Create additional optimizer state: mean, variance @@ -235,21 +356,28 @@ def update(self, index, weight, grad, state): mean, variance = state wd = self._get_wd(index) - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient, out=grad) - - mean *= self.beta1 - mean += grad * (1. - self.beta1) - - variance *= self.beta2 - variance += (1 - self.beta2) * mx.nd.square(grad, out=grad) - + num_rows = weight.shape[0] coef1 = 1. - self.beta1**t coef2 = 1. 
- self.beta2**t lr *= math.sqrt(coef2)/coef1 - - weight -= lr*mean/(mx.nd.sqrt(variance) + self.epsilon) + for row in range(num_rows): + # check row slices of all zeros + all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) + # skip zeros during sparse update + if all_zeros and self.sparse_update: + continue + grad[row] = grad[row] * self.rescale_grad + wd * weight[row] + # clip gradients + if self.clip_gradient is not None: + mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + # update mean + mean[row] *= self.beta1 + mean[row] += grad[row] * (1. - self.beta1) + # update variance + variance[row] *= self.beta2 + variance[row] += (1 - self.beta2) * mx.nd.square(grad[row], out=grad[row]) + # update weight + weight[row] -= lr*mean[row]/(mx.nd.sqrt(variance[row]) + self.epsilon) def test_adam(): @@ -266,6 +394,8 @@ def test_adam(): {'rescale_grad': 0.8, 'wd': 0.05}] for kwarg in kwargs: compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) + compare_optimizer(opt1(sparse_update=True, **kwarg), opt2(**kwarg), shape, + np.float32, w_stype='row_sparse', g_stype='row_sparse') # RMSProp class PyRMSProp(mx.optimizer.Optimizer): @@ -406,8 +536,10 @@ def test_rms(): {'rescale_grad': 0.8, 'wd': 0.05, 'centered': True, 'clip_weights': 0.01}] for kwarg in kwargs: compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32, g_stype='row_sparse') if __name__ == '__main__': test_adam() test_rms() test_sgd() + test_sparse_sgd() diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py new file mode 100644 index 000000000000..1849bf7107e4 --- /dev/null +++ b/tests/python/unittest/test_sparse_ndarray.py @@ -0,0 +1,523 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
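+
+# Unit tests for the sparse NDArray frontend (RowSparseNDArray / CSRNDArray):
+# creation, storage-type conversion, elementwise and comparison operators,
+# slicing, pickling/save-load, random sampling and dense output fallback.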
+ +import pickle as pkl + +from mxnet.ndarray import NDArray +from mxnet.test_utils import * +from numpy.testing import assert_allclose +import numpy.random as rnd + +from mxnet.ndarray import RowSparseNDArray, CSRNDArray + + +def assert_fcompex(f, *args, **kwargs): + prev_val = mx.test_utils.set_env_var("MXNET_EXEC_STORAGE_FALLBACK", "0", "1") + f(*args, **kwargs) + mx.test_utils.set_env_var("MXNET_EXEC_STORAGE_FALLBACK", prev_val) + + +def sparse_nd_ones(shape, stype): + return mx.nd.ones(shape).tostype(stype) + + +def check_sparse_nd_elemwise_binary(shapes, stypes, f, g): + # generate inputs + nds = [] + for i, stype in enumerate(stypes): + if stype == 'row_sparse': + nd, _ = rand_sparse_ndarray(shapes[i], stype) + elif stype == 'default': + nd = mx.nd.array(random_arrays(shapes[i]), dtype = np.float32) + else: + assert(False) + nds.append(nd) + # check result + test = f(nds[0], nds[1]) + assert_almost_equal(test.asnumpy(), g(nds[0].asnumpy(), nds[1].asnumpy())) + + +def test_sparse_nd_elemwise_add(): + num_repeats = 10 + g = lambda x,y: x + y + op = mx.nd.elemwise_add + for i in range(num_repeats): + shape = [rand_shape_2d()] * 2 + assert_fcompex(check_sparse_nd_elemwise_binary, + shape, ['default'] * 2, op, g) + assert_fcompex(check_sparse_nd_elemwise_binary, + shape, ['default', 'row_sparse'], op, g) + assert_fcompex(check_sparse_nd_elemwise_binary, + shape, ['row_sparse', 'row_sparse'], op, g) + + +def test_sparse_nd_copy(): + def check_sparse_nd_copy(from_stype, to_stype, shape): + from_nd = rand_ndarray(shape, from_stype) + # copy to ctx + to_ctx = from_nd.copyto(default_context()) + # copy to stype + to_nd = rand_ndarray(shape, to_stype) + to_nd = from_nd.copyto(to_nd) + assert np.sum(np.abs(from_nd.asnumpy() != to_ctx.asnumpy())) == 0.0 + assert np.sum(np.abs(from_nd.asnumpy() != to_nd.asnumpy())) == 0.0 + + shape = rand_shape_2d() + shape_3d = rand_shape_3d() + stypes = ['row_sparse', 'csr'] + for stype in stypes: + check_sparse_nd_copy(stype, 'default', shape) + check_sparse_nd_copy('default', stype, shape) + check_sparse_nd_copy('row_sparse', 'row_sparse', shape_3d) + check_sparse_nd_copy('row_sparse', 'default', shape_3d) + check_sparse_nd_copy('default', 'row_sparse', shape_3d) + +def test_sparse_nd_basic(): + def check_sparse_nd_basic_rsp(): + storage_type = 'row_sparse' + shape = rand_shape_2d() + nd, (v, idx) = rand_sparse_ndarray(shape, storage_type) + assert(nd._num_aux == 1) + assert(nd.indices.dtype == np.int64) + assert(nd.stype == 'row_sparse') + + check_sparse_nd_basic_rsp() + + +def test_sparse_nd_setitem(): + def check_sparse_nd_setitem(stype, shape, dst): + x = mx.nd.zeros(shape=shape, stype=stype) + x[:] = dst + dst_nd = mx.nd.array(dst) if isinstance(dst, (np.ndarray, np.generic)) else dst + assert same(x.asnumpy(), dst_nd.asnumpy()) + + shape = rand_shape_2d() + for stype in ['row_sparse', 'csr']: + # ndarray assignment + check_sparse_nd_setitem(stype, shape, rand_ndarray(shape, 'default')) + check_sparse_nd_setitem(stype, shape, rand_ndarray(shape, stype)) + # numpy assignment + check_sparse_nd_setitem(stype, shape, np.ones(shape)) + + +def test_sparse_nd_slice(): + def check_sparse_nd_csr_slice(shape): + stype = 'csr' + A, _ = rand_sparse_ndarray(shape, stype) + A2 = A.asnumpy() + start = rnd.randint(0, shape[0] - 1) + end = rnd.randint(start + 1, shape[0]) + assert same(A[start:end].asnumpy(), A2[start:end]) + assert same(A[start:].asnumpy(), A2[start:]) + assert same(A[:end].asnumpy(), A2[:end]) + + shape = (rnd.randint(2, 10), rnd.randint(1, 10)) + 
check_sparse_nd_csr_slice(shape) + + +def test_sparse_nd_equal(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = x == y + assert (z.asnumpy() == np.zeros(shape)).all() + z = 0 == x + assert (z.asnumpy() == np.ones(shape)).all() + + +def test_sparse_nd_not_equal(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = x != y + assert (z.asnumpy() == np.ones(shape)).all() + z = 0 != x + assert (z.asnumpy() == np.zeros(shape)).all() + + +def test_sparse_nd_greater(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = x > y + assert (z.asnumpy() == np.zeros(shape)).all() + z = y > 0 + assert (z.asnumpy() == np.ones(shape)).all() + z = 0 > y + assert (z.asnumpy() == np.zeros(shape)).all() + + +def test_sparse_nd_greater_equal(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = x >= y + assert (z.asnumpy() == np.zeros(shape)).all() + z = y >= 0 + assert (z.asnumpy() == np.ones(shape)).all() + z = 0 >= y + assert (z.asnumpy() == np.zeros(shape)).all() + z = y >= 1 + assert (z.asnumpy() == np.ones(shape)).all() + + +def test_sparse_nd_lesser(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = y < x + assert (z.asnumpy() == np.zeros(shape)).all() + z = 0 < y + assert (z.asnumpy() == np.ones(shape)).all() + z = y < 0 + assert (z.asnumpy() == np.zeros(shape)).all() + + +def test_sparse_nd_lesser_equal(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = y <= x + assert (z.asnumpy() == np.zeros(shape)).all() + z = 0 <= y + assert (z.asnumpy() == np.ones(shape)).all() + z = y <= 0 + assert (z.asnumpy() == np.zeros(shape)).all() + z = 1 <= y + assert (z.asnumpy() == np.ones(shape)).all() + + +def test_sparse_nd_binary(): + N = 10 + def check_binary(fn, stype): + for _ in range(N): + ndim = 2 + oshape = np.random.randint(1, 6, size=(ndim,)) + bdim = 2 + lshape = list(oshape) + rshape = list(oshape[ndim-bdim:]) + for i in range(bdim): + sep = np.random.uniform(0, 1) + if sep < 0.33: + lshape[ndim-i-1] = 1 + elif sep < 0.66: + rshape[bdim-i-1] = 1 + lhs = np.random.uniform(0, 1, size=lshape) + rhs = np.random.uniform(0, 1, size=rshape) + lhs_nd = mx.nd.array(lhs).tostype(stype) + rhs_nd = mx.nd.array(rhs).tostype(stype) + assert_allclose(fn(lhs, rhs), fn(lhs_nd, rhs_nd).asnumpy(), rtol=1e-4, atol=1e-4) + + stypes = ['row_sparse', 'csr'] + for stype in stypes: + check_binary(lambda x, y: x + y, stype) + check_binary(lambda x, y: x - y, stype) + check_binary(lambda x, y: x * y, stype) + check_binary(lambda x, y: x / y, stype) + check_binary(lambda x, y: x ** y, stype) + check_binary(lambda x, y: x > y, stype) + check_binary(lambda x, y: x < y, stype) + check_binary(lambda x, y: x >= y, stype) + check_binary(lambda x, y: x <= y, stype) + check_binary(lambda x, y: x == y, stype) + + +def test_sparse_nd_binary_rop(): + N = 10 + def check(fn, stype): + for _ in range(N): + ndim = 2 + shape = np.random.randint(1, 6, size=(ndim,)) + npy = np.random.normal(0, 1, size=shape) + nd = mx.nd.array(npy).tostype(stype) + 
assert_allclose(fn(npy), fn(nd).asnumpy(), rtol=1e-4, atol=1e-4) + + stypes = ['row_sparse', 'csr'] + for stype in stypes: + check(lambda x: 1 + x, stype) + check(lambda x: 1 - x, stype) + check(lambda x: 1 * x, stype) + check(lambda x: 1 / x, stype) + check(lambda x: 2 ** x, stype) + check(lambda x: 1 > x, stype) + check(lambda x: 0.5 > x, stype) + check(lambda x: 0.5 < x, stype) + check(lambda x: 0.5 >= x, stype) + check(lambda x: 0.5 <= x, stype) + check(lambda x: 0.5 == x, stype) + +def test_sparse_nd_binary_iop(): + N = 10 + def check_binary(fn, stype): + for _ in range(N): + ndim = 2 + oshape = np.random.randint(1, 6, size=(ndim,)) + lshape = list(oshape) + rshape = list(oshape) + lhs = np.random.uniform(0, 1, size=lshape) + rhs = np.random.uniform(0, 1, size=rshape) + lhs_nd = mx.nd.array(lhs).tostype(stype) + rhs_nd = mx.nd.array(rhs).tostype(stype) + assert_allclose(fn(lhs, rhs), + fn(lhs_nd, rhs_nd).asnumpy(), + rtol=1e-4, atol=1e-4) + + def inplace_add(x, y): + x += y + return x + def inplace_mul(x, y): + x *= y + return x + stypes = ['csr', 'row_sparse'] + fns = [inplace_add, inplace_mul] + for stype in stypes: + for fn in fns: + check_binary(fn, stype) + +def test_sparse_nd_negate(): + def check_sparse_nd_negate(shape, stype): + npy = np.random.uniform(-10, 10, rand_shape_2d()) + arr = mx.nd.array(npy).tostype(stype) + assert_almost_equal(npy, arr.asnumpy()) + assert_almost_equal(-npy, (-arr).asnumpy()) + + # a final check to make sure the negation (-) is not implemented + # as inplace operation, so the contents of arr does not change after + # we compute (-arr) + assert_almost_equal(npy, arr.asnumpy()) + + shape = rand_shape_2d() + stypes = ['csr', 'row_sparse'] + for stype in stypes: + check_sparse_nd_negate(shape, stype) + +def test_sparse_nd_broadcast(): + sample_num = 1000 + # TODO(haibin) test with more than 2 dimensions + def test_broadcast_to(stype): + for i in range(sample_num): + ndim = 2 + target_shape = np.random.randint(1, 11, size=ndim) + shape = target_shape.copy() + axis_flags = np.random.randint(0, 2, size=ndim) + axes = [] + for (axis, flag) in enumerate(axis_flags): + if flag: + shape[axis] = 1 + dat = np.random.rand(*shape) - 0.5 + numpy_ret = dat + ndarray = mx.nd.array(dat).tostype(stype) + ndarray_ret = ndarray.broadcast_to(shape=target_shape) + if type(ndarray_ret) is mx.ndarray.NDArray: + ndarray_ret = ndarray_ret.asnumpy() + assert (ndarray_ret.shape == target_shape).all() + err = np.square(ndarray_ret - numpy_ret).mean() + assert err < 1E-8 + stypes = ['csr', 'row_sparse'] + for stype in stypes: + test_broadcast_to(stype) + + +def test_sparse_nd_transpose(): + npy = np.random.uniform(-10, 10, rand_shape_2d()) + stypes = ['csr', 'row_sparse'] + for stype in stypes: + nd = mx.nd.array(npy).tostype(stype) + assert_almost_equal(npy.T, (nd.T).asnumpy()) + +def test_sparse_nd_output_fallback(): + shape = (10, 10) + out = mx.nd.zeros(shape=shape, stype='row_sparse') + mx.nd.random_normal(shape=shape, out=out) + assert(np.sum(out.asnumpy()) != 0) + +def test_sparse_nd_random(): + """ test sparse random operator on cpu """ + # gpu random operator doesn't use fixed seed + if default_context().device_type is 'gpu': + return + shape = (100, 100) + fns = [mx.nd.random_uniform, mx.nd.random_normal, mx.nd.random_gamma] + for fn in fns: + rsp_out = mx.nd.zeros(shape=shape, stype='row_sparse') + dns_out = mx.nd.zeros(shape=shape, stype='default') + mx.random.seed(0) + np.random.seed(0) + fn(shape=shape, out=dns_out) + mx.random.seed(0) + np.random.seed(0) + 
fn(shape=shape, out=rsp_out) + assert_almost_equal(dns_out.asnumpy(), rsp_out.asnumpy()) + + +def test_sparse_nd_astype(): + stypes = ['row_sparse', 'csr'] + for stype in stypes: + x = mx.nd.zeros(shape=rand_shape_2d(), stype=stype, dtype='float32') + y = x.astype('int32') + assert(y.dtype == np.int32), y.dtype + + +def test_sparse_nd_pickle(): + np.random.seed(0) + repeat = 10 + dim0 = 40 + dim1 = 40 + stypes = ['row_sparse', 'csr'] + densities = [0, 0.01, 0.1, 0.2, 0.5] + stype_dict = {'row_sparse': RowSparseNDArray, 'csr': CSRNDArray} + for _ in range(repeat): + shape = rand_shape_2d(dim0, dim1) + for stype in stypes: + for density in densities: + a, _ = rand_sparse_ndarray(shape, stype, density) + assert isinstance(a, stype_dict[stype]) + data = pkl.dumps(a) + b = pkl.loads(data) + assert isinstance(b, stype_dict[stype]) + assert same(a.asnumpy(), b.asnumpy()) + + +def test_sparse_nd_save_load(): + np.random.seed(0) + repeat = 1 + stypes = ['default', 'row_sparse', 'csr'] + stype_dict = {'default': NDArray, 'row_sparse': RowSparseNDArray, 'csr': CSRNDArray} + num_data = 20 + densities = [0, 0.01, 0.1, 0.2, 0.5] + fname = 'tmp_list.bin' + for _ in range(repeat): + data_list1 = [] + for i in range(num_data): + stype = stypes[np.random.randint(0, len(stypes))] + shape = rand_shape_2d(dim0=40, dim1=40) + density = densities[np.random.randint(0, len(densities))] + data_list1.append(rand_ndarray(shape, stype, density)) + assert isinstance(data_list1[-1], stype_dict[stype]) + mx.nd.save(fname, data_list1) + + data_list2 = mx.nd.load(fname) + assert len(data_list1) == len(data_list2) + for x, y in zip(data_list1, data_list2): + assert same(x.asnumpy(), y.asnumpy()) + + data_map1 = {'ndarray xx %s' % i: x for i, x in enumerate(data_list1)} + mx.nd.save(fname, data_map1) + data_map2 = mx.nd.load(fname) + assert len(data_map1) == len(data_map2) + for k, x in data_map1.items(): + y = data_map2[k] + assert same(x.asnumpy(), y.asnumpy()) + os.remove(fname) + +def test_sparse_nd_unsupported(): + nd = mx.nd.zeros((2,2), stype='row_sparse') + fn_slice = lambda x: x._slice(None, None) + fn_at = lambda x: x._at(None) + fn_reshape = lambda x: x.reshape(None) + fns = [fn_slice, fn_at, fn_reshape] + for fn in fns: + try: + fn(nd) + assert(False) + except: + pass + +def test_create_csr(): + dim0 = 50 + dim1 = 50 + densities = [0, 0.01, 0.1, 0.2, 0.5] + for density in densities: + shape = rand_shape_2d(dim0, dim1) + matrix = rand_ndarray(shape, 'csr', density) + data = matrix.data + indptr = matrix.indptr + indices = matrix.indices + csr_created = mx.nd.csr_matrix(data=data, indptr=indptr, indices=indices, shape=shape) + assert csr_created.stype == 'csr' + assert same(csr_created.data.asnumpy(), data.asnumpy()) + assert same(csr_created.indptr.asnumpy(), indptr.asnumpy()) + assert same(csr_created.indices.asnumpy(), indices.asnumpy()) + csr_copy = mx.nd.array(csr_created) + assert(same(csr_copy.asnumpy(), csr_created.asnumpy())) + + +def test_create_row_sparse(): + dim0 = 50 + dim1 = 50 + densities = [0, 0.01, 0.1, 0.2, 0.5] + for density in densities: + shape = rand_shape_2d(dim0, dim1) + matrix = rand_ndarray(shape, 'row_sparse', density) + data = matrix.data + indices = matrix.indices + rsp_created = mx.nd.row_sparse_array(data=data, indices=indices, shape=shape) + assert rsp_created.stype == 'row_sparse' + assert same(rsp_created.data.asnumpy(), data.asnumpy()) + assert same(rsp_created.indices.asnumpy(), indices.asnumpy()) + rsp_copy = mx.nd.array(rsp_created) + assert(same(rsp_copy.asnumpy(), 
rsp_created.asnumpy())) + +def test_sparse_nd_empty(): + stypes = ['csr', 'row_sparse', 'default'] + for stype in stypes: + nd = mx.nd.empty((2,2), stype=stype) + assert(nd.stype == stype) + + +def test_synthetic_dataset_generator(): + def test_powerlaw_generator(csr_arr, final_row=1): + """Test power law distribution + Total Elements: 32000, Number of zeros: 3200 + Every row has 2 * non zero elements of the previous row. + Also since (2047 < 3200 < 4095) this will be true till 10th row""" + indices = csr_arr.indices.asnumpy() + indptr = csr_arr.indptr.asnumpy() + for row in range(1, final_row + 1): + nextrow = row + 1 + current_row_nnz = indices[indptr[row] - 1] + 1 + next_row_nnz = indices[indptr[nextrow] - 1] + 1 + assert next_row_nnz == 2 * current_row_nnz + + # Test if density is preserved + csr_arr_cols, _ = rand_sparse_ndarray(shape=(32, 10000), stype="csr", + density=0.01, distribution="powerlaw") + + csr_arr_small, _ = rand_sparse_ndarray(shape=(5, 5), stype="csr", + density=0.5, distribution="powerlaw") + + csr_arr_big, _ = rand_sparse_ndarray(shape=(32, 1000000), stype="csr", + density=0.4, distribution="powerlaw") + + csr_arr_square, _ = rand_sparse_ndarray(shape=(1600, 1600), stype="csr", + density=0.5, distribution="powerlaw") + assert len(csr_arr_cols.data) == 3200 + test_powerlaw_generator(csr_arr_cols, final_row=9) + test_powerlaw_generator(csr_arr_small, final_row=1) + test_powerlaw_generator(csr_arr_big, final_row=4) + test_powerlaw_generator(csr_arr_square, final_row=6) + + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py new file mode 100644 index 000000000000..748a89990cbd --- /dev/null +++ b/tests/python/unittest/test_sparse_operator.py @@ -0,0 +1,372 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
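+
+# Unit tests for sparse operators and storage fallback: elemwise_add,
+# cast_storage, dot with csr inputs, slice, sparse_retain, _square_sum,
+# add_n and operators without a sparse FComputeEx implementation.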
+ +from mxnet.test_utils import * + + +def check_elemwise_add_ex(lhs_stype, rhs_stype, shape, lhs_grad_stype=None, rhs_grad_stype=None): + lhs = mx.symbol.Variable('lhs', stype=lhs_stype) + rhs = mx.symbol.Variable('rhs', stype=rhs_stype) + lhs_nd = rand_ndarray(shape, lhs_stype) + rhs_nd = rand_ndarray(shape, rhs_stype) + lhs_np = lhs_nd.asnumpy() + rhs_np = rhs_nd.asnumpy() + + out_np = lhs_np + rhs_np + test = mx.symbol.elemwise_add(lhs, rhs) + location = {'lhs': lhs_nd, 'rhs': rhs_nd} + check_symbolic_forward(test, location, [out_np]) + check_numeric_gradient(test, location) + grad_stypes = {} + if lhs_grad_stype is not None and lhs_grad_stype != 'default': + grad_stypes['lhs'] = lhs_grad_stype + if rhs_grad_stype is not None and rhs_grad_stype != 'default': + grad_stypes['rhs'] = rhs_grad_stype + check_symbolic_backward(test, location, [out_np], [out_np, out_np], + grad_stypes=grad_stypes) + + +def test_elemwise_add_ex(): + shapes = [rand_shape_2d(), rand_shape_3d()] + for shape in shapes: + check_elemwise_add_ex('default', 'default', shape) + check_elemwise_add_ex('default', 'row_sparse', shape) + check_elemwise_add_ex('row_sparse', 'default', shape) + check_elemwise_add_ex('row_sparse', 'row_sparse', shape, + lhs_grad_stype='row_sparse', rhs_grad_stype='row_sparse') + + +# TODO(haibin) randomize this test +def test_elemwise_add_ex_multiple_stages(): + # prep data + shape = (4, 2) + ds_np = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + sp_np1 = np.array([[5, 10], [0, 0], [0, 0], [0, 0]]) + sp_np2 = np.array([[0, 0], [5, 10], [0, 0], [0, 0]]) + + val1 = mx.nd.array([[5, 10]]); + val2 = mx.nd.array([[5, 10]]); + idx1 = mx.nd.array([0], dtype=np.int64); + idx2 = mx.nd.array([1], dtype=np.int64); + sp_nd1 = mx.nd.row_sparse_array(val1, idx1, shape) + sp_nd2 = mx.nd.row_sparse_array(val2, idx2, shape) + ds_nd = mx.nd.array(ds_np) + + # sparse + sparse = sparse + sp_data1 = mx.symbol.Variable('sp_data1', stype='row_sparse') + sp_data2 = mx.symbol.Variable('sp_data2', stype='row_sparse') + ds_data = mx.symbol.Variable('ds_data') + plus = mx.symbol.elemwise_add(sp_data1, sp_data2, name='plus') + # sparse + dense = dense + test = mx.symbol.elemwise_add(plus, ds_data) + check_symbolic_forward(test, {'sp_data1': sp_nd1, 'sp_data2': sp_nd2, + 'ds_data': ds_nd}, [sp_np1 + sp_np2 + ds_np]) + + arr_grads = [mx.nd.zeros(shape) for i in range(3)] + exec_test = test.bind(default_context(), args={'sp_data1': sp_nd1, 'sp_data2': sp_nd2, + 'ds_data': ds_nd}, args_grad=arr_grads) + exec_test.forward(is_train=True) + assert_almost_equal(exec_test.outputs[0].asnumpy(), sp_np1 + sp_np2 + ds_np) + exec_test.backward(out_grads=exec_test.outputs) + assert_almost_equal(arr_grads[0].asnumpy(), arr_grads[1].asnumpy()) + +def test_cast_storage_ex(): + def check_cast_storage(shape, density, from_stype, to_stype, check_numeric_grad=True): + x = mx.symbol.Variable('x', stype=from_stype) + x_nd = rand_ndarray(shape, from_stype, density=density) + x_np = x_nd.asnumpy() + out_np = x_np + test = mx.symbol.cast_storage(x, stype=to_stype) + location = {'x': x_nd} + check_symbolic_forward(test, location, [out_np]) + # consider disable the numeric grad check for gpu block kernel since the input is large + if check_numeric_grad: + check_numeric_gradient(test, location) + grad_stypes = {'x': to_stype} + check_symbolic_backward(test, location, [out_np], [out_np], grad_stypes=grad_stypes) + + density = [1.00, 0.50, 0.10, 0.05, 0.01] + for d in density: + shape_2d = rand_shape_2d() + shape_3d = rand_shape_3d() + 
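+        # dense <-> csr casts (2-D) and dense <-> row_sparse casts (2-D and higher)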
check_cast_storage(shape_2d, d, 'csr', 'default') + check_cast_storage(shape_2d, d, 'default', 'csr') + check_cast_storage(shape_2d, d, 'row_sparse', 'default') + check_cast_storage(shape_2d, d, 'default', 'row_sparse') + check_cast_storage(shape_3d, d, 'row_sparse', 'default') + check_cast_storage(shape_3d, d, 'default', 'row_sparse') + for i in range(4, 6): + shape = rand_shape_nd(i, 5) + check_cast_storage(shape, d, 'default', 'row_sparse') + check_cast_storage(shape, d, 'row_sparse', 'default') + # Test specific gpu kernels + if default_context().device_type is 'gpu': + dim0 = rnd.randint(1, 10) + # test gpu thread kernel + check_cast_storage((dim0, rnd.randint( 1, 32)), d, 'default', 'csr') + # test gpu warp kernel + check_cast_storage((dim0, rnd.randint( 32, 512)), d, 'default', 'csr') + # test gpu block kernel + check_cast_storage((dim0, rnd.randint(512, 1024)), d, 'default', 'csr', + check_numeric_grad=False) + # test gpu thread kernel + check_cast_storage((dim0, rnd.randint( 1, 32)), d, 'default', 'row_sparse') + # test gpu warp kernel + check_cast_storage((dim0, rnd.randint( 32, 512)), d, 'default', 'row_sparse') + # test gpu block kernel + check_cast_storage((dim0, rnd.randint(512, 1024)), d, 'default', 'row_sparse', + check_numeric_grad=False) + +def test_sparse_dot(): + def test_dot_csr(lhs_shape, rhs_shape, rhs_stype, trans_lhs, lhs_density, rhs_density): + lhs_nd = rand_ndarray(lhs_shape, 'csr', density=lhs_density) + lhs_dns = lhs_nd.tostype('default') + rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_density) + rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.tostype('default') + + out = mx.nd.dot(lhs_nd, rhs_nd, transpose_a=trans_lhs) + out_dns = mx.nd.dot(lhs_dns, rhs_dns, transpose_a=trans_lhs) + out_np = out_dns.asnumpy() + assert_almost_equal(out.asnumpy(), out_np, rtol=1e-4, atol=1e-5) + + # test symbolic forward + lhs = mx.symbol.Variable('lhs', stype='csr') + rhs = mx.symbol.Variable('rhs', stype=rhs_stype) + out = mx.symbol.dot(lhs, rhs, transpose_a=trans_lhs) + location = {'lhs': lhs_nd, 'rhs': rhs_nd} + check_symbolic_forward(out, location, [out_np], rtol=1e-3, atol=1e-4) + + # test symbolic backward + backward_trans = not trans_lhs + rhs_backward_grad = mx.nd.dot(lhs_dns, out_dns, transpose_a=backward_trans).asnumpy() + expected = {'rhs': rhs_backward_grad} + check_symbolic_backward(out, location, [out_np], expected, + grad_req={'lhs': 'null', 'rhs': 'write'}, + rtol=1e-3, atol=1e-4) + + density = [1.00, 0.50, 0.10, 0.05, 0.01] + for lhs_d in density: + lhs_shape = rand_shape_2d(50, 200) + rhs_d = 1 + test_dot_csr(lhs_shape, (lhs_shape[1], 1), 'default', False, lhs_d, rhs_d) # test gpu SpMV + test_dot_csr(lhs_shape, (lhs_shape[0], 1), 'default', True , lhs_d, rhs_d) # (vector kernel) + test_dot_csr(lhs_shape, (lhs_shape[1], rnd.randint(5, 10)), 'default', False, lhs_d, rhs_d) # test gpu SpMM + test_dot_csr(lhs_shape, (lhs_shape[0], rnd.randint(5, 10)), 'default', True , lhs_d, rhs_d) # (scalar kernel) + for rhs_d in density: + test_dot_csr(lhs_shape, (lhs_shape[1], rnd.randint(1, 10)), 'row_sparse', False, lhs_d, rhs_d) + test_dot_csr(lhs_shape, (lhs_shape[0], rnd.randint(1, 10)), 'row_sparse', True, lhs_d, rhs_d) + + +def test_sparse_slice(): + def check_csr_slice(shape, slice_input): + storage_type = 'csr' + B, _ = rand_sparse_ndarray(shape, storage_type) + np = B.asnumpy() + begin = rnd.randint(0, B.shape[0] - 1) + end = rnd.randint(begin + 1, B.shape[0]) + nd_slice = mx.nd.crop(B, begin=begin, end=end) + assert same(nd_slice.asnumpy(), 
np[begin:end]), (nd_slice.asnumpy(), np[begin:end]) + + shape = (rnd.randint(7, 15), rnd.randint(1, 10)) + check_csr_slice(shape, True) + check_csr_slice(shape, False) + + +def test_sparse_retain(): + def check_sparse_retain(shape, density, index_type=np.int64): + num_rows = shape[0] + rsp, _ = rand_sparse_ndarray(shape=shape, stype='row_sparse', density=density) + length = np.random.randint(1, num_rows + 1) + idx = random_sample(list(range(0, num_rows)), length) + idx.sort() + dns = rsp.asnumpy() + tensor_retained_expected = np.zeros(shape) + for i in idx: + tensor_retained_expected[i][:] = dns[i] + indices = mx.nd.array(idx, dtype=index_type) + rsp_retained = mx.nd.sparse_retain(rsp, indices=indices) + assert same(tensor_retained_expected, rsp_retained.asnumpy()) + + # check numeric gradient + data = mx.symbol.Variable('data') + idx = mx.symbol.Variable('indices') + sym = mx.sym.sparse_retain(data=data, indices=idx) + check_numeric_gradient(sym, [rsp, indices], grad_nodes=['data'], + grad_stype_dict={'data': 'row_sparse'}) + + shape = rand_shape_2d() + shape_3d = rand_shape_3d() + densities = [0.01, 0.1, 0.2, 0.5, 0.8, 1.0] + index_types = [np.float32, np.int32, np.int64] + for density in densities: + for itype in index_types: + check_sparse_retain(shape, density, itype) + check_sparse_retain(shape_3d, density, itype) + + +def test_sparse_nd_zeros(): + def check_sparse_nd_zeros(stype, shape): + zero = mx.nd.zeros(shape) + sparse_zero = mx.nd.zeros(shape=shape, stype=stype) + assert_almost_equal(sparse_zero.asnumpy(), zero.asnumpy()) + + shape = rand_shape_2d() + check_sparse_nd_zeros('row_sparse', shape) + check_sparse_nd_zeros('csr', shape) + check_sparse_nd_zeros('default', shape) + + +def test_sparse_square_sum(): + dim0 = 30 + dim1 = 30 + axes = [0, 1] + keepdims = [False, True] + densities = [0, 0.01, 0.1, 0.2, 0.5] + for density in densities: + shape = rand_shape_2d(dim0, dim1) + rsp = rand_ndarray(shape, 'row_sparse', density) + dns = rsp.tostype('default') + for axis in axes: + for keepdim in keepdims: + ret = mx.nd._internal._square_sum(rsp, axis=axis, keepdims=keepdim) + if axis == 1 and keepdim: + assert ret.stype == 'row_sparse' + else: + assert ret.stype == 'default' + ret_expected = mx.nd.sum(dns*dns, axis=axis, keepdims=keepdim) + # check forward result + assert same(ret.asnumpy(), ret_expected.asnumpy()) + + rsp_data = mx.sym.Variable('data', stype='row_sparse') + test = mx._symbol_internal._square_sum(rsp_data, axis=axis, keepdims=keepdim) + + # check symbolic backward since ograd can be a rsp + # and cannot be checked through check_numeric_gradient + # because it will add a loss layer as the output layer + # which makes ograd of the square_sum dense + if axis == 1 and keepdims: + dns_data = mx.sym.Variable('data') + baseline = mx.sym.sum(mx.sym.square(dns_data), axis=axis, keepdims=keepdim) + igrad_expected = mx.nd.empty(dns.shape) + baseline_exec = baseline.bind(default_context(), args=[dns], + args_grad=[igrad_expected]) + baseline_exec.forward(is_train=True) + baseline_exec.backward([ret_expected]) + check_symbolic_backward(test, [rsp], [ret], [igrad_expected.asnumpy()], + grad_stypes={'data': 'row_sparse'}) + + # check numeric gradient + check_numeric_gradient(test, [rsp], grad_stype_dict={'data': 'row_sparse'}, + atol=1e-2, rtol=0.1) + +def test_sparse_storage_fallback(): + """ test operators which don't implement FComputeEx or FStatefulComputeEx """ + def check_broadcast_add(shape, lhs_stype, rhs_stype): + lhs = mx.symbol.Variable('lhs', stype=lhs_stype) + rhs = 
mx.symbol.Variable('rhs', stype=rhs_stype) + lhs_nd = rand_ndarray(shape, lhs_stype) + rhs_nd = rand_ndarray(shape, rhs_stype) + lhs_dns = mx.nd.cast_storage(lhs_nd, stype='default') + rhs_dns = mx.nd.cast_storage(rhs_nd, stype='default') + + out_dns = (lhs_dns + rhs_dns).asnumpy() + test = mx.symbol.broadcast_add(lhs, rhs) + location = {'lhs': lhs_nd, 'rhs': rhs_nd} + check_symbolic_forward(test, location, [out_dns]) + check_numeric_gradient(test, location) + check_symbolic_backward(test, location, [out_dns], [out_dns, out_dns]) + + def np_softmax(x, axis=-1): + # fix for old numpy on Travis not supporting keepdims + # x = x - np.max(x, axis=-1, keepdims=True) + x = x - np.max(x, axis=axis, keepdims=True) + x = np.exp(x) + # x /= np.sum(x, axis=-1, keepdims=True) + x /= np.sum(x, axis=axis, keepdims=True) + return x + + def check_softmax_with_shape(lhs_stype, rhs_stype, shape, preserve_shape=False): + # bind with label + ctx = default_context() + X = mx.symbol.Variable('X', stype=lhs_stype) + L = mx.symbol.Variable('L', stype=rhs_stype) + Y = mx.symbol.SoftmaxOutput(data=X, label=L, preserve_shape=preserve_shape) + x = rand_ndarray(shape, lhs_stype) + l = rand_ndarray(shape, rhs_stype) + l[:] = np_softmax(l.asnumpy()) + grad = mx.nd.empty(shape, ctx=ctx) + exec1 = Y.bind(ctx, args = [x, l], args_grad = {'X': grad}) + exec1.forward(is_train=True) + out = exec1.outputs[0].asnumpy() + assert_almost_equal(out, np_softmax(x.asnumpy()), rtol=1e-4) + exec1.backward() + assert_almost_equal(grad.asnumpy(), np_softmax(x.asnumpy()) - l.asnumpy(), rtol=1e-4) + + def check_concat(shape, lhs_stype, rhs_stype): + x = mx.symbol.Variable('x', stype=lhs_stype) + w = mx.symbol.Variable('w', stype=rhs_stype) + test = mx.sym.Concat(x, w) + x_nd = rand_ndarray(shape, lhs_stype) + w_nd = rand_ndarray(shape, rhs_stype) + location = {'x': x_nd, 'w': w_nd} + check_numeric_gradient(test, location) + + shape = rand_shape_2d() + stypes = ['default', 'csr', 'row_sparse'] + for lhs in stypes: + for rhs in stypes: + check_broadcast_add(shape, lhs, rhs) + check_concat(shape, lhs, rhs) + check_softmax_with_shape(lhs, rhs, shape, preserve_shape=False) + check_softmax_with_shape(rhs, rhs, shape, preserve_shape=True) + + +def test_sparse_elementwise_sum(): + def check_sparse_elementwise_sum_with_shape(stype, shape, n): + # forward + inputs = [mx.symbol.Variable('arg%d' % i) for i in range(n)] + out = mx.symbol.add_n(*inputs, name='esum') + arr = [] + arr_grad = [mx.nd.empty(shape) for _ in range(n)] + densities = [0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5] + for i in range(n): + arr.append(rand_ndarray(shape, stype, np.random.randint(0, len(densities)))) + + exec1 = out.bind(default_context(), + args=arr, + args_grad=arr_grad) + exec1.forward(is_train=True) + out1 = exec1.outputs[0].asnumpy() + out = sum(a.asnumpy() for a in arr) + assert_almost_equal(out, out1) + + out_grad = mx.nd.empty(shape) + out_grad[:] = np.random.uniform(-10, 10, shape) + # backward + exec1.backward([out_grad]) + for a in arr_grad: + assert_almost_equal(a.asnumpy(), out_grad.asnumpy()) + + maxdim = 5 + for dim in range(2, maxdim): + shape = tuple(np.random.randint(5, 10, size=dim)) + check_sparse_elementwise_sum_with_shape('row_sparse', shape, np.random.randint(1, 9)) + + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index fb1869f842b1..fd23f0e82b24 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -117,21 +117,21 @@ if [ ${TASK} == "python_test" ]; then 
mkdir -p ${PWD}/data if [ ${TRAVIS_OS_NAME} == "osx" ]; then - python -m nose tests/python/unittest || exit -1 - python3 -m nose tests/python/unittest || exit -1 + python -m nose -v tests/python/unittest || exit -1 + python3 -m nose -v tests/python/unittest || exit -1 # make cython3 # cython tests # export MXNET_ENFORCE_CYTHON=1 # python3 -m nose tests/python/unittest || exit -1 - python3 -m nose tests/python/train || exit -1 - python -m nose tests/python/doctest || exit -1 - python3 -m nose tests/python/doctest || exit -1 + python3 -m nose -v tests/python/train || exit -1 + python -m nose -v tests/python/doctest || exit -1 + python3 -m nose -v tests/python/doctest || exit -1 else - nosetests tests/python/unittest || exit -1 - nosetests3 tests/python/unittest || exit -1 - nosetests3 tests/python/train || exit -1 - nosetests tests/python/doctest || exit -1 - nosetests3 tests/python/doctest || exit -1 + nosetests -v tests/python/unittest || exit -1 + nosetests3 -v tests/python/unittest || exit -1 + nosetests3 -v tests/python/train || exit -1 + nosetests -v tests/python/doctest || exit -1 + nosetests3 -v tests/python/doctest || exit -1 fi exit 0 fi diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh index 94d674f3943e..f479306a31a8 100755 --- a/tests/travis/setup.sh +++ b/tests/travis/setup.sh @@ -33,8 +33,8 @@ if [ ${TRAVIS_OS_NAME} == "osx" ]; then brew install ImageMagick brew install swig if [ ${TASK} == "python_test" ]; then - python -m pip install --user nose numpy cython - python3 -m pip install --user nose numpy cython + python -m pip install --user nose numpy cython scipy + python3 -m pip install --user nose numpy cython scipy fi fi