GPU implementation of cast_storage (dense to rsp) (apache#7223)
* CastStorageDnsRsp GPU Implementation

* updating function doc and some variable types and names

* adding cuda_get_device_prop() util function

* added rand_shape function for n-dimensional tensors

* updated cast storage unit test

* added dns_to_rsp to cast storage benchmark script

* removing redundant unit test

* fix lint

* minor change in benchmark script

* fix lint

* correct function description

* change storage_type to stype

* changed scope of using namespaces

* changed variable types from index_t to dim_t
stefanhenneking authored and eric-haibin-lin committed Aug 1, 2017
1 parent 55e4763 commit 7e1647c
Showing 6 changed files with 503 additions and 157 deletions.
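For readers unfamiliar with the operator being accelerated: cast_storage converts an NDArray between storage types, and this commit adds the GPU path from dense (dns) to row_sparse (rsp). A minimal usage sketch, assuming a CUDA-enabled MXNet build (the shapes and values below are illustrative only, not taken from this commit):

import numpy as np
import mxnet as mx

# Build a mostly-zero dense matrix and copy it to the GPU
# (assumes MXNet was compiled with CUDA support).
a = np.zeros((1000, 64), dtype=np.float32)
a[[10, 500], :] = 1.0
dns = mx.nd.array(a, ctx=mx.gpu())

# Cast dense -> row_sparse: only the rows containing non-zeros are stored.
rsp = mx.nd.cast_storage(dns, stype='row_sparse')

# Round-tripping back to dense reproduces the original values,
# mirroring the correctness check in the benchmark script below.
assert (rsp.todense().asnumpy() == a).all()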
40 changes: 26 additions & 14 deletions benchmark/python/cast_storage.py
@@ -23,17 +23,17 @@ def measure_cost(repeat, f, *args, **kwargs):


def run_cast_storage_synthetic():
-    def dns_to_csr(m, n, density, ctx, repeat):
+    def dense_to_sparse(m, n, density, ctx, repeat, stype):
        set_default_context(ctx)
        data_shape = (m, n)
-        dns_data = rand_ndarray(data_shape, 'csr', density).todense()
+        dns_data = rand_ndarray(data_shape, stype, density).todense()
        dns_data.wait_to_read()

        # do one warm up run, verify correctness
-        assert same(mx.nd.cast_storage(dns_data, stype='csr').asnumpy(), dns_data.asnumpy())
+        assert same(mx.nd.cast_storage(dns_data, stype).asnumpy(), dns_data.asnumpy())

        # start benchmarking
-        cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype='csr')
+        cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype)
        results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000)
        print(results)

@@ -46,24 +46,36 @@ def dns_to_csr(m, n, density, ctx, repeat):
    # num_repeat number of benchmark runs to average over
    # contexts   mx.cpu(), mx.gpu()
    # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA
+    # benchmarks dns_to_csr, dns_to_rsp
    m = [ 512, 512]
    n = [50000, 100000]
    density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01]
    num_repeat = 10
    contexts = [mx.gpu()]
+    benchmarks = ["dns_to_csr", "dns_to_rsp"]

    # run benchmark
-    print("==================================================")
-    print(" cast_storage benchmark: dense to csr, size m x n ")
-    print("==================================================")
-    headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)')
-    print(headline)
-    for i in range(len(n)):
-        for ctx in contexts:
-            for den in density:
-                dns_to_csr(m[i], n[i], den, ctx, num_repeat)
+    for b in benchmarks:
+        stype = ''
+        print("==================================================")
+        if b == "dns_to_csr":
+            stype = 'csr'
+            print(" cast_storage benchmark: dense to csr, size m x n ")
+        elif b == "dns_to_rsp":
+            stype = 'row_sparse'
+            print(" cast_storage benchmark: dense to rsp, size m x n ")
+        else:
+            print("invalid benchmark: %s" % b)
+            continue
+        print("==================================================")
+        headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)')
+        print(headline)
+        for i in range(len(n)):
+            for ctx in contexts:
+                for den in density:
+                    dense_to_sparse(m[i], n[i], den, ctx, num_repeat, stype)
+            print("")
+        print("")
+    print("==================================================")


if __name__ == "__main__":
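The timings above come from the measure_cost helper referenced in the first hunk header; its body is outside this diff. For orientation, a typical shape of such a helper is sketched below (an assumption, not the repository's exact code): it drains MXNet's asynchronous engine before and after the timed loop so queued work does not skew the measurement.

import time
import mxnet as mx

def measure_cost(repeat, f, *args, **kwargs):
    # Drain any pending asynchronous work before starting the clock.
    mx.nd.waitall()
    start = time.time()
    for _ in range(repeat):
        f(*args, **kwargs)
    # Wait for every queued operation to finish before stopping the clock.
    mx.nd.waitall()
    return (time.time() - start) / repeat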
6 changes: 6 additions & 0 deletions python/mxnet/test_utils.py
@@ -110,9 +110,15 @@ def rand_ndarray(shape, stype, density=None):
def rand_shape_2d(dim0=10, dim1=10):
    return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1)


+def rand_shape_3d(dim0=10, dim1=10, dim2=10):
+    return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1), rnd.randint(1, dim2 + 1)
+
+
+def rand_shape_nd(n, dim=10):
+    return rnd.randint(1, dim+1, size=n)


def np_reduce(dat, axis, keepdims, numpy_reduce_func):
    """Compatible reduce for old version of NumPy.
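The new rand_shape_nd helper complements rand_shape_2d and rand_shape_3d for the n-dimensional tensors exercised by the updated cast_storage unit test. Note that it returns a NumPy array rather than a tuple. A hypothetical usage sketch (not taken from the commit's tests; assumes rand_ndarray accepts row_sparse for the generated shape):

from mxnet.test_utils import rand_ndarray, rand_shape_nd

# Draw a random 4-dimensional shape with every dimension in [1, 10];
# convert to a tuple since rand_shape_nd returns a NumPy array.
shape = tuple(rand_shape_nd(4, dim=10))

# Generate a random row_sparse NDArray of that shape with ~30% density
# and densify it for element-wise comparison, as the benchmark does in 2-d.
rsp_data = rand_ndarray(shape, 'row_sparse', density=0.3)
dns_data = rsp_data.todense()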
10 changes: 10 additions & 0 deletions src/operator/mxnet_op.h
@@ -10,6 +10,9 @@
#include <dmlc/omp.h>
#include <mxnet/base.h>
#include <algorithm>
+#ifdef __CUDACC__
+#include "../common/cuda_utils.h"
+#endif  // __CUDACC__

namespace mxnet {
namespace op {
@@ -32,6 +35,13 @@ int get_num_threads(const int N);
      i < (n); \
      i += blockDim.x * gridDim.x)

+inline cudaDeviceProp cuda_get_device_prop() {
+  int device;
+  CUDA_CALL(cudaGetDevice(&device));
+  cudaDeviceProp deviceProp;
+  CUDA_CALL(cudaGetDeviceProperties(&deviceProp, device));
+  return deviceProp;
+}

/*!
 * \brief Get the number of blocks for cuda kernel given N
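The new cuda_get_device_prop() utility lets operator code query the active device's capabilities (warp size, thread and grid limits, shared memory) before launching kernels. As an illustration of the pattern only, and not the cast_storage kernel from this commit, here is a hedged sketch of deriving launch parameters from the queried properties using only the CUDA runtime API:

#include <algorithm>
#include <cuda_runtime.h>

// Illustrative only: derive a block/grid size for an element-wise kernel
// from the properties of the currently selected CUDA device.
inline void GetLaunchConfig(const int num_elements,
                            int* threads_per_block,
                            int* num_blocks) {
  int device;
  cudaGetDevice(&device);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  // Cap the block size at the device limit; 256 threads is a common default.
  *threads_per_block = std::min(256, prop.maxThreadsPerBlock);
  // Launch enough blocks to cover all elements with one thread per element.
  *num_blocks = (num_elements + *threads_per_block - 1) / *threads_per_block;
}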
(Diffs for the remaining three changed files are not shown here.)
