This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

GPU implementation of cast_storage (dense to rsp) #7223

Merged: 14 commits, Aug 1, 2017
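For readers landing on this PR, a minimal sketch of the operator it accelerates, assuming an MXNet 1.x build with sparse support (names per the public mx.nd API; casting round-trips losslessly, which is exactly what the benchmark below asserts):

    import mxnet as mx
    import numpy as np

    # dense matrix with a single non-zero row
    data = np.zeros((4, 3), dtype='float32')
    data[1, :] = [1, 2, 3]
    dense = mx.nd.array(data)

    # cast dense -> row_sparse storage (the path this PR implements on GPU)
    rsp = mx.nd.cast_storage(dense, stype='row_sparse')
    assert rsp.stype == 'row_sparse'
    assert np.array_equal(rsp.asnumpy(), data)   # values are unchanged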
benchmark/python/cast_storage.py (40 changes: 26 additions & 14 deletions)
@@ -23,17 +23,17 @@ def measure_cost(repeat, f, *args, **kwargs):
 def run_cast_storage_synthetic():
-    def dns_to_csr(m, n, density, ctx, repeat):
+    def dense_to_sparse(m, n, density, ctx, repeat, stype):
         set_default_context(ctx)
         data_shape = (m, n)
-        dns_data = rand_ndarray(data_shape, 'csr', density).todense()
+        dns_data = rand_ndarray(data_shape, stype, density).todense()
         dns_data.wait_to_read()

         # do one warm up run, verify correctness
-        assert same(mx.nd.cast_storage(dns_data, stype='csr').asnumpy(), dns_data.asnumpy())
+        assert same(mx.nd.cast_storage(dns_data, stype).asnumpy(), dns_data.asnumpy())

         # start benchmarking
-        cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype='csr')
+        cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype)
         results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000)
         print(results)

@@ -46,24 +46,36 @@ def dns_to_csr(m, n, density, ctx, repeat):
     # num_repeat         number of benchmark runs to average over
     # contexts           mx.cpu(), mx.gpu()
     # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA
+    # benchmarks         dns_to_csr, dns_to_rsp
     m = [ 512,   512]
     n = [50000, 100000]
     density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01]
     num_repeat = 10
     contexts = [mx.gpu()]
+    benchmarks = ["dns_to_csr", "dns_to_rsp"]

     # run benchmark
-    print("==================================================")
-    print(" cast_storage benchmark: dense to csr, size m x n ")
-    print("==================================================")
-    headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)')
-    print(headline)
-    for i in range(len(n)):
-        for ctx in contexts:
-            for den in density:
-                dns_to_csr(m[i], n[i], den, ctx, num_repeat)
+    for b in benchmarks:
+        stype = ''
+        print("==================================================")
+        if b == "dns_to_csr":
+            stype = 'csr'
+            print(" cast_storage benchmark: dense to csr, size m x n ")
+        elif b == "dns_to_rsp":
+            stype = 'row_sparse'
+            print(" cast_storage benchmark: dense to rsp, size m x n ")
+        else:
+            print("invalid benchmark: %s" % b)
+            continue
+        print("==================================================")
+        headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)')
+        print(headline)
+        for i in range(len(n)):
+            for ctx in contexts:
+                for den in density:
+                    dense_to_sparse(m[i], n[i], den, ctx, num_repeat, stype)
+            print("")
+        print("")
+    print("==================================================")


 if __name__ == "__main__":
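The hunk above calls a measure_cost helper whose body lies outside this diff. A hypothetical sketch of such a timing helper, labeled as an assumption rather than the file's actual code:

    import time

    def measure_cost(repeat, f, *args, **kwargs):
        # hypothetical sketch: call f repeat times, then wait on every result
        # so asynchronous (e.g. GPU) work is included before timing stops;
        # returns average wall-clock seconds per call
        results = []
        start = time.time()
        for _ in range(repeat):
            results.append(f(*args, **kwargs))
        for r in results:
            r.wait_to_read()
        return (time.time() - start) / repeat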
python/mxnet/test_utils.py (6 changes: 6 additions & 0 deletions)
@@ -110,9 +110,15 @@ def rand_ndarray(shape, stype, density=None):
 def rand_shape_2d(dim0=10, dim1=10):
     return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1)


+def rand_shape_3d(dim0=10, dim1=10, dim2=10):
+    return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1), rnd.randint(1, dim2 + 1)
+
+
+def rand_shape_nd(n, dim=10):
+    return rnd.randint(1, dim+1, size=n)


 def np_reduce(dat, axis, keepdims, numpy_reduce_func):
     """Compatible reduce for old version of NumPy.
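A brief, hypothetical usage note for the new helper (rnd is numpy.random, as in test_utils.py). One subtlety worth flagging: rand_shape_2d and rand_shape_3d return tuples, while rand_shape_nd returns a numpy array, so convert where a tuple shape is required:

    import numpy.random as rnd

    def rand_shape_nd(n, dim=10):
        return rnd.randint(1, dim + 1, size=n)

    shape = rand_shape_nd(4)            # e.g. array([3, 7, 1, 9]); each dim in [1, 10]
    shape_t = tuple(rand_shape_nd(4))   # cast to tuple for APIs expecting tuple shapes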
src/operator/mxnet_op.h (10 changes: 10 additions & 0 deletions)
@@ -10,6 +10,9 @@
 #include <dmlc/omp.h>
 #include <mxnet/base.h>
 #include <algorithm>
+#ifdef __CUDACC__
+#include "../common/cuda_utils.h"
+#endif  // __CUDACC__

 namespace mxnet {
 namespace op {
@@ -32,6 +35,13 @@ int get_num_threads(const int N);
       i < (n); \
       i += blockDim.x * gridDim.x)

+inline cudaDeviceProp cuda_get_device_prop() {
+  int device;
+  CUDA_CALL(cudaGetDevice(&device));
+  cudaDeviceProp deviceProp;
+  CUDA_CALL(cudaGetDeviceProperties(&deviceProp, device));
+  return deviceProp;
+}

 /*!
  * \brief Get the number of blocks for cuda kernel given N

Review thread on cuda_get_device_prop:

Contributor: If the function is only used for cuda, guard the function definition with #ifdef __CUDACC__.

Contributor Author: It already is.
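The new cuda_get_device_prop helper lets kernel launch code query the hardware limits of the active device when sizing grids and blocks. A hedged end-to-end check that exercises the new GPU dense-to-rsp path from Python (a hypothetical smoke test, assuming a CUDA-enabled build, at least one visible GPU, and the MXNet 1.x sparse attribute names):

    import mxnet as mx
    import numpy as np

    # dense 512 x 1000 matrix with a single non-zero row
    data = np.zeros((512, 1000), dtype='float32')
    data[0, :] = 1.0
    dense = mx.nd.array(data, ctx=mx.gpu(0))

    rsp = mx.nd.cast_storage(dense, stype='row_sparse')
    rsp.wait_to_read()                                  # force the CUDA kernel to run
    assert rsp.stype == 'row_sparse'
    assert np.array_equal(rsp.indices.asnumpy(), [0])   # only row 0 is stored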