GPU implementation of cast_storage (dense to csr) (#7081)
* Added a GPU implementation of cast_storage dense to csr, plus unit tests and a benchmark. Additionally, the cast_storage interface changed to accommodate the temporary storage that the CUDA kernels need (see the sketch after this list).

* fixed whitespace

* minor unittest update

* removed whitespace

* add cast storage benchmark params info
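
The CUDA kernels themselves are not part of this excerpt, so here is the standard two-pass dense-to-CSR strategy sketched in NumPy for orientation. This is an illustrative sketch only, not MXNet code (function and variable names are made up); it shows where the temporary storage comes in: the per-row nonzero counts must be prefix-summed into the CSR row pointer, and GPU scan routines generally need a scratch buffer for that step.

import numpy as np

def dense_to_csr_sketch(dns):
    # pass 1: count the nonzeros in each row (one thread/warp per row on a GPU)
    m, n = dns.shape
    nnz_per_row = np.count_nonzero(dns, axis=1)
    # exclusive prefix sum over the counts yields the CSR row pointer; on the
    # GPU this scan is typically a library routine that needs a temp buffer
    indptr = np.zeros(m + 1, dtype=np.int64)
    np.cumsum(nnz_per_row, out=indptr[1:])
    # pass 2: each row writes its column indices and values at its offset
    indices = np.empty(int(indptr[-1]), dtype=np.int64)
    data = np.empty(int(indptr[-1]), dtype=dns.dtype)
    for i in range(m):
        cols = np.flatnonzero(dns[i])
        indices[indptr[i]:indptr[i + 1]] = cols
        data[indptr[i]:indptr[i + 1]] = dns[i, cols]
    return data, indices, indptr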
stefanhenneking authored and piiswrong committed Jul 27, 2017
1 parent c87b085 commit 0a0edc5
Showing 10 changed files with 513 additions and 63 deletions.
70 changes: 70 additions & 0 deletions benchmark/python/cast_storage.py
@@ -0,0 +1,70 @@
import ctypes
import time
import argparse

from mxnet.test_utils import *  # brings in mx, rand_ndarray, set_default_context, same
from mxnet.base import check_call, _LIB

parser = argparse.ArgumentParser(description="Benchmark cast storage operators",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--num-omp-threads', type=int, default=1,
                    help='number of omp threads to set in MXNet')
args = parser.parse_args()

def measure_cost(repeat, f, *args, **kwargs):
    """Return the average seconds per call of f over `repeat` runs."""
    start = time.time()
    for i in range(repeat):
        (f(*args, **kwargs)).wait_to_read()  # block until the async op completes
    end = time.time()
    return (end - start) / repeat


def run_cast_storage_synthetic():
    def dns_to_csr(m, n, density, ctx, repeat):
        set_default_context(ctx)
        data_shape = (m, n)
        dns_data = rand_ndarray(data_shape, 'csr', density).todense()
        dns_data.wait_to_read()

        # do one warm up run, verify correctness
        assert same(mx.nd.cast_storage(dns_data, stype='csr').asnumpy(), dns_data.asnumpy())

        # start benchmarking
        cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype='csr')
        results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000)
        print(results)

    check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads)))

    # params
    # m           number of rows
    # n           number of columns
    # density     density of the matrix
    # num_repeat  number of benchmark runs to average over
    # contexts    mx.cpu(), mx.gpu()
    # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA
    m = [ 512,    512]
    n = [50000, 100000]
    density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01]
    num_repeat = 10
    contexts = [mx.gpu()]

    # run benchmark
    print("==================================================")
    print(" cast_storage benchmark: dense to csr, size m x n ")
    print("==================================================")
    headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)')
    print(headline)
    for i in range(len(n)):
        for ctx in contexts:
            for den in density:
                dns_to_csr(m[i], n[i], den, ctx, num_repeat)
            print("")
    print("==================================================")


if __name__ == "__main__":
    run_cast_storage_synthetic()
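
For reference, the operator this script benchmarks can also be invoked directly. A minimal usage sketch, assuming an MXNet build with CUDA enabled (the same requirement as the mx.gpu() context above):

import mxnet as mx

dns = mx.nd.array([[0, 1, 0], [2, 0, 3]], ctx=mx.gpu())  # small dense input on the GPU
csr = mx.nd.cast_storage(dns, stype='csr')               # dense -> csr, this commit's GPU path
csr.wait_to_read()                                       # operators run asynchronously; block here
print(csr.asnumpy())                                     # same values, now held in CSR storage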
6 changes: 2 additions & 4 deletions src/common/utils.cc
@@ -10,14 +10,12 @@
 namespace mxnet {
 namespace common {
 
-
 template<>
-void CastStorageDispatch<cpu>(mshadow::Stream<cpu>* s,
+void CastStorageDispatch<cpu>(const OpContext& ctx,
                               const NDArray& input,
                               const NDArray& output) {
-  mxnet::op::CastStorageComputeImpl(s, input, output);
+  mxnet::op::CastStorageComputeImpl<cpu>(ctx, input, output);
 }
 
-
 } // namespace common
 } // namespace mxnet
4 changes: 2 additions & 2 deletions src/common/utils.cu
@@ -11,10 +11,10 @@ namespace mxnet {
 namespace common {
 
 template<>
-void CastStorageDispatch<gpu>(mshadow::Stream<gpu>* s,
+void CastStorageDispatch<gpu>(const OpContext& ctx,
                               const NDArray& input,
                               const NDArray& output) {
-  mxnet::op::CastStorageComputeImpl(s, input, output);
+  mxnet::op::CastStorageComputeImpl<gpu>(ctx, input, output);
 }
 
 } // namespace common
7 changes: 3 additions & 4 deletions src/common/utils.h
@@ -24,11 +24,10 @@
 #include <functional>
 
 namespace mxnet {
-
 namespace common {
 
 template<typename xpu>
-void CastStorageDispatch(mshadow::Stream<xpu>* s, const NDArray& input, const NDArray& output);
+void CastStorageDispatch(const OpContext& ctx, const NDArray& input, const NDArray& output);
 
 /*
  * \brief Get the corresponding tensor blobs from default storage NDArrays.
@@ -55,7 +54,7 @@ inline bool GetDefaultBlobs(const std::vector<NDArray>& nds,
<< "doesn't support NDArray inputs with non-default storage.";
}
NDArray temp(nd.shape(), nd.ctx(), false);
CastStorageDispatch<xpu>(ctx.get_stream<xpu>(), nd, temp);
CastStorageDispatch<xpu>(ctx, nd, temp);
temps->push_back(temp);
blobs->push_back(temp.data());
casted = true;
@@ -91,7 +90,7 @@ inline void CastNonDefaultStorage(const std::vector<NDArray>& dst,
<< "You are probably executing an operator which "
<< "doesn't support NDArray inputs with non-default storage.";
}
CastStorageDispatch<xpu>(ctx.get_stream<xpu>(), src[src_idx++], dst[i]);
CastStorageDispatch<xpu>(ctx, src[src_idx++], dst[i]);
}
}
CHECK_EQ(src_idx, src.size()) << "Not all src NDArrays are casted";
26 changes: 18 additions & 8 deletions src/ndarray/ndarray.cc
@@ -410,7 +410,7 @@ inline void CopyFromToDnsImpl(const NDArray from, NDArray *to, RunContext ctx) {

 // Make a copy of an NDArray based on storage type
 template<typename from_xpu, typename to_xpu>
-void CopyFromToImpl(const NDArray from, NDArray *to, RunContext ctx) {
+void CopyFromToImpl(const NDArray from, NDArray *to, RunContext rctx) {
   using namespace std;
   using namespace mshadow;
   // if storage type doesn't match, cast the storage first
@@ -423,10 +423,20 @@ void CopyFromToImpl(const NDArray from, NDArray *to, RunContext ctx) {
<< " to stype = " << to_stype << " is not supported";
const auto from_ctx = from.ctx();
const auto to_ctx = to->ctx();
auto s = ctx.get_stream<from_xpu>();
auto s = rctx.get_stream<from_xpu>();
bool is_train = mxnet::autograd::AutogradRuntime::Get()->IsTraining();
std::vector<Resource> requested;
if (is_same<from_xpu, mshadow::gpu>::value && from_stype != to_stype) {
requested.push_back(ResourceManager::Get()->Request(from_ctx,
ResourceRequest(ResourceRequest::kTempSpace)));
}
OpContext opctx{is_train,
rctx,
engine::CallbackOnComplete(),
requested};
if (from_ctx == to_ctx && from_stype != to_stype) {
// same ctx, different stypes, use cast op directly without copying
common::CastStorageDispatch<from_xpu>(s, from, *to);
common::CastStorageDispatch<from_xpu>(opctx, from, *to);
} else {
NDArray casted_nd; // an intermediate result before copying from to to
if (from_stype == to_stype) {
@@ -439,22 +449,22 @@ void CopyFromToImpl(const NDArray from, NDArray *to, RunContext ctx) {
         casted_nd = NDArray(to_stype, shape, from_ctx);
       }
       // convert from_nd to the same stype as to_nd
-      common::CastStorageDispatch<from_xpu>(s, from, casted_nd);
+      common::CastStorageDispatch<from_xpu>(opctx, from, casted_nd);
     }
 
     if (to_stype == kDefaultStorage) {
-      CopyFromToDnsImpl<from_xpu, to_xpu>(casted_nd, to, ctx);
+      CopyFromToDnsImpl<from_xpu, to_xpu>(casted_nd, to, rctx);
     } else if (to_stype == kRowSparseStorage) {
-      CopyFromToRspImpl<from_xpu, to_xpu>(casted_nd, to, ctx);
+      CopyFromToRspImpl<from_xpu, to_xpu>(casted_nd, to, rctx);
     } else if (to_stype == kCSRStorage) {
-      CopyFromToCsrImpl<from_xpu, to_xpu>(casted_nd, to, ctx);
+      CopyFromToCsrImpl<from_xpu, to_xpu>(casted_nd, to, rctx);
     } else {
       LOG(FATAL) << "unknown storage type" << to_stype;
     }
   }
   if (is_same<from_xpu, mshadow::gpu>::value || is_same<to_xpu, mshadow::gpu>::value) {
     // Wait GPU kernel to complete
-    ctx.get_stream<gpu>()->Wait();
+    rctx.get_stream<gpu>()->Wait();
   }
 }
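
Design note: the GPU-only branch added in the first ndarray.cc hunk requests a kTempSpace resource up front and passes it to the cast through the new OpContext argument. That is the "temporary storage in cuda kernels" the commit message refers to, and it is why CastStorageDispatch now takes an OpContext instead of a bare mshadow::Stream.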

(Diffs for the remaining changed files are not shown in this excerpt.)