GPU implementation of cast_storage (dense to csr) (#7081)
* Added a GPU implementation of cast_storage dense to csr, plus unit tests and a benchmark. Additionally, the cast_storage interface changed to accommodate the temporary storage that the CUDA kernels need (see the sketch after this list).

* fixed whitespace

* minor unittest update

* removed whitespace

* add cast storage benchmark params info
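
The CUDA kernels themselves are not part of this excerpt, so here is the standard two-pass dense-to-CSR strategy sketched in NumPy for orientation. This is an illustrative sketch only, not MXNet code (function and variable names are made up); it shows where the temporary storage comes in: the per-row nonzero counts must be prefix-summed into the CSR row pointer, and GPU scan routines generally need a scratch buffer for that step.

import numpy as np

def dense_to_csr_sketch(dns):
    # pass 1: count the nonzeros in each row (one thread/warp per row on a GPU)
    m, n = dns.shape
    nnz_per_row = np.count_nonzero(dns, axis=1)
    # exclusive prefix sum over the counts yields the CSR row pointer; on the
    # GPU this scan is typically a library routine that needs a temp buffer
    indptr = np.zeros(m + 1, dtype=np.int64)
    np.cumsum(nnz_per_row, out=indptr[1:])
    # pass 2: each row writes its column indices and values at its offset
    indices = np.empty(int(indptr[-1]), dtype=np.int64)
    data = np.empty(int(indptr[-1]), dtype=dns.dtype)
    for i in range(m):
        cols = np.flatnonzero(dns[i])
        indices[indptr[i]:indptr[i + 1]] = cols
        data[indptr[i]:indptr[i + 1]] = dns[i, cols]
    return data, indices, indptr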
stefanhenneking authored and piiswrong committed Jul 27, 2017
1 parent c87b085 commit 0a0edc5
Showing 10 changed files with 513 additions and 63 deletions.
70 changes: 70 additions & 0 deletions benchmark/python/cast_storage.py
@@ -0,0 +1,70 @@
import ctypes
import time
import argparse

from mxnet.test_utils import *  # brings in mx, rand_ndarray, set_default_context, same
from mxnet.base import check_call, _LIB

parser = argparse.ArgumentParser(description="Benchmark cast storage operators",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--num-omp-threads', type=int, default=1,
                    help='number of omp threads to set in MXNet')
args = parser.parse_args()

def measure_cost(repeat, f, *args, **kwargs):
    """Return the average seconds per call of f over `repeat` runs."""
    start = time.time()
    for i in range(repeat):
        (f(*args, **kwargs)).wait_to_read()  # block until the async op completes
    end = time.time()
    return (end - start) / repeat


def run_cast_storage_synthetic():
    def dns_to_csr(m, n, density, ctx, repeat):
        set_default_context(ctx)
        data_shape = (m, n)
        dns_data = rand_ndarray(data_shape, 'csr', density).todense()
        dns_data.wait_to_read()

        # do one warm up run, verify correctness
        assert same(mx.nd.cast_storage(dns_data, stype='csr').asnumpy(), dns_data.asnumpy())

        # start benchmarking
        cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype='csr')
        results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000)
        print(results)

    check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads)))

    # params
    # m           number of rows
    # n           number of columns
    # density     density of the matrix
    # num_repeat  number of benchmark runs to average over
    # contexts    mx.cpu(), mx.gpu()
    # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA
    m = [ 512,    512]
    n = [50000, 100000]
    density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01]
    num_repeat = 10
    contexts = [mx.gpu()]

    # run benchmark
    print("==================================================")
    print(" cast_storage benchmark: dense to csr, size m x n ")
    print("==================================================")
    headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)')
    print(headline)
    for i in range(len(n)):
        for ctx in contexts:
            for den in density:
                dns_to_csr(m[i], n[i], den, ctx, num_repeat)
            print("")
    print("==================================================")


if __name__ == "__main__":
    run_cast_storage_synthetic()
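
For reference, the operator this script benchmarks can also be invoked directly. A minimal usage sketch, assuming an MXNet build with CUDA enabled (the same requirement as the mx.gpu() context above):

import mxnet as mx

dns = mx.nd.array([[0, 1, 0], [2, 0, 3]], ctx=mx.gpu())  # small dense input on the GPU
csr = mx.nd.cast_storage(dns, stype='csr')               # dense -> csr, this commit's GPU path
csr.wait_to_read()                                       # operators run asynchronously; block here
print(csr.asnumpy())                                     # same values, now held in CSR storage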
6 changes: 2 additions & 4 deletions src/common/utils.cc
@@ -10,14 +10,12 @@
 namespace mxnet {
 namespace common {
 
-
 template<>
-void CastStorageDispatch<cpu>(mshadow::Stream<cpu>* s,
+void CastStorageDispatch<cpu>(const OpContext& ctx,
                               const NDArray& input,
                               const NDArray& output) {
-  mxnet::op::CastStorageComputeImpl(s, input, output);
+  mxnet::op::CastStorageComputeImpl<cpu>(ctx, input, output);
 }
 
-
 } // namespace common
 } // namespace mxnet
4 changes: 2 additions & 2 deletions src/common/utils.cu
@@ -11,10 +11,10 @@ namespace mxnet {
 namespace common {
 
 template<>
-void CastStorageDispatch<gpu>(mshadow::Stream<gpu>* s,
+void CastStorageDispatch<gpu>(const OpContext& ctx,
                               const NDArray& input,
                               const NDArray& output) {
-  mxnet::op::CastStorageComputeImpl(s, input, output);
+  mxnet::op::CastStorageComputeImpl<gpu>(ctx, input, output);
 }
 
 } // namespace common
7 changes: 3 additions & 4 deletions src/common/utils.h
@@ -24,11 +24,10 @@
 #include <functional>
 
 namespace mxnet {
-
 namespace common {
 
 template<typename xpu>
-void CastStorageDispatch(mshadow::Stream<xpu>* s, const NDArray& input, const NDArray& output);
+void CastStorageDispatch(const OpContext& ctx, const NDArray& input, const NDArray& output);
 
 /*
  * \brief Get the corresponding tensor blobs from default storage NDArrays.
@@ -55,7 +54,7 @@ inline bool GetDefaultBlobs(const std::vector<NDArray>& nds,
<< "doesn't support NDArray inputs with non-default storage.";
}
NDArray temp(nd.shape(), nd.ctx(), false);
CastStorageDispatch<xpu>(ctx.get_stream<xpu>(), nd, temp);
CastStorageDispatch<xpu>(ctx, nd, temp);
temps->push_back(temp);
blobs->push_back(temp.data());
casted = true;
@@ -91,7 +90,7 @@ inline void CastNonDefaultStorage(const std::vector<NDArray>& dst,
<< "You are probably executing an operator which "
<< "doesn't support NDArray inputs with non-default storage.";
}
CastStorageDispatch<xpu>(ctx.get_stream<xpu>(), src[src_idx++], dst[i]);
CastStorageDispatch<xpu>(ctx, src[src_idx++], dst[i]);
}
}
CHECK_EQ(src_idx, src.size()) << "Not all src NDArrays are casted";
26 changes: 18 additions & 8 deletions src/ndarray/ndarray.cc
@@ -410,7 +410,7 @@ inline void CopyFromToDnsImpl(const NDArray from, NDArray *to, RunContext ctx) {

 // Make a copy of an NDArray based on storage type
 template<typename from_xpu, typename to_xpu>
-void CopyFromToImpl(const NDArray from, NDArray *to, RunContext ctx) {
+void CopyFromToImpl(const NDArray from, NDArray *to, RunContext rctx) {
   using namespace std;
   using namespace mshadow;
   // if storage type doesn't match, cast the storage first
@@ -423,10 +423,20 @@ void CopyFromToImpl(const NDArray from, NDArray *to, RunContext ctx) {
<< " to stype = " << to_stype << " is not supported";
const auto from_ctx = from.ctx();
const auto to_ctx = to->ctx();
auto s = ctx.get_stream<from_xpu>();
auto s = rctx.get_stream<from_xpu>();
bool is_train = mxnet::autograd::AutogradRuntime::Get()->IsTraining();
std::vector<Resource> requested;
if (is_same<from_xpu, mshadow::gpu>::value && from_stype != to_stype) {
requested.push_back(ResourceManager::Get()->Request(from_ctx,
ResourceRequest(ResourceRequest::kTempSpace)));
}
OpContext opctx{is_train,
rctx,
engine::CallbackOnComplete(),
requested};
if (from_ctx == to_ctx && from_stype != to_stype) {
// same ctx, different stypes, use cast op directly without copying
common::CastStorageDispatch<from_xpu>(s, from, *to);
common::CastStorageDispatch<from_xpu>(opctx, from, *to);
} else {
NDArray casted_nd; // an intermediate result before copying from to to
if (from_stype == to_stype) {
@@ -439,22 +449,22 @@ void CopyFromToImpl(const NDArray from, NDArray *to, RunContext ctx) {
         casted_nd = NDArray(to_stype, shape, from_ctx);
       }
       // convert from_nd to the same stype as to_nd
-      common::CastStorageDispatch<from_xpu>(s, from, casted_nd);
+      common::CastStorageDispatch<from_xpu>(opctx, from, casted_nd);
     }
 
     if (to_stype == kDefaultStorage) {
-      CopyFromToDnsImpl<from_xpu, to_xpu>(casted_nd, to, ctx);
+      CopyFromToDnsImpl<from_xpu, to_xpu>(casted_nd, to, rctx);
     } else if (to_stype == kRowSparseStorage) {
-      CopyFromToRspImpl<from_xpu, to_xpu>(casted_nd, to, ctx);
+      CopyFromToRspImpl<from_xpu, to_xpu>(casted_nd, to, rctx);
     } else if (to_stype == kCSRStorage) {
-      CopyFromToCsrImpl<from_xpu, to_xpu>(casted_nd, to, ctx);
+      CopyFromToCsrImpl<from_xpu, to_xpu>(casted_nd, to, rctx);
     } else {
       LOG(FATAL) << "unknown storage type" << to_stype;
     }
   }
   if (is_same<from_xpu, mshadow::gpu>::value || is_same<to_xpu, mshadow::gpu>::value) {
     // Wait GPU kernel to complete
-    ctx.get_stream<gpu>()->Wait();
+    rctx.get_stream<gpu>()->Wait();
   }
 }
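
Design note: the GPU-only branch added in the first ndarray.cc hunk requests a kTempSpace resource up front and passes it to the cast through the new OpContext argument. That is the "temporary storage in cuda kernels" the commit message refers to, and it is why CastStorageDispatch now takes an OpContext instead of a bare mshadow::Stream.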

(Diffs for the remaining changed files are not shown in this excerpt.)