diff --git a/benchmark/python/cast_storage.py b/benchmark/python/cast_storage.py
new file mode 100644
index 000000000000..7ae537398c42
--- /dev/null
+++ b/benchmark/python/cast_storage.py
@@ -0,0 +1,99 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import ctypes
+
+from mxnet.test_utils import *
+import os
+import time
+import argparse
+
+from mxnet.base import check_call, _LIB
+
+parser = argparse.ArgumentParser(description="Benchmark cast storage operators",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet')
+args = parser.parse_args()
+
+def measure_cost(repeat, f, *args, **kwargs):
+ """Return the average wall-clock time per call of f, in seconds."""
+ start = time.time()
+ for i in range(repeat):
+ # wait_to_read() blocks until the asynchronous call has completed
+ f(*args, **kwargs).wait_to_read()
+ end = time.time()
+ return (end - start) / repeat
+
+
+def run_cast_storage_synthetic():
+ def dense_to_sparse(m, n, density, ctx, repeat, stype):
+ set_default_context(ctx)
+ data_shape = (m, n)
+ dns_data = rand_ndarray(data_shape, stype, density).tostype('default')
+ dns_data.wait_to_read()
+
+ # do one warm up run, verify correctness
+ assert same(mx.nd.cast_storage(dns_data, stype).asnumpy(), dns_data.asnumpy())
+
+ # start benchmarking
+ cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype)
+ results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000)
+ print(results)
+
+ check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads)))
+
+ # params
+ # m number of rows
+ # n number of columns
+ # density density of the matrix
+ # num_repeat number of benchmark runs to average over
+ # contexts mx.cpu(), mx.gpu()
+ # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA
+ # benchmarks dns_to_csr, dns_to_rsp
+ m = [ 512, 512]
+ n = [50000, 100000]
+ density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01]
+ num_repeat = 10
+ contexts = [mx.gpu()]
+ benchmarks = ["dns_to_csr", "dns_to_rsp"]
+
+ # run benchmark
+ for b in benchmarks:
+ stype = ''
+ print("==================================================")
+ if b == "dns_to_csr":
+ stype = 'csr'
+ print(" cast_storage benchmark: dense to csr, size m x n ")
+ elif b == "dns_to_rsp":
+ stype = 'row_sparse'
+ print(" cast_storage benchmark: dense to rsp, size m x n ")
+ else:
+ print("invalid benchmark: %s" % b)
+ continue
+ print("==================================================")
+ headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)')
+ print(headline)
+ for i in range(len(n)):
+ for ctx in contexts:
+ for den in density:
+ dense_to_sparse(m[i], n[i], den, ctx, num_repeat, stype)
+ print("")
+ print("")
+
+
+if __name__ == "__main__":
+ run_cast_storage_synthetic()
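+
+# Usage note (a sketch): e.g.
+#   python cast_storage.py --num-omp-threads 4
+# To benchmark on CPU instead, set `contexts = [mx.cpu()]` above and build without CUDA.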
diff --git a/benchmark/python/dot.py b/benchmark/python/dot.py
new file mode 100644
index 000000000000..4fe3bcdcd9c1
--- /dev/null
+++ b/benchmark/python/dot.py
@@ -0,0 +1,280 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import ctypes
+
+from mxnet.test_utils import *
+import scipy.sparse as sp
+import os
+import time
+import argparse
+
+from mxnet.base import check_call, _LIB
+from util import get_data, estimate_density
+
+parser = argparse.ArgumentParser(description="Benchmark sparse operators",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet')
+args = parser.parse_args()
+
+# some data information
+kdda = {
+ 'data_mini': 'kdda.t.mini',
+ 'data_name': 'kdda.t',
+ 'data_origin_name': 'kdda.t.bz2',
+ 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2",
+ 'feature_dim': 20216830,
+ 'm': 200,
+ 'batch_size': [64]
+}
+
+avazu = {
+ 'data_mini': 'avazu-app.t.mini',
+ 'data_name': 'avazu-app.t',
+ 'data_origin_name': 'avazu-app.t.bz2',
+ 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2",
+ 'feature_dim': 1000000,
+ 'm': 500,
+ 'batch_size': [64, 128]
+}
+
+
+def measure_cost(repeat, f, *args, **kwargs):
+ mx.nd.waitall()
+ start = time.time()
+ for i in range(repeat):
+ f(*args, **kwargs)
+ mx.nd.waitall()
+ end = time.time()
+ diff = end - start
+ return diff / repeat
+
+
+def test_dot_real(data_dict):
+ def get_iter(path, data_shape, batch_size):
+ data_train = mx.io.LibSVMIter(data_libsvm=path,
+ data_shape=data_shape,
+ batch_size=batch_size)
+ data_iter = iter(data_train)
+ return data_iter
+
+ data_dir = os.path.join(os.getcwd(), 'data')
+
+ path = os.path.join(data_dir, data_dict['data_name'])
+ if not os.path.exists(path):
+ get_data(
+ data_dir,
+ data_dict['data_name'],
+ data_dict['url'],
+ data_dict['data_origin_name']
+ )
+ assert os.path.exists(path)
+
+ k = data_dict['feature_dim']
+ m = data_dict['m']
+ density = estimate_density(path, data_dict['feature_dim'])
+
+ mini_path = os.path.join(data_dir, data_dict['data_mini'])
+ if not os.path.exists(mini_path):
+ os.system("head -n 2000 %r > %r" % (path, mini_path))
+ assert os.path.exists(mini_path)
+
+ print "Running Benchmarking on %r data" % data_dict['data_mini']
+ for batch_size in data_dict['batch_size']: # iterator through different batch size of choice
+ print "batch_size is %d" % batch_size
+ # model
+ data_shape = (k, )
+ train_iter = get_iter(mini_path, data_shape, batch_size)
+ weight = mx.nd.random_uniform(low=0, high=1, shape=(k, m))
+
+ csr_data = []
+ dns_data = []
+ num_batch = 0
+ for batch in train_iter:
+ data = train_iter.getdata()
+ csr_data.append(data)
+ dns_data.append(data.tostype('default'))
+ num_batch += 1
+ bag_of_data = [csr_data, dns_data]
+ num_repeat = 5
+ costs = []
+ for d in bag_of_data:
+ weight.wait_to_read()
+ cost = 0.
+ count = 0
+ for d_batch in d:
+ d_batch.wait_to_read()
+ cost += measure_cost(num_repeat, mx.nd.dot, d_batch, weight)
+ count += 1
+ costs.append(cost/count)
+ t_sparse = costs[0]
+ t_dense = costs[1]
+ ratio = t_dense / t_sparse
+ print('density(%)\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse')
+ fmt = "%0.4f\t\t%d\t%d\t%d\t%0.2f\t\t\t%0.4f\t%0.6f"
+ print(fmt % (density * 100, batch_size, m, k, ratio, t_dense, t_sparse))
+
+
+def test_dot_synthetic():
+ """benchmark sparse mxnet dot and scipy dot operator with matrices of given density.
+ `t_sparse` is the runtime of the invoked sparse dot operator in ms, while `t_dense` is the
+ runtime of dot(dns, dns), with the same matrices except that they are in default storage type.
+ """
+ # Benchmark MXNet's sparse dot operator
+ def bench_mx_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, lhs_den, rhs_den, trans_lhs, ctx, repeat):
+ set_default_context(ctx)
+ # Create matrix instances
+ lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den)
+ rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den)
+ lhs_dns = lhs_nd if lhs_stype == 'default' else lhs_nd.tostype('default')
+ rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.tostype('default')
+ # One warm up run, verify correctness
+ out = mx.nd.dot(lhs_nd, rhs_dns, trans_lhs)
+ out_expected = mx.nd.dot(lhs_dns, rhs_dns, trans_lhs)
+ assert_almost_equal(out.asnumpy(), out_expected.asnumpy(), rtol=1e-2, atol=1e-3)
+ # Start benchmarking
+ lhs_nd.wait_to_read()
+ rhs_nd.wait_to_read()
+ sparse_cost = measure_cost(repeat, mx.nd.dot, lhs_nd, rhs_nd, trans_lhs)
+ dense_cost = measure_cost(repeat, mx.nd.dot, lhs_dns, rhs_dns, trans_lhs)
+ speedup = dense_cost / sparse_cost
+ # Print results
+ m = lhs_shape[0]
+ k = lhs_shape[1]
+ n = rhs_shape[1]
+ results = '{:15.1f} {:15.1f} {:>10} {:8d} {:8d} {:8d} {:13.2f} {:13.2f} {:8.2f}'.format(lhs_den*100, rhs_den*100, str(ctx), m, k, n, sparse_cost*1000, dense_cost*1000, speedup)
+ print(results)
+
+ # Benchmark Scipy's sparse dot operator
+ def bench_sp_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, lhs_den, rhs_den, trans_lhs, ctx, repeat):
+ set_default_context(ctx)
+ assert default_context().device_type == 'cpu'
+ assert lhs_stype == 'csr'
+ assert rhs_stype == 'default'
+ # Create matrix instances
+ lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den)
+ rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den)
+ lhs_nd.wait_to_read()
+ rhs_nd.wait_to_read()
+ lhs_dns_np = np.transpose(lhs_nd.asnumpy()) if trans_lhs else lhs_nd.asnumpy()
+ rhs_dns_np = rhs_nd.asnumpy()
+ lhs_csr_sp = sp.spmatrix.transpose(sp.csr_matrix(lhs_nd.asnumpy())) if trans_lhs else sp.csr_matrix(lhs_nd.asnumpy())
+ # One warm up run
+ out = sp.spmatrix.dot(lhs_csr_sp, rhs_dns_np)
+ # Start benchmarking
+ sparse_cost = measure_cost(repeat, sp.spmatrix.dot, lhs_csr_sp, rhs_dns_np)
+ dense_cost = measure_cost(repeat, np.dot, lhs_dns_np, rhs_dns_np)
+ speedup = dense_cost / sparse_cost
+ # Print results
+ m = lhs_shape[0]
+ k = lhs_shape[1]
+ n = rhs_shape[1]
+ results = '{:15.1f} {:15.1f} {:>10} {:8d} {:8d} {:8d} {:13.2f} {:13.2f} {:8.2f}'.format(lhs_den*100, rhs_den*100, str(ctx), m, k, n, sparse_cost*1000, dense_cost*1000, speedup)
+ print(results)
+
+ check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads)))
+ # TODO(haibin): make these runtime options
+ # params
+ # m, n, k rows and columns of lhs and rhs matrix
+ # forward pass: m x k * k x n = m x n
+ # backward pass: (m x k)^T * m x n = k x n
+ # density_lhs density of the left-hand side matrix
+ # density_rhs density of the right-hand side matrix, if applicable
+ # num_repeat number of benchmark runs to average over
+ # context mx.cpu(), mx.gpu()
+ # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA
+ # mx_benchmarks csr_dns, csr.T_dns, csr_rsp
+ # sp_benchmarks csr_dns, csr.T_dns
+ # note: scipy benchmarks are only conducted if context is mx.cpu()
+ m = 512
+ k = [50000, 100000]
+ n = [64, 128]
+ density_lhs = [0.64, 0.32, 0.16, 0.08, 0.04, 0.02, 0.01]
+ density_rhs = [0.64, 0.32, 0.16, 0.08, 0.04, 0.02, 0.01]
+ num_repeat = 10
+ context = mx.gpu()
+ mx_benchmarks = ["csr_dns", "csr.T_dns", "csr_rsp"]
+ sp_benchmarks = ["csr_dns", "csr.T_dns"]
+
+ headline = '{:>15} {:>15} {:>10} {:>8} {:>8} {:>8} {:>13} {:>13} {:>8}'.format('lhs_density(%)', 'rhs_density(%)', 'context', 'm', 'k', 'n', 't_sparse(ms)', 't_dense(ms)', 'speedup')
+ if "csr_dns" in mx_benchmarks:
+ print("==================================================")
+ print(" mxnet sparse dot benchmark: dot(csr, dns) = dns ")
+ print(" (matrix multiplication: m x k * k x n = m x n) ")
+ print("==================================================")
+ print(headline)
+ transpose_lhs = False
+ for i in range(len(n)):
+ for d_lhs in density_lhs:
+ bench_mx_dot((m, k[i]), (k[i], n[i]), 'csr', 'default', d_lhs, 1, transpose_lhs, context, num_repeat)
+ print ""
+
+ if "csr_dns" in sp_benchmarks and mx.cpu() == context:
+ print("==================================================")
+ print(" scipy sparse dot benchmark: dot(csr, dns) = dns ")
+ print(" (matrix multiplication: m x k * k x n = m x n) ")
+ print("==================================================")
+ print(headline)
+ transpose_lhs = False
+ for i in range(len(n)):
+ for d_lhs in density_lhs:
+ bench_sp_dot((m, k[i]), (k[i], n[i]), 'csr', 'default', d_lhs, 1, transpose_lhs, context, num_repeat)
+ print ""
+
+ if "csr.T_dns" in mx_benchmarks:
+ print("==================================================")
+ print(" mxnet sparse dot benchmark: dot(csr.T, dns) = rsp")
+ print("(matrix multiplication: (m x k)^T * m x n = k x n)")
+ print("==================================================")
+ print(headline)
+ transpose_lhs = True
+ for i in range(len(n)):
+ for d_lhs in density_lhs:
+ bench_mx_dot((m, k[i]), (m, n[i]), 'csr', 'default', d_lhs, 1, transpose_lhs, context, num_repeat)
+ print ""
+
+ if "csr.T_dns" in sp_benchmarks and mx.cpu() == context:
+ print("==================================================")
+ print(" scipy sparse dot benchmark: dot(csr.T, dns) = dns")
+ print("(matrix multiplication: (m x k)^T * m x n = k x n)")
+ print("==================================================")
+ print(headline)
+ transpose_lhs = True
+ for i in range(len(n)):
+ for d_lhs in density_lhs:
+ bench_sp_dot((m, k[i]), (m, n[i]), 'csr', 'default', d_lhs, 1, transpose_lhs, context, num_repeat)
+ print ""
+
+ if "csr_rsp" in mx_benchmarks:
+ print("==================================================")
+ print(" mxnet sparse dot benchmark: dot(csr, rsp) = dns ")
+ print(" (matrix multiplication: m x k * k x n = m x n) ")
+ print("==================================================")
+ print(headline)
+ transpose_lhs = False
+ for i in range(len(n)):
+ for d_lhs in density_lhs:
+ for d_rhs in density_rhs:
+ bench_mx_dot((m, k[i]), (k[i], n[i]), 'csr', 'row_sparse', d_lhs, d_rhs, transpose_lhs, context, num_repeat)
+ print ""
+ print ""
+
+
+if __name__ == "__main__":
+ test_dot_synthetic()
+ test_dot_real(avazu)
+ test_dot_real(kdda)
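+
+# Usage note (a sketch): run from benchmark/python so util.py is importable, e.g.
+#   python dot.py --num-omp-threads 4
+# The LibSVM datasets are downloaded into ./data on first use.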
diff --git a/benchmark/python/sparse_end2end.py b/benchmark/python/sparse_end2end.py
new file mode 100644
index 000000000000..62a3b77b8482
--- /dev/null
+++ b/benchmark/python/sparse_end2end.py
@@ -0,0 +1,226 @@
+from mxnet.test_utils import *
+import time
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description="Run sparse linear regression " \
+ "with distributed kvstore",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--profiler', type=int, default=0,
+ help='whether to use profiler')
+parser.add_argument('--num-epoch', type=int, default=1,
+ help='number of epochs to train')
+parser.add_argument('--batch-size', type=int, default=512,
+ help='number of examples per batch')
+parser.add_argument('--num-batch', type=int, default=99999999,
+ help='number of batches per epoch')
+parser.add_argument('--dummy-iter', type=int, default=0,
+ help='whether to use dummy iterator to exclude io cost')
+parser.add_argument('--kvstore', type=str, default='local',
+ help='what kvstore to use [local, dist_sync, etc]')
+parser.add_argument('--log-level', type=str, default='debug',
+ help='logging level [debug, info, error]')
+parser.add_argument('--dataset', type=str, default='avazu',
+ help='what test dataset to use')
+parser.add_argument('--num-gpu', type=int, default=0,
+ help='number of gpus to use. 0 means using cpu(0);'
+ 'otherwise, use gpu(0),...,gpu(num_gpu-1)')
+parser.add_argument('--output-dim', type=int, default=4,
+ help='number of columns of the forward output')
+
+
+def get_libsvm_data(data_dir, data_name, url, data_origin_name):
+ if not os.path.isdir(data_dir):
+ os.system("mkdir " + data_dir)
+ os.chdir(data_dir)
+ if (not os.path.exists(data_name)):
+ import urllib
+ zippath = os.path.join(data_dir, data_origin_name)
+ urllib.urlretrieve(url, zippath)
+ os.system("bzip2 -d %r" % data_origin_name)
+ os.chdir("..")
+
+
+class DummyIter(mx.io.DataIter):
+ "A dummy iterator that always return the same batch, used for speed testing"
+ def __init__(self, real_iter):
+ super(DummyIter, self).__init__()
+ self.real_iter = real_iter
+ self.provide_data = real_iter.provide_data
+ self.provide_label = real_iter.provide_label
+ self.batch_size = real_iter.batch_size
+
+ for batch in real_iter:
+ self.the_batch = batch
+ break
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ return self.the_batch
+
+# testing dataset sources
+avazu = {
+ 'data_name': 'avazu-app.t',
+ 'data_origin_name': 'avazu-app.t.bz2',
+ 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2",
+ 'feature_dim': 1000000,
+}
+
+kdda = {
+ 'data_name': 'kdda.t',
+ 'data_origin_name': 'kdda.t.bz2',
+ 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2",
+ 'feature_dim': 20216830,
+}
+
+datasets = { 'kdda' : kdda, 'avazu' : avazu }
+
+
+def get_sym(feature_dim):
+ x = mx.symbol.Variable("data", stype='csr')
+ norm_init = mx.initializer.Normal(sigma=0.01)
+ w = mx.symbol.Variable("w", shape=(feature_dim, args.output_dim), init=norm_init, stype='row_sparse')
+ embed = mx.symbol.dot(x, w)
+ y = mx.symbol.Variable("softmax_label")
+ model = mx.symbol.SoftmaxOutput(data=embed, label=y, name="out")
+ return model
+
+
+def row_sparse_pull(kv, key, data, slices, weight_array, priority):
+ # if have kvstore, need to pull corresponding rows of
+ # the weights to each context
+ # column indices (NDArray type) of the csr data
+ # used as the row_idx of the weight row-sparse matrix
+ row_indices = data.indices
+ if len(slices) == 1:
+ kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_indices)
+ else: # more than one slice: multi-GPU training, retain weight rows according to the data slices
+ # TODO(junwu):
+ # the following line blocks, may need to pre-compute
+ # and cache it outside the for loop
+ indptr = data.indptr.asnumpy()
+ row_idx_array = []
+ for s in slices:
+ row_idx_array.append(row_indices[indptr[s.start]:indptr[s.stop]])
+ kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_idx_array)
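+
+# Worked example (hypothetical numbers): for a csr batch with
+# indptr = [0, 2, 3, 5] and indices = [10, 42, 7, 10, 99], slices
+# [0:2) and [2:3) yield per-device row_ids [10, 42, 7] and [10, 99],
+# so each device only pulls the weight rows its batch slice touches.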
+
+
+if __name__ == '__main__':
+
+ # arg parser
+ args = parser.parse_args()
+ num_epoch = args.num_epoch
+ num_batch = args.num_batch
+ kvstore = args.kvstore
+ profiler = args.profiler > 0
+ batch_size = args.batch_size if args.num_gpu == 0 else args.num_gpu * args.batch_size
+ dummy_iter = args.dummy_iter
+ dataset = args.dataset
+ log_level = args.log_level
+ contexts = mx.context.cpu(0) if args.num_gpu < 1\
+ else [mx.context.gpu(i) for i in range(args.num_gpu)]
+
+ # create kvstore when there are gpus
+ kv = mx.kvstore.create(kvstore) if args.num_gpu >= 1 else None
+ rank = kv.rank if kv is not None else 0
+ num_worker = kv.num_workers if kv is not None else 1
+
+ # only print log for rank 0 worker
+ import logging
+ if rank != 0:
+ log_level = logging.ERROR
+ elif log_level.lower() == 'debug': # accept the default '--log-level debug'
+ log_level = logging.DEBUG
+ else:
+ log_level = logging.INFO
+ head = '%(asctime)-15s %(message)s'
+ logging.basicConfig(level=log_level, format=head)
+
+ # dataset
+ assert(dataset in datasets), "unknown dataset " + dataset
+ metadata = datasets[dataset]
+ feature_dim = metadata['feature_dim']
+ logging.debug('preparing data ... ')
+ data_dir = os.path.join(os.getcwd(), 'data')
+ path = os.path.join(data_dir, metadata['data_name'])
+ if not os.path.exists(path):
+ get_libsvm_data(data_dir, metadata['data_name'], metadata['url'],
+ metadata['data_origin_name'])
+ assert os.path.exists(path)
+
+ # data iterator
+ train_data = mx.io.LibSVMIter(data_libsvm=path, data_shape=(feature_dim,),
+ batch_size=batch_size, num_parts=num_worker,
+ part_index=rank)
+ if dummy_iter:
+ train_data = DummyIter(train_data)
+
+ # model
+ model = get_sym(feature_dim)
+
+ # module
+ mod = mx.mod.Module(symbol=model, data_names=['data'],
+ label_names=['softmax_label'], context=contexts)
+ mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
+ mod.init_params(initializer=mx.init.Uniform(scale=.1))
+ sgd = mx.optimizer.SGD(momentum=0.0, clip_gradient=5.0,
+ learning_rate=0.1, rescale_grad=1.0/batch_size/num_worker)
+ mod.init_optimizer(optimizer=sgd, kvstore=kv)
+ # use accuracy as the metric
+ metric = mx.metric.create('acc')
+
+ index = mod._exec_group.param_names.index('w')
+ # weight_array bound to executors of the contexts
+ weight_array = mod._exec_group.param_arrays[index]
+
+ # start profiler
+ if profiler:
+ device = 'cpu'
+ if args.num_gpu > 0:
+ device = 'gpu' + str(args.num_gpu)
+ name = 'profile_' + args.dataset + '_' + device + '_nworker' + str(num_worker)\
+ + '_batchsize' + str(args.batch_size) + '_outdim' + str(args.output_dim) + '.json'
+ mx.profiler.profiler_set_config(mode='all', filename=name)
+ mx.profiler.profiler_set_state('run')
+
+ logging.debug('start training ...')
+ start = time.time()
+ data_iter = iter(train_data)
+ for epoch in range(num_epoch):
+ nbatch = 0
+ end_of_batch = False
+ data_iter.reset()
+ metric.reset()
+ next_batch = next(data_iter)
+ if kv is not None:
+ row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index)
+ while not end_of_batch:
+ nbatch += 1
+ batch = next_batch
+
+ mod.forward_backward(batch)
+ # update parameters
+ mod.update()
+
+ try:
+ # pre fetch next batch
+ next_batch = next(data_iter)
+ if nbatch == num_batch:
+ raise StopIteration
+ if kv is not None:
+ row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index)
+ except StopIteration:
+ end_of_batch = True
+ # accumulate prediction accuracy
+ mod.update_metric(metric, batch.label)
+ logging.info('epoch %d, %s' % (epoch, metric.get()))
+ if epoch == 0:
+ print "num_batches = ", nbatch
+ if profiler:
+ mx.profiler.profiler_set_state('stop')
+ end = time.time()
+ time_cost = end - start
+ logging.info('num_worker = ' + str(num_worker) + ', time cost = ' + str(time_cost))
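+
+# Usage note (a sketch): for a single-machine run with the default local kvstore,
+#   python sparse_end2end.py --dataset avazu --num-epoch 1
+# distributed modes such as dist_sync assume workers started by a dmlc launcher.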
diff --git a/benchmark/python/sparse_op.py b/benchmark/python/sparse_op.py
new file mode 100644
index 000000000000..0683aa84eacb
--- /dev/null
+++ b/benchmark/python/sparse_op.py
@@ -0,0 +1,245 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import ctypes
+
+from mxnet.test_utils import *
+import scipy.sparse as sp
+import os
+import time
+import argparse
+
+from mxnet.base import check_call, _LIB
+from util import get_data, estimate_density
+
+parser = argparse.ArgumentParser(description="Benchmark sparse operators",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet')
+args = parser.parse_args()
+
+# some data information
+kdda = {
+ 'data_mini': 'kdda.t.mini',
+ 'data_name': 'kdda.t',
+ 'data_origin_name': 'kdda.t.bz2',
+ 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2",
+ 'feature_dim': 20216830,
+ 'm': 200,
+ 'batch_size': [64]
+}
+
+avazu = {
+ 'data_mini': 'avazu-app.t.mini',
+ 'data_name': 'avazu-app.t',
+ 'data_origin_name': 'avazu-app.t.bz2',
+ 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2",
+ 'feature_dim': 1000000,
+ 'm': 500,
+ 'batch_size': [64, 128]
+}
+
+
+def measure_cost(repeat, f, *args, **kwargs):
+ # start bench
+ start = time.time()
+ results = []
+ for i in range(repeat):
+ results.append(f(*args, **kwargs))
+ for result in results:
+ result.wait_to_read()
+ end = time.time()
+ diff = end - start
+ return diff / repeat
+
+
+def test_dot_real(data_dict):
+ def get_iter(path, data_shape, batch_size):
+ data_train = mx.io.LibSVMIter(data_libsvm=path,
+ data_shape=data_shape,
+ batch_size=batch_size)
+ data_iter = iter(data_train)
+ return data_iter
+
+ data_dir = os.path.join(os.getcwd(), 'data')
+
+ path = os.path.join(data_dir, data_dict['data_name'])
+ if not os.path.exists(path):
+ get_data(
+ data_dir,
+ data_dict['data_name'],
+ data_dict['url'],
+ data_dict['data_origin_name']
+ )
+ assert os.path.exists(path)
+
+ k = data_dict['feature_dim']
+ m = data_dict['m']
+ density = estimate_density(path, data_dict['feature_dim'])
+
+ mini_path = os.path.join(data_dir, data_dict['data_mini'])
+ if not os.path.exists(mini_path):
+ os.system("head -n 2000 %r > %r" % (path, mini_path))
+ assert os.path.exists(mini_path)
+
+ print "Running Benchmarking on %r data" % data_dict['data_mini']
+ for batch_size in data_dict['batch_size']: # iterator through different batch size of choice
+ print "batch_size is %d" % batch_size
+ # model
+ data_shape = (k, )
+ train_iter = get_iter(mini_path, data_shape, batch_size)
+ weight = mx.nd.random_uniform(low=0, high=1, shape=(k, m))
+
+ csr_data = []
+ dns_data = []
+ num_batch = 0
+ for batch in train_iter:
+ data = train_iter.getdata()
+ csr_data.append(data)
+ dns_data.append(data.tostype('default'))
+ num_batch += 1
+ bag_of_data = [csr_data, dns_data]
+ num_repeat = 5
+ costs = []
+ for d in bag_of_data:
+ weight.wait_to_read()
+ cost = 0.
+ count = 0
+ for d_batch in d:
+ d_batch.wait_to_read()
+ cost += measure_cost(num_repeat, mx.nd.dot, d_batch, weight)
+ count += 1
+ costs.append(cost/count)
+ t_sparse = costs[0]
+ t_dense = costs[1]
+ ratio = t_dense / t_sparse
+ print('density(%)\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse')
+ fmt = "%0.4f\t\t%d\t%d\t%d\t%0.2f\t\t\t%0.4f\t%0.6f"
+ print(fmt % (density * 100, batch_size, m, k, ratio, t_dense, t_sparse))
+
+
+def test_dot_synthetic():
+ """benchmark mx.nd.dot(sparse_ndarray, dense_ndarray) with given density.
+ `t_sparse` is the time cost of dot(csr, dns), while `t_dense` is the time cost
+ of dot(dns, dns), with the same matrix except that it is in default storage type.
+ """
+ def measure_cost_forward_baseline(repeat, dot, lhs, rhs):
+ start = time.time()
+ for i in range(repeat):
+ dot(lhs, rhs)
+ end = time.time()
+ diff = end - start
+ return diff / repeat
+
+ def measure_cost_backward_baseline(repeat, dot, transpose, lhs, rhs):
+ start = time.time()
+ for i in range(repeat):
+ dot(transpose(lhs), rhs)
+ end = time.time()
+ diff = end - start
+ return diff / repeat
+
+ def bench_dot_forward(m, k, n, density, ctx, repeat):
+ set_default_context(ctx)
+ dns = mx.nd.random_uniform(shape=(k, n)).copyto(ctx)
+ data_shape = (m, k)
+ csr_data = rand_ndarray(data_shape, 'csr', density)
+ dns_data = csr_data.tostype('default')
+ rhs_dns_np = dns.asnumpy()
+ lhs_csr_sp = sp.csr_matrix(dns_data.asnumpy()) # csr in scipy
+ lhs_dns_np = lhs_csr_sp.todense() # scipy csr matrices use todense(), not tostype()
+
+ data = [dns_data, csr_data]
+ costs = []
+ for d in data:
+ dns.wait_to_read()
+ d.wait_to_read()
+ cost = measure_cost(repeat, mx.nd.dot, d, dns)
+ costs.append(cost)
+ ratio = costs[0] / costs[1]
+
+ costs_baseline = []
+ cost = measure_cost_forward_baseline(repeat, np.dot, lhs_dns_np, rhs_dns_np)
+ costs_baseline.append(cost)
+ cost = measure_cost_forward_baseline(repeat, sp.spmatrix.dot, lhs_csr_sp, rhs_dns_np)
+ costs_baseline.append(cost)
+ ratio_baseline = costs_baseline[0] / costs_baseline[1]
+ fmt = "%0.1f\t\t%s\t%d\t%d\t%d\t%0.2f\t\t\t%0.2f\t%0.5f\t\t%0.2f\t\t\t\t%0.6f\t%0.5f"
+ print(fmt % (density * 100, str(ctx), n, m, k, ratio, costs[0], costs[1],
+ ratio_baseline, costs_baseline[0], costs_baseline[1]))
+
+ def bench_dot_backward(m, k, n, density, ctx, repeat):
+ set_default_context(ctx)
+ dns = mx.nd.random_uniform(shape=(m, n)).copyto(ctx)
+ data_shape = (m, k)
+ csr_data = rand_ndarray(data_shape, 'csr', density)
+ dns_data = csr_data.tostype('default')
+ rhs_dns_np = dns.asnumpy()
+ lhs_csr_sp = sp.csr_matrix(dns_data.asnumpy())
+ lhs_dns_np = lhs_csr_sp.todense()
+
+ data = [dns_data, csr_data]
+ costs = []
+ for d in data:
+ dns.wait_to_read()
+ d.wait_to_read()
+ cost = measure_cost(repeat, mx.nd.dot, d, dns, transpose_a=True)
+ costs.append(cost)
+ ratio = costs[0] / costs[1]
+
+ costs_baseline = []
+ cost = measure_cost_backward_baseline(repeat, np.dot, np.transpose, lhs_dns_np, rhs_dns_np)
+ costs_baseline.append(cost)
+ cost = measure_cost_backward_baseline(repeat, sp.spmatrix.dot, sp.spmatrix.transpose, lhs_csr_sp, rhs_dns_np)
+ costs_baseline.append(cost)
+ ratio_baseline = costs_baseline[0] / costs_baseline[1]
+ fmt = "%0.1f\t\t%s\t%d\t%d\t%d\t%0.2f\t\t\t%0.2f\t%0.5f\t\t%0.2f\t\t\t\t%0.6f\t%0.5f"
+ print(fmt % (density * 100, str(ctx), n, m, k, ratio, costs[0], costs[1],
+ ratio_baseline, costs_baseline[0], costs_baseline[1]))
+
+ print("A = sparse NDArray of shape(m, k)")
+ print("B = dense NDArray of shape(k, n)")
+ print("dot_forward\tdot(csr, dns)")
+ print('density(%)\tcontext\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse'
+ '\tt_scipy_dense/t_scipy_sparse\tt_scipy_dense\tt_scipy_sparse')
+
+ check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads)))
+ # TODO(haibin) make these runtime options
+ m = 512
+ k = [50000, 100000]
+ n = [64, 128]
+ density = [1.00, 0.90, 0.70, 0.50, 0.30, 0.20, 0.10, 0.07, 0.05, 0.02, 0.01, 0.005, 0.001]
+ num_repeat = 10
+ # contexts = [mx.cpu(), mx.gpu(0)]
+ contexts = [mx.cpu()]
+ for i in range(2):
+ for ctx in contexts:
+ for den in density:
+ bench_dot_forward(m, k[i], n[i], den, ctx, num_repeat)
+
+ print("dot_backward\tdot(csr.T, dns)")
+ print('density(%)\tcontext\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse'
+ '\tt_scipy_dense/t_scipy_sparse\tt_scipy_dense\tt_scipy_sparse')
+ for i in range(2):
+ for ctx in contexts:
+ for den in density:
+ bench_dot_backward(m, k[i], n[i], den, ctx, num_repeat)
+
+
+if __name__ == "__main__":
+ test_dot_real(avazu)
+ test_dot_real(kdda)
+ test_dot_synthetic()
diff --git a/benchmark/python/util.py b/benchmark/python/util.py
new file mode 100644
index 000000000000..947ff4a65037
--- /dev/null
+++ b/benchmark/python/util.py
@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import random
+
+
+def get_data(data_dir, data_name, url, data_origin_name):
+ if not os.path.isdir(data_dir):
+ os.system("mkdir " + data_dir)
+ os.chdir(data_dir)
+ if (not os.path.exists(data_name)):
+ import urllib
+ zippath = os.path.join(data_dir, data_origin_name)
+ urllib.urlretrieve(url, zippath)
+ os.system("bzip2 -d %r" % data_origin_name)
+ os.chdir("..")
+
+
+def estimate_density(DATA_PATH, feature_size):
+ """Estimate the density of a libsvm dataset by sampling ~1% of its lines, 10 times over."""
+ if not os.path.exists(DATA_PATH):
+ raise Exception("Data is not there!")
+ density = []
+ P = 0.01
+ for _ in range(10):
+ num_non_zero = 0
+ num_sample = 0
+ with open(DATA_PATH) as f:
+ for line in f:
+ if (random.random() < P):
+ num_non_zero += len(line.split(" ")) - 1
+ num_sample += 1
+ density.append(num_non_zero * 1.0 / (feature_size * num_sample))
+ return sum(density) / len(density)
+
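+# Example (illustrative): estimate_density('data/avazu-app.t', 1000000) returns the
+# average fraction of the feature_size columns that are non-zero in a sampled line.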
diff --git a/docs/api/python/ndarray.md b/docs/api/python/ndarray.md
index 5e9f7e1a1184..dc0e65dd0062 100644
--- a/docs/api/python/ndarray.md
+++ b/docs/api/python/ndarray.md
@@ -64,9 +64,21 @@ A detailed tutorial is available at
```
In the rest of this document, we first overview the methods provided by the
-`ndarray.NDArray` class, and then list other routines provided by the
-`ndarray` package.
+`ndarray.NDArray` class and its subclasses, and then list other routines
+provided by the `ndarray` package.
+The `ndarray` package provides several classes:
+
+```eval_rst
+.. autosummary::
+ :nosignatures:
+
+ NDArray
+ CSRNDArray
+ RowSparseNDArray
+```
+
+We summarize the interface for each class in the following sections.
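+
+For a quick taste (a minimal sketch; the array values are illustrative and the
+methods used are the ones listed in the sections below):
+
+```python
+>>> import mxnet as mx
+>>> a = mx.nd.array([[0, 1], [2, 0]]).tostype('csr')  # dense -> CSRNDArray
+>>> a.indices.asnumpy()                               # column indices of the non-zeros
+>>> a.tostype('default')                              # cast back to a dense NDArray
+```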
## The `NDArray` class
@@ -80,6 +92,7 @@ In the rest of this document, we first overview the methods provided by the
NDArray.size
NDArray.context
NDArray.dtype
+ NDArray.stype
```
### Array conversion
@@ -94,6 +107,7 @@ In the rest of this document, we first overview the methods provided by the
NDArray.asnumpy
NDArray.asscalar
NDArray.astype
+ NDArray.tostype
```
### Array change shape
@@ -171,6 +185,35 @@ In the rest of this document, we first overview the methods provided by the
NDArray.wait_to_read
```
+## The `RowSparseNDArray` Class
+
+```eval_rst
+.. autosummary::
+ :nosignatures:
+
+ RowSparseNDArray.copyto
+ RowSparseNDArray.tostype
+ RowSparseNDArray.__setitem__
+ RowSparseNDArray.__getitem__
+ RowSparseNDArray.data
+ RowSparseNDArray.indices
+```
+
+## The `CSRNDArray` Class
+
+```eval_rst
+.. autosummary::
+ :nosignatures:
+
+ CSRNDArray.copyto
+ CSRNDArray.tostype
+ CSRNDArray.__setitem__
+ CSRNDArray.__getitem__
+ CSRNDArray.data
+ CSRNDArray.indices
+ CSRNDArray.indptr
+```
+
## Array creation routines
```eval_rst
@@ -499,8 +542,24 @@ The `contrib.ndarray` module contains many useful experimental APIs for new feat
```eval_rst
+
+.. autoclass:: mxnet.ndarray.NDArray
+ :members:
+ :special-members:
+
+.. autoclass:: mxnet.ndarray.CSRNDArray
+ :members:
+ :special-members:
+
+.. autoclass:: mxnet.ndarray.RowSparseNDArray
+ :members:
+ :special-members:
+
.. automodule:: mxnet.ndarray
:members:
+ :imported-members:
+ :special-members:
+ :exclude-members: CachedOp, BaseSparseNDArray, NDArray, CSRNDArray, RowSparseNDArray
.. automodule:: mxnet.random
:members:
diff --git a/example/sparse/get_data.py b/example/sparse/get_data.py
new file mode 100644
index 000000000000..578cf2ce5226
--- /dev/null
+++ b/example/sparse/get_data.py
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: skip-file
+import os
+
+def get_libsvm_data(data_dir, data_name, url, data_origin_name):
+ if not os.path.isdir(data_dir):
+ os.mkdir(data_dir)
+ os.chdir(data_dir)
+ if (not os.path.exists(data_name)):
+ import urllib
+ zippath = os.path.join(data_dir, data_origin_name)
+ urllib.urlretrieve(url, zippath)
+ os.system("bzip2 -d %r" % data_origin_name)
+ os.chdir("..")
diff --git a/example/sparse/linear_classification.py b/example/sparse/linear_classification.py
new file mode 100644
index 000000000000..567568c6eb80
--- /dev/null
+++ b/example/sparse/linear_classification.py
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from mxnet.test_utils import *
+from get_data import get_libsvm_data
+import time
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description="Run sparse linear classification " \
+ "with distributed kvstore",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--profiler', type=int, default=0,
+ help='whether to use profiler')
+parser.add_argument('--num-epoch', type=int, default=1,
+ help='number of epochs to train')
+parser.add_argument('--batch-size', type=int, default=8192,
+ help='number of examples per batch')
+parser.add_argument('--num-batch', type=int, default=99999999,
+ help='number of batches per epoch')
+parser.add_argument('--dummy-iter', type=int, default=0,
+ help='whether to use dummy iterator to exclude io cost')
+parser.add_argument('--kvstore', type=str, default='dist_sync',
+ help='what kvstore to use [local, dist_sync, etc]')
+parser.add_argument('--log-level', type=str, default='DEBUG',
+ help='logging level [debug, info, error]')
+parser.add_argument('--dataset', type=str, default='avazu',
+ help='what test dataset to use')
+
+class DummyIter(mx.io.DataIter):
+ "A dummy iterator that always return the same batch, used for speed testing"
+ def __init__(self, real_iter):
+ super(DummyIter, self).__init__()
+ self.real_iter = real_iter
+ self.provide_data = real_iter.provide_data
+ self.provide_label = real_iter.provide_label
+ self.batch_size = real_iter.batch_size
+
+ for batch in real_iter:
+ self.the_batch = batch
+ break
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ return self.the_batch
+
+# testing dataset sources
+avazu = {
+ 'data_name': 'avazu-app.t',
+ 'data_origin_name': 'avazu-app.t.bz2',
+ 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2",
+ 'feature_dim': 1000000,
+}
+
+kdda = {
+ 'data_name': 'kdda.t',
+ 'data_origin_name': 'kdda.t.bz2',
+ 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2",
+ 'feature_dim': 20216830,
+}
+
+datasets = { 'kdda' : kdda, 'avazu' : avazu }
+
+def linear_model(feature_dim):
+ x = mx.symbol.Variable("data", stype='csr')
+ norm_init = mx.initializer.Normal(sigma=0.01)
+ weight = mx.symbol.Variable("weight", shape=(feature_dim, 1), init=norm_init, stype='row_sparse')
+ bias = mx.symbol.Variable("bias", shape=(1,), init=norm_init)
+ dot = mx.symbol.dot(x, weight)
+ pred = mx.symbol.broadcast_add(dot, bias)
+ y = mx.symbol.Variable("softmax_label")
+ model = mx.symbol.SoftmaxOutput(data=pred, label=y, name="out")
+ return model
+
+if __name__ == '__main__':
+ # arg parser
+ args = parser.parse_args()
+ num_epoch = args.num_epoch
+ num_batch = args.num_batch
+ kvstore = args.kvstore
+ profiler = args.profiler > 0
+ batch_size = args.batch_size
+ dummy_iter = args.dummy_iter
+ dataset = args.dataset
+ log_level = args.log_level
+
+ # create kvstore
+ kv = mx.kvstore.create(kvstore)
+ rank = kv.rank
+ num_worker = kv.num_workers
+
+ # only print log for rank 0 worker
+ import logging
+ if rank != 0:
+ log_level = logging.ERROR
+ elif log_level.lower() == 'debug': # accept both 'debug' and 'DEBUG'
+ log_level = logging.DEBUG
+ else:
+ log_level = logging.INFO
+ head = '%(asctime)-15s %(message)s'
+ logging.basicConfig(level=log_level, format=head)
+
+ # dataset
+ assert(dataset in datasets), "unknown dataset " + dataset
+ metadata = datasets[dataset]
+ feature_dim = metadata['feature_dim']
+ logging.debug('preparing data ... ')
+ data_dir = os.path.join(os.getcwd(), 'data')
+ path = os.path.join(data_dir, metadata['data_name'])
+ if not os.path.exists(path):
+ get_libsvm_data(data_dir, metadata['data_name'], metadata['url'],
+ metadata['data_origin_name'])
+ assert os.path.exists(path)
+
+ # data iterator
+ train_data = mx.io.LibSVMIter(data_libsvm=path, data_shape=(feature_dim,),
+ batch_size=batch_size, num_parts=num_worker,
+ part_index=rank)
+ if dummy_iter:
+ train_data = DummyIter(train_data)
+
+ # model
+ model = linear_model(feature_dim)
+
+ # module
+ mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['softmax_label'])
+ mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
+ mod.init_params(initializer=mx.init.Uniform(scale=.1))
+ sgd = mx.optimizer.SGD(momentum=0.0, clip_gradient=5.0,
+ learning_rate=0.1, rescale_grad=1.0/batch_size/num_worker)
+ mod.init_optimizer(optimizer=sgd, kvstore=kv)
+ # use accuracy as the metric
+ metric = mx.metric.create('Accuracy')
+
+ # start profiler
+ if profiler:
+ name = 'profile_output_' + str(num_worker) + '.json'
+ mx.profiler.profiler_set_config(mode='all', filename=name)
+ mx.profiler.profiler_set_state('run')
+
+ logging.debug('start training ...')
+ start = time.time()
+ data_iter = iter(train_data)
+ for epoch in range(num_epoch):
+ nbatch = 0
+ data_iter.reset()
+ metric.reset()
+ for batch in data_iter:
+ nbatch += 1
+ row_ids = batch.data[0].indices
+ # pull sparse weight
+ index = mod._exec_group.param_names.index('weight')
+ kv.row_sparse_pull('weight', mod._exec_group.param_arrays[index],
+ priority=-index, row_ids=[row_ids])
+ mod.forward_backward(batch)
+ # update parameters
+ mod.update()
+ # accumulate prediction accuracy
+ mod.update_metric(metric, batch.label)
+ if nbatch == num_batch:
+ break
+ logging.info('epoch %d, %s' % (epoch, metric.get()))
+ if profiler:
+ mx.profiler.profiler_set_state('stop')
+ end = time.time()
+ time_cost = end - start
+ logging.info('num_worker = ' + str(num_worker) + ', time cost = ' + str(time_cost))
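+
+# Usage note (a sketch): the default '--kvstore dist_sync' assumes workers started
+# through a dmlc launcher (e.g. tools/launch.py); for a single-machine smoke test,
+# pass '--kvstore local'.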
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 2289354e8a5e..a43f73fe45ab 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -276,6 +276,38 @@ MXNET_DLL int MXNDArrayCreateEx(const mx_uint *shape,
int delay_alloc,
int dtype,
NDArrayHandle *out);
+
+
+/*!
+ * \brief create an empty sparse NDArray with specified shape and data type
+ * \param storage_type the storage type of the ndarray
+ * \param shape the pointer to the shape
+ * \param ndim the dimension of the shape
+ * \param dev_type device type, specify device we want to take
+ * \param dev_id the device id of the specific device
+ * \param delay_alloc whether to delay allocation until
+ * the narray is first mutated
+ * \param dtype data type of created array
+ * \param num_aux the number of aux data to support this ndarray
+ * \param aux_type data type of the aux data for the created array
+ * \param aux_ndims the dimension of the shapes of aux data
+ * \param aux_shape the shapes of aux data
+ * \param out the returning handle
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXNDArrayCreateSparseEx(int storage_type,
+ const mx_uint *shape,
+ mx_uint ndim,
+ int dev_type,
+ int dev_id,
+ int delay_alloc,
+ int dtype,
+ mx_uint num_aux,
+ int *aux_type,
+ mx_uint *aux_ndims,
+ const mx_uint *aux_shape,
+ NDArrayHandle *out);
+
/*!
* \brief create a NDArray handle that is loaded from raw bytes.
* \param buf the head of the raw bytes
@@ -350,6 +382,17 @@ MXNET_DLL int MXNDArraySyncCopyFromCPU(NDArrayHandle handle,
MXNET_DLL int MXNDArraySyncCopyToCPU(NDArrayHandle handle,
void *data,
size_t size);
+/*!
+ * \brief Copy src.data() to dst.data() if i == -1, else to dst.aux_data(i) if i >= 0
+ * This function blocks. Do not use it in performance critical code.
+ * \param handle_dst handle of a dst ndarray whose data/aux_data has been allocated
+ * \param handle_src handle of a src ndarray which has default storage type
+ * \param i dst data blob indicator
+ */
+MXNET_DLL int MXNDArraySyncCopyFromNDArray(NDArrayHandle handle_dst,
+ const NDArrayHandle handle_src,
+ const int i);
+
/*!
* \brief Wait until all the pending writes with respect NDArray are finished.
* Always call this before read data out synchronizely.
@@ -388,6 +431,7 @@ MXNET_DLL int MXNDArraySlice(NDArrayHandle handle,
mx_uint slice_begin,
mx_uint slice_end,
NDArrayHandle *out);
+
/*!
* \brief Index the NDArray along axis 0.
* \param handle the handle to the NDArray
@@ -398,6 +442,13 @@ MXNET_DLL int MXNDArraySlice(NDArrayHandle handle,
MXNET_DLL int MXNDArrayAt(NDArrayHandle handle,
mx_uint idx,
NDArrayHandle *out);
+
+/*!
+ * \brief get the storage type of the array
+ */
+MXNET_DLL int MXNDArrayGetStorageType(NDArrayHandle handle,
+ int *out_storage_type);
+
/*!
* \brief Reshape the NDArray.
* \param handle the handle to the narray
@@ -436,6 +487,34 @@ MXNET_DLL int MXNDArrayGetData(NDArrayHandle handle,
*/
MXNET_DLL int MXNDArrayGetDType(NDArrayHandle handle,
int *out_dtype);
+
+/*!
+ * \brief get the type of the ith aux data in NDArray
+ * \param handle the handle to the narray
+ * \param i the index of the aux data
+ * \param out_type pointer holder to get type of aux data
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXNDArrayGetAuxType(NDArrayHandle handle,
+ mx_uint i,
+ int *out_type);
+
+/*!
+ * \brief Get a deep copy of the ith aux data blob
+ * in the form of an NDArray of default storage type.
+ * This function blocks. Do not use it in performance critical code.
+ */
+MXNET_DLL int MXNDArrayGetAuxNDArray(NDArrayHandle handle,
+ mx_uint i,
+ NDArrayHandle *out);
+
+/*!
+ * \brief Get a deep copy of the data blob
+ * in the form of an NDArray of default storage type.
+ * This function blocks. Do not use it in performance critical code.
+ */
+MXNET_DLL int MXNDArrayGetDataNDArray(NDArrayHandle handle,
+ NDArrayHandle *out);
/*!
* \brief get the context of the NDArray
* \param handle the handle to the narray
@@ -581,6 +660,28 @@ MXNET_DLL int MXImperativeInvoke(AtomicSymbolCreator creator,
int num_params,
const char **param_keys,
const char **param_vals);
+/*!
+ * \brief invoke a nnvm op and imperative function
+ * \param creator the op
+ * \param num_inputs number of input NDArrays
+ * \param inputs input NDArrays
+ * \param num_outputs number of output NDArrays
+ * \param outputs output NDArrays
+ * \param num_params number of keyword parameters
+ * \param param_keys keys for keyword parameters
+ * \param param_vals values for keyword parameters
+ * \param out_stypes output ndarrays' stypes
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXImperativeInvokeEx(AtomicSymbolCreator creator,
+ int num_inputs,
+ NDArrayHandle *inputs,
+ int *num_outputs,
+ NDArrayHandle **outputs,
+ int num_params,
+ const char **param_keys,
+ const char **param_vals,
+ const int **out_stypes);
/*!
* \brief set whether to record operator for autograd
* \param is_recording 1 when recording, 0 when not recording.
@@ -666,6 +767,30 @@ MXNET_DLL int MXCreateCachedOp(SymbolHandle handle,
* \brief free cached operator
*/
MXNET_DLL int MXFreeCachedOp(CachedOpHandle handle);
+/*!
+ * \brief invoke cached operator
+ */
+MXNET_DLL int MXInvokeCachedOp(CachedOpHandle handle,
+ int num_inputs,
+ NDArrayHandle *inputs,
+ int *num_outputs,
+ NDArrayHandle **outputs);
+/*!
+ * \brief invoke a cached op
+ * \param handle the handle to the cached op
+ * \param num_inputs number of input NDArrays
+ * \param inputs input NDArrays
+ * \param num_outputs number of output NDArrays
+ * \param outputs output NDArrays
+ * \param out_stypes output ndarrays' stypes
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXInvokeCachedOpEx(CachedOpHandle handle,
+ int num_inputs,
+ NDArrayHandle *inputs,
+ int *num_outputs,
+ NDArrayHandle **outputs,
+ const int** out_stypes);
/*!
* \brief invoke cached operator
*/
@@ -1017,20 +1142,20 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym,
* \return 0 when success, -1 when failure happens
*/
MXNET_DLL int MXSymbolInferShapePartial(SymbolHandle sym,
- mx_uint num_args,
- const char** keys,
- const mx_uint *arg_ind_ptr,
- const mx_uint *arg_shape_data,
- mx_uint *in_shape_size,
- const mx_uint **in_shape_ndim,
- const mx_uint ***in_shape_data,
- mx_uint *out_shape_size,
- const mx_uint **out_shape_ndim,
- const mx_uint ***out_shape_data,
- mx_uint *aux_shape_size,
- const mx_uint **aux_shape_ndim,
- const mx_uint ***aux_shape_data,
- int *complete);
+ mx_uint num_args,
+ const char** keys,
+ const mx_uint *arg_ind_ptr,
+ const mx_uint *arg_shape_data,
+ mx_uint *in_shape_size,
+ const mx_uint **in_shape_ndim,
+ const mx_uint ***in_shape_data,
+ mx_uint *out_shape_size,
+ const mx_uint **out_shape_ndim,
+ const mx_uint ***out_shape_data,
+ mx_uint *aux_shape_size,
+ const mx_uint **aux_shape_ndim,
+ const mx_uint ***aux_shape_data,
+ int *complete);
/*!
* \brief infer type of unknown input types given the known one.
@@ -1061,6 +1186,10 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym,
mx_uint *aux_type_size,
const int **aux_type_data,
int *complete);
+
+
+
+
//--------------------------------------------
// Part 4: Executor interface
//--------------------------------------------
@@ -1222,36 +1351,39 @@ MXNET_DLL int MXExecutorBindEX(SymbolHandle symbol_handle,
ExecutorHandle *out);
MXNET_DLL int MXExecutorSimpleBind(SymbolHandle symbol_handle,
- int dev_type,
- int dev_id,
- const mx_uint num_g2c_keys,
- const char** g2c_keys,
- const int* g2c_dev_types,
- const int* g2c_dev_ids,
- const mx_uint provided_grad_req_list_len,
- const char** provided_grad_req_names,
- const char** provided_grad_req_types,
- const mx_uint num_provided_arg_shapes,
- const char** provided_arg_shape_names,
- const mx_uint* provided_arg_shape_data,
- const mx_uint* provided_arg_shape_idx,
- const mx_uint num_provided_arg_dtypes,
- const char** provided_arg_dtype_names,
- const int* provided_arg_dtypes,
- const mx_uint num_shared_arg_names,
- const char** shared_arg_name_list,
- int* shared_buffer_len,
- const char** shared_buffer_name_list,
- NDArrayHandle* shared_buffer_handle_list,
- const char*** updated_shared_buffer_name_list,
- NDArrayHandle** updated_shared_buffer_handle_list,
- mx_uint* num_in_args,
- NDArrayHandle** in_args,
- NDArrayHandle** arg_grads,
- mx_uint* num_aux_states,
- NDArrayHandle** aux_states,
- ExecutorHandle shared_exec_handle,
- ExecutorHandle* out);
+ int dev_type,
+ int dev_id,
+ const mx_uint num_g2c_keys,
+ const char** g2c_keys,
+ const int* g2c_dev_types,
+ const int* g2c_dev_ids,
+ const mx_uint provided_grad_req_list_len,
+ const char** provided_grad_req_names,
+ const char** provided_grad_req_types,
+ const mx_uint num_provided_arg_shapes,
+ const char** provided_arg_shape_names,
+ const mx_uint* provided_arg_shape_data,
+ const mx_uint* provided_arg_shape_idx,
+ const mx_uint num_provided_arg_dtypes,
+ const char** provided_arg_dtype_names,
+ const int* provided_arg_dtypes,
+ const mx_uint num_provided_arg_stypes,
+ const char** provided_arg_stype_names,
+ const int* provided_arg_stypes,
+ const mx_uint num_shared_arg_names,
+ const char** shared_arg_name_list,
+ int* shared_buffer_len,
+ const char** shared_buffer_name_list,
+ NDArrayHandle* shared_buffer_handle_list,
+ const char*** updated_shared_buffer_name_list,
+ NDArrayHandle** updated_shared_buffer_handle_list,
+ mx_uint* num_in_args,
+ NDArrayHandle** in_args,
+ NDArrayHandle** arg_grads,
+ mx_uint* num_aux_states,
+ NDArrayHandle** aux_states,
+ ExecutorHandle shared_exec_handle,
+ ExecutorHandle* out);
/*!
* \brief set a call back to notify the completion of operation
*/
@@ -1468,6 +1600,26 @@ MXNET_DLL int MXKVStorePullEx(KVStoreHandle handle,
const char** keys,
NDArrayHandle* vals,
int priority);
+
+/*!
+ * \brief pull a list of (key, value) pairs from the kvstore, where each key is a string.
+ * The NDArray pulled back will be in row_sparse storage, with only the rows
+ * specified by row_ids present (other rows are zeros).
+ * \param handle handle to the kvstore
+ * \param num the number of key-value pairs
+ * \param keys the list of keys
+ * \param vals the list of values
+ * \param row_ids the list of row_id NDArrays
+ * \param priority the priority of the action
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXKVStorePullRowSparse(KVStoreHandle handle,
+ mx_uint num,
+ const char** keys,
+ NDArrayHandle* vals,
+ const NDArrayHandle* row_ids,
+ int priority);
+
/*!
* \brief user-defined updater for the kvstore
* It's this updater's responsibility to delete \a recv and \a local
diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h
index a74d3b07b5be..85d34778dd8c 100644
--- a/include/mxnet/executor.h
+++ b/include/mxnet/executor.h
@@ -133,6 +133,7 @@ class Executor {
const std::vector<Context>& aux_state_ctxes,
const std::unordered_map<std::string, TShape>& arg_shape_map,
const std::unordered_map<std::string, int>& arg_dtype_map,
+ const std::unordered_map<std::string, int>& arg_stype_map,
const std::vector<OpReqType>& grad_req_types,
const std::unordered_set<std::string>& param_names,
std::vector<NDArray>* in_args,
diff --git a/include/mxnet/graph_attr_types.h b/include/mxnet/graph_attr_types.h
new file mode 100644
index 000000000000..3aba0119d8ca
--- /dev/null
+++ b/include/mxnet/graph_attr_types.h
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file graph_attr_types.h
+ * \brief Data structures that can appear in graph attributes.
+ */
+#ifndef MXNET_GRAPH_ATTR_TYPES_H_
+#define MXNET_GRAPH_ATTR_TYPES_H_
+
+#include <vector>
+
+namespace mxnet {
+
+/*!
+ * \brief The result holder of storage type of each NodeEntry in the graph.
+ * \note Stored under graph.attrs["storage_type"], provided by Pass "InferStorageType"
+ *
+ * \code
+ * Graph g = ApplyPass(src_graph, "InferStorageType");
+ * const StorageTypeVector& stypes = g.GetAttr<StorageTypeVector>("storage_type");
+ * // get the storage type by entry id
+ * int entry_type = stypes[g.indexed_graph().entry_id(my_entry)];
+ * \endcode
+ *
+ * \sa FInferStorageType
+ */
+using StorageTypeVector = std::vector<int>;
+
+} // namespace mxnet
+
+#endif // MXNET_GRAPH_ATTR_TYPES_H_
diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h
index d2924ecea1b5..9ea63b4cec79 100644
--- a/include/mxnet/kvstore.h
+++ b/include/mxnet/kvstore.h
@@ -25,6 +25,7 @@
#define MXNET_KVSTORE_H_
#include <dmlc/io.h>
#include <vector>
+#include <utility>
#include <unordered_map>
#include <string>
#include <functional>
@@ -173,6 +174,29 @@ class KVStore {
const std::vector<NDArray*>& values,
int priority = 0) = 0;
+ /*!
+ * \brief pull a list of key-value pairs from the store.
+ * The NDArray pulled back will be in row_sparse storage with only the
+ * specified row_ids present (other rows are zeros).
+ * \param keys the list of keys
+ * \param val_rowids the list of (value buffer, row_id) pairs
+ * \param priority the priority of the action.
+ */
+ virtual void PullRowSparse(const std::vector<int>& keys,
+ const std::vector<std::pair<NDArray*, NDArray>>& val_rowids,
+ const int priority = 0) = 0;
+
+ /*!
+ * \brief pull a list of key-value pairs from the store, where each key is a string.
+ * The NDArray pulled back will be in row_sparse storage with only the
+ * specified row_ids present (other rows are zeros).
+ * \param str_keys the list of keys in string format
+ * \param val_rowids the list of (value buffer, row_id) pairs
+ * \param priority the priority of the action.
+ */
+ virtual void PullRowSparse(const std::vector<std::string>& str_keys,
+ const std::vector<std::pair<NDArray*, NDArray>>& val_rowids,
+ const int priority = 0) = 0;
/**
* \brief the prototype of user-defined updater
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index d7dff4098b27..56e36dffbf27 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -47,7 +47,6 @@
namespace mxnet {
-// forward declaration
namespace autograd {
class AGNode;
@@ -71,6 +70,23 @@ class AGNodeEntry {
class AutogradRuntime;
} // namespace autograd
+// enum for storage types
+namespace csr {
+enum CSRAuxType {kIndPtr, kIdx};
+}
+
+namespace rowsparse {
+enum RowSparseAuxType {kIdx};
+}
+
+enum NDArrayStorageType {
+ kUndefinedStorage = -1, // undefined storage
+ kDefaultStorage, // dense
+ kRowSparseStorage, // row sparse
+ kCSRStorage, // csr
+};
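
To make the aux layouts concrete, here is a hedged sketch of what the aux arrays hold per storage type, exercised through the Python classes added in this diff (the `indices`/`indptr` properties are assumed from `sparse_ndarray.py`):

    import numpy as np
    import mxnet as mx

    dense = mx.nd.array(np.array([[0, 1], [2, 0], [0, 0]], dtype='float32'))

    rsp = dense.tostype('row_sparse')
    print(rsp.indices.asnumpy())   # [0 1]: rows actually stored (rowsparse aux 0 = kIdx)

    csr = dense.tostype('csr')
    print(csr.indptr.asnumpy())    # [0 1 2 2] (csr aux 0 = kIndPtr)
    print(csr.indices.asnumpy())   # [1 0]: column ids (csr aux 1 = kIdx)
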
+
+
/*!
* \brief ndarray interface
*/
@@ -91,10 +107,55 @@ class NDArray {
*/
NDArray(const TShape &shape, Context ctx,
bool delay_alloc = false, int dtype = mshadow::default_type_flag)
- : ptr_(std::make_shared<Chunk>(shape.Size(), ctx, delay_alloc, dtype)),
+ : ptr_(std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype)),
shape_(shape), dtype_(dtype), entry_({nullptr, 0, 0}) {
#if MKL_EXPERIMENTAL == 1
Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
+ }
+ /*! \brief constructor for NDArray with storage type
+ */
+ NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx,
+ bool delay_alloc = true, int dtype = mshadow::default_type_flag,
+ std::vector<int> aux_types = {}, std::vector<TShape> aux_shapes = {},
+ TShape storage_shape = TShape(mshadow::Shape1(0)))
+ : shape_(shape), dtype_(dtype), entry_({nullptr, 0, 0}) {
+ // Assign default aux types if not given
+ if (aux_types.size() == 0) {
+ if (stype == kRowSparseStorage) {
+ aux_types = {mshadow::kInt64};
+ } else if (stype == kCSRStorage) {
+ aux_types = {mshadow::kInt64, mshadow::kInt64};
+ } else {
+ LOG(FATAL) << "Unknown storage type " << stype;
+ }
+ }
+ // Assign default shapes if not given
+ // unknown shapes are initialized as {0} such that Size() would return 0
+ if (aux_shapes.size() == 0) {
+ if (stype == kRowSparseStorage) {
+ aux_shapes = {TShape(mshadow::Shape1(0))};
+ } else if (stype == kCSRStorage) {
+ // aux shapes for indptr and indices
+ aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))};
+ } else {
+ LOG(FATAL) << "Unknown storage type " << stype;
+ }
+ }
+ if (storage_shape.Size() == 0) {
+ if (stype == kRowSparseStorage) {
+ storage_shape = shape;
+ storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
+ } else if (stype == kCSRStorage) {
+ storage_shape = aux_shapes[csr::kIdx];
+ } else {
+ LOG(FATAL) << "Unknown storage type " << stype;
+ }
+ }
+ ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
+ dtype, aux_types, aux_shapes);
+#if MKL_EXPERIMENTAL == 1
+ Mkl_mem_ = std::make_shared<MKLMemHolder>();
#endif
}
/*!
@@ -111,17 +172,82 @@ class NDArray {
Mkl_mem_ = std::make_shared<MKLMemHolder>();
#endif
}
+
/*!
- * \return the shape of current NDArray
+ * \brief constructing a static NDArray of non-default storage that shares data with a TBlob
+ * Use with caution: allocate ONLY ONE NDArray for each TBlob, and
+ * make sure the memory region stays available throughout the life of the NDArray
+ * \param stype the storage type of NDArray
+ * \param shape the shape of NDArray
+ * \param data the memory content of static data
+ * \param aux_data the memory content of static aux data
+ * \param dev_id the device id this tensor sits at
+ */
+ NDArray(const NDArrayStorageType stype, const TShape &shape,
+ const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
+ : ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)), shape_(shape),
+ dtype_(data.type_flag_), entry_({nullptr, 0, 0}) {
+#if MKL_EXPERIMENTAL == 1
+ Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
+ }
+
+
+ /*!
+ * \return the shape of current NDArray.
*/
inline const TShape& shape() const {
return shape_;
}
+ /*!
+ * \return the shape of underlying chunk which stores the NDArray data/value.
+ * It is only intended for non-default storage. For row-sparse storage, it is the shape of
+ * the tensor which stores the non-zero values.
+ */
+ inline const TShape &storage_shape() const {
+ CHECK(ptr_ != nullptr);
+ CHECK_NE(storage_type(), kDefaultStorage);
+ return ptr_->storage_shape;
+ }
+
+ /*!
+ * \brief get the shape of aux_data(index)
+ * \param index the index of the aux data
+ * \return the shape of aux data at given index
+ */
+ inline const TShape& aux_shape(size_t index) const {
+ CHECK(storage_type() != kDefaultStorage);
+ return ptr_->aux_shapes[index];
+ }
+
+ /*! \return the shapes of all aux data */
+ const std::vector<TShape>& aux_shapes() const {
+ CHECK(storage_type() != kDefaultStorage);
+ return ptr_->aux_shapes;
+ }
+
+ /*! \return the dtypes of all aux data */
+ const std::vector<int>& aux_types() const {
+ CHECK(storage_type() != kDefaultStorage);
+ return ptr_->aux_types;
+ }
+
+ /*!
+ * \brief For a sparse operation on a csr matrix, for example,
+ * the size of the column index array starts out as an estimate
+ * used to allocate enough capacity for the final result. Once the
+ * operation is done, the exact size is known and needs to be reset
+ * using this function.
+ */
+ inline void set_aux_shape(size_t index, const TShape& shape) const {
+ ptr_->set_aux_shape(index, shape);
+ }
+
/*!
* \return the data TBlob
*/
inline const TBlob& data() const {
- CheckAndAlloc();
+ if (storage_type() == kDefaultStorage) CheckAndAlloc();
SetTBlob();
return tblob_;
}
@@ -129,6 +255,26 @@ class NDArray {
* \return the gradient ndarray.
*/
NDArray grad() const;
+
+ /*!
+ * \return the aux TBlob
+ */
+ inline TBlob aux_data(size_t i) const {
+ auto stype = storage_type();
+ TBlob res;
+ auto shape = aux_shape(i);
+ auto type = aux_type(i);
+ MSHADOW_TYPE_SWITCH(type, DType, {
+ auto dptr = static_cast<DType*>(ptr_->aux_handles[i].dptr);
+ CHECK(stype == kRowSparseStorage || stype == kCSRStorage)
+ << "Unexpected storage type: " << stype;
+ res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type);
+ });
+#if MKL_EXPERIMENTAL == 1
+ res.Mkl_mem_ = Mkl_mem_;
+#endif
+ return res;
+ }
/*!
* \return the context of NDArray, this function is only valid when the NDArray is not empty
*/
@@ -141,6 +287,15 @@ class NDArray {
inline int dtype() const {
return dtype_;
}
+ inline int aux_type(size_t i) const {
+ CHECK(!is_none());
+ return ptr_->aux_types[i];
+ }
+
+ inline NDArrayStorageType storage_type() const {
+ if (is_none()) return kUndefinedStorage;
+ return ptr_->storage_type;
+ }
/*! \return whether this ndarray is not initialized */
inline bool is_none() const {
return ptr_.get() == nullptr;
@@ -149,6 +304,22 @@ class NDArray {
bool fresh_out_grad() const;
/*! \return updated grad state in entry_ */
void set_fresh_out_grad(bool state) const;
+ // returns true if a sparse ndarray's aux_data and storage are initialized
+ inline bool storage_initialized() const {
+ if (is_none()) return false;
+ auto stype = storage_type();
+ CHECK_NE(stype, kDefaultStorage);
+ if (stype == kRowSparseStorage) {
+ CHECK_EQ(aux_shape(rowsparse::kIdx)[0], storage_shape()[0]);
+ return aux_shape(0).Size() != 0;
+ } else if (stype == kCSRStorage) {
+ CHECK_EQ(aux_shape(csr::kIdx)[0], storage_shape()[0]);
+ return aux_shape(0).Size() != 0;
+ } else {
+ LOG(FATAL) << "Unknown storage type";
+ }
+ return true;
+ }
/*!
* \brief Block until all the pending write operations with respect
* to current NDArray are finished, and read can be performed.
@@ -179,6 +350,12 @@ class NDArray {
* \param strm the output stream
*/
void Save(dmlc::Stream *strm) const;
+ /*!
+ * \brief load an ndarray saved in the legacy format used before sparse ndarray support
+ * \param strm the input stream
+ * \param magic the magic number used for version control
+ */
+ bool LegacyLoad(dmlc::Stream *strm, const uint32_t magic);
/*!
* \brief load the content from binary stream
* \param strm the input stream
@@ -269,6 +446,12 @@ class NDArray {
* \param size the size of the source array, in sizeof(DType) not raw bytes.
*/
void SyncCopyFromCPU(const void *data, size_t size) const;
+
+ /*!
+ * \brief Copy from src.data()/aux_data(i) to this->data()/aux_data(j)
+ */
+ void SyncCopyFromNDArray(const NDArray &src, int i = -1, int j = -1);
+
/*!
* \brief Do a synchronized copy to a contiguous CPU memory region.
*
@@ -282,17 +465,31 @@ class NDArray {
void SyncCopyToCPU(void *data, size_t size) const;
/*!
* \brief Slice a NDArray
- * \param begin begin index in first dim
- * \param end end index in first dim
+ * \param begin begin index in first dim (inclusive)
+ * \param end end index in first dim (exclusive)
* \return sliced NDArray
*/
NDArray Slice(index_t begin, index_t end) const;
+
/*!
* \brief Index a NDArray
* \param idx the index
* \return idx-th sub array NDArray
*/
NDArray At(index_t idx) const;
+
+ /*!
+ * \brief Generate a deep copy of aux_data(i) returned as
+ * a default storage type NDArray
+ */
+ NDArray aux_ndarray(size_t i) const;
+
+ /*!
+ * \brief Generate a deep copy of data() returned as a
+ * default storage type NDArray
+ */
+ NDArray data_ndarray() const;
+
/*!
* \brief Create a NDArray that shares memory with current one
* The new array must have smaller memory size than the current array.
@@ -301,6 +498,7 @@ class NDArray {
* \return NDArray in new shape and type.
*/
inline NDArray AsArray(const TShape &shape, int dtype) const {
+ CHECK_EQ(storage_type(), kDefaultStorage) << "Not implemented yet";
CHECK_GE(shape_.Size() * mshadow::mshadow_sizeof(dtype_),
shape.Size() * mshadow::mshadow_sizeof(dtype))
<< "NDArray.AsArray: target memory size is bigger";
@@ -342,8 +540,42 @@ class NDArray {
* This is an internal function used by system that normal user should not use
*/
inline void CheckAndAlloc() const {
+ CHECK_EQ(storage_type(), kDefaultStorage);
ptr_->CheckAndAlloc();
}
+
+ /*!
+ * \brief Allocate the space if the allocation has been delayed
+ * or the requested size is bigger than the available one.
+ * This function can only be called by ndarray of default
+ * storage type and effectively changes the ndarray's shape_.
+ * Note: the function is named this way to avoid an overload conflict
+ * with CheckAndAlloc(const std::vector<TShape> &aux_shapes), since
+ * TShape tmp = some_shape is equivalent to TShape tmp = {some_shape}.
+ */
+ void ReshapeAndAlloc(const TShape& shape) {
+ CHECK_EQ(storage_type(), kDefaultStorage);
+ CHECK(!is_none());
+ shape_ = shape;
+ ptr_->CheckAndAlloc(shape.Size() * mshadow::mshadow_sizeof(dtype_));
+ }
+
+ /*!
+ * \brief Alloc memory for non-default storage
+ * aux_shape is only known at run time
+ */
+ inline void CheckAndAlloc(const std::vector<TShape> &aux_shapes) const {
+ CHECK_NE(storage_type(), kDefaultStorage);
+ ptr_->CheckAndAlloc(shape_, aux_shapes, dtype_);
+ }
+ inline void CheckAndAllocData(const TShape &storage_shape) const {
+ CHECK_NE(storage_type(), kDefaultStorage);
+ ptr_->CheckAndAllocData(storage_shape, dtype_);
+ }
+ inline void CheckAndAllocAuxData(size_t i, const TShape &aux_shape) const {
+ CHECK_NE(storage_type(), kDefaultStorage);
+ ptr_->CheckAndAllocAuxData(i, aux_shape);
+ }
/*!
* \brief Save list of ndarray into the Stream.
* \param fo The stream of output.
@@ -366,44 +598,138 @@ class NDArray {
private:
friend class autograd::AutogradRuntime;
/*! \brief the real data chunk that backs NDArray */
+ // shandle is used to store the actual values in the NDArray
+ // aux_handles store the aux data (such as indices) needed by non-default storage.
struct Chunk {
- /*! \brief storage handlefrom storage engine */
+ /*! \brief storage handle from storage engine.
+ for non-default storage, shandle stores the data(value) array.
+ */
Storage::Handle shandle;
+ /*! \brief storage handles for aux data (e.g. indices)
+ for row_sparse, aux_handles[0] = indices
+ for csr, aux_handles[0] = indptr, aux_handles[1] = indices
+ */
+ std::vector<Storage::Handle> aux_handles;
/*! \brief variable from engine */
Engine::VarHandle var;
/*!
* \brief if this is true, this means the data do not come
* from Storage, and do not need to be freed
*/
bool static_data;
- /*! \brief whether allocation is delayed */
+ /*! \brief whether data allocation is delayed. This doesn't indicate whether aux data
+ allocation is delayed. */
bool delay_alloc;
+ // the type of the storage. The storage_type is never kUndefinedStorage once the chunk
+ // is constructed.
+ NDArrayStorageType storage_type = kDefaultStorage;
+ /*! \brief type of aux */
+ std::vector<int> aux_types;
+ // context of data
+ Context ctx;
+ // The shape of the chunk data.
+ // This might not be the same shape as the NDArray, since the storage may be sparse.
+ // The default value for storage_shape is {0} when an empty non-default NDArray is created.
+ TShape storage_shape;
+ // The shape of aux data. The default value for the shape depends on the type of storage.
+ // If aux_shapes[i].Size() is zero, aux data i is empty.
+ std::vector<TShape> aux_shapes;
+
/*! \brief default constructor */
- Chunk() : static_data(true), delay_alloc(false) {
- var = Engine::Get()->NewVariable();
+ Chunk() : static_data(true), delay_alloc(false) {}
+
+ /*! \brief construct a new chunk */
+ Chunk(TShape shape, Context ctx_, bool delay_alloc_, int dtype)
+ : static_data(false), delay_alloc(true), ctx(ctx_) {
+ auto size = shape.Size();
+ storage_shape = shape;
+ var = Engine::Get()->NewVariable();
+ shandle.size = size * mshadow::mshadow_sizeof(dtype);
+ shandle.ctx = ctx_;
+ if (!delay_alloc_) this->CheckAndAlloc();
}
- /*! \brief construct from static data */
+
Chunk(const TBlob &data, int dev_id)
- : static_data(true),
- delay_alloc(false) {
+ : static_data(true), delay_alloc(false) {
+ CHECK(storage_type == kDefaultStorage);
var = Engine::Get()->NewVariable();
if (data.dev_mask() == cpu::kDevMask) {
- shandle.ctx = Context::CPU();
+ ctx = Context::CPU();
} else {
CHECK_EQ(data.dev_mask(), gpu::kDevMask);
- shandle.ctx = Context::GPU(dev_id);
+ ctx = Context::GPU(dev_id);
}
+ // init shandle
+ shandle.ctx = ctx;
shandle.dptr = data.dptr_;
shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_);
+ storage_shape = data.shape_;
}
- /*! \brief construct a new chunk */
- Chunk(uint64_t size, Context ctx, bool delay_alloc_, int dtype)
- : static_data(false), delay_alloc(true) {
+ // Constructor for a non-default storage chunk
+ Chunk(NDArrayStorageType storage_type_, const TShape &storage_shape_, Context ctx_,
+ bool delay_alloc_, int dtype, const std::vector<int> &aux_types_,
+ const std::vector<TShape> &aux_shapes_)
+ : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_),
+ aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_),
+ aux_shapes(aux_shapes_) {
+ shandle.ctx = ctx;
var = Engine::Get()->NewVariable();
- shandle.size = size * mshadow::mshadow_sizeof(dtype);
+ // aux_handles always reflect the correct number of aux data
+ for (size_t i = 0; i < aux_shapes.size(); i++) {
+ CheckAndAllocAuxData(i, aux_shapes[i]);
+ // this is needed because when aux_shapes[i].Size() == 0,
+ // aux_handles[i] is not updated and would otherwise keep only its default value.
+ aux_handles[i].ctx = ctx;
+ }
+ if (!delay_alloc) {
+ CheckAndAllocData(storage_shape, dtype);
+ }
+ }
+
+ Chunk(const NDArrayStorageType storage_type_, const TBlob &data,
+ const std::vector<TBlob> &aux_data, int dev_id)
+ : static_data(true), delay_alloc(false), storage_type(storage_type_) {
+ using namespace mshadow;
+ CHECK_NE(storage_type, kDefaultStorage);
+ // init var
+ var = Engine::Get()->NewVariable();
+ // init ctx
+ if (data.dev_mask() == cpu::kDevMask) {
+ ctx = Context::CPU();
+ } else {
+ CHECK_EQ(data.dev_mask(), gpu::kDevMask);
+ ctx = Context::GPU(dev_id);
+ }
+ // init shandle
shandle.ctx = ctx;
- if (!delay_alloc_) this->CheckAndAlloc();
+ shandle.dptr = data.dptr_;
+ shandle.size = data.shape_.Size() * mshadow_sizeof(data.type_flag_);
+ storage_shape = data.shape_;
+ // init aux handles
+ for (const auto &aux : aux_data) {
+ Storage::Handle aux_handle;
+ aux_handle.ctx = ctx;
+ aux_handle.dptr = aux.dptr_;
+ aux_handle.size = aux.shape_.Size() * mshadow_sizeof(aux.type_flag_);
+ aux_handles.push_back(aux_handle);
+ aux_types.emplace_back(aux.type_flag_);
+ aux_shapes.emplace_back(aux.shape_);
+ }
+ }
+
+ /*! \brief set the shape for ith aux data, and update storage shape if necessary */
+ inline void set_aux_shape(const size_t i, const TShape& shape) {
+ aux_shapes[i] = shape;
+ if (storage_shape.ndim() > 0) {
+ if (storage_type == kRowSparseStorage && i == rowsparse::kIdx) {
+ storage_shape[0] = shape[0];
+ } else if (storage_type == kCSRStorage && i == csr::kIdx) {
+ storage_shape[0] = shape[0];
+ }
+ }
}
+
/*! \brief check if delay alloc is on, do alloc if not yet done */
inline void CheckAndAlloc(void) {
if (delay_alloc) {
@@ -411,22 +737,112 @@ class NDArray {
delay_alloc = false;
}
}
- /*! \brief destructor */
- ~Chunk() {
- if (static_data || delay_alloc) {
- Engine::Get()->DeleteVariable([](RunContext s) {}, shandle.ctx, var);
+
+ /*! \brief Check and alloc memory for a dense ndarray */
+ // size is the number of bytes
+ void CheckAndAlloc(uint64_t dbytes) {
+ CHECK_EQ(kDefaultStorage, storage_type);
+ if (delay_alloc) {
+ shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
+ delay_alloc = false;
+ } else if (shandle.size < dbytes) {
+ // free storage if necessary and alloc again
+ if (shandle.size > 0) Storage::Get()->Free(shandle);
+ // init storage
+ shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
+ }
+ }
+
+ inline void CheckAndAlloc(const TShape &shape, const std::vector<TShape> &aux_shapes,
+ int dtype) {
+ // calculate size, perform allocation
+ if (kRowSparseStorage == storage_type) {
+ // For row sparse, aux_shape indicates the number of rows to allocate
+ auto aux_shape = aux_shapes[rowsparse::kIdx];
+ CheckAndAllocAuxData(rowsparse::kIdx, aux_shape);
+ TShape storage_shape(shape);
+ storage_shape[0] = aux_shape[0];
+ CheckAndAllocData(storage_shape, dtype);
+ } else if (kCSRStorage == storage_type) {
+ CheckAndAllocAuxData(csr::kIndPtr, aux_shapes[csr::kIndPtr]);
+ CheckAndAllocAuxData(csr::kIdx, aux_shapes[csr::kIdx]);
+ CheckAndAllocData(aux_shapes[csr::kIdx], dtype);
} else {
- Storage::Handle h = this->shandle;
- Engine::Get()->DeleteVariable([h](RunContext s) {
- Storage::Get()->Free(h);
- }, shandle.ctx, var);
+ LOG(FATAL) << "Storage type " << storage_type << " not implemented for CheckAndAlloc";
+ }
+ }
+ // create storage handle for data based on shape and dtype, assuming ctx is set
+ // storage shape is also updated
+ // if data is already allocated, try to reuse the storage. Otherwise, free the current one
+ // and allocate new storage
+ inline void CheckAndAllocData(const TShape &shape, int dtype) {
+ CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data";
+ auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
+ if (shandle.size < dbytes) {
+ // free storage if necessary and alloc again
+ if (shandle.size > 0) Storage::Get()->Free(shandle);
+ // init storage
+ shandle = Storage::Get()->Alloc(dbytes, ctx);
}
+ // init shape
+ storage_shape = shape;
+ // delay_alloc is only set when data storage handle is present
+ delay_alloc = false;
+ }
+ // create storage handle for aux data based on shape
+ // this function assumes ctx, aux shapes and aux types are set
+ // aux shape is also updated
+ // if aux data is already allocated, try to reuse the storage. Otherwise, free the current one
+ // and allocate new storage
+ inline void CheckAndAllocAuxData(size_t i, const TShape &shape) {
+ CHECK_EQ(shape.ndim(), 1) << "shape must be 1D in CheckAndAllocAuxData";
+ CHECK_NE(storage_type, kUndefinedStorage)
+ << "storage type cannot be kUndefinedStorage in CheckAndAllocAuxData";
+ CHECK_NE(storage_type, kDefaultStorage)
+ << "storage type cannot be kDefaultStorage in CheckAndAllocAuxData";
+ if (aux_handles.size() <= i) {
+ aux_handles.resize(i + 1);
+ }
+ size_t aux_bytes = shape.Size() * mshadow::mshadow_sizeof(aux_types[i]);
+ if (aux_handles[i].size < aux_bytes) {
+ // free storage if necessary and alloc again
+ if (aux_handles[i].size > 0) Storage::Get()->Free(aux_handles[i]);
+ // init aux storage
+ aux_handles[i] = Storage::Get()->Alloc(aux_bytes, ctx);
+ }
+ // init shape
+ set_aux_shape(i, shape);
+ }
+ /*! \brief destructor */
+ ~Chunk() {
+ bool skip_free = static_data || delay_alloc;
+ Storage::Handle h = this->shandle;
+ std::vector<Storage::Handle> aux_h = this->aux_handles;
+ Engine::Get()->DeleteVariable([h, aux_h, skip_free](RunContext s) {
+ if (skip_free == false) {
+ Storage::Get()->Free(h);
+ for (size_t i = 0; i < aux_h.size(); i++) {
+ if (aux_h[i].size > 0) Storage::Get()->Free(aux_h[i]);
+ }
+ }
+ }, shandle.ctx, var);
}
- };
+ }; // struct Chunk
void SetTBlob() const {
- tblob_.dptr_ = static_cast<char*>(ptr_->shandle.dptr) + byte_offset_;
- tblob_.shape_ = shape_;
+ CHECK(ptr_ != nullptr);
+ TShape shape = shape_;
+ char *dptr = static_cast<char*>(ptr_->shandle.dptr);
+ auto stype = storage_type();
+ if (stype == kDefaultStorage) {
+ dptr += byte_offset_;
+ } else if (stype == kCSRStorage || stype == kRowSparseStorage) {
+ shape = storage_shape();
+ } else {
+ LOG(FATAL) << "unknown storage type " << stype;
+ }
+ tblob_.dptr_ = dptr;
+ tblob_.shape_ = shape;
tblob_.type_flag_ = dtype_;
tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id);
#if MKL_EXPERIMENTAL == 1
@@ -438,7 +854,7 @@ class NDArray {
std::shared_ptr<MKLMemHolder> Mkl_mem_;
#endif
/*! \brief internal data of NDArray */
- std::shared_ptr<Chunk> ptr_;
+ std::shared_ptr<Chunk> ptr_{nullptr};
/*! \brief shape of current NDArray */
TShape shape_;
/*! \brief byte offset in chunk */
@@ -455,7 +871,12 @@ class NDArray {
* this situation.
*/
mutable TBlob tblob_;
-};
+}; // class NDArray
+
+/*!
+ * \return the number of aux data used for given storage type
+ */
+size_t num_aux_data(NDArrayStorageType stype);
/*!
* \brief issue an copy operation from one NDArray to another
@@ -470,7 +891,6 @@ class NDArray {
*/
void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0);
-
/*!
* \brief Perform elementwise sum over each data from source, store result into out.
* \param source the ndarray we want to sum
diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h
index 1bcae0d29348..f559a921c522 100644
--- a/include/mxnet/op_attr_types.h
+++ b/include/mxnet/op_attr_types.h
@@ -25,7 +25,6 @@
#ifndef MXNET_OP_ATTR_TYPES_H_
#define MXNET_OP_ATTR_TYPES_H_
-
#include <mshadow/tensor.h>
#include <vector>
@@ -226,6 +225,23 @@ using FCompute = std::function<void (const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs)>;
+/*!
+ * \brief Register an NDArray compute function for simple stateless forward-only operators
+ *
+ * \note Register under "FComputeEx<cpu>" and "FComputeEx<gpu>"
+ * Dispatched only when operators process non-default storage inputs or outputs
+ */
+using FComputeEx = std::function<void (const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs)>;
+
+using FInferStorageType = std::function<bool (const NodeAttrs& attrs,
+ const Context& ctx,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs)>;
+
} // namespace mxnet
#endif // MXNET_OP_ATTR_TYPES_H_
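
When an operator registers an FComputeEx kernel, it is dispatched automatically whenever inputs or outputs use non-default storage; otherwise the dense FCompute path runs. A hedged Python-side sketch (which operators actually get sparse kernels depends on the rest of this PR; `dot` with a csr lhs is assumed here):

    import mxnet as mx

    lhs = mx.nd.array([[1, 0], [0, 2]]).tostype('csr')
    rhs = mx.nd.ones((2, 2))
    out = mx.nd.dot(lhs, rhs)   # dispatches to the FComputeEx kernel if one is registered
    print(out.asnumpy())
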
diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h
index bfb42de8771a..7e3af8eeca81 100644
--- a/include/mxnet/storage.h
+++ b/include/mxnet/storage.h
@@ -41,11 +41,11 @@ class Storage {
/*!
* \brief Pointer to the data.
*/
- void* dptr;
+ void* dptr{nullptr};
/*!
* \brief Size of the storage.
*/
- size_t size;
+ size_t size{0};
/*!
* \brief Context information about device and ID.
*/
diff --git a/perl-package/AI-MXNetCAPI/mxnet.i b/perl-package/AI-MXNetCAPI/mxnet.i
index fd1a471bcf16..b4c1336de624 100644
--- a/perl-package/AI-MXNetCAPI/mxnet.i
+++ b/perl-package/AI-MXNetCAPI/mxnet.i
@@ -1203,6 +1203,12 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle,
const mx_uint num_provided_arg_dtypes,
const char** in, // provided_arg_dtype_names,
const int* in, // provided_arg_dtypes,
+
+//--------------- sparse related variables, ignored for now
+ const mx_uint num_provided_arg_stypes,
+ const char** provided_arg_stype_names,
+ const int* provided_arg_stypes,
+//---------------
const mx_uint num_shared_arg_names,
const char** in, // shared_arg_name_list,
//------------
diff --git a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i
index 640215fd7792..5d2fbd6880a1 100644
--- a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i
+++ b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i
@@ -820,6 +820,17 @@
}
}
+%typemap(in,numinputs=0) (const mx_uint num_provided_arg_stypes, const char** provided_arg_stype_names,
+ const int* provided_arg_stypes)
+ (mx_uint temp1, char* temp2, int temp3)
+{
+ $2 = &temp2;
+ $3 = &temp3;
+ $1 = 0;
+ *$2 = NULL;
+ *$3 = 0;
+}
+
%typemap(in,numinputs=0) (mx_uint* num_aux_states,
NDArrayHandle** aux_states)
(mx_uint temp1,
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index 3c3ce76a9284..72dc2b2fec8d 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -26,6 +26,7 @@
from . import base
from . import contrib
from . import ndarray
+from . import ndarray as nd
from . import name
# use mx.sym as short for symbol
from . import symbol as sym
@@ -34,8 +35,6 @@
from . import io
from . import recordio
from . import operator
-# use mx.nd as short for mx.ndarray
-from . import ndarray as nd
# use mx.rnd as short for mx.random
from . import random as rnd
from . import random
diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py
index 5a50f80498ec..c2e6fce40de8 100644
--- a/python/mxnet/_ctypes/ndarray.py
+++ b/python/mxnet/_ctypes/ndarray.py
@@ -32,10 +32,19 @@
from ..ndarray_doc import _build_doc
+_STORAGE_TYPE_ID_TO_STR = {
+ -1 : 'undefined',
+ 0 : 'default',
+ 1 : 'row_sparse',
+ 2 : 'csr',
+}
+
+
class NDArrayBase(object):
"""Base data structure for ndarray"""
__slots__ = ["handle", "writable"]
# pylint: disable= no-member
+
def __init__(self, handle, writable=True):
"""initialize a new NDArray
@@ -78,7 +87,11 @@ def _imperative_invoke(handle, ndargs, keys, vals, out):
output_vars = ctypes.POINTER(NDArrayHandle)()
num_output = ctypes.c_int(0)
- check_call(_LIB.MXImperativeInvoke(
+ # return output stypes to avoid the c_api call for checking
+ # a handle's stype in _ndarray_cls
+ out_stypes = ctypes.POINTER(ctypes.c_int)()
+
+ check_call(_LIB.MXImperativeInvokeEx(
ctypes.c_void_p(handle),
ctypes.c_int(len(ndargs)),
c_array(NDArrayHandle, [arr.handle for arr in ndargs]),
@@ -86,14 +99,17 @@ def _imperative_invoke(handle, ndargs, keys, vals, out):
ctypes.byref(output_vars),
ctypes.c_int(len(keys)),
c_array(ctypes.c_char_p, [c_str(key) for key in keys]),
- c_array(ctypes.c_char_p, [c_str(str(val)) for val in vals])))
+ c_array(ctypes.c_char_p, [c_str(str(val)) for val in vals]),
+ ctypes.byref(out_stypes)))
if original_output is not None:
return original_output
if num_output.value == 1:
- return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle))
+ return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle),
+ stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[0]])
else:
- return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle))
+ return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle),
+ stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[i]])
for i in range(num_output.value)]
@@ -128,17 +144,24 @@ def __call__(self, *args, **kwargs):
"CachedOp.__call__ got unexpected keyword argument(s): " + \
', '.join(kwargs.keys()))
- check_call(_LIB.MXInvokeCachedOp(
+ # return output stypes to avoid the c_api call for checking
+ # a handle's stype in _ndarray_cls
+ out_stypes = ctypes.POINTER(ctypes.c_int)()
+
+ check_call(_LIB.MXInvokeCachedOpEx(
self.handle,
ctypes.c_int(len(args)),
c_array(NDArrayHandle, [arr.handle for arr in args]),
ctypes.byref(num_output),
- ctypes.byref(output_vars)))
+ ctypes.byref(output_vars),
+ ctypes.byref(out_stypes)))
if original_output is not None:
return original_output
if num_output.value == 1:
- return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle))
+ return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle),
+ stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[0]])
else:
- return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle))
+ return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle),
+ stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[i]])
for i in range(num_output.value)]
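
Returning `out_stypes` from the Ex calls lets the frontend wrap each output handle in the right Python class without an extra C API round trip per output. A minimal sketch of the dispatch that `_ndarray_cls` (defined in `sparse_ndarray.py`, later in this diff) is expected to perform:

    def _ndarray_cls_sketch(handle, writable=True, stype='undefined'):
        # fall back to one MXNDArrayGetStorageType call when no stype was passed
        if stype == 'undefined':
            stype = _storage_type(handle)
        if stype == 'csr':
            return CSRNDArray(handle, writable=writable)
        elif stype == 'row_sparse':
            return RowSparseNDArray(handle, writable=writable)
        return NDArray(handle, writable=writable)
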
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index aad0580e7d07..d446355da0b5 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -72,6 +72,20 @@ def __str__(self):
msg += ' is not implemented for Symbol and only available in NDArray.'
return msg
+class NotSupportedForSparseNDArray(MXNetError):
+ def __init__(self, function, alias, *args):
+ super(NotSupportedForSparseNDArray, self).__init__()
+ self.function = function.__name__
+ self.alias = alias
+ self.args = [str(type(a)) for a in args]
+ def __str__(self):
+ msg = 'Function {}'.format(self.function)
+ if self.alias:
+ msg += ' (namely operator "{}")'.format(self.alias)
+ if self.args:
+ msg += ' with arguments ({})'.format(', '.join(self.args))
+ msg += ' is not supported for SparseNDArray and only available in NDArray.'
+ return msg
class MXCallbackList(ctypes.Structure):
"""Structure that holds Callback information. Passed to CustomOpProp."""
diff --git a/python/mxnet/contrib/autograd.py b/python/mxnet/contrib/autograd.py
index c7fb6e17803a..2d2500e7a217 100644
--- a/python/mxnet/contrib/autograd.py
+++ b/python/mxnet/contrib/autograd.py
@@ -24,6 +24,7 @@
import functools
from ..base import _LIB, check_call, string_types
from ..base import mx_uint, NDArrayHandle, c_array
+# pylint: disable= unused-import
from ..ndarray import NDArray, zeros_like
from ..symbol import _GRAD_REQ_MAP
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
index baff834bb33a..5cc94a5e80ac 100644
--- a/python/mxnet/executor.py
+++ b/python/mxnet/executor.py
@@ -27,6 +27,7 @@
from .base import mx_uint, NDArrayHandle, ExecutorHandle
from .base import check_call, c_array, py_str
from .ndarray import NDArray
+from .ndarray import _ndarray_cls
from . import ndarray as nd
# those functions are not used here, we just import them to keep backward compatibility
@@ -105,7 +106,9 @@ def _get_outputs(self):
handles = ctypes.POINTER(NDArrayHandle)()
check_call(_LIB.MXExecutorOutputs(self.handle,
ctypes.byref(out_size), ctypes.byref(handles)))
- return [NDArray(NDArrayHandle(handles[i])) for i in range(out_size.value)]
+ num_output = out_size.value
+ outputs = [_ndarray_cls(NDArrayHandle(handles[i])) for i in range(num_output)]
+ return outputs
def forward(self, is_train=False, **kwargs):
"""Calculate the outputs specified by the bound symbol.
diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py
index 8ac1aebe72dd..f67b05de5de3 100644
--- a/python/mxnet/image/detection.py
+++ b/python/mxnet/image/detection.py
@@ -27,7 +27,7 @@
from ..base import numeric_types
from .. import ndarray as nd
-from .._ndarray_internal import _cvcopyMakeBorder as copyMakeBorder
+from ..ndarray._internal import _cvcopyMakeBorder as copyMakeBorder
from .. import io
from .image import RandomOrderAug, ColorJitterAug, LightingAug, ColorNormalizeAug
from .image import ResizeAug, ForceResizeAug, CastAug, HueJitterAug, RandomGrayAug
diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py
index 2e40019971ac..d99db214222c 100644
--- a/python/mxnet/image/image.py
+++ b/python/mxnet/image/image.py
@@ -34,9 +34,9 @@
from ..base import numeric_types
from .. import ndarray as nd
-from .. import _ndarray_internal as _internal
-from .._ndarray_internal import _cvimresize as imresize
-from .._ndarray_internal import _cvcopyMakeBorder as copyMakeBorder
+from ..ndarray import _internal
+from ..ndarray._internal import _cvimresize as imresize
+from ..ndarray._internal import _cvcopyMakeBorder as copyMakeBorder
from .. import io
from .. import recordio
diff --git a/python/mxnet/io.py b/python/mxnet/io.py
index 0404e34ea36c..4e69a8a801cb 100644
--- a/python/mxnet/io.py
+++ b/python/mxnet/io.py
@@ -34,6 +34,7 @@
from .base import mx_real_t
from .base import check_call, build_param_doc as _build_param_doc
from .ndarray import NDArray
+from .ndarray import _ndarray_cls
from .ndarray import array
from .ndarray import concatenate
@@ -801,12 +802,12 @@ def iter_next(self):
def getdata(self):
hdl = NDArrayHandle()
check_call(_LIB.MXDataIterGetData(self.handle, ctypes.byref(hdl)))
- return NDArray(hdl, False)
+ return _ndarray_cls(hdl, False)
def getlabel(self):
hdl = NDArrayHandle()
check_call(_LIB.MXDataIterGetLabel(self.handle, ctypes.byref(hdl)))
- return NDArray(hdl, False)
+ return _ndarray_cls(hdl, False)
def getindex(self):
index_size = ctypes.c_uint64(0)
diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py
index fd0091182aea..84759263007c 100644
--- a/python/mxnet/kvstore.py
+++ b/python/mxnet/kvstore.py
@@ -22,6 +22,7 @@
import ctypes
import pickle
from .ndarray import NDArray
+from .ndarray import _ndarray_cls
from .base import _LIB
from .base import check_call, c_array, c_str, string_types, mx_uint, py_str
from .base import NDArrayHandle, KVStoreHandle
@@ -53,8 +54,8 @@ def _updater_wrapper(updater):
"""A wrapper for the user-defined handle."""
def updater_handle(key, lhs_handle, rhs_handle, _):
""" ctypes function """
- lhs = NDArray(NDArrayHandle(lhs_handle))
- rhs = NDArray(NDArrayHandle(rhs_handle))
+ lhs = _ndarray_cls(NDArrayHandle(lhs_handle))
+ rhs = _ndarray_cls(NDArrayHandle(rhs_handle))
updater(key, lhs, rhs)
return updater_handle
@@ -186,6 +187,8 @@ def pull(self, key, out=None, priority=0):
The returned values are guaranteed to be the latest values in the store.
+ For row_sparse values, please use `row_sparse_pull` instead.
+
Parameters
----------
key : int or list of int
@@ -231,11 +234,89 @@ def pull(self, key, out=None, priority=0):
[ 2. 2. 2.]]
"""
assert(out is not None)
+ if not isinstance(out, (list, tuple)):
+ out = [out]
+ for val in out:
+ if not isinstance(val, (list, tuple)):
+ assert(val.stype == 'default')
+ else:
+ for v in val:
+ assert(v.stype == 'default')
ckeys, cvals = _ctype_key_value(key, out)
check_call(_LIB.MXKVStorePullEx(
self.handle, mx_uint(len(ckeys)), ckeys, cvals,
ctypes.c_int(priority)))
+ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None):
+ """ Pulls a single row_sparse value or a sequence of row_sparse values from the store
+ with specified row_ids.
+
+ `row_sparse_pull` is executed asynchronously after all previous
+ `push`/`pull`/`row_sparse_pull` calls for the same input key(s) are finished.
+
+ The returned values are guaranteed to be the latest values in the store.
+
+ Parameters
+ ----------
+ key : str or list of str
+ Keys.
+
+ out: NDArray or list of NDArray or list of list of NDArray
+ Values corresponding to the keys. The stype is expected to be row_sparse
+
+ priority : int, optional
+ The priority of the pull operation.
+ Higher priority pull operations are likely to be executed before
+ other pull actions.
+
+ row_ids : NDArray or list of NDArray
+ The row_ids for which to pull each value. The row_ids do not need to be unique
+ or sorted.
+
+ Examples
+ --------
+ >>> shape = (3, 3)
+ >>> kv.init('3', mx.nd.ones(shape).tostype('row_sparse'))
+ >>> a = mx.nd.zeros(shape, stype='row_sparse')
+ >>> row_ids = mx.nd.array([0, 2], dtype='int64')
+ >>> kv.row_sparse_pull('3', out=a, row_ids=row_ids)
+ >>> print a.asnumpy()
+ [[ 1. 1. 1.]
+ [ 0. 0. 0.]
+ [ 1. 1. 1.]]
+ >>> duplicate_row_ids = mx.nd.array([2, 2], dtype='int64')
+ >>> kv.row_sparse_pull('3', out=a, row_ids=duplicate_row_ids)
+ >>> print a.asnumpy()
+ [[ 0. 0. 0.]
+ [ 0. 0. 0.]
+ [ 1. 1. 1.]]
+ >>> unsorted_row_ids = mx.nd.array([1, 0], dtype='int64')
+ >>> kv.row_sparse_pull('3', out=a, row_ids=unsorted_row_ids)
+ >>> print a.asnumpy()
+ [[ 1. 1. 1.]
+ [ 1. 1. 1.]
+ [ 0. 0. 0.]]
+ """
+ assert(out is not None)
+ assert(row_ids is not None)
+ if isinstance(row_ids, NDArray):
+ row_ids = [row_ids]
+ if not isinstance(out, (list, tuple)):
+ out = [out]
+ for val in out:
+ if not isinstance(val, (list, tuple)):
+ assert(val.stype == 'row_sparse')
+ else:
+ for v in val:
+ assert(v.stype == 'row_sparse')
+ ckeys, cvals = _ctype_key_value(key, out)
+ _, crow_ids = _ctype_key_value(key, row_ids)
+ assert(len(crow_ids) == len(cvals)), "number of row_ids doesn't match number of values"
+
+ check_call(_LIB.MXKVStorePullRowSparse(
+ self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority)))
+
+
def set_optimizer(self, optimizer):
""" Registers an optimizer with the kvstore.
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index 01b3fa50e18f..38bb15484e7b 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -93,15 +93,29 @@ def _create_kvstore(kvstore, num_device, arg_params):
return (kv, update_on_kvstore)
-def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names,
- update_on_kvstore):
+def _contains_non_default_storage(params):
+    if isinstance(params, (list, tuple)):
+        for param in params:
+            if param.stype != 'default':
+                return True
+    elif isinstance(params, NDArray):
+        return params.stype != 'default'
+    else:
+        return False
+
+def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, update_on_kvstore):
"""Initialize kvstore"""
for idx, param_on_devs in enumerate(param_arrays):
name = param_names[idx]
kvstore.init(name, arg_params[name])
if update_on_kvstore:
- kvstore.pull(name, param_on_devs, priority=-idx)
+ if _contains_non_default_storage(param_on_devs):
+ # skip pulling row_sparse weights
+ warnings.warn('Detected non-default weight in kvstore to pull. Please make ' \
+ 'sure to pull it with row_ids explicitly', RuntimeWarning)
+ else:
+ kvstore.pull(name, param_on_devs, priority=-idx)
def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names):
"""Perform update of param_arrays from grad_arrays on kvstore."""
@@ -113,25 +127,36 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names):
# push gradient, priority is negative index
kvstore.push(name, grad_list, priority=-index)
# pull back the weights
- kvstore.pull(name, arg_list, priority=-index)
+ if _contains_non_default_storage(arg_list):
+ # skip pulling row_sparse weights
+ warnings.warn('Detected non-default weight in kvstore to pull. Please make ' \
+ 'sure to pull it with row_ids', RuntimeWarning)
+ else:
+ kvstore.pull(name, arg_list, priority=-index)
def _update_params(param_arrays, grad_arrays, updater, num_device,
kvstore=None, param_names=None):
"""Perform update of param_arrays from grad_arrays not on kvstore."""
- for index, pair in enumerate(zip(param_arrays, grad_arrays)):
+ for i, pair in enumerate(zip(param_arrays, grad_arrays)):
arg_list, grad_list = pair
if grad_list[0] is None:
continue
+ index = i
if kvstore:
name = param_names[index]
# push gradient, priority is negative index
kvstore.push(name, grad_list, priority=-index)
# pull back the sum gradients, to the same locations.
- kvstore.pull(name, grad_list, priority=-index)
+ if _contains_non_default_storage(grad_list):
+ # skip pulling row_sparse weights
+ warnings.warn('Detected non-default weight in kvstore to pull. Please make ' \
+ 'sure to pull it with row_ids', RuntimeWarning)
+ else:
+ kvstore.pull(name, grad_list, priority=-index)
for k, p in enumerate(zip(arg_list, grad_list)):
# faked an index here, to make optimizer create diff
# state for the same index but on diff devs, TODO(mli)
- # use a better solution latter
+ # use a better solution later
w, g = p
updater(index*num_device+k, g, w)
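
The warnings above point at the intended pattern for row_sparse weights: instead of a full `pull`, call `row_sparse_pull` with the row ids the upcoming batch actually touches. A hedged sketch, where `batch_row_ids` is a hypothetical int64 NDArray of the feature ids in the current batch:

    # pull back only the rows this batch needs
    kvstore.row_sparse_pull(name, out=param_on_devs, priority=-index,
                            row_ids=batch_row_ids)
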
diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py
index 3123462f9c7c..bae166e3ffd8 100644
--- a/python/mxnet/module/base_module.py
+++ b/python/mxnet/module/base_module.py
@@ -957,7 +957,8 @@ def bind(self, data_shapes, label_shapes=None, for_training=True,
def init_optimizer(self, kvstore='local', optimizer='sgd',
optimizer_params=(('learning_rate', 0.01),), force_init=False):
- """Installs and initializes optimizers.
+ """Installs and initializes optimizers, as well as initialize kvstore for
+ distributed training
Parameters
----------
diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py
index 058edd57eb3d..d55b2117ebd3 100644
--- a/python/mxnet/module/module.py
+++ b/python/mxnet/module/module.py
@@ -25,7 +25,6 @@
import warnings
from .. import context as ctx
-from .. import ndarray as nd
from .. import optimizer as opt
from .executor_group import DataParallelExecutorGroup
@@ -33,6 +32,7 @@
from ..model import load_checkpoint
from ..initializer import Uniform, InitDesc
from ..io import DataDesc
+from ..ndarray import zeros
from .base_module import BaseModule, _check_input_names, _parse_data_desc
@@ -427,13 +427,13 @@ def bind(self, data_shapes, label_shapes=None, for_training=True,
else:
assert self._arg_params is None and self._aux_params is None
param_arrays = [
- nd.zeros(x[0].shape, dtype=x[0].dtype)
+ zeros(shape=x[0].shape, dtype=x[0].dtype, stype=x[0].stype)
for x in self._exec_group.param_arrays
]
self._arg_params = {name:arr for name, arr in zip(self._param_names, param_arrays)}
aux_arrays = [
- nd.zeros(x[0].shape, dtype=x[0].dtype)
+ zeros(x[0].shape, dtype=x[0].dtype)
for x in self._exec_group.aux_arrays
]
self._aux_params = {name:arr for name, arr in zip(self._aux_names, aux_arrays)}
@@ -441,7 +441,6 @@ def bind(self, data_shapes, label_shapes=None, for_training=True,
if shared_module is not None and shared_module.optimizer_initialized:
self.borrow_optimizer(shared_module)
-
def reshape(self, data_shapes, label_shapes=None):
"""Reshapes the module for new input shapes.
@@ -483,6 +482,7 @@ def init_optimizer(self, kvstore='local', optimizer='sgd',
if self._params_dirty:
self._sync_params_from_devices()
+
(kvstore, update_on_kvstore) = \
_create_kvstore(kvstore, len(self._context), self._arg_params)
diff --git a/python/mxnet/ndarray/__init__.py b/python/mxnet/ndarray/__init__.py
new file mode 100644
index 000000000000..016e25de382c
--- /dev/null
+++ b/python/mxnet/ndarray/__init__.py
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""NDArray API of MXNet."""
+
+from . import _internal
+from . import op
+from .op import CachedOp
+# pylint: disable=wildcard-import, redefined-builtin
+from .ndarray import *
+from .utils import load, save, zeros, empty, array
+from .sparse_ndarray import _ndarray_cls, csr_matrix, row_sparse_array
+from .sparse_ndarray import BaseSparseNDArray, RowSparseNDArray, CSRNDArray
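
After the package split, the familiar `mx.nd` surface keeps working while the sparse classes and helpers become importable from the same namespace. A quick sketch (sparse-aware save/load is assumed from the serialization changes elsewhere in this PR):

    import mxnet as mx

    w = mx.nd.zeros((4, 2), stype='row_sparse')   # stype-aware zeros from ndarray.utils
    mx.nd.save('w.bin', [w])
    loaded = mx.nd.load('w.bin')[0]
    print(type(loaded).__name__, loaded.stype)    # RowSparseNDArray row_sparse
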
diff --git a/python/mxnet/_ndarray_internal.py b/python/mxnet/ndarray/_internal.py
similarity index 100%
rename from python/mxnet/_ndarray_internal.py
rename to python/mxnet/ndarray/_internal.py
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray/ndarray.py
similarity index 86%
rename from python/mxnet/ndarray.py
rename to python/mxnet/ndarray/ndarray.py
index 42f0ff5e87cf..26d5cd453a5b 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray/ndarray.py
@@ -21,6 +21,7 @@
"""NDArray API of MXNet."""
from __future__ import absolute_import
from __future__ import division
+
try:
from __builtin__ import slice as py_slice
except ImportError:
@@ -28,40 +29,25 @@
import ctypes
import warnings
-
-import os as _os
-import sys as _sys
-
import operator
import numpy as np
-from .base import _LIB, string_types, numeric_types, integer_types
-from .base import c_array, py_str, c_str, mx_real_t, _Null # pylint: disable=unused-import
-from .base import mx_uint, NDArrayHandle, check_call, OpHandle
-from .base import ctypes2buffer
-from .context import Context
-from . import _ndarray_internal as _internal
-from .ndarray_doc import _build_doc
-
-
-# Use different version of SymbolBase
-# When possible, use cython to speedup part of computation.
-# pylint: disable=unused-import
-try:
- if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0:
- from ._ctypes.ndarray import NDArrayBase, _set_ndarray_class
- from ._ctypes.ndarray import CachedOp, _imperative_invoke
- elif _sys.version_info >= (3, 0):
- from ._cy3.ndarray import NDArrayBase, _set_ndarray_class, _imperative_invoke
- from ._cy3.ndarray import CachedOp, _imperative_invoke
- else:
- from ._cy2.ndarray import NDArrayBase, _set_ndarray_class, _imperative_invoke
- from ._cy2.ndarray import CachedOp, _imperative_invoke
-except ImportError:
- if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0:
- raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1")
- from ._ctypes.ndarray import NDArrayBase, _set_ndarray_class, _imperative_invoke
- from ._ctypes.ndarray import CachedOp, _imperative_invoke
-# pylint: enable=unused-import
+from ..base import _LIB, numeric_types, integer_types
+from ..base import c_array, mx_real_t
+from ..base import mx_uint, NDArrayHandle, check_call
+from ..base import ctypes2buffer
+from ..context import Context
+from . import _internal
+from .op import NDArrayBase, _STORAGE_TYPE_ID_TO_STR
+from . import broadcast_add, broadcast_mul, transpose, broadcast_not_equal, broadcast_power
+from . import broadcast_sub, broadcast_div, broadcast_to, broadcast_equal, cast_storage
+from . import broadcast_greater, broadcast_greater_equal, broadcast_lesser, broadcast_lesser_equal
+from . import zeros_like, slice
+
+__all__ = ["NDArray", "concatenate", "_DTYPE_NP_TO_MX", "_DTYPE_MX_TO_NP", "_GRAD_REQ_MAP", \
+ "ones", "add", "arange", "divide", "equal", "full", "greater", "greater_equal", \
+ "imdecode", "lesser", "lesser_equal", "maximum", "minimum", "moveaxis", \
+ "multiply", "negative", "not_equal", "onehot_encode", "power", "subtract", \
+ "true_divide", "waitall", "_new_empty_handle"]
# pylint: disable= no-member
_DTYPE_NP_TO_MX = {
@@ -74,7 +60,6 @@
np.int8 : 5,
np.int64 : 6,
}
-
_DTYPE_MX_TO_NP = {
-1 : None,
0 : np.float32,
@@ -85,7 +70,12 @@
5 : np.int8,
6 : np.int64,
}
-
+_STORAGE_TYPE_STR_TO_ID = {
+ 'undefined' : -1,
+ 'default' : 0,
+ 'row_sparse' : 1,
+ 'csr' : 2,
+}
_GRAD_REQ_MAP = {
'null': 0,
'write': 1,
@@ -135,6 +125,11 @@ def waitall():
"""
check_call(_LIB.MXNDArrayWaitAll())
+def _storage_type(handle):
+ storage_type = ctypes.c_int(0)
+ check_call(_LIB.MXNDArrayGetStorageType(handle, ctypes.byref(storage_type)))
+ return _STORAGE_TYPE_ID_TO_STR[storage_type.value]
+
class NDArray(NDArrayBase):
"""An array object representing a multidimensional, homogeneous array of
fixed-size items.
@@ -144,6 +139,7 @@ class NDArray(NDArrayBase):
# make numpy functions return NDArray instead of numpy object array
__array_priority__ = 1000.0
# pylint: disable= no-member, undefined-variable
+
def __repr__(self):
"""Returns a string representation of the array."""
shape_info = 'x'.join(['%d' % x for x in self.shape])
@@ -151,6 +147,9 @@ def __repr__(self):
self.__class__.__name__,
shape_info, self.context)
+ def __reduce__(self):
+ return NDArray, (None,), self.__getstate__()
+
def __add__(self, other):
"""x.__add__(y) <=> x+y <=> mx.nd.add(x, y) """
return add(self, other)
@@ -742,7 +741,6 @@ def wait_to_read(self):
"""
check_call(_LIB.MXNDArrayWaitToRead(self.handle))
-
@property
def ndim(self):
"""Returns the number of dimensions of this array
@@ -777,6 +775,7 @@ def shape(self):
self.handle, ctypes.byref(ndim), ctypes.byref(pdata)))
return tuple(pdata[:ndim.value])
+
@property
def size(self):
"""Number of elements in the array.
@@ -841,6 +840,12 @@ def dtype(self):
self.handle, ctypes.byref(mx_dtype)))
return _DTYPE_MX_TO_NP[mx_dtype.value]
+ @property
+ def stype(self):
+ """Storage-type of the array.
+ """
+ return _storage_type(self.handle)
+
@property
# pylint: disable= invalid-name, undefined-variable
def T(self):
@@ -943,7 +948,7 @@ def astype(self, dtype):
>>> y.dtype
<type 'numpy.int32'>
"""
- res = empty(self.shape, ctx=self.context, dtype=dtype)
+ res = _empty_ndarray(self.shape, ctx=self.context, dtype=dtype)
self.copyto(res)
return res
@@ -964,7 +969,7 @@ def copyto(self, other):
Returns
-------
- NDArray
+ NDArray, CSRNDArray or RowSparseNDArray
The copied array. If ``other`` is an ``NDArray``, then the return value
and ``other`` will point to the same ``NDArray``.
@@ -1101,6 +1106,19 @@ def backward(self, out_grad=None, retain_graph=False, train_mode=True):
ctypes.c_int(retain_graph),
ctypes.c_int(train_mode)))
+ def tostype(self, stype):
+ """Return a copy of the array with chosen storage type.
+
+ See Also
+ ----------
+ :meth:`mxnet.ndarray.cast_storage`.
+
+ Returns
+ -------
+ NDArray, CSRNDArray or RowSparseNDArray
+ A copy of the array with the chosen storage stype
+ """
+ return cast_storage(self, stype=stype)
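
A short usage sketch for `tostype`, which always produces a copy via `cast_storage`:

    a = mx.nd.ones((2, 3))
    b = a.tostype('csr')        # dense -> csr copy
    c = b.tostype('default')    # and back to dense
    print(b.stype, c.stype)     # csr default
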
def onehot_encode(indices, out):
"""One-hot encoding indices into matrix out.
@@ -1113,74 +1131,7 @@ def onehot_encode(indices, out):
# pylint: enable= no-member, protected-access
-def empty(shape, ctx=None, dtype=mx_real_t):
- """Returns a new array of given shape and type, without initializing entries.
-
- Parameters
- ----------
- shape : int or tuple of int
- The shape of the empty array.
- ctx : Context, optional
- An optional device context (default is the current default context).
- dtype : str or numpy.dtype, optional
- An optional value type (default is `float32`).
-
- Returns
- -------
- NDArray
- A created array.
-
- Examples
- --------
- >>> mx.nd.empty(1)
- <NDArray 1 @cpu(0)>
- >>> mx.nd.empty((1,2), mx.gpu(0))
- <NDArray 1x2 @gpu(0)>
- >>> mx.nd.empty((1,2), mx.gpu(0), 'float16')
- <NDArray 1x2 @gpu(0)>
- """
- if isinstance(shape, integer_types):
- shape = (shape, )
- if ctx is None:
- ctx = Context.default_ctx
- return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype))
-
-def zeros(shape, ctx=None, dtype=mx_real_t, **kwargs):
- """Returns a new array filled with all zeros, with the given shape and type.
-
- Parameters
- ----------
- shape : int or tuple of int
- The shape of the empty array.
- ctx : Context, optional
- An optional device context (default is the current default context).
- dtype : str or numpy.dtype, optional
- An optional value type (default is `float32`).
- out : NDArray, optional
- The output NDArray (default is `None`).
-
- Returns
- -------
- NDArray
- A created array
-
- Examples
- --------
- >>> mx.nd.zeros(1).asnumpy()
- array([ 0.], dtype=float32)
- >>> mx.nd.zeros((1,2), mx.gpu(0))
- <NDArray 1x2 @gpu(0)>
- >>> mx.nd.zeros((1,2), mx.gpu(0), 'float16').asnumpy()
- array([[ 0., 0.]], dtype=float16)
- """
- # pylint: disable= unused-argument
- if ctx is None:
- ctx = Context.default_ctx
- # pylint: disable= no-member, protected-access
- return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs)
- # pylint: enable= no-member, protected-access
-
-def ones(shape, ctx=None, dtype=mx_real_t, **kwargs):
+def ones(shape, ctx=None, dtype=None, **kwargs):
"""Returns a new array filled with all ones, with the given shape and type.
Parameters
@@ -1212,6 +1163,7 @@ def ones(shape, ctx=None, dtype=mx_real_t, **kwargs):
# pylint: disable= unused-argument
if ctx is None:
ctx = Context.default_ctx
+ dtype = mx_real_t if dtype is None else dtype
# pylint: disable= no-member, protected-access
return _internal._ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs)
# pylint: enable= no-member, protected-access
@@ -1246,12 +1198,11 @@ def full(shape, val, ctx=None, dtype=mx_real_t, out=None):
>>> mx.nd.full((1, 2), 2.0, dtype='float16').asnumpy()
array([[ 2., 2.]], dtype=float16)
"""
- out = empty(shape, ctx, dtype) if out is None else out
+ out = _empty_ndarray(shape, ctx, dtype) if out is None else out
out[:] = val
return out
-
-def array(source_array, ctx=None, dtype=None):
+def _array(source_array, ctx=None, dtype=None):
"""Creates an array from any object exposing the array interface.
Parameters
@@ -1269,18 +1220,6 @@ def array(source_array, ctx=None, dtype=None):
-------
NDArray
An `NDArray` with the same contents as the `source_array`.
-
- Examples
- --------
- >>> import numpy as np
- >>> mx.nd.array([1, 2, 3])
- <NDArray 3 @cpu(0)>
- >>> mx.nd.array([[1, 2], [3, 4]])
- <NDArray 2x2 @cpu(0)>
- >>> mx.nd.array(np.zeros((3, 2)))
- <NDArray 3x2 @cpu(0)>
- >>> mx.nd.array(np.zeros((3, 2)), mx.gpu(0))
- <NDArray 3x2 @gpu(0)>
"""
if isinstance(source_array, NDArray):
dtype = source_array.dtype if dtype is None else dtype
@@ -1291,11 +1230,10 @@ def array(source_array, ctx=None, dtype=None):
source_array = np.array(source_array, dtype=dtype)
except:
raise TypeError('source_array must be array like object')
- arr = empty(source_array.shape, ctx, dtype)
+ arr = _empty_ndarray(source_array.shape, ctx, dtype)
arr[:] = source_array
return arr
-
def moveaxis(tensor, source, destination):
"""Moves the `source` axis into the `destination` position
while leaving the other axes in their original order
@@ -2309,96 +2247,6 @@ def negative(arr):
"""
return multiply(arr, -1.0)
-
-def load(fname):
- """Loads an array from file.
-
- See more details in ``save``.
-
- Parameters
- ----------
- fname : str
- The filename.
-
- Returns
- -------
- list of NDArray or dict of str to NDArray
- Loaded data.
- """
- if not isinstance(fname, string_types):
- raise TypeError('fname required to be a string')
- out_size = mx_uint()
- out_name_size = mx_uint()
- handles = ctypes.POINTER(NDArrayHandle)()
- names = ctypes.POINTER(ctypes.c_char_p)()
- check_call(_LIB.MXNDArrayLoad(c_str(fname),
- ctypes.byref(out_size),
- ctypes.byref(handles),
- ctypes.byref(out_name_size),
- ctypes.byref(names)))
- if out_name_size.value == 0:
- return [NDArray(NDArrayHandle(handles[i])) for i in range(out_size.value)]
- else:
- assert out_name_size.value == out_size.value
- return dict(
- (py_str(names[i]), NDArray(NDArrayHandle(handles[i]))) for i in range(out_size.value))
-
-
-def save(fname, data):
- """Saves a list of arrays or a dict of str->array to file.
-
- Examples of filenames:
-
- - ``/path/to/file``
- - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 supports)
- - ``hdfs://path/to/file`` (if compiled with HDFS supports)
-
- Parameters
- ----------
- fname : str
- The filename.
- data : ``NDArray``, list of ``NDArray`` or dict of str to ``NDArray``
- The data to save.
-
- Examples
- --------
- >>> x = mx.nd.zeros((2,3))
- >>> y = mx.nd.ones((1,4))
- >>> mx.nd.save('my_list', [x,y])
- >>> mx.nd.save('my_dict', {'x':x, 'y':y})
- >>> mx.nd.load('my_list')
- [<NDArray 2x3 @cpu(0)>, <NDArray 1x4 @cpu(0)>]
- >>> mx.nd.load('my_dict')
- {'y': <NDArray 1x4 @cpu(0)>, 'x': <NDArray 2x3 @cpu(0)>}
- """
- if isinstance(data, NDArray):
- data = [data]
- handles = []
- if isinstance(data, dict):
- keys = []
- for key, val in data.items():
- if not isinstance(key, string_types):
- raise TypeError('save only accept dict str->NDArray or list of NDArray')
- if not isinstance(val, NDArray):
- raise TypeError('save only accept dict str->NDArray or list of NDArray')
- keys.append(c_str(key))
- handles.append(val.handle)
- keys = c_array(ctypes.c_char_p, keys)
- elif isinstance(data, list):
- for val in data:
- if not isinstance(val, NDArray):
- raise TypeError('save only accept dict str->NDArray or list of NDArray')
- handles.append(val.handle)
- keys = None
- else:
- raise ValueError("data needs to either be a NDArray, dict of str, NDArray pairs "
- "or a list of NDarrays.")
- check_call(_LIB.MXNDArraySave(c_str(fname),
- mx_uint(len(handles)),
- c_array(NDArrayHandle, handles),
- keys))
-
-
def concatenate(arrays, axis=0, always_copy=True):
"""DEPRECATED, use ``concat`` instead
@@ -2435,7 +2283,7 @@ def concatenate(arrays, axis=0, always_copy=True):
assert shape_rest2 == arr.shape[axis+1:]
assert dtype == arr.dtype
ret_shape = shape_rest1 + (shape_axis,) + shape_rest2
- ret = empty(ret_shape, ctx=arrays[0].context, dtype=dtype)
+ ret = _empty_ndarray(ret_shape, ctx=arrays[0].context, dtype=dtype)
idx = 0
begin = [0 for _ in ret_shape]
@@ -2497,159 +2345,64 @@ def imdecode(str_img, clip_rect=(0, 0, 0, 0), out=None, index=0, channels=3, mea
out=out)
-# pylint: disable=too-many-locals, invalid-name
-def _make_ndarray_function(handle, name):
- """Create a NDArray function from the FunctionHandle."""
- real_name = ctypes.c_char_p()
- desc = ctypes.c_char_p()
- num_args = mx_uint()
- arg_names = ctypes.POINTER(ctypes.c_char_p)()
- arg_types = ctypes.POINTER(ctypes.c_char_p)()
- arg_descs = ctypes.POINTER(ctypes.c_char_p)()
- key_var_num_args = ctypes.c_char_p()
- ret_type = ctypes.c_char_p()
-
- check_call(_LIB.MXSymbolGetAtomicSymbolInfo(
- handle, ctypes.byref(real_name), ctypes.byref(desc),
- ctypes.byref(num_args),
- ctypes.byref(arg_names),
- ctypes.byref(arg_types),
- ctypes.byref(arg_descs),
- ctypes.byref(key_var_num_args),
- ctypes.byref(ret_type)))
- narg = int(num_args.value)
- arg_names = [py_str(arg_names[i]) for i in range(narg)]
- arg_types = [py_str(arg_types[i]) for i in range(narg)]
- func_name = name
- key_var_num_args = py_str(key_var_num_args.value)
- ret_type = py_str(ret_type.value) if ret_type.value is not None else ''
- doc_str = _build_doc(func_name,
- py_str(desc.value),
- arg_names,
- arg_types,
- [py_str(arg_descs[i]) for i in range(narg)],
- key_var_num_args,
- ret_type)
-
- dtype_name = None
- arr_name = None
- ndsignature = []
- signature = []
- ndarg_names = []
- kwarg_names = []
- for i in range(narg):
- name, atype = arg_names[i], arg_types[i]
- if name == 'dtype':
- dtype_name = name
- signature.append('%s=_Null'%name)
- elif atype.startswith('NDArray') or atype.startswith('Symbol'):
- assert not arr_name, \
- "Op can only have one argument with variable " \
- "size and it must be the last argument."
- if atype.endswith('[]'):
- ndsignature.append('*%s'%name)
- arr_name = name
- else:
- ndsignature.append('%s=None'%name)
- ndarg_names.append(name)
- else:
- signature.append('%s=_Null'%name)
- kwarg_names.append(name)
- signature.append('out=None')
- signature.append('name=None')
- signature.append('**kwargs')
- signature = ndsignature + signature
-
- code = []
- if arr_name:
- code.append("""
-def %s(*%s, **kwargs):"""%(func_name, arr_name))
- code.append("""
- ndargs = []
- for i in {}:
- assert isinstance(i, NDArrayBase), \\
- "Positional arguments must have NDArray type, " \\
- "but got %s"%str(i)
- ndargs.append(i)""".format(arr_name))
- if dtype_name is not None:
- code.append("""
- if '%s' in kwargs:
- kwargs['%s'] = np.dtype(kwargs['%s']).name"""%(
- dtype_name, dtype_name, dtype_name))
- code.append("""
- _ = kwargs.pop('name', None)
- out = kwargs.pop('out', None)
- keys = list(kwargs.keys())
- vals = list(kwargs.values())""")
- else:
- code.append("""
-def %s(%s):
- ndargs = []
- keys = list(kwargs.keys())
- vals = list(kwargs.values())"""%(func_name, ', '.join(signature)))
- # NDArray args
- for name in ndarg_names: # pylint: disable=redefined-argument-from-local
- code.append("""
- if {name} is not None:
- assert isinstance({name}, NDArrayBase), \\
- "Argument {name} must have NDArray type, but got %s"%str({name})
- ndargs.append({name})""".format(name=name))
- # kwargs
- for name in kwarg_names: # pylint: disable=redefined-argument-from-local
- code.append("""
- if %s is not _Null:
- keys.append('%s')
- vals.append(%s)"""%(name, name, name))
- # dtype
- if dtype_name is not None:
- code.append("""
- if %s is not _Null:
- keys.append('%s')
- vals.append(np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name))
-
- code.append("""
- return _imperative_invoke(%d, ndargs, keys, vals, out)"""%(
- handle.value))
-
- local = {}
- exec(''.join(code), None, local) # pylint: disable=exec-used
- ndarray_function = local[func_name]
- ndarray_function.__name__ = func_name
- ndarray_function.__doc__ = doc_str
- ndarray_function.__module__ = 'mxnet.ndarray'
- return ndarray_function
-
-
-# pylint: enable=too-many-locals, invalid-name
-def _init_ndarray_module(ndarray_class, root_namespace):
- """List and add all the ndarray functions to current module."""
- _set_ndarray_class(ndarray_class)
- plist = ctypes.POINTER(ctypes.c_char_p)()
- size = ctypes.c_uint()
-
- check_call(_LIB.MXListAllOpNames(ctypes.byref(size),
- ctypes.byref(plist)))
- op_names = []
- for i in range(size.value):
- op_names.append(py_str(plist[i]))
-
- module_obj = _sys.modules["%s.ndarray" % root_namespace]
- module_internal = _sys.modules["%s._ndarray_internal" % root_namespace]
- module_contrib = _sys.modules["%s.contrib.ndarray" % root_namespace]
- for name in op_names:
- hdl = OpHandle()
- check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl)))
- function = _make_ndarray_function(hdl, name)
- if function.__name__.startswith('_contrib_'):
- function.__name__ = function.__name__[9:]
- function.__module__ = 'mxnet.contrib.ndarray'
- setattr(module_contrib, function.__name__, function)
- elif function.__name__.startswith('_'):
- setattr(module_internal, function.__name__, function)
- else:
- setattr(module_obj, function.__name__, function)
+def _zeros_ndarray(shape, ctx=None, dtype=None, **kwargs):
+ """Returns a new array filled with all zeros, with the given shape and type.
-_init_ndarray_module(NDArray, "mxnet")
+ Parameters
+ ----------
+ shape : int or tuple of int
+ The shape of the array.
+ ctx : Context, optional
+ An optional device context (default is the current default context).
+ dtype : str or numpy.dtype, optional
+ An optional value type (default is `float32`).
+ out : NDArray, optional
+ The output NDArray (default is `None`).
-# from .base import add_fileline_to_docstring
-# add_fileline_to_docstring(__name__)
+ Returns
+ -------
+ NDArray
+ A created array.
+
+ Examples
+ --------
+ >>> mx.nd.zeros(1).asnumpy()
+ array([ 0.], dtype=float32)
+ >>> mx.nd.zeros((1,2), mx.gpu(0))
+ <NDArray 1x2 @gpu(0)>
+ >>> mx.nd.zeros((1,2), mx.gpu(0), 'float16').asnumpy()
+ array([[ 0., 0.]], dtype=float16)
+ """
+ # pylint: disable= unused-argument
+ if ctx is None:
+ ctx = Context.default_ctx
+ dtype = mx_real_t if dtype is None else dtype
+ # pylint: disable= no-member, protected-access
+ return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs)
+ # pylint: enable= no-member, protected-access
+
+def _empty_ndarray(shape, ctx=None, dtype=None):
+ """Returns a new array of given shape and type, without initializing entries.
+
+ Parameters
+ ----------
+ shape : int or tuple of int
+ The shape of the empty array.
+ ctx : Context, optional
+ An optional device context (default is the current default context).
+ dtype : str or numpy.dtype, optional
+ An optional value type (default is `float32`).
+
+ Returns
+ -------
+ NDArray
+ A created array.
+
+ """
+ if isinstance(shape, int):
+ shape = (shape, )
+ if ctx is None:
+ ctx = Context.default_ctx
+ if dtype is None:
+ dtype = mx_real_t
+ return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype))
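+
+# A minimal usage sketch (shapes and dtypes illustrative): _empty_ndarray
+# allocates without initializing memory, so contents are undefined until
+# written, while _zeros_ndarray dispatches to the backend zeros operator:
+#
+#   buf = _empty_ndarray((2, 3))                  # contents undefined
+#   buf[:] = 1                                    # now fully defined
+#   z = _zeros_ndarray((2, 3), dtype='float16')   # all zeros, float16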
diff --git a/python/mxnet/ndarray/op.py b/python/mxnet/ndarray/op.py
new file mode 100644
index 000000000000..7580362c0cc1
--- /dev/null
+++ b/python/mxnet/ndarray/op.py
@@ -0,0 +1,205 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Register backend ops in mxnet.ndarray namespace"""
+
+import sys as _sys
+import os as _os
+import ctypes
+import numpy as np # pylint: disable=unused-import
+
+from ..ndarray_doc import _build_doc
+
+# Use a different version of NDArrayBase.
+# When possible, use Cython to speed up parts of the computation.
+# pylint: disable=unused-import
+try:
+ if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0:
+ from .._ctypes.ndarray import NDArrayBase, _set_ndarray_class, _STORAGE_TYPE_ID_TO_STR
+ from .._ctypes.ndarray import CachedOp, _imperative_invoke
+ elif _sys.version_info >= (3, 0):
+ from .._cy3.ndarray import NDArrayBase, _set_ndarray_class,\
+ _imperative_invoke, _STORAGE_TYPE_ID_TO_STR
+ from .._cy3.ndarray import CachedOp, _imperative_invoke
+ else:
+ from .._cy2.ndarray import NDArrayBase, _set_ndarray_class,\
+ _imperative_invoke, _STORAGE_TYPE_ID_TO_STR
+ from .._cy2.ndarray import CachedOp, _imperative_invoke
+except ImportError:
+ if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0:
+ raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1")
+ from .._ctypes.ndarray import NDArrayBase, _set_ndarray_class,\
+ _imperative_invoke, _STORAGE_TYPE_ID_TO_STR
+ from .._ctypes.ndarray import CachedOp, _imperative_invoke
+
+from ..base import mx_uint, check_call, _LIB, py_str, OpHandle, c_str, _Null
+# pylint: enable=unused-import
+
+
+# pylint: disable=too-many-locals, invalid-name
+def _make_ndarray_function(handle, name):
+ """Create a NDArray function from the FunctionHandle."""
+ real_name = ctypes.c_char_p()
+ desc = ctypes.c_char_p()
+ num_args = mx_uint()
+ arg_names = ctypes.POINTER(ctypes.c_char_p)()
+ arg_types = ctypes.POINTER(ctypes.c_char_p)()
+ arg_descs = ctypes.POINTER(ctypes.c_char_p)()
+ key_var_num_args = ctypes.c_char_p()
+ ret_type = ctypes.c_char_p()
+
+ check_call(_LIB.MXSymbolGetAtomicSymbolInfo(
+ handle, ctypes.byref(real_name), ctypes.byref(desc),
+ ctypes.byref(num_args),
+ ctypes.byref(arg_names),
+ ctypes.byref(arg_types),
+ ctypes.byref(arg_descs),
+ ctypes.byref(key_var_num_args),
+ ctypes.byref(ret_type)))
+ narg = int(num_args.value)
+ arg_names = [py_str(arg_names[i]) for i in range(narg)]
+ arg_types = [py_str(arg_types[i]) for i in range(narg)]
+ func_name = name
+ key_var_num_args = py_str(key_var_num_args.value)
+ ret_type = py_str(ret_type.value) if ret_type.value is not None else ''
+ doc_str = _build_doc(func_name,
+ py_str(desc.value),
+ arg_names,
+ arg_types,
+ [py_str(arg_descs[i]) for i in range(narg)],
+ key_var_num_args,
+ ret_type)
+
+ dtype_name = None
+ arr_name = None
+ ndsignature = []
+ signature = []
+ ndarg_names = []
+ kwarg_names = []
+ for i in range(narg):
+ name, atype = arg_names[i], arg_types[i]
+ if name == 'dtype':
+ dtype_name = name
+ signature.append('%s=_Null'%name)
+ elif atype.startswith('NDArray') or atype.startswith('Symbol'):
+ assert not arr_name, \
+ "Op can only have one argument with variable " \
+ "size and it must be the last argument."
+ if atype.endswith('[]'):
+ ndsignature.append('*%s'%name)
+ arr_name = name
+ else:
+ ndsignature.append('%s=None'%name)
+ ndarg_names.append(name)
+ else:
+ signature.append('%s=_Null'%name)
+ kwarg_names.append(name)
+ signature.append('out=None')
+ signature.append('name=None')
+ signature.append('**kwargs')
+ signature = ndsignature + signature
+
+ code = []
+ if arr_name:
+ code.append("""
+def %s(*%s, **kwargs):"""%(func_name, arr_name))
+ code.append("""
+ ndargs = []
+ for i in {}:
+ assert isinstance(i, NDArrayBase), \\
+ "Positional arguments must have NDArray type, " \\
+ "but got %s"%str(i)
+ ndargs.append(i)""".format(arr_name))
+ if dtype_name is not None:
+ code.append("""
+ if '%s' in kwargs:
+ kwargs['%s'] = np.dtype(kwargs['%s']).name"""%(
+ dtype_name, dtype_name, dtype_name))
+ code.append("""
+ _ = kwargs.pop('name', None)
+ out = kwargs.pop('out', None)
+ keys = list(kwargs.keys())
+ vals = list(kwargs.values())""")
+ else:
+ code.append("""
+def %s(%s):
+ ndargs = []
+ keys = list(kwargs.keys())
+ vals = list(kwargs.values())"""%(func_name, ', '.join(signature)))
+ # NDArray args
+ for name in ndarg_names: # pylint: disable=redefined-argument-from-local
+ code.append("""
+ if {name} is not None:
+ assert isinstance({name}, NDArrayBase), \\
+ "Argument {name} must have NDArray type, but got %s"%str({name})
+ ndargs.append({name})""".format(name=name))
+ # kwargs
+ for name in kwarg_names: # pylint: disable=redefined-argument-from-local
+ code.append("""
+ if %s is not _Null:
+ keys.append('%s')
+ vals.append(%s)"""%(name, name, name))
+ # dtype
+ if dtype_name is not None:
+ code.append("""
+ if %s is not _Null:
+ keys.append('%s')
+ vals.append(np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name))
+
+ code.append("""
+ return _imperative_invoke(%d, ndargs, keys, vals, out)"""%(
+ handle.value))
+
+ local = {}
+ exec(''.join(code), None, local) # pylint: disable=exec-used
+ ndarray_function = local[func_name]
+ ndarray_function.__name__ = func_name
+ ndarray_function.__doc__ = doc_str
+ ndarray_function.__module__ = 'mxnet.ndarray'
+ return ndarray_function
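+
+# For illustration only (hypothetical op): for an op `plus_scalar` with one
+# NDArray argument `data` and one scalar attribute `scalar`, the source
+# assembled above comes out roughly as
+#
+#   def plus_scalar(data=None, scalar=_Null, out=None, name=None, **kwargs):
+#       ndargs = []
+#       keys = list(kwargs.keys())
+#       vals = list(kwargs.values())
+#       if data is not None:
+#           assert isinstance(data, NDArrayBase), \
+#               "Argument data must have NDArray type, but got %s"%str(data)
+#           ndargs.append(data)
+#       if scalar is not _Null:
+#           keys.append('scalar')
+#           vals.append(scalar)
+#       return _imperative_invoke(handle_value, ndargs, keys, vals, out)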
+
+
+# pylint: enable=too-many-locals, invalid-name
+def _init_ndarray_module(root_namespace):
+ """List and add all the ndarray functions to current module."""
+ plist = ctypes.POINTER(ctypes.c_char_p)()
+ size = ctypes.c_uint()
+
+ check_call(_LIB.MXListAllOpNames(ctypes.byref(size),
+ ctypes.byref(plist)))
+ op_names = []
+ for i in range(size.value):
+ op_names.append(py_str(plist[i]))
+
+ module_obj = _sys.modules["%s.ndarray" % root_namespace]
+ module_internal = _sys.modules["%s.ndarray._internal" % root_namespace]
+ module_contrib = _sys.modules["%s.contrib.ndarray" % root_namespace]
+ for name in op_names:
+ hdl = OpHandle()
+ check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl)))
+ function = _make_ndarray_function(hdl, name)
+ if function.__name__.startswith('_contrib_'):
+ function.__name__ = function.__name__[9:]
+ function.__module__ = 'mxnet.contrib.ndarray'
+ setattr(module_contrib, function.__name__, function)
+ elif function.__name__.startswith('_'):
+ setattr(module_internal, function.__name__, function)
+ else:
+ setattr(module_obj, function.__name__, function)
+
+# register backend operators in mx.nd
+_init_ndarray_module("mxnet")
diff --git a/python/mxnet/ndarray/sparse_ndarray.py b/python/mxnet/ndarray/sparse_ndarray.py
new file mode 100644
index 000000000000..4259fe170121
--- /dev/null
+++ b/python/mxnet/ndarray/sparse_ndarray.py
@@ -0,0 +1,906 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+"""SparseNDArray API of mxnet."""
+from __future__ import absolute_import
+from __future__ import division
+try:
+ from __builtin__ import slice as py_slice
+except ImportError:
+ from builtins import slice as py_slice
+
+import ctypes
+import warnings
+
+import os as _os
+import sys as _sys
+
+# import operator
+import numpy as np
+from ..base import NotSupportedForSparseNDArray
+from ..base import _LIB, numeric_types
+from ..base import c_array, mx_real_t
+from ..base import mx_uint, NDArrayHandle, check_call
+from ..context import Context
+from . import _internal
+from .ndarray import _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP
+from .ndarray import _STORAGE_TYPE_STR_TO_ID
+from .ndarray import NDArray, _storage_type, _zeros_ndarray, _array
+from . import cast_storage
+from . import slice as nd_slice
+
+# Use a different version of NDArrayBase.
+# When possible, use Cython to speed up parts of the computation.
+# pylint: disable=unused-import
+try:
+ if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0:
+ from .._ctypes.ndarray import NDArrayBase, _set_ndarray_class
+ elif _sys.version_info >= (3, 0):
+ from .._cy3.ndarray import NDArrayBase, _set_ndarray_class
+ else:
+ from .._cy2.ndarray import NDArrayBase, _set_ndarray_class
+except ImportError:
+ if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0:
+ raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1")
+ from .._ctypes.ndarray import NDArrayBase, _set_ndarray_class
+
+# pylint: enable=unused-import
+_STORAGE_AUX_TYPES = {
+ 'row_sparse': [np.int64],
+ 'csr': [np.int64, np.int64]
+}
+
+
+def _new_alloc_handle(stype, shape, ctx, delay_alloc, dtype, aux_types, aux_shapes=None):
+ """Return a new handle with specified storage type, shape, dtype and context.
+
+ An empty handle is only used to hold results.
+
+ Returns
+ -------
+ handle
+ A new empty ndarray handle
+ """
+ hdl = NDArrayHandle()
+ aux_type_ids = [int(_DTYPE_NP_TO_MX[np.dtype(aux_t).type]) for aux_t in aux_types]
+ aux_shapes = [(0,) for aux_t in aux_types] if aux_shapes is None else aux_shapes
+ aux_shape_lens = [len(aux_shape) for aux_shape in aux_shapes]
+ aux_shapes = sum(aux_shapes, ())
+ num_aux = mx_uint(len(aux_types))
+ check_call(_LIB.MXNDArrayCreateSparseEx(
+ ctypes.c_int(int(_STORAGE_TYPE_STR_TO_ID[stype])),
+ c_array(mx_uint, shape),
+ mx_uint(len(shape)),
+ ctypes.c_int(ctx.device_typeid),
+ ctypes.c_int(ctx.device_id),
+ ctypes.c_int(int(delay_alloc)),
+ ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])),
+ num_aux,
+ c_array(ctypes.c_int, aux_type_ids),
+ c_array(mx_uint, aux_shape_lens),
+ c_array(mx_uint, aux_shapes),
+ ctypes.byref(hdl)))
+ return hdl
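+
+# Illustrative call: allocating a 4x3 'csr' handle with aux_shapes=[(5,), (3,)]
+# (indptr, indices) passes aux_shape_lens=[1, 1] and the flattened tuple
+# (5, 3) as aux_shapes to MXNDArrayCreateSparseEx above.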
+
+class BaseSparseNDArray(NDArray):
+ """The base class of an NDArray stored in a sparse storage format.
+
+ See CSRNDArray and RowSparseNDArray for more details.
+ """
+
+ def __iadd__(self, other):
+ raise NotImplementedError()
+
+ def __isub__(self, other):
+ raise NotImplementedError()
+
+ def __imul__(self, other):
+ raise NotImplementedError()
+
+ def __idiv__(self, other):
+ raise NotImplementedError()
+
+ def __itruediv__(self, other):
+ raise NotImplementedError()
+
+ def _sync_copyfrom(self, source_array):
+ raise NotImplementedError()
+
+ def _at(self, idx):
+ raise NotSupportedForSparseNDArray(self._at, '[idx]', idx)
+
+ def _slice(self, start, stop):
+ raise NotSupportedForSparseNDArray(self._slice, None, start, stop)
+
+ def reshape(self, shape):
+ raise NotSupportedForSparseNDArray(self.reshape, None, shape)
+
+ def _aux_type(self, i):
+ """Data-type of the array's ith aux data.
+
+ Returns
+ -------
+ numpy.dtype
+ This BaseSparseNDArray's aux data type.
+ """
+ aux_type = ctypes.c_int()
+ check_call(_LIB.MXNDArrayGetAuxType(self.handle, i, ctypes.byref(aux_type)))
+ return _DTYPE_MX_TO_NP[aux_type.value]
+
+ @property
+ def _num_aux(self):
+ """The number of aux data used to help store the sparse ndarray.
+ """
+ return len(_STORAGE_AUX_TYPES[self.stype])
+
+ @property
+ def _aux_types(self):
+ """The data types of the aux data for the BaseSparseNDArray.
+ """
+ aux_types = []
+ num_aux = self._num_aux
+ for i in range(num_aux):
+ aux_types.append(self._aux_type(i))
+ return aux_types
+
+ def asnumpy(self):
+ """Return a dense ``numpy.ndarray`` object with value copied from this array
+ """
+ return self.tostype('default').asnumpy()
+
+ def astype(self, dtype):
+ """Returns a copy of the array after casting to a specified type.
+ Parameters
+ ----------
+ dtype : numpy.dtype or str
+ The type of the returned array.
+ Examples
+ --------
+ >>> x = mx.nd.zeros('row_sparse', (2,3), dtype='float32')
+ >>> y = x.astype('int32')
+ >>> y.dtype
+
+ """
+ res = _zeros_sparse_ndarray(shape=self.shape, ctx=self.context,
+ dtype=dtype, stype=self.stype)
+ self.copyto(res)
+ return res
+
+ def copyto(self, other):
+ """Copies the value of this array to another array.
+
+ Parameters
+ ----------
+ other : NDArray or CSRNDArray or RowSparseNDArray or Context
+ The destination array or context.
+
+ Returns
+ -------
+ NDArray or CSRNDArray or RowSparseNDArray
+ The copied array.
+ """
+ if isinstance(other, NDArray):
+ if other.handle is self.handle:
+ warnings.warn('You are attempting to copy an array to itself', RuntimeWarning)
+ return
+ return _internal._copyto(self, out=other)
+ elif isinstance(other, Context):
+ hret = _ndarray_cls(_new_alloc_handle(self.stype, self.shape, other,
+ True, self.dtype, self._aux_types))
+ return _internal._copyto(self, out=hret)
+ else:
+ raise TypeError('copyto does not support type ' + str(type(other)))
+
+ def _data(self):
+ """A deep copy NDArray of the data array associated with the BaseSparseNDArray.
+
+ This function blocks. Do not use it in performance critical code.
+ """
+ self.wait_to_read()
+ hdl = NDArrayHandle()
+ check_call(_LIB.MXNDArrayGetDataNDArray(self.handle, ctypes.byref(hdl)))
+ return NDArray(hdl)
+
+
+ def _aux_data(self, i):
+ """ Get a deep copy NDArray of the i-th aux data array associated with the
+ BaseSparseNDArray.
+
+ This function blocks. Do not use it in performance critical code.
+ """
+ self.wait_to_read()
+ hdl = NDArrayHandle()
+ check_call(_LIB.MXNDArrayGetAuxNDArray(self.handle, i, ctypes.byref(hdl)))
+ return NDArray(hdl)
+
+
+# pylint: disable=abstract-method
+class CSRNDArray(BaseSparseNDArray):
+ """A sparse representation of 2D NDArray in the standard CSR format.
+
+ A CSRNDArray represents an NDArray as three separate arrays: `data`,
+ `indptr` and `indices`. It uses the standard CSR representation where the column indices for
+ row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their corresponding values are
+ stored in ``data[indptr[i]:indptr[i+1]]``.
+
+ Example
+ -------
+ >>> a = mx.nd.array([[0, 1, 0], [2, 0, 0], [0, 0, 0], [0, 0, 3]])
+ >>> a = a.tostype('csr')
+ >>> a.indices.asnumpy()
+ array([1, 0, 2])
+ >>> a.indptr.asnumpy()
+ array([0, 1, 2, 2, 3])
+ >>> a.data.asnumpy()
+ array([ 1., 2., 3.], dtype=float32)
+ """
+
+ def __reduce__(self):
+ return CSRNDArray, (None,), super(CSRNDArray, self).__getstate__()
+
+ def __iadd__(self, other):
+ (self + other).copyto(self)
+ return self
+
+ def __isub__(self, other):
+ (self - other).copyto(self)
+ return self
+
+ def __imul__(self, other):
+ (self * other).copyto(self)
+ return self
+
+ def __idiv__(self, other):
+ (self / other).copyto(self)
+ return self
+
+ def __itruediv__(self, other):
+ (self / other).copyto(self)
+ return self
+
+ def __getitem__(self, key):
+ """x.__getitem__(i) <=> x[i]
+
+ Returns a sliced view of this array.
+
+ Parameters
+ ----------
+ key : slice
+ Indexing key.
+
+ Examples
+ --------
+ >>> indptr = np.array([0, 2, 3, 6])
+ >>> indices = np.array([0, 2, 2, 0, 1, 2])
+ >>> data = np.array([1, 2, 3, 4, 5, 6])
+ >>> a = mx.nd.csr_matrix(data, indptr, indices, (3, 3))
+ >>> a.asnumpy()
+ array([[ 1., 0., 2.],
+ [ 0., 0., 3.],
+ [ 4., 5., 6.]], dtype=float32)
+ >>> a[1:2].asnumpy()
+ array([[ 0., 0., 3.]], dtype=float32)
+ """
+ if isinstance(key, int):
+ raise ValueError("__getitem__ with int key is not implemented for CSRNDArray")
+ if isinstance(key, py_slice):
+ if key.step is not None:
+ raise ValueError('CSRNDArray only supports contiguous slicing on axis 0')
+ if key.start is not None or key.stop is not None:
+ begin = key.start if key.start else 0
+ end = key.stop if key.stop else self.shape[0]
+ return nd_slice(self, begin=begin, end=end)
+ else:
+ return self
+ if isinstance(key, tuple):
+ raise ValueError('Multi-dimension indexing is not supported')
+
+ def __setitem__(self, key, value):
+ """x.__setitem__(i, y) <=> x[i]=y
+
+ Set self[key] to value. Only slice key [:] is supported.
+
+ Parameters
+ ----------
+ key : slice
+ The indexing key.
+ value : NDArray or CSRNDArray or numpy.ndarray
+ The value to set.
+
+ Examples
+ --------
+ >>> src = mx.nd.zeros((3,3), stype='csr')
+ >>> src.asnumpy()
+ array([[ 0., 0., 0.],
+ [ 0., 0., 0.],
+ [ 0., 0., 0.]], dtype=float32)
+ >>> # assign CSRNDArray with same storage type
+ >>> x = mx.nd.ones((3,3)).tostype('csr')
+ >>> x[:] = src
+ >>> x.asnumpy()
+ array([[ 1., 1., 1.],
+ [ 1., 1., 1.],
+ [ 1., 1., 1.]], dtype=float32)
+ >>> # assign NDArray to CSRNDArray
+ >>> x[:] = mx.nd.ones((3,3)) * 2
+ >>> x.asnumpy()
+ array([[ 2., 2., 2.],
+ [ 2., 2., 2.],
+ [ 2., 2., 2.]], dtype=float32)
+ """
+ if not self.writable:
+ raise ValueError('Failed to assign to a readonly CSRNDArray')
+ if isinstance(key, py_slice):
+ if key.step is not None or key.start is not None or key.stop is not None:
+ raise ValueError('Assignment with slice for CSRNDArray is not ' \
+ 'implemented yet.')
+ if isinstance(value, NDArray):
+ # avoid copying to itself
+ if value.handle is not self.handle:
+ value.copyto(self)
+ elif isinstance(value, numeric_types):
+ raise ValueError("Assigning numeric types to CSRNDArray is " \
+ "not implemented yet.")
+ elif isinstance(value, (np.ndarray, np.generic)):
+ # TODO(haibin/anisub) check scipy.sparse and use _sync_copy_from to
+ # avoid the temporary copy
+ warnings.warn('Assigning non-NDArray object to CSRNDArray is not efficient',
+ RuntimeWarning)
+ tmp = _array(value)
+ tmp.copyto(self)
+ else:
+ raise TypeError('type %s not supported' % str(type(value)))
+ else:
+ assert(isinstance(key, (int, tuple)))
+ raise TypeError('CSRNDArray only supports [:] for assignment')
+
+ @property
+ def indices(self):
+ """A deep copy NDArray of the indices array of the CSRNDArray.
+ This generates a deep copy of the column indices of the current `csr` matrix.
+
+ Returns
+ -------
+ NDArray
+ This CSRNDArray's indices array.
+ """
+ return self._aux_data(1)
+
+ @property
+ def indptr(self):
+ """A deep copy NDArray of the indptr array of the CSRNDArray.
+ This generates a deep copy of the `indptr` of the current `csr` matrix.
+
+ Returns
+ -------
+ NDArray
+ This CSRNDArray's indptr array.
+ """
+ return self._aux_data(0)
+
+ @property
+ def data(self):
+ """A deep copy NDArray of the data array of the CSRNDArray.
+ This generates a deep copy of the `data` of the current `csr` matrix.
+
+ Returns
+ -------
+ NDArray
+ This CSRNDArray's data array.
+ """
+ return self._data()
+
+ def tostype(self, stype):
+ """Return a copy of the array with chosen storage type.
+
+ Returns
+ -------
+ NDArray or CSRNDArray
+ A copy of the array with the chosen storage type.
+ """
+ if stype == 'row_sparse':
+ raise ValueError("cast_storage from csr to row_sparse is not supported")
+ return cast_storage(self, stype=stype)
+
+ def copyto(self, other):
+ """Copies the value of this array to another array.
+
+ If ``other`` is a ``NDArray`` or ``CSRNDArray`` object, then ``other.shape`` and
+ ``self.shape`` should be the same. This function copies the value from
+ ``self`` to ``other``.
+
+ If ``other`` is a context, a new ``CSRNDArray`` will be first created on
+ the target context, and the value of ``self`` is copied.
+
+ Parameters
+ ----------
+ other : NDArray or CSRNDArray or Context
+ The destination array or context.
+
+ Returns
+ -------
+ NDArray or CSRNDArray
+ The copied array. If ``other`` is an ``NDArray`` or ``CSRNDArray``, then the return
+ value and ``other`` will point to the same ``NDArray`` or ``CSRNDArray``.
+ """
+ if isinstance(other, Context):
+ return super(CSRNDArray, self).copyto(other)
+ elif isinstance(other, NDArray):
+ stype = other.stype
+ if stype == 'default' or stype == 'csr':
+ return super(CSRNDArray, self).copyto(other)
+ else:
+ raise TypeError('copyto does not support destination NDArray stype ' + str(stype))
+ else:
+ raise TypeError('copyto does not support type ' + str(type(other)))
+
+# pylint: disable=abstract-method
+class RowSparseNDArray(BaseSparseNDArray):
+ """A sparse representation of a set of NDArray row slices at given indices.
+
+ A RowSparseNDArray represents a multidimensional NDArray using two separate arrays: `data` and
+ `indices`.
+
+ - data: an NDArray of any dtype with shape [D0, D1, ..., Dn].
+ - indices: a 1-D int64 NDArray with shape [D0].
+
+ The `indices` array stores the indices of the row slices with non-zero entries,
+ while their values are stored in `data`. The corresponding NDArray ``dense``
+ represented by RowSparseNDArray ``rsp`` has
+
+ ``dense[rsp.indices[i], :, :, :, ...] = rsp.data[i, :, :, :, ...]``
+
+ >>> dense.asnumpy()
+ array([[ 1., 2., 3.],
+ [ 0., 0., 0.],
+ [ 4., 0., 5.],
+ [ 0., 0., 0.],
+ [ 0., 0., 0.]], dtype=float32)
+ >>> rsp = dense.tostype('row_sparse')
+ >>> rsp.indices.asnumpy()
+ array([0, 2], dtype=int64)
+ >>> rsp.data.asnumpy()
+ array([[ 1., 2., 3.],
+ [ 4., 0., 5.]], dtype=float32)
+
+ A RowSparseNDArray is typically used to represent non-zero row-slices of a large NDArray
+ of shape [LARGE0, D1, .. , Dn] where LARGE0 >> D0 and most row slices are zeros.
+
+ The indices are expected to be sorted in ascending order.
+
+ RowSparseNDArray is used principally in the definition of gradients for operations
+ that have sparse gradients (e.g. sparse dot and sparse embedding).
+ """
+ def __reduce__(self):
+ return RowSparseNDArray, (None,), super(RowSparseNDArray, self).__getstate__()
+
+ def __iadd__(self, other):
+ (self + other).copyto(self)
+ return self
+
+ def __isub__(self, other):
+ (self - other).copyto(self)
+ return self
+
+ def __imul__(self, other):
+ (self * other).copyto(self)
+ return self
+
+ def __idiv__(self, other):
+ (self / other).copyto(self)
+ return self
+
+ def __itruediv__(self, other):
+ (self / other).copyto(self)
+ return self
+
+ def __getitem__(self, key):
+ """x.__getitem__(i) <=> x[i]
+
+ Returns a sliced view of this array.
+
+ Parameters
+ ----------
+ key : slice
+ Indexing key.
+
+ Examples
+ --------
+ >>> x = mx.nd.zeros((2, 3), stype='row_sparse')
+ >>> x[:].asnumpy()
+ array([[ 0., 0., 0.],
+ [ 0., 0., 0.]], dtype=float32)
+ """
+ if isinstance(key, int):
+ raise Exception("__getitem__ with int key is not implemented for RowSparseNDArray yet")
+ if isinstance(key, py_slice):
+ if key.step is not None or key.start is not None or key.stop is not None:
+ raise Exception('RowSparseNDArray only supports [:] for __getitem__')
+ else:
+ return self
+ if isinstance(key, tuple):
+ raise ValueError('Multi-dimension indexing is not supported')
+
+ def __setitem__(self, key, value):
+ """x.__setitem__(i, y) <=> x[i]=y
+
+ Set self[key] to value. Only slice key [:] is supported.
+
+ Parameters
+ ----------
+ key : slice
+ The indexing key.
+ value : NDArray or numpy.ndarray
+ The value to set.
+
+ Examples
+ --------
+ >>> src = mx.nd.row_sparse_array([[1, 0, 2], [4, 5, 6]], [0, 2], (3,3))
+ >>> src.asnumpy()
+ array([[ 1., 0., 2.],
+ [ 0., 0., 0.],
+ [ 4., 5., 6.]], dtype=float32)
+ >>> # assign RowSparseNDArray with same storage type
+ >>> x = mx.nd.zeros((3,3), stype='row_sparse')
+ >>> x[:] = src
+ >>> x.asnumpy()
+ array([[ 1., 0., 2.],
+ [ 0., 0., 0.],
+ [ 4., 5., 6.]], dtype=float32)
+ >>> # assign NDArray to RowSparseNDArray
+ >>> x[:] = mx.nd.ones((3,3))
+ >>> x.asnumpy()
+ array([[ 1., 1., 1.],
+ [ 1., 1., 1.],
+ [ 1., 1., 1.]], dtype=float32)
+ """
+ if not self.writable:
+ raise ValueError('Failed to assign to a readonly RowSparseNDArray')
+ if isinstance(key, py_slice):
+ if key.step is not None or key.start is not None or key.stop is not None:
+ raise ValueError('Assignment with slice for RowSparseNDArray ' \
+ 'is not implemented yet.')
+ if isinstance(value, NDArray):
+ # avoid copying to itself
+ if value.handle is not self.handle:
+ value.copyto(self)
+ elif isinstance(value, numeric_types):
+ raise ValueError("Assigning numeric types to RowSparseNDArray " \
+ "is not implemented yet.")
+ elif isinstance(value, (np.ndarray, np.generic)):
+ warnings.warn('Assigning non-NDArray object to RowSparseNDArray is not efficient',
+ RuntimeWarning)
+ tmp = _array(value)
+ tmp.copyto(self)
+ else:
+ raise TypeError('type %s not supported' % str(type(value)))
+ else:
+ assert(isinstance(key, (int, tuple)))
+ raise TypeError('RowSparseNDArray only supports [:] for assignment')
+
+ @property
+ def indices(self):
+ """A deep copy NDArray of the indices array of the RowSparseNDArray.
+ This generates a deep copy of the row indices of the current `row_sparse` matrix.
+
+ Returns
+ -------
+ NDArray
+ This RowSparseNDArray's indices array.
+ """
+ return self._aux_data(0)
+
+ @property
+ def data(self):
+ """A deep copy NDArray of the data array of the RowSparseNDArray.
+ This generates a deep copy of the `data` of the current `row_sparse` matrix.
+
+ Returns
+ -------
+ NDArray
+ This RowSparseNDArray's data array.
+ """
+ return self._data()
+
+ def tostype(self, stype):
+ """Return a copy of the array with chosen storage type.
+
+ Returns
+ -------
+ NDArray or RowSparseNDArray
+ A copy of the array with the chosen storage type.
+ """
+ if stype == 'csr':
+ raise ValueError("cast_storage from row_sparse to csr is not supported")
+ return cast_storage(self, stype=stype)
+
+ def copyto(self, other):
+ """Copies the value of this array to another array.
+
+ If ``other`` is a ``NDArray`` or ``RowSparseNDArray`` object, then ``other.shape``
+ and ``self.shape`` should be the same. This function copies the value from
+ ``self`` to ``other``.
+
+ If ``other`` is a context, a new ``RowSparseNDArray`` will be first created on
+ the target context, and the value of ``self`` is copied.
+
+ Parameters
+ ----------
+ other : NDArray or RowSparseNDArray or Context
+ The destination array or context.
+
+ Returns
+ -------
+ NDArray or RowSparseNDArray
+ The copied array. If ``other`` is an ``NDArray`` or ``RowSparseNDArray``, then the
+ return value and ``other`` will point to the same ``NDArray`` or ``RowSparseNDArray``.
+ """
+ if isinstance(other, Context):
+ return super(RowSparseNDArray, self).copyto(other)
+ elif isinstance(other, NDArray):
+ stype = other.stype
+ if stype == 'default' or stype == 'row_sparse':
+ return super(RowSparseNDArray, self).copyto(other)
+ else:
+ raise TypeError('copyto does not support destination NDArray stype ' + str(stype))
+ else:
+ raise TypeError('copyto does not support type ' + str(type(other)))
+
+def _prepare_src_array(src, dtype, default_dtype):
+ """Prepare `src` and its dtype so that they can be used to construct NDArray.
+ `src` is converted to a `np.ndarray` if it's neither an `NDArray` nor an `np.ndarray`.
+ """
+ if isinstance(src, NDArray):
+ dtype = src.dtype if dtype is None else dtype
+ else:
+ dtype = default_dtype if dtype is None else dtype
+ if not isinstance(src, np.ndarray):
+ try:
+ src = np.array(src, dtype=dtype)
+ except Exception:
+ raise TypeError('values must be an array-like object')
+ return src, dtype
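+
+# A sketch of the contract (values illustrative): _prepare_src_array([1, 2, 3],
+# None, mx_real_t) returns (np.array([1., 2., 3.], dtype=np.float32),
+# mx_real_t), while an NDArray input passes through unchanged, keeping its own
+# dtype when `dtype` is None.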
+
+
+def csr_matrix(data, indptr, indices, shape, ctx=None, dtype=None, indptr_type=None,
+ indices_type=None):
+ """Creates a 2D array with compressed sparse row(CSR) format.
+
+ Parameters
+ ----------
+ data: array_like
+ An object exposing the array interface, with shape [nnz], where nnz is the number of
+ non-zero entries.
+ indptr: array_like
+ An object exposing the array interface, with shape [shape[0] + 1]. The first element in indptr
+ should always be zero.
+ indices: array_like
+ An object exposing the array interface, with shape [nnz].
+ ctx: Context, optional
+ Device context (default is the current default context).
+ dtype: str or numpy.dtype, optional
+ The data type of the output array. The default dtype is ``values.dtype``
+ if `values` is an `NDArray`, `float32` otherwise.
+ indptr_type: str or numpy.dtype, optional
+ The data type of the indptr array. The default dtype is ``indptr.dtype``
+ if `indptr` is an `NDArray`, `int64` otherwise.
+ indices_type: str or numpy.dtype, optional
+ The data type of the indices array. The default dtype is ``indices.dtype``
+ if `indices` is an `NDArray`, `int64` otherwise.
+
+ Returns
+ -------
+ CSRNDArray
+ A `CSRNDArray` with the `csr` storage representation.
+
+ Example
+ -------
+ >>> import mxnet as mx
+ >>> a = mx.nd.csr_matrix([1, 2, 3], [0, 1, 2, 2, 3], [1, 0, 2], (4, 3))
+ >>> a.asnumpy()
+ array([[ 0., 1., 0.],
+ [ 2., 0., 0.],
+ [ 0., 0., 0.],
+ [ 0., 0., 3.]], dtype=float32)
+ """
+ storage_type = 'csr'
+ # context
+ if ctx is None:
+ ctx = Context.default_ctx
+ # prepare src array and types
+ data, dtype = _prepare_src_array(data, dtype, mx_real_t)
+ indptr, indptr_type = _prepare_src_array(indptr, indptr_type,
+ _STORAGE_AUX_TYPES[storage_type][0])
+ indices, indices_type = _prepare_src_array(indices, indices_type,
+ _STORAGE_AUX_TYPES[storage_type][1])
+ # verify types
+ assert('int64' in str(indptr_type)), "expected int64 for indptr"
+ assert('int64' in str(indices_type)), "expected int64 for indices"
+ # verify shapes
+ aux_shapes = [indptr.shape, indices.shape]
+ assert(data.ndim == 1)
+ assert(indptr.ndim == 1)
+ assert(indices.ndim == 1)
+ assert(len(shape) == 2)
+ result = CSRNDArray(_new_alloc_handle(storage_type, shape, ctx, False, dtype,
+ [indptr_type, indices_type], aux_shapes))
+ # TODO(junwu): Convert data, indptr, and indices to mxnet NDArrays
+ # if they are not for now. In the future, we should provide a c-api
+ # to accept np.ndarray types to copy from to result.data and aux_data
+ if not isinstance(data, NDArray):
+ data = _array(data, ctx, dtype)
+ if not isinstance(indptr, NDArray):
+ indptr = _array(indptr, ctx, indptr_type)
+ if not isinstance(indices, NDArray):
+ indices = _array(indices, ctx, indices_type)
+ check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, data.handle, ctypes.c_int(-1)))
+ check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indptr.handle, ctypes.c_int(0)))
+ check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indices.handle, ctypes.c_int(1)))
+ return result
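+
+# CSR invariants implied by the inputs above (illustrative; only partially
+# checked by the asserts): len(indptr) == shape[0] + 1, indptr[0] == 0, and
+# len(data) == len(indices) == indptr[-1], the number of stored entries.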
+
+
+def row_sparse_array(data, indices, shape, ctx=None, dtype=None, indices_type=None):
+ """Creates a multidimensional row sparse array with a set of tensor slices at given indices.
+
+ Parameters
+ ----------
+ data: array_like
+ An object exposing the array interface, with shape [D0, D1, .. DK], where D0 is
+ the number of rows with non-zero entries.
+ indices: array_like
+ An object exposing the array interface, with shape [D0].
+ ctx : Context, optional
+ Device context (default is the current default context).
+ dtype : str or numpy.dtype, optional
+ The data type of the output array. The default dtype is ``data.dtype``
+ if `data` is an `NDArray`, `float32` otherwise.
+ indices_type: str or numpy.dtype, optional
+ The data type of the indices array. The default dtype is ``indices.dtype``
+ if `indices` is an `NDArray`, `int64` otherwise.
+
+ Returns
+ -------
+ RowSparseNDArray
+ A `RowSparseNDArray` with the `row_sparse` storage representation.
+
+ Example
+ -------
+ >>> a = mx.nd.row_sparse_array([[1, 2], [3, 4]], [1, 4], (6, 2))
+ >>> a.asnumpy()
+ array([[ 0., 0.],
+ [ 1., 2.],
+ [ 0., 0.],
+ [ 0., 0.],
+ [ 3., 4.],
+ [ 0., 0.]], dtype=float32)
+ """
+ storage_type = 'row_sparse'
+ # context
+ if ctx is None:
+ ctx = Context.default_ctx
+ # prepare src array and types
+ data, dtype = _prepare_src_array(data, dtype, mx_real_t)
+ indices, indices_type = _prepare_src_array(indices, indices_type,
+ _STORAGE_AUX_TYPES[storage_type][0])
+ # verify types
+ assert('int64' in str(indices_type)), "expected int64 for indices"
+ # verify shapes
+ assert(data.ndim == len(shape))
+ assert(indices.ndim == 1)
+ result = RowSparseNDArray(_new_alloc_handle(storage_type, shape, ctx, False, dtype,
+ [indices_type], [indices.shape]))
+
+ # TODO(junwu): Convert data, indptr, and indices to mxnet NDArrays
+ # if they are not for now. In the future, we should provide a c-api
+ # to accept np.ndarray types to copy from to result.data and aux_data
+ if not isinstance(data, NDArray):
+ data = _array(data, ctx, dtype)
+ if not isinstance(indices, NDArray):
+ indices = _array(indices, ctx, indices_type)
+ check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, data.handle, ctypes.c_int(-1)))
+ check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indices.handle, ctypes.c_int(0)))
+ return result
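+
+# A minimal reconstruction sketch for the 2-D case (assuming numpy as np),
+# showing the dense/row_sparse correspondence described on RowSparseNDArray:
+#
+#   rsp = row_sparse_array([[1, 2], [3, 4]], [1, 4], (6, 2))
+#   dense = np.zeros(rsp.shape, dtype=rsp.dtype)
+#   dense[rsp.indices.asnumpy(), :] = rsp.data.asnumpy()
+#   # dense now equals rsp.asnumpy()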
+
+def _ndarray_cls(handle, writable=True, stype=None):
+ if stype is None:
+ stype = _storage_type(handle)
+ if stype == 'default':
+ return NDArray(handle, writable=writable)
+ elif stype == 'csr':
+ return CSRNDArray(handle, writable=writable)
+ elif stype == 'row_sparse':
+ return RowSparseNDArray(handle, writable=writable)
+ else:
+ raise Exception("unknown storage type")
+
+
+_set_ndarray_class(_ndarray_cls)
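+
+# With this factory registered, handles returned from the C API are wrapped in
+# the matching Python class: a handle whose storage type resolves to 'csr'
+# comes back as a CSRNDArray, 'row_sparse' as a RowSparseNDArray, and
+# 'default' as a plain NDArray. This is what lets load() and simple_bind()
+# hand back sparse arrays transparently.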
+
+
+def _zeros_sparse_ndarray(stype, shape, ctx=None, dtype=None, aux_types=None, **kwargs):
+ """Return a new array of given shape and type, filled with zeros.
+
+ Parameters
+ ----------
+ stype: string
+ The storage type of the array, such as 'row_sparse', 'csr', etc.
+ shape : int or tuple of int
+ The shape of the array
+ ctx : Context, optional
+ An optional device context (default is the current default context)
+ dtype : str or numpy.dtype, optional
+ An optional value type (default is `float32`)
+ aux_types: list of numpy.dtype, optional
+ An optional list of types of the aux data for RowSparseNDArray or CSRNDArray
+ (default values depend on the storage type)
+
+ Returns
+ -------
+ RowSparseNDArray or CSRNDArray
+ A created array.
+
+ Examples
+ --------
+ >>> mx.nd.zeros((1,2), mx.cpu(), stype='csr')
+ <CSRNDArray 1x2 @cpu(0)>
+ >>> mx.nd.zeros((1,2), mx.cpu(), 'float16', stype='row_sparse').asnumpy()
+ array([[ 0., 0.]], dtype=float16)
+ """
+ if stype == 'default':
+ return _zeros_ndarray(shape, ctx=ctx, dtype=dtype, **kwargs)
+ if ctx is None:
+ ctx = Context.default_ctx
+ dtype = mx_real_t if dtype is None else dtype
+ if aux_types is None:
+ if stype == 'row_sparse' or stype == 'csr':
+ aux_types = _STORAGE_AUX_TYPES[stype]
+ else:
+ raise Exception("unknown storage type")
+ assert(len(aux_types) == len(_STORAGE_AUX_TYPES[stype]))
+ out = _ndarray_cls(_new_alloc_handle(stype, shape, ctx, True, dtype, aux_types))
+ return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, out=out, **kwargs)
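+
+# Design note: the handle above is created with delay_alloc=True and then
+# passed as `out` to the backend zeros operator, so allocation and fill happen
+# inside the engine on the array's own context rather than eagerly here.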
+
+def _empty_sparse_ndarray(stype, shape, ctx=None, dtype=None, aux_types=None):
+ """Returns a new array of given shape and type, without initializing entries.
+ """
+ if isinstance(shape, int):
+ shape = (shape, )
+ if ctx is None:
+ ctx = Context.default_ctx
+ if dtype is None:
+ dtype = mx_real_t
+ assert(stype is not None)
+ if stype == 'csr' or stype == 'row_sparse':
+ return _zeros_sparse_ndarray(stype, shape, ctx=ctx, dtype=dtype, aux_types=aux_types)
+ else:
+ raise Exception("unknown stype : " + str(stype))
+
+def _sparse_array(source_array, ctx=None, dtype=None, aux_types=None):
+ """Creates a sparse array from any object exposing the array interface.
+ """
+ if isinstance(source_array, NDArray):
+ assert(source_array.stype != 'default'), \
+ "Please use `cast_storage` to create BaseSparseNDArray from an NDArray"
+ dtype = source_array.dtype if dtype is None else dtype
+ aux_types = source_array._aux_types if aux_types is None else aux_types
+ else:
+ # TODO(haibin/anisub) support creation from scipy object when `_sync_copy_from` is ready
+ raise NotImplementedError('creating BaseSparseNDArray from ' \
+ 'a non-NDArray object is not implemented.')
+ arr = _empty_sparse_ndarray(source_array.stype, source_array.shape, ctx, dtype, aux_types)
+ arr[:] = source_array
+ return arr
diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py
new file mode 100644
index 000000000000..fa2cb5840f7e
--- /dev/null
+++ b/python/mxnet/ndarray/utils.py
@@ -0,0 +1,232 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+"""Utility functions for NDArray and BaseSparseNDArray."""
+import ctypes
+
+from ..base import _LIB, check_call, py_str, c_str, string_types, mx_uint, NDArrayHandle, c_array
+from .ndarray import NDArray, _zeros_ndarray, _empty_ndarray, _array
+from .sparse_ndarray import _zeros_sparse_ndarray, _empty_sparse_ndarray, _sparse_array
+from .sparse_ndarray import _ndarray_cls
+
+
+def zeros(shape, ctx=None, dtype=None, stype=None, aux_types=None, **kwargs):
+ """Return a new array of given shape and type, filled with zeros.
+
+ Parameters
+ ----------
+ shape : int or tuple of int
+ The shape of the array
+ ctx : Context, optional
+ An optional device context (default is the current default context)
+ dtype : str or numpy.dtype, optional
+ An optional value type (default is `float32`)
+ stype: string, optional
+ The storage type of the empty array, such as 'row_sparse', 'csr', etc.
+ aux_types: list of numpy.dtype, optional
+ An optional list of types of the aux data for RowSparseNDArray or CSRNDArray
+ (default values depend on the storage type)
+
+ Returns
+ -------
+ NDArray, CSRNDArray or RowSparseNDArray
+ A created array.
+
+ Examples
+ --------
+ >>> mx.nd.zeros((1,2), mx.cpu(), stype='csr')
+ <CSRNDArray 1x2 @cpu(0)>
+ >>> mx.nd.zeros((1,2), mx.cpu(), 'float16', stype='row_sparse').asnumpy()
+ array([[ 0., 0.]], dtype=float16)
+ """
+
+ if stype is None or stype == 'default':
+ return _zeros_ndarray(shape, ctx, dtype, **kwargs)
+ else:
+ return _zeros_sparse_ndarray(stype, shape, ctx, dtype, aux_types, **kwargs)
+
+def empty(shape, ctx=None, dtype=None, stype=None, aux_types=None):
+ """Returns a new array of given shape and type, without initializing entries.
+
+ Parameters
+ ----------
+ shape : int or tuple of int
+ The shape of the empty array.
+ ctx : Context, optional
+ An optional device context (default is the current default context).
+ dtype : str or numpy.dtype, optional
+ An optional value type (default is `float32`).
+ stype : str, optional
+ An optional storage type (default is `default`).
+ aux_types: list of numpy.dtype, optional
+ An optional list of types of the aux data for RowSparseNDArray or CSRNDArray
+ (default values depend on the storage type)
+
+ Returns
+ -------
+ NDArray, CSRNDArray or RowSparseNDArray
+ A created array.
+
+ Examples
+ --------
+ >>> mx.nd.empty(1)
+ <NDArray 1 @cpu(0)>
+ >>> mx.nd.empty((1,2), mx.gpu(0))
+ <NDArray 1x2 @gpu(0)>
+ >>> mx.nd.empty((1,2), mx.gpu(0), 'float16')
+ <NDArray 1x2 @gpu(0)>
+ >>> mx.nd.empty((1,2), stype='csr')
+ <CSRNDArray 1x2 @cpu(0)>
+ """
+ if stype is None or stype == 'default':
+ return _empty_ndarray(shape, ctx, dtype)
+ else:
+ return _empty_sparse_ndarray(stype, shape, ctx, dtype, aux_types)
+
+def array(source_array, ctx=None, dtype=None, aux_types=None):
+ """Creates an array from any object exposing the array interface.
+
+ Parameters
+ ----------
+ source_array : array_like
+ An object exposing the array interface, an object whose `__array__`
+ method returns an array, or any (nested) sequence.
+ ctx : Context, optional
+ Device context (default is the current default context).
+ dtype : str or numpy.dtype, optional
+ The data type of the output array. The default dtype is ``source_array.dtype``
+ if `source_array` is an `NDArray`, `float32` otherwise.
+ aux_types: list of numpy.dtype, optional
+ An optional list of types of the aux data for RowSparseNDArray or CSRNDArray
+ (default values depend on the storage type)
+
+ Returns
+ -------
+ NDArray, RowSparseNDArray or CSRNDArray
+ An array with the same contents as the `source_array`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> mx.nd.array([1, 2, 3])
+ <NDArray 3 @cpu(0)>
+ >>> mx.nd.array([[1, 2], [3, 4]])
+ <NDArray 2x2 @cpu(0)>
+ >>> mx.nd.array(np.zeros((3, 2)))
+ <NDArray 3x2 @cpu(0)>
+ >>> mx.nd.array(np.zeros((3, 2)), mx.gpu(0))
+ <NDArray 3x2 @gpu(0)>
+ >>> mx.nd.array(mx.nd.zeros((3, 2), stype='row_sparse'))
+ <RowSparseNDArray 3x2 @cpu(0)>
+ """
+ # TODO(haibin/anisub) Check if input is scipy.sparse object with `scipy.sparse.issparse`
+ if isinstance(source_array, NDArray) and source_array.stype != 'default':
+ return _sparse_array(source_array, ctx=ctx, dtype=dtype, aux_types=aux_types)
+ else:
+ return _array(source_array, ctx=ctx, dtype=dtype)
+
+def load(fname):
+ """Loads an array from file.
+
+ See more details in ``save``.
+
+ Parameters
+ ----------
+ fname : str
+ The filename.
+
+ Returns
+ -------
+ list of NDArray, RowSparseNDArray or CSRNDArray, or \
+ dict of str to NDArray, RowSparseNDArray or CSRNDArray
+ Loaded data.
+ """
+ if not isinstance(fname, string_types):
+ raise TypeError('fname required to be a string')
+ out_size = mx_uint()
+ out_name_size = mx_uint()
+ handles = ctypes.POINTER(NDArrayHandle)()
+ names = ctypes.POINTER(ctypes.c_char_p)()
+ check_call(_LIB.MXNDArrayLoad(c_str(fname),
+ ctypes.byref(out_size),
+ ctypes.byref(handles),
+ ctypes.byref(out_name_size),
+ ctypes.byref(names)))
+ if out_name_size.value == 0:
+ return [_ndarray_cls(NDArrayHandle(handles[i])) for i in range(out_size.value)]
+ else:
+ assert out_name_size.value == out_size.value
+ return dict(
+ (py_str(names[i]), _ndarray_cls(NDArrayHandle(handles[i])))
+ for i in range(out_size.value))
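+
+# Round-trip behavior (file name illustrative): saving a dict yields a dict on
+# load, and sparse arrays keep their storage type, e.g.
+#
+#   save('tmp.ndarray', {'w': zeros((2, 2), stype='csr')})
+#   load('tmp.ndarray')   # -> {'w': <CSRNDArray 2x2 @cpu(0)>}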
+
+
+def save(fname, data):
+ """Saves a list of arrays or a dict of str->array to file.
+
+ Examples of filenames:
+
+ - ``/path/to/file``
+ - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 support)
+ - ``hdfs://path/to/file`` (if compiled with HDFS support)
+
+ Parameters
+ ----------
+ fname : str
+ The filename.
+ data : NDArray, RowSparseNDArray or CSRNDArray, \
+ or list of NDArray, RowSparseNDArray or CSRNDArray, \
+ or dict of str to NDArray, RowSparseNDArray or CSRNDArray
+ The data to save.
+
+ Examples
+ --------
+ >>> x = mx.nd.zeros((2,3))
+ >>> y = mx.nd.ones((1,4))
+ >>> mx.nd.save('my_list', [x,y])
+ >>> mx.nd.save('my_dict', {'x':x, 'y':y})
+ >>> mx.nd.load('my_list')
+ [<NDArray 2x3 @cpu(0)>, <NDArray 1x4 @cpu(0)>]
+ >>> mx.nd.load('my_dict')
+ {'y': <NDArray 1x4 @cpu(0)>, 'x': <NDArray 2x3 @cpu(0)>}
+ """
+ if isinstance(data, NDArray):
+ data = [data]
+ handles = []
+ if isinstance(data, dict):
+ keys = []
+ for key, val in data.items():
+ if not isinstance(key, string_types):
+ raise TypeError('save only accepts dict str->NDArray or list of NDArray')
+ if not isinstance(val, NDArray):
+ raise TypeError('save only accepts dict str->NDArray or list of NDArray')
+ keys.append(c_str(key))
+ handles.append(val.handle)
+ keys = c_array(ctypes.c_char_p, keys)
+ elif isinstance(data, list):
+ for val in data:
+ if not isinstance(val, NDArray):
+ raise TypeError('save only accepts dict str->NDArray or list of NDArray')
+ handles.append(val.handle)
+ keys = None
+ else:
+ raise ValueError("data needs to either be a NDArray, dict of str, NDArray pairs "
+ "or a list of NDarrays.")
+ check_call(_LIB.MXNDArraySave(c_str(fname),
+ mx_uint(len(handles)),
+ c_array(NDArrayHandle, handles),
+ keys))
diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 1ef9cc845036..e7e283f88e43 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -339,8 +339,8 @@ class SGD(Optimizer):
state = momentum * state + lr * rescale_grad * clip(grad, clip_gradient) + wd * weight
weight = weight - state
- For details of the update algorithm see :class:`~mxnet.ndarray.sgd_update` and
- :class:`~mxnet.ndarray.sgd_mom_update`.
+ Sparse updating is supported. For details of the update algorithm see
+ :class:`~mxnet.ndarray.sgd_update` and :class:`~mxnet.ndarray.sgd_mom_update`.
This optimizer accepts the following parameters in addition to those accepted
by :class:`.Optimizer`.
@@ -367,7 +367,8 @@ def create_state(self, index, weight):
if self.multi_precision and weight.dtype == numpy.float16:
weight_master_copy = array(weight, ctx=weight.context, dtype=numpy.float32)
if self.momentum != 0.0:
- momentum = zeros(weight.shape, weight.context, dtype=numpy.float32)
+ momentum = zeros(weight.shape, weight.context, dtype=numpy.float32,
+ stype=weight.stype)
return (momentum, weight_master_copy)
if weight.dtype == numpy.float16 and not self.multi_precision:
warnings.warn("Accumulating with float16 in optimizer can lead to "
@@ -375,7 +376,7 @@ def create_state(self, index, weight):
"Consider using multi_precision=True option of the "
"SGD optimizer")
if self.momentum != 0.0:
- momentum = zeros(weight.shape, weight.context, dtype=weight.dtype)
+ momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype)
return momentum
def update(self, index, weight, grad, state):
@@ -563,8 +564,10 @@ def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
self.epsilon = epsilon
def create_state(self, index, weight):
- return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean
- zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance
+ return (zeros(weight.shape, weight.context, dtype=weight.dtype,
+ stype=weight.stype), # mean
+ zeros(weight.shape, weight.context, dtype=weight.dtype,
+ stype=weight.stype)) # variance
def update(self, index, weight, grad, state):
assert(isinstance(weight, NDArray))
@@ -669,11 +672,11 @@ def __init__(self, learning_rate=0.001, gamma1=0.9, gamma2=0.9,
def create_state(self, index, weight):
if self.centered:
return (
- zeros(weight.shape, weight.context), # n
- zeros(weight.shape, weight.context), # g
- zeros(weight.shape, weight.context)) # delta
+ zeros(weight.shape, weight.context, stype=weight.stype), # n
+ zeros(weight.shape, weight.context, stype=weight.stype), # g
+ zeros(weight.shape, weight.context, stype=weight.stype)) # delta
else:
- return (zeros(weight.shape, weight.context), ) # n
+ return (zeros(weight.shape, weight.context, stype=weight.stype),) # n
def update(self, index, weight, grad, state):
assert(isinstance(weight, NDArray))
diff --git a/python/mxnet/random.py b/python/mxnet/random.py
index 29b250d980ce..14bfc2731bd6 100644
--- a/python/mxnet/random.py
+++ b/python/mxnet/random.py
@@ -22,13 +22,13 @@
import ctypes
from .base import _LIB, check_call
-from ._ndarray_internal import _sample_uniform as uniform
-from ._ndarray_internal import _sample_normal as normal
-from ._ndarray_internal import _sample_gamma as gamma
-from ._ndarray_internal import _sample_exponential as exponential
-from ._ndarray_internal import _sample_poisson as poisson
-from ._ndarray_internal import _sample_negbinomial as negative_binomial
-from ._ndarray_internal import _sample_gennegbinomial as generalized_negative_binomial
+from .ndarray._internal import _sample_uniform as uniform
+from .ndarray._internal import _sample_normal as normal
+from .ndarray._internal import _sample_gamma as gamma
+from .ndarray._internal import _sample_exponential as exponential
+from .ndarray._internal import _sample_poisson as poisson
+from .ndarray._internal import _sample_negbinomial as negative_binomial
+from .ndarray._internal import _sample_gennegbinomial as generalized_negative_binomial
def seed(seed_state):
"""Seeds the random number generators in MXNet.
diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py
index 14cb3811deeb..2ee41884d700 100644
--- a/python/mxnet/symbol.py
+++ b/python/mxnet/symbol.py
@@ -40,6 +40,8 @@
from .context import Context
from .ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP, _GRAD_REQ_MAP
from .name import NameManager # pylint: disable=unused-import
+from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID
+from .ndarray.sparse_ndarray import _ndarray_cls
from .executor import Executor
from . import _symbol_internal as _internal
from .attribute import AttrScope
@@ -1263,8 +1265,9 @@ def _get_ndarray_inputs(arg_key, args, arg_names, allow_missing):
raise TypeError('Only accept list of NDArrays or dict of str to NDArray')
return c_array(NDArrayHandle, arg_handles), arg_arrays
- def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None,
- shared_arg_names=None, shared_exec=None, shared_buffer=None, **kwargs):
+ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
+ group2ctx=None, shared_arg_names=None, shared_exec=None,
+ shared_buffer=None, **kwargs):
"""Bind current symbol to get an executor, allocate all the arguments needed.
Allows specifying data types.
@@ -1306,6 +1309,9 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None,
type_dict : Dict of str->numpy.dtype
Input type dictionary, name->dtype
+ stype_dict : Dict of str->str
+ Input storage type dictionary, name->storage_type
+
group2ctx : Dict of string to mx.Context
The dict mapping the `ctx_group` attribute to the context assignment.
@@ -1320,7 +1326,8 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None,
shared_buffer : Dict of string to `NDArray`
The dict mapping argument names to the `NDArray` that can be reused for initializing
the current executor. This buffer will be checked for reuse if one argument name
- of the current executor is not found in `shared_arg_names`.
+ of the current executor is not found in `shared_arg_names`. The `NDArray`s are
+ expected to have default storage type.
kwargs : Dict of str->shape
Input shape dictionary, name->shape
@@ -1330,6 +1337,7 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None,
executor : mxnet.Executor
The generated executor
"""
+ # data types
num_provided_arg_types = 0
provided_arg_type_names = ctypes.POINTER(ctypes.c_char_p)() # provided type argument names
provided_arg_type_data = ctypes.POINTER(mx_uint)() # provided types
@@ -1345,6 +1353,22 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None,
provided_arg_type_names = c_array(ctypes.c_char_p, provided_arg_type_names)
provided_arg_type_data = c_array(ctypes.c_int, provided_arg_type_data)
+ # storage types
+ num_provided_arg_stypes = 0
+ # provided storage type argument names
+ provided_arg_stype_names = ctypes.POINTER(ctypes.c_char_p)()
+ provided_arg_stype_data = ctypes.POINTER(mx_uint)() # provided storage types
+ if stype_dict is not None:
+ provided_arg_stype_names = []
+ provided_arg_stype_data = []
+ for k, v in stype_dict.items():
+ if v in _STORAGE_TYPE_STR_TO_ID:
+ provided_arg_stype_names.append(c_str(k))
+ provided_arg_stype_data.append(ctypes.c_int(_STORAGE_TYPE_STR_TO_ID[v]))
+ num_provided_arg_stypes = mx_uint(len(provided_arg_stype_names))
+ provided_arg_stype_names = c_array(ctypes.c_char_p, provided_arg_stype_names)
+ provided_arg_stype_data = c_array(ctypes.c_int, provided_arg_stype_data)
+
provided_arg_shape_data = [] # shape data
# argument shape index in sdata,
# e.g. [sdata[indptr[0]], sdata[indptr[1]]) is the shape of the first arg
@@ -1418,6 +1442,8 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None,
shared_buffer_names = []
shared_buffer_handles = []
for k, v in shared_buffer.items():
+ assert(v.stype == 'default'), \
+ "shared_buffer is expected to only contain NDArrays with default storage"
shared_buffer_names.append(c_str(k))
shared_buffer_handles.append(v.handle)
shared_buffer_names = c_array(ctypes.c_char_p, shared_buffer_names)
@@ -1457,6 +1483,9 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None,
num_provided_arg_types,
provided_arg_type_names,
provided_arg_type_data,
+ num_provided_arg_stypes,
+ provided_arg_stype_names,
+ provided_arg_stype_data,
mx_uint(len(shared_arg_name_list)),
c_array(ctypes.c_char_p, shared_arg_name_list),
ctypes.byref(shared_buffer_len),
@@ -1486,11 +1515,12 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None,
shared_buffer[k] = v
# create in_args, arg_grads, and aux_states for the current executor
- arg_arrays = [NDArray(NDArrayHandle(in_arg_handles[i])) for i in range(num_in_args.value)]
- grad_arrays = [NDArray(NDArrayHandle(arg_grad_handles[i]))
+ arg_arrays = [_ndarray_cls(NDArrayHandle(in_arg_handles[i])) \
+ for i in range(num_in_args.value)]
+ grad_arrays = [_ndarray_cls(NDArrayHandle(arg_grad_handles[i]))
if arg_grad_handles[i] is not None
else None for i in range(num_in_args.value)]
- aux_arrays = [NDArray(NDArrayHandle(aux_state_handles[i]))
+ aux_arrays = [_ndarray_cls(NDArrayHandle(aux_state_handles[i]))
for i in range(num_aux_states.value)]
executor = Executor(exe_handle, self, ctx, grad_req, group2ctx)
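# Hedged usage sketch (editorial note, not part of the patch): the new
# `stype_dict` argument lets simple_bind allocate sparse argument arrays,
# which now come back through _ndarray_cls as typed sparse NDArrays.
# The symbol and argument names below are hypothetical.
import mxnet as mx
data = mx.sym.Variable('data')
weight = mx.sym.Variable('weight')
exe = mx.sym.dot(data, weight).simple_bind(
    mx.cpu(), data=(32, 100), weight=(100, 10),
    stype_dict={'weight': 'row_sparse'})
print(type(exe.arg_dict['weight']))  # expected: RowSparseNDArray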
@@ -1767,7 +1797,8 @@ def detach(self):
def backward(self):
raise NotImplementedForSymbol(self.backward, None)
-def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, init=None, **kwargs):
+def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None,
+ init=None, stype=None, **kwargs):
"""Creates a symbolic variable with specified name.
Example usage:
@@ -1794,6 +1825,8 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, ini
The dtype for input variable. If not specified, this value will be inferred.
init : initializer (mxnet.init.*)
Initializer for this variable to (optionally) override the default initializer.
+ stype : str
+ The storage type of the variable.
kwargs : Additional attribute variables
Additional attributes must start and end with double underscores.
@@ -1821,6 +1854,8 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, ini
if not isinstance(init, string_types):
init = init.dumps()
attr['__init__'] = init
+ if stype is not None:
+ attr['__storage_type__'] = str(_STORAGE_TYPE_STR_TO_ID[stype])
for k, v in kwargs.items():
if k.startswith('__') and k.endswith('__'):
attr[k] = str(v)
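# Hedged sketch (editorial): `stype` on a variable is stored as the
# '__storage_type__' attribute written above, which the simple_bind path
# reads when no explicit stype_dict is given. Variable name is hypothetical.
import mxnet as mx
w = mx.sym.var('w', stype='row_sparse')
print(w.attr('__storage_type__'))  # numeric id from _STORAGE_TYPE_STR_TO_ID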
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index c5587f8d80a8..3eeb51a443c8 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -29,17 +29,20 @@
import errno
import logging
from contextlib import contextmanager
+import scipy.sparse as sp
import numpy as np
import numpy.testing as npt
-import mxnet as mx
-from .context import Context
-from .ndarray import array
-from .symbol import Symbol
+import numpy.random as rnd
try:
import requests
except ImportError:
# in rare cases requests may be not installed
pass
+import mxnet as mx
+from .context import Context
+from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID
+from .ndarray import array
+from .symbol import Symbol
_rng = np.random.RandomState(1234)
@@ -85,6 +88,182 @@ def random_arrays(*shapes):
return arrays
+def random_sample(population, k):
+ """Return a k length list of the elements chosen from the population sequence."""
+ assert 0 <= k <= len(population)
+ population_copy = population[:]
+ np.random.shuffle(population_copy)
+ return population_copy[0:k]
+
+
+def _validate_csr_generation_inputs(num_rows, num_cols, density,
+ distribution="uniform"):
+ """Validates inputs for csr generation helper functions
+ """
+ total_nnz = int(num_rows * num_cols * density)
+ if density < 0 or density > 1:
+ raise ValueError("density has to be between 0 and 1")
+
+ if num_rows <= 0 or num_cols <= 0:
+ raise ValueError("num_rows or num_cols should be greater than 0")
+
+ if distribution == "powerlaw":
+ if total_nnz < 2 * num_rows:
+ raise ValueError("not supported for this density: %s"
+ " for this shape (%s, %s)"
+ " Please keep :"
+ " num_rows * num_cols * density >= 2 * num_rows"
+ % (density, num_rows, num_cols))
+
+
+def _get_uniform_dataset_csr(num_rows, num_cols, density=0.1, dtype=None):
+ """Returns CSRNDArray with uniform distribution
+ This generates a CSR matrix with total_nnz unique randomly chosen numbers
+ from num_rows*num_cols and arranges them in the 2d array in the
+ following way: row_index = (random_number_generated / num_rows)
+ col_index = random_number_generated - row_index * num_cols
+ """
+ _validate_csr_generation_inputs(num_rows, num_cols, density,
+ distribution="uniform")
+ csr = sp.rand(num_rows, num_cols, density, dtype=dtype, format="csr")
+ result = mx.nd.csr_matrix(csr.data, csr.indptr, csr.indices,
+ (num_rows, num_cols), dtype=dtype)
+ return result
+
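# Hedged sketch (editorial): exercising the generator above; scipy's
# sp.rand places roughly density * num_rows * num_cols nonzeros uniformly.
csr_nd = _get_uniform_dataset_csr(10, 20, density=0.1, dtype='float32')
assert csr_nd.stype == 'csr' and csr_nd.shape == (10, 20)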
+
+def _get_powerlaw_dataset_csr(num_rows, num_cols, density=0.1, dtype=None):
+ """Returns CSRNDArray with powerlaw distribution
+ with exponentially increasing number of non zeros in each row.
+ Not supported for cases where total_nnz < 2*num_rows. This is because
+ the algorithm first ensures that no row is empty by
+ putting non-zeros at the beginning of each row.
+ """
+
+ _validate_csr_generation_inputs(num_rows, num_cols, density,
+ distribution="powerlaw")
+
+ total_nnz = int(num_rows * num_cols * density)
+
+ unused_nnz = total_nnz
+ output_arr = np.zeros((num_rows, num_cols), dtype=dtype)
+ # Start with ones on each row so that no row is empty
+ for row in range(num_rows):
+ output_arr[row][0] = 1 + rnd.uniform(0.001, 2)
+ unused_nnz = unused_nnz - 1
+ if unused_nnz <= 0:
+ return mx.nd.array(output_arr).tostype("csr")
+
+ # Populate rest of matrix with 2^i items in ith row.
+ # If all the nnz have been used, return the sparse matrix;
+ # otherwise, once the max column size is reached, fill up full columns until all nnz are used.
+ col_max = 2
+ for row in range(num_rows):
+ col_limit = min(num_cols, col_max)
+ # In case col_limit reached assign same value to all elements, which is much faster
+ if col_limit == num_cols and unused_nnz > col_limit:
+ output_arr[row] = 1 + rnd.uniform(0.001, 2)
+ unused_nnz = unused_nnz - col_limit + 1
+ if unused_nnz <= 0:
+ return mx.nd.array(output_arr).tostype("csr")
+ else:
+ continue
+ for col_index in range(1, col_limit):
+ output_arr[row][col_index] = 1 + rnd.uniform(0.001, 2)
+ unused_nnz = unused_nnz - 1
+ if unused_nnz <= 0:
+ return mx.nd.array(output_arr).tostype("csr")
+ col_max = col_max * 2
+
+ if unused_nnz > 0:
+ raise ValueError("not supported for this density: %s"
+ " for this shape (%s,%s)" % (density, num_rows, num_cols))
+ else:
+ return mx.nd.array(output_arr).tostype("csr")
+
+
+def rand_sparse_ndarray(shape, stype, density=None, distribution="uniform", dtype=None):
+ """Generate a random sparse ndarray. Returns the ndarray, value(np) and indices(np)
+ Parameters
+ ----------
+ shape : list or tuple
+ stype : str, valid values: "csr" or "row_sparse"
+ density : float, optional, should be between 0 and 1
+ distribution : str, optional, valid values: "uniform" or "powerlaw"
+ dtype : numpy.dtype, optional, default value is None
+ Returns
+ -------
+ Result of type CSRNDArray or RowSparseNDArray
+ Examples
+ --------
+ Below is an example of the powerlaw distribution with csr as the stype.
+ It calculates the nnz using the shape and density.
+ It fills up the ndarray with exponentially increasing number of elements.
+ If there are enough unused nnzs, the (n+1)th row will have twice as many nnzs as the nth row;
+ otherwise, the remaining unused nnzs are placed in the (n+1)th row.
+ If the number of columns is too small and the max column size has already been reached,
+ it fills up full columns in all following rows until the required density is reached.
+
+ >>> csr_arr, _ = rand_sparse_ndarray(shape=(5, 16), stype="csr",
+ density=0.50, distribution="powerlaw")
+ >>> indptr = csr_arr.indptr.asnumpy()
+ >>> indices = csr_arr.indices.asnumpy()
+ >>> data = csr_arr.data.asnumpy()
+ >>> row2nnz = len(data[indptr[1]:indptr[2]])
+ >>> row3nnz = len(data[indptr[2]:indptr[3]])
+ >>> assert(row3nnz == 2*row2nnz)
+ >>> row4nnz = len(data[indptr[3]:indptr[4]])
+ >>> assert(row4nnz == 2*row3nnz)
+ """
+ density = rnd.rand() if density is None else density
+ dtype = default_dtype() if dtype is None else dtype
+ if stype == 'row_sparse':
+ assert (distribution == "uniform"), \
+ "Distribution %s not supported for row_sparse" % (distribution)
+ # sample index
+ idx_sample = rnd.rand(shape[0])
+ indices = np.argwhere(idx_sample < density).flatten()
+ if indices.shape[0] == 0:
+ result = mx.nd.zeros(shape, stype='row_sparse', dtype=dtype)
+ return result, (np.array([], dtype=dtype), np.array([], dtype='int64'))
+ # generate random values
+ val = rnd.rand(indices.shape[0], *shape[1:]).astype(dtype)
+ arr = mx.nd.row_sparse_array(val, indices, shape, indices_type=np.int64, dtype=dtype)
+ return arr, (val, indices)
+ elif stype == 'csr':
+ assert len(shape) == 2
+ if distribution == "uniform":
+ csr = _get_uniform_dataset_csr(shape[0], shape[1], density, dtype=dtype)
+ return csr, (csr.indptr, csr.indices, csr.data)
+ elif distribution == "powerlaw":
+ csr = _get_powerlaw_dataset_csr(shape[0], shape[1], density, dtype=dtype)
+ return csr, (csr.indptr, csr.indices, csr.data)
+ else:
+ assert(False), "Distribution not supported: %s" % (distribution)
+ else:
+ assert(False), "unknown storage type"
+
+
+def rand_ndarray(shape, stype, density=None, dtype=None):
+ if stype == 'default':
+ arr = mx.nd.array(random_arrays(shape), dtype=dtype)
+ else:
+ arr, _ = rand_sparse_ndarray(shape, stype, density=density, dtype=dtype)
+ return arr
+
+
+def rand_shape_2d(dim0=10, dim1=10):
+ return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1)
+
+
+def rand_shape_3d(dim0=10, dim1=10, dim2=10):
+ return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1), rnd.randint(1, dim2 + 1)
+
+
+def rand_shape_nd(n, dim=10):
+ return rnd.randint(1, dim+1, size=n)
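# Hedged sketch (editorial) tying the helpers together: draw a random 2-D
# shape, then materialize dense and sparse ndarrays of that shape.
shape = rand_shape_2d(50, 50)
dns = rand_ndarray(shape, 'default')
rsp = rand_ndarray(shape, 'row_sparse', density=0.2)
assert dns.shape == rsp.shape == shape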
+
+
def np_reduce(dat, axis, keepdims, numpy_reduce_func):
"""Compatible reduce for old version of NumPy.
@@ -316,7 +495,8 @@ def _parse_location(sym, location, ctx):
% (str(set(sym.list_arguments())), str(set(location.keys()))))
else:
location = {k: v for k, v in zip(sym.list_arguments(), location)}
- location = {k: mx.nd.array(v, ctx=ctx) for k, v in location.items()}
+ location = {k: mx.nd.array(v, ctx=ctx) if isinstance(v, np.ndarray) \
+ else v for k, v in location.items()}
return location
@@ -437,7 +617,8 @@ def numeric_grad(executor, location, aux_states=None, eps=1e-4, use_forward_trai
def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rtol=1e-2,
- atol=None, grad_nodes=None, use_forward_train=True, ctx=None):
+ atol=None, grad_nodes=None, use_forward_train=True, ctx=None,
+ grad_stype_dict=None):
"""Verify an operation by checking backward pass via finite difference method.
Based on Theano's `theano.gradient.verify_grad` [1]
@@ -454,7 +635,7 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto
- if type is dict of str -> numpy.ndarray
maps the name of arguments to the corresponding numpy.ndarray.
*In either case, value of all the arguments must be provided.*
- aux_states : ist or tuple or dict, optional
+ aux_states : list or tuple or dict, optional
The auxiliary states required when generating the executor for the symbol.
numeric_eps : float, optional
Delta for the finite difference method that approximates the gradient.
@@ -466,6 +647,8 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto
Whether to use is_train=True when computing the finite-difference.
ctx : Context, optional
Check the gradient computation on the specified device.
+ grad_stype_dict : dict of str->str, optional
+ Storage type dictionary for gradient ndarrays.
References
---------
..[1] https://github.com/Theano/Theano/blob/master/theano/gradient.py
@@ -489,7 +672,7 @@ def random_projection(shape):
location_npy = {k:v.asnumpy() for k, v in location.items()}
aux_states = _parse_aux_states(sym=sym, aux_states=aux_states, ctx=ctx)
if aux_states is not None:
- aux_states_npy = {k:v.asnumpy() for k, v in aux_states.items()}
+ aux_states_npy = {k: v.asnumpy() for k, v in aux_states.items()}
else:
aux_states_npy = None
if grad_nodes is None:
@@ -516,6 +699,14 @@ def random_projection(shape):
+ [("__random_proj", _rng.normal(0, 0.01, size=out_shape[0]))])
args_grad = {k: mx.nd.array(v, ctx=ctx) for k, v in args_grad_npy.items()}
+ if grad_stype_dict is not None:
+ assert isinstance(grad_stype_dict, dict), "grad_stype_dict must be a dict"
+ for k, v in grad_stype_dict.items():
+ if k in args_grad and v in _STORAGE_TYPE_STR_TO_ID and v != 'default':
+ # create an uninitialized sparse ndarray for executor
+ # if the symbolic grad is expected to be zero, it should not be initialized at all
+ args_grad[k] = mx.nd.zeros(args_grad[k].shape, args_grad[k].context,
+ args_grad[k].dtype, v)
executor = out.bind(ctx, grad_req=grad_req,
args=location, args_grad=args_grad, aux_states=aux_states)
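# Hedged sketch (editorial): request a row_sparse gradient buffer while
# numerically checking a backward pass; execution falls back to dense
# internally if the operator has no sparse gradient kernel ('x' is a
# hypothetical argument name).
import numpy as np
import mxnet as mx
x = mx.sym.Variable('x')
check_numeric_gradient(mx.sym.abs(x), [np.random.uniform(-1, 1, (3, 4))],
                       grad_stype_dict={'x': 'row_sparse'})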
@@ -607,15 +798,15 @@ def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None,
g[:] = 0
executor.forward(is_train=False)
- outputs = [x.asnumpy() for x in executor.outputs]
+ outputs = [x.asnumpy() for x in executor.outputs]
for output_name, expect, output in zip(sym.list_outputs(), expected, outputs):
assert_almost_equal(expect, output, rtol, atol,
("EXPECTED_%s"%output_name, "FORWARD_%s"%output_name))
def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=None,
- aux_states=None, grad_req='write', ctx=None):
+ aux_states=None, grad_req='write', ctx=None, grad_stypes=None):
"""Compares a symbol's backward results with the expected ones.
Prints error messages if the backward results are not the same as the expected results.
@@ -651,6 +842,8 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=
Gradient requirements. 'write', 'add' or 'null'.
ctx : Context, optional
Running context.
+ grad_stypes: dict of str->str
+ dictionary mapping argument names to storage types for the gradients
Example
-------
@@ -676,14 +869,24 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=
if isinstance(expected, (list, tuple)):
expected = {k:v for k, v in zip(sym.list_arguments(), expected)}
args_grad_npy = {k:_rng.normal(size=v.shape) for k, v in expected.items()}
- args_grad_data = {k: mx.nd.array(v, ctx=ctx) for k, v in args_grad_npy.items()}
+ args_grad_data = {}
+ for k, v in args_grad_npy.items():
+ nd = mx.nd.array(v, ctx=ctx)
+ if grad_stypes is not None and k in grad_stypes:
+ out = nd.tostype(grad_stypes[k])
+ args_grad_data[k] = out
+ else:
+ args_grad_data[k] = nd
+
if isinstance(grad_req, str):
grad_req = {k:grad_req for k in sym.list_arguments()}
elif isinstance(grad_req, (list, tuple)):
grad_req = {k:v for k, v in zip(sym.list_arguments(), grad_req)}
- executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, aux_states=aux_states)
+ executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data,
+ aux_states=aux_states, grad_req=grad_req)
executor.forward(is_train=True)
+
if isinstance(out_grads, (tuple, list)):
out_grads = [mx.nd.array(v, ctx=ctx) for v in out_grads]
elif isinstance(out_grads, (dict)):
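# Hedged companion sketch (editorial) for grad_stypes: store the gradient of
# 'x' (hypothetical name) in a row_sparse buffer and compare it against the
# expected dense values.
import numpy as np
import mxnet as mx
g = np.ones((2, 3))
check_symbolic_backward(mx.sym.identity(mx.sym.var('x')), [np.ones((2, 3))],
                        out_grads=[g], expected=[g],
                        grad_stypes={'x': 'row_sparse'})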
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 93458d21ac5a..0fe3fe3e302e 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -172,6 +172,39 @@ int MXNDArrayCreateEx(const mx_uint *shape,
API_END();
}
+int MXNDArrayCreateSparseEx(int storage_type,
+ const mx_uint *shape,
+ mx_uint ndim,
+ int dev_type,
+ int dev_id,
+ int delay_alloc,
+ int dtype,
+ mx_uint num_aux,
+ int *aux_type,
+ mx_uint *aux_ndims,
+ const mx_uint *aux_shape,
+ NDArrayHandle *out) {
+ API_BEGIN();
+ std::vector<int> aux_types;
+ std::vector<TShape> aux_shapes;
+ auto shape_start = aux_shape;
+ for (size_t i = 0; i < num_aux; i++) {
+ // types
+ aux_types.push_back(aux_type[i]);
+ // shapes
+ aux_shapes.emplace_back(shape_start, shape_start + aux_ndims[i]);
+ shape_start += aux_ndims[i];
+ }
+ *out = new NDArray(
+ NDArrayStorageType(storage_type),
+ TShape(shape, shape + ndim),
+ Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id),
+ delay_alloc != 0,
+ dtype, aux_types, aux_shapes);
+ API_END();
+}
+
+
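# Hedged front-end sketch (editorial): mx.nd.csr_matrix is the Python entry
# point expected to reach MXNDArrayCreateSparseEx, which allocates the aux
# arrays (indptr, indices) alongside the data blob.
import mxnet as mx
a = mx.nd.csr_matrix([1, 2, 3], [0, 1, 2, 3], [0, 2, 1], (3, 4))
print(a.stype)  # 'csr'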
int MXNDArrayLoadFromRawBytes(const void *buf,
size_t size,
NDArrayHandle *out) {
@@ -215,6 +248,23 @@ int MXNDArraySyncCopyToCPU(NDArrayHandle handle,
API_END();
}
+/*!
+ * \brief Copy src.data() to dst.data() if i = -1, else dst.aux_data(i) if i >= 0
+ * This function blocks. Do not use it in performance critical code.
+ * \param handle_dst handle of a dst ndarray whose data/aux_data has been allocated
+ * \param handle_src handle of a src ndarray which has default storage type
+ * \param i dst data blob indicator
+ */
+int MXNDArraySyncCopyFromNDArray(NDArrayHandle handle_dst,
+ const NDArrayHandle handle_src,
+ const int i) {
+ API_BEGIN();
+ NDArray* dst = static_cast<NDArray*>(handle_dst);
+ NDArray* src = static_cast<NDArray*>(handle_src);
+ dst->SyncCopyFromNDArray(*src, -1, i);
+ API_END();
+}
+
int MXNDArrayWaitToRead(NDArrayHandle handle) {
API_BEGIN();
static_cast<NDArray*>(handle)->WaitToRead();
@@ -351,6 +401,18 @@ MXNET_DLL int MXNDArrayReshape(NDArrayHandle handle,
API_END_HANDLE_ERROR(delete ptr);
}
+int MXNDArrayGetStorageType(NDArrayHandle handle,
+ int *out_storage_type) {
+ API_BEGIN();
+ NDArray *arr = static_cast<NDArray*>(handle);
+ if (!arr->is_none()) {
+ *out_storage_type = arr->storage_type();
+ } else {
+ *out_storage_type = kUndefinedStorage;
+ }
+ API_END();
+}
+
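# Hedged sketch (editorial): the Python `stype` property is expected to call
# MXNDArrayGetStorageType and map the returned id back to a string.
import mxnet as mx
assert mx.nd.zeros((2, 2)).tostype('csr').stype == 'csr'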
int MXNDArrayGetShape(NDArrayHandle handle,
mx_uint *out_dim,
const mx_uint **out_pdata) {
@@ -400,6 +462,42 @@ int MXNDArrayGetDType(NDArrayHandle handle,
API_END();
}
+int MXNDArrayGetAuxType(NDArrayHandle handle,
+ mx_uint i,
+ int *out_type) {
+ API_BEGIN();
+ NDArray *arr = static_cast<NDArray*>(handle);
+ *out_type = arr->aux_type(i);
+ API_END();
+}
+
+/*!
+ * \brief Get a deep copy of the ith aux data blob
+ * in the form of an NDArray of default storage type.
+ * This function blocks. Do not use it in performance critical code.
+ */
+int MXNDArrayGetAuxNDArray(NDArrayHandle handle,
+ mx_uint i,
+ NDArrayHandle *out) {
+ API_BEGIN();
+ NDArray *arr = static_cast<NDArray*>(handle);
+ *out = new NDArray(arr->aux_ndarray(i));
+ API_END();
+}
+
+/*!
+ * \brief Get a deep copy of the data blob
+ * in the form of an NDArray of default storage type.
+ * This function blocks. Do not use it in performance critical code.
+ */
+int MXNDArrayGetDataNDArray(NDArrayHandle handle,
+ NDArrayHandle *out) {
+ API_BEGIN();
+ NDArray *arr = static_cast<NDArray*>(handle);
+ *out = new NDArray(arr->data_ndarray());
+ API_END();
+}
+
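# Hedged sketch (editorial): on the Python side these surface as deep-copy
# accessors of a sparse ndarray's pieces; both calls block, per the comments.
import mxnet as mx
csr = mx.nd.csr_matrix([1.0], [0, 1, 1], [0], (2, 2))
print(csr.data.asnumpy(), csr.indices.asnumpy(), csr.indptr.asnumpy())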
int MXNDArrayGetContext(NDArrayHandle handle,
int *out_dev_type,
int *out_dev_id) {
@@ -735,6 +833,24 @@ int MXKVStorePullEx(KVStoreHandle handle,
API_END();
}
+int MXKVStorePullRowSparse(KVStoreHandle handle,
+ mx_uint num,
+ const char** keys,
+ NDArrayHandle* vals,
+ const NDArrayHandle* row_ids,
+ int priority) {
+ API_BEGIN();
+ std::vector<std::string> v_keys(num);
+ std::vector<std::pair<NDArray*, NDArray>> v_val_rowids(num);
+ for (mx_uint i = 0; i < num; ++i) {
+ v_keys[i] = keys[i];
+ v_val_rowids[i] = std::make_pair(static_cast<NDArray*>(vals[i]),
+ *static_cast<NDArray*>(row_ids[i]));
+ }
+ static_cast<KVStore*>(handle)->PullRowSparse(v_keys, v_val_rowids, priority);
+ API_END();
+}
+
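# Hedged sketch (editorial): the Python-side row_sparse_pull (assumed method
# name) builds the (key, value, row_id) pairs that this C API unpacks,
# pulling only the rows listed in row_ids.
import mxnet as mx
kv = mx.kv.create('local')
kv.init('w', mx.nd.ones((4, 2)).tostype('row_sparse'))
out = mx.nd.zeros((4, 2), stype='row_sparse')
kv.row_sparse_pull('w', out=out, row_ids=mx.nd.array([0, 2]))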
int MXKVStoreSetUpdater(KVStoreHandle handle,
MXKVStoreUpdater updater,
void* updater_handle) {
diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h
index 846b53973b07..fee3f03f6db0 100644
--- a/src/c_api/c_api_common.h
+++ b/src/c_api/c_api_common.h
@@ -76,6 +76,8 @@ struct MXAPIThreadLocalEntry {
std::vector<TShape> arg_shapes, out_shapes, aux_shapes;
/*! \brief result holder for returning type flags */
std::vector<int> arg_types, out_types, aux_types;
+ /*! \brief result holder for returning storage types */
+ std::vector<int> arg_storage_types, out_storage_types, aux_storage_types;
/*! \brief result holder for returning shape dimensions */
std::vector<mx_uint> arg_shape_ndim, out_shape_ndim, aux_shape_ndim;
/*! \brief result holder for returning shape pointer */
diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc
index a4c48e426879..631c1a7d93eb 100644
--- a/src/c_api/c_api_executor.cc
+++ b/src/c_api/c_api_executor.cc
@@ -198,6 +198,9 @@ int MXExecutorBindEX(SymbolHandle symbol_handle,
* \param num_provided_arg_dtypes number of user provided in_arg and axu_state dtypes
* \param provided_arg_dtype_names argument name list of provided dtypes
* \param provided_arg_dtypes data of provided dtypes
+ * \param num_provided_arg_stypes number of user provided in_arg and axu_state storage types
+ * \param provided_arg_stype_names argument name list of provided storage types
+ * \param provided_arg_stypes data of provided storage types
* \param num_shared_arg_names number of parameter names passed from _bind_ith_exec
* \param shared_arg_name_list parameter name list passed from _bind_ith_exec
* \param shared_buffer_len number of shared data arrays passed from _bind_ith_exec
@@ -230,6 +233,9 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle,
const mx_uint num_provided_arg_dtypes,
const char** provided_arg_dtype_names,
const int* provided_arg_dtypes,
+ const mx_uint num_provided_arg_stypes,
+ const char** provided_arg_stype_names,
+ const int* provided_arg_stypes,
const mx_uint num_shared_arg_names,
const char** shared_arg_name_list,
int* shared_buffer_len,
@@ -254,7 +260,7 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle,
// attr_dict for setting up type_dict and arg/aux ctx
std::unordered_map<std::string, std::unordered_map<std::string, std::string>> attr_dict;
- if (nullptr == provided_arg_dtypes || nullptr != g2c_keys) {
+ if (nullptr == provided_arg_dtypes || nullptr != g2c_keys || nullptr == provided_arg_stypes) {
std::vector<std::tuple<std::string, std::string, std::string>> attrs =
sym->ListAttrsRecursive();
attr_dict.reserve(attrs.size());
@@ -280,6 +286,23 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle,
}
}
+ // setup arg_stype_map
+ std::unordered_map<std::string, int> arg_stype_map;
+ if (nullptr == provided_arg_stypes) { // use attr_dict
+ for (const auto& arg_name : in_arg_names) {
+ const auto it = attr_dict.find(arg_name);
+ if (it == attr_dict.end() || !it->second.count("__storage_type__")) {
+ arg_stype_map[arg_name] = kDefaultStorage;
+ }
+ }
+ } else { // use user-provided stype_dict
+ // create stype map for in_args and aux_states
+ arg_stype_map.reserve(num_provided_arg_stypes);
+ for (mx_uint i = 0; i < num_provided_arg_stypes; ++i) {
+ arg_stype_map[provided_arg_stype_names[i]] = provided_arg_stypes[i];
+ }
+ }
+
// create default ctx
Context ctx = Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id);
// create ctx map
@@ -420,9 +443,10 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle,
std::vector aux_state_vec;
*out = Executor::SimpleBind(*sym, ctx, ctx_map, in_arg_ctx_vec, arg_grad_ctx_vec,
- aux_state_ctx_vec, arg_shape_map, arg_dtype_map, grad_req_type_vec,
- shared_arg_name_set, &in_arg_vec, &arg_grad_vec, &aux_state_vec,
- use_shared_buffer? &shared_buffer_map : nullptr,
+ aux_state_ctx_vec, arg_shape_map, arg_dtype_map, arg_stype_map,
+ grad_req_type_vec, shared_arg_name_set, &in_arg_vec,
+ &arg_grad_vec, &aux_state_vec,
+ use_shared_buffer ? &shared_buffer_map : nullptr,
reinterpret_cast(shared_exec_handle));
// copy ndarray ptrs to ret->handles so that front end
diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc
index 3202f55abea7..d392baf45d3e 100644
--- a/src/c_api/c_api_ndarray.cc
+++ b/src/c_api/c_api_ndarray.cc
@@ -18,7 +18,8 @@
*/
/*!
- * \file c_api_symbolic.cc
+ * Copyright (c) 2016 by Contributors
+ * \file c_api_ndarray.cc
* \brief C API of mxnet
*/
@@ -150,14 +151,17 @@ void SetContext(Context* p_ctx,
#endif // MXNET_USE_CUDA
}
+// Set the shape, dtype and storage type
void SetShapeType(const nnvm::Op* op,
const nnvm::NodeAttrs& attrs,
const Context& ctx,
const std::vector<NDArray>& ndinputs,
- std::vector<NDArray>* p_ndoutputs) {
+ std::vector<NDArray>* p_ndoutputs,
+ int* dispatch_stype) {
std::vector<NDArray>& ndoutputs = *p_ndoutputs;
static auto& infershape = nnvm::Op::GetAttr<nnvm::FInferShape>("FInferShape");
static auto& infertype = nnvm::Op::GetAttr<nnvm::FInferType>("FInferType");
+ static auto& inferstorage = nnvm::Op::GetAttr<FInferStorageType>("FInferStorageType");
MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
// infer shape
std::vector<TShape>& in_shapes = ret->arg_shapes;
@@ -193,9 +197,35 @@ void SetShapeType(const nnvm::Op* op,
CHECK(infertype[op](attrs, &in_types, &out_types));
CHECK_EQ(out_types.size(), ndoutputs.size());
+ // infer storage type
+ auto& in_storage_types = ret->arg_storage_types;
+ auto& out_storage_types = ret->out_storage_types;
+ in_storage_types.clear();
+ out_storage_types.clear();
+ for (auto& i : ndinputs) {
+ in_storage_types.push_back(i.storage_type());
+ }
+ for (auto& i : ndoutputs) {
+ out_storage_types.push_back(i.storage_type());
+ }
+ if (inferstorage.count(op)) {
+ CHECK(inferstorage[op](attrs, ctx, &in_storage_types, &out_storage_types));
+ CHECK_EQ(out_storage_types.size(), ndoutputs.size());
+ }
+
+ bool contains_non_default = common::ContainsNonDefaultStorage(in_storage_types);
+ contains_non_default |= common::ContainsNonDefaultStorage(out_storage_types);
+ int kNonDefaultStorage = -2;
+ *dispatch_stype = contains_non_default ? kNonDefaultStorage : kDefaultStorage;
for (size_t i = 0; i < ndoutputs.size(); ++i) {
+ NDArrayStorageType storage_type = static_cast<NDArrayStorageType>(out_storage_types[i]);
if (ndoutputs[i].is_none()) {
- ndoutputs[i] = NDArray(out_shapes[i], ctx, true, out_types[i]);
+ // if failed to infer the storage type, assume the output storage is dense
+ if (storage_type == kDefaultStorage || out_storage_types[i] == kUndefinedStorage) {
+ ndoutputs[i] = NDArray(out_shapes[i], ctx, true, out_types[i]);
+ } else {
+ ndoutputs[i] = NDArray(storage_type, out_shapes[i], ctx, true, out_types[i]);
+ }
} else {
CHECK_EQ(ndoutputs[i].shape(), out_shapes[i])
<< i << "th output has invalid shape. "
@@ -212,7 +242,7 @@ void SetShapeType(const nnvm::Op* op,
void SetDependency(std::vector<engine::VarHandle> *p_read_vars,
std::vector<engine::VarHandle> *p_write_vars,
std::vector<Resource> *p_requested,
- std::vector<uint32_t> *p_auxidx,
+ std::vector<uint32_t> *p_mutate_idx,
const nnvm::Op* op,
const nnvm::NodeAttrs& attrs,
const Context& ctx,
@@ -224,7 +254,7 @@ void SetDependency(std::vector *p_read_vars,
std::vector<engine::VarHandle>& read_vars = *p_read_vars;
std::vector<engine::VarHandle>& write_vars = *p_write_vars;
std::vector<Resource>& requested = *p_requested;
- std::vector<uint32_t>& auxidx = *p_auxidx;
+ std::vector<uint32_t>& mutate_idx = *p_mutate_idx;
if (tmp_resource.count(op)) {
int ntmp = 0;
@@ -250,15 +280,30 @@ void SetDependency(std::vector *p_read_vars,
write_vars.push_back(i.var());
}
if (mutate.count(op)) {
- auxidx = mutate[op](attrs);
- std::sort(auxidx.begin(), auxidx.end());
- for (auto & i : auxidx) {
+ mutate_idx = mutate[op](attrs);
+ std::sort(mutate_idx.begin(), mutate_idx.end());
+ for (auto & i : mutate_idx) {
write_vars.push_back(ndinputs[i].var());
}
}
Engine::Get()->DeduplicateVarHandle(&read_vars, &write_vars);
}
+inline void SetWriteInplaceReq(const std::vector<NDArray> &ndinputs,
+ const std::vector<NDArray> &ndoutputs,
+ std::vector<OpReqType> *req) {
+ std::unordered_set<engine::VarHandle> in_vars;
+ for (auto &nd : ndinputs) {
+ in_vars.insert(nd.var());
+ }
+ for (size_t i = 0; i < ndoutputs.size(); i++) {
+ // output NDArray shares the memory with the input NDArray
+ if (in_vars.find(ndoutputs[i].var()) != in_vars.end()) {
+ req->at(i) = kWriteInplace;
+ }
+ }
+}
+
void PushFCompute(const FCompute& fn,
const nnvm::Op* op,
const nnvm::NodeAttrs& attrs,
@@ -267,24 +312,75 @@ void PushFCompute(const FCompute& fn,
const std::vector<engine::VarHandle>& write_vars,
const std::vector<Resource>& requested,
const std::vector<NDArray>& ndinputs,
- const std::vector<NDArray>& ndoutputs) {
+ const std::vector<NDArray>& ndoutputs,
+ const std::vector<uint32_t>& mutate_idx) {
+ using namespace common;
bool is_train = AutogradRuntime::Get()->IsTraining();
Engine::Get()->PushAsync(
- [ctx, attrs, fn, ndinputs, ndoutputs, requested, is_train](
+ [ctx, attrs, fn, ndinputs, ndoutputs, requested, is_train, mutate_idx](
RunContext rctx,
engine::CallbackOnComplete on_complete) {
std::vector<TBlob> input_blobs, output_blobs;
- for (auto& i : ndinputs) {
- input_blobs.push_back(i.data());
- }
- for (auto& i : ndoutputs) {
- output_blobs.push_back(i.data());
+ // pre-fcompute and post-fcompute storage fallback src NDArrays and dst NDArrays
+ std::vector<NDArray> pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src;
+ // mapping from index in input_blobs to index in pre_temp_dst
+ std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
+ // populate input blobs and output blobs
+ SetupDefaultBlobs(ndinputs, &input_blobs, &pre_temp_src, &pre_temp_dst, &in_temp_idx_map);
+ SetupDefaultBlobs(ndoutputs, &output_blobs, &post_temp_dst, &post_temp_src);
+ // add mutable inputs to post temp list
+ for (const auto idx : mutate_idx) {
+ auto map_iter = in_temp_idx_map.find(idx);
+ if (map_iter != in_temp_idx_map.end()) {
+ post_temp_src.push_back(pre_temp_dst[map_iter->second]);
+ post_temp_dst.push_back(ndinputs[idx]);
+ }
}
OpContext opctx{is_train, rctx,
engine::CallbackOnComplete(),
requested};
std::vector<OpReqType> req(output_blobs.size(), kWriteTo);
- fn(attrs, opctx, input_blobs, req, output_blobs);
+ if (ctx.dev_mask() == gpu::kDevMask) {
+#if MXNET_USE_CUDA
+ CastNonDefaultStorage<gpu>(pre_temp_src, pre_temp_dst, opctx);
+ fn(attrs, opctx, input_blobs, req, output_blobs);
+ // cast to original storage type, if necessary
+ CastNonDefaultStorage<gpu>(post_temp_src, post_temp_dst, opctx);
+ rctx.get_stream<gpu>()->Wait();
+#else
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+#endif
+ } else {
+ CastNonDefaultStorage<cpu>(pre_temp_src, pre_temp_dst, opctx);
+ fn(attrs, opctx, input_blobs, req, output_blobs);
+ // cast to original storage type, if necessary
+ CastNonDefaultStorage<cpu>(post_temp_src, post_temp_dst, opctx);
+ }
+ on_complete();
+ }, ctx, read_vars, write_vars, FnProperty::kNormal,
+ 0, PROFILER_MESSAGE(op->name.c_str()));
+}
+
+void PushFComputeEx(const FComputeEx& fn,
+ const nnvm::Op* op,
+ const nnvm::NodeAttrs& attrs,
+ const Context& ctx,
+ const std::vector<engine::VarHandle>& read_vars,
+ const std::vector<engine::VarHandle>& write_vars,
+ const std::vector<Resource>& requested,
+ const std::vector<NDArray>& ndinputs,
+ const std::vector<NDArray>& ndoutputs) {
+ Engine::Get()->PushAsync(
+ [ctx, attrs, fn, ndinputs, ndoutputs, requested](
+ RunContext rctx,
+ engine::CallbackOnComplete on_complete) {
+ std::vector<TBlob> input_blobs, output_blobs;
+ OpContext opctx{false, rctx,
+ engine::CallbackOnComplete(),
+ requested};
+ std::vector<OpReqType> req(ndoutputs.size(), kWriteTo);
+ SetWriteInplaceReq(ndinputs, ndoutputs, &req);
+ fn(attrs, opctx, ndinputs, req, ndoutputs);
if (ctx.dev_mask() == gpu::kDevMask) {
rctx.get_stream<gpu>()->Wait();
}
@@ -301,7 +397,9 @@ void PushOperator(const OpStatePtr& state,
const std::vector<engine::VarHandle>& write_vars,
const std::vector<Resource>& requested,
const std::vector<NDArray>& ndinputs,
- const std::vector<NDArray>& ndoutputs) {
+ const std::vector<NDArray>& ndoutputs,
+ const std::vector<uint32_t>& mutate_idx) {
+ using namespace common;
static auto& fexec_type = nnvm::Op::GetAttr<FExecType>("FExecType");
bool is_train = AutogradRuntime::Get()->IsTraining();
@@ -314,15 +412,40 @@ void PushOperator(const OpStatePtr& state,
if (fcompute != nullptr) {
CHECK(exec_type == ExecType::kSync || exec_type == ExecType::kAsync);
Engine::Get()->PushAsync(
- [state, fcompute, ndinputs, ndoutputs, requested, is_train, exec_type](
+ [state, fcompute, ndinputs, ndoutputs, requested, is_train, exec_type, mutate_idx](
RunContext rctx,
engine::CallbackOnComplete on_complete) {
OpContext opctx{is_train, rctx, on_complete, requested};
+
std::vector<TBlob> input_blobs, output_blobs;
- for (const auto& i : ndinputs) input_blobs.push_back(i.data());
- for (const auto& i : ndoutputs) output_blobs.push_back(i.data());
+ // pre-fcompute and post-fcompute storage fallback src NDArrays and dst NDArrays
+ std::vector<NDArray> pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src;
+ // mapping from index in input_blobs to index in pre_temp_dst
+ std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
+ // populate input blobs and output blobs
+ SetupDefaultBlobs(ndinputs, &input_blobs, &pre_temp_src, &pre_temp_dst, &in_temp_idx_map);
+ SetupDefaultBlobs(ndoutputs, &output_blobs, &post_temp_dst, &post_temp_src);
+ // add mutable inputs to post temp list
+ for (const auto idx : mutate_idx) {
+ if (in_temp_idx_map.find(idx) != in_temp_idx_map.end()) {
+ post_temp_src.push_back(pre_temp_dst[in_temp_idx_map[idx]]);
+ post_temp_dst.push_back(ndinputs[idx]);
+ }
+ }
std::vector<OpReqType> req(output_blobs.size(), kWriteTo);
- fcompute(state, opctx, input_blobs, req, output_blobs);
+ if (rctx.get_ctx().dev_mask() == gpu::kDevMask) {
+#if MXNET_USE_CUDA
+ CastNonDefaultStorage<gpu>(pre_temp_src, pre_temp_dst, opctx);
+ fcompute(state, opctx, input_blobs, req, output_blobs);
+ CastNonDefaultStorage<gpu>(post_temp_src, post_temp_dst, opctx);
+#else
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+#endif
+ } else {
+ CastNonDefaultStorage<cpu>(pre_temp_src, pre_temp_dst, opctx);
+ fcompute(state, opctx, input_blobs, req, output_blobs);
+ CastNonDefaultStorage<cpu>(post_temp_src, post_temp_dst, opctx);
+ }
if (exec_type == ExecType::kSync) {
if (rctx.get_ctx().dev_mask() == gpu::kDevMask) {
rctx.get_stream<gpu>()->Wait();
@@ -342,6 +465,7 @@ void PushOperator(const OpStatePtr& state,
engine::CallbackOnComplete on_complete) {
OpContext opctx{is_train, rctx, on_complete, requested};
std::vector<OpReqType> req(ndoutputs.size(), kWriteTo);
+ SetWriteInplaceReq(ndinputs, ndoutputs, &req);
fcompute_ex(state, opctx, ndinputs, req, ndoutputs);
if (exec_type == ExecType::kSync) {
if (rctx.get_ctx().dev_mask() == gpu::kDevMask) {
@@ -363,8 +487,6 @@ void ImperativeInvokeImpl(const Context& default_ctx,
const nnvm::NodeAttrs& attrs,
std::vector* p_ndinputs,
std::vector* p_ndoutputs) {
- static auto& fcpu = nnvm::Op::GetAttr<FCompute>("FCompute<cpu>");
- static auto& fgpu = nnvm::Op::GetAttr<FCompute>("FCompute<gpu>");
static auto& ndfunc = nnvm::Op::GetAttr<FNDArrayFunction>("FNDArrayFunction");
static auto& createop = nnvm::Op::GetAttr<FCreateOpState>("FCreateOpState");
MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
@@ -379,29 +501,32 @@ void ImperativeInvokeImpl(const Context& default_ctx,
} else {
// TODO(piiswrong): infer ctx
Context ctx;
+ int stype;
SetContext(&ctx, attrs, ndinputs, ndoutputs, default_ctx);
- SetShapeType(op, attrs, ctx, ndinputs, &ndoutputs);
+ SetShapeType(op, attrs, ctx, ndinputs, &ndoutputs, &stype);
std::vector<engine::VarHandle> read_vars, write_vars;
std::vector<Resource> requested;
- std::vector<uint32_t> auxidx;
- SetDependency(&read_vars, &write_vars, &requested, &auxidx,
+ std::vector<uint32_t> mutate_idx;
+ SetDependency(&read_vars, &write_vars, &requested, &mutate_idx,
op, attrs, ctx, ndinputs, ndoutputs);
- FCompute fn;
- if (ctx.dev_mask() == cpu::kDevMask && fcpu.count(op)) {
- fn = fcpu[op];
- } else if (ctx.dev_mask() == gpu::kDevMask && fgpu.count(op)) {
- fn = fgpu[op];
- }
-
- if (fn) {
+ FCompute fn = common::GetFCompute<FCompute>(op, "FCompute", ctx);
+ FComputeEx fn_ex = common::GetFCompute<FComputeEx>(op, "FComputeEx", ctx);
+ if (fn_ex && stype != kDefaultStorage) {
if (AutogradRuntime::Get()->IsRecording()) {
AutogradRuntime::Get()->RecordImperativeFCompute(op,
attrs, &ndinputs, &ndoutputs);
}
- PushFCompute(fn, op, attrs, ctx, read_vars, write_vars,
+ PushFComputeEx(fn_ex, op, attrs, ctx, read_vars, write_vars,
requested, ndinputs, ndoutputs);
+ } else if (fn) {
+ if (AutogradRuntime::Get()->IsRecording()) {
+ AutogradRuntime::Get()->RecordImperativeFCompute(op,
+ attrs, &ndinputs, &ndoutputs);
+ }
+ PushFCompute(fn, op, attrs, ctx, read_vars, write_vars,
+ requested, ndinputs, ndoutputs, mutate_idx);
} else if (createop.count(op)) {
auto state =
createop[op](attrs, ctx, ret->arg_shapes, ret->arg_types);
@@ -411,7 +536,7 @@ void ImperativeInvokeImpl(const Context& default_ctx,
}
write_vars.push_back(state.get_var());
PushOperator(state, op, attrs, ctx, read_vars, write_vars,
- requested, ndinputs, ndoutputs);
+ requested, ndinputs, ndoutputs, mutate_idx);
} else {
LOG(FATAL)
<< "Operator " << op->name << " is not implemented for "
@@ -461,6 +586,28 @@ int MXImperativeInvoke(AtomicSymbolCreator creator,
API_END();
}
+int MXImperativeInvokeEx(AtomicSymbolCreator creator,
+ int num_inputs,
+ NDArrayHandle *inputs,
+ int *num_outputs,
+ NDArrayHandle **outputs,
+ int num_params,
+ const char **param_keys,
+ const char **param_vals,
+ const int **out_stypes) { // outputs storage types
+ API_BEGIN();
+ MXImperativeInvoke(creator, num_inputs, inputs, num_outputs, outputs,
+ num_params, param_keys, param_vals);
+ MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
+ NDArray** output_nds = reinterpret_cast<NDArray**>(*outputs);
+ ret->out_types.resize(*num_outputs);
+ for (int i = 0; i < *num_outputs; ++i) {
+ ret->out_types[i] = output_nds[i]->storage_type();
+ }
+ *out_stypes = dmlc::BeginPtr(ret->out_types);
+ API_END();
+}
+
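# Hedged sketch (editorial) of the dispatch rule above: when an FComputeEx
# kernel exists and any input/output has non-default storage, the sparse
# kernel runs and the result keeps its storage type; MXImperativeInvokeEx
# then reports the output stypes back to the frontend.
import mxnet as mx
a = mx.nd.ones((4, 2)).tostype('row_sparse')
b = mx.nd.ones((4, 2)).tostype('row_sparse')
c = mx.nd.elemwise_add(a, b)  # assuming a sparse FComputeEx is registered
print(c.stype)                # expected: 'row_sparse'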
int MXCreateCachedOp(SymbolHandle handle,
CachedOpHandle *out) {
nnvm::Symbol* sym = static_cast<nnvm::Symbol*>(handle);
@@ -540,6 +687,24 @@ int MXInvokeCachedOp(CachedOpHandle handle,
API_END();
}
+int MXInvokeCachedOpEx(CachedOpHandle handle,
+ int num_inputs,
+ NDArrayHandle *inputs,
+ int *num_outputs,
+ NDArrayHandle **outputs,
+ const int **out_stypes) { // outputs storage types
+ API_BEGIN();
+ MXInvokeCachedOp(handle, num_inputs, inputs, num_outputs, outputs);
+ MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
+ NDArray** output_nds = reinterpret_cast<NDArray**>(*outputs);
+ ret->out_types.resize(*num_outputs);
+ for (int i = 0; i < *num_outputs; ++i) {
+ ret->out_types[i] = output_nds[i]->storage_type();
+ }
+ *out_stypes = dmlc::BeginPtr(ret->out_types);
+ API_END();
+}
+
int MXAutogradIsTraining(bool* curr) {
API_BEGIN();
*curr = AutogradRuntime::Get()->IsTraining();
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index e2c29b888ada..d526aea0d35f 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -29,6 +29,7 @@
#include
#include "./c_api_common.h"
#include "../operator/operator_common.h"
+#include "../executor/exec_pass.h"
namespace mxnet {
namespace op {
@@ -459,7 +460,7 @@ int MXSymbolInferShape(SymbolHandle sym,
}
try {
- g = nnvm::pass::InferShape(std::move(g), arg_shapes, "__shape__");
+ g = mxnet::exec::InferShape(std::move(g), arg_shapes, "__shape__");
} catch (const mxnet::op::InferShapeError &err) {
throw dmlc::Error(err.msg);
}
@@ -544,7 +545,7 @@ int MXSymbolInferType(SymbolHandle sym,
mxnet::MatchArguments(g.indexed_graph(), kwargs, &arg_types, "InferType");
}
- g = nnvm::pass::InferType(std::move(g), arg_types, "__dtype__");
+ g = mxnet::exec::InferType(std::move(g), arg_types, "__dtype__");
// copy back
CopyAttr(g.indexed_graph(), g.GetAttr<nnvm::DTypeVector>("dtype"),
&(ret->arg_types), &(ret->out_types), &(ret->aux_types));
diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc
index 5ca01492800e..dda4fda1ed8f 100644
--- a/src/c_api/c_predict_api.cc
+++ b/src/c_api/c_predict_api.cc
@@ -32,6 +32,7 @@
#include
#include "./c_api_common.h"
#include "../operator/operator_common.h"
+#include "../executor/exec_pass.h"
using namespace mxnet;
@@ -194,7 +195,7 @@ int MXPredCreatePartialOut(const char* symbol_json_str,
}
}
nnvm::Graph g; g.outputs = sym.outputs;
- g = nnvm::pass::InferShape(std::move(g), in_shapes, "__shape__");
+ g = mxnet::exec::InferShape(std::move(g), in_shapes, "__shape__");
bool infer_complete = (g.GetAttr<size_t>("shape_num_unknown_nodes") == 0);
CHECK(infer_complete)
<< "The shape information is not enough to get the shapes";
diff --git a/src/common/utils.cc b/src/common/utils.cc
new file mode 100644
index 000000000000..125e4e5dc7d7
--- /dev/null
+++ b/src/common/utils.cc
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file utils.cc
+ * \brief cpu implementation of util functions
+ */
+
+#include "./utils.h"
+#include "../operator/tensor/cast_storage-inl.h"
+
+namespace mxnet {
+namespace common {
+
+template<>
+void CastStorageDispatch<cpu>(const OpContext& ctx,
+ const NDArray& input,
+ const NDArray& output) {
+ mxnet::op::CastStorageComputeImpl<cpu>(ctx, input, output);
+}
+
+} // namespace common
+} // namespace mxnet
diff --git a/src/common/utils.cu b/src/common/utils.cu
new file mode 100644
index 000000000000..093480a98907
--- /dev/null
+++ b/src/common/utils.cu
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file utils.cu
+ * \brief gpu implementation of util functions
+ */
+
+#include "./utils.h"
+#include "../operator/tensor/cast_storage-inl.h"
+
+namespace mxnet {
+namespace common {
+
+template<>
+void CastStorageDispatch<gpu>(const OpContext& ctx,
+ const NDArray& input,
+ const NDArray& output) {
+ mxnet::op::CastStorageComputeImpl<gpu>(ctx, input, output);
+}
+
+} // namespace common
+} // namespace mxnet
diff --git a/src/common/utils.h b/src/common/utils.h
index 85e30970f1a0..92631a9b5c34 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -24,7 +24,14 @@
#ifndef MXNET_COMMON_UTILS_H_
#define MXNET_COMMON_UTILS_H_
-#if DMLC_USE_CXX11
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
#include
#include
#include
@@ -33,15 +40,100 @@
#include
#include
#include
-#endif // DMLC_USE_CXX11
-
-#include
-#include
+#include
namespace mxnet {
namespace common {
-#if DMLC_USE_CXX11
+template<typename xpu>
+void CastStorageDispatch(const OpContext& ctx, const NDArray& input, const NDArray& output);
+
+/*
+ * \brief setup default-storage tblobs from source NDArrays. If any source NDArray has non-default
+ * storage, it creates a temp NDArray with default storage and uses the temp tblob. The
+ * function also records the indices of non-default source NDArrays and the indices of
+ * their corresponding temporary NDArrays in the temp array.
+ * \param src list of source NDArray
+ * \param blobs list of tblobs to return
+ * \param temp_src list of source NDArrays which require a temporary default storage representation
+ * \param temp_dst list of temporary destination NDArrays for default storage representation
+ * \param idx_map mapping from indices in source NDArrays to indices in temp_dst. When not set,
+ indices are not recorded
+ * \return true if any source NDArray needs a storage cast
+ */
+inline bool SetupDefaultBlobs(const std::vector<NDArray>& src,
+ std::vector<TBlob> *blobs,
+ std::vector<NDArray> *temp_src,
+ std::vector<NDArray> *temp_dst,
+ std::unordered_map<uint32_t, uint32_t> *idx_map = nullptr) {
+ bool require_cast = false;
+ for (size_t i = 0; i < src.size(); i++) {
+ auto& nd = src[i];
+ if (nd.storage_type() != kDefaultStorage) {
+ if (idx_map != nullptr) {
+ (*idx_map)[i] = temp_dst->size();
+ }
+ NDArray temp(nd.shape(), nd.ctx(), false, nd.dtype());
+ temp_src->emplace_back(nd);
+ temp_dst->emplace_back(temp);
+ blobs->emplace_back(temp.data());
+ require_cast = true;
+ } else {
+ blobs->push_back(nd.data());
+ }
+ }
+ return require_cast;
+}
+
+/*
+ * \brief cast the NDArrays in `src` and store the result in NDArrays in `dst`.
+ * This is only used for storage fallback in executor.
+ * When storage_fallback is false, and `MXNET_EXEC_STORAGE_FALLBACK` == 0,
+ * storage fallback is disallowed.
+ * \param src list of source NDArray to cast
+ * \param dst list of destination NDArray which hold the result of the cast_storage operation
+ * \param ctx operator context for cast_storage operation
+ * \param storage_fallback whether storage_fallback is allowed. When set to false,
+ * its value depends on `MXNET_EXEC_STORAGE_FALLBACK`.
+ */
+template<typename xpu>
+inline void CastNonDefaultStorage(const std::vector<NDArray>& src,
+ const std::vector<NDArray>& dst,
+ const OpContext& ctx,
+ bool storage_fallback = false) {
+ CHECK_GE(dst.size(), src.size());
+ if (src.size() == 0) return;
+ if (storage_fallback == false) {
+ storage_fallback = dmlc::GetEnv("MXNET_EXEC_STORAGE_FALLBACK", true);
+ }
+ if (storage_fallback == false) {
+ LOG(FATAL) << "Storage type conversion detected during execution. "
+ << "You are probably executing an operator which "
+ << "doesn't support NDArray inputs with non-default storage.";
+ }
+ for (size_t i = 0; i < src.size(); i++) {
+ CastStorageDispatch<xpu>(ctx, src[i], dst[i]);
+ }
+}
+
+// Check if any storage type is not default storage
+inline bool ContainsNonDefaultStorage(const StorageTypeVector& vstorage) {
+ for (const auto& i : vstorage) {
+ if (i != kUndefinedStorage && i != kDefaultStorage) return true;
+ }
+ return false;
+}
+
+// Check if any NDArray in the list has default storage
+inline bool ContainsDefaultStorage(const std::vector<NDArray>& ndarrays) {
+ for (const auto &nd : ndarrays) {
+ if (nd.storage_type() == kDefaultStorage) {
+ return true;
+ }
+ }
+ return false;
+}
+
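# Hedged Python-level illustration (editorial) of the fallback implemented by
# SetupDefaultBlobs + CastNonDefaultStorage: an operator with only a dense
# FCompute still accepts sparse inputs via temporary dense copies; setting
# MXNET_EXEC_STORAGE_FALLBACK=0 turns such a conversion into a hard error.
import mxnet as mx
csr = mx.nd.ones((2, 3)).tostype('csr')
out = mx.nd.sin(csr)   # assuming sin has no sparse kernel at this point
print(out.stype)       # expected: 'default' after fallback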
// heuristic to determine number of threads per GPU
inline int GetNumThreadPerGPU() {
// This is resource efficient option.
@@ -56,6 +148,67 @@ inline int GetExecNumMatchColor() {
return std::min(num_match_color, GetNumThreadPerGPU());
}
+template<typename T, typename V>
+V ParallelAccumulate(const T* a, const int n, V start) {
+ V sum = start;
+#pragma omp parallel for reduction(+:sum)
+ for (int i = 0; i < n; ++i) {
+ sum += a[i];
+ }
+ return sum;
+}
+
+/*!
+ * \brief
+ * Helper function for ParallelSort.
+ * DO NOT call this function directly.
+ * Use the interface ParallelSort instead.
+ * Ref: https://github.com/dmlc/difacto/blob/master/src/common/parallel_sort.h
+ */
+template<typename RandomIt, typename Compare>
+void ParallelSortHelper(RandomIt first, size_t len,
+ size_t grainsize, const Compare& comp) {
+ if (len < grainsize) {
+ std::sort(first, first+len, comp);
+ } else {
+ std::thread thr(ParallelSortHelper<RandomIt, Compare>, first, len/2, grainsize, comp);
+ ParallelSortHelper(first+len/2, len - len/2, grainsize, comp);
+ thr.join();
+ std::inplace_merge(first, first+len/2, first+len, comp);
+ }
+}
+
+/*!
+ * \brief
+ * Sort the elements in the range [first, last) into the ascending order defined by
+ * the comparator comp.
+ * If the length of the range [first, last) is greater than a certain threshold,
+ * the range will be recursively divided into two and assign two threads
+ * to sort each half range.
+ * Ref: https://github.com/dmlc/difacto/blob/master/src/common/parallel_sort.h
+ */
+template<typename RandomIt, typename Compare>
+void ParallelSort(RandomIt first, RandomIt last, size_t num_threads, Compare comp) {
+ const auto num = std::distance(first, last);
+ size_t grainsize = std::max(num / num_threads + 5, static_cast<size_t>(1024*16));
+ ParallelSortHelper(first, num, grainsize, comp);
+}
+
+/*!
+ * \brief
+ * Sort the elements in the range [first, last) into ascending order.
+ * The elements are compared using the default < operator.
+ * If the length of the range [first, last) is greater than a certain threshold,
+ * the range will be recursively divided into two and assign two threads
+ * to sort each half range.
+ * Ref: https://github.com/dmlc/difacto/blob/master/src/common/parallel_sort.h
+ */
+template<typename RandomIt>
+void ParallelSort(RandomIt first, RandomIt last, size_t num_threads) {
+ ParallelSort(first, last, num_threads,
+ std::less<typename std::iterator_traits<RandomIt>::value_type>());
+}
+
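# Hedged Python rendering (editorial) of the divide-and-merge strategy used
# by ParallelSort/ParallelSortHelper above: sort one half in a spawned
# thread, the other half in the current one, then merge the two runs.
import heapq
import threading

def parallel_sort(arr, grainsize=16 * 1024):
    if len(arr) < grainsize:
        arr.sort()
        return arr
    mid = len(arr) // 2
    left, right = arr[:mid], arr[mid:]
    t = threading.Thread(target=parallel_sort, args=(left, grainsize))
    t.start()
    parallel_sort(right, grainsize)
    t.join()
    arr[:] = heapq.merge(left, right)  # stand-in for std::inplace_merge
    return arr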
/*!
* \brief Random Engine
*/
@@ -159,8 +312,6 @@ FCompType GetFCompute(const nnvm::Op* op, const std::string& name,
}
}
-#endif // DMLC_USE_CXX11
-
} // namespace common
} // namespace mxnet
#endif // MXNET_COMMON_UTILS_H_
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index 046460b85900..ed8cbac68ae0 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -24,6 +24,7 @@
#include
#include
#include
+#include
#include
#include "../common/utils.h"
#include "./exec_pass.h"
@@ -40,29 +41,87 @@ const OperatorProperty* OpPropGetOpProperty(const NodeAttrs& attrs);
namespace exec {
-// forward executor
-class StatefulComputeExecutor : public OpExecutor {
+// abstract OpExecutor which provides storage fallback procedure on
+// non-default inputs and outputs
+// FComputeExecutor and FStatefulComputeExecutor inherit from this class
+class StorageFallbackOpExecutor : public OpExecutor {
public:
- void Run(RunContext rctx) override {
+ explicit StorageFallbackOpExecutor(const std::vector<uint32_t> &mutate_idx)
+ : mutate_idx_(mutate_idx) {}
+
+ void Setup() override {
+ using namespace common;
+ in_data_.clear(); out_data_.clear();
+ pre_temp_src_.clear(); pre_temp_dst_.clear();
+ post_temp_src_.clear(); post_temp_dst_.clear();
+ in_temp_idx_map_.clear();
+ SetupDefaultBlobs(in_array, &in_data_, &pre_temp_src_, &pre_temp_dst_, &in_temp_idx_map_);
+ SetupDefaultBlobs(out_array, &out_data_, &post_temp_dst_, &post_temp_src_);
+ for (const auto idx : mutate_idx_) {
+ auto map_iter = in_temp_idx_map_.find(idx);
+ if (map_iter != in_temp_idx_map_.end()) {
+ post_temp_src_.push_back(pre_temp_dst_[map_iter->second]);
+ post_temp_dst_.push_back(in_array[idx]);
+ }
+ }
+ }
+
+ protected:
+ // storage fallback before fcompute is launched
+ void PreFCompute(bool is_gpu) {
+ using namespace common;
+ if (is_gpu) {
+#if MXNET_USE_CUDA
+ CastNonDefaultStorage<gpu>(pre_temp_src_, pre_temp_dst_, op_ctx);
+#else
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+#endif
+ } else {
+ CastNonDefaultStorage<cpu>(pre_temp_src_, pre_temp_dst_, op_ctx);
+ }
+ }
+
+ // storage fallback after fcompute is completed
+ void PostFCompute(bool is_gpu) {
+ using namespace common;
+ if (is_gpu) {
+#if MXNET_USE_CUDA
+ CastNonDefaultStorage<gpu>(post_temp_src_, post_temp_dst_, op_ctx);
+#else
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+#endif
+ } else {
+ CastNonDefaultStorage<cpu>(post_temp_src_, post_temp_dst_, op_ctx);
+ }
+ }
+
+ // default storage tensor blobs for fcompute
+ std::vector<TBlob> in_data_, out_data_;
+ // source NDArray for cast storage
+ std::vector<NDArray> pre_temp_src_, post_temp_src_;
+ // destination NDArray for cast storage
+ std::vector<NDArray> pre_temp_dst_, post_temp_dst_;
+ // mapping from index in input_blobs to index in pre_temp_dst
+ std::unordered_map<uint32_t, uint32_t>