
[MXNET-432] Add Foreach #11531

Merged: 135 commits, merged on Jul 2, 2018

Commits (135)
e86d92c
Test input a graph.
zheng-da Mar 20, 2018
7eacfab
Update foreach to execute the subgraph.
zheng-da Mar 21, 2018
6874f7d
print inputs/outputs in foreach.
zheng-da Mar 21, 2018
8f5e62e
Remove print.
zheng-da Mar 23, 2018
1dd35a8
add test code for foreach.
zheng-da Mar 23, 2018
1f117cb
exec foreach outside the engine.
zheng-da Apr 5, 2018
cc29fc1
Implements forward of foreach.
zheng-da Apr 5, 2018
036aada
Add support for variable numbers of inputs and outputs.
zheng-da Apr 6, 2018
84e0e24
Add a python wrapper for foreach.
zheng-da Apr 6, 2018
f2a28f0
Fix the order of inputs.
zheng-da Apr 6, 2018
1c4cf0e
hide C version of foreach.
zheng-da Apr 7, 2018
9aa896d
fix a bug temporarily.
zheng-da Apr 7, 2018
6784408
add test with lstm.
zheng-da Apr 6, 2018
f488647
Test free variables.
zheng-da Apr 7, 2018
d9b0c50
change for the new interface of InputGraph attribute.
zheng-da Apr 9, 2018
1188d97
Add attribute to the subgraph.
zheng-da Apr 11, 2018
c0cd6ac
Handle free variables.
zheng-da Apr 11, 2018
74d280b
Get all input symbols of a subgraph.
zheng-da Apr 13, 2018
2bc80e3
Fix shape, dtype and storage inference.
zheng-da Apr 13, 2018
68faa17
reorganize the output of foreach.
zheng-da Apr 14, 2018
3751ca7
Add a gluon RNN unroll with symbol foreach.
zheng-da Apr 14, 2018
b98e06d
remove unnecessary print.
zheng-da Apr 16, 2018
fc575fe
have imperative and symbolic foreach.
zheng-da Apr 16, 2018
37da6fb
Fix an error after moving foreach.
zheng-da Apr 18, 2018
f41235c
Fix imperative foreach
zheng-da Apr 18, 2018
214c1c2
Fix a minor problem.
zheng-da Apr 24, 2018
9aabc74
Use CachedOp to execute subgraph.
zheng-da Apr 30, 2018
7fc0155
update TODO.
zheng-da May 1, 2018
0d3613a
make foreach op use FStatefulComputeEx.
zheng-da May 1, 2018
f33d0f4
Add backward.
zheng-da May 2, 2018
d82dd30
Fix bugs.
zheng-da May 4, 2018
868c9f2
enable backward test in lstm.
zheng-da May 4, 2018
84e1877
Fix a bug in foreach backward for free variables.
zheng-da May 7, 2018
00b8b1c
change for the new CachedOp.
zheng-da May 9, 2018
f2e324e
Detect the backward computation.
zheng-da May 9, 2018
e1322d1
Fix bugs in foreach.
zheng-da May 9, 2018
98955a4
fix tests.
zheng-da May 10, 2018
d8c9b1f
update tests.
zheng-da May 11, 2018
32e3b17
check state shape.
zheng-da May 12, 2018
8caf708
enable nested foreach.
zheng-da May 14, 2018
d4ef381
remove print.
zheng-da May 16, 2018
4270032
fix a bug in test.
zheng-da May 17, 2018
b54f234
handle infer storage type for backward.
zheng-da May 18, 2018
14d319b
address comments.
zheng-da May 18, 2018
0e666a9
address comments.
zheng-da May 18, 2018
255c478
move some common functions out.
zheng-da May 18, 2018
2beb3f3
address comments.
zheng-da May 18, 2018
716bc6a
fix lint.
zheng-da May 18, 2018
dd5f862
Fix lint.
zheng-da May 18, 2018
b60157a
add doc.
zheng-da May 19, 2018
57b2ba5
undo modification in imperative.h
zheng-da May 19, 2018
7c49057
add doc and remove example code.
zheng-da May 19, 2018
1045908
fix lint.
zheng-da May 19, 2018
e8ec3aa
fix lint.
zheng-da May 19, 2018
e4f5808
Fix lint.
zheng-da May 19, 2018
c078cbf
make nd.foreach and sym.foreach consistent.
zheng-da May 21, 2018
b965e6e
fix compile error.
zheng-da May 21, 2018
57fcb84
address comments.
zheng-da May 21, 2018
c03c56f
update.
zheng-da May 21, 2018
224f3e2
check for loop only works for dense arrays.
zheng-da May 22, 2018
cd67c6f
move control flow op out of nn/
zheng-da May 22, 2018
742ef40
fix include.
zheng-da May 22, 2018
4492949
add a test in gluon.
zheng-da May 22, 2018
26e3e7e
small fix.
zheng-da May 22, 2018
6bff448
remove subgraph_name
zheng-da May 22, 2018
1e4cd45
create loop state for reuse in the future.
zheng-da May 22, 2018
7079e73
work for GPU.
zheng-da May 22, 2018
64f4362
Fix tests.
zheng-da May 29, 2018
31d9112
Fix bugs caused by ctypes (#29)
junrushao1994 May 30, 2018
601edbe
Add save/load json in testcases for foreach (#30)
junrushao1994 Jun 1, 2018
f4da935
support subgraph in stateful executor.
zheng-da Jun 4, 2018
90b7829
Fix compilation.
zheng-da Jun 4, 2018
ae3ea22
move code.
zheng-da May 22, 2018
0db16f0
Revert "remove subgraph_name"
zheng-da May 23, 2018
5f626ae
cut graph.
zheng-da May 25, 2018
f2c428f
rename new var nodes.
zheng-da May 26, 2018
2a69257
fix a bug when a subgraph has variable nodes.
zheng-da Jun 8, 2018
efeedd6
Fix a bug of getting symbols.
zheng-da Jun 8, 2018
28fe469
copy var nodes.
zheng-da Jun 8, 2018
cf91c59
Fix getting op states.
zheng-da Jun 13, 2018
f2edf2a
fix lint error.
zheng-da Jun 13, 2018
a35899f
address comments.
zheng-da Jun 13, 2018
8c6aca0
fix lint error.
zheng-da Jun 13, 2018
ccaf388
simplify the execution of subgraph in the main thread.
zheng-da Jun 13, 2018
25cf8ac
fix lint error.
zheng-da Jun 13, 2018
51de14c
avoid waiting for computation in each iteration.
zheng-da Jun 13, 2018
3eb0bc1
reuse cached op for inference.
zheng-da Jun 14, 2018
4a0ff21
share memory across mini-batches.
zheng-da Jun 14, 2018
8766cb2
reuse memory.
zheng-da Jun 14, 2018
97b9074
add tests for multiple batches.
zheng-da Jun 14, 2018
e38b7f4
remove entry.
zheng-da Jun 15, 2018
198bcfb
add benchmark for foreach.
zheng-da Jun 16, 2018
811acb3
benchmark large batch size.
zheng-da Jun 16, 2018
24fa83b
Fix the benchmark for GPU.
zheng-da Jun 17, 2018
550e48a
address comments.
zheng-da Jun 17, 2018
97d0332
update shape/dtype/storage inference.
zheng-da Jun 17, 2018
0b0a36e
update contrib API docs.
zheng-da Jun 17, 2018
f1ff55d
support nested foreach.
zheng-da Jun 18, 2018
156f1c8
use a single CachedOp for all iterations.
zheng-da Jun 19, 2018
871fd3b
use large dim.
zheng-da Jun 19, 2018
b5dfc3f
update benchmark.
zheng-da Jun 19, 2018
202a74c
update benchmark.
zheng-da Jun 19, 2018
0606c3c
update benchmark.
zheng-da Jun 19, 2018
6019de5
update benchmark.
zheng-da Jun 19, 2018
045186d
return symbol arrays correctly in MXSymbolCutSubgraph.
zheng-da Jun 20, 2018
484309e
return symbol arrays in MXSymbolGetInputSymbols.
zheng-da Jun 20, 2018
0ebd5e5
fix lint error.
zheng-da Jun 20, 2018
a9e253d
use cachedop to infer storage in backward.
zheng-da Jun 21, 2018
1f8469f
fix scala API.
zheng-da Jun 21, 2018
25e15a0
update comments.
zheng-da Jun 21, 2018
ff4eea0
fix scala.
zheng-da Jun 21, 2018
b8aa62a
fix test.
zheng-da Jun 21, 2018
64e4ff6
fix attribute name.
zheng-da Jun 21, 2018
d243c12
move benchmark.
zheng-da Jun 21, 2018
3afc4d4
fix the mapping of operator inputs/outputs and subgraph inputs/outputs.
zheng-da Jun 22, 2018
62901fe
add tests for dtype/shape inference.
zheng-da Jun 23, 2018
14b8fb9
reorganize tests.
zheng-da Jun 23, 2018
f7d7f17
fix a bug of cutting NodeEntry.
zheng-da Jun 23, 2018
7d012d9
fix lint error.
zheng-da Jun 23, 2018
b83253d
handle the case that outputs are inputs.
zheng-da Jun 24, 2018
275bbf1
handle the case that inputs aren't used.
zheng-da Jun 24, 2018
0e6df9a
handle the case without output data.
zheng-da Jun 25, 2018
dfadc8d
fix a bug in foreach backward.
zheng-da Jun 25, 2018
5e9cf5f
fix a bug when there isn't output data.
zheng-da Jun 25, 2018
696f53c
Fix lint error.
zheng-da Jun 26, 2018
094977d
test diff Gluon RNN cells.
zheng-da Jun 26, 2018
7b016ae
test all symbol RNN cells.
zheng-da Jun 26, 2018
9609ce8
adjust the test precision.
zheng-da Jun 26, 2018
fa8abbd
Fix a bug in getting a list of variable names.
zheng-da Jun 26, 2018
8e74d80
fix lint error.
zheng-da Jun 26, 2018
2439105
Test 1D array.
zheng-da Jun 27, 2018
53cdbfa
fix a bug when subgraph inputs and outputs share NDArray.
zheng-da Jun 27, 2018
9bff317
fix.
zheng-da Jun 28, 2018
d3687ef
fix
zheng-da Jun 28, 2018
392a7e4
add comments.
zheng-da Jul 2, 2018
189 changes: 189 additions & 0 deletions benchmark/python/control_flow/rnn.py
@@ -0,0 +1,189 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import subprocess
import mxnet as mx
from mxnet import gluon
import time
import copy

def get_gpus():
    """
    return a list of GPUs
    """
    try:
        re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True)
    except OSError:
        return []
    return range(len([i for i in re.split('\n') if 'GPU' in i]))

class TestRNNLayer(gluon.HybridBlock):
    def __init__(self, cell, prefix=None, params=None):
        super(TestRNNLayer, self).__init__(prefix=prefix, params=params)
        self.cell = cell

    def hybrid_forward(self, F, inputs, states):
        out, states = F.contrib.foreach(self.cell, inputs, states)
        return out

def benchmark_rnn(cell, rnn_data, states):
    ctx = rnn_data.context
    num_batches = 20

    # Imperative cell in an imperative layer
    cell0 = copy.deepcopy(cell)
    layer0 = TestRNNLayer(cell0)
    layer0.initialize(ctx=ctx)

    # Hybridized cell in an imperative layer
    cell1 = copy.deepcopy(cell)
    cell1.hybridize()
    layer1 = TestRNNLayer(cell1)
    layer1.initialize(ctx=ctx)

    # Hybridized layer (run once so it can be exported as a symbol later)
    cell2 = copy.deepcopy(cell)
    layer2 = TestRNNLayer(cell2)
    layer2.initialize(ctx=ctx)
    layer2.hybridize()
    layer2(rnn_data, states)

    # Hybridized cell with static memory allocation
    cell3 = copy.deepcopy(cell)
    cell3.hybridize(static_alloc=True)
    layer3 = TestRNNLayer(cell3)
    layer3.initialize(ctx=ctx)

    tic = time.time()
    for i in range(num_batches):
        res0 = layer0(rnn_data, states)
        mx.nd.waitall()
    print("Imperative inference takes " + str(time.time() - tic))

    tic = time.time()
    for i in range(num_batches):
        res1 = layer1(rnn_data, states)
        mx.nd.waitall()
    print("Hybrid-cell inference takes " + str(time.time() - tic))

    tic = time.time()
    for i in range(num_batches):
        res3 = layer3(rnn_data, states)
        mx.nd.waitall()
    print("Static-hybrid-cell inference takes " + str(time.time() - tic))

    tic = time.time()
    for i in range(num_batches):
        res2 = layer2(rnn_data, states)
        mx.nd.waitall()
    print("Hybrid inference takes " + str(time.time() - tic))

    layer2.export("foreach_rnn")
    symnet = mx.symbol.load('foreach_rnn-symbol.json')
    args1 = {}
    params = layer2.collect_params()
    for key in params.keys():
        args1[key] = params[key].data()
    args1['data0'] = rnn_data
    for i in range(len(states)):
        args1['data' + str(i + 1)] = states[i]
    exe = symnet.bind(ctx=ctx, args=args1)
    tic = time.time()
    for i in range(num_batches):
        exe.forward(is_train=False)
        mx.nd.waitall()
    print("Symbol inference takes " + str(time.time() - tic))

    tic = time.time()
    for i in range(num_batches):
        with mx.autograd.record():
            res0 = layer0(rnn_data, states)
        res0.backward()
        mx.nd.waitall()
    print("Imperative training takes " + str(time.time() - tic))

    tic = time.time()
    for i in range(num_batches):
        with mx.autograd.record():
            res1 = layer1(rnn_data, states)
        res1.backward()
        mx.nd.waitall()
    print("Hybrid-cell training takes " + str(time.time() - tic))

    tic = time.time()
    for i in range(num_batches):
        with mx.autograd.record():
            res3 = layer3(rnn_data, states)
        res3.backward()
        mx.nd.waitall()
    print("Static-hybrid-cell training takes " + str(time.time() - tic))

    tic = time.time()
    for i in range(num_batches):
        with mx.autograd.record():
            res2 = layer2(rnn_data, states)
        res2.backward()
        mx.nd.waitall()
    print("Hybrid training takes " + str(time.time() - tic))

    # gradients for the backward of the foreach symbol
    args_grad1 = {}
    for key in args1.keys():
        args_grad1[key] = mx.nd.empty(args1[key].shape, ctx=ctx)
    exe = symnet.bind(ctx=ctx, args=args1, args_grad=args_grad1)
    tic = time.time()
    for i in range(num_batches):
        exe.forward(is_train=True)
        exe.backward(res2)
        mx.nd.waitall()
    print("Symbol training takes " + str(time.time() - tic))
    print("")

if __name__ == '__main__':
    ndim = 512
    seq_len = 100
    batch_sizes = [1, 32]
    cells = [gluon.rnn.GRUCell(ndim, prefix='rnn_'),
             gluon.rnn.LSTMCell(ndim, prefix='rnn_')]
    ctxs = [mx.cpu(0), mx.gpu(0)]
    for cell in cells:
        for ctx in ctxs:
            for batch_size in batch_sizes:
                if len(get_gpus()) == 0 and ctx == mx.gpu(0):
                    continue

                if isinstance(cell, gluon.rnn.GRUCell):
                    rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, ndim),
                                            ctx=mx.cpu(0))
                    states = []
                    states.append(mx.nd.normal(loc=0, scale=1, shape=(batch_size, ndim),
                                               ctx=mx.cpu(0)))
                elif isinstance(cell, gluon.rnn.LSTMCell):
                    rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, ndim),
                                            ctx=mx.cpu(0))
                    states = []
                    states.append(mx.nd.normal(loc=0, scale=1, shape=(batch_size, ndim),
                                               ctx=mx.cpu(0)))
                    states.append(mx.nd.normal(loc=0, scale=1, shape=(batch_size, ndim),
                                               ctx=mx.cpu(0)))
                if ctx == mx.gpu(0):
                    dev = "GPU"
                else:
                    dev = "CPU"
                print("Benchmark {} in {} (batch size: {})".format(cell._alias(), dev,
                                                                   batch_size))
                benchmark_rnn(cell, rnn_data, states)
1 change: 1 addition & 0 deletions docs/api/python/ndarray/contrib.md
@@ -52,6 +52,7 @@ In the rest of this document, we list routines provided by the `ndarray.contrib`
fft
ifft
quantize
foreach
```

## API Reference
1 change: 1 addition & 0 deletions docs/api/python/symbol/contrib.md
@@ -52,6 +52,7 @@ In the rest of this document, we list routines provided by the `symbol.contrib`
fft
ifft
quantize
foreach
```

## API Reference
22 changes: 22 additions & 0 deletions include/mxnet/c_api.h
@@ -1051,6 +1051,28 @@ MXNET_DLL int MXSymbolListAtomicSymbolCreators(mx_uint *out_size,
*/
MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator,
const char **name);

/*!
* \brief Get the input symbols of the graph.
* \param sym The graph.
* \param inputs The input symbols of the graph.
* \param input_size the number of input symbols returned.
*/
MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **inputs,
                                      int *input_size);

/*!
* \brief Cut a subgraph whose nodes are marked with a subgraph attribute.
* The input graph will be modified. A variable node will be created for each
* edge that connects to nodes outside the subgraph. The outside nodes that
* connect to the subgraph will be returned.
* \param sym The graph.
* \param inputs The nodes that connect to the subgraph.
* \param input_size The number of such nodes.
*/
MXNET_DLL int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle **inputs,
                                  int *input_size);

/*!
* \brief Get the detailed information about atomic symbol.
* \param creator the AtomicSymbolCreator.
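For orientation, the two new entry points above are ordinary C API calls, so they can be reached from Python through ctypes like the rest of libmxnet. Below is a minimal sketch of such a wrapper; the function name `get_input_symbols` and the assumption that the returned handles can be wrapped directly into `Symbol` objects are mine, not part of this PR.

```python
import ctypes
import mxnet as mx
from mxnet.base import _LIB, SymbolHandle, check_call

def get_input_symbols(sym):
    """Hypothetical ctypes wrapper around MXSymbolGetInputSymbols (declared above)."""
    handles = ctypes.POINTER(SymbolHandle)()   # receives SymbolHandle **inputs
    size = ctypes.c_int(0)                     # receives int *input_size
    check_call(_LIB.MXSymbolGetInputSymbols(
        sym.handle, ctypes.byref(handles), ctypes.byref(size)))
    # Assumes each returned handle can be wrapped into a Symbol object.
    return [mx.sym.Symbol(SymbolHandle(handles[i])) for i in range(size.value)]

# Example: list the free inputs of a small graph.
a = mx.sym.var('a')
b = mx.sym.var('b')
c = mx.sym.broadcast_add(a, b)
print([s.name for s in get_input_symbols(c)])  # expected to include 'a' and 'b'
```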
11 changes: 9 additions & 2 deletions include/mxnet/op_attr_types.h
@@ -64,8 +64,10 @@ enum OpReqType {
* \sa Resource
*/
struct OpContext {
/*! \brief whether there is a backward phase to compute gradients. */
bool need_grad;
/*! \brief whether it is training phase */
int is_train;
bool is_train;
/*! \brief RunContext related resources */
RunContext run_ctx;
/*! \brief the callback when operation completes, used by asynchronize ops */
@@ -98,7 +100,12 @@ enum class ExecType {
* In current implementation, copy operator is specially handled by executor.
* This flag is used for special case treatment and future extension of different copy ops.
*/
kCrossDeviceCopy
kCrossDeviceCopy,
/*!
* \brief A subgraph execution should happen in the main thread, instead of
* in the execution engine.
*/
kSubgraphExec,
};

/*! \brief the dispatch mode of the operator */
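The new `need_grad` flag separates "a backward pass will follow" from "run in training mode" (`is_train`). At the Python level this roughly mirrors the distinction between autograd recording and `train_mode`; a small illustration, assuming the standard autograd API:

```python
import mxnet as mx

x = mx.nd.ones((2, 3))
x.attach_grad()

# Record the graph so a backward pass is possible (need_grad), while keeping
# operators such as Dropout/BatchNorm in inference behaviour (is_train false).
with mx.autograd.record(train_mode=False):
    y = (x * 2).sum()
y.backward()
print(x.grad)  # gradient of sum(2 * x) w.r.t. x, i.e. all 2s
```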
96 changes: 96 additions & 0 deletions python/mxnet/ndarray/contrib.py
@@ -21,6 +21,8 @@
import math
from ..context import current_context
from ..random import uniform
from ..base import _as_list
from . import ndarray
try:
    from .gen_contrib import *
except ImportError:
@@ -95,3 +97,97 @@ def rand_zipfian(true_classes, num_sampled, range_max, ctx=None):
expected_count_sampled = expected_prob_sampled * num_sampled
return sampled_classes, expected_count_true, expected_count_sampled
# pylint: enable=line-too-long

def foreach(body, data, init_states):
    """Run a for loop with user-defined computation over NDArrays on dimension 0.

    This operator simulates a for loop and body has the computation for an iteration
    of the for loop. It runs the computation in body on each slice from the input
    NDArrays.

    body takes two arguments as input and outputs a tuple of two elements,
    as illustrated below:

    out, states = body(data1, states)

    data1 can be either an NDArray or a list of NDArrays. If data is an NDArray,
    data1 is an NDArray. Otherwise, data1 is a list of NDArrays and has the same
    size as data. states is a list of NDArrays and has the same size as init_states.
    Similarly, out can be either an NDArray or a list of NDArrays, which are concatenated
    as the first output of foreach; states from the last execution of body
    are the second output of foreach.

    The computation done by this operator is equivalent to the pseudo code below
    when the input data is an NDArray:

    states = init_states
    outs = []
    for i in range(data.shape[0]):
        s = data[i]
        out, states = body(s, states)
        outs.append(out)
    outs = stack(*outs)


    Parameters
    ----------
    body : a Python function.
        Define computation in an iteration.
    data: an NDArray or a list of NDArrays.
        The input data.
    init_states: an NDArray or a list of NDArrays.
        The initial values of the loop states.

    Returns
    -------
    outputs: an NDArray or a list of NDArrays.
        The output data concatenated from the output of all iterations.
    states: a list of NDArrays.
        The loop states in the last iteration.

    Examples
    --------
    >>> step = lambda data, states: (data + states[0], [states[0] * 2])
    >>> data = mx.nd.random.uniform(shape=(2, 10))
    >>> states = [mx.nd.random.uniform(shape=(10))]
    >>> outs, states = mx.nd.contrib.foreach(step, data, states)
    """

    def check_input(inputs, in_type, msg):
        is_NDArray_or_list = True
        if isinstance(inputs, list):
            for i in inputs:
                if not isinstance(i, in_type):
                    is_NDArray_or_list = False
                    break
        else:
            is_NDArray_or_list = isinstance(inputs, in_type)
        assert is_NDArray_or_list, msg

    check_input(data, ndarray.NDArray, "data should be an NDArray or a list of NDArrays")
    check_input(init_states, ndarray.NDArray,
                "init_states should be an NDArray or a list of NDArrays")

    not_data_list = isinstance(data, ndarray.NDArray)
    num_iters = data.shape[0] if not_data_list else data[0].shape[0]
    states = init_states
    outputs = []
    for i in range(num_iters):
        # Take the i-th slice from each input array and run one iteration of body.
        if not_data_list:
            eles = data[i]
        else:
            eles = [d[i] for d in data]
        outs, states = body(eles, states)
        outs = _as_list(outs)
        outputs.append(outs)
    # Stack the per-iteration outputs along dimension 0.
    outputs = zip(*outputs)
    tmp_outputs = []
    for out in outputs:
        tmp_outputs.append(ndarray.op.stack(*out))
    outputs = tmp_outputs

    if not_data_list and len(outputs) == 1:
        outputs = outputs[0]
    return (outputs, states)
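As a quick sanity check of the semantics described in the docstring, the imperative foreach can be compared against the equivalent hand-written loop. The step function and shapes below are illustrative only:

```python
import mxnet as mx

# step adds the current state to the slice and doubles the state.
step = lambda data, states: (data + states[0], [states[0] * 2])

data = mx.nd.random.uniform(shape=(4, 10))
init_states = [mx.nd.ones((10,))]

outs, last_states = mx.nd.contrib.foreach(step, data, init_states)

# Reference loop, mirroring the pseudo code in the docstring above.
ref_states = init_states
ref_outs = []
for i in range(data.shape[0]):
    out, ref_states = step(data[i], ref_states)
    ref_outs.append(out)
ref_outs = mx.nd.stack(*ref_outs)

assert mx.nd.abs(outs - ref_outs).sum().asscalar() < 1e-6
assert mx.nd.abs(last_states[0] - ref_states[0]).sum().asscalar() < 1e-6
```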