diff --git a/docs/architecture/note_memory.md b/docs/architecture/note_memory.md
index 6b752bdc4f6a..8ec77c395f31 100644
--- a/docs/architecture/note_memory.md
+++ b/docs/architecture/note_memory.md
@@ -312,18 +312,9 @@ that are already optimized for big operations,
 you can reduce memory consumption roughly *by half*.
 You can reduce memory usage even more
 if you are optimizing a fine-grained computation network
-used by symbolic libraries, such as Theano.
-
-Most of the ideas in this article inspired the design of _MXNet_.
-We've also provided a [Memory Cost Estimation Script](https://github.com/dmlc/mxnet/tree/master/example/memcost),
-which you can use to see how much memory you need under different scenarios.
-
-The script has an option called `forward_only`,
-which shows the cost of running only the forward pass.
-You will find that cost when using this option
-is extremely low compared to others.
-This is simply because there's  more memory reuse
-if you run only the forward pass.
+used by symbolic libraries, such as Theano. Most of the ideas in this article inspired the design of _MXNet_.
+
+Also, you will notice that the memory cost of forward-pass-only execution is extremely low compared to running both the forward and backward passes. This is simply because there's more memory reuse if you run only the forward pass.
 
 So here are two takeaways:
 
diff --git a/example/memcost/Makefile b/example/memcost/Makefile
deleted file mode 100644
index f6d52db597b3..000000000000
--- a/example/memcost/Makefile
+++ /dev/null
@@ -1,38 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License
-
-.PHONY: no_optimization with_inplace with_sharing with_both
-
-no_optimization:
-	@echo "Estimating the cost with no optimization..."
-	@NNVM_EXEC_ENABLE_INPLACE=false NNVM_EXEC_MATCH_RANGE=0 python inception_memcost.py
-
-with_inplace:
-	@echo "Estimating the cost with inplace optimization..."
-	@NNVM_EXEC_ENABLE_INPLACE=true MXNET_EXEC_MATCH_RANGE=0 python inception_memcost.py
-
-with_sharing:
-	@echo "Estimating the cost with memory sharing ..."
-	@NNVM_EXEC_ENABLE_INPLACE=false python inception_memcost.py
-
-with_both:
-	@echo "Estimating the cost with all optimizations ..."
-	@python inception_memcost.py
-
-forward_only:
-	@echo "Estimating the cost of forward only ..."
-	@python inception_memcost.py 'null'
diff --git a/example/memcost/README.md b/example/memcost/README.md
deleted file mode 100644
index 4c4e1fa977af..000000000000
--- a/example/memcost/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-Memory Cost of Deep Nets under Different Allocations
-====================================================
-This folder contains a script to show the memory cost of different allocation strategies,
-discussed in [Note on Memory Optimization](http://mxnet.io/architecture/note_memory.html).
-
-We use inception-bn as an example, with batch size of 32.
-
-How to See the cost
--------------------
-The possible options are gathered together in the [Makefile](Makefile).
-Type the following command to see the allocation cost. Look for the
-```Final message Total x MB allocated```
-- ```make no_optimization```
-  - Shows the cost without any optimization.
-- ```make with_inplace```
-  - Shows the cost with inplace optimization.
-- ```make with_sharing```
-  - Shows the cost with memory allocating algorithm for sharing.
-- ```make with_both```
-  - Shows the cost of memory allocation with both inplace and sharing optimization.
-- ```make forward_only```
-  - Shows the cost of when we only want to run forward pass.
-
-Notes
------
-- You can change the symbol in the [inception_memcost.py](inception_memcost.py) to the net you interested in.
-- You will need to install mxnet or type make on the root folder before use the script.
-- The estimation is only on space cost of intermediate node.
-  - The cost of temporal workspace is not estimated, so you will likely need more memory when running real nets.
-- The estimation does real allocation on CPU, the plan is the same on GPU.
diff --git a/example/memcost/inception_memcost.py b/example/memcost/inception_memcost.py
deleted file mode 100644
index c539e73b3c24..000000000000
--- a/example/memcost/inception_memcost.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: skip-file
-import sys
-sys.path.append('../../python/')
-import mxnet as mx
-import logging
-
-def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), name=None, suffix=''):
-    conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix))
-    bn = mx.symbol.BatchNorm(data=conv, name='bn_%s%s' %(name, suffix))
-    act = mx.symbol.Activation(data=bn, act_type='relu', name='relu_%s%s' %(name, suffix))
-    return act
-
-def InceptionFactoryA(data, num_1x1, num_3x3red, num_3x3, num_d3x3red, num_d3x3, pool, proj, name):
-    # 1x1
-    c1x1 = ConvFactory(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_1x1' % name))
-    # 3x3 reduce + 3x3
-    c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce')
-    c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_3x3' % name))
-    # double 3x3 reduce + double 3x3
-    cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce')
-    cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_0' % name))
-    cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_1' % name))
-    # pool + proj
-    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
-    cproj = ConvFactory(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_proj' %  name))
-    # concat
-    concat = mx.symbol.Concat(*[c1x1, c3x3, cd3x3, cproj], name='ch_concat_%s_chconcat' % name)
-    return concat
-
-def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name):
-    # 3x3 reduce + 3x3
-    c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce')
-    c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_3x3' % name))
-    # double 3x3 reduce + double 3x3
-    cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1),  name=('%s_double_3x3' % name), suffix='_reduce')
-    cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name))
-    cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name))
-    # pool + proj
-    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type="max", name=('max_pool_%s_pool' % name))
-    # concat
-    concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name)
-    return concat
-
-def inception(nhidden, grad_scale):
-    # data
-    data = mx.symbol.Variable(name="data")
-    # stage 1
-    conv1 = ConvFactory(data=data, num_filter=64, kernel=(7, 7), stride=(2, 2), pad=(3, 3), name='conv1')
-    pool1 = mx.symbol.Pooling(data=conv1, kernel=(3, 3), stride=(2, 2), name='pool1', pool_type='max')
-    # stage 2
-    conv2red = ConvFactory(data=pool1, num_filter=64, kernel=(1, 1), stride=(1, 1), name='conv2red')
-    conv2 = ConvFactory(data=conv2red, num_filter=192, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name='conv2')
-    pool2 = mx.symbol.Pooling(data=conv2, kernel=(3, 3), stride=(2, 2), name='pool2', pool_type='max')
-    # stage 2
-    in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, "avg", 32, '3a')
-    in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, "avg", 64, '3b')
-    in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, '3c')
-    # stage 3
-    in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, '4a')
-    in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, '4b')
-    in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, '4c')
-    in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, "avg", 128, '4d')
-    in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, '4e')
-    # stage 4
-    in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, '5a')
-    in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, '5b')
-    # global avg pooling
-    avg = mx.symbol.Pooling(data=in5b, kernel=(7, 7), stride=(1, 1), name="global_pool", pool_type='avg')
-    # linear classifier
-    flatten = mx.symbol.Flatten(data=avg, name='flatten')
-    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1')
-    softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax')
-    return softmax
-
-
-
-softmax = inception(1000, 1.0)
-batch_size = 32
-softmax = inception(1000, 1.0)
-
-if len(sys.argv) == 2:
-    grad_req = sys.argv[1]
-else:
-    grad_req = 'write'
-
-texec = softmax.simple_bind(ctx=mx.cpu(),
-                            data=(batch_size, 3, 224, 224),
-                            grad_req=grad_req)
-# We extract the memory cost from the execution plan
-print(texec.debug_str().split('\n')[-3])