diff --git a/Makefile b/Makefile
index d1e082d1f202..67661cb83fe0 100644
--- a/Makefile
+++ b/Makefile
@@ -116,7 +116,7 @@ $(BIN) :
 
 include tests/cpp/unittest.mk
 
-test: tests/cpp/unittest
+test: $(TEST)
 
 lint:
 	python dmlc-core/scripts/lint.py mxnet ${LINT_LANG} include src scripts python
diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh
index 07abd4881dce..71b54429dfa4 100755
--- a/scripts/travis_script.sh
+++ b/scripts/travis_script.sh
@@ -63,7 +63,9 @@ if [ ${TASK} == "cpp_unittest" ]; then
     echo "USE_CUDA=0" >> config.mk
     make test || exit -1
     export MXNET_ENGINE_TYPE=ThreadedEngine
-    tests/cpp/unittest || exit -1
+    for test in tests/cpp/*_test; do
+        ./$test || exit -1
+    done
 fi
 
 # TODO(yutian): add unittest back
diff --git a/src/engine/engine.cc b/src/engine/engine.cc
index aeef21cbcab9..eececfa91e04 100644
--- a/src/engine/engine.cc
+++ b/src/engine/engine.cc
@@ -18,7 +18,7 @@ inline Engine* CreateEngine() {
 
   Engine *ret = nullptr;
   if (stype == "NaiveEngine") {
-    ret =  CreateNaiveEngine();
+    ret = CreateNaiveEngine();
   } else if (stype == "ThreadedEngine") {
     ret = CreateThreadedEnginePooled();
   } else if (stype == "ThreadedEnginePerDevice") {
diff --git a/tests/cpp/storage_unittest.cc b/tests/cpp/storage_test.cc
similarity index 50%
rename from tests/cpp/storage_unittest.cc
rename to tests/cpp/storage_test.cc
index 20a92f4daaf5..5b9c2300a249 100644
--- a/tests/cpp/storage_unittest.cc
+++ b/tests/cpp/storage_test.cc
@@ -3,32 +3,34 @@
 #include <dmlc/logging.h>
 #include <mxnet/storage.h>
 
-TEST(Storage, basics) {
+TEST(Storage, Basic_CPU) {
   constexpr size_t kSize = 1024;
   auto&& storage = mxnet::Storage::Get();
   mxnet::Context context_cpu{};
   auto&& handle = storage->Alloc(kSize, context_cpu);
-  ASSERT_EQ(handle.ctx, context_cpu);
-  ASSERT_EQ(handle.size, kSize);
+  EXPECT_EQ(handle.ctx, context_cpu);
+  EXPECT_EQ(handle.size, kSize);
   auto ptr = handle.dptr;
   storage->Free(handle);
   handle = storage->Alloc(kSize, context_cpu);
-  ASSERT_EQ(handle.ctx, context_cpu);
-  ASSERT_EQ(handle.size, kSize);
-  ASSERT_EQ(handle.dptr, ptr);
-  LOG(INFO) << "Success on CPU!\n";
+  EXPECT_EQ(handle.ctx, context_cpu);
+  EXPECT_EQ(handle.size, kSize);
+  EXPECT_EQ(handle.dptr, ptr);
+}
 
 #if MXNET_USE_CUDA
-  mxnet::Context context_gpu{mxnet::gpu::kDevMask, 0};
-  handle = storage->Alloc(kSize, context_gpu);
+TEST(Storage, Basic_GPU) {
+  constexpr size_t kSize = 1024;
+  mxnet::Context context_gpu = mxnet::Context::GPU(0);
+  auto&& storage = mxnet::Storage::Get();
+  auto&& handle = storage->Alloc(kSize, context_gpu);
   assert(handle.ctx == context_gpu);
   assert(handle.size == kSize);
-  ptr = handle.dptr;
+  auto ptr = handle.dptr;
   storage->Free(handle);
   handle = storage->Alloc(kSize, context_gpu);
-  ASSERT_EQ(handle.ctx, context_gpu);
-  ASSERT_EQ(handle.size, kSize);
-  ASSERT_EQ(handle.dptr, ptr);
-  LOG(INFO) << "Success on GPU!\n";
-#endif  // MXNET_USE_CUDA
+  EXPECT_EQ(handle.ctx, context_gpu);
+  EXPECT_EQ(handle.size, kSize);
+  EXPECT_EQ(handle.dptr, ptr);
 }
+#endif  // MXNET_USE_CUDA
diff --git a/tests/cpp/threaded_engine_test.cc b/tests/cpp/threaded_engine_test.cc
new file mode 100644
index 000000000000..11a8b656b169
--- /dev/null
+++ b/tests/cpp/threaded_engine_test.cc
@@ -0,0 +1,231 @@
+#include <time.h>
+#include <unistd.h>
+#include <dmlc/logging.h>
+#include <cstdio>
+#include <gtest/gtest.h>
+#include <thread>
+#include <chrono>
+#include <vector>
+
+#include <mxnet/engine.h>
+#include "../src/engine/engine_impl.h"
+#include <dmlc/timer.h>
+
+/**
+ * present the following workload
+ *  n = reads.size()
+ *  data[write] = (data[reads[0]] + ... data[reads[n]]) / n
+ *  std::this_thread::sleep_for(std::chrono::microsecons(time));
+ */
+struct Workload {
+  std::vector<int> reads;
+  int write;
+  int time;
+};
+
+/**
+ * generate a list of workloads
+ */
+void GenerateWorkload(int num_workloads, int num_var,
+                      int min_read, int max_read,
+                      int min_time, int max_time,
+                      std::vector<Workload>* workloads) {
+  workloads->clear();
+  workloads->resize(num_workloads);
+  for (int i = 0; i < num_workloads; ++i) {
+    auto& wl = workloads->at(i);
+    wl.write = rand() % num_var;
+    int r = rand();
+    int num_read = min_read + (r % (max_read - min_read));
+    for (int j = 0; j < num_read; ++j) {
+      wl.reads.push_back(rand() % num_var);
+    }
+    wl.time = min_time + rand() % (max_time - min_time);
+  }
+}
+
+/**
+ * evaluate a single workload
+ */
+void EvaluateWorload(const Workload& wl, std::vector<double>* data) {
+  double tmp = 0;
+  for (int i : wl.reads) tmp += data->at(i);
+  data->at(wl.write) = tmp / (wl.reads.size() + 1);
+  if (wl.time > 0) {
+    std::this_thread::sleep_for(std::chrono::microseconds(wl.time));
+  }
+}
+
+/**
+ * evaluate a list of workload, return the time used
+ */
+double EvaluateWorloads(const std::vector<Workload>& workloads,
+                      mxnet::Engine* engine,
+                      std::vector<double>* data) {
+  using namespace mxnet;
+  double t = dmlc::GetTime();
+  std::vector<Engine::VarHandle> vars;
+  if (engine) {
+    for (size_t i = 0; i < data->size(); ++i) {
+      vars.push_back(engine->NewVariable());
+    }
+  }
+
+  for (const auto& wl : workloads) {
+    if (wl.reads.size() == 0) continue;
+    if (engine == NULL) {
+      EvaluateWorload(wl, data);
+    } else {
+      auto func = [wl,data](RunContext ctx, Engine::CallbackOnComplete cb) {
+        EvaluateWorload(wl, data); cb();
+      };
+      std::vector<Engine::VarHandle> reads;
+      for (auto i : wl.reads) {
+        if (i != wl.write) reads.push_back(vars[i]);
+      }
+      engine->PushAsync(func, Context::CPU(), reads, {vars[wl.write]});
+    }
+  }
+
+  if (engine) {
+    engine->WaitForAll();
+  }
+  return dmlc::GetTime() - t;
+}
+
+TEST(Engine, RandSumExpr) {
+  std::vector<Workload> workloads;
+  int num_repeat = 5;
+  const int num_engine = 4;
+
+  std::vector<double> t(num_engine, 0.0);
+  std::vector<mxnet::Engine*> engine(num_engine);
+
+  engine[0] = NULL;
+  engine[1] = mxnet::engine::CreateNaiveEngine();
+  engine[2] = mxnet::engine::CreateThreadedEnginePooled();
+  engine[3] = mxnet::engine::CreateThreadedEnginePerDevice();
+
+  for (int repeat = 0; repeat < num_repeat; ++repeat) {
+    srand(time(NULL) + repeat);
+    int num_var = 100;
+    GenerateWorkload(10000, num_var, 2, 20, 1, 10, &workloads);
+    std::vector<std::vector<double>> data(num_engine);
+    for (int i = 0; i < num_engine; ++i) {
+      data[i].resize(num_var, 1.0);
+      t[i] += EvaluateWorloads(workloads, engine[i], &data[i]);
+    }
+
+    for (int i = 1; i < num_engine; ++i) {
+      for (int j = 0; j < num_var; ++j) EXPECT_EQ(data[0][j], data[i][j]);
+    }
+    LOG(INFO) << "data: " << data[0][1] << " " << data[0][2] << "...";
+  }
+
+
+  LOG(INFO) << "baseline\t\t"  << t[0] << " sec";
+  LOG(INFO) << "NaiveEngine\t\t"  << t[1] << " sec";
+  LOG(INFO) << "ThreadedEnginePooled\t" << t[2] << " sec";
+  LOG(INFO) << "ThreadedEnginePerDevice\t" << t[3] << " sec";
+}
+
+void Foo(mxnet::RunContext, int i) { printf("The fox says %d\n", i); }
+
+TEST(Engine, basics) {
+  auto&& engine = mxnet::Engine::Get();
+  auto&& var = engine->NewVariable();
+  std::vector<mxnet::Engine::OprHandle> oprs;
+
+  // Test #1
+  printf("============= Test #1 ==============\n");
+  for (int i = 0; i < 10; ++i) {
+    oprs.push_back(engine->NewOperator(
+        [i](mxnet::RunContext ctx, mxnet::Engine::CallbackOnComplete cb) {
+          Foo(ctx, i);
+          std::this_thread::sleep_for(std::chrono::seconds{1});
+          cb();
+        },
+        {var}, {}));
+    engine->Push(oprs.at(i), mxnet::Context{});
+  }
+  engine->WaitForAll();
+  printf("Going to push delete\n");
+  // std::this_thread::sleep_for(std::chrono::seconds{1});
+  for (auto&& i : oprs) {
+    engine->DeleteOperator(i);
+  }
+  engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
+  engine->WaitForAll();
+
+  printf("============= Test #2 ==============\n");
+  var = engine->NewVariable();
+  oprs.clear();
+  for (int i = 0; i < 10; ++i) {
+    oprs.push_back(engine->NewOperator(
+        [i](mxnet::RunContext ctx, mxnet::Engine::CallbackOnComplete cb) {
+          Foo(ctx, i);
+          std::this_thread::sleep_for(std::chrono::milliseconds{500});
+          cb();
+        },
+        {}, {var}));
+    engine->Push(oprs.at(i), mxnet::Context{});
+  }
+  // std::this_thread::sleep_for(std::chrono::seconds{1});
+  engine->WaitForAll();
+  for (auto&& i : oprs) {
+    engine->DeleteOperator(i);
+  }
+  engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
+
+  printf("============= Test #3 ==============\n");
+  var = engine->NewVariable();
+  oprs.clear();
+  engine->WaitForVar(var);
+  engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
+  engine->WaitForAll();
+
+  printf("============= Test #4 ==============\n");
+  var = engine->NewVariable();
+  oprs.clear();
+  oprs.push_back(engine->NewOperator(
+      [](mxnet::RunContext ctx, mxnet::Engine::CallbackOnComplete cb) {
+        std::this_thread::sleep_for(std::chrono::seconds{2});
+        Foo(ctx, 42);
+        cb();
+      },
+      {}, {var}, mxnet::FnProperty::kCopyFromGPU));
+  engine->Push(oprs.at(0), mxnet::Context{});
+  LOG(INFO) << "IO operator pushed, should wait for 2 seconds.";
+  engine->WaitForVar(var);
+  LOG(INFO) << "OK, here I am.";
+  for (auto&& i : oprs) {
+    engine->DeleteOperator(i);
+  }
+  engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
+  engine->WaitForAll();
+
+  printf("============= Test #5 ==============\n");
+  var = engine->NewVariable();
+  oprs.clear();
+  oprs.push_back(engine->NewOperator(
+      [](mxnet::RunContext ctx, mxnet::Engine::CallbackOnComplete cb) {
+        Foo(ctx, 42);
+        std::this_thread::sleep_for(std::chrono::seconds{2});
+        cb();
+      },
+      {var}, {}));
+  engine->Push(oprs.at(0), mxnet::Context{});
+  LOG(INFO) << "Operator pushed, should not wait.";
+  engine->WaitForVar(var);
+  LOG(INFO) << "OK, here I am.";
+  engine->WaitForAll();
+  LOG(INFO) << "That was 2 seconds.";
+  for (auto&& i : oprs) {
+    engine->DeleteOperator(i);
+  }
+  engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
+  engine->WaitForAll();
+  var = nullptr;
+  oprs.clear();
+  LOG(INFO) << "All pass";
+}
diff --git a/tests/cpp/threaded_engine_unittest.cc b/tests/cpp/threaded_engine_unittest.cc
deleted file mode 100644
index ffe3ee4ad3da..000000000000
--- a/tests/cpp/threaded_engine_unittest.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-#include <unistd.h>
-#include <dmlc/logging.h>
-#include <cstdio>
-#include <gtest/gtest.h>
-#include <thread>
-#include <chrono>
-#include <vector>
-
-#include <mxnet/engine.h>
-
-void Foo(mxnet::RunContext, int i) { printf("The fox says %d\n", i); }
-
-TEST(Engine, basics) {
-  auto&& engine = mxnet::Engine::Get();
-  auto&& var = engine->NewVariable();
-  std::vector<mxnet::Engine::OprHandle> oprs;
-
-  // Test #1
-  printf("============= Test #1 ==============\n");
-  for (int i = 0; i < 10; ++i) {
-    oprs.push_back(engine->NewOperator(
-        [i](mxnet::RunContext ctx, mxnet::Engine::CallbackOnComplete cb) {
-          Foo(ctx, i);
-          std::this_thread::sleep_for(std::chrono::seconds{1});
-          cb();
-        },
-        {var}, {}));
-    engine->Push(oprs.at(i), mxnet::Context{});
-  }
-  engine->WaitForAll();
-  printf("Going to push delete\n");
-  // std::this_thread::sleep_for(std::chrono::seconds{1});
-  for (auto&& i : oprs) {
-    engine->DeleteOperator(i);
-  }
-  engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
-  engine->WaitForAll();
-
-  printf("============= Test #2 ==============\n");
-  var = engine->NewVariable();
-  oprs.clear();
-  for (int i = 0; i < 10; ++i) {
-    oprs.push_back(engine->NewOperator(
-        [i](mxnet::RunContext ctx, mxnet::Engine::CallbackOnComplete cb) {
-          Foo(ctx, i);
-          std::this_thread::sleep_for(std::chrono::milliseconds{500});
-          cb();
-        },
-        {}, {var}));
-    engine->Push(oprs.at(i), mxnet::Context{});
-  }
-  // std::this_thread::sleep_for(std::chrono::seconds{1});
-  engine->WaitForAll();
-  for (auto&& i : oprs) {
-    engine->DeleteOperator(i);
-  }
-  engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
-
-  printf("============= Test #3 ==============\n");
-  var = engine->NewVariable();
-  oprs.clear();
-  engine->WaitForVar(var);
-  engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
-  engine->WaitForAll();
-
-  printf("============= Test #4 ==============\n");
-  var = engine->NewVariable();
-  oprs.clear();
-  oprs.push_back(engine->NewOperator(
-      [](mxnet::RunContext ctx, mxnet::Engine::CallbackOnComplete cb) {
-        std::this_thread::sleep_for(std::chrono::seconds{2});
-        Foo(ctx, 42);
-        cb();
-      },
-      {}, {var}, mxnet::FnProperty::kCopyFromGPU));
-  engine->Push(oprs.at(0), mxnet::Context{});
-  LOG(INFO) << "IO operator pushed, should wait for 2 seconds.";
-  engine->WaitForVar(var);
-  LOG(INFO) << "OK, here I am.";
-  for (auto&& i : oprs) {
-    engine->DeleteOperator(i);
-  }
-  engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
-  engine->WaitForAll();
-
-  printf("============= Test #5 ==============\n");
-  var = engine->NewVariable();
-  oprs.clear();
-  oprs.push_back(engine->NewOperator(
-      [](mxnet::RunContext ctx, mxnet::Engine::CallbackOnComplete cb) {
-        Foo(ctx, 42);
-        std::this_thread::sleep_for(std::chrono::seconds{2});
-        cb();
-      },
-      {var}, {}));
-  engine->Push(oprs.at(0), mxnet::Context{});
-  LOG(INFO) << "Operator pushed, should not wait.";
-  engine->WaitForVar(var);
-  LOG(INFO) << "OK, here I am.";
-  engine->WaitForAll();
-  LOG(INFO) << "That was 2 seconds.";
-  for (auto&& i : oprs) {
-    engine->DeleteOperator(i);
-  }
-  engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
-  engine->WaitForAll();
-  var = nullptr;
-  oprs.clear();
-  LOG(INFO) << "All pass";
-}
diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk
index 4020dba82a82..dff9e62941cd 100644
--- a/tests/cpp/unittest.mk
+++ b/tests/cpp/unittest.mk
@@ -1,16 +1,11 @@
-UNITTEST_SRC = $(wildcard tests/cpp/*_unittest.cc)
-UNITTEST_OBJ = $(patsubst tests/cpp/%_unittest.cc, tests/cpp/%_unittest.o, $(UNITTEST_SRC))
+TEST_SRC = $(wildcard tests/cpp/*_test.cc)
+TEST = $(patsubst tests/cpp/%_test.cc, tests/cpp/%_test, $(TEST_SRC))
 
 GTEST_LIB=$(GTEST_PATH)/lib/
 GTEST_INC=$(GTEST_PATH)/include/
 
-tests/cpp/%.o : tests/cpp/%.cc 
-	$(CXX) -std=c++0x $(CFLAGS) -MM -MT tests/$*.o $< >tests/$*.d
-	$(CXX) -std=c++0x -c $(CFLAGS) -I$(GTEST_INC) -c $< -o $@
-
-tests/cpp/unittest: $(UNITTEST_OBJ) lib/libmxnet.a
-	$(CXX) $(CFLAGS) -std=c++0x  -o $@ $(filter %.o %.a, $^)  $(LDFLAGS) -lgtest -lgtest_main 
+tests/cpp/% : tests/cpp/%.cc lib/libmxnet.a
+	$(CXX) -std=c++0x $(CFLAGS) -MM -MT tests/cpp/$* $< >tests/cpp/$*.d
+	$(CXX) -std=c++0x $(CFLAGS) -I$(GTEST_INC) -o $@ $(filter %.cc %.a, $^) $(LDFLAGS) -L$(GTEST_LIB) -lgtest -lgtest_main
 
 -include tests/cpp/*.d
-
-