diff --git a/.gitignore b/.gitignore index 1996f1cbf..3b7bcbc53 100644 --- a/.gitignore +++ b/.gitignore @@ -116,3 +116,4 @@ venv.bak/ # for development scripts/ +exps/ diff --git a/byteps/common/common.cc b/byteps/common/common.cc index 0ea59e769..89894297b 100644 --- a/byteps/common/common.cc +++ b/byteps/common/common.cc @@ -100,6 +100,7 @@ int GetCommandType(RequestType requestType, int d) { return (((m + d) * (m + d + 1)) / 2) + d; } +#ifndef BYTEPS_BUILDING_SERVER ncclDataType_t getNcclDataType(DataType dtype) { switch (dtype) { case BYTEPS_FLOAT32: @@ -121,6 +122,7 @@ ncclDataType_t getNcclDataType(DataType dtype) { } return ncclFloat32; } +#endif int getDataTypeLength(int dtype) { switch (dtype) { diff --git a/byteps/common/common.h b/byteps/common/common.h index 8781cc5c7..9824488bd 100644 --- a/byteps/common/common.h +++ b/byteps/common/common.h @@ -31,16 +31,23 @@ #include // Add for profiling communication events -#include #include #include -#include -#include + #include +#include +#include #include +#include namespace byteps { namespace common { +namespace compressor { +struct BPSTensor; +typedef BPSTensor tensor_t; +class Compressor; +class ErrorFeedback; +} // namespace compressor // Device ID used for CPU. #define CPU_DEVICE_ID (-1) @@ -83,8 +90,10 @@ enum QueueType { COPYD2H, PCIE_REDUCE, COORDINATE_PUSH, + COMPRESS, PUSH, PULL, + DECOMPRESS, COPYH2D, COORDINATE_BROADCAST, BROADCAST, @@ -94,10 +103,18 @@ enum QueueType { const int QueueNum = (int)QUEUE_NUM_AND_NOT_A_REAL_QUEUE_TYPE_AND_MUST_BE_THE_LAST; -const std::vector LogStrings = { - "COORDINATE_REDUCE", "REDUCE", "COPYD2H", "PCIE_REDUCE", - "COORDINATE_PUSH", "PUSH", "PULL", "COPYH2D", - "COORDINATE_BROADCAST", "BROADCAST"}; +const std::vector LogStrings = {"COORDINATE_REDUCE", + "REDUCE", + "COPYD2H", + "PCIE_REDUCE", + "COORDINATE_PUSH", + "COMPRESS", + "PUSH", + "PULL", + "DECOMPRESS", + "COPYH2D", + "COORDINATE_BROADCAST", + "BROADCAST"}; class Status { public: @@ -173,11 +190,17 @@ typedef struct BytePSContext { std::vector pcie_cpubuff; size_t buff_len; // Used for profiling communication events - std::queue comm_time; + std::queue comm_time; bool profile_flag = false; int step_cnt = 0; int local_rank = 0; - std::unordered_map>> part_comm_time; + std::unordered_map>> + part_comm_time; + // Compressor list + std::vector> compressor_list; + // kwargs + std::unordered_map kwargs; } BPSContext; class Tensor { @@ -233,6 +256,10 @@ struct TensorTableEntry { std::shared_ptr counter_ptr; // How many partitions unsigned int total_partnum = 0; + // Compressor + std::shared_ptr compressor; + // Compressed + std::shared_ptr compressed; }; using TensorTable = std::unordered_map; @@ -250,6 +277,11 @@ ncclDataType_t getNcclDataType(DataType dtype); int getDataTypeLength(int dtype); +inline size_t Align(size_t size, int dtype) { + const size_t min_size = + (getDataTypeLength(dtype) * getDataTypeLength(dtype)) * 8; + return size + (min_size - size % min_size) % min_size; +} } // namespace common } // namespace byteps diff --git a/byteps/common/compressor/common.h b/byteps/common/compressor/common.h new file mode 100644 index 000000000..04f9905db --- /dev/null +++ b/byteps/common/compressor/common.h @@ -0,0 +1,99 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#ifndef BYTEPS_COMPRESSOR_COMMON_H +#define BYTEPS_COMPRESSOR_COMMON_H + +#include +#if __F16C__ +#include "../half.h" +using half_t = mshadow::half::half_t; +#endif + +namespace byteps { +namespace common { +namespace compressor { +typedef char byte_t; +/*! + * \brief Tensor type + */ +typedef struct BPSTensor { + byte_t* data; + size_t size; + int dtype; + + BPSTensor() : data(nullptr), size(0), dtype(0) {} + BPSTensor(void* data, size_t size = 0, int dtype = 0) + : data(reinterpret_cast(data)), size(size), dtype(dtype) {} +} tensor_t; + +using kwargs_t = std::unordered_map; + +#define COMPRESS_IMPL_SWITCH(dtype, func, dst, src, size) \ + switch (dtype) { \ + case BYTEPS_FLOAT16: \ + return func(reinterpret_cast(dst), \ + reinterpret_cast(src), \ + size / sizeof(half_t)); \ + case BYTEPS_FLOAT32: \ + return func(reinterpret_cast(dst), \ + reinterpret_cast(src), size / sizeof(float)); \ + case BYTEPS_FLOAT64: \ + return func(reinterpret_cast(dst), \ + reinterpret_cast(src), \ + size / sizeof(double)); \ + default: \ + BPS_CHECK(0) << "Unsupported data type:" << dtype; \ + } + +#define DECOMPRESS_IMPL_SWITCH(dtype, func, dst, src, compressed_size) \ + switch (dtype) { \ + case BYTEPS_FLOAT16: \ + return func(reinterpret_cast(dst), \ + reinterpret_cast(src), compressed_size); \ + case BYTEPS_FLOAT32: \ + return func(reinterpret_cast(dst), \ + reinterpret_cast(src), compressed_size); \ + case BYTEPS_FLOAT64: \ + return func(reinterpret_cast(dst), \ + reinterpret_cast(src), compressed_size); \ + default: \ + BPS_CHECK(0) << "Unsupported data type:" << dtype; \ + } + +#define FAST_UPDATE_ERROR_IMPL_SWITCH(dtype, func, dst, src1, src2, \ + compressed_size) \ + switch (dtype) { \ + case BYTEPS_FLOAT16: \ + return func(reinterpret_cast(dst), \ + reinterpret_cast(src1), \ + reinterpret_cast(src2), compressed_size); \ + case BYTEPS_FLOAT32: \ + return func(reinterpret_cast(dst), \ + reinterpret_cast(src1), \ + reinterpret_cast(src2), compressed_size); \ + case BYTEPS_FLOAT64: \ + return func(reinterpret_cast(dst), \ + reinterpret_cast(src1), \ + reinterpret_cast(src2), compressed_size); \ + default: \ + BPS_CHECK(0) << "Unsupported data type:" << dtype; \ + } + +} // namespace compressor +} // namespace common +} // namespace byteps + +#endif // BYTEPS_COMPRESSOR_COMMON_H \ No newline at end of file diff --git a/byteps/common/compressor/compressor.h b/byteps/common/compressor/compressor.h new file mode 100644 index 000000000..89f78e8a1 --- /dev/null +++ b/byteps/common/compressor/compressor.h @@ -0,0 +1,133 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef BYTEPS_COMPRESSOR_COMPRESSOR_H
+#define BYTEPS_COMPRESSOR_COMPRESSOR_H
+
+#include <memory>
+
+#include "../common.h"
+#include "../logging.h"
+#include "common.h"
+
+namespace byteps {
+namespace common {
+namespace compressor {
+/*!
+ * \brief Compressor interface
+ * Compressor defines two universal APIs - Compress & Decompress
+ *
+ * \par
+ * The caller does not need to allocate additional memory to store the
+ * compressed data: the compressor keeps an internal buffer, and a pointer into
+ * that buffer is returned to the caller. The caller can then send the returned
+ * compressed data as usual.
+ *
+ * \par
+ * There are two optional features of the compressor - error-feedback &
+ * momentum. These two features can be added to any common compressor such as
+ * 1bit and topk. To be generic, these two features are also compressors,
+ * exposing the same API as Compressor. More details can be found in their own
+ * files.
+ *
+ * \par
+ * To add a new compressor, developers need to inherit this class in the 'impl'
+ * directory. If a new optional feature like error-feedback is needed,
+ * developers need to use the decorator pattern and add new files in the
+ * current directory. The existing implementations can be used as a reference.
+ *
+ * \sa ErrorFeedback, Momentum
+ */
+class Compressor {
+ public:
+  Compressor(size_t size, DataType dtype)
+      : _size(size), _dtype(dtype), _buf(new byte_t[size]){};
+  virtual ~Compressor() = default;
+
+  /*!
+   * \brief Compress function
+   *
+   * \note Except for error-feedback and momentum, the underlying data of the
+   * input should never be changed. This is because the input is still used by
+   * error feedback if enabled.
+   *
+   * \note Compressed data should be stored in the buffer of the compressor. So
+   * it is not an inplace operation.
+   *
+   * \param grad gradient tensor, passed by value.
+   * \return compressed tensor. It is the buffer of the compressor, which
+   * contains the compressed data. The returned size is the size of the
+   * compressed data.
+   */
+  virtual tensor_t Compress(tensor_t grad) = 0;
+
+  /*!
+   * \brief Decompress function
+   *
+   * \note For servers, decompression is not an inplace operation. The
+   * decompressed result resides in the buffer of the compressor. For workers,
+   * it is an inplace operation.
+   *
+   * \param compressed compressed tensor.
+   * \return decompressed tensor. For servers, it is the buffer of the
+   * compressor, which contains the decompressed data. For workers, its pointer
+   * is the same as the input's, while the size is the decompressed size, which
+   * is also the original size.
+   */
+  virtual tensor_t Decompress(tensor_t compressed) = 0;
+
+  /*!
+   * \brief faster version of `UpdateError` via operation fusion
+   *
+   * \par
+   * This is a helper function implemented by each compressor. If defined,
+   * `ErrorFeedback` will use this function instead of the default
+   * `UpdateError` function implemented in error_feedback.cc. If undefined, the
+   * default `UpdateError` will be used.
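+   *
+   * \par
+   * For reference, a sketch of how this hook is reached in this patch: the
+   * default `UpdateError` in error_feedback.cc forwards to it with the
+   * decorator's internal error buffer, roughly
+   *
+   * \code{.cpp}
+   * tensor_t error{_error.get(), _size, corrected.dtype};
+   * _cptr->FastUpdateError(error, corrected, compressed);
+   * \endcode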
+   *
+   * \par
+   * Typically `UpdateError` needs to decompress and do a subtraction. But for
+   * most compressors the decompression step can be avoided. For example, for
+   * the topk compressor, `UpdateError` can be simplified in this way:
+   * 1. e <- p (e is the error and p is the corrected gradient)
+   * 2. zero-fill e with selected k indices
+   *
+   * It is effectively a fusion of the original decompression and subtraction.
+   * It is optional to override.
+   *
+   * \param corrected gradient corrected with error
+   * \param error error
+   * \param compressed compressed gradient
+   */
+  virtual void FastUpdateError(tensor_t error, tensor_t corrected,
+                               tensor_t compressed) {
+    BPS_LOG(FATAL) << "FastUpdateError is not implemented";
+  };
+
+ protected:
+  /*! \brief original size */
+  size_t _size;
+
+  DataType _dtype;
+
+  /*! \brief buffer to store compressed grad */
+  std::unique_ptr<byte_t[]> _buf;
+};
+
+}  // namespace compressor
+}  // namespace common
+}  // namespace byteps
+
+#endif  // BYTEPS_COMPRESSOR_COMPRESSOR_H
\ No newline at end of file
diff --git a/byteps/common/compressor/compressor_registry.cc b/byteps/common/compressor/compressor_registry.cc
new file mode 100644
index 000000000..2ab97db98
--- /dev/null
+++ b/byteps/common/compressor/compressor_registry.cc
@@ -0,0 +1,60 @@
+// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// ============================================================================= + +#include "compressor_registry.h" + +namespace byteps { +namespace common { +namespace compressor { + +CompressorRegistry::map_t CompressorRegistry::_ctor_map; + +CompressorRegistry::Register::Register(std::string name, ctor_t ctor) { + BPS_CHECK_EQ(_ctor_map.count(name), 0) + << "Duplicate registration of compressor under name " << name; + _ctor_map.emplace(name + "_type", std::move(ctor)); + BPS_LOG(INFO) << name << " compressor is registered"; +} + +CompressorRegistry::ctor_t CompressorRegistry::Find(const std::string& name) { + auto it = _ctor_map.find(name); + if (it == _ctor_map.end()) { + BPS_LOG(FATAL) << "No compressor registered under name:" << name; + } + return it->second; +} + +std::unique_ptr CompressorRegistry::Create(const kwargs_t& kwargs, + size_t size, DataType dtype) { +#ifndef BYTEPS_BUILDING_SERVER + const std::string types[] = {"momentum_type", "ef_type", "compressor_type"}; +#else + // server do not need momentum + const std::string types[] = {"ef_type", "compressor_type"}; +#endif + for (auto& type : types) { + auto iter = kwargs.find(type); + if (iter != kwargs.end()) { + auto ctor = CompressorRegistry::Find(iter->second + "_" + type); + return ctor(kwargs, size, dtype); + } + } + + return nullptr; +} + +} // namespace compressor +} // namespace common +} // namespace byteps \ No newline at end of file diff --git a/byteps/common/compressor/compressor_registry.h b/byteps/common/compressor/compressor_registry.h new file mode 100644 index 000000000..1001878a7 --- /dev/null +++ b/byteps/common/compressor/compressor_registry.h @@ -0,0 +1,54 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#ifndef BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H +#define BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H + +#include "compressor.h" +#include "utils.h" + +namespace byteps { +namespace common { +namespace compressor { + +class CompressorRegistry { + public: + // constructor of compressor + using ctor_t = std::function( + const kwargs_t& kwargs, size_t size, DataType dtype)>; + + using map_t = std::unordered_map; + + struct Register { + Register(std::string name, ctor_t ctor); + }; + + static ctor_t Find(const std::string& name); + + static std::unique_ptr Create(const kwargs_t& kwargs, size_t size, + DataType dtype); + + private: + static map_t _ctor_map; + + CompressorRegistry() = delete; + ~CompressorRegistry() = delete; +}; + +} // namespace compressor +} // namespace common +} // namespace byteps + +#endif // BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H \ No newline at end of file diff --git a/byteps/common/compressor/error_feedback.cc b/byteps/common/compressor/error_feedback.cc new file mode 100644 index 000000000..0c343f16a --- /dev/null +++ b/byteps/common/compressor/error_feedback.cc @@ -0,0 +1,47 @@ +// Copyright 2019 Amazon Inc. 
or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "error_feedback.h"
+
+namespace byteps {
+namespace common {
+namespace compressor {
+
+tensor_t ErrorFeedback::Compress(tensor_t grad) {
+  // 1. grad <- grad + error
+  UpdateGradient(grad);
+
+  // 2. c <- Compress(grad)
+  auto compressed = _cptr->Compress(grad);
+
+  // 3. e <- grad - Decompress(c)
+  UpdateError(grad, compressed);
+
+  return compressed;
+}
+
+tensor_t ErrorFeedback::Decompress(tensor_t compressed) {
+  // directly forward to internal compressor
+  return _cptr->Decompress(compressed);
+}
+
+void ErrorFeedback::UpdateError(tensor_t corrected, tensor_t compressed) {
+  tensor_t error{_error.get(), _size, corrected.dtype};
+  _cptr->FastUpdateError(error, corrected, compressed);
+}
+
+}  // namespace compressor
+}  // namespace common
+}  // namespace byteps
\ No newline at end of file
diff --git a/byteps/common/compressor/error_feedback.h b/byteps/common/compressor/error_feedback.h
new file mode 100644
index 000000000..ae89b740d
--- /dev/null
+++ b/byteps/common/compressor/error_feedback.h
@@ -0,0 +1,100 @@
+// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef BYTEPS_COMPRESSOR_ERROR_FEEDBACK_H
+#define BYTEPS_COMPRESSOR_ERROR_FEEDBACK_H
+
+#include "../cpu_reducer.h"
+#include "compressor.h"
+
+namespace byteps {
+namespace common {
+namespace compressor {
+
+/*!
+ * \brief Error feedback Decorator
+ *
+ * paper: 1-Bit Stochastic Gradient Descent and Its Application to
+ * Data-Parallel Distributed Training of Speech DNNs
+ * https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/IS140694.pdf
+ *
+ * 1. UpdateGradient: g <- g + e
+ * 2. UpdateError: e <- g - c
+ *
+ * These two functions should be implemented in child classes.
+ *
+ * \par
+ * The caller does not need to allocate an additional buffer to store the
+ * error; the class already holds one internally.
+ *
+ * \par
+ * Add error-feedback behavior to any compressor at run time via the decorator
+ * pattern. It keeps the same interface as Compressor. Compress and Decompress
+ * have been implemented here and cannot be overridden in child classes.
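+ *
+ * \par
+ * A rough construction sketch (kwargs keys follow the registrations in this
+ * patch; `size`, `dtype` and `grad` are assumed to be provided by the caller):
+ *
+ * \code{.cpp}
+ * kwargs_t kwargs = {{"ef_type", "vanilla"}, {"compressor_type", "onebit"}};
+ * // CompressorRegistry::Create matches "ef_type" before "compressor_type",
+ * // so the result is an error-feedback decorator wrapping a OnebitCompressor.
+ * auto compressor = CompressorRegistry::Create(kwargs, size, dtype);
+ * auto compressed = compressor->Compress(grad);  // g+e -> compress -> update e
+ * \endcode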
+ * + * \sa Compressor, VanillaErrorFeedbackCompressor + */ +class ErrorFeedback : public Compressor { + public: + // error buffer should be cleared to zeros at the beginning. + ErrorFeedback(size_t size, DataType dtype, std::unique_ptr cptr) + : Compressor(size, dtype), + _error(new byte_t[size]()), + _cpu_reducer(new CpuReducer(nullptr)), + _cptr(std::move(cptr)) {} + virtual ~ErrorFeedback() = default; + + virtual tensor_t Compress(tensor_t grad) final; + + virtual tensor_t Decompress(tensor_t compressed) final; + + protected: + /*! + * \brief Correct gradient with error + * + * grad += error + * + * \note it is an inplace operation. + * + * \param grad input gradient to be updated inplace + * \param dtype type + */ + virtual void UpdateGradient(tensor_t grad) = 0; + + /*! + * \brief Update error + * + * error = corrected_grad - decompressed + * + * \param corrected refers to gradient + error + * \param compressed compressed tensor + */ + virtual void UpdateError(tensor_t corrected, tensor_t compressed); + + protected: + /*! \brief buffer of error */ + std::unique_ptr _error; + + std::unique_ptr _cpu_reducer; + + private: + /*! \brief compressor pointer */ + std::unique_ptr _cptr; +}; +} // namespace compressor +} // namespace common +} // namespace byteps + +#endif // BYTEPS_COMPRESSOR_ERROR_FEEDBACK_H \ No newline at end of file diff --git a/byteps/common/compressor/impl/dithering.cc b/byteps/common/compressor/impl/dithering.cc new file mode 100644 index 000000000..ca84fcfee --- /dev/null +++ b/byteps/common/compressor/impl/dithering.cc @@ -0,0 +1,218 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// ============================================================================= + +#include +#include + +#include "../compressor_registry.h" +#include "dithering.h" + +namespace byteps { +namespace common { +namespace compressor { +namespace { +CompressorRegistry::Register reg( + "dithering_compressor", + [](const kwargs_t& kwargs, size_t size, + DataType dtype) -> std::unique_ptr { + std::tuple<> params; + auto k = HyperParamFinder(kwargs, "compressor_k"); + + auto seed = HyperParamFinder(kwargs, "seed", true, + [](unsigned x) { return x != 0; }); + + auto ptype_int = + HyperParamFinder(kwargs, "dithering_partition", true, + [](int x) { return x == 0 || x == 1; }); + auto ptype = static_cast(ptype_int); + + auto ntype_int = + HyperParamFinder(kwargs, "dithering_normalize", true, + [](int x) { return x == 0 || x == 1; }); + auto ntype = static_cast(ntype_int); + + return std::unique_ptr( + new DitheringCompressor(size, dtype, k, seed, ptype, ntype)); + }); +} + +template +tensor_t DitheringCompressor::CompressImpl(index_t* dst, const scalar_t* src, + size_t len) { + static_assert(sizeof(index_t) == sizeof(scalar_t), + "index_t should be the same size as scalar_t"); + + // normalize + double scale = 0.0; + if (_ntype == NomalizeType::MAX) { + for (size_t i = 0; i < len; i++) { + scale = scale > std::abs(src[i]) ? scale : std::abs(src[i]); + } + } else if (_ntype == NomalizeType::L2) { + for (size_t i = 0; i < len; ++i) { + scale += src[i] * src[i]; + } + scale = std::sqrt(scale); + } + + BitWriter bit_writer(dst); + size_t last_non_zero_pos = -1; + if (_ptype == PartitionType::LINEAR) { + for (size_t i = 0; i < len; ++i) { + float abs_x = std::abs(src[i]); + float normalized = (abs_x / scale) * _s; + float floor = std::floor(normalized); + unsigned quantized = floor + _rng.Bernoulli(normalized - floor); + if (quantized) { + size_t diff = i - last_non_zero_pos; + last_non_zero_pos = i; + EliasDeltaEncode(bit_writer, diff); + bit_writer.Put(std::signbit(src[i])); + EliasDeltaEncode(bit_writer, quantized); + } + } + } else if (_ptype == PartitionType::NATURAL) { + const unsigned level = 1 << (_s - 1); + for (size_t i = 0; i < len; ++i) { + float abs_x = std::abs(src[i]); + double normalized = (abs_x / scale) * level; + unsigned floor = RoundNextPow2(std::ceil(normalized)) >> 1; + unsigned length = (floor != 0) ? 
floor : 1; + double p = (normalized - floor) / length; + unsigned quantized = floor + length * _rng.Bernoulli(p); + if (quantized) { + size_t diff = i - last_non_zero_pos; + last_non_zero_pos = i; + EliasDeltaEncode(bit_writer, diff); + bit_writer.Put(std::signbit(src[i])); + EliasDeltaEncode(bit_writer, quantized); + } + } + } + bit_writer.Flush(); + + // bits + index_t* p_bits = reinterpret_cast(&dst[bit_writer.blocks()]); + *p_bits = bit_writer.bits(); + + // l2 + float* p_scale = reinterpret_cast(&dst[bit_writer.blocks() + 1]); + *p_scale = scale; + + return {dst, bit_writer.blocks() * sizeof(index_t) + sizeof(index_t) + + sizeof(float)}; +} + +tensor_t DitheringCompressor::Compress(tensor_t grad) { + COMPRESS_IMPL_SWITCH(grad.dtype, CompressImpl, _buf.get(), grad.data, + grad.size); +} + +template +tensor_t DitheringCompressor::DecompressImpl(scalar_t* dst, const index_t* src, + size_t compressed_size) { + static_assert(sizeof(index_t) == sizeof(scalar_t), + "index_t should be the same size as scalar_t"); + + const size_t blocks = + (compressed_size - sizeof(float) - sizeof(index_t)) / sizeof(index_t); + auto* p_bits = reinterpret_cast(src + blocks); + const index_t bits = *p_bits; + + auto* p_scale = reinterpret_cast(src + blocks + 1); + const float scale = *p_scale; + + auto ptr = const_cast(src); + if ((void*)dst == (void*)src) { + ptr = reinterpret_cast(_buf.get()); + std::memcpy(ptr, src, compressed_size); + } + std::memset(dst, 0, _size); + + unsigned int s = _s; + if (_ptype == PartitionType::NATURAL) { + s = 1 << (_s - 1); + } + + BitReader bit_reader(ptr); + size_t last_non_zero_pos = -1; + while (bit_reader.bits() < bits) { + size_t diff = EliasDeltaDecode(bit_reader); + size_t i = last_non_zero_pos + diff; + last_non_zero_pos = i; + int signbit = bit_reader.Get(); + unsigned quantized = EliasDeltaDecode(bit_reader); + float num = quantized * scale / s; + dst[i] = (1 - (signbit << 1)) * num; + } + + return {dst, _size}; +} + +tensor_t DitheringCompressor::Decompress(tensor_t compressed) { +#ifdef BYTEPS_BUILDING_SERVER + auto dst = _buf.get(); +#else + auto dst = compressed.data; +#endif + DECOMPRESS_IMPL_SWITCH(_dtype, DecompressImpl, dst, compressed.data, + compressed.size); +} + +template +void DitheringCompressor::FastUpdateErrorImpl(scalar_t* error, + scalar_t* corrected, + const index_t* compressed, + size_t compressed_size) { + static_assert(sizeof(index_t) == sizeof(scalar_t), + "index_t should be the same size as scalar_t"); + + const size_t blocks = + (compressed_size - sizeof(float) - sizeof(index_t)) / sizeof(index_t); + auto* p_bits = reinterpret_cast(compressed + blocks); + const index_t bits = *p_bits; + + auto* p_scale = reinterpret_cast(compressed + blocks + 1); + const float scale = *p_scale; + + std::memcpy(error, corrected, _size); + + unsigned int s = _s; + if (_ptype == PartitionType::NATURAL) { + s = 1 << (_s - 1); + } + + BitReader bit_reader(compressed); + size_t last_non_zero_pos = -1; + while (bit_reader.bits() < bits) { + size_t diff = EliasDeltaDecode(bit_reader); + size_t i = last_non_zero_pos + diff; + last_non_zero_pos = i; + int signbit = bit_reader.Get(); + unsigned quantized = EliasDeltaDecode(bit_reader); + float num = quantized * scale / s; + error[i] -= (1 - (signbit << 1)) * num; + } +} + +void DitheringCompressor::FastUpdateError(tensor_t error, tensor_t corrected, + tensor_t compressed) { + FAST_UPDATE_ERROR_IMPL_SWITCH(_dtype, FastUpdateErrorImpl, error.data, + corrected.data, compressed.data, + compressed.size); +} +} // namespace 
compressor +} // namespace common +} // namespace byteps \ No newline at end of file diff --git a/byteps/common/compressor/impl/dithering.h b/byteps/common/compressor/impl/dithering.h new file mode 100644 index 000000000..d27c5e5cf --- /dev/null +++ b/byteps/common/compressor/impl/dithering.h @@ -0,0 +1,89 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#ifndef BYTEPS_COMPRESSOR_IMPL_MULTIBIT_H +#define BYTEPS_COMPRESSOR_IMPL_MULTIBIT_H + +#include "../compressor.h" +#include "../utils.h" + +namespace byteps { +namespace common { +namespace compressor { + +/*! + * \brief Dithering Compressor + * + * paper: Natural Compression for Distributed Deep Learning + * https://arxiv.org/pdf/1905.10988.pdf + * + * two kinds of partition: + * 1. linear: {0, 1/s, 2/s, ..., (s-1)/s, 1} + * + * 2. natural: {0, 2^{1-s}, 2^(2-s), ..., 2^{-1}, 1} + * + * two kinds of normalization: + * 1. max: it gives better accuracy but less sparsity. + * + * 2. l2 norm: it is more sparse but less accurate. and + * empirically we found it will diverge with error-feedback. + */ +class DitheringCompressor : public Compressor { + public: + enum class PartitionType { LINEAR = 0, NATURAL = 1 }; + enum class NomalizeType { MAX = 0, L2 = 1 }; + + DitheringCompressor(size_t size, DataType dtype, unsigned int s, + unsigned int seed = 0, + PartitionType ptype = PartitionType::LINEAR, + NomalizeType ntype = NomalizeType::MAX) + : Compressor(size, dtype), _s(s), _ptype(ptype), _ntype(ntype) { + if (seed) { + _rng.set_seed(seed); + } + }; + virtual ~DitheringCompressor() = default; + + tensor_t Compress(tensor_t grad) override; + + tensor_t Decompress(tensor_t compressed) override; + + void FastUpdateError(tensor_t error, tensor_t corrected, + tensor_t compressed) override; + + private: + template + tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len); + + template + tensor_t DecompressImpl(scalar_t* dst, const index_t* src, + size_t compressed_size); + + template + void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected, + const index_t* compressed, size_t compressed_size); + + /*! \brief number of levels */ + const unsigned int _s; + + PartitionType _ptype; + NomalizeType _ntype; + XorShift128PlusBitShifterRNG _rng; +}; +} // namespace compressor +} // namespace common +} // namespace byteps + +#endif // BYTEPS_COMPRESSOR_IMPL_MULTIBIT_H \ No newline at end of file diff --git a/byteps/common/compressor/impl/nesterov_momentum.cc b/byteps/common/compressor/impl/nesterov_momentum.cc new file mode 100644 index 000000000..283aea7f7 --- /dev/null +++ b/byteps/common/compressor/impl/nesterov_momentum.cc @@ -0,0 +1,53 @@ +// Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#include "nesterov_momentum.h" +#include "../compressor_registry.h" + +namespace byteps { +namespace common { +namespace compressor { +namespace { +CompressorRegistry::Register reg( + "nesterov_momentum", + [](const kwargs_t& kwargs, size_t size, + DataType dtype) -> std::unique_ptr { + // register cptr + auto kwargs_clone = kwargs; + kwargs_clone.erase("momentum_type"); + auto cptr = CompressorRegistry::Create(kwargs_clone, size, dtype); + BPS_CHECK_NE(cptr, nullptr); + // find \mu + auto mu = HyperParamFinder(kwargs, "momentum_mu"); + return std::unique_ptr( + new NesterovMomentumCompressor(size, dtype, std::move(cptr), mu)); + }); +} + +void NesterovMomentumCompressor::UpdateMom(tensor_t grad) { + // m_t = \mu * m_{t-1} + g_t + this->_cpu_reducer->sum(_mom.get(), grad.data, _mom.get(), grad.size, + static_cast(grad.dtype), _mu); +} + +void NesterovMomentumCompressor::UpdateGradient(tensor_t grad) { + // p_t = \mu m_t + g_t + this->_cpu_reducer->sum(grad.data, _mom.get(), grad.size, + static_cast(grad.dtype), _mu); +} + +} // namespace compressor +} // namespace common +} // namespace byteps \ No newline at end of file diff --git a/byteps/common/compressor/impl/nesterov_momentum.h b/byteps/common/compressor/impl/nesterov_momentum.h new file mode 100644 index 000000000..0764e4d3d --- /dev/null +++ b/byteps/common/compressor/impl/nesterov_momentum.h @@ -0,0 +1,51 @@ +// Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#ifndef BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H +#define BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H + +#include "../momentum.h" + +namespace byteps { +namespace common { +namespace compressor { + +/*! 
+ * \brief Nesterov Momentum Compressor + * + * paper: A method for solving the convex programming problem with convergence + * rate $O (1/k^2)$ + * + * m_t <- \mu m_{t-1} + g_t + * g_t <- \mu m_t + g_t + * + */ +class NesterovMomentumCompressor : public Momentum { + public: + NesterovMomentumCompressor(size_t size, DataType dtype, + std::unique_ptr cptr, float mu) + : Momentum(size, dtype, std::move(cptr), mu){}; + virtual ~NesterovMomentumCompressor() = default; + + protected: + void UpdateMom(tensor_t grad) override; + void UpdateGradient(tensor_t grad) override; +}; + +} // namespace compressor +} // namespace common +} // namespace byteps + +#endif // BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H \ No newline at end of file diff --git a/byteps/common/compressor/impl/onebit.cc b/byteps/common/compressor/impl/onebit.cc new file mode 100644 index 000000000..69e2cb907 --- /dev/null +++ b/byteps/common/compressor/impl/onebit.cc @@ -0,0 +1,143 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#include + +#include "onebit.h" +#include "../compressor_registry.h" + +namespace byteps { +namespace common { +namespace compressor { +namespace { +CompressorRegistry::Register reg("onebit_compressor", [](const kwargs_t& kwargs, + size_t size, + DataType dtype) { + auto scaled = + HyperParamFinder(kwargs, "compressor_onebit_scaling", true); + return std::unique_ptr(new OnebitCompressor(size, dtype, scaled)); +}); +} + +template +tensor_t OnebitCompressor::CompressImpl(index_t* dst, const scalar_t* src, + size_t len) { + static_assert(sizeof(index_t) == sizeof(scalar_t), + "index_t should be the same size as scalar_t"); + constexpr size_t PACKING_SIZE = sizeof(scalar_t) * 8; + size_t padding_len = (PACKING_SIZE - (len % PACKING_SIZE)) % PACKING_SIZE; + const size_t chunk_len = (len + padding_len) / PACKING_SIZE; + + float scale = 1.0f; + if (_use_scale) { + double sum = 0.0f; + for (size_t i = 0; i < len; ++i) { + sum += std::abs(src[i]); + } + scale = sum / len; + } + +#pragma omp parallel for simd + for (size_t i = 0; i < chunk_len; ++i) { + index_t x = src[i * PACKING_SIZE] < 0; + for (size_t j = 1; j < PACKING_SIZE; ++j) { + x <<= 1; + x |= src[i * PACKING_SIZE + j] < 0; + } + dst[i] = x; + } + + float* p_scale = reinterpret_cast(&dst[chunk_len]); + *p_scale = scale; + + return {dst, chunk_len * sizeof(index_t) + sizeof(float)}; +} // namespace compressor + +tensor_t OnebitCompressor::Compress(tensor_t grad) { + COMPRESS_IMPL_SWITCH(grad.dtype, CompressImpl, _buf.get(), grad.data, + grad.size); +} + +template +tensor_t OnebitCompressor::DecompressImpl(scalar_t* dst, const index_t* src, + size_t compressed_size) { + static_assert(sizeof(scalar_t) == sizeof(index_t), + "scalar_t should be the same size as index_t"); + constexpr size_t PACKING_SIZE = sizeof(index_t) * 8; + const size_t chunk_len = (compressed_size - sizeof(float)) / 
sizeof(index_t); + + auto* pf = reinterpret_cast(src + chunk_len); + float scale = *pf; + + index_t* ptr = const_cast(src); + if ((void*)dst == (void*)src) { + ptr = reinterpret_cast(_buf.get()); + std::memcpy(ptr, src, compressed_size); + } + +#pragma omp parallel for simd + for (int i = chunk_len - 1; i >= 0; --i) { + index_t x = ptr[i]; + for (int j = PACKING_SIZE - 1; j >= 0; --j) { + int sign = 1 - ((x & 0x01) << 1); + dst[i * PACKING_SIZE + j] = sign * scale; + x >>= 1; + } + } + + return {dst, _size}; +} + +tensor_t OnebitCompressor::Decompress(tensor_t compressed) { +#ifdef BYTEPS_BUILDING_SERVER + auto dst = _buf.get(); +#else + auto dst = compressed.data; +#endif + DECOMPRESS_IMPL_SWITCH(_dtype, DecompressImpl, dst, compressed.data, + compressed.size); +} + +template +void OnebitCompressor::FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected, + const index_t* compressed, + size_t compressed_size) { + constexpr size_t PACKING_SIZE = sizeof(index_t) * 8; + const size_t chunk_len = (compressed_size - sizeof(float)) / sizeof(index_t); + + auto* pf = reinterpret_cast(compressed + chunk_len); + float scale = *pf; + +#pragma omp parallel for simd + for (int i = chunk_len - 1; i >= 0; --i) { + index_t x = compressed[i]; + for (int j = PACKING_SIZE - 1; j >= 0; --j) { + int sign = ((x & 0x01) << 1) - 1; + error[i * PACKING_SIZE + j] = + corrected[i * PACKING_SIZE + j] + sign * scale; + x >>= 1; + } + } +} + +void OnebitCompressor::FastUpdateError(tensor_t error, tensor_t corrected, + tensor_t compressed) { + FAST_UPDATE_ERROR_IMPL_SWITCH(_dtype, FastUpdateErrorImpl, error.data, + corrected.data, compressed.data, + compressed.size); +} +} // namespace compressor +} // namespace common +} // namespace byteps \ No newline at end of file diff --git a/byteps/common/compressor/impl/onebit.h b/byteps/common/compressor/impl/onebit.h new file mode 100644 index 000000000..92b74530b --- /dev/null +++ b/byteps/common/compressor/impl/onebit.h @@ -0,0 +1,95 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#ifndef BYTEPS_COMPRESSOR_IMPL_ONEBIT_H +#define BYTEPS_COMPRESSOR_IMPL_ONEBIT_H + +#include "../compressor.h" + +namespace byteps { +namespace common { +namespace compressor { + +/*! + * \brief Onebit Compressor + * + * paper: SIGNSGD: Compressed Optimisation for Non-Convex Problems + * https://arxiv.org/pdf/1802.04434.pdf + * + * each worker i: + * c_i <- sign(grad) + * + * server: majority vote + * sign(\sum_i c_i) + * + * \note 0 represents positive and 1 represents negative. + */ +class OnebitCompressor : public Compressor { + public: + OnebitCompressor(size_t size, DataType dtype, bool use_scale = false) + : Compressor(size, dtype), _use_scale(use_scale) {} + virtual ~OnebitCompressor() = default; + + /*! + * \brief Compress function + * + * compress and pack into byte array. + * each bit represents a sign. 
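+ *
+ * \par
+ * A worked example (illustrative numbers only): with float32 input,
+ * PACKING_SIZE is 32, so a 64-element gradient is packed into two 32-bit
+ * sign words followed by one float scale, i.e. 12 bytes instead of 256.
+ * When onebit scaling is enabled, the scale is the mean absolute value of
+ * the input; otherwise it is 1.0.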
+ * + * \param grad gradient tensor + * \param compressed compressed tensor + */ + tensor_t Compress(tensor_t grad) override; + + /*! + * \brief Decompress function + * + * unpack from byte array to FP tensor + * + * \param compressed compressed tensor + * \param decompressed decompressed tensor + */ + tensor_t Decompress(tensor_t compressed) override; + + /*! + * \brief help function for error feedback `UpdateError` + * + * \param corrected gradient corrected with error + * \param error error + * \param compressed compressed gradient + */ + void FastUpdateError(tensor_t error, tensor_t corrected, + tensor_t compressed) override; + + private: + template + tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len); + + template + tensor_t DecompressImpl(scalar_t* dst, const index_t* src, + size_t compressed_size); + + template + void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected, + const index_t* compressed, size_t compressed_size); + + private: + bool _use_scale; +}; +} // namespace compressor +} // namespace common +} // namespace byteps + +#endif // BYTEPS_COMPRESSOR_IMPL_ONEBIT_H \ No newline at end of file diff --git a/byteps/common/compressor/impl/randomk.cc b/byteps/common/compressor/impl/randomk.cc new file mode 100644 index 000000000..5634f7ef6 --- /dev/null +++ b/byteps/common/compressor/impl/randomk.cc @@ -0,0 +1,130 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// ============================================================================= + +#include + +#include "../compressor_registry.h" +#include "randomk.h" + +namespace byteps { +namespace common { +namespace compressor { +namespace { +CompressorRegistry::Register reg( + "randomk_compressor", + [](const kwargs_t& kwargs, size_t size, + DataType dtype) -> std::unique_ptr { + auto factor = HyperParamFinder(kwargs, "compressor_k", false, + [](float x) { return x > 0; }); + unsigned k; + if (factor < 1) { + k = static_cast(factor * size / getDataTypeLength(dtype)); + if (k == 0) k = 1; + } else { + k = static_cast(factor); + } + + auto seed = HyperParamFinder(kwargs, "seed", true, + [](unsigned x) { return x != 0; }); + + return std::unique_ptr( + new RandomkCompressor(size, dtype, k, seed)); + }); +} + +template +tensor_t RandomkCompressor::CompressImpl(index_t* dst, const scalar_t* src, + size_t len) { + static_assert(sizeof(index_t) == sizeof(scalar_t), + "index_t should be the same size as scalar_t"); + BPS_CHECK_LE(this->_k, len / 2); + using pair_t = std::pair; + auto ptr = reinterpret_cast(dst); + + for (size_t i = 0; i < this->_k; ++i) { + auto index = _rng.Randint(0, len); + ptr[i] = std::make_pair(index, src[index]); + } + + return {dst, this->_k * sizeof(pair_t)}; +} + +tensor_t RandomkCompressor::Compress(tensor_t grad) { + COMPRESS_IMPL_SWITCH(grad.dtype, CompressImpl, _buf.get(), grad.data, + grad.size); +} + +template +tensor_t RandomkCompressor::DecompressImpl(scalar_t* dst, const index_t* src, + size_t compressed_size) { + static_assert(sizeof(index_t) == sizeof(scalar_t), + "index_t should be the same size as scalar_t"); + using pair_t = std::pair; + + auto ptr = reinterpret_cast(src); + if ((void*)dst == (void*)src) { + auto buf = reinterpret_cast(_buf.get()); + std::memcpy(buf, ptr, compressed_size); + ptr = const_cast(buf); + } + + // reset to zeros + std::memset(dst, 0, _size); + size_t len = compressed_size / sizeof(pair_t); + for (size_t i = 0; i < len; ++i) { + auto& pair = ptr[i]; + dst[pair.first] = pair.second; + } + + return {dst, _size}; +} + +tensor_t RandomkCompressor::Decompress(tensor_t compressed) { +#ifdef BYTEPS_BUILDING_SERVER + auto dst = _buf.get(); +#else + auto dst = compressed.data; +#endif + DECOMPRESS_IMPL_SWITCH(_dtype, DecompressImpl, dst, compressed.data, + compressed.size); +} + +template +void RandomkCompressor::FastUpdateErrorImpl(scalar_t* error, + scalar_t* corrected, + const index_t* compressed, + size_t compressed_size) { + static_assert(sizeof(index_t) == sizeof(scalar_t), + "index_t should be the same size as scalar_t"); + using pair_t = std::pair; + + std::memcpy(error, corrected, _size); + + auto ptr = reinterpret_cast(compressed); + for (size_t i = 0; i < this->_k; ++i) { + auto& pair = ptr[i]; + error[pair.first] = 0; + } +} + +void RandomkCompressor::FastUpdateError(tensor_t error, tensor_t corrected, + tensor_t compressed) { + FAST_UPDATE_ERROR_IMPL_SWITCH(_dtype, FastUpdateErrorImpl, error.data, + corrected.data, compressed.data, + compressed.size); +} +} // namespace compressor +} // namespace common +} // namespace byteps \ No newline at end of file diff --git a/byteps/common/compressor/impl/randomk.h b/byteps/common/compressor/impl/randomk.h new file mode 100644 index 000000000..b657cc5d3 --- /dev/null +++ b/byteps/common/compressor/impl/randomk.h @@ -0,0 +1,104 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#ifndef BYTEPS_COMPRESSOR_IMPL_RANDOMK_H +#define BYTEPS_COMPRESSOR_IMPL_RANDOMK_H + +#include + +#include "../compressor.h" +#include "../utils.h" + +namespace byteps { +namespace common { +namespace compressor { + +/*! + * \brief RandomK Compressor + * + * paper: Sparsified SGD with Memory + * https://arxiv.org/pdf/1809.07599.pdf + * + * randomly sending k entries of the stochastic gradient + * + * \note it is a stochastic algorithm. If you want to have deterministic + * behavior, please set a seed in the configurations. + */ +class RandomkCompressor : public Compressor { + public: + RandomkCompressor(size_t size, DataType dtype, unsigned int k, unsigned int seed = 0) + : Compressor(size, dtype), _k(k) { + if (seed != 0) { + BPS_LOG(INFO) << "SET SEED = " << seed; + _rng.set_seed(seed); + } + }; + virtual ~RandomkCompressor() = default; + + /*! + * \brief Compress function + * + * randomly select k entries and corresponding indices + * + * \param grad gradient tensor + * \param compressed compressed tensor + */ + tensor_t Compress(tensor_t grad) override; + + /*! + * \brief Decompress function + * + * fill a zero tensor with topk entries and corresponding indices + * + * \param compressed compressed tensor + * \param decompressed decompressed tensor + */ + tensor_t Decompress(tensor_t compressed) override; + + /*! + * \brief faster version of `UpdateError` + * + * 1. e <- p (e is the error and p is the corrected gradient) + * 2. zero-fill e with selected k indices + * + * \param corrected gradient corrected with error + * \param error error + * \param compressed compressed gradient + */ + void FastUpdateError(tensor_t error, tensor_t corrected, + tensor_t compressed) override; + + private: + template + tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len); + + template + tensor_t DecompressImpl(scalar_t* dst, const index_t* src, + size_t compressed_size); + + template + void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected, + const index_t* compressed, size_t compressed_size); + + private: + unsigned int _k; + std::random_device _rd; + XorShift128PlusBitShifterRNG _rng; +}; +} // namespace compressor +} // namespace common +} // namespace byteps + +#endif // BYTEPS_COMPRESSOR_IMPL_RANDOMK_H \ No newline at end of file diff --git a/byteps/common/compressor/impl/topk.cc b/byteps/common/compressor/impl/topk.cc new file mode 100644 index 000000000..e24f287f5 --- /dev/null +++ b/byteps/common/compressor/impl/topk.cc @@ -0,0 +1,140 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#include +#include + +#include "../compressor_registry.h" +#include "topk.h" + +namespace byteps { +namespace common { +namespace compressor { +namespace { +CompressorRegistry::Register reg( + "topk_compressor", + [](const kwargs_t& kwargs, size_t size, + DataType dtype) -> std::unique_ptr { + auto factor = HyperParamFinder(kwargs, "compressor_k", false, + [](float x) { return x > 0; }); + unsigned k; + if (factor < 1) { + k = static_cast(factor * size / getDataTypeLength(dtype)); + if (k == 0) k = 1; + } else { + k = static_cast(factor); + } + return std::unique_ptr(new TopkCompressor(size, dtype, k)); + }); +} + +template +tensor_t TopkCompressor::CompressImpl(index_t* dst, const scalar_t* src, + size_t len) { + static_assert(sizeof(index_t) == sizeof(scalar_t), + "index_t should be the same size as scalar_t"); + BPS_CHECK_LE(this->_k, len / 2); + using pair_t = std::pair; + auto comp = [](const pair_t& lhs, const pair_t& rhs) { + return std::abs(lhs.second) > std::abs(rhs.second); + }; + + auto beg = reinterpret_cast(dst); + size_t size = 0; + for (size_t i = 0; i < len; ++i) { + if (i < this->_k) { + beg[size] = std::make_pair(i, src[i]); + size++; + std::push_heap(beg, beg + size, comp); + } else { + auto& top = *beg; + // note: compare absolute value + if (std::abs(src[i]) > std::abs(top.second)) { + std::pop_heap(beg, beg + size, comp); + beg[size - 1] = std::make_pair(i, src[i]); + std::push_heap(beg, beg + size, comp); + } + } + } + + return {dst, this->_k * sizeof(pair_t)}; +} + +tensor_t TopkCompressor::Compress(tensor_t grad) { + COMPRESS_IMPL_SWITCH(grad.dtype, CompressImpl, _buf.get(), grad.data, + grad.size); +} + +template +tensor_t TopkCompressor::DecompressImpl(scalar_t* dst, const index_t* src, + size_t compressed_size) { + static_assert(sizeof(index_t) == sizeof(scalar_t), + "index_t should be the same size as scalar_t"); + using pair_t = std::pair; + + auto ptr = reinterpret_cast(src); + if ((void*)dst == (void*)src) { + auto buf = reinterpret_cast(_buf.get()); + std::memcpy(buf, ptr, compressed_size); + ptr = const_cast(buf); + } + + // reset to zeros + std::memset(dst, 0, _size); + size_t len = compressed_size / sizeof(pair_t); + for (size_t i = 0; i < len; ++i) { + auto& pair = ptr[i]; + dst[pair.first] = pair.second; + } + + return {dst, _size}; +} + +tensor_t TopkCompressor::Decompress(tensor_t compressed) { +#ifdef BYTEPS_BUILDING_SERVER + auto dst = _buf.get(); +#else + auto dst = compressed.data; +#endif + DECOMPRESS_IMPL_SWITCH(_dtype, DecompressImpl, dst, compressed.data, + compressed.size); +} + +template +void TopkCompressor::FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected, + const index_t* compressed, + size_t compressed_size) { + static_assert(sizeof(index_t) == sizeof(scalar_t), + "index_t should be the same size as scalar_t"); + using pair_t = std::pair; + + std::memcpy(error, corrected, _size); + + auto ptr = reinterpret_cast(compressed); + for (size_t i = 0; i < this->_k; ++i) { + auto& pair = ptr[i]; + error[pair.first] = 0; + } +} + +void 
TopkCompressor::FastUpdateError(tensor_t error, tensor_t corrected, + tensor_t compressed) { + FAST_UPDATE_ERROR_IMPL_SWITCH(_dtype, FastUpdateErrorImpl, error.data, + corrected.data, compressed.data, + compressed.size); +} +} // namespace compressor +} // namespace common +} // namespace byteps \ No newline at end of file diff --git a/byteps/common/compressor/impl/topk.h b/byteps/common/compressor/impl/topk.h new file mode 100644 index 000000000..afb514efe --- /dev/null +++ b/byteps/common/compressor/impl/topk.h @@ -0,0 +1,94 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#ifndef BYTEPS_COMPRESSOR_IMPL_TOPK_H +#define BYTEPS_COMPRESSOR_IMPL_TOPK_H + +#include "../compressor.h" + +namespace byteps { +namespace common { +namespace compressor { + +/*! + * \brief TopK Compressor + * + * paper: Sparsified SGD with Memory + * https://arxiv.org/pdf/1809.07599.pdf + * + * sending the most significant entries of the stochastic gradient + * + */ +class TopkCompressor : public Compressor { + public: + TopkCompressor(size_t size, DataType dtype, unsigned int k) + : Compressor(size, dtype), _k(k){}; + virtual ~TopkCompressor() = default; + + /*! + * \brief Compress function + * + * select topk entries and corresponding indices + * + * \note compare with absolute values + * + * \param grad gradient tensor + * \param compressed compressed tensor + */ + tensor_t Compress(tensor_t grad) override; + + /*! + * \brief Decompress function + * + * fill a zero tensor with topk entries and corresponding indices + * + * \param compressed compressed tensor + * \param decompressed decompressed tensor + */ + tensor_t Decompress(tensor_t compressed) override; + + /*! + * \brief faster version of `UpdateError` + * + * 1. e <- p (e is the error and p is the corrected gradient) + * 2. zero-fill e with selected k indices + * + * \param corrected gradient corrected with error + * \param error error + * \param compressed compressed gradient + */ + void FastUpdateError(tensor_t error, tensor_t corrected, + tensor_t compressed) override; + + private: + template + tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len); + + template + tensor_t DecompressImpl(scalar_t* dst, const index_t* src, + size_t compressed_size); + + template + void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected, + const index_t* compressed, size_t compressed_size); + + private: + unsigned int _k; +}; +} // namespace compressor +} // namespace common +} // namespace byteps + +#endif // BYTEPS_COMPRESSOR_IMPL_TOPK_H \ No newline at end of file diff --git a/byteps/common/compressor/impl/vanilla_error_feedback.cc b/byteps/common/compressor/impl/vanilla_error_feedback.cc new file mode 100644 index 000000000..30d6e9d50 --- /dev/null +++ b/byteps/common/compressor/impl/vanilla_error_feedback.cc @@ -0,0 +1,68 @@ +// Copyright 2019 Amazon Inc. or its affiliates. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#include +#include +#include +#include + +#include "../compressor_registry.h" +#include "vanilla_error_feedback.h" + +namespace byteps { +namespace common { +namespace compressor { +namespace { +CompressorRegistry::Register reg( + "vanilla_ef", + [](const kwargs_t& kwargs, size_t size, + DataType dtype) -> std::unique_ptr { + // register cptr + auto kwargs_clone = kwargs; + kwargs_clone.erase("ef_type"); + auto cptr = CompressorRegistry::Create(kwargs_clone, size, dtype); + BPS_CHECK_NE(cptr, nullptr); + return std::unique_ptr( + new VanillaErrorFeedbackCompressor(size, dtype, std::move(cptr))); + }); +} + +VanillaErrorFeedbackCompressor::VanillaErrorFeedbackCompressor( + size_t size, DataType dtype, std::unique_ptr cptr) + : ErrorFeedback(size, dtype, std::move(cptr)) { + _fd = open("lr.s", O_RDONLY); + BPS_CHECK(_fd > 0) << "open lr.s failed, errno=" << strerror(errno); + void* ptr = mmap(0, 8, PROT_READ, MAP_SHARED, _fd, 0); + BPS_CHECK_NE(ptr, MAP_FAILED) << "mmap failed, errno=" << strerror(errno); + _mm = ptr; + _pre_lr = _cur_lr = *reinterpret_cast(_mm); +} + +VanillaErrorFeedbackCompressor::~VanillaErrorFeedbackCompressor() { + munmap(_mm, 8); + close(_fd); +} + +void VanillaErrorFeedbackCompressor::UpdateGradient(tensor_t grad) { + _cur_lr = *reinterpret_cast(_mm); + this->_cpu_reducer->sum(grad.data, _error.get(), grad.size, + static_cast(grad.dtype), + (_pre_lr / _cur_lr)); + _pre_lr = _cur_lr; +} + +} // namespace compressor +} // namespace common +} // namespace byteps \ No newline at end of file diff --git a/byteps/common/compressor/impl/vanilla_error_feedback.h b/byteps/common/compressor/impl/vanilla_error_feedback.h new file mode 100644 index 000000000..ebb049532 --- /dev/null +++ b/byteps/common/compressor/impl/vanilla_error_feedback.h @@ -0,0 +1,68 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#ifndef BYTEPS_COMPRESSOR_IMPL_VANILLA_ERROR_FEEDBACK_H +#define BYTEPS_COMPRESSOR_IMPL_VANILLA_ERROR_FEEDBACK_H + +#include "../error_feedback.h" + +namespace byteps { +namespace common { +namespace compressor { + +/*! 
+ * \brief Vanilla Error Feedback Compressor + * + * paper: Communication-efficient distributed blockwise momentum sgd with + * error-feedback + * https://arxiv.org/pdf/1905.10936.pdf + * + * each worker i: + * p_{t,i} <- g_{t,i} + \frac{\eta_{t-1}}{\eta_t} e_{t,i} + * c_{t,i} <- Compress(p_{t,i}) + * e_{t,i} <- p_{t,i} - c_{t,i} + * + * server: + * \tilde{p}_{t} <- \frac{1}{M} \sum_{i=1}^{M} c_{t,i} + * +\frac{\eta_{t-1}}{\eta_{t}} \tilde{e_t} \tilde{e}_{t+1} <- + * \tilde{p}_{t}-\tilde{c_t} + * + * Error-correction: error needs to be scaled with \frac{\eta_{t-1}}{\eta_t}. + */ +class VanillaErrorFeedbackCompressor : public ErrorFeedback { + public: + VanillaErrorFeedbackCompressor(size_t size, DataType dtype, + std::unique_ptr cptr); + virtual ~VanillaErrorFeedbackCompressor(); + + protected: + void UpdateGradient(tensor_t grad) override; + + private: + /*! + * \brief learning rate + * + * read from file each step + */ + double _pre_lr, _cur_lr; + + int _fd; + void* _mm; +}; +} // namespace compressor +} // namespace common +} // namespace byteps + +#endif // BYTEPS_COMPRESSOR_IMPL_VANILLA_ERROR_FEEDBACK_H \ No newline at end of file diff --git a/byteps/common/compressor/momentum.cc b/byteps/common/compressor/momentum.cc new file mode 100644 index 000000000..581858385 --- /dev/null +++ b/byteps/common/compressor/momentum.cc @@ -0,0 +1,40 @@ +// Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#include "momentum.h" + +namespace byteps { +namespace common { +namespace compressor { + +tensor_t Momentum::Compress(tensor_t grad) { + // 1. m_t = \mu * m_{t-1} + g_t + UpdateMom(grad); + + // 2. p_t = \mu m_t + g_t + UpdateGradient(grad); + + // 3. compress + return _cptr->Compress(grad); +} + +tensor_t Momentum::Decompress(tensor_t compressed) { + // directly forward to internal compressor + return _cptr->Decompress(compressed); +} + +} // namespace compressor +} // namespace common +} // namespace byteps \ No newline at end of file diff --git a/byteps/common/compressor/momentum.h b/byteps/common/compressor/momentum.h new file mode 100644 index 000000000..07fc2fada --- /dev/null +++ b/byteps/common/compressor/momentum.h @@ -0,0 +1,95 @@ +// Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
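// The error-feedback flow described above (correct the gradient with the scaled
// residual, compress, then store the new residual) can be hard to follow from the
// class split alone. Below is a minimal standalone sketch of that flow over plain
// float buffers; it is NOT the real ErrorFeedback/Compressor classes, and
// ToyCompressDecompress is a made-up stand-in for an arbitrary lossy compressor.
#include <cstddef>
#include <vector>

// Hypothetical lossy compressor: keep only the sign, scaled by the mean magnitude.
static void ToyCompressDecompress(const std::vector<float>& in,
                                  std::vector<float>& out) {
  float mean = 0.f;
  for (float v : in) mean += (v >= 0.f ? v : -v);
  mean /= in.size();
  out.resize(in.size());
  for (std::size_t i = 0; i < in.size(); ++i)
    out[i] = (in[i] >= 0.f ? mean : -mean);
}

// One error-feedback step:
//   p_t     = g_t + (eta_{t-1} / eta_t) * e_t   (UpdateGradient)
//   c_t     = Compress(p_t)                     (the wrapped compressor)
//   e_{t+1} = p_t - Decompress(c_t)             (UpdateError / FastUpdateError)
void ErrorFeedbackStep(std::vector<float>& grad, std::vector<float>& error,
                       float pre_lr, float cur_lr) {
  for (std::size_t i = 0; i < grad.size(); ++i)
    grad[i] += (pre_lr / cur_lr) * error[i];
  std::vector<float> reconstructed;
  ToyCompressDecompress(grad, reconstructed);    // compress + decompress fused for clarity
  for (std::size_t i = 0; i < grad.size(); ++i)
    error[i] = grad[i] - reconstructed[i];
}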
+// ============================================================================= + +#ifndef BYTEPS_COMPRESSOR_MOMENTUM_H +#define BYTEPS_COMPRESSOR_MOMENTUM_H + +#include "../cpu_reducer.h" +#include "compressor.h" + +namespace byteps { +namespace common { +namespace compressor { +/*! + * \brief Momentum + * + * Stochastic gradient descent with momentum + * + * \note + * The momentum is added to gradient before compression. This should not be used + * at the same time with the momentum implemented in the framework such as + * MXNet, Tensorflow or PyTorch etc. The key difference between the two is the + * position where they are added to the gradients. For this one, it is added + * before push_pull. But for framework's momentum, it is added after push_pull. + * + * \note + * The framework's momentum is disabled when using this momentum. User do not + * need to disable it manully. + * + * \sa Compressor, NesterovMomentumCompressor + */ +class Momentum : public Compressor { + public: + // momentum should be cleared to zeros + Momentum(size_t size, DataType dtype, std::unique_ptr cptr, + float mu) + : Compressor(size, dtype), + _mom(new byte_t[size]()), + _mu(mu), + _cpu_reducer(new CpuReducer(nullptr)), + _cptr(std::move(cptr)){}; + virtual ~Momentum() = default; + + virtual tensor_t Compress(tensor_t grad) final; + + virtual tensor_t Decompress(tensor_t compressed) final; + + protected: + /*! + * \brief Update momentum + * + * e.g. m_t = \mu * m_{t-1} + g_t + * + * \param grad refers to gradient + */ + virtual void UpdateMom(tensor_t grad) = 0; + + /*! + * \brief Update gradient with momentum + * + * e.g. g_t = \mu m_t + g_t + * + * \param grad refers to gradient which adds momentum in place. + */ + virtual void UpdateGradient(tensor_t grad) = 0; + + protected: + /*! \brief buffer of momentum */ + std::unique_ptr _mom; + + /*! \brief momentum factor */ + float _mu; + + std::unique_ptr _cpu_reducer; + + private: + /*! \brief compressor pointer */ + std::unique_ptr _cptr; +}; +} // namespace compressor +} // namespace common +} // namespace byteps + +#endif // BYTEPS_COMPRESSOR_MOMENTUM_H \ No newline at end of file diff --git a/byteps/common/compressor/utils.h b/byteps/common/compressor/utils.h new file mode 100644 index 000000000..64683fdc5 --- /dev/null +++ b/byteps/common/compressor/utils.h @@ -0,0 +1,251 @@ +// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#ifndef BYTEPS_COMPRESSOR_UTILS_H +#define BYTEPS_COMPRESSOR_UTILS_H + +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +namespace byteps { +namespace common { +namespace compressor { + +/*! 
+ * \brief serialize key-vals hyper-params for network transmission + * + * \param kwargs hyper-params + * \return std::string serialized data + */ +inline std::string Serialize(const kwargs_t& kwargs) { + std::ostringstream os; + os << kwargs.size(); + for (auto const& kwarg : kwargs) { + os << " " << kwarg.first << " " << kwarg.second; + } + return os.str(); +} + +/*! + * \brief deserialize serialized data into key-vals hyper-params + * + * \param content serialized data + * \return kwargs_t hyper-params + */ +inline kwargs_t Deserialize(const std::string& content) { + kwargs_t kwargs; + std::istringstream is(content); + size_t size = 0; + is >> size; + for (size_t i = 0; i < size; ++i) { + kwargs_t::key_type key; + kwargs_t::mapped_type val; + is >> key >> val; + kwargs[key] = val; + } + + return kwargs; +} + +/*! + * \brief random number generator based on xorshift128plus + * + * refer to https://en.wikipedia.org/wiki/Xorshift#xorshift+ + */ +class XorShift128PlusBitShifterRNG { + public: + XorShift128PlusBitShifterRNG() { + std::random_device rd; + _state = {rd(), rd()}; + } + + // uniform int among [low, high) + uint64_t Randint(uint64_t low, uint64_t high) { + return xorshift128p() % (high - low) + low; + }; + + // uniform [0, 1] + double Rand() { return double(xorshift128p()) / MAX; } + + // Bernoulli Distributation + bool Bernoulli(double p) { return xorshift128p() < p * MAX; } + + void set_seed(uint64_t seed) { _state = {seed, seed}; } + + private: + struct xorshift128p_state { + uint64_t a, b; + }; + + uint64_t xorshift128p() { + uint64_t t = _state.a; + uint64_t const s = _state.b; + _state.a = s; + t ^= t << 23; // a + t ^= t >> 17; // b + t ^= s ^ (s >> 26); // c + _state.b = t; + return t + s; + }; + + xorshift128p_state _state; + + static constexpr uint64_t MAX = std::numeric_limits::max(); +}; + +/*! + * \brief Bit Writer + * + */ +template +class BitWriter { + public: + explicit BitWriter(T* data) + : _dptr(data), _accum(0), _used_bits(0), _blocks(0) {} + void Put(bool x) { + _accum <<= 1; + _accum |= x; + + if (++_used_bits == PACKING_SIZE) { + _dptr[_blocks++] = _accum; + _used_bits = 0; + } + } + + void Flush() { + if (_used_bits > 0) { + size_t padding_size = PACKING_SIZE - _used_bits; + _accum <<= padding_size; + _dptr[_blocks] = _accum; + } + } + + size_t bits() const { return _blocks * PACKING_SIZE + _used_bits; } + size_t blocks() const { return std::ceil((float)bits() / PACKING_SIZE); } + + private: + static constexpr size_t PACKING_SIZE = sizeof(T) * 8; + T* _dptr; // allocated + T _accum; + size_t _used_bits; + size_t _blocks; +}; + +/*! 
+ * \brief Bit Reader + * + */ +template +class BitReader { + public: + explicit BitReader(const T* data) : _dptr(data), _used_bits(0), _blocks(0) {} + bool Get() { + if (_used_bits == 0) { + _accum = _dptr[_blocks++]; + _used_bits = PACKING_SIZE; + } + return _accum & (1 << --_used_bits); + } + + size_t bits() const { return _blocks * PACKING_SIZE - _used_bits; } + + private: + static constexpr size_t PACKING_SIZE = sizeof(T) * 8; + const T* _dptr; // allocated + size_t _used_bits; + size_t _blocks; + T _accum; +}; + +inline uint32_t RoundNextPow2(uint32_t v) { + v -= 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v += 1; + return v; +} + +template +void EliasDeltaEncode(BitWriter& bit_writer, unsigned long x) { + int len = 1 + std::floor(std::log2(x)); + int lenth_of_len = std::floor(std::log2(len)); + + for (int i = lenth_of_len; i > 0; --i) bit_writer.Put(0); + for (int i = lenth_of_len; i >= 0; --i) bit_writer.Put((len >> i) & 1); + for (int i = len - 2; i >= 0; i--) bit_writer.Put((x >> i) & 1); +} + +template +unsigned long EliasDeltaDecode(BitReader& bit_reader) { + unsigned long num = 1; + int len = 1; + int lenth_of_len = 0; + while (!bit_reader.Get()) lenth_of_len++; + for (int i = 0; i < lenth_of_len; i++) { + len <<= 1; + if (bit_reader.Get()) len |= 1; + } + for (int i = 0; i < len - 1; i++) { + num <<= 1; + if (bit_reader.Get()) num |= 1; + } + return num; +} + +template > +T HyperParamFinder(const kwargs_t& kwargs, std::string name, + bool optional = false, F&& check = [](T) { return true; }) { + static_assert(std::is_fundamental::value, + "custom type is not allow for HyperParamFinder"); + T value{T()}; + auto iter = kwargs.find(name); + if (iter == kwargs.end()) { + // necessary hp + if (optional == false) { + BPS_LOG(FATAL) << "Hyper-parameter '" << name + << "' is not found! Aborted."; + } + return value; + } else { + std::istringstream ss(iter->second); + if (std::is_same::value) { + ss >> std::boolalpha >> value; + } else { + ss >> value; + } + if (!check(value)) { + BPS_LOG(FATAL) << "Hyper-parameter '" << name << "' should not be " + << value << "! Aborted."; + } + } + + BPS_LOG(INFO) << "Register hyper-parameter '" << name << "'=" << value; + return value; +} +} // namespace compressor +} // namespace common +} // namespace byteps + +#endif // BYTEPS_COMPRESSOR_UTILS_H \ No newline at end of file diff --git a/byteps/common/core_loops.cc b/byteps/common/core_loops.cc index 8a6bc2f3e..4fd697b3c 100644 --- a/byteps/common/core_loops.cc +++ b/byteps/common/core_loops.cc @@ -13,11 +13,14 @@ // limitations under the License. 
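// The Elias-delta helpers above are used together with BitWriter/BitReader to pack
// gap-encoded indices into a compact bit stream. A small round-trip sketch, assuming
// utils.h and its logging dependencies are available on the include path (the path
// below is illustrative); note the encoder requires x >= 1:
#include <cassert>
#include <cstdint>
#include <vector>

#include "byteps/common/compressor/utils.h"

int main() {
  using namespace byteps::common::compressor;

  std::vector<uint32_t> buf(16, 0);         // backing storage for the bit stream
  BitWriter<uint32_t> writer(buf.data());
  std::vector<unsigned long> gaps = {1, 3, 7, 42, 1024};
  for (auto g : gaps) EliasDeltaEncode(writer, g);
  writer.Flush();                           // pad and store the last partial block

  BitReader<uint32_t> reader(buf.data());
  for (auto g : gaps) assert(EliasDeltaDecode(reader) == g);
  return 0;
}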
// ============================================================================= -#include "core_loops.h" #include + #include #include + #include "common.h" +#include "compressor/compressor.h" +#include "core_loops.h" #include "global.h" #include "logging.h" @@ -63,25 +66,27 @@ void FinishOrProceed(std::shared_ptr task) { } if (task->context->profile_flag) { - BPS_CHECK(task->context->part_comm_time[task->key][this_op].back()->dur == 0) - << " tensor: " << task->tensor_name - << " task->key:" << task->key - << " type:" << this_op - << " 'dur' has already been assigned:" << task->context->part_comm_time[task->key][this_op].back()->dur; + BPS_CHECK(task->context->part_comm_time[task->key][this_op].back()->dur == + 0) + << " tensor: " << task->tensor_name << " task->key:" << task->key + << " type:" << this_op << " 'dur' has already been assigned:" + << task->context->part_comm_time[task->key][this_op].back()->dur; auto now = std::chrono::system_clock::now(); auto duration = now.time_since_epoch(); auto us = std::chrono::duration_cast(duration); - auto _ts = task->context->part_comm_time[task->key][this_op].back()->start_t; - BPS_CHECK(task->context->part_comm_time.find(task->key) != task->context->part_comm_time.end()) - << " tensor: " << task->tensor_name - << " task->key:" << task->key - << " type:" << this_op; - BPS_CHECK(task->context->part_comm_time[task->key].find(this_op) != task->context->part_comm_time[task->key].end()) - << " tensor: " << task->tensor_name - << " task->key:" << task->key - << " type:" << this_op; + auto _ts = + task->context->part_comm_time[task->key][this_op].back()->start_t; + BPS_CHECK(task->context->part_comm_time.find(task->key) != + task->context->part_comm_time.end()) + << " tensor: " << task->tensor_name << " task->key:" << task->key + << " type:" << this_op; + BPS_CHECK(task->context->part_comm_time[task->key].find(this_op) != + task->context->part_comm_time[task->key].end()) + << " tensor: " << task->tensor_name << " task->key:" << task->key + << " type:" << this_op; - task->context->part_comm_time[task->key][this_op].back()->dur = (long long)(us.count()) - _ts; + task->context->part_comm_time[task->key][this_op].back()->dur = + (long long)(us.count()) - _ts; } // finish current QueueType of this task, erase current QueueType. 
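// The profiling bookkeeping above reduces to: record start_t as microseconds since
// epoch when a stage is entered, then store (now - start_t) as dur when the stage
// finishes. A condensed sketch of that pattern (the struct below is a simplified
// stand-in for BPSCommTime, not the real definition):
#include <chrono>

struct CommTimeSketch {
  long long start_t = 0;  // microseconds since epoch, set when the stage starts
  long long dur = 0;      // filled in when the stage finishes
};

static long long NowUs() {
  auto d = std::chrono::system_clock::now().time_since_epoch();
  return std::chrono::duration_cast<std::chrono::microseconds>(d).count();
}

void StageStart(CommTimeSketch& rec) { rec.start_t = NowUs(); }
void StageFinish(CommTimeSketch& rec) { rec.dur = NowUs() - rec.start_t; }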
@@ -97,7 +102,8 @@ void FinishOrProceed(std::shared_ptr task) { BPS_CHECK(task->counter_ptr) << task->tensor_name << " counter_ptr is null"; int v = task->counter_ptr.get()->fetch_add(1); if (v == (int)(task->total_partnum - 1)) { - // if meet this condition, that means all sub-tasks of this task have been done + // if meet this condition, that means all sub-tasks of this task have been + // done BPS_CHECK(task->tensor_name != ""); BPS_LOG(TRACE) << "Rank=" << BytePSGlobal::GetRank() << " finish processing tensor: " << task->tensor_name; @@ -105,11 +111,13 @@ void FinishOrProceed(std::shared_ptr task) { //* Add for profiling communication events if (task->context->profile_flag) { BPS_CHECK(task->context->comm_time.back()->dur == 0) - << " tensor: " << task->tensor_name - << " 'dur' has already been assigned:" << task->context->comm_time.back()->dur; + << " tensor: " << task->tensor_name + << " 'dur' has already been assigned:" + << task->context->comm_time.back()->dur; auto now = std::chrono::system_clock::now(); auto duration = now.time_since_epoch(); - auto us = std::chrono::duration_cast(duration); + auto us = + std::chrono::duration_cast(duration); auto _ts = task->context->comm_time.back()->start_t; task->context->comm_time.back()->dur = (long long)(us.count()) - _ts; } @@ -205,8 +213,7 @@ inline void PostNcclCalls( nccl_root = BytePSGlobal::GetReduceRootByKey(key); num_elem_per_gpu = 0; left_elem = len / unit_len; - BPS_LOG(TRACE) << "Reduce key=" << key - << " to root=" << nccl_root + BPS_LOG(TRACE) << "Reduce key=" << key << " to root=" << nccl_root << " rank=" << BytePSGlobal::GetLocalRank(); } @@ -416,8 +423,7 @@ bool RunCopyDevice2HostLoopOnce() { if (copy_len) { CUDA_CALL(cudaMemcpyAsync( - (void *)(cpubuff + copy_offset), - (const void *)(p + copy_offset), + (void *)(cpubuff + copy_offset), (const void *)(p + copy_offset), (size_t)copy_len, (cudaMemcpyKind)cudaMemcpyDeviceToHost, (cudaStream_t)*copy_d2h_Stream)); CUDA_CALL(cudaStreamSynchronize(*copy_d2h_Stream)); @@ -483,6 +489,46 @@ bool RunPcieReduceLoopOnce() { return true; } +bool RunCompressLoopOnce() { + QueueType this_op = COMPRESS; + auto q = BytePSGlobal::GetScheduledQueue(this_op); + auto task = q->getTask(); + if (task) { + BPS_CHECK(BytePSGlobal::IsRootDevice()) + << "only root device should enter COMPRESS loop"; + BPS_CHECK(task->compressor != nullptr); + BPS_CHECK(task->compressed == nullptr); + + // spawn + BytePSGlobal::GetThreadPool()->enqueue([task]() { + char *data = const_cast(static_cast(task->cpubuff) + + task->offset); + int len = task->len; + int dtype = task->tensor->dtype(); + compressor::tensor_t grad(data, len, dtype); + auto compressed = task->compressor->Compress(grad); + BPS_CHECK_LE(compressed.size, len) + << "Compressor Implementation Error " + << ", key=" << task->key << ", src_len=" << len + << ", compressed_len=" << compressed.size; + + task->compressed = std::make_shared(compressed); + + // restore rt + auto &queue_list = task->queue_list; + BytePSGlobal::GetScheduledQueue(queue_list[1]) + ->reset(task->key, BytePSGlobal::GetLocalSize() - 1); + + FinishOrProceed(task); + }); + + } else { + std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } + + return true; +} + bool RunPushLoopOnce() { QueueType this_op = PUSH; auto q = BytePSGlobal::GetScheduledQueue(this_op); @@ -503,6 +549,14 @@ bool RunPushLoopOnce() { // get metadata const int dtype = task->tensor->dtype(); + // use compressed data/len + if (task->compressed) { + BPS_LOG(DEBUG) << "PUSH with gradient compression. 
key=" << task->key; + data = task->compressed->data; + len = task->compressed->size; + task->compressed = nullptr; + } + // false means not to delete data when SArray is deleted ps::SArray vals(data, len, false); @@ -557,6 +611,36 @@ bool RunPullLoopOnce() { return true; } +bool RunDecompressLoopOnce() { + QueueType this_op = DECOMPRESS; + auto q = BytePSGlobal::GetScheduledQueue(this_op); + auto task = q->getTask(); + if (task) { + BPS_CHECK(BytePSGlobal::IsRootDevice()) + << "only root device should enter DECOMPRESS loop"; + BPS_CHECK(task->compressor != nullptr); + + // spawn + BytePSGlobal::GetThreadPool()->enqueue([task]() { + char *data = const_cast(static_cast(task->cpubuff) + + task->offset); + auto &pskv = BytePSGlobal::EncodeDefaultKey(task->key, 0); + auto len = pskv.lens[0]; + int dtype = task->tensor->dtype(); + compressor::tensor_t compressed(data, len, dtype); + auto decompressed = task->compressor->Decompress(compressed); + BPS_LOG(DEBUG) << "PULL with gradient compression. key=" << task->key; + + FinishOrProceed(task); + }); + + } else { + std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } + + return true; +} + void CopyHost2Device(std::shared_ptr task) { auto copy_h2d_stream = BytePSGlobal::GetCopyHost2DeviceStream(); auto tensor = task->output; @@ -594,8 +678,7 @@ void CopyHost2Device(std::shared_ptr task) { if (copy_len) { CUDA_CALL(cudaMemcpyAsync( - (void *)(gpu_addr + copy_offset), - (const void *)(cpubuff + copy_offset), + (void *)(gpu_addr + copy_offset), (const void *)(cpubuff + copy_offset), (size_t)copy_len, (cudaMemcpyKind)cudaMemcpyHostToDevice, (cudaStream_t)*copy_h2d_stream)); CUDA_CALL(cudaStreamSynchronize(*copy_h2d_stream)); @@ -719,6 +802,12 @@ void CopyDevice2HostLoop() { BytePSGlobal::ReportThreadFinish(); } +void CompressLoop() { + while (RunCompressLoopOnce() && !BytePSGlobal::ShouldShutdown()) { + } + BytePSGlobal::ReportThreadFinish(); +} + void PushLoop() { while (RunPushLoopOnce() && !BytePSGlobal::ShouldShutdown()) { } @@ -731,6 +820,12 @@ void PullLoop() { BytePSGlobal::ReportThreadFinish(); } +void DecompressLoop() { + while (RunDecompressLoopOnce() && !BytePSGlobal::ShouldShutdown()) { + } + BytePSGlobal::ReportThreadFinish(); +} + void RootCopyHost2DeviceLoop() { CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); while (RunRootCopyHost2DeviceLoopOnce() && !BytePSGlobal::ShouldShutdown()) { diff --git a/byteps/common/core_loops.h b/byteps/common/core_loops.h index 561eea17c..2437c3323 100644 --- a/byteps/common/core_loops.h +++ b/byteps/common/core_loops.h @@ -35,10 +35,14 @@ void SyncNcclLoop(); void CopyDevice2HostLoop(); +void CompressLoop(); + void PushLoop(); void PullLoop(); +void DecompressLoop(); + void RootCopyHost2DeviceLoop(); void NonRootCopyListenLoop(); diff --git a/byteps/common/cpu_reducer.cc b/byteps/common/cpu_reducer.cc index e066d7384..03647961d 100644 --- a/byteps/common/cpu_reducer.cc +++ b/byteps/common/cpu_reducer.cc @@ -17,13 +17,14 @@ #include "global.h" #endif +#include + #include "cpu_reducer.h" namespace byteps { namespace common { CpuReducer::CpuReducer(std::shared_ptr comm) { - #ifndef BYTEPS_BUILDING_SERVER std::vector peers; auto pcie_size = BytePSGlobal::GetPcieSwitchSize(); @@ -33,17 +34,16 @@ CpuReducer::CpuReducer(std::shared_ptr comm) { } if (comm) { _comm = std::make_shared(comm, std::string("cpu"), peers); - } - else { + } else { _comm = nullptr; } #endif - if (getenv("BYTEPS_OMP_THREAD_PER_GPU")) { _num_threads = atoi(getenv("BYTEPS_OMP_THREAD_PER_GPU")); } else { _num_threads = 4; } 
+ return; } @@ -56,28 +56,28 @@ bool CpuReducer::isRoot() { } #endif -int CpuReducer::sum(void* dst, void* src, size_t len, DataType dtype) { +int CpuReducer::sum(void* dst, const void* src, size_t len, DataType dtype) { switch (dtype) { case BYTEPS_FLOAT32: - return _sum(reinterpret_cast(dst), reinterpret_cast(src), - len); + return _sum(reinterpret_cast(dst), + reinterpret_cast(src), len); case BYTEPS_FLOAT64: return _sum(reinterpret_cast(dst), - reinterpret_cast(src), len); + reinterpret_cast(src), len); case BYTEPS_FLOAT16: return _sum_float16(dst, src, len); case BYTEPS_UINT8: return _sum(reinterpret_cast(dst), - reinterpret_cast(src), len); + reinterpret_cast(src), len); case BYTEPS_INT32: return _sum(reinterpret_cast(dst), - reinterpret_cast(src), len); + reinterpret_cast(src), len); case BYTEPS_INT8: return _sum(reinterpret_cast(dst), - reinterpret_cast(src), len); + reinterpret_cast(src), len); case BYTEPS_INT64: return _sum(reinterpret_cast(dst), - reinterpret_cast(src), len); + reinterpret_cast(src), len); default: BPS_CHECK(0) << "Unsupported data type: " << dtype; } @@ -85,7 +85,7 @@ int CpuReducer::sum(void* dst, void* src, size_t len, DataType dtype) { } template -int CpuReducer::_sum(T* dst, T* src, size_t len) { +int CpuReducer::_sum(T* dst, const T* src, size_t len) { #pragma omp parallel for simd num_threads(_num_threads) for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) { dst[i] = dst[i] + src[i]; @@ -93,10 +93,10 @@ int CpuReducer::_sum(T* dst, T* src, size_t len) { return 0; } -int CpuReducer::_sum_float16(void* dst, void* src, size_t len) { +int CpuReducer::_sum_float16(void* dst, const void* src, size_t len) { // cast src and dst to your float16 type - auto in = (unsigned short*)src; - auto inout = (unsigned short*)dst; + auto in = reinterpret_cast(src); + auto inout = reinterpret_cast(dst); len = len / (size_t)2; #if __AVX__ && __F16C__ @@ -116,7 +116,7 @@ int CpuReducer::_sum_float16(void* dst, void* src, size_t len) { _mm_storeu_si128((__m128i*)(inout + i), new_inout_m128i); } } -#endif + for (size_t i = (len / 8) * 8; i < (size_t)len; ++i) { float in_float; float inout_float; @@ -125,38 +125,50 @@ int CpuReducer::_sum_float16(void* dst, void* src, size_t len) { inout_float += in_float; Float2HalfBits(&inout_float, inout + i); } +#else +#pragma omp parallel for simd num_threads(_num_threads) + for (size_t i = 0; i < (size_t)len; ++i) { + float in_float; + float inout_float; + HalfBits2Float(in + i, &in_float); + HalfBits2Float(inout + i, &inout_float); + inout_float += in_float; + Float2HalfBits(&inout_float, inout + i); + } +#endif return 0; } -int CpuReducer::sum(void* dst, void* src1, void* src2, size_t len, +int CpuReducer::sum(void* dst, const void* src1, const void* src2, size_t len, DataType dtype) { switch (dtype) { case BYTEPS_FLOAT32: - return _sum(reinterpret_cast(dst), reinterpret_cast(src1), - reinterpret_cast(src2), len); + return _sum(reinterpret_cast(dst), + reinterpret_cast(src1), + reinterpret_cast(src2), len); case BYTEPS_FLOAT64: return _sum(reinterpret_cast(dst), - reinterpret_cast(src1), - reinterpret_cast(src2), len); + reinterpret_cast(src1), + reinterpret_cast(src2), len); case BYTEPS_FLOAT16: return _sum_float16(dst, src1, src2, len); case BYTEPS_UINT8: return _sum(reinterpret_cast(dst), - reinterpret_cast(src1), - reinterpret_cast(src2), len); + reinterpret_cast(src1), + reinterpret_cast(src2), len); case BYTEPS_INT32: return _sum(reinterpret_cast(dst), - reinterpret_cast(src1), - reinterpret_cast(src2), len); + 
reinterpret_cast(src1), + reinterpret_cast(src2), len); case BYTEPS_INT8: return _sum(reinterpret_cast(dst), - reinterpret_cast(src1), - reinterpret_cast(src2), len); + reinterpret_cast(src1), + reinterpret_cast(src2), len); case BYTEPS_INT64: return _sum(reinterpret_cast(dst), - reinterpret_cast(src1), - reinterpret_cast(src2), len); + reinterpret_cast(src1), + reinterpret_cast(src2), len); default: BPS_CHECK(0) << "Unsupported data type: " << dtype; } @@ -164,7 +176,7 @@ int CpuReducer::sum(void* dst, void* src1, void* src2, size_t len, } template -int CpuReducer::_sum(T* dst, T* src1, T* src2, size_t len) { +int CpuReducer::_sum(T* dst, const T* src1, const T* src2, size_t len) { #pragma omp parallel for simd num_threads(_num_threads) for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) { dst[i] = src1[i] + src2[i]; @@ -172,11 +184,12 @@ int CpuReducer::_sum(T* dst, T* src1, T* src2, size_t len) { return 0; } -int CpuReducer::_sum_float16(void* dst, void* src1, void* src2, size_t len) { +int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2, + size_t len) { // cast src and dst to your float16 type - auto in1 = (unsigned short*)src1; - auto in2 = (unsigned short*)src2; - auto out = (unsigned short*)dst; + auto in1 = reinterpret_cast(src1); + auto in2 = reinterpret_cast(src2); + auto out = reinterpret_cast(dst); len = len / (size_t)2; #if __AVX__ && __F16C__ @@ -195,7 +208,7 @@ int CpuReducer::_sum_float16(void* dst, void* src1, void* src2, size_t len) { _mm_storeu_si128((__m128i*)(out + i), new_inout_m128i); } } -#endif + for (size_t i = (size_t)(len / 8) * 8; i < (size_t)len; ++i) { float in1_float; float in2_float; @@ -205,10 +218,212 @@ int CpuReducer::_sum_float16(void* dst, void* src1, void* src2, size_t len) { out_float = in1_float + in2_float; Float2HalfBits(&out_float, out + i); } +#else +#pragma omp parallel for simd num_threads(_num_threads) + for (size_t i = 0; i < (size_t)len; ++i) { + float in1_float; + float in2_float; + float out_float; + HalfBits2Float(in1 + i, &in1_float); + HalfBits2Float(in2 + i, &in2_float); + out_float = in1_float + in2_float; + Float2HalfBits(&out_float, out + i); + } +#endif + return 0; +} + +int CpuReducer::sum(void* dst, const void* src, size_t len, DataType dtype, + float alpha) { + switch (dtype) { + case BYTEPS_FLOAT32: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src), len, alpha); + case BYTEPS_FLOAT64: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src), len, alpha); + case BYTEPS_FLOAT16: + return _sum_float16(dst, src, len, alpha); + case BYTEPS_UINT8: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src), len, alpha); + case BYTEPS_INT32: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src), len, alpha); + case BYTEPS_INT8: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src), len, alpha); + case BYTEPS_INT64: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src), len, alpha); + default: + BPS_CHECK(0) << "Unsupported data type: " << dtype; + } + return 0; +} + +template +int CpuReducer::_sum(T* dst, const T* src, size_t len, float alpha) { +#pragma omp parallel for simd num_threads(_num_threads) + for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) { + dst[i] = dst[i] + alpha * src[i]; + } return 0; } -int CpuReducer::copy(void* dst, void* src, size_t len) { +int CpuReducer::_sum_float16(void* dst, const void* src, size_t len, + float alpha) { + // cast src and dst to your float16 type + auto in = reinterpret_cast(src); + auto inout = reinterpret_cast(dst); + len = 
len / (size_t)2; + +#if __AVX__ && __F16C__ + float mm256_alpha[8]; + for (int i = 0; i < 8; ++i) mm256_alpha[i] = alpha; + + if (is_avx_and_f16c()) { + __m256 __mm256_alpha = _mm256_loadu_ps(mm256_alpha); +#pragma omp parallel for simd num_threads(_num_threads) + for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) { + // convert in & inout to m256 + __m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in + i))); + __m256 inout_m256 = + _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(inout + i))); + + __m256 scaled_in_m256 = _mm256_mul_ps(in_m256, __mm256_alpha); + // add them together to new_inout_m256 + __m256 new_inout_m256 = _mm256_add_ps(scaled_in_m256, inout_m256); + + // convert back and store in inout + __m128i new_inout_m128i = _mm256_cvtps_ph(new_inout_m256, 0); + _mm_storeu_si128((__m128i*)(inout + i), new_inout_m128i); + } + } + + for (size_t i = (len / 8) * 8; i < (size_t)len; ++i) { + float in_float; + float inout_float; + HalfBits2Float(in + i, &in_float); + HalfBits2Float(inout + i, &inout_float); + inout_float += in_float * alpha; + Float2HalfBits(&inout_float, inout + i); + } +#else +#pragma omp parallel for simd num_threads(_num_threads) + for (size_t i = 0; i < (size_t)len; ++i) { + float in_float; + float inout_float; + HalfBits2Float(in + i, &in_float); + HalfBits2Float(inout + i, &inout_float); + inout_float += in_float * alpha; + Float2HalfBits(&inout_float, inout + i); + } +#endif + + return 0; +} + +int CpuReducer::sum(void* dst, const void* src1, const void* src2, size_t len, + DataType dtype, float alpha) { + switch (dtype) { + case BYTEPS_FLOAT32: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src1), + reinterpret_cast(src2), len, alpha); + case BYTEPS_FLOAT64: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src1), + reinterpret_cast(src2), len, alpha); + case BYTEPS_FLOAT16: + return _sum_float16(dst, src1, src2, len, alpha); + case BYTEPS_UINT8: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src1), + reinterpret_cast(src2), len, alpha); + case BYTEPS_INT32: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src1), + reinterpret_cast(src2), len, alpha); + case BYTEPS_INT8: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src1), + reinterpret_cast(src2), len, alpha); + case BYTEPS_INT64: + return _sum(reinterpret_cast(dst), + reinterpret_cast(src1), + reinterpret_cast(src2), len, alpha); + default: + BPS_CHECK(0) << "Unsupported data type: " << dtype; + } + return 0; +} + +template +int CpuReducer::_sum(T* dst, const T* src1, const T* src2, size_t len, + float alpha) { +#pragma omp parallel for simd num_threads(_num_threads) + for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) { + dst[i] = src1[i] + alpha * src2[i]; + } + return 0; +} + +int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2, + size_t len, float alpha) { + // cast src and dst to your float16 type + auto in1 = reinterpret_cast(src1); + auto in2 = reinterpret_cast(src2); + auto out = reinterpret_cast(dst); + len = len / (size_t)2; + +#if __AVX__ && __F16C__ + float mm256_alpha[8]; + for (int i = 0; i < 8; ++i) mm256_alpha[i] = alpha; + + if (is_avx_and_f16c()) { + __m256 __mm256_alpha = _mm256_loadu_ps(mm256_alpha); +#pragma omp parallel for simd num_threads(_num_threads) + for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) { + // convert in1 & in2 to m256 + __m256 in1_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in1 + i))); + __m256 in2_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in2 + i))); + + __m256 scaled_in2_m256 
= _mm256_mul_ps(in2_m256, __mm256_alpha); + // add them together to new_inout_m256 + __m256 new_out_m256 = _mm256_add_ps(in1_m256, scaled_in2_m256); + + // convert back and store in out + __m128i new_out_m128i = _mm256_cvtps_ph(new_out_m256, 0); + _mm_storeu_si128((__m128i*)(out + i), new_out_m128i); + } + } + + for (size_t i = (size_t)(len / 8) * 8; i < (size_t)len; ++i) { + float in1_float; + float in2_float; + float out_float; + HalfBits2Float(in1 + i, &in1_float); + HalfBits2Float(in2 + i, &in2_float); + out_float = in1_float + in2_float * alpha; + Float2HalfBits(&out_float, out + i); + } +#else +#pragma omp parallel for simd num_threads(_num_threads) + for (size_t i = 0; i < (size_t)len; ++i) { + float in1_float; + float in2_float; + float out_float; + HalfBits2Float(in1 + i, &in1_float); + HalfBits2Float(in2 + i, &in2_float); + out_float = in1_float + in2_float * alpha; + Float2HalfBits(&out_float, out + i); + } +#endif + return 0; +} + +int CpuReducer::copy(void* dst, const void* src, size_t len) { auto in = (float*)src; auto out = (float*)dst; #pragma omp parallel for simd num_threads(_num_threads) @@ -220,7 +435,5 @@ int CpuReducer::copy(void* dst, void* src, size_t len) { } return 0; } - - } // namespace common } // namespace byteps diff --git a/byteps/common/cpu_reducer.h b/byteps/common/cpu_reducer.h index a6e682c55..3bff0c680 100644 --- a/byteps/common/cpu_reducer.h +++ b/byteps/common/cpu_reducer.h @@ -21,8 +21,8 @@ #include #endif -#include #include +#include #include "common.h" #include "logging.h" @@ -45,21 +45,25 @@ class CpuReducer { BPS_LOG(DEBUG) << "Clear CpuReducer"; } - int sum(void* dst, void* src, size_t len, DataType dtype); - int sum(void* dst, void* src1, void* src2, size_t len, DataType dtype); - int copy(void* dst, void* src, size_t len); + int sum(void* dst, const void* src, size_t len, DataType dtype); + int sum(void* dst, const void* src1, const void* src2, size_t len, + DataType dtype); + + int sum(void* dst, const void* src, size_t len, DataType dtype, float alpha); + int sum(void* dst, const void* src1, const void* src2, size_t len, + DataType dtype, float alpha); + + int copy(void* dst, const void* src, size_t len); #ifndef BYTEPS_BUILDING_SERVER bool isRoot(); std::shared_ptr getComm() { return _comm; } #endif - - DataType GetDataType(int dtype) { - return static_cast(dtype); - } + DataType GetDataType(int dtype) { return static_cast(dtype); } private: + #if __AVX__ && __F16C__ // Query CPUID to determine AVX and F16C runtime support. 
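// The alpha overloads added above compute dst += alpha * src and
// dst = src1 + alpha * src2 element-wise, with len given in bytes as in the rest of
// CpuReducer (this is what the error-feedback compressor uses to scale the residual
// by eta_{t-1}/eta_t). A minimal scalar reference for the float32 case, useful as a
// mental model or a test oracle; it is not the OpenMP/AVX implementation itself:
#include <cstddef>

void SumScaledInPlace(float* dst, const float* src, std::size_t len_bytes,
                      float alpha) {
  for (std::size_t i = 0; i < len_bytes / sizeof(float); ++i)
    dst[i] += alpha * src[i];
}

void SumScaled(float* dst, const float* src1, const float* src2,
               std::size_t len_bytes, float alpha) {
  for (std::size_t i = 0; i < len_bytes / sizeof(float); ++i)
    dst[i] = src1[i] + alpha * src2[i];
}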
bool is_avx_and_f16c() { @@ -76,7 +80,7 @@ class CpuReducer { } #endif - inline void HalfBits2Float(unsigned short* src, float* res) { + inline void HalfBits2Float(const unsigned short* src, float* res) { unsigned h = *src; int sign = ((h >> 15) & 1); int exp = ((h >> 10) & 0x1f); @@ -112,7 +116,7 @@ class CpuReducer { *res = *reinterpret_cast(&f); } - inline void Float2HalfBits(float* src, unsigned short* dest) { + inline void Float2HalfBits(const float* src, unsigned short* dest) { // software implementation rounds toward nearest even unsigned const& s = *reinterpret_cast(src); uint16_t sign = uint16_t((s >> 16) & 0x8000); @@ -175,19 +179,29 @@ class CpuReducer { } template - int _sum(T* dst, T* src, size_t len); + int _sum(T* dst, const T* src, size_t len); + template + int _sum(T* dst, const T* src1, const T* src2, size_t len); + + int _sum_float16(void* dst, const void* src, size_t len); + int _sum_float16(void* dst, const void* src1, const void* src2, size_t len); + + template + int _sum(T* dst, const T* src, size_t len, float alpha); template - int _sum(T* dst, T* src1, T* src2, size_t len); + int _sum(T* dst, const T* src1, const T* src2, size_t len, float alpha); - int _sum_float16(void* dst, void* src, size_t len); - int _sum_float16(void* dst, void* src1, void* src2, size_t len); + int _sum_float16(void* dst, const void* src, size_t len, float alpha); + int _sum_float16(void* dst, const void* src1, const void* src2, size_t len, + float alpha); float _convert_half_to_full_precision(uint16_t h); uint16_t _convert_full_to_half_precision(float f); std::shared_ptr _comm; int _num_threads; + size_t _single_thread_threshold; }; } // namespace common diff --git a/byteps/common/global.cc b/byteps/common/global.cc index d9ade8f37..73ddf8444 100644 --- a/byteps/common/global.cc +++ b/byteps/common/global.cc @@ -13,11 +13,14 @@ // limitations under the License. // ============================================================================= -#include "global.h" #include #include + #include +#include "compressor/compressor.h" +#include "global.h" + namespace byteps { namespace common { @@ -37,6 +40,7 @@ bool BytePSGlobal::_is_root_device; bool BytePSGlobal::_is_distributed_job; bool BytePSGlobal::_is_cross_pcie_switch; uint32_t BytePSGlobal::_partition_bytes = 4096000; +uint32_t BytePSGlobal::_min_compress_bytes = (1 << 16); int BytePSGlobal::_is_trace = 0; int BytePSGlobal::_start_step = 10; @@ -77,6 +81,7 @@ cudaStream_t* BytePSGlobal::_copy_device2host_stream = NULL; cudaStream_t* BytePSGlobal::_copy_host2device_stream = NULL; std::shared_ptr BytePSGlobal::_nccl_manager; std::shared_ptr BytePSGlobal::_cpu_reducer; +std::shared_ptr BytePSGlobal::_thread_pool; std::hash BytePSGlobal::_built_in_hash_fn; unsigned int BytePSGlobal::_built_in_hash_coefficient; @@ -106,10 +111,17 @@ void BytePSGlobal::Init() { } // Set the profiling-related variables - _is_trace = getenv("BYTEPS_TRACE_ON") ? atoi(getenv("BYTEPS_TRACE_ON")) : _is_trace; - _start_step = getenv("BYTEPS_TRACE_START_STEP") ? atoi(getenv("BYTEPS_TRACE_START_STEP")) : _start_step; - _end_step = getenv("BYTEPS_TRACE_END_STEP") ? atoi(getenv("BYTEPS_TRACE_END_STEP")) : _end_step; - _trace_dir = getenv("BYTEPS_TRACE_DIR") ? std::string(getenv("BYTEPS_TRACE_DIR")) : "./trace"; + _is_trace = + getenv("BYTEPS_TRACE_ON") ? atoi(getenv("BYTEPS_TRACE_ON")) : _is_trace; + _start_step = getenv("BYTEPS_TRACE_START_STEP") + ? atoi(getenv("BYTEPS_TRACE_START_STEP")) + : _start_step; + _end_step = getenv("BYTEPS_TRACE_END_STEP") + ? 
atoi(getenv("BYTEPS_TRACE_END_STEP")) + : _end_step; + _trace_dir = getenv("BYTEPS_TRACE_DIR") + ? std::string(getenv("BYTEPS_TRACE_DIR")) + : "./trace"; _basic_comm = std::make_shared(); @@ -122,10 +134,14 @@ void BytePSGlobal::Init() { if (getenv("BYTEPS_PARTITION_BYTES")) { _partition_bytes = atoi(getenv("BYTEPS_PARTITION_BYTES")); } + if (getenv("BYTEPS_MIN_COMPRESS_BYTES")) { + _min_compress_bytes = atoi(getenv("BYTEPS_MIN_COMPRESS_BYTES")); + } _pagesize = sysconf(_SC_PAGESIZE); BPS_CHECK_GT(_pagesize, 0); _partition_bytes = RoundUp(_partition_bytes, _local_size * _pagesize); - BPS_LOG(DEBUG) << "Partition size round up to " << _partition_bytes << " (bytes)"; + BPS_LOG(DEBUG) << "Partition size round up to " << _partition_bytes + << " (bytes)"; BPS_CHECK(getenv("DMLC_NUM_WORKER")) << "error: env DMLC_NUM_WORKER not set"; _num_worker = atoi(getenv("DMLC_NUM_WORKER")); @@ -140,15 +156,22 @@ void BytePSGlobal::Init() { << "error: launch distributed job, but env DMLC_NUM_SERVER not set"; // set hash function - _hash_knob = std::string(getenv("BYTEPS_KEY_HASH_FN") ? getenv("BYTEPS_KEY_HASH_FN") : "djb2"); - _mixed_mode = getenv("BYTEPS_ENABLE_MIXED_MODE") ? atoi(getenv("BYTEPS_ENABLE_MIXED_MODE")) : false; + _hash_knob = std::string( + getenv("BYTEPS_KEY_HASH_FN") ? getenv("BYTEPS_KEY_HASH_FN") : "djb2"); + _mixed_mode = getenv("BYTEPS_ENABLE_MIXED_MODE") + ? atoi(getenv("BYTEPS_ENABLE_MIXED_MODE")) + : false; if (_mixed_mode) { _hash_knob = std::string("mixed"); } BPS_LOG(DEBUG) << "Using key hash function type: " << _hash_knob; if (!_hash_knob.compare(std::string("built_in"))) { - _built_in_hash_coefficient = getenv("BYTEPS_BUILT_IN_HASH_COEF") ? atoi(getenv("BYTEPS_BUILT_IN_HASH_COEF")) : 1; - BPS_LOG(DEBUG) << "The built in hash coefficient is set to " << _built_in_hash_coefficient; + _built_in_hash_coefficient = + getenv("BYTEPS_BUILT_IN_HASH_COEF") + ? 
atoi(getenv("BYTEPS_BUILT_IN_HASH_COEF")) + : 1; + BPS_LOG(DEBUG) << "The built in hash coefficient is set to " + << _built_in_hash_coefficient; } // set server load counter @@ -188,6 +211,14 @@ void BytePSGlobal::Init() { _copy_table = new ReadyTable(1, "COPY"); } + if (_is_root_device) { + size_t pool_size = 4; + if (getenv("BYTEPS_THREADPOOL_SIZE")) { + pool_size = atoi(getenv("BYTEPS_THREADPOOL_SIZE")); + _thread_pool.reset(new ThreadPool(pool_size)); + } + } + // ReadyTable for cross-PCIe-switch reduce if (_is_cross_pcie_switch) { if (_cpu_reducer->isRoot()) { @@ -205,8 +236,8 @@ void BytePSGlobal::Init() { // Configure the reduce strategy if (getenv("BYTEPS_REDUCE_ROOTS")) { - BPS_CHECK(!_is_cross_pcie_switch) << - "BYTEPS_REDUCE_ROOTS cannot be used with BYTEPS_PCIE_SWITCH_SIZE."; + BPS_CHECK(!_is_cross_pcie_switch) + << "BYTEPS_REDUCE_ROOTS cannot be used with BYTEPS_PCIE_SWITCH_SIZE."; _is_using_reduce = true; auto roots_str = std::string(getenv("BYTEPS_REDUCE_ROOTS")); BPS_LOG(DEBUG) << "Setting roots for reduce:" << roots_str; @@ -253,14 +284,13 @@ ps::KVWorker* BytePSGlobal::GetOrInitPS() { // we reuse _init_mutex, because BytePS should have been inited std::lock_guard lock(_init_mutex); if (!_ps && IsDistributed() && - _my_role == - BytePSRole::LOCAL_ROOT) { // only the root needs networking - // init low-level ps implementation - _ps = new ps::KVWorker(0, 0); - ps::StartAsync(0, "byteps\0"); - if (BytePSGlobal::IsResuming() || !ps::Postoffice::Get()->is_recovery()) { - ps::Postoffice::Get()->Barrier( - 0, ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); + _my_role == BytePSRole::LOCAL_ROOT) { // only the root needs networking + // init low-level ps implementation + _ps = new ps::KVWorker(0, 0); + ps::StartAsync(0, "byteps\0"); + if (BytePSGlobal::IsResuming() || !ps::Postoffice::Get()->is_recovery()) { + ps::Postoffice::Get()->Barrier( + 0, ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); } } return _ps; @@ -382,7 +412,8 @@ BPSContext& BytePSGlobal::GetContextFromName(const std::string& name) { bool BytePSGlobal::IsTensorDeclared(const std::string& name) { std::lock_guard lock(_context_mutex); if (_name_to_cxt.find(name) == _name_to_cxt.end()) { - if (std::find(_declared_tensors.begin(), _declared_tensors.end(), name) == _declared_tensors.end()) { + if (std::find(_declared_tensors.begin(), _declared_tensors.end(), name) == + _declared_tensors.end()) { _declared_tensors.push_back(name); } _name_to_cxt[name].initialized = false; @@ -398,25 +429,34 @@ bool BytePSGlobal::IsTensorDeclared(const std::string& name) { } void BytePSGlobal::ReDeclareTensor() { - for (auto name: _declared_tensors) { + for (auto name : _declared_tensors) { BPS_LOG(DEBUG) << "Redeclare tensor " << name; BytePSGlobal::IsTensorDeclared(name); } } +void BytePSGlobal::RegisterCompressor( + const std::string& name, + std::unordered_map& kwargs) { + std::lock_guard lock(_context_mutex); + BPS_CHECK(_name_to_cxt.find(name) != _name_to_cxt.end()) + << name << " is not initialized"; + _name_to_cxt[name].kwargs = std::move(kwargs); +} + // Append for communication traces -void BytePSGlobal::SetProfileFlag(BytePSContext *ctxt) { +void BytePSGlobal::SetProfileFlag(BytePSContext* ctxt) { if (_is_trace == 1) { // Enable trace, check the start and end step BPS_CHECK(_start_step >= 1 && _end_step > _start_step) - << "BYTEPS_TRACE_START_STEP must be larger than 1, " - << "BYTEPS_TRACE_END_STEP must be larger than BYTEPS_TRACE_START_STEP."; - if(ctxt->step_cnt == _start_step-1){ + << "BYTEPS_TRACE_START_STEP 
must be larger than 1, " + << "BYTEPS_TRACE_END_STEP must be larger than BYTEPS_TRACE_START_STEP."; + if (ctxt->step_cnt == _start_step - 1) { ctxt->profile_flag = true; BytePSGlobal::Who2beOutput(ctxt->tensor_name); - } else if(ctxt->step_cnt == _end_step){ + } else if (ctxt->step_cnt == _end_step) { ctxt->profile_flag = false; - if (BytePSGlobal::IsAllTensorOutput(ctxt->tensor_name)){ + if (BytePSGlobal::IsAllTensorOutput(ctxt->tensor_name)) { std::thread _t(BytePSGlobal::OutputTraces); _t.detach(); } @@ -426,22 +466,24 @@ void BytePSGlobal::SetProfileFlag(BytePSContext *ctxt) { } } -void BytePSGlobal::EmitTrace(std::ostream *os, const BPSCommTime *ret, BytePSContext *ctxt){ - std::string tid = (ret->key == -1) ? "total" : std::to_string(ret->key); - std::string para_name = "Comm." + ctxt->tensor_name; - std::string para_name_type = (ret->key == -1) ? para_name : para_name + "." + LogStrings[ret->type]; - (*os) << " {\n" - << " \"ph\": \"X\",\n" - << " \"args\": {\n" - << " \"name\": \"" << para_name << "\"\n" - << " },\n" - << " \"pid\": \"" << para_name << "\",\n" - << " \"name\": \"" << para_name_type << "\",\n" - << " \"ts\": " << ret->start_t << ",\n" - << " \"dur\": " << ret->dur << ",\n" - << " \"tid\": \"" << tid << "\",\n" - << " \"cat\": \"Comm\"\n" - << " }"; +void BytePSGlobal::EmitTrace(std::ostream* os, const BPSCommTime* ret, + BytePSContext* ctxt) { + std::string tid = (ret->key == -1) ? "total" : std::to_string(ret->key); + std::string para_name = "Comm." + ctxt->tensor_name; + std::string para_name_type = + (ret->key == -1) ? para_name : para_name + "." + LogStrings[ret->type]; + (*os) << " {\n" + << " \"ph\": \"X\",\n" + << " \"args\": {\n" + << " \"name\": \"" << para_name << "\"\n" + << " },\n" + << " \"pid\": \"" << para_name << "\",\n" + << " \"name\": \"" << para_name_type << "\",\n" + << " \"ts\": " << ret->start_t << ",\n" + << " \"dur\": " << ret->dur << ",\n" + << " \"tid\": \"" << tid << "\",\n" + << " \"cat\": \"Comm\"\n" + << " }"; } void BytePSGlobal::Who2beOutput(const std::string& name) { @@ -454,50 +496,61 @@ void BytePSGlobal::Who2beOutput(const std::string& name) { bool BytePSGlobal::IsAllTensorOutput(const std::string& name) { std::lock_guard lock(_context_mutex); - BPS_CHECK(_name2end.find(name) != _name2end.end()) << "Output tensor must been registered to recorder first"; - // _output_counter decreases by 1 to confirm the arrival of this tensor + BPS_CHECK(_name2end.find(name) != _name2end.end()) + << "Output tensor must been registered to recorder first"; + // _output_counter decreases by 1 to confirm the arrival of this tensro _output_counter -= 1; - if (_output_counter == 0) return true; - else return false; + if (_output_counter == 0) + return true; + else + return false; } -void BytePSGlobal::OutputTraces(){ +void BytePSGlobal::OutputTraces() { // Asynchronously output communication traces - auto trace_path = _trace_dir + "/" + std::to_string(_local_rank) + "/comm.json"; + auto trace_path = + _trace_dir + "/" + std::to_string(_local_rank) + "/comm.json"; // Output these traces std::ofstream file; file.open(trace_path); file << "{" << std::endl; file << " \"traceEvents\": [" << std::endl; auto first = true; - for(std::unordered_map::iterator iter = _name2end.begin(); - iter != _name2end.end(); iter++){ - BPSContext *ctxt = &_name_to_cxt[iter->first]; + for (std::unordered_map::iterator iter = _name2end.begin(); + iter != _name2end.end(); iter++) { + BPSContext* ctxt = &_name_to_cxt[iter->first]; while (ctxt->comm_time.size() > 0) { - 
BPSCommTime *ret = ctxt->comm_time.front(); - if (!first) file << ",\n"; - else first = false; + BPSCommTime* ret = ctxt->comm_time.front(); + if (!first) + file << ",\n"; + else + first = false; BytePSGlobal::EmitTrace(&file, ret, ctxt); ctxt->comm_time.pop(); } - while (!ctxt->part_comm_time.empty()){ + while (!ctxt->part_comm_time.empty()) { auto part_id = ctxt->part_comm_time.begin()->first; auto& type2part_comm_time = ctxt->part_comm_time.begin()->second; - BPS_CHECK(!type2part_comm_time.empty()) << "type2part_comm_time should not be empty"; - while (!type2part_comm_time.empty()){ + BPS_CHECK(!type2part_comm_time.empty()) + << "type2part_comm_time should not be empty"; + while (!type2part_comm_time.empty()) { auto type = type2part_comm_time.begin()->first; auto& _part_comm_time_queue = type2part_comm_time.begin()->second; - BPS_CHECK(_part_comm_time_queue.size() > 0) << "_part_comm_time_queue should not be empty"; - while (_part_comm_time_queue.size() > 0){ - BPSCommTime *ret = _part_comm_time_queue.front(); - if (!first) file << ",\n"; - else first = false; + BPS_CHECK(_part_comm_time_queue.size() > 0) + << "_part_comm_time_queue should not be empty"; + while (_part_comm_time_queue.size() > 0) { + BPSCommTime* ret = _part_comm_time_queue.front(); + if (!first) + file << ",\n"; + else + first = false; BytePSGlobal::EmitTrace(&file, ret, ctxt); _part_comm_time_queue.pop(); } type2part_comm_time.erase(type); } - // if the unordered_map becomes empty, all the traces of this part_id has been read, delete this part_id + // if the unordered_map becomes empty, all the traces of this part_id has + // been read, delete this part_id ctxt->part_comm_time.erase(part_id); } } @@ -506,22 +559,28 @@ void BytePSGlobal::OutputTraces(){ file << " \"displayTimeUnit\": \"ms\"" << std::endl; file << "}" << std::endl; // BPS_LOG(TRACE) << "Communication traces output done!"; - std::cout << "Local rank " << _local_rank << ": communication traces output done!" << std::endl; + std::cout << "Local rank " << _local_rank + << ": communication traces output done!" << std::endl; } uint64_t BytePSGlobal::Hash_Mixed_Mode(uint64_t key) { - const int num_server_total = ps::Postoffice::Get()->GetServerKeyRanges().size(); + const int num_server_total = + ps::Postoffice::Get()->GetServerKeyRanges().size(); const int num_worker_total = GetNumWorker(); - size_t num_server_noncolocate = num_server_total-num_worker_total; + size_t num_server_noncolocate = num_server_total - num_worker_total; size_t num_server_colocate = num_worker_total; // The bound should be larger than num_server_total // in order to cover each server, but it also // cannot be too large because it might cause unbalance - auto bound = getenv("BYTEPS_MIXED_MODE_BOUND") ? atoi(getenv("BYTEPS_MIXED_MODE_BOUND")) : 101; + auto bound = getenv("BYTEPS_MIXED_MODE_BOUND") + ? 
atoi(getenv("BYTEPS_MIXED_MODE_BOUND")) + : 101; BPS_CHECK_GE(bound, num_server_total); - auto ratio = (2.0 * num_server_noncolocate * (num_worker_total - 1)) / - ((num_worker_total) * (num_worker_total+num_server_noncolocate) - 2 * num_server_noncolocate); + auto ratio = + (2.0 * num_server_noncolocate * (num_worker_total - 1)) / + ((num_worker_total) * (num_worker_total + num_server_noncolocate) - + 2 * num_server_noncolocate); BPS_CHECK_LE(ratio, 1) << "number of (non-colocate servers) > number of (worker)" << ", which is not permitted in the mixed mode"; @@ -529,9 +588,9 @@ uint64_t BytePSGlobal::Hash_Mixed_Mode(uint64_t key) { auto threshold = ratio * bound; auto hash_res = Hash_DJB2(key) % bound; - if (hash_res < threshold) { // assign for non-colocate servers + if (hash_res < threshold) { // assign for non-colocate servers return Hash_DJB2(hash_res) % num_server_noncolocate; - } else { // assign for colocate servers + } else { // assign for colocate servers return num_server_noncolocate + (Hash_DJB2(hash_res) % num_server_colocate); } } @@ -548,7 +607,7 @@ uint64_t BytePSGlobal::Hash_DJB2(uint64_t key) { auto str = std::to_string(key).c_str(); uint64_t hash = 5381; int c; - while ((c = *str)) { // hash(i) = hash(i-1) * 33 ^ str[i] + while ((c = *str)) { // hash(i) = hash(i-1) * 33 ^ str[i] hash = ((hash << 5) + hash) + c; str++; } @@ -559,7 +618,7 @@ uint64_t BytePSGlobal::Hash_SDBM(uint64_t key) { auto str = std::to_string(key).c_str(); uint64_t hash = 0; int c; - while ((c = *str)) { // hash(i) = hash(i-1) * 65599 + str[i] + while ((c = *str)) { // hash(i) = hash(i-1) * 65599 + str[i] hash = c + (hash << 6) + (hash << 16) - hash; str++; } @@ -570,8 +629,10 @@ PSKV& BytePSGlobal::EncodeDefaultKey(uint64_t key, size_t len) { std::lock_guard lock(_encode_mutex); PSKV& pskv = ps_kv_[key]; if (!pskv.keys.empty()) { - BPS_CHECK_EQ(static_cast(pskv.size), len) - << "The value size cannot be changed " << len << ". 
Key is " << key; + if (len > 0 && pskv.size != len) { + pskv.size = len; + pskv.lens[0] = len; + } } else { auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); const int num_servers = krs.size(); @@ -600,8 +661,9 @@ PSKV& BytePSGlobal::EncodeDefaultKey(uint64_t key, size_t len) { _total_accumulated_len += len; BPS_LOG(DEBUG) << "key " << key << " assigned to server " << server << ", accumulated workload for this server is " - << _server_accumulated_len[server] - << " (" << (100.0 * _server_accumulated_len[server] / _total_accumulated_len) + << _server_accumulated_len[server] << " (" + << (100.0 * _server_accumulated_len[server] / + _total_accumulated_len) << "%)"; ps::Key ps_key = krs[server].begin() + key; @@ -629,7 +691,7 @@ cudaStream_t* BytePSGlobal::GetCopyHost2DeviceStream() { bool BytePSGlobal::IsAllThreadFinish(int total_thread_num) { int k = BytePSGlobal::joined_thread_cnt.fetch_add(0); - return (k==total_thread_num); + return (k == total_thread_num); }; } // namespace common diff --git a/byteps/common/global.h b/byteps/common/global.h index cf2061f24..29a6b516a 100644 --- a/byteps/common/global.h +++ b/byteps/common/global.h @@ -16,6 +16,8 @@ #ifndef BYTEPS_GLOBAL_H #define BYTEPS_GLOBAL_H +#include + #include #include #include @@ -24,7 +26,7 @@ #include #include #include -#include + #include "common.h" #include "communicator.h" #include "cpu_reducer.h" @@ -34,6 +36,7 @@ #include "ready_table.h" #include "scheduled_queue.h" #include "shared_memory.h" +#include "thread_pool.h" namespace byteps { namespace common { @@ -86,6 +89,8 @@ class BytePSGlobal { static bool IsResuming() { return _is_resuming; } static void SetResumingFlag(bool flag) {_is_resuming = flag; } + static void RegisterCompressor(const std::string& name, + std::unordered_map& kwargs); static ps::Key GetKeyFromName(const std::string& name); static BPSContext& GetContextFromName(const std::string& name); static uint32_t GetTensorCount(); @@ -96,6 +101,7 @@ class BytePSGlobal { static PSKV& EncodeDefaultKey(uint64_t key, size_t len); static uint32_t GetPartitionBound() { return _partition_bytes; } + static uint32_t GetMinCompressBound() { return _min_compress_bytes; } static cudaStream_t* GetCopyDevice2HostStream(); static cudaStream_t* GetCopyHost2DeviceStream(); @@ -120,8 +126,9 @@ class BytePSGlobal { static bool IsTensorSampled(uint64_t key) { return (key == _sample_key); } - static void SetProfileFlag(BPSContext *ctxt); - static void EmitTrace(std::ostream *os, const BPSCommTime *ret, BPSContext *ctxt); + static void SetProfileFlag(BPSContext* ctxt); + static void EmitTrace(std::ostream* os, const BPSCommTime* ret, + BPSContext* ctxt); static void OutputTraces(); static bool IsAllTensorOutput(const std::string& name); static void Who2beOutput(const std::string& name); @@ -131,6 +138,8 @@ class BytePSGlobal { static std::atomic_int joined_thread_cnt; static int RoundUpToPageSize(int x) { return RoundUp(x, _pagesize); } + static std::shared_ptr& GetThreadPool() { return _thread_pool; } + private: static std::mutex _init_mutex; static volatile bool _initialized; @@ -172,6 +181,7 @@ class BytePSGlobal { static cudaStream_t* _copy_host2device_stream; static uint32_t _partition_bytes; + static uint32_t _min_compress_bytes; // (key, ready_signal_count) pair, only valid for root device static ReadyTable* _reduce_table; @@ -182,6 +192,8 @@ class BytePSGlobal { // (key, ready_signal_count) pair, only valid for non-root device static ReadyTable* _copy_table; + static std::shared_ptr _thread_pool; + // for reduce 
strategies static bool _is_using_reduce; static std::vector _reduce_roots; @@ -192,7 +204,9 @@ class BytePSGlobal { // for debug sampling static uint64_t _sample_key; - static int AlignTo(int input, int alignment) { return input / alignment * alignment; } + static int AlignTo(int input, int alignment) { + return input / alignment * alignment; + } static int _pagesize; static int DivUp(int x, int y) { return (x + y - 1) / y; } diff --git a/byteps/common/half.h b/byteps/common/half.h new file mode 100644 index 000000000..03bde0027 --- /dev/null +++ b/byteps/common/half.h @@ -0,0 +1,425 @@ +// Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved. +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2015 by Contributors + * \file half.h + * \brief definition of half (float16) type. + * + * \author Junyuan Xie + */ +#ifndef MSHADOW_HALF_H_ +#define MSHADOW_HALF_H_ + +// from mashadow/base.h +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +//! \cond Doxygen_Suppress +typedef signed char int8_t; +typedef __int16 int16_t; +typedef __int32 int32_t; +typedef __int64 int64_t; +typedef unsigned char uint8_t; +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +//! \endcond +#else +#include +#endif + +#if defined(_MSC_VER) +#define MSHADOW_ALIGNED(x) __declspec(align(x)) +#else +#define MSHADOW_ALIGNED(x) __attribute__ ((aligned(x))) +#endif + +// -------------------------------- +// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code +#ifdef MSHADOW_XINLINE + #error "MSHADOW_XINLINE must not be defined" +#endif +#ifdef _MSC_VER +#define MSHADOW_FORCE_INLINE __forceinline +#pragma warning(disable : 4068) +#else +#define MSHADOW_FORCE_INLINE inline __attribute__((always_inline)) +#endif +#ifdef __CUDACC__ + #define MSHADOW_XINLINE MSHADOW_FORCE_INLINE __device__ __host__ +#else + #define MSHADOW_XINLINE MSHADOW_FORCE_INLINE +#endif +// end mashadow/base.h + +#if MSHADOW_USE_F16C + #include +#endif // MSHADOW_USE_F16C + +// This flag dictates rounding for the float2half() routine only (used generally on Windows), +// not the f16c lib or cuda v7.5 (or later) behavior which is fixed at round-to-nearest-even. +#ifndef MSHADOW_HALF_ROUND_TO_NEAREST +#define MSHADOW_HALF_ROUND_TO_NEAREST 1 +#endif + +#if (MSHADOW_USE_CUDA && CUDA_VERSION >= 7050) + #define MSHADOW_CUDA_HALF 1 + #include + #if defined(__CUDA_ARCH__) + /*! 
\brief __half2float_warp */ + __host__ __device__ float __half2float_warp(const volatile __half& h) { /* NOLINT(*) */ + __half val; +#if CUDA_VERSION >= 9000 + val = const_cast<__half&>(h); +#else + val.x = h.x; +#endif + return __half2float(val); + } + #endif +#else + #define MSHADOW_CUDA_HALF 0 +#endif + +/*! \brief namespace for mshadow */ +namespace mshadow { +/* \brief name space for host/device portable half-precision floats */ +namespace half { +#define MSHADOW_HALF_OPERATOR(RTYPE, OP) \ + MSHADOW_XINLINE RTYPE operator OP (half_t a, half_t b) { \ + return RTYPE(float(a) OP float(b)); /* NOLINT(*) */ \ + } \ + template \ + MSHADOW_XINLINE RTYPE operator OP (half_t a, T b) { \ + return RTYPE(float(a) OP float(b)); /* NOLINT(*) */ \ + } \ + template \ + MSHADOW_XINLINE RTYPE operator OP (T a, half_t b) { \ + return RTYPE(float(a) OP float(b)); /* NOLINT(*) */ \ + } + +#define MSHADOW_HALF_ASSIGNOP(AOP, OP) \ + template \ + MSHADOW_XINLINE half_t operator AOP (const T& a) { \ + return *this = half_t(float(*this) OP float(a)); /* NOLINT(*)*/ \ + } \ + template \ + MSHADOW_XINLINE half_t operator AOP (const volatile T& a) volatile { \ + return *this = half_t(float(*this) OP float(a)); /* NOLINT(*)*/ \ + } + +#if (MSHADOW_CUDA_HALF && defined(__CUDA_ARCH__)) +#define MSHADOW_HALF_CONVERSIONOP(T) \ + MSHADOW_XINLINE operator T() const { \ + return T(__half2float(cuhalf_)); /* NOLINT(*)*/ \ + } \ + MSHADOW_XINLINE operator T() const volatile { \ + return T(__half2float_warp(cuhalf_)); /* NOLINT(*)*/ \ + } +#elif(MSHADOW_USE_F16C) +#define MSHADOW_HALF_CONVERSIONOP(T) \ + MSHADOW_XINLINE operator T() const { \ + return T(_cvtsh_ss(half_)); /* NOLINT(*)*/ \ + } \ + MSHADOW_XINLINE operator T() const volatile { \ + return T(_cvtsh_ss(half_)); /* NOLINT(*)*/ \ + } +#else +#define MSHADOW_HALF_CONVERSIONOP(T) \ + MSHADOW_XINLINE operator T() const { \ + return T(half2float(half_)); /* NOLINT(*)*/ \ + } \ + MSHADOW_XINLINE operator T() const volatile { \ + return T(half2float(half_)); /* NOLINT(*)*/ \ + } +#endif // (MSHADOW_CUDA_HALF && defined(__CUDA_ARCH__)) + +class MSHADOW_ALIGNED(2) half_t { + public: + union { + uint16_t half_; +#if MSHADOW_CUDA_HALF + __half cuhalf_; +#endif // MSHADOW_CUDA_HALF + }; + + static MSHADOW_XINLINE half_t Binary(uint16_t value) { + half_t res; + res.half_ = value; + return res; + } + + MSHADOW_XINLINE half_t() {} + +#if MSHADOW_CUDA_HALF + MSHADOW_XINLINE explicit half_t(const __half& value) { + cuhalf_ = value; + } +#endif // MSHADOW_CUDA_HALF + + MSHADOW_XINLINE half_t(const float& value) { constructor(value); } + MSHADOW_XINLINE explicit half_t(const double& value) { constructor(value); } + MSHADOW_XINLINE explicit half_t(const int8_t& value) { constructor(value); } + MSHADOW_XINLINE explicit half_t(const uint8_t& value) { constructor(value); } + MSHADOW_XINLINE explicit half_t(const int32_t& value) { constructor(value); } + MSHADOW_XINLINE explicit half_t(const uint32_t& value) { constructor(value); } + MSHADOW_XINLINE explicit half_t(const int64_t& value) { constructor(value); } + MSHADOW_XINLINE explicit half_t(const uint64_t& value) { constructor(value); } + + MSHADOW_HALF_CONVERSIONOP(float) + + MSHADOW_HALF_ASSIGNOP(+=, +) + MSHADOW_HALF_ASSIGNOP(-=, -) + MSHADOW_HALF_ASSIGNOP(*=, *) + MSHADOW_HALF_ASSIGNOP(/=, /) + + MSHADOW_XINLINE half_t operator+() { + return *this; + } + + MSHADOW_XINLINE half_t operator-() { + return half_t(-float(*this)); // NOLINT(*) + } + + MSHADOW_XINLINE half_t operator=(const half_t& a) { + half_ = a.half_; + return a; + } 
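+  // Usage note: assignment from any arithmetic type funnels through the
+  // templated operator= below and the float constructor, e.g.
+  //   half_t h; h = 0.25;  // stores float2half(0.25f), or the F16C/CUDA
+  //                        // intrinsic result when those paths are enabled.
+  // Arithmetic on half_t is carried out in float via the implicit float
+  // conversion declared above and the MSHADOW_HALF_OPERATOR overloads at the
+  // end of this file, then narrowed back on assignment.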
+ + template + MSHADOW_XINLINE half_t operator=(const T& a) { + return *this = half_t(a); /* NOLINT(*)*/ + } + + MSHADOW_XINLINE half_t operator=(const half_t& a) volatile { + half_ = a.half_; + return a; + } + + template + MSHADOW_XINLINE half_t operator=(const T& a) volatile { + return *this = half_t(a); /* NOLINT(*)*/ + } + + private: + union Bits { + float f; + int32_t si; + uint32_t ui; + }; + + static int const fp16FractionBits = 10; + static int const fp32FractionBits = 23; + static int32_t const fp32FractionMask = ~(~0u << fp32FractionBits); // == 0x7fffff + static int32_t const fp32HiddenBit = 1 << fp32FractionBits; // == 0x800000 + static int const shift = fp32FractionBits - fp16FractionBits; // == 13 + static int const shiftSign = 16; + static int32_t const expAdjust = 127 - 15; // exp32-127 = exp16-15, so exp16 = exp32 - (127-15) + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FFFFF; // max flt32 that's a flt16 normal after >> by shift + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const maxZ = 0x33000000; // max fp32 number that's still rounded to zero in fp16 + static int32_t const signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + MSHADOW_XINLINE uint16_t float2half(const float& value) const { + Bits v; + v.f = value; + uint32_t sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) { + // Handle eventual zeros here to ensure vshift will not exceed 32 below. + v.ui = 0; + } else if (v.si < minN) { + // Handle denorms + uint32_t exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint32_t vshift = 1 - exp16; + uint32_t significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; + // The only time it's *not* OK to add 0x1000 (i.e. half the flt16 fraction lsb) is + // when the lsb of the flt16 fraction == 0 (so not rounding up to even) and the additional + // bits to the right of the lsb are 1000... (including flt32 significand bits + // that may be lost during the above vshift). The first term below will always + // be true for vshift >=12 (since even the 'hidden bit' has been shifted to the + // right of the '1' bit in 0x1000). And when vshift <= 11, both terms combine to make + // the proper test of the flt32 significand bits, including those lost during the vshift. +#if MSHADOW_HALF_ROUND_TO_NEAREST == 1 + // Rounding may increase the exponent to 1, but that's OK. + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 
0x1000 : 0; +#endif + } else if (v.si <= maxN) { + // Handle norms +#if MSHADOW_HALF_ROUND_TO_NEAREST == 1 + // Rounding may increase the exponent, possibly creating an inf, but that's OK. + v.ui += (v.ui & 0x3fff) != 0x1000 ? 0x1000 : 0; +#endif + v.ui -= expAdjust << fp32FractionBits; + } else if (v.si <= infN) { + v.si = infN; + } else if (v.si < nanN) { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + // Same as above routine, except for addition of volatile keyword + MSHADOW_XINLINE uint16_t float2half(const volatile float& value) const volatile { // NOLINT (*) + Bits v; + v.f = value; + uint32_t sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) { + // Handle eventual zeros here to ensure vshift will not exceed 32 below. + v.ui = 0; + } else if (v.si < minN) { + // Handle denorms + uint32_t exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint32_t vshift = 1 - exp16; + uint32_t significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; +#if MSHADOW_HALF_ROUND_TO_NEAREST == 1 + // Rounding may increase the exponent to 1, but that's OK. + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0; +#endif + } else if (v.si <= maxN) { + // Handle norms +#if MSHADOW_HALF_ROUND_TO_NEAREST == 1 + // Rounding may increase the exponent, possibly creating an inf, but that's OK. + v.ui += (v.ui & 0x3fff) != 0x1000 ? 0x1000 : 0; +#endif + v.ui -= expAdjust << fp32FractionBits; + } else if (v.si <= infN) { + v.si = infN; + } else if (v.si < nanN) { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + MSHADOW_XINLINE float half2float(const uint16_t& value) const { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + MSHADOW_XINLINE float half2float(const volatile uint16_t& value) const volatile { // NOLINT(*) + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + template + MSHADOW_XINLINE void constructor(const T& value) { +#if (MSHADOW_CUDA_HALF && defined(__CUDA_ARCH__)) + cuhalf_ = __float2half(float(value)); // NOLINT(*) +#elif(MSHADOW_USE_F16C) + half_ = _cvtss_sh(static_cast(value), 0); +#else /* !MSHADOW_CUDA_HALF && !MSHADOW_USE_F16C */ + half_ = float2half(float(value)); // NOLINT(*) +#endif /* !MSHADOW_CUDA_HALF && !MSHADOW_USE_F16C */ + } +}; + +/*! \brief overloaded + operator for half_t */ +MSHADOW_HALF_OPERATOR(half_t, +) +/*! \brief overloaded - operator for half_t */ +MSHADOW_HALF_OPERATOR(half_t, -) +/*! \brief overloaded * operator for half_t */ +MSHADOW_HALF_OPERATOR(half_t, *) +/*! \brief overloaded / operator for half_t */ +MSHADOW_HALF_OPERATOR(half_t, /) +/*! 
\brief overloaded > operator for half_t */ +MSHADOW_HALF_OPERATOR(bool, >) +/*! \brief overloaded < operator for half_t */ +MSHADOW_HALF_OPERATOR(bool, <) +/*! \brief overloaded >= operator for half_t */ +MSHADOW_HALF_OPERATOR(bool, >=) +/*! \brief overloaded <= operator for half_t */ +MSHADOW_HALF_OPERATOR(bool, <=) + +#define MSHADOW_HALF_MIN mshadow::half::half_t::Binary(0xFBFF); +#define MSHADOW_HALF_MAX mshadow::half::half_t::Binary(0x7BFF); +#define MSHADOW_HALF_SIGN_BIT 0x8000 +#define MSHADOW_HALF_EXPONENT_BITS 0x7c00 +} // namespace half +} // namespace mshadow +#endif // MSHADOW_HALF_H_ \ No newline at end of file diff --git a/byteps/common/operations.cc b/byteps/common/operations.cc index 8bef45b17..67c9e19e2 100644 --- a/byteps/common/operations.cc +++ b/byteps/common/operations.cc @@ -13,15 +13,20 @@ // limitations under the License. // ============================================================================= -#include "operations.h" #include +#include + #include #include #include -#include + +#include "compressor/compressor.h" +#include "compressor/compressor_registry.h" +#include "compressor/utils.h" #include "core_loops.h" #include "global.h" #include "logging.h" +#include "operations.h" namespace byteps { namespace common { @@ -43,6 +48,7 @@ void byteps_lazy_init() { if (BytePSGlobal::IsDistributed()) { if (BytePSGlobal::IsRootDevice()) { func.push_back(PullLoop); + func.push_back(DecompressLoop); } } @@ -58,6 +64,7 @@ void byteps_lazy_init() { // PUSH can be a real push in distributed mode // Or a dummy barrier in cross-pcie-switch mode func.push_back(PushLoop); + func.push_back(CompressLoop); func.push_back(RootCopyHost2DeviceLoop); } else { func.push_back(CoordinatePushLoop); @@ -88,8 +95,10 @@ void byteps_shutdown() { void byteps_resume(int num_workers, int num_servers) { // set ps, worker numbers - BPS_LOG(DEBUG) << "Resume worker number: " << num_workers << "DMLC_NUM_WORKER: " << getenv("DMLC_NUM_WORKER"); - BPS_LOG(DEBUG) << "Resume server number: " << num_workers << "DMLC_NUM_SERVER: " << getenv("DMLC_NUM_SERVER"); + BPS_LOG(DEBUG) << "Resume worker number: " << num_workers + << "DMLC_NUM_WORKER: " << getenv("DMLC_NUM_WORKER"); + BPS_LOG(DEBUG) << "Resume server number: " << num_workers + << "DMLC_NUM_SERVER: " << getenv("DMLC_NUM_SERVER"); BPS_LOG(DEBUG) << "Start resuming BytePS"; BytePSGlobal::SetResumingFlag(true); @@ -152,6 +161,9 @@ void PartitionTensor( e->len = ((size - accumulated) > bound) ? 
bound : (size - accumulated); e->counter_ptr = entry->counter_ptr; e->total_partnum = entry->total_partnum; + if (!entry->context->compressor_list.empty()) { + e->compressor = entry->context->compressor_list[i]; + } accumulated += e->len; ++i; @@ -176,6 +188,14 @@ Status EnqueueTensor(BPSContext &context, std::shared_ptr input, << name << " output tensor size does not match"; } + // add queue + if (BytePSGlobal::IsRootDevice() && !context.compressor_list.empty()) { + auto it = std::find(queue_list->begin(), queue_list->end(), PUSH); + it = queue_list->insert(it, COMPRESS); // before PUSH + it = std::find(queue_list->begin(), queue_list->end(), PULL); + queue_list->insert(it + 1, DECOMPRESS); // after PULL + } + std::shared_ptr e(new TensorTableEntry); e->tensor_name = name; e->context = &context; @@ -188,11 +208,14 @@ Status EnqueueTensor(BPSContext &context, std::shared_ptr input, e->callback = callback; if (device == CPU_DEVICE_ID) { - cudaError_t err = cudaHostRegister(const_cast(input->data()), input->size(), cudaHostRegisterMapped); + cudaError_t err = cudaHostRegister(const_cast(input->data()), + input->size(), cudaHostRegisterMapped); if (err == cudaSuccess) { - BPS_LOG(DEBUG) << name << " cpu address has changed, so it is pinned again."; + BPS_LOG(DEBUG) << name + << " cpu address has changed, so it is pinned again."; } - CUDA_CALL(cudaHostGetDevicePointer(&(context.gpu_ptr), const_cast(input->data()), 0)); + CUDA_CALL(cudaHostGetDevicePointer(&(context.gpu_ptr), + const_cast(input->data()), 0)); } e->cpubuff = context.cpubuff; @@ -302,7 +325,7 @@ void InitTensor(BPSContext &context, size_t size, int dtype, void *cpubuff) { BPS_LOG(DEBUG) << name << " is already on cpu, len=" << size; cudaError_t e = cudaHostRegister(cpubuff, size, cudaHostRegisterMapped); if (e != cudaSuccess) { - BPS_LOG(INFO) << cudaGetErrorString(e) + BPS_LOG(INFO) << cudaGetErrorString(e) << " (You may ignore this if your program continues)"; } CUDA_CALL(cudaHostGetDevicePointer(&(context.gpu_ptr), cpubuff, 0)); @@ -311,19 +334,27 @@ void InitTensor(BPSContext &context, size_t size, int dtype, void *cpubuff) { // We always allocate our own cpu buffer // use the first key in key_list as the index auto shm_obj = BytePSGlobal::GetSharedMemoryObj(); + + size_t aligned_size = Align(size, dtype); if (BytePSGlobal::IsCrossPcieSwitch()) { - context.pcie_cpubuff = shm_obj->openPcieSharedMemory(key_list[0], size); + context.pcie_cpubuff = + shm_obj->openPcieSharedMemory(key_list[0], aligned_size); context.cpubuff = context.pcie_cpubuff.back(); } else { context.cpubuff = shm_obj->openSharedMemory(std::string("BytePS_ShM_"), - key_list[0], size); + key_list[0], aligned_size); } - BPS_LOG(TRACE) << name << ": open shared memory size " << size; + BPS_LOG(TRACE) << name << ": open shared memory size " << aligned_size; // Init tensors with BytePS server char *data = const_cast(static_cast(context.cpubuff)); accumulated = 0; size_t i = 0; + BPS_LOG(INFO) << "tensor size=" << size; + // small tensor does not need to be compressed + if (size < BytePSGlobal::GetMinCompressBound()) { + context.kwargs.clear(); + } while (accumulated < size) { auto key = key_list[i]; int len = ((size - accumulated) > bound) ? 
bound : (size - accumulated); @@ -338,6 +369,13 @@ void InitTensor(BPSContext &context, size_t size, int dtype, void *cpubuff) { int cmd = GetCommandType(RequestType::kDefaultPushPull, dtype); // blocking push, also as a global barrirer ps->Wait(ps->ZPush(pskv.keys, vals, pskv.lens, cmd)); + + // register + if (!context.kwargs.empty()) { + auto compressor_ptr = compressor::CompressorRegistry::Create( + context.kwargs, Align(len, dtype), static_cast(dtype)); + context.compressor_list.push_back(std::move(compressor_ptr)); + } } accumulated += len; @@ -347,6 +385,21 @@ void InitTensor(BPSContext &context, size_t size, int dtype, void *cpubuff) { BPS_CHECK_EQ(accumulated, size); BPS_CHECK_EQ(i, key_list.size()); + // send to server + if (!context.kwargs.empty() && BytePSGlobal::IsDistributed() && + BytePSGlobal::IsRootDevice()) { + auto ps = BytePSGlobal::GetOrInitPS(); + auto content = compressor::Serialize(context.kwargs); + auto len = content.size(); + auto data = const_cast(content.c_str()); + for (auto key : key_list) { + auto &kv = BytePSGlobal::EncodeDefaultKey(key, len); + ps::SArray vals(data, len, false); + int cmd = GetCommandType(RequestType::kCompressedPushPull, dtype); + ps->Wait(ps->ZPush(kv.keys, vals, kv.lens, cmd)); + } + } + context.initialized = true; BPS_LOG(TRACE) << "Finish Init " << name << ", size=" << size @@ -361,6 +414,11 @@ bool IsTensorDeclared(const std::string &name) { return BytePSGlobal::IsTensorDeclared(name); } +void RegisterCompressor(const std::string &name, + std::unordered_map &kwargs) { + return BytePSGlobal::RegisterCompressor(name, kwargs); +} + std::shared_ptr> GetPushQueueList(int device) { auto queue_list = std::make_shared>(); diff --git a/byteps/common/operations.h b/byteps/common/operations.h index d9b707d2a..95e4178cb 100644 --- a/byteps/common/operations.h +++ b/byteps/common/operations.h @@ -72,6 +72,9 @@ void InitTensor(BPSContext &context, size_t size, int dtype, void *cpubuff); // Only call these in Framework plugins for the best performance bool IsTensorDeclared(const std::string &name); +void RegisterCompressor(const std::string &name, + std::unordered_map &kwargs); + BPSContext &GetContextFromName(const std::string &name); std::shared_ptr> GetPushQueueList(int device); diff --git a/byteps/common/ready_table.cc b/byteps/common/ready_table.cc index 07fcd0ef1..4c9b284e3 100644 --- a/byteps/common/ready_table.cc +++ b/byteps/common/ready_table.cc @@ -14,6 +14,7 @@ // ============================================================================= #include "ready_table.h" + #include "logging.h" namespace byteps { @@ -32,6 +33,11 @@ int ReadyTable::AddReadyCount(uint64_t key) { return ++_ready_table[key]; } +int ReadyTable::SetReadyCount(uint64_t key, int cnt) { + std::lock_guard lock(_table_mutex); + _ready_table[key] = cnt; +} + void ReadyTable::ClearReadyCount(uint64_t key) { std::lock_guard lock(_table_mutex); _ready_table[key] = 0; diff --git a/byteps/common/ready_table.h b/byteps/common/ready_table.h index 68d6ef1d9..7247c1466 100644 --- a/byteps/common/ready_table.h +++ b/byteps/common/ready_table.h @@ -32,6 +32,7 @@ class ReadyTable { // methods to access or modify the _ready_table bool IsKeyReady(uint64_t key); int AddReadyCount(uint64_t key); + int SetReadyCount(uint64_t key, int cnt); void ClearReadyCount(uint64_t key); private: diff --git a/byteps/common/scheduled_queue.cc b/byteps/common/scheduled_queue.cc index 322d47ffc..4b4a88e1e 100644 --- a/byteps/common/scheduled_queue.cc +++ b/byteps/common/scheduled_queue.cc @@ -14,7 +14,9 
@@ // ============================================================================= #include "scheduled_queue.h" + #include + #include "global.h" #include "logging.h" @@ -31,15 +33,16 @@ BytePSScheduledQueue::BytePSScheduledQueue(QueueType type) { size_t credit_in_partition = BytePSGlobal::GetNccl()->GetGroupSize() + 1; auto byteps_scheduling_credit = getenv("BYTEPS_SCHEDULING_CREDIT"); - credit_in_partition = byteps_scheduling_credit ? atoi(byteps_scheduling_credit) : 0; - if (!credit_in_partition) { // disable scheduling by default + credit_in_partition = + byteps_scheduling_credit ? atoi(byteps_scheduling_credit) : 0; + if (!credit_in_partition) { // disable scheduling by default _is_scheduled = false; } _qt = type; _credits = _is_scheduled - ? BytePSGlobal::GetPartitionBound() * credit_in_partition - : 34359738368; // 32GB, basically disabling credit control + ? BytePSGlobal::GetPartitionBound() * credit_in_partition + : 34359738368; // 32GB, basically disabling credit control _rt = nullptr; switch (_qt) { @@ -55,6 +58,7 @@ BytePSScheduledQueue::BytePSScheduledQueue(QueueType type) { } } break; + case COMPRESS: case PUSH: if (BytePSGlobal::IsRootDevice()) { _rt = BytePSGlobal::GetPushTable(); @@ -158,7 +162,6 @@ std::shared_ptr BytePSScheduledQueue::getTask() { return nullptr; } - std::shared_ptr BytePSScheduledQueue::getTask(uint64_t key) { BPS_CHECK(!_is_scheduled); std::lock_guard lock(_mutex); @@ -199,5 +202,12 @@ void BytePSScheduledQueue::reportFinish(int size) { return; } +void BytePSScheduledQueue::reset(uint64_t key, int cnt) { + std::lock_guard lock(_mutex); + if(_rt) { + _rt->SetReadyCount(key, cnt); + } +} + } // namespace common } // namespace byteps diff --git a/byteps/common/scheduled_queue.h b/byteps/common/scheduled_queue.h index 59e10f066..5167ce67f 100644 --- a/byteps/common/scheduled_queue.h +++ b/byteps/common/scheduled_queue.h @@ -36,6 +36,7 @@ class BytePSScheduledQueue { std::shared_ptr getTask(uint64_t key); uint32_t pendingSize(); void reportFinish(int size); + void reset(uint64_t key, int cnt); private: // TODO: use priority queue or heap diff --git a/byteps/common/thread_pool.h b/byteps/common/thread_pool.h new file mode 100644 index 000000000..23397469d --- /dev/null +++ b/byteps/common/thread_pool.h @@ -0,0 +1,77 @@ +/* + * Copy From https://github.com/progschj/ThreadPool/blob/master/ThreadPool.h + */ +#ifndef THREAD_POOL_H +#define THREAD_POOL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class ThreadPool { + public: + ThreadPool(size_t); + template + void enqueue(F&& f); + ~ThreadPool(); + + private: + // need to keep track of threads so we can join them + std::vector workers; + // the task queue + std::queue > tasks; + + // synchronization + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; +}; + +// the constructor just launches some amount of workers +inline ThreadPool::ThreadPool(size_t threads) : stop(false) { + for (size_t i = 0; i < threads; ++i) + workers.emplace_back([this] { + for (;;) { + std::function task; + + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait( + lock, [this] { return this->stop || !this->tasks.empty(); }); + if (this->stop && this->tasks.empty()) return; + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + + task(); + } + }); +} + +// add new work item to the pool +template +void ThreadPool::enqueue(F&& f) { + { + std::lock_guard lock(queue_mutex); + if (stop) throw std::runtime_error("enqueue on stopped ThreadPool"); 
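+    // The queue mutex is still held at this point: the task is appended under
+    // the lock, and one worker is woken via condition.notify_one() once the
+    // lock_guard is released. This simplified pool is fire-and-forget;
+    // enqueue() returns void rather than a std::future.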
+ tasks.emplace(std::forward(f)); + } + condition.notify_one(); +} +// the destructor joins all threads +inline ThreadPool::~ThreadPool() { + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for (std::thread& worker : workers) worker.join(); +} + +#endif diff --git a/byteps/mxnet/__init__.py b/byteps/mxnet/__init__.py index dfdac9fed..6fd21cc55 100644 --- a/byteps/mxnet/__init__.py +++ b/byteps/mxnet/__init__.py @@ -14,28 +14,32 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function +import copy +import os +import struct import warnings + import mxnet as mx -import os +import mxnet.ndarray as nd -from byteps.mxnet.ops import byteps_push_pull, byteps_declare_tensor -from byteps.mxnet.ops import init, shutdown, suspend, resume -from byteps.mxnet.ops import size, local_size, rank, local_rank +from byteps.mxnet.compression import Compression +from byteps.mxnet.ops import (byteps_declare_tensor, byteps_push_pull, init, + local_rank, local_size, rank, resume, shutdown, + size, suspend) parameter_index = 0 class DistributedOptimizer(mx.optimizer.Optimizer): """This is where BytePS's DistributedOptimizer wrapper for MXNet goes""" + def __init__(self, optimizer): self._optimizer = optimizer self._enable_async = (int(os.getenv('BYTEPS_ENABLE_ASYNC', 0)) != 0) if self._enable_async: - assert int(os.getenv('DMLC_NUM_WORKER'))>1, \ + assert int(os.getenv('DMLC_NUM_WORKER')) > 1, \ "Async is only valid for distributed training" print('BytePS: enable asynchronous training') @@ -178,10 +182,17 @@ class DistributedTrainer(mx.gluon.Trainer): Key-word arguments to be passed to optimizer constructor. For example, `{'learning_rate': 0.1}`. All optimizers accept learning_rate, wd (weight decay), clip_gradient, and lr_scheduler. See each optimizer's - constructor for a list of additional supported arguments. + constructor for a list of additional supported arguments + root_rank : int + rank of root + compression_params : dict + Key-word arguments to be passed to gradient compression constructor. For example, + `{'compressor': 'onebit', 'ef': 'vanilla', 'momentum': 'nesterov', 'scaling': true}`. + All compressor accept 'compressor', 'ef'. See each compressor's constructor for a list + of additional supported arguments """ - def __init__(self, params, optimizer, optimizer_params=None, root_rank=0): + def __init__(self, params, optimizer, optimizer_params=None, root_rank=0, compression_params=None): if isinstance(optimizer, DistributedOptimizer): optimizer = optimizer._optimizer warnings.warn("DistributedTrainer does not take DistributedOptimizer " @@ -192,25 +203,144 @@ def __init__(self, params, optimizer, optimizer_params=None, root_rank=0): for key in sorted(list(params.keys())): param_list.append(params[key]) + self._intra_compressor = self._register_compressor( + params, optimizer_params, compression_params) + super(DistributedTrainer, self).__init__( param_list, optimizer, optimizer_params=optimizer_params, kvstore=None) - # _scale is used to check and set rescale_grad for optimizer in Trainer.step() - # function. Normalizing it by BytePS size, which is equivalent to performing - # average in push_pull, has better performance. 
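+        # Gradient normalization is now done explicitly in _allreduce_grads()
+        # (each grad is multiplied by 1.0 / self._scale / self._bps_size there)
+        # instead of folding the 1/size() factor into the optimizer's
+        # rescale_grad via self._scale.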
- self._scale /= size() + if local_rank() == 0: + self._f = open("lr.s", "wb") + self._f.truncate(8) + + self._bps_size = size() self.root_rank = root_rank + self._intra_compressors = {} for i, param in enumerate(self._params): byteps_declare_tensor("parameter_" + str(i)) + self._intra_compressors[param.name] = copy.deepcopy( + self._intra_compressor) if param.grad_req != 'null': - byteps_declare_tensor("gradient_" + str(i)) - + byteps_params = dict( + filter(lambda attr: attr[0].startswith( + "byteps_",), param.__dict__.items()) + ) + byteps_declare_tensor("gradient_" + str(i), **byteps_params) + + def __del__(self): + if local_rank() == 0: + self._f.close() + if os.path.exists("lr.s"): + os.remove("lr.s") + + def _register_compressor(self, params, optimizer_params, compression_params): + """Register compressor for BytePS + + params : mx.gluon.ParameterDict + optimizer_params : dict + compression_params : dict + """ + intra_compressor = Compression.none + if not compression_params: + return intra_compressor + + if compression_params.get("fp16"): + intra_compressor = Compression.fp16 + + if "compressor" not in compression_params: + warnings.warn("Compressor is not defined") + return intra_compressor + + check_list = ["compressor", "ef", "momentum"] + + for _, param in params.items(): + # generic + for item in check_list: + if compression_params.get(item): + if isinstance(compression_params[item], str): + setattr(param, "byteps_%s_type" % + item, compression_params[item]) + else: + raise TypeError("%s should be str" % item) + + # need parameter + compressor = compression_params["compressor"] + if compressor == "onebit": + setattr(param, "byteps_compressor_onebit_scaling", str( + compression_params.get("scaling", False))) + elif compressor == "topk" or compressor == "randomk" or compressor == "dithering": + # raise KeyError if 'k' is not found + setattr(param, "byteps_compressor_k", + compression_params["k"]) + + if compression_params.get("momentum"): + setattr(param, "byteps_momentum_mu", + optimizer_params["momentum"]) + + if compression_params.get("seed", None) is not None: + setattr(param, "byteps_seed", + compression_params["seed"]) + + if compression_params.get("partition"): + if compression_params["partition"] == "linear": + setattr(param, "byteps_dithering_partition", "0") + elif compression_params["partition"] == "natural": + setattr(param, "byteps_dithering_partition", "1") + else: + raise ValueError("Unsupported partition") + + if compression_params.get("normalize"): + if compression_params["normalize"] == "max": + setattr(param, "byteps_dithering_normalize", "0") + elif compression_params["normalize"] == "l2": + setattr(param, "byteps_dithering_normalize", "1") + else: + raise ValueError("Unsupported normalization") + + # the following code will delete some items in `optimizer_params` + # to avoid duplication + if compression_params.get("momentum"): + threshold = int(os.environ.get( + "BYTEPS_MIN_COMPRESS_BYTES", 65536)) + mu = optimizer_params["momentum"] + + # 1bit compressor use an additional momentum for weight decay + if compressor == "onebit" and "wd" in optimizer_params: + wd = optimizer_params["wd"] + intra_compressor = Compression.wdmom(intra_compressor, + mu, wd, threshold) + del optimizer_params["wd"] + + intra_compressor = Compression.nag(intra_compressor, mu, threshold) + del optimizer_params['momentum'] + + return intra_compressor + + def step(self, batch_size, ignore_stale_grad=False): + # grad is normalized with batch_size. 
setting _scale to batch_size is + # to prevent normalized by batch_size twice. + self._scale = batch_size + super(DistributedTrainer, self).step(batch_size, ignore_stale_grad) def _allreduce_grads(self): + # update lr + if local_rank() == 0: + self._f.seek(0) + ba = struct.pack("d", self.learning_rate) + self._f.write(ba) + self._f.flush() + for i, param in enumerate(self._params): if param.grad_req != 'null': - byteps_push_pull(param.list_grad()[0], is_average=False, + # normalized with batch_size and num_workers + nd._internal._mul_scalar( + param._grad[0], 1.0 / self._scale / self._bps_size, out=param._grad[0]) + compressed, ctx = self._intra_compressors[param.name].compress( + param._grad[0]) + byteps_push_pull(compressed, is_average=False, name="gradient_" + str(i), priority=-i) + param._grad[0][:] = self._intra_compressors[param.name].decompress( + compressed, ctx, x=param._data[0]) def _init_params(self): tensors = [] @@ -223,6 +353,7 @@ def _init_params(self): if rank() != self.root_rank: param_arrays[0].__imul__(0) + byteps_push_pull(param_arrays[0], version=0, priority=0, name="parameter_" + str(idx), is_average=False) diff --git a/byteps/mxnet/adapter.cc b/byteps/mxnet/adapter.cc index 98b99ad8f..5b4f44947 100644 --- a/byteps/mxnet/adapter.cc +++ b/byteps/mxnet/adapter.cc @@ -25,6 +25,7 @@ namespace byteps { namespace mxnet { + template MXTensor::MXTensor(T* tensor) : tensor_(tensor) {} diff --git a/byteps/mxnet/compression.py b/byteps/mxnet/compression.py new file mode 100644 index 000000000..3ee3957db --- /dev/null +++ b/byteps/mxnet/compression.py @@ -0,0 +1,164 @@ +# Copyright 2019 Bytedance Inc. All Rights Reserved. +# Copyright 2018 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Gradient compression algorithms.""" +from functools import reduce + +import mxnet.ndarray as nd + + +def size(shape): + return reduce(lambda x, y: x*y, shape) * 4 + + +class Compressor(object): + """Interface for compressing and decompressing a given tensor.""" + + def compress(self, tensor, *args, **kwargs): + """Compresses a tensor and returns it with the context needed to decompress it.""" + pass + + def decompress(self, tensor, ctx, *args, **kwargs): + """Decompress the tensor with the given context.""" + pass + + +class NoneCompressor(Compressor): + """Default no-op compression.""" + + def compress(self, tensor, *args, **kwargs): + """Returns the tensor unmodified.""" + return tensor, None + + def decompress(self, tensor, ctx, *args, **kwargs): + """Returns the tensor unmodified.""" + return tensor + + +class FP16Compressor(Compressor): + """Compress all floating point gradients to 16-bit.""" + + def compress(self, tensor, *args, **kwargs): + """Downcasts the tensor to 16-bit.""" + tensor_compressed = tensor + if 'float' in str(tensor.dtype): + # Only allow compression from other floating point types + tensor_compressed = tensor.astype('float16', copy=False) + return tensor_compressed, tensor.dtype + + def decompress(self, tensor, ctx, *args, **kwargs): + """Upcasts the tensor to the initialization dtype.""" + tensor_decompressed = tensor + dtype = ctx + if 'float' in str(dtype): + tensor_decompressed = tensor.astype(dtype, copy=False) + return tensor_decompressed + + +class NagAdapter(Compressor): + """For uncompressed gradients""" + + def __init__(self, compressor, mu, threshold, *args, **kwargs): + self.compressor = compressor + self.mu = mu + self.mom = None + self.threshold = threshold + self.inited = False + self.nag = False + + def compress(self, tensor, *args, **kwargs): + """Returns the tensor unmodified.""" + return self.compressor.compress(tensor) + + def decompress(self, tensor, ctx, *args, **kwargs): + """Add nesterov momentum for uncompressed gradients""" + tensor = self.compressor.decompress(tensor, ctx, *args, **kwargs) + + # uncompressed gradients need to do nag explicitly + if not self.inited: + if size(tensor.shape) < self.threshold: + self.mom = nd.zeros_like(tensor) + self.nag = True + self.inited = True + + if self.nag: + self.mom += tensor + nd._internal._mul_scalar(self.mom, self.mu, out=self.mom) + tensor += self.mom + + return tensor + + +class WeightDecayMomentumAdapter(Compressor): + """For 1bit compression.""" + + def __init__(self, compressor, mu, wd, threshold, *args, **kwargs): + self.compressor = compressor + self.mom = None + self.cache = None + self.mu = mu + self.wd = wd + self.threshold = threshold + self.wdmom = False + self.inited = False + + def compress(self, tensor, *args, **kwargs): + """Returns the tensor unmodified.""" + return self.compressor.compress(tensor) + + def decompress(self, tensor, ctx, *args, **kwargs): + """Returns the tensor added with additional momentum for wd + m_t = \mu * m_{t-1} + wd * x_t + x_{t+1} = x_t - \eta_t (tensor + \mu m_t + wd * x_t) + """ + if "x" not in kwargs: + raise ValueError("x is missing") + + x = kwargs["x"].astype(tensor.dtype, copy=False) + + if not self.inited: + self.cache = nd.zeros_like(tensor) + if size(tensor.shape) >= self.threshold: + self.mom = nd.zeros_like(tensor) + self.wdmom = True + self.inited = True + + # weight decay + nd._internal._mul_scalar(x, self.wd, out=self.cache) + + # weight decay momentum + 
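+        # (kept only for tensors whose size() is >= self.threshold; the buffer
+        #  is updated as mom = mu * (mom + wd * x) and added to the gradient,
+        #  while smaller tensors just get the plain wd * x term below)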
if self.wdmom: + self.mom += self.cache + nd._internal._mul_scalar(self.mom, self.mu, out=self.mom) + tensor += self.mom + + tensor += self.cache + return self.compressor.decompress(tensor, ctx, *args, **kwargs) + + +class Compression(object): + """Optional gradient compression algorithm used during push_pull.""" + + """Do not compress the gradients. This is the default.""" + none = NoneCompressor() + + """Compress all floating point gradients to 16-bit.""" + fp16 = FP16Compressor() + + """Additional Momentum for weight decay. This is only for 1bit. This is a wrapper.""" + wdmom = WeightDecayMomentumAdapter + + """NAG for uncompressed. This is a wrapper.""" + nag = NagAdapter diff --git a/byteps/mxnet/ops.cc b/byteps/mxnet/ops.cc index f6df42156..5325142d3 100644 --- a/byteps/mxnet/ops.cc +++ b/byteps/mxnet/ops.cc @@ -15,7 +15,10 @@ // ============================================================================= #include "ops.h" + #include + +#include "../common/logging.h" #include "../common/operations.h" #include "adapter.h" #include "cuda_util.h" @@ -73,7 +76,7 @@ void DoPushPull(void*, void* on_complete_ptr, void* param) { auto push_pull_param = static_cast(param); int priority = push_pull_param->priority; int version = push_pull_param->version; - NDArray* input = push_pull_param->input.get(); + auto input = push_pull_param->input.get(); BPSContext& context = *push_pull_param->context; auto device = TensorUtil::GetDevice(input); @@ -118,10 +121,12 @@ extern "C" int byteps_mxnet_push_pull_async(NDArray* tensor, char* name, // Use MXEnginePushAsync instead of Engine::Get()->PushAsync to avoid ABI // compatibility issues MXEnginePushAsync(DoPushPull, push_pull_param, DeletePushPullParam, - &MX_EXEC_CTX, nullptr, 0, &var, 1, - &MX_FUNC_PROP, 0, "BytePSPushPull"); + &MX_EXEC_CTX, nullptr, 0, &var, 1, &MX_FUNC_PROP, 0, + "BytePSPushPull"); - if (is_average) { + auto use_ef = + context.kwargs.find("error_feedback_type") != context.kwargs.end(); + if (is_average && !(!context.kwargs.empty() && use_ef)) { // average the aggregated gradient auto num_worker = byteps_size(); *tensor /= num_worker; @@ -130,9 +135,25 @@ extern "C" int byteps_mxnet_push_pull_async(NDArray* tensor, char* name, MX_API_END(); } -extern "C" void byteps_mxnet_declare_tensor(char* name) { +extern "C" void byteps_mxnet_declare_tensor(char* name, int num_args, + char** args_keys, + char** args_vals) { std::string tensor_name = GetOpName("byteps", name); common::IsTensorDeclared(tensor_name); + + std::unordered_map kwargs; + std::string key, val; + std::string::size_type pos; + for (int i = 0; i < num_args; ++i) { + key = args_keys[i]; + val = args_vals[i]; + kwargs[key] = val; + } + + if (num_args > 0) { + common::RegisterCompressor(tensor_name, kwargs); + } + return; } diff --git a/byteps/mxnet/ops.h b/byteps/mxnet/ops.h index b45bb46af..320bde706 100644 --- a/byteps/mxnet/ops.h +++ b/byteps/mxnet/ops.h @@ -37,7 +37,9 @@ extern "C" int byteps_mxnet_push_pull_async(NDArray* input, char* name, int version, int priority, bool is_average); -extern "C" void byteps_mxnet_declare_tensor(char* name); +extern "C" void byteps_mxnet_declare_tensor(char* name, int num_args, + char** args_keys, + char** args_vals); } // namespace mxnet } // namespace byteps diff --git a/byteps/mxnet/ops.py b/byteps/mxnet/ops.py index 03819284b..8cd5962a2 100644 --- a/byteps/mxnet/ops.py +++ b/byteps/mxnet/ops.py @@ -21,6 +21,7 @@ # Load all the necessary MXNet C types. 
import ctypes import os +import warnings import mxnet as mx from mxnet.base import c_str, check_call, string_types @@ -78,5 +79,46 @@ def byteps_push_pull(tensor, version=0, priority=0, name=None, is_average=True): return -def byteps_declare_tensor(name): - check_call(MXNET_LIB_CTYPES.byteps_mxnet_declare_tensor(c_str(name))) +def byteps_declare_tensor(name, **kwargs): + """create ctx for tensors and register compressor + + Warpper of the c++ function. Build parameter dict. + + Arguments: + name : str, tensor name + **kwargs: extra params w.r.t gradient compression + + Returns: + None + """ + def _create_c_style_string_array(strings): + byte_arr = [bytes(string, 'utf-8') for string in strings] + arr = (ctypes.c_char_p*len(byte_arr))() + arr[:] = byte_arr + return arr + + args = {} + for k, v in kwargs.items(): + splits = k.split('_') + if len(splits) < 2 and not all(splits): + warnings.warn("Ignore invalid params %s of %s" % (k, name)) + continue + + # remove first prefix "byteps" + k = '_'.join(splits[1:]) + if isinstance(v, str): + args[k] = v.lower() + elif isinstance(v, (int, float,)): + args[k] = str(v) + elif isinstance(v, bool): + args[k] = str(int(v)).lower() + else: + raise ValueError("Invalid %s of type %s of %s" % + (v, type(v), name)) + + check_call(MXNET_LIB_CTYPES.byteps_mxnet_declare_tensor( + c_str(name), + ctypes.c_int(len(args)), + _create_c_style_string_array(list(args.keys())), + _create_c_style_string_array(list(args.values())) + )) \ No newline at end of file diff --git a/byteps/server/server.cc b/byteps/server/server.cc index 7951bd036..a1503d3d8 100644 --- a/byteps/server/server.cc +++ b/byteps/server/server.cc @@ -14,6 +14,7 @@ // ============================================================================= #include "server.h" +#include "../common/compressor/utils.h" #include "queue.h" namespace byteps { @@ -23,48 +24,51 @@ using namespace ps; // engine related std::vector engine_queues_; -std::vector engine_threads_; +std::vector engine_threads_; BytePSArray* GetStore(uint64_t key) { std::lock_guard lock(store_mu_); return &store_[key]; } -void SendPushResponse(uint64_t key, const ps::KVMeta& req, ps::KVServer* server){ +void SendPushResponse(uint64_t key, const ps::KVMeta& req, + ps::KVServer* server) { auto iterator = push_response_map_.find(key); - if (iterator == push_response_map_.end()) { // new key + if (iterator == push_response_map_.end()) { // new key ps::KVPairs response; - push_response_map_[key] = response; // add to the map + push_response_map_[key] = response; // add to the map server->Response(req, response); - } else { // not new key, then reuse the memory address to avoid ibv_reg_mr on RDMA data path - ps::KVPairs *response = &iterator->second; + } else { // not new key, then reuse the memory address to avoid ibv_reg_mr on + // RDMA data path + ps::KVPairs* response = &iterator->second; server->Response(req, *response); } } -void SendPullResponse(const DataHandleType type, - const uint64_t key, - const ps::KVMeta& req_meta, - ps::KVServer* server) { +void SendPullResponse(const DataHandleType type, const uint64_t key, + const ps::KVMeta& req_meta, ps::KVServer* server) { std::lock_guard lock(pullresp_mu_); - auto stored = GetStore(key); - CHECK(stored->tensor) << "init " << key << " first"; - auto len = stored->len; + auto& updates = update_buf_[key]; + CHECK(updates.merged.tensor) << "init " << key << " first"; + char* data = updates.merged.tensor; + auto len = updates.merged.len; // send pull response auto iterator = 
pull_response_map_.find(key); - if (iterator == pull_response_map_.end()) { // new key + if (iterator == pull_response_map_.end()) { // new key ps::KVPairs response; response.keys = {EncodeKey(key)}; response.lens = {len}; - response.vals = ps::SArray(stored->tensor, len, false); // zero copy - pull_response_map_[key] = response; // add to the map + response.vals = ps::SArray(data, len, false); // zero copy + pull_response_map_[key] = response; // add to the map server->Response(req_meta, response); - } else { // not new key, then reuse the memory address to avoid ibv_reg_mr on RDMA data path - ps::KVPairs *response = &iterator->second; - // keys and lens remain unchanged, just update vals - auto p = static_cast(stored->tensor); + } else { // not new key, then reuse the memory address to avoid ibv_reg_mr on + // RDMA data path + ps::KVPairs* response = &iterator->second; + + auto p = static_cast(data); CHECK(p); + response->lens = {len}; response->vals = ps::SArray(p, len, false); server->Response(req_meta, *response); } @@ -80,16 +84,46 @@ void BytePSServerEngineThread(int i) { CHECK(msg.dst); CHECK(msg.src); + auto iter = compressor_map_.find(msg.key); + if (iter != compressor_map_.end()) { + // compress + if (msg.ops == ALL_RECV) { + common::compressor::tensor_t grad(reinterpret_cast(msg.src), + msg.len, msg.type.dtype); + auto compressed = iter->second->Compress(grad); + // 1. compress + auto& updates = update_buf_[msg.key]; + updates.merged.tensor = compressed.data; + updates.merged.len = compressed.size; + } else { // decompress + auto compressed_len = msg.sarray.lens[0]; + CHECK_LE(compressed_len, msg.len); + common::compressor::tensor_t compressed( + reinterpret_cast(msg.src), compressed_len, msg.type.dtype); + auto decompressed = iter->second->Decompress(compressed); + msg.src = decompressed.data; + } + } else { + if (msg.ops == ALL_RECV) { + // 2. 
no compress + auto& updates = update_buf_[msg.key]; + updates.merged.tensor = reinterpret_cast(msg.src); + updates.merged.len = msg.len; + } + } + bool is_debug = (debug_mode_ && (debug_key_ == msg.key)); switch (msg.ops) { - case COPY_MERGED: { + case COPY_FIRST: { if (is_debug) { std::lock_guard lock(debug_mu_); LOG(INFO) << "stage: ENGINE_COPY_MERGED_TO_STORE_BEFORE \t" << "dst: " << DEBUG_PRINT_TENSOR_VALUE(msg.dst) << "\t" << "src: " << DEBUG_PRINT_TENSOR_VALUE(msg.src) << "\t" - << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) << "\t" - << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) << "\t"; + << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) + << "\t" + << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) + << "\t"; } bps_reducer_->copy(msg.dst, msg.src, msg.len); if (is_debug) { @@ -97,9 +131,14 @@ void BytePSServerEngineThread(int i) { LOG(INFO) << "stage: ENGINE_COPY_MERGED_TO_STORE_AFTER \t" << "dst: " << DEBUG_PRINT_TENSOR_VALUE(msg.dst) << "\t" << "src: " << DEBUG_PRINT_TENSOR_VALUE(msg.src) << "\t" - << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) << "\t" - << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) << "\t"; + << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) + << "\t" + << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) + << "\t"; } + } break; + + case ALL_RECV: { std::lock_guard lock(flag_mu_[i]); if (is_push_finished_[i].find(msg.key) == is_push_finished_[i].end()) { is_push_finished_[i][msg.key] = false; @@ -110,7 +149,8 @@ void BytePSServerEngineThread(int i) { auto it = q_pull_reqmeta_[i][msg.key].begin(); while (it != q_pull_reqmeta_[i][msg.key].end()) { - if (seen_sender_[i][msg.key].find(it->sender) == seen_sender_[i][msg.key].end()) { + if (seen_sender_[i][msg.key].find(it->sender) == + seen_sender_[i][msg.key].end()) { SendPullResponse(msg.type, msg.key, *it, byteps_server_); pull_cnt_[i][msg.key] += 1; seen_sender_[i][msg.key].insert(it->sender); @@ -118,7 +158,7 @@ void BytePSServerEngineThread(int i) { } else { ++it; } - if (pull_cnt_[i][msg.key] == (size_t) ps::NumWorkers()) { + if (pull_cnt_[i][msg.key] == (size_t)ps::NumWorkers()) { is_push_finished_[i][msg.key] = false; pull_cnt_[i][msg.key] = 0; seen_sender_[i][msg.key].clear(); @@ -134,144 +174,191 @@ void BytePSServerEngineThread(int i) { LOG(INFO) << "stage: ENGINE_SUM_RECV_BEFORE \t" << "dst: " << DEBUG_PRINT_TENSOR_VALUE(msg.dst) << "\t" << "src: " << DEBUG_PRINT_TENSOR_VALUE(msg.src) << "\t" - << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) << "\t" - << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) << "\t"; + << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) + << "\t" + << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) + << "\t"; } - CHECK_GE(bps_reducer_->sum(msg.dst, - msg.src, - msg.len, - bps_type), 0); + CHECK_GE(bps_reducer_->sum(msg.dst, msg.src, msg.len, bps_type), 0); if (is_debug) { std::lock_guard lock(debug_mu_); LOG(INFO) << "stage: ENGINE_SUM_RECV_AFTER \t" << "dst: " << DEBUG_PRINT_TENSOR_VALUE(msg.dst) << "\t" << "src: " << DEBUG_PRINT_TENSOR_VALUE(msg.src) << "\t" - << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) << "\t" - << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) << "\t"; + << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) + << "\t" + << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) + << "\t"; } } break; - default: CHECK(0); } } -} +} // namespace server void BytePSHandler(const ps::KVMeta& req_meta, - const ps::KVPairs &req_data, ps::KVServer* server) { - std::lock_guard 
lock(handle_mu_); // push & pull may have racing + const ps::KVPairs& req_data, + ps::KVServer* server) { + std::lock_guard lock(handle_mu_); // push & pull may have racing DataHandleType type = DepairDataHandleType(req_meta.cmd); - CHECK_EQ(type.requestType, RequestType::kDefaultPushPull); + // CHECK_EQ(type.requestType, RequestType::kDefaultPushPull); // do some check CHECK_EQ(req_data.keys.size(), (size_t)1); if (log_key_info_) { if (req_meta.push) { CHECK_EQ(req_data.lens.size(), (size_t)1); CHECK_EQ(req_data.vals.size(), (size_t)req_data.lens[0]); - LOG(INFO) << "push key=" - << DecodeKey(req_data.keys[0]) + LOG(INFO) << "push key=" << DecodeKey(req_data.keys[0]) << "\t sender=" << req_meta.sender - << "\t size=" << (size_t) req_data.lens[0]; + << "\t size=" << (size_t)req_data.lens[0]; } else { - LOG(INFO) << "pull key=" - << (uint64_t) DecodeKey(req_data.keys[0]) + LOG(INFO) << "pull key=" << (uint64_t)DecodeKey(req_data.keys[0]) << "\t sender=" << req_meta.sender; } } uint64_t key = DecodeKey(req_data.keys[0]); - if (req_meta.push) { // push request + + // register compressor + if (type.requestType == RequestType::kCompressedPushPull) { + if (compressor_map_.find(key) == compressor_map_.end()) { + std::string content{reinterpret_cast(req_data.vals.data()), + static_cast(req_data.lens[0])}; + auto kwargs = byteps::common::compressor::Deserialize(content); + auto stored = GetStore(key); + size_t aligned_size = byteps::common::Align(stored->len, stored->dtype); + auto compressor_ptr = + byteps::common::compressor::CompressorRegistry::Create( + kwargs, aligned_size, + static_cast(stored->dtype)); + CHECK_NE(compressor_ptr, nullptr); + compressor_map_[key] = std::move(compressor_ptr); + if (log_key_info_) { + LOG(INFO) << "register compressor for key=" << key; + } + } + + // buffer the request meta + auto& updates = update_buf_[key]; + updates.request.push_back(req_meta); + // should send response after collecting all init push + if (updates.request.size() < (size_t)ps::NumWorkers()) return; + + for (const auto& req : updates.request) { + SendPushResponse(key, req, server); + } + updates.request.clear(); + return; + } + + if (req_meta.push) { // push request CHECK_EQ(req_data.lens.size(), (size_t)1); CHECK_EQ(req_data.vals.size(), (size_t)req_data.lens[0]); auto stored = GetStore(key); - auto len = (size_t) req_data.lens[0]; + auto len = (size_t)req_data.lens[0]; auto recved = reinterpret_cast(req_data.vals.data()); + if (!stored->tensor) { if (sync_mode_ && (update_buf_.find(key) == update_buf_.end())) { update_buf_[key].merged.len = len; update_buf_[key].merged.dtype = type.dtype; } // buffer the request meta - auto &updates = update_buf_[key]; + auto& updates = update_buf_[key]; updates.request.push_back(req_meta); // should send response after collecting all init push - if (updates.request.size() < (size_t) ps::NumWorkers()) return; + if (updates.request.size() < (size_t)ps::NumWorkers()) return; if (log_key_info_) { LOG(INFO) << "Collected all " << updates.request.size() << " requests for key=" << key - << ", init the store buffer size=" << (size_t) req_data.lens[0]; + << ", init the store buffer size=" + << (size_t)req_data.lens[0]; } // init stored buffer, use page aligned memory - PageAlignedMalloc((void**) &stored->tensor, len); + size_t aligned_size = common::Align(len, type.dtype); + PageAlignedMalloc((void**)&stored->tensor, aligned_size); stored->len = len; stored->dtype = type.dtype; CHECK(stored->tensor); - bps_reducer_->copy(stored->tensor, recved, len); // we may not 
need this copy + bps_reducer_->copy(stored->tensor, recved, + len); // we may not need this copy for (const auto& req : updates.request) { SendPushResponse(key, req, server); } updates.request.clear(); } else { - auto &updates = update_buf_[key]; + auto& updates = update_buf_[key]; auto tid = GetThreadID(key, len); - if (updates.request.empty()) { // from the first incoming worker + if (updates.request.empty()) { // from the first incoming worker if (sync_mode_) { if (debug_mode_ && (debug_key_ == key)) { std::lock_guard lock(debug_mu_); LOG(INFO) << "stage: FIRST_WORKER_RECV \t" - << "stored: " << DEBUG_PRINT_TENSOR_VALUE(stored->tensor) << "\t" + << "stored: " << DEBUG_PRINT_TENSOR_VALUE(stored->tensor) + << "\t" << "recved: " << DEBUG_PRINT_TENSOR_VALUE(recved) << "\t" << "len: " << len << "\t" << "addr: " << DEBUG_PRINT_TENSOR_ADDRESS(recved); } - // zero copy - updates.merged.tensor = recved; updates.merged.tmp_sarray = req_data; - } else { // async mode, directly add to the buffer - CHECK_GE(bps_reducer_->sum((void *) stored->tensor, - (void *) recved, - len, - bps_reducer_->GetDataType(stored->dtype)), 0); + // copy + BytePSEngineMessage msg = {timestamp_++, type, key, + stored->tensor, recved, stored->len, + COPY_FIRST, req_data, req_meta}; + engine_queues_[tid]->Push(msg); + } else { // async mode, directly add to the buffer + CHECK_GE(bps_reducer_->sum((void*)stored->tensor, (void*)recved, len, + bps_reducer_->GetDataType(stored->dtype)), + 0); } - } else { // from other workers + } else { // from other workers CHECK(sync_mode_); - CHECK(updates.merged.tensor); + // CHECK(updates.merged.tensor); if (debug_mode_ && (debug_key_ == key)) { std::lock_guard lock(debug_mu_); LOG(INFO) << "stage: OTHER_WORKER_SUM \t" - << "stored: " << DEBUG_PRINT_TENSOR_VALUE(stored->tensor) << "\t" - << "merged: " << DEBUG_PRINT_TENSOR_VALUE(updates.merged.tensor) << "\t" + << "stored: " << DEBUG_PRINT_TENSOR_VALUE(stored->tensor) + << "\t" << "recved: " << DEBUG_PRINT_TENSOR_VALUE(recved) << "\t" << "len: " << len << "\t" << "addr: " << DEBUG_PRINT_TENSOR_ADDRESS(recved); } if (is_engine_blocking_) { - CHECK_GE(bps_reducer_->sum((void *) updates.merged.tensor, - (void *) recved, - len, - bps_reducer_->GetDataType(updates.merged.dtype)), 0); - } else { // non-blocking - BytePSEngineMessage msg = {timestamp_++, type, key, updates.merged.tensor, recved, len, SUM_RECV, req_data, req_meta}; + // TODO: decompress + CHECK_GE(bps_reducer_->sum( + (void*)updates.merged.tensor, (void*)recved, len, + bps_reducer_->GetDataType(updates.merged.dtype)), + 0); + } else { // non-blocking + BytePSEngineMessage msg = {timestamp_++, type, key, + stored->tensor, recved, stored->len, + SUM_RECV, req_data, req_meta}; engine_queues_[tid]->Push(msg); } } // add a worker information (request.size() is the # workers received) updates.request.push_back(req_meta); SendPushResponse(key, req_meta, server); - if (sync_mode_ && updates.request.size() == (size_t) ps::NumWorkers()) { + if (sync_mode_ && updates.request.size() == (size_t)ps::NumWorkers()) { auto stored = GetStore(key); auto& update = updates.merged; if (debug_mode_ && (debug_key_ == key)) { std::lock_guard lock(debug_mu_); LOG(INFO) << "stage: COPY_MERGED_TO_STORE \t" - << "stored: " << DEBUG_PRINT_TENSOR_VALUE(stored->tensor) << "\t" - << "merged: " << DEBUG_PRINT_TENSOR_VALUE(updates.merged.tensor) << "\t" + << "stored: " << DEBUG_PRINT_TENSOR_VALUE(stored->tensor) + << "\t" + << "merged: " + << DEBUG_PRINT_TENSOR_VALUE(updates.merged.tensor) << "\t" << "recved: " << 
DEBUG_PRINT_TENSOR_VALUE(recved); } if (is_engine_blocking_) { + // TODO: compress bps_reducer_->copy(stored->tensor, updates.merged.tensor, len); } else { - BytePSEngineMessage msg = {timestamp_++, type, key, stored->tensor, update.tensor, len, COPY_MERGED}; + BytePSEngineMessage msg = { + timestamp_++, type, key, stored->tensor, + stored->tensor, stored->len, ALL_RECV}; engine_queues_[tid]->Push(msg); engine_queues_[tid]->ClearCounter(key); } @@ -281,9 +368,10 @@ void BytePSHandler(const ps::KVMeta& req_meta, updates.request.clear(); } } - } else { // pull request + } else { // pull request auto stored = GetStore(key); - CHECK(stored->tensor) << "Should init the buffer for key=" << key << " first"; + CHECK(stored->tensor) << "Should init the buffer for key=" << key + << " first"; if (is_engine_blocking_ || !sync_mode_) { SendPullResponse(type, key, req_meta, server); } else { @@ -297,12 +385,13 @@ void BytePSHandler(const ps::KVMeta& req_meta, auto it = seen_sender_[tid][key].find(req_meta.sender); if (is_push_finished_[tid][key] && (it == seen_sender_[tid][key].end())) { - // push already finished && not received the associated pull response yet + // push already finished && not received the associated pull response + // yet SendPullResponse(type, key, req_meta, server); pull_cnt_[tid][key] += 1; seen_sender_[tid][key].insert(req_meta.sender); - if (pull_cnt_[tid][key] == (size_t) ps::NumWorkers()) { + if (pull_cnt_[tid][key] == (size_t)ps::NumWorkers()) { is_push_finished_[tid][key] = false; pull_cnt_[tid][key] = 0; seen_sender_[tid][key].clear(); @@ -311,7 +400,6 @@ void BytePSHandler(const ps::KVMeta& req_meta, // push not finished, put into the queue, and wait for the engine q_pull_reqmeta_[tid][key].push_back(req_meta); } - } } } @@ -322,27 +410,32 @@ void init_global_env() { // enable engine block mode (default disabled) is_engine_blocking_ = GetEnv("BYTEPS_SERVER_ENGINE_BLOCKING", false); - if (is_engine_blocking_) LOG(INFO) << "Enable blocking mode of the server engine"; + if (is_engine_blocking_) + LOG(INFO) << "Enable blocking mode of the server engine"; // sync or async training sync_mode_ = !GetEnv("BYTEPS_ENABLE_ASYNC", false); - if (!sync_mode_) LOG(INFO) << "BytePS server is enabled asynchronous training"; + if (!sync_mode_) + LOG(INFO) << "BytePS server is enabled asynchronous training"; // debug mode debug_mode_ = GetEnv("BYTEPS_SERVER_DEBUG", false); debug_key_ = GetEnv("BYTEPS_SERVER_DEBUG_KEY", 0); - if (debug_mode_) LOG(INFO) << "Debug mode enabled! Printing key " << debug_key_; + if (debug_mode_) + LOG(INFO) << "Debug mode enabled! 
Printing key " << debug_key_; // number of engine thread // invalid if is_engine_blocking = true engine_thread_num_ = GetEnv("BYTEPS_SERVER_ENGINE_THREAD", 4); LOG(INFO) << "BytePS server engine uses " << engine_thread_num_ << " threads" - << ", consider increasing BYTEPS_SERVER_ENGINE_THREAD for higher performance"; + << ", consider increasing BYTEPS_SERVER_ENGINE_THREAD for higher " + "performance"; CHECK_GE(engine_thread_num_, 1); // enable scheduling for server engine enable_schedule_ = GetEnv("BYTEPS_SERVER_ENABLE_SCHEDULE", false); - if (enable_schedule_) LOG(INFO) << "Enable engine scheduling for BytePS server"; + if (enable_schedule_) + LOG(INFO) << "Enable engine scheduling for BytePS server"; } extern "C" void byteps_server() { @@ -353,10 +446,14 @@ extern "C" void byteps_server() { // flag mu and its protected map std::vector tmp_flagmu(engine_thread_num_); - std::vector > tmp_ispushfinished(engine_thread_num_); - std::vector > > tmp_qpullreqmeta(engine_thread_num_); - std::vector > > tmp_seensender(engine_thread_num_); - std::vector > tmp_pullcnt(engine_thread_num_); + std::vector > tmp_ispushfinished( + engine_thread_num_); + std::vector > > + tmp_qpullreqmeta(engine_thread_num_); + std::vector > > tmp_seensender( + engine_thread_num_); + std::vector > tmp_pullcnt( + engine_thread_num_); flag_mu_.swap(tmp_flagmu); is_push_finished_.swap(tmp_ispushfinished); q_pull_reqmeta_.swap(tmp_qpullreqmeta); @@ -387,8 +484,8 @@ extern "C" void byteps_server() { byteps_server_->set_request_handle(BytePSHandler); StartAsync(0, "byteps_server\0"); if (!Postoffice::Get()->is_recovery()) { - Postoffice::Get()->Barrier(0, - ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); + Postoffice::Get()->Barrier( + 0, ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); } // clean the server resource diff --git a/byteps/server/server.h b/byteps/server/server.h index 6354e0499..f24412d90 100644 --- a/byteps/server/server.h +++ b/byteps/server/server.h @@ -23,6 +23,8 @@ #include #include "ps/ps.h" #include "../common/cpu_reducer.h" +#include "../common/compressor/compressor.h" +#include "../common/compressor/compressor_registry.h" namespace byteps { namespace server { @@ -39,7 +41,7 @@ enum class RequestType { }; enum BytePSEngineOperation { - SUM_RECV, COPY_MERGED, TERMINATE + SUM_RECV, COPY_FIRST, ALL_RECV, TERMINATE }; struct PSKV { @@ -107,6 +109,7 @@ std::vector > pull_cnt_; // byteps handler std::mutex handle_mu_; std::unordered_map update_buf_; +std::unordered_map> compressor_map_; // address map std::mutex store_mu_; diff --git a/docker/Dockerfile.mxnet b/docker/Dockerfile.mxnet new file mode 100644 index 000000000..ab82b51a4 --- /dev/null +++ b/docker/Dockerfile.mxnet @@ -0,0 +1,116 @@ +FROM nvidia/cuda:10.0-devel-ubuntu18.04 + +ARG https_proxy +ARG http_proxy + +ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 +ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} +ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH +ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH + +ENV BYTEPS_BASE_PATH /usr/local +ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps +ENV BYTEPS_GIT_LINK https://github.com/zhongyuchen/byteps + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update -qq +RUN apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ + build-essential \ + tzdata \ + ca-certificates \ + git \ + 
curl \ + wget \ + vim \ + cmake \ + lsb-release \ + libcudnn7=7.6.0.64-1+cuda10.0 \ + libnuma-dev \ + ibverbs-providers \ + librdmacm-dev \ + ibverbs-utils \ + rdmacm-utils \ + libibverbs-dev \ + python3 \ + python3-dev \ + python3-pip \ + python3-setuptools + +# Install NCCL +ENV NCCL_VERSION=7c72dee660e4d055b81721dd6b03e4e1c0a983cf +RUN cd / && \ + wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ + cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ + mkdir -p /usr/local/nccl && \ + tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ + echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ + ldconfig && rm -rf /nccl-$NCCL_VERSION + +WORKDIR /root/ + +# install gcc 4.9 +RUN mkdir -p /root/gcc/ && cd /root/gcc &&\ + wget http://launchpadlibrarian.net/247707088/libmpfr4_3.1.4-1_amd64.deb &&\ + wget http://launchpadlibrarian.net/253728424/libasan1_4.9.3-13ubuntu2_amd64.deb &&\ + wget http://launchpadlibrarian.net/253728426/libgcc-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ + wget http://launchpadlibrarian.net/253728314/gcc-4.9-base_4.9.3-13ubuntu2_amd64.deb &&\ + wget http://launchpadlibrarian.net/253728399/cpp-4.9_4.9.3-13ubuntu2_amd64.deb &&\ + wget http://launchpadlibrarian.net/253728404/gcc-4.9_4.9.3-13ubuntu2_amd64.deb &&\ + wget http://launchpadlibrarian.net/253728432/libstdc++-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ + wget http://launchpadlibrarian.net/253728401/g++-4.9_4.9.3-13ubuntu2_amd64.deb + +RUN cd /root/gcc &&\ + dpkg -i gcc-4.9-base_4.9.3-13ubuntu2_amd64.deb &&\ + dpkg -i libmpfr4_3.1.4-1_amd64.deb &&\ + dpkg -i libasan1_4.9.3-13ubuntu2_amd64.deb &&\ + dpkg -i libgcc-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ + dpkg -i cpp-4.9_4.9.3-13ubuntu2_amd64.deb &&\ + dpkg -i gcc-4.9_4.9.3-13ubuntu2_amd64.deb &&\ + dpkg -i libstdc++-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ + dpkg -i g++-4.9_4.9.3-13ubuntu2_amd64.deb + +# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. 
+RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ + update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 + + +RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ + echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ + echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ + echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ + ldconfig + +RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ + ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ + ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ + ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 + +# install mxnet +ARG FRAMEWORK_VERSION=1.5.0 +RUN python3 -m pip --no-cache-dir install mxnet-cu100==$FRAMEWORK_VERSION + +# Install BytePS +ARG BYTEPS_NCCL_LINK=shared +ARG BYTEPS_USE_RDMA=1 +ARG BYTEPS_WITHOUT_PYTORCH=1 +ARG BYTEPS_WITHOUT_TENSORFLOW=1 +ARG BYTEPS_BRANCH=gradient_compression +RUN cd $BYTEPS_BASE_PATH &&\ + git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK +RUN cd $BYTEPS_PATH &&\ + python3 setup.py install + +# Remove GCC pinning +RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ + update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ + update-alternatives --remove g++ /usr/bin/g++-4.9 && \ + update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 + +RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ + rm -rf /usr/local/cuda/lib64/libcuda.so.1 diff --git a/docs/gradient-compression.md b/docs/gradient-compression.md new file mode 100644 index 000000000..66bd6af12 --- /dev/null +++ b/docs/gradient-compression.md @@ -0,0 +1,294 @@ +## Motivation + +Currently BytePS does not fully support gradient compression. The compression it supports lies in each plugin in Python. Such design may ease the difficulty of the implementation but leads to major inabilities for more aggressive compression. This is because NCCL only supports limited reduction operations such as Sum, Prod etc but these operations are meaningless for the compressed data which have been highly bit-wisely packed. For example, for [signSGD](https://arxiv.org/pdf/1802.04434.pdf), one of the most popular methods for gradient compression due to its simplicity and effectiveness, each bit represents a signbit of an element in the original data tensor, making reduction operations like summation totally meaningless. But reduction is necessary for multi-GPU devices. + +Another problem is that compared to inter-node communication, intra-node communication is not the bottleneck. Furthermore, too much compression at first will lose much information, which may cause low accuracy. So there is no need to make too radical compression before running into BytePS core in worker nodes. + +Therefore, changes need to be made. 
+ +## Design Overview + +In light of the problems mentioned above, we propose two-level gradient compression: + +1. intra-node: This is just an alias for the current implementation, named after its communication property. Transform FP32 tensors into FP16 on each GPU, reduce them across multi-GPUs via NCCL, and copy them to the CPU buffer waiting for next-level compression. The purpose of the compression is to reduce intra-node communication overhead introduced by multi-GPUs. Since intra-node communication is very fast, especially with NCCL, only mild compression methods will be applied, most of which is type-conversion. It is framework-specific and will be implemented in each plugin. + +2. inter-node: Usually inter-node communication is a bottleneck, so more drastically gradient compression algorithms will be applied here. This is framework-agnostic and will be implemented in BytePS core. + +It is worth mentioning that our design supports all frameworks. + +![architecture](https://user-images.githubusercontent.com/25879526/86322951-7abf4000-bc6e-11ea-871f-572a7efed7cd.png) + +## Interface + +Only a few changes to be made for users. Users only have to add a few LOC in the script to specify which compression algorithm to be used and the parameters needed by the algorithm. Take MXNet for example. + +```python +compression_params = { + "compressor": opt.compressor, + "ef": opt.ef, + "momentum": opt.compress_momentum, + "scaling": opt.onebit_scaling, + "k": opt.k +} + +trainer = bps.DistributedTrainer(params, optimizer, optimizer_params, compression_params=compression_params) +``` + +Here we prescribe some keys. Users can lookup documentations to determine which key should be used. Here are some common keys. + +| KEYS | DESC | +| --- | --- | +| compressor | compression algorithms, including onebit / dithering / topk / randomk | +| k | an integer, must be specified when using dithering / topk / randomk | +| scaling | optional, whether to enable scaling for onebit, default is false | +| ef | error-feedback algorithms, e.g. vanilla | +| momentum | momentum algorithms, e.g. nesterov | +| seed | random seed | + +If the user's input is not correct, it will give a warning and abort. + +## Implementation + +### Parameter Data Structure + +To offer users a unified interface to use, we have to address the registration problem. parameters vary from different kinds of compression algorithms. For example, topk and randomk algorithms need parameter k to be specified while onebit algorithm may need to input whether to enable scaling flag. Some parameters are optional but others are not. So parameter passing is a challenge. + +We address this challenge using _string-string dictionary_ (`std::unorded_map` for C++ or `dict` for Python) as our unified data structure to pass parameters. As mentioned above, we prescribe specific strings as keys, so the _dictionary_ will look like: + +```python +{"byteps_compressor_type": "topk", "byteps_compressor_k": "3", "byteps_error_feedback_type": "vanilla"} +``` + +**Python** + +For MXNet users, the dictionary can be an attribute of ParameterDict. We can filter out those parameters by leveraging the prefix "byteps". 
For example, + +```python +for i, param in enumerate(self._params): + byteps_declare_tensor("parameter_" + str(i)) + if param.grad_req != 'null': + byteps_params = dict( + filter(lambda attr: attr[0].startswith( + "byteps_",), param.__dict__.items()) + ) + byteps_declare_tensor("gradient_" + str(i), **byteps_params) +``` + +**C++** + +Using ctypes, we can pass the dictionary conveniently. For example, +```c++ +extern "C" void byteps_mxnet_declare_tensor(char* name, int num_params, + char** param_keys, + char** param_vals) { + ... + + std::unordered_map param_dict; + std::string key, val; + std::string::size_type pos; + for (int i = 0; i < num_params; ++i) { + key = param_keys[i]; + val = param_vals[i]; + param_dict[key] = val; + } + + ... +} +``` + +### Compressor - Development API + +We want developers to develop their own gradient compression algorithms without fully understanding how BytePS works. What they only need to know is development API. We currently implement some commonly used gradient compression algorithms, but in the future, we hope more novel algorithms will be implemented under our API. We abstract compression algorithms into `compressor`. The `Compressor` looks like this: + +```c++ +class Compressor { + public: + Compressor(size_t size, DataType dtype) + : _size(size), + _dtype(dtype), + _buf(new byte_t[size]), + _cpu_reducer(new CpuReducer(nullptr)){}; + virtual ~Compressor() = default; + + virtual tensor_t Compress(tensor_t grad) = 0; + + virtual tensor_t Decompress(tensor_t compressed) = 0; + + virtual void FastUpdateError(tensor_t error, tensor_t corrected, + tensor_t compressed) { + BPS_LOG(FATAL) << "FastUpdateError is not implemented"; + }; + + std::unique_ptr _buf; + + size_t _size; + + DataType _dtype; + + std::unique_ptr _cpu_reducer; +}; +``` + +In order to make less modifications to BytePS core, we want compressors to be as general as possible. In the best case, the base compressor pointer/reference can represent all kinds of compressors and only need to expose two operations to users: `Compress` and `Decompress`. This is quite challenging because there are some optional features for gradient compression, such as error-feedback and momentum. These are two common methods to correct the bias and accelerate the training process respectively. For example, with error-feedback, before being compressed, gradients are first corrected with errors which refer to the information loss during the last compression, and then errors are re-calculated. Therefore, the workflow is different from only using vanilla gradient compression. + +In order to support all these features and expose a unified API at the same time, we use the decorator pattern. We regard error-feedback as an additional behavior of compressors. We want a unified API, which means compressors with error-feedback should expose the same method as those without error-feedback. But in that case we have to create a subclass for each compressor, which is too redundant. So the decorator pattern just solves our problem. We create a decorator class named `ErrorFeedback` to inherit `BaseCompressor` while at the same time also keeping a member of `BaseCompressor`. 
For example, + +```c++ +class ErrorFeedback : public Compressor { + public: + ErrorFeedback(size_t size, DataType dtype, std::unique_ptr cptr) + : Compressor(size, dtype), + _cptr(std::move(cptr)), + _error(new byte_t[size]()) {} + virtual ~ErrorFeedback() = default; + + virtual tensor_t Compress(tensor_t grad) final; + + virtual tensor_t Decompress(tensor_t compressed) final; + + protected: + + virtual void UpdateGradient(tensor_t grad) = 0; + + virtual void UpdateError(tensor_t corrected, tensor_t compressed); + + protected: + std::unique_ptr _error; + + private: + std::unique_ptr _cptr; +}; +``` + +And the workflow is implemented in `Compress` and `Decompress`. For example, +```c++ +tensor_t ErrorFeedback::Compress(tensor_t grad) { + // 1. grad <- grad + error + UpdateGradient(grad); + + // 2. c <- Compress(grad) + auto compressed = _cptr->Compress(grad); + + // 3. e <- grad - Decompress(c) + UpdateError(grad, compressed); + + return compressed; +} + +tensor_t ErrorFeedback::Decompress(tensor_t compressed) { + // directly forward to internal compressor + return _cptr->Decompress(compressed); +} +``` + +`Momentum` is implemented in the same way. `ErrorFeedBack` and `Momentum` are also base classes to inherit. In this way, error-feedback and momentum becomes optional features to be added to any vanilla gradient compression algorithms. + +BTW, momentum is not applied to servers. + +## Exps + +### CIFAR100 + +#### End-to-End Training + +We conduct the experiment in distributed training ResNet18_v2 on the CIFAR100 datasets with 4 AWS P3.16xlarge instances, each equipped with 8 V100 GPUs and 25Gbps network. The compression algorithms benchmarked here are also equipped with error-feedback and nesterov momentum. We set k = 1 for topk and k = 8 for randomk. We train it for 200 epochs. + +![image](https://user-images.githubusercontent.com/25879526/86323315-38e2c980-bc6f-11ea-9c5c-038371d5d6b5.png) + +![image](https://user-images.githubusercontent.com/25879526/86323299-2ec0cb00-bc6f-11ea-82d8-ee31c4bb3ec8.png) + +| f888c8d8f9e8483e46acd00042ed262e30c6856e | VAl ACC | TIME(s) | +| -- | -- | -- | +|baseline| 0.713799| 703.1527987500002| +|onebit| 0.705601| 629.4210848750001| +|randomk| 0.6991| 501.99770550000005| +|topk| 0.704202| 507.90769437499966| + + +The results show that compression can reduce up to 28.6% end-to-end training time without accuracy loss. + +#### Slow Network + +Gradient compression is more beneficial in slower network. Therefore we limit the network bandwidth to 100Mbps (both downlink and uplink) and keep all other settings not changed. The results show that we can achieve up to 6x reduciton in training time. + +![image](https://user-images.githubusercontent.com/25879526/86326780-c96fd880-bc74-11ea-9bcf-673f061f0020.png) + +| b382f996d159fbe4d48c1135290f5c4183fc6b46 | TIME(s) | +| -- | -- | +|baseline| 518.321322125| +|onebit| 195.236724875| +|randomk| 89.672168625| +|topk| 83.9287285| + +### IMAGENET + +To save time, we only tested 1bit algorithm. Topk and randomk are not guaranteed to converge on IMAGENET. + +#### Workload Breakdown + +In this experiment, we measure the workload breakdown into computation and communication. We use 8 Amazon EC2 p3.2xlarge instances, each of which is shipped with one Nvidia V100 GPU and 10Gbps Ethernet. We train two CNN models: Resnet-50_v2 and VGG-16. We first measure the computation time by collecting the elapsed time of running 50 iterations (t0) on one node. Then we measure the total training time for running 50 iterations (t1) on 8 nodes. 
Then, we get an estimate of communication time using t1 − t0. + +As the figure shows, dist-EF-SGDM can reduce communication to varying degrees. For ResNet50_v2, the drop is trivial (17.6% decrease), mainly due to the smaller model size. In contrast, a remarkable decline (73.2% decrease) occurs using dist-EF-SGDM for VGG-16, since VGG-16 has larger model size (528M). + +[ResNet50_v2] +![image](https://user-images.githubusercontent.com/25879526/86327486-02f51380-bc76-11ea-8919-a66dcbc44862.png) + +[VGG-16] +![image](https://user-images.githubusercontent.com/25879526/86327498-05576d80-bc76-11ea-95c6-b9f285193bb9.png) + + + +#### Scaling Efficiency + +We also measure scaling efficiency when the number of nodes varies from 1 to 8. We follow the same setup as in the above experiment. The figure shows that gradient compression improves the scaling efficiency. The efficiency gain in gradient compression is much higher for VGG-16 than ResNet-50_v2, since ResNet50_v2 has smaller communication overhead. + +[ResNet50_v2] +![image](https://user-images.githubusercontent.com/25879526/86327513-0a1c2180-bc76-11ea-88a8-292f09d434b7.png) + +[VGG-16] +![image](https://user-images.githubusercontent.com/25879526/86327520-0be5e500-bc76-11ea-9711-c5618923b956.png) + +___ +The above two sub-experiments were conducted 2 months ago. There have been large updates since then. So the results are a little outdated. They are just for reference. + +#### End-to-End Training + +Finally, we train ResNet50_v2 and VGG-16 end-to-end to measure total reduction in training time. For such large batch training, warmup and linear scaling learning rate + are used to avoid generalization gap. We set the number of warmup epochs to 5. We also leverage cosine annealing strategy for learning rate decay. For ResNet50_v2 we use 8 AWS EC2 P3.16xlarge instances while for VGG-16, we use 4 AWS EC2 P3.16xlarge. + +[ResNet50_v2] +![image](https://user-images.githubusercontent.com/25879526/86327533-10120280-bc76-11ea-99ef-5c9e4c17e1bc.png) +![image](https://user-images.githubusercontent.com/25879526/86327537-11dbc600-bc76-11ea-84e6-bef6b88296b0.png) + +As the figure shows, we reduce the trianing time by 8.0% without accuracy loss for ResNet50_v2. + +| 6c44049fd49e532781af96add6a02a0427e6a1a8 | VAl ACC | TIME(h) | +| -- | -- | -- | +|sgdm| 0.76914465625| 2.6505945833029516| +|dist-ef-sgdm| 0.7632242968749999|2.4378090010373263 | + +[VGG-16] +![image](https://user-images.githubusercontent.com/25879526/86327546-143e2000-bc76-11ea-8969-30c037f7022c.png) +![image](https://user-images.githubusercontent.com/25879526/86327556-16a07a00-bc76-11ea-943c-761b1b4dafbd.png) + +The above figure shows that our implementation of dist-EF-SGDM reduces the training time for 100 epochs by 39.04% compared to the full-precision SGDM. We note that there is a small gap in accuracy between dist-EF-SGDM and SGDM. We will investigate this problem in the future. + + + +## TODO + +- [x] support inter-node compression +- [x] support intra-node for MXNet +- [x] support onebit compressor +- [x] support error-feedback +- [x] support momentum +- [ ] support other compressors +- [ ] support PyTorch and Tensorflow + +## Precautions + +1. To run successfully, `ps-lite` should change one LOC. see the PR here. https://github.com/dmlc/ps-lite/pull/168 +2. We only support Gluon for MXNet now. Raw MXNet's API does not support it. +3. Since gradient compression also has some overhead, this is a trade-off. It is only suitable for some cases, e.g. slow network or large models. 
In other cases, gradient compression will even harm performance. +4. Momentum here is the same as the framework's momentum. Why do we have to implement momentum again? This is because for some algorithms like [dist-EF-SGDM](https://papers.nips.cc/paper/9321-communication-efficient-distributed-blockwise-momentum-sgd-with-error-feedback.pdf) , momentum should be added first but many frameworks like MXNet exchange gradient first and then add the momentum. So we have to implement momentum inside BytePS. When inside momentum is used, outside momentum should be disabled (set \mu = 0) in the users' scripts. +5. FP16 is not supported now. \ No newline at end of file diff --git a/example/mxnet/train_cifar100_byteps_gc.py b/example/mxnet/train_cifar100_byteps_gc.py new file mode 100644 index 000000000..7179b5aa4 --- /dev/null +++ b/example/mxnet/train_cifar100_byteps_gc.py @@ -0,0 +1,316 @@ +# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This file is modified from +`gluon-cv/scripts/classification/cifar/train_cifar10.py`""" +import argparse +import logging +import subprocess +import time + +import gluoncv as gcv +import matplotlib +import mxnet as mx +from gluoncv.data import transforms as gcv_transforms +from gluoncv.model_zoo import get_model +from gluoncv.utils import LRScheduler, LRSequential, makedirs +from mxnet import autograd as ag +from mxnet import gluon +from mxnet.gluon.data.vision import transforms + +import byteps.mxnet as bps + +matplotlib.use('Agg') + + +gcv.utils.check_version('0.6.0') + + +# CLI + +def parse_args(): + parser = argparse.ArgumentParser( + description='Train a model for image classification.') + parser.add_argument('--batch-size', type=int, default=32, + help='training batch size per device (CPU/GPU).') + parser.add_argument('--num-gpus', type=int, default=0, + help='number of gpus to use.') + parser.add_argument('--model', type=str, default='resnet', + help='model to use. options are resnet and wrn. default is resnet.') + parser.add_argument('-j', '--num-data-workers', dest='num_workers', default=4, type=int, + help='number of preprocessing workers') + parser.add_argument('--num-epochs', type=int, default=200, + help='number of training epochs.') + parser.add_argument('--lr', type=float, default=0.1, + help='learning rate. default is 0.1.') + parser.add_argument('--momentum', type=float, default=0.9, + help='momentum value for optimizer, default is 0.9.') + parser.add_argument('--wd', type=float, default=0.0005, + help='weight decay rate. default is 0.0005.') + parser.add_argument('--lr-decay', type=float, default=0.1, + help='decay rate of learning rate. default is 0.1.') + parser.add_argument('--lr-decay-period', type=int, default=0, + help='period in epoch for learning rate decays. 
default is 0 (has no effect).') + parser.add_argument('--lr-decay-epoch', type=str, default='100,150', + help='epochs at which learning rate decays. default is 100,150.') + parser.add_argument('--warmup-lr', type=float, default=0.0, + help='starting warmup learning rate. default is 0.0.') + parser.add_argument('--warmup-epochs', type=int, default=0, + help='number of warmup epochs.') + parser.add_argument('--drop-rate', type=float, default=0.0, + help='dropout rate for wide resnet. default is 0.') + parser.add_argument('--mode', type=str, + help='mode in which to train the model. options are imperative, hybrid') + parser.add_argument('--save-period', type=int, default=10, + help='period in epoch of model saving.') + parser.add_argument('--save-dir', type=str, default='params', + help='directory of saved models') + parser.add_argument('--resume-from', type=str, + help='resume training from the model') + parser.add_argument('--logging-file', type=str, default='baseline', + help='name of training log file') + # additional arguments for gradient compression + parser.add_argument('--compressor', type=str, default='', + help='which compressor') + parser.add_argument('--ef', type=str, default='', + help='which error-feedback') + parser.add_argument('--compress-momentum', type=str, default='', + help='which compress momentum') + parser.add_argument('--onebit-scaling', action='store_true', default=False, + help='enable scaling for onebit compressor') + parser.add_argument('--k', default=1.0, type=float, + help='topk or randomk') + parser.add_argument('--fp16-pushpull', action='store_true', default=False, + help='use fp16 compression during pushpull') + opt = parser.parse_args() + return opt + + +def main(): + opt = parse_args() + + bps.init() + + gpu_name = subprocess.check_output( + ['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) + gpu_name = gpu_name.decode('utf8').split('\n')[-2] + gpu_name = '-'.join(gpu_name.split()) + filename = "cifar100-%d-%s-%s.log" % (bps.size(), + gpu_name, opt.logging_file) + filehandler = logging.FileHandler(filename) + streamhandler = logging.StreamHandler() + + logger = logging.getLogger('') + logger.setLevel(logging.INFO) + logger.addHandler(filehandler) + logger.addHandler(streamhandler) + + logger.info(opt) + + batch_size = opt.batch_size + classes = 100 + + num_gpus = opt.num_gpus + # batch_size *= max(1, num_gpus) + context = mx.gpu(bps.local_rank()) if num_gpus > 0 else mx.cpu( + bps.local_rank()) + num_workers = opt.num_workers + nworker = bps.size() + rank = bps.rank() + + lr_decay = opt.lr_decay + lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')] + + num_batches = 50000 // (opt.batch_size * nworker) + lr_scheduler = LRSequential([ + LRScheduler('linear', base_lr=opt.warmup_lr, + target_lr=opt.lr * nworker / bps.local_size(), + nepochs=opt.warmup_epochs, iters_per_epoch=num_batches), + LRScheduler('step', base_lr=opt.lr * nworker / bps.local_size(), + target_lr=0, + nepochs=opt.num_epochs - opt.warmup_epochs, + iters_per_epoch=num_batches, + step_epoch=lr_decay_epoch, + step_factor=lr_decay, power=2) + ]) + + num_batches = 50000 // (opt.batch_size * nworker) + lr_scheduler = LRSequential([ + LRScheduler('linear', base_lr=opt.warmup_lr, target_lr=opt.lr * nworker / bps.local_size(), + nepochs=opt.warmup_epochs, iters_per_epoch=num_batches), + LRScheduler('step', base_lr=opt.lr * nworker / bps.local_size(), target_lr=0, + nepochs=opt.num_epochs - opt.warmup_epochs, + iters_per_epoch=num_batches, + step_epoch=lr_decay_epoch, + 
step_factor=lr_decay, power=2) + ]) + + model_name = opt.model + if model_name.startswith('cifar_wideresnet'): + kwargs = {'classes': classes, + 'drop_rate': opt.drop_rate} + else: + kwargs = {'classes': classes} + net = get_model(model_name, **kwargs) + if opt.resume_from: + net.load_parameters(opt.resume_from, ctx=context) + + if opt.compressor: + optimizer = 'sgd' + else: + optimizer = 'nag' + + save_period = opt.save_period + if opt.save_dir and save_period: + save_dir = opt.save_dir + makedirs(save_dir) + else: + save_dir = '' + save_period = 0 + + # from https://github.com/weiaicunzai/pytorch-cifar/blob/master/conf/global_settings.py + CIFAR100_TRAIN_MEAN = [0.5070751592371323, + 0.48654887331495095, 0.4409178433670343] + CIFAR100_TRAIN_STD = [0.2673342858792401, + 0.2564384629170883, 0.27615047132568404] + + transform_train = transforms.Compose([ + gcv_transforms.RandomCrop(32, pad=4), + transforms.RandomFlipLeftRight(), + transforms.ToTensor(), + transforms.Normalize(CIFAR100_TRAIN_MEAN, + CIFAR100_TRAIN_STD) + ]) + + transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(CIFAR100_TRAIN_MEAN, + CIFAR100_TRAIN_STD) + ]) + + def test(ctx, val_data): + metric = mx.metric.Accuracy() + for i, batch in enumerate(val_data): + data = gluon.utils.split_and_load( + batch[0], ctx_list=ctx, batch_axis=0) + label = gluon.utils.split_and_load( + batch[1], ctx_list=ctx, batch_axis=0) + outputs = [net(X) for X in data] + metric.update(label, outputs) + return metric.get() + + def train(epochs, ctx): + if isinstance(ctx, mx.Context): + ctx = [ctx] + net.initialize(mx.init.Xavier(), ctx=ctx) + + train_data = gluon.data.DataLoader( + gluon.data.vision.CIFAR100(train=True).shard( + nworker, rank).transform_first(transform_train), + batch_size=batch_size, shuffle=True, last_batch='discard', + num_workers=num_workers) + + val_data = gluon.data.DataLoader( + gluon.data.vision.CIFAR100(train=False).shard( + nworker, rank).transform_first(transform_test), + batch_size=batch_size, shuffle=False, num_workers=num_workers) + + params = net.collect_params() + + compression_params = { + "compressor": opt.compressor, + "ef": opt.ef, + "momentum": opt.compress_momentum, + "scaling": opt.onebit_scaling, + "k": opt.k + } + + optimizer_params = {'lr_scheduler': lr_scheduler, + 'wd': opt.wd, 'momentum': opt.momentum} + + trainer = bps.DistributedTrainer(params, + optimizer, + optimizer_params, + compression_params=compression_params) + metric = mx.metric.Accuracy() + train_metric = mx.metric.Accuracy() + loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() + + iteration = 0 + best_val_score = 0 + bps.byteps_declare_tensor("acc") + for epoch in range(epochs): + tic = time.time() + train_metric.reset() + metric.reset() + train_loss = 0 + num_batch = len(train_data) + + for i, batch in enumerate(train_data): + data = gluon.utils.split_and_load( + batch[0], ctx_list=ctx, batch_axis=0) + label = gluon.utils.split_and_load( + batch[1], ctx_list=ctx, batch_axis=0) + + with ag.record(): + output = [net(X) for X in data] + loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)] + for l in loss: + l.backward() + trainer.step(batch_size) + train_loss += sum([l.sum().asscalar() for l in loss]) + + train_metric.update(label, output) + name, train_acc = train_metric.get() + iteration += 1 + + train_loss /= batch_size * num_batch + name, train_acc = train_metric.get() + throughput = int(batch_size * nworker * i / (time.time() - tic)) + + logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f lr=%f' 
% + (epoch, throughput, time.time()-tic, trainer.learning_rate)) + + name, val_acc = test(ctx, val_data) + acc = mx.nd.array([train_acc, val_acc], ctx=ctx[0]) + bps.byteps_push_pull(acc, name="acc", is_average=False) + acc /= bps.size() + train_acc, val_acc = acc[0].asscalar(), acc[1].asscalar() + if bps.rank() == 0: + logger.info('[Epoch %d] training: %s=%f' % + (epoch, name, train_acc)) + logger.info('[Epoch %d] validation: %s=%f' % + (epoch, name, val_acc)) + + if val_acc > best_val_score: + best_val_score = val_acc + net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' % + (save_dir, best_val_score, model_name, + epoch)) + + if save_period and save_dir and (epoch + 1) % save_period == 0: + net.save_parameters('%s/cifar100-%s-%d.params' % + (save_dir, model_name, epoch)) + + if save_period and save_dir: + net.save_parameters('%s/cifar100-%s-%d.params' % + (save_dir, model_name, epochs-1)) + + if opt.mode == 'hybrid': + net.hybridize() + train(opt.num_epochs, context) + + +if __name__ == '__main__': + main() diff --git a/example/mxnet/train_gluon_imagenet_byteps_gc.py b/example/mxnet/train_gluon_imagenet_byteps_gc.py new file mode 100644 index 000000000..aa759b82a --- /dev/null +++ b/example/mxnet/train_gluon_imagenet_byteps_gc.py @@ -0,0 +1,548 @@ +import argparse +import logging +import math +import os +import subprocess +import time + +import gluoncv as gcv +import mxnet as mx +import numpy as np +from gluoncv.data import imagenet +from gluoncv.model_zoo import get_model +from gluoncv.utils import LRScheduler, LRSequential, makedirs +from mxnet import autograd as ag +from mxnet import gluon, nd +from mxnet.gluon.data.vision import transforms + +import byteps.mxnet as bps + +gcv.utils.check_version('0.6.0') + + +# CLI + +def parse_args(): + parser = argparse.ArgumentParser( + description='Train a model for image classification.') + parser.add_argument('--data-dir', type=str, default='~/.mxnet/datasets/imagenet', + help='training and validation pictures to use.') + parser.add_argument('--rec-train', type=str, default='~/.mxnet/datasets/imagenet/rec/train.rec', + help='the training data') + parser.add_argument('--rec-train-idx', type=str, default='~/.mxnet/datasets/imagenet/rec/train.idx', + help='the index of training data') + parser.add_argument('--rec-val', type=str, default='~/.mxnet/datasets/imagenet/rec/val.rec', + help='the validation data') + parser.add_argument('--rec-val-idx', type=str, default='~/.mxnet/datasets/imagenet/rec/val.idx', + help='the index of validation data') + parser.add_argument('--use-rec', action='store_true', + help='use image record iter for data input. default is false.') + parser.add_argument('--batch-size', type=int, default=32, + help='training batch size per device (CPU/GPU).') + parser.add_argument('--dtype', type=str, default='float32', + help='data type for training. default is float32') + parser.add_argument('--num-gpus', type=int, default=0, + help='number of gpus to use.') + parser.add_argument('-j', '--num-data-workers', dest='num_workers', default=4, type=int, + help='number of preprocessing workers') + parser.add_argument('--num-epochs', type=int, default=3, + help='number of training epochs.') + parser.add_argument('--lr', type=float, default=0.1, + help='learning rate. default is 0.1.') + parser.add_argument('--momentum', type=float, default=0.9, + help='momentum value for optimizer, default is 0.9.') + parser.add_argument('--wd', type=float, default=0.0001, + help='weight decay rate. 
default is 0.0001.') + parser.add_argument('--lr-mode', type=str, default='step', + help='learning rate scheduler mode. options are step, poly and cosine.') + parser.add_argument('--lr-decay', type=float, default=0.1, + help='decay rate of learning rate. default is 0.1.') + parser.add_argument('--lr-decay-period', type=int, default=0, + help='interval for periodic learning rate decays. default is 0 to disable.') + parser.add_argument('--lr-decay-epoch', type=str, default='40,60', + help='epochs at which learning rate decays. default is 40,60.') + parser.add_argument('--warmup-lr', type=float, default=0.0, + help='starting warmup learning rate. default is 0.0.') + parser.add_argument('--warmup-epochs', type=int, default=0, + help='number of warmup epochs.') + parser.add_argument('--last-gamma', action='store_true', + help='whether to init gamma of the last BN layer in each bottleneck to 0.') + parser.add_argument('--mode', type=str, + help='mode in which to train the model. options are symbolic, imperative, hybrid') + parser.add_argument('--model', type=str, required=True, + help='type of model to use. see vision_model for options.') + parser.add_argument('--input-size', type=int, default=224, + help='size of the input image size. default is 224') + parser.add_argument('--crop-ratio', type=float, default=0.875, + help='Crop ratio during validation. default is 0.875') + parser.add_argument('--use-pretrained', action='store_true', + help='enable using pretrained model from gluon.') + parser.add_argument('--use_se', action='store_true', + help='use SE layers or not in resnext. default is false.') + parser.add_argument('--mixup', action='store_true', + help='whether train the model with mix-up. default is false.') + parser.add_argument('--mixup-alpha', type=float, default=0.2, + help='beta distribution parameter for mixup sampling, default is 0.2.') + parser.add_argument('--mixup-off-epoch', type=int, default=0, + help='how many last epochs to train without mixup, default is 0.') + parser.add_argument('--label-smoothing', action='store_true', + help='use label smoothing or not in training. default is false.') + parser.add_argument('--no-wd', action='store_true', + help='whether to remove weight decay on bias, and beta/gamma for batchnorm layers.') + parser.add_argument('--teacher', type=str, default=None, + help='teacher model for distillation training') + parser.add_argument('--temperature', type=float, default=20, + help='temperature parameter for distillation teacher model') + parser.add_argument('--hard-weight', type=float, default=0.5, + help='weight for the loss of one-hot label for distillation training') + parser.add_argument('--batch-norm', action='store_true', + help='enable batch normalization or not in vgg. 
default is false.') + parser.add_argument('--save-frequency', type=int, default=10, + help='frequency of model saving.') + parser.add_argument('--save-dir', type=str, default='params', + help='directory of saved models') + parser.add_argument('--resume-epoch', type=int, default=0, + help='epoch to resume training from.') + parser.add_argument('--resume-params', type=str, default='', + help='path of parameters to load from.') + parser.add_argument('--resume-states', type=str, default='', + help='path of trainer state to load from.') + parser.add_argument('--log-interval', type=int, default=50, + help='Number of batches to wait before logging.') + parser.add_argument('--logging-file', type=str, default='train_imagenet.log', + help='name of training log file') + parser.add_argument('--use-gn', action='store_true', + help='whether to use group norm.') + # additional arguments for gradient compression + parser.add_argument('--compressor', type=str, default='', + help='which compressor') + parser.add_argument('--ef', type=str, default='', + help='which error-feedback') + parser.add_argument('--compress-momentum', type=str, default='', + help='which compress momentum') + parser.add_argument('--onebit-scaling', action='store_true', default=False, + help='enable scaling for onebit compressor') + parser.add_argument('--k', default=1, type=int, + help='topk or randomk') + parser.add_argument('--fp16-pushpull', action='store_true', default=False, + help='use fp16 compression during pushpull') + + opt = parser.parse_args() + return opt + + +def main(): + opt = parse_args() + + bps.init() + gpu_name = subprocess.check_output( + ['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) + gpu_name = gpu_name.decode('utf8').split('\n')[-2] + gpu_name = '-'.join(gpu_name.split()) + filename = "imagenet-%d-%s-%s.log" % (bps.size(), + gpu_name, opt.logging_file) + filehandler = logging.FileHandler(filename) + streamhandler = logging.StreamHandler() + + logger = logging.getLogger('') + logger.setLevel(logging.INFO) + logger.addHandler(filehandler) + logger.addHandler(streamhandler) + + logger.info(opt) + + batch_size = opt.batch_size + classes = 1000 + num_training_samples = 1281167 + + num_gpus = opt.num_gpus + # batch_size *= max(1, num_gpus) + context = mx.gpu(bps.local_rank()) if num_gpus > 0 else mx.cpu( + bps.local_rank()) + num_workers = opt.num_workers + nworker = bps.size() + rank = bps.rank() + + lr_decay = opt.lr_decay + lr_decay_period = opt.lr_decay_period + if opt.lr_decay_period > 0: + lr_decay_epoch = list( + range(lr_decay_period, opt.num_epochs, lr_decay_period)) + else: + lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')] + lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch] + num_batches = num_training_samples // (batch_size*nworker) + + lr_scheduler = LRSequential([ + LRScheduler('linear', base_lr=opt.warmup_lr, target_lr=opt.lr * nworker / bps.local_size(), + nepochs=opt.warmup_epochs, iters_per_epoch=num_batches), + LRScheduler(opt.lr_mode, base_lr=opt.lr * nworker / bps.local_size(), target_lr=0, + nepochs=opt.num_epochs - opt.warmup_epochs, + iters_per_epoch=num_batches, + step_epoch=lr_decay_epoch, + step_factor=lr_decay, power=2) + ]) + + model_name = opt.model + + kwargs = {'ctx': context, + 'pretrained': opt.use_pretrained, 'classes': classes} + if opt.use_gn: + from gluoncv.nn import GroupNorm + kwargs['norm_layer'] = GroupNorm + if model_name.startswith('vgg'): + kwargs['batch_norm'] = opt.batch_norm + elif model_name.startswith('resnext'): + 
kwargs['use_se'] = opt.use_se + + if opt.last_gamma: + kwargs['last_gamma'] = True + + if opt.compressor: + optimizer = 'sgd' + else: + optimizer = 'nag' + + optimizer_params = {'wd': opt.wd, + 'momentum': opt.momentum, 'lr_scheduler': lr_scheduler} + + if opt.dtype != 'float32': + optimizer_params['multi_precision'] = True + + net = get_model(model_name, **kwargs) + net.cast(opt.dtype) + if opt.resume_params is not '': + net.load_parameters(opt.resume_params, ctx=context) + + # teacher model for distillation training + if opt.teacher is not None and opt.hard_weight < 1.0: + teacher_name = opt.teacher + teacher = get_model(teacher_name, pretrained=True, + classes=classes, ctx=context) + teacher.cast(opt.dtype) + distillation = True + else: + distillation = False + + # Two functions for reading data from record file or raw images + def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size, num_workers): + rec_train = os.path.expanduser(rec_train) + rec_train_idx = os.path.expanduser(rec_train_idx) + rec_val = os.path.expanduser(rec_val) + rec_val_idx = os.path.expanduser(rec_val_idx) + jitter_param = 0.4 + lighting_param = 0.1 + input_size = opt.input_size + crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875 + resize = int(math.ceil(input_size / crop_ratio)) + mean_rgb = [123.68, 116.779, 103.939] + std_rgb = [58.393, 57.12, 57.375] + + def batch_fn(batch, ctx): + data = gluon.utils.split_and_load( + batch.data[0], ctx_list=ctx, batch_axis=0) + label = gluon.utils.split_and_load( + batch.label[0], ctx_list=ctx, batch_axis=0) + return data, label + + train_data = mx.io.ImageRecordIter( + path_imgrec=rec_train, + path_imgidx=rec_train_idx, + preprocess_threads=num_workers, + shuffle=True, + batch_size=batch_size, + + data_shape=(3, input_size, input_size), + mean_r=mean_rgb[0], + mean_g=mean_rgb[1], + mean_b=mean_rgb[2], + std_r=std_rgb[0], + std_g=std_rgb[1], + std_b=std_rgb[2], + rand_mirror=True, + random_resized_crop=True, + max_aspect_ratio=4. / 3., + min_aspect_ratio=3. 
/ 4., + max_random_area=1, + min_random_area=0.08, + brightness=jitter_param, + saturation=jitter_param, + contrast=jitter_param, + pca_noise=lighting_param, + num_parts=nworker, + part_index=rank + ) + val_data = mx.io.ImageRecordIter( + path_imgrec=rec_val, + path_imgidx=rec_val_idx, + preprocess_threads=num_workers, + shuffle=False, + batch_size=batch_size, + + resize=resize, + data_shape=(3, input_size, input_size), + mean_r=mean_rgb[0], + mean_g=mean_rgb[1], + mean_b=mean_rgb[2], + std_r=std_rgb[0], + std_g=std_rgb[1], + std_b=std_rgb[2], + num_parts=nworker, + part_index=rank + ) + return train_data, val_data, batch_fn + + def get_data_loader(data_dir, batch_size, num_workers): + normalize = transforms.Normalize( + [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + jitter_param = 0.4 + lighting_param = 0.1 + input_size = opt.input_size + crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875 + resize = int(math.ceil(input_size / crop_ratio)) + + def batch_fn(batch, ctx): + data = gluon.utils.split_and_load( + batch[0], ctx_list=ctx, batch_axis=0) + label = gluon.utils.split_and_load( + batch[1], ctx_list=ctx, batch_axis=0) + return data, label + + transform_train = transforms.Compose([ + transforms.RandomResizedCrop(input_size), + transforms.RandomFlipLeftRight(), + transforms.RandomColorJitter(brightness=jitter_param, contrast=jitter_param, + saturation=jitter_param), + transforms.RandomLighting(lighting_param), + transforms.ToTensor(), + normalize + ]) + transform_test = transforms.Compose([ + transforms.Resize(resize, keep_ratio=True), + transforms.CenterCrop(input_size), + transforms.ToTensor(), + normalize + ]) + + train_data = gluon.data.DataLoader( + imagenet.classification.ImageNet( + data_dir, train=True).transform_first(transform_train), + batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers) + val_data = gluon.data.DataLoader( + imagenet.classification.ImageNet( + data_dir, train=False).transform_first(transform_test), + batch_size=batch_size, shuffle=False, num_workers=num_workers) + + return train_data, val_data, batch_fn + + if opt.use_rec: + train_data, val_data, batch_fn = get_data_rec(opt.rec_train, opt.rec_train_idx, + opt.rec_val, opt.rec_val_idx, + batch_size, num_workers) + else: + train_data, val_data, batch_fn = get_data_loader( + opt.data_dir, batch_size, num_workers) + + if opt.mixup: + train_metric = mx.metric.RMSE() + else: + train_metric = mx.metric.Accuracy() + acc_top1 = mx.metric.Accuracy() + acc_top5 = mx.metric.TopKAccuracy(5) + + save_frequency = opt.save_frequency + if opt.save_dir and save_frequency: + save_dir = opt.save_dir + makedirs(save_dir) + else: + save_dir = '' + save_frequency = 0 + + def mixup_transform(label, classes, lam=1, eta=0.0): + if isinstance(label, nd.NDArray): + label = [label] + res = [] + for l in label: + y1 = l.one_hot(classes, on_value=1 - eta + eta / + classes, off_value=eta/classes) + y2 = l[::-1].one_hot(classes, on_value=1 - + eta + eta/classes, off_value=eta/classes) + res.append(lam*y1 + (1-lam)*y2) + return res + + def smooth(label, classes, eta=0.1): + if isinstance(label, nd.NDArray): + label = [label] + smoothed = [] + for l in label: + res = l.one_hot(classes, on_value=1 - eta + eta / + classes, off_value=eta/classes) + smoothed.append(res) + return smoothed + + def test(ctx, val_data): + if opt.use_rec: + val_data.reset() + acc_top1.reset() + acc_top5.reset() + for i, batch in enumerate(val_data): + data, label = batch_fn(batch, ctx) + outputs = [net(X.astype(opt.dtype, 
copy=False)) for X in data] + acc_top1.update(label, outputs) + acc_top5.update(label, outputs) + + _, top1 = acc_top1.get() + _, top5 = acc_top5.get() + return (1-top1, 1-top5) + + def train(ctx): + if isinstance(ctx, mx.Context): + ctx = [ctx] + if opt.resume_params is '': + net.initialize(mx.init.MSRAPrelu(), ctx=ctx) + + if opt.no_wd: + for k, v in net.collect_params('.*beta|.*gamma|.*bias').items(): + v.wd_mult = 0.0 + + compression_params = { + "compressor": opt.compressor, + "ef": opt.ef, + "momentum": opt.compress_momentum, + "scaling": opt.onebit_scaling, + "k": opt.k + } + + trainer = bps.DistributedTrainer( + net.collect_params(), optimizer, optimizer_params, + compression_params=compression_params) + + if opt.resume_states is not '': + trainer.load_states(opt.resume_states) + + if opt.label_smoothing or opt.mixup: + sparse_label_loss = False + else: + sparse_label_loss = True + if distillation: + L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(temperature=opt.temperature, + hard_weight=opt.hard_weight, + sparse_label=sparse_label_loss) + else: + L = gluon.loss.SoftmaxCrossEntropyLoss( + sparse_label=sparse_label_loss) + + best_val_score = 1 + + # bps.byteps_declare_tensor("acc") + for epoch in range(opt.resume_epoch, opt.num_epochs): + tic = time.time() + if opt.use_rec: + train_data.reset() + train_metric.reset() + btic = time.time() + + for i, batch in enumerate(train_data): + data, label = batch_fn(batch, ctx) + + if opt.mixup: + lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha) + if epoch >= opt.num_epochs - opt.mixup_off_epoch: + lam = 1 + data = [lam*X + (1-lam)*X[::-1] for X in data] + + if opt.label_smoothing: + eta = 0.1 + else: + eta = 0.0 + label = mixup_transform(label, classes, lam, eta) + + elif opt.label_smoothing: + hard_label = label + label = smooth(label, classes) + + if distillation: + teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) + for X in data] + + with ag.record(): + outputs = [net(X.astype(opt.dtype, copy=False)) + for X in data] + if distillation: + loss = [L(yhat.astype('float32', copy=False), + y.astype('float32', copy=False), + p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)] + else: + loss = [L(yhat, y.astype(opt.dtype, copy=False)) + for yhat, y in zip(outputs, label)] + for l in loss: + l.backward() + trainer.step(batch_size) + + if opt.mixup: + output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) + for out in outputs] + train_metric.update(label, output_softmax) + else: + if opt.label_smoothing: + train_metric.update(hard_label, outputs) + else: + train_metric.update(label, outputs) + + if opt.log_interval and not (i+1) % opt.log_interval: + train_metric_name, train_metric_score = train_metric.get() + logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f\ttime=%f' % ( + epoch, i, batch_size*nworker * + opt.log_interval/(time.time()-btic), + train_metric_name, train_metric_score, trainer.learning_rate, time.time()-btic)) + btic = time.time() + + train_metric_name, train_metric_score = train_metric.get() + throughput = int(batch_size * nworker * i / (time.time() - tic)) + + logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' % + (epoch, throughput, time.time()-tic)) + + err_top1_val, err_top5_val = test(ctx, val_data) + + # acc = mx.nd.array([train_metric_score, err_top1_val, err_top5_val], + # ctx=ctx[0]) + # bps.byteps_push_pull(acc, name="acc", is_average=False) + # acc /= bps.size() + # train_metric_score, 
err_top1_val, err_top5_val = acc[0].asscalar( + # ), acc[1].asscalar(), acc[2].asscalar() + + # if bps.rank() == 0: + logger.info('[Epoch %d] training: %s=%f' % + (epoch, train_metric_name, train_metric_score)) + logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f' % + (epoch, err_top1_val, err_top5_val)) + + if err_top1_val < best_val_score: + best_val_score = err_top1_val + net.save_parameters('%s/%.4f-imagenet-%s-%d-best.params' % + (save_dir, best_val_score, model_name, epoch)) + trainer.save_states('%s/%.4f-imagenet-%s-%d-best.states' % + (save_dir, best_val_score, model_name, epoch)) + + if save_frequency and save_dir and (epoch + 1) % save_frequency == 0: + net.save_parameters('%s/imagenet-%s-%d.params' % + (save_dir, model_name, epoch)) + trainer.save_states('%s/imagenet-%s-%d.states' % + (save_dir, model_name, epoch)) + + if save_frequency and save_dir: + net.save_parameters('%s/imagenet-%s-%d.params' % + (save_dir, model_name, opt.num_epochs-1)) + trainer.save_states('%s/imagenet-%s-%d.states' % + (save_dir, model_name, opt.num_epochs-1)) + + if opt.mode == 'hybrid': + net.hybridize(static_alloc=True, static_shape=True) + if distillation: + teacher.hybridize(static_alloc=True, static_shape=True) + train(context) + + +if __name__ == '__main__': + main() diff --git a/example/mxnet/train_gluon_mnist_byteps_gc.py b/example/mxnet/train_gluon_mnist_byteps_gc.py new file mode 100644 index 000000000..d3361fcdd --- /dev/null +++ b/example/mxnet/train_gluon_mnist_byteps_gc.py @@ -0,0 +1,221 @@ +# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This file is modified from `horovod/examples/mxnet_mnist.py`, using gluon +style MNIST dataset and data_loader.""" +import argparse +import logging +import subprocess +import time + +import mxnet as mx +from mxnet import autograd, gluon, nd +from mxnet.gluon.data.vision import MNIST + +import byteps.mxnet as bps + +# Higher download speed for chinese users +# os.environ['MXNET_GLUON_REPO'] = +# 'https://apache-mxnet.s3.cn-north-1.amazonaws.com.cn/' + +# Training settings +parser = argparse.ArgumentParser(description='MXNet MNIST Example') + +parser.add_argument('--batch-size', type=int, default=64, + help='training batch size (default: 64)') +parser.add_argument('--dtype', type=str, default='float32', + help='training data type (default: float32)') +parser.add_argument('--epochs', type=int, default=5, + help='number of training epochs (default: 5)') +parser.add_argument('--j', type=int, default=2, + help='number of cpu processes for dataloader') +parser.add_argument('--lr', type=float, default=0.01, + help='learning rate (default: 0.01)') +parser.add_argument('--wd', type=float, default=0.0001, + help='weight decay rate. 
default is 0.0001.') +parser.add_argument('--momentum', type=float, default=0.9, + help='SGD momentum (default: 0.9)') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disable training on GPU (default: False)') +parser.add_argument('--compressor', type=str, default='', + help='which compressor') +parser.add_argument('--ef', type=str, default='', + help='which error feedback') +parser.add_argument('--compress-momentum', type=str, default='', + help='which compress momentum') +parser.add_argument('--scaling', action='store_true', default=False, + help='enable scaling for onebit compressor') +parser.add_argument('--k', type=int, default=1, + help='topk or randomk') +parser.add_argument('--fp16-pushpull', action='store_true', default=False, + help='use fp16 compression during pushpull') +parser.add_argument('--logging-file', type=str, default='baseline', + help='name of training log file') +args = parser.parse_args() + + +if not args.no_cuda: + # Disable CUDA if there are no GPUs. + if mx.context.num_gpus() == 0: + args.no_cuda = True + +# Initialize BytePS +bps.init() + +gpu_name = subprocess.check_output( + ['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) +gpu_name = gpu_name.decode('utf8').split('\n')[-2] +gpu_name = '-'.join(gpu_name.split()) +filename = "mnist-%d-%s-%s.log" % (bps.size(), gpu_name, args.logging_file) +filehandler = logging.FileHandler(filename) +streamhandler = logging.StreamHandler() + +logger = logging.getLogger('') +logger.setLevel(level=logging.INFO) +logger.addHandler(filehandler) +logger.addHandler(streamhandler) +logger.info(args) + + +def dummy_transform(data, label): + im = data.astype(args.dtype, copy=False) / 255 - 0.5 + im = nd.transpose(im, (2, 0, 1)) + return im, label + + +# Function to get mnist iterator +def get_mnist_iterator(): + train_set = MNIST(train=True, transform=dummy_transform) + train_iter = gluon.data.DataLoader( + train_set, args.batch_size, True, num_workers=args.j, + last_batch='discard') + val_set = MNIST(train=False, transform=dummy_transform) + val_iter = gluon.data.DataLoader( + val_set, args.batch_size, False, num_workers=args.j) + + return train_iter, val_iter, len(train_set) + + +# Function to define neural network +def conv_nets(): + net = gluon.nn.HybridSequential() + with net.name_scope(): + net.add(gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu')) + net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2)) + net.add(gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu')) + net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2)) + net.add(gluon.nn.Flatten()) + net.add(gluon.nn.Dense(512, activation="relu")) + net.add(gluon.nn.Dense(10)) + return net + + +# Function to evaluate accuracy for a model +def evaluate(model, data_iter, context): + metric = mx.metric.Accuracy() + for _, batch in enumerate(data_iter): + data = batch[0].as_in_context(context) + label = batch[1].as_in_context(context) + output = model(data) + metric.update([label], [output]) + + return metric.get() + + +# Load training and validation data +train_data, val_data, train_size = get_mnist_iterator() + +# BytePS: pin context to local rank +context = mx.cpu(bps.local_rank()) if args.no_cuda else mx.gpu( + bps.local_rank()) +num_workers = bps.size() + +# Build model +model = conv_nets() +model.cast(args.dtype) + +# Initialize parameters +model.initialize(mx.init.MSRAPrelu(), ctx=context) +# if bps.rank() == 0: +model.summary(nd.ones((1, 1, 28, 28), ctx=mx.gpu(bps.local_rank()))) +model.hybridize() + +params = 
model.collect_params() + +# BytePS: create DistributedTrainer, a subclass of gluon.Trainer +optimizer_params = {'momentum': args.momentum, 'wd': args.wd, + 'learning_rate': args.lr * num_workers} + +compression_params = { + "compressor": args.compressor, + "ef": args.ef, + "momentum": args.compress_momentum, + "scaling": args.scaling, + "k": args.k, + "fp16": args.fp16_pushpull +} + +trainer = bps.DistributedTrainer( + params, "sgd", optimizer_params, compression_params=compression_params) + +# Create loss function and train metric +loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() +metric = mx.metric.Accuracy() + +total_time = 0 +# Train model +bps.byteps_declare_tensor("acc") +for epoch in range(args.epochs): + tic = time.time() + metric.reset() + for i, batch in enumerate(train_data): + data = batch[0].as_in_context(context) + label = batch[1].as_in_context(context) + + with autograd.record(): + output = model(data) + loss = loss_fn(output, label) + + loss.backward() + trainer.step(args.batch_size) + metric.update([label], [output]) + + if i % 100 == 0: + name, acc = metric.get() + logger.info('[Epoch %d Batch %d] Training: %s=%f' % + (epoch, i, name, acc)) + + elapsed = time.time() - tic + total_time += elapsed + speed = train_size * num_workers / elapsed + logger.info('Epoch[%d]\tSpeed=%.2f samples/s\tTime cost=%f', + epoch, speed, elapsed) + + # Evaluate model accuracy + _, train_acc = metric.get() + name, val_acc = evaluate(model, val_data, context) + acc = mx.nd.array([train_acc, val_acc], ctx=context) + bps.byteps_push_pull(acc, name="acc", is_average=False) + acc /= bps.size() + train_acc, val_acc = acc[0].asscalar(), acc[1].asscalar() + if bps.rank() == 0: + logger.info('Epoch[%d]\tTrain: %s=%f\tValidation: %s=%f', epoch, name, + train_acc, name, val_acc) + + +if bps.rank() == 0 and epoch == args.epochs - 1: + assert val_acc > 0.96, "Achieved accuracy (%f) is lower than expected\ + (0.96)" % val_acc + +logger.info("total time=%.2f", total_time) diff --git a/launcher/launch.py b/launcher/launch.py index 33415ae04..c99ece38f 100644 --- a/launcher/launch.py +++ b/launcher/launch.py @@ -2,10 +2,12 @@ from __future__ import print_function import os +import re import subprocess import threading import sys import time +from functools import reduce class PropagatingThread(threading.Thread): @@ -36,6 +38,88 @@ def join(self): COMMON_REQUIRED_ENVS = ["DMLC_ROLE", "DMLC_NUM_WORKER", "DMLC_NUM_SERVER", "DMLC_PS_ROOT_URI", "DMLC_PS_ROOT_PORT"] WORKER_REQUIRED_ENVS = ["DMLC_WORKER_ID"] +NUMA_PATH = "/sys/devices/system/node" + + +def get_numa_info(): + ret = [] + if os.path.exists(NUMA_PATH): + items = os.listdir(NUMA_PATH) + nodes = list(filter(lambda str: str.startswith("node"), items)) + if nodes: + for node in nodes: + items = os.listdir(os.path.join(NUMA_PATH, node)) + cpus = [re.findall("cpu\d+", cpu) for cpu in items] + cpus = list(filter(lambda x: x, cpus)) + cpu_ids = [int(cpu[0].split('cpu')[1]) for cpu in cpus] + cpu_ids = sorted(cpu_ids) + ret.append(cpu_ids) + else: + print("NUMA PATH %s NOT FOUND" % NUMA_PATH) + return ret + + +def allocate_cpu(local_size): + def _get_allocation(nodes, quota): + if quota < 1: + raise ValueError("quota should be no less than 1") + ret = [] + for node in nodes: + if len(node) < quota: + continue + split_index = [] + for i in range(1, quota): + if node[i] != node[i-1] + 1: + split_index.append(i) + quota_bck = quota + last_idx = 0 + for idx in split_index: + ret.append(node[last_idx:idx]) + quota -= idx - last_idx + last_idx = idx + 
ret.append(node[last_idx:last_idx+quota])
+            for idx in sorted(range(quota_bck), reverse=True):
+                del node[idx]
+            return ret
+        return ret
+
+    def _get_quota(nodes, local_size):
+        if len(nodes) > 1:
+            cpu_nums = reduce(lambda x, y: x + y, [len(node) for node in nodes])
+        else:
+            cpu_nums = len(nodes[0])
+
+        # default quota is the number of cpus for non-root processes
+        default_quota = int(os.getenv("BYTEPS_NUMA_DEFAULT_QUOTA", 6))
+        while default_quota >= 1 and default_quota * local_size > cpu_nums:
+            default_quota -= 2
+
+        # root quota is the number of cpus for root processes
+        # root does more work, thus using more cpus
+        root_quota = cpu_nums - default_quota * (local_size - 1)
+        if int(os.getenv("BYTEPS_NUMA_ROOT_QUOTA", 0)):
+            root_quota = int(os.getenv("BYTEPS_NUMA_ROOT_QUOTA", 0))
+
+        node_size = len(nodes[0])
+        while root_quota >= 1 and root_quota > node_size:
+            root_quota -= 2
+        return [default_quota] * (local_size - 1) + [root_quota]
+
+    nodes = get_numa_info()
+    if not nodes:
+        return None
+    quota_list = _get_quota(nodes, local_size)
+    ret = []
+    for quota in quota_list:
+        while quota > 0:
+            allocation = _get_allocation(nodes, quota)
+            if allocation:
+                ret.append(allocation)
+                break
+            else:
+                quota -= 2
+
+    return ret
 
 
 def check_env():
@@ -55,7 +139,7 @@ def check_env():
             os._exit(0)
 
 
-def worker(local_rank, local_size, command):
+def worker(local_rank, local_size, command, allocation=None):
     my_env = os.environ.copy()
     my_env["BYTEPS_LOCAL_RANK"] = str(local_rank)
     my_env["BYTEPS_LOCAL_SIZE"] = str(local_size)
@@ -64,6 +148,20 @@ def worker(local_rank, local_size, command):
             command = "python " + command
         command = "gdb -ex 'run' -ex 'bt' -batch --args " + command
 
+    if allocation:
+        print("enable NUMA fine-tuning...")
+        retval = subprocess.call(
+            ["dpkg", "-s", "numactl"], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
+        if retval == 0:
+            numa = "numactl --physcpubind "
+            for cpu_set in allocation:
+                numa += "{}-{},".format(cpu_set[0], cpu_set[-1])
+            numa = numa.strip(',') + ' '
+            command = numa + command
+            print("Command: %s\n" % command)
+        else:
+            print("Warning: numactl not found. Try `sudo apt-get install numactl`.")
+
     if os.environ.get("BYTEPS_TRACE_ON", "") == "1":
         print("\n!!!Enable profiling for WORKER_ID: %s and local_rank: %d!!!" %
               (os.environ.get("DMLC_WORKER_ID"), local_rank))
@@ -89,10 +187,18 @@ def launch_bps():
     else:
         local_size = 1
     t = [None] * local_size
+
+    if os.environ.get("BYTEPS_NUMA_ON", "") == "1":
+        allocations = allocate_cpu(local_size)
+
     for i in range(local_size):
         command = ' '.join(sys.argv[1:])
-        t[i] = PropagatingThread(target=worker, args=[
-            i, local_size, command])
+        if os.environ.get("BYTEPS_NUMA_ON", "") == "1":
+            t[i] = PropagatingThread(target=worker, args=[
+                i, local_size, command, allocations[i]])
+        else:
+            t[i] = PropagatingThread(target=worker, args=[
+                i, local_size, command])
         t[i].daemon = True
         t[i].start()
diff --git a/setup.py b/setup.py
index 8a7f50b93..f57f29048 100644
--- a/setup.py
+++ b/setup.py
@@ -171,19 +171,14 @@ def get_mpi_flags():
 
 def get_cpp_flags(build_ext):
     last_err = None
-    default_flags = ['-std=c++11', '-fPIC', '-O2', '-Wall', '-fopenmp']
-    avx_flags = ['-mf16c', '-mavx']
+    default_flags = ['-std=c++11', '-fPIC', '-Ofast', '-Wall', '-fopenmp', '-march=native']
     flags_to_try = []
     if sys.platform == 'darwin':
         # Darwin most likely will have Clang, which has libc++.
- flags_to_try = [default_flags + ['-stdlib=libc++'] + avx_flags, - default_flags + avx_flags, - default_flags + ['-stdlib=libc++'], + flags_to_try = [default_flags + ['-stdlib=libc++'], default_flags] else: - flags_to_try = [default_flags + avx_flags, - default_flags + ['-stdlib=libc++'] + avx_flags, - default_flags, + flags_to_try = [default_flags , default_flags + ['-stdlib=libc++']] for cpp_flags in flags_to_try: try: @@ -254,7 +249,16 @@ def get_common_options(build_ext): 'byteps/common/ready_table.cc', 'byteps/common/shared_memory.cc', 'byteps/common/nccl_manager.cc', - 'byteps/common/cpu_reducer.cc'] + 'byteps/common/cpu_reducer.cc'] + [ + 'byteps/common/compressor/compressor_registry.cc', + 'byteps/common/compressor/error_feedback.cc', + 'byteps/common/compressor/momentum.cc', + 'byteps/common/compressor/impl/dithering.cc', + 'byteps/common/compressor/impl/onebit.cc', + 'byteps/common/compressor/impl/randomk.cc', + 'byteps/common/compressor/impl/topk.cc', + 'byteps/common/compressor/impl/vanilla_error_feedback.cc', + 'byteps/common/compressor/impl/nesterov_momentum.cc'] if "BYTEPS_USE_MPI" in os.environ and os.environ["BYTEPS_USE_MPI"] == "1": mpi_flags = get_mpi_flags() COMPILE_FLAGS = cpp_flags + \ @@ -263,6 +267,7 @@ def get_common_options(build_ext): else: COMPILE_FLAGS = cpp_flags LINK_FLAGS = link_flags + LIBRARY_DIRS = [] LIBRARIES = [] @@ -297,7 +302,15 @@ def build_server(build_ext, options): server_lib.include_dirs = options['INCLUDES'] server_lib.sources = ['byteps/server/server.cc', 'byteps/common/cpu_reducer.cc', - 'byteps/common/logging.cc'] + 'byteps/common/logging.cc', + 'byteps/common/common.cc'] + [ + 'byteps/common/compressor/compressor_registry.cc', + 'byteps/common/compressor/error_feedback.cc', + 'byteps/common/compressor/impl/dithering.cc', + 'byteps/common/compressor/impl/onebit.cc', + 'byteps/common/compressor/impl/randomk.cc', + 'byteps/common/compressor/impl/topk.cc', + 'byteps/common/compressor/impl/vanilla_error_feedback.cc'] server_lib.extra_compile_args = options['COMPILE_FLAGS'] + \ ['-DBYTEPS_BUILDING_SERVER'] server_lib.extra_link_args = options['LINK_FLAGS'] diff --git a/tests/meta_test.py b/tests/meta_test.py new file mode 100644 index 000000000..fbdbfb456 --- /dev/null +++ b/tests/meta_test.py @@ -0,0 +1,85 @@ +# Copyright 2020 Amazon Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import copy +import time +import os +import subprocess +import sys +import threading + +import byteps.mxnet as bps + + +class MetaTest(type): + BASE_ENV = {"DMLC_NUM_WORKER": "1", + "DMLC_NUM_SERVER": "1", + "DMLC_PS_ROOT_URI": "127.0.0.1", + "DMLC_PS_ROOT_PORT": "1234", + "BYTEPS_LOG_LEVEL": "INFO", + "BYTEPS_MIN_COMPRESS_BYTES": "0", + "BYTEPS_PARTITION_BYTES": "2147483647"} + for name, value in os.environ.items(): + if name not in BASE_ENV: + BASE_ENV[name] = value + SCHEDULER_ENV = copy.copy(BASE_ENV) + SCHEDULER_ENV.update(DMLC_ROLE="scheduler") + SERVER_ENV = copy.copy(BASE_ENV) + SERVER_ENV.update(DMLC_ROLE="server") + + def __new__(cls, name, bases, dict): + # decorate all test cases + for k, v in dict.items(): + if k.startswith("test_") and hasattr(v, "__call__"): + dict[k] = cls.launch_bps(v) + + for k, v in cls.BASE_ENV.items(): + os.environ[k] = v + os.environ["NVIDIA_VISIBLE_DEVICES"] = "0" + os.environ["DMLC_WORKER_ID"] = "0" + os.environ["DMLC_ROLE"] = "worker" + os.environ["BYTEPS_THREADPOOL_SIZE"] = "4" + os.environ["BYTEPS_FORCE_DISTRIBUTED"] = "1" + os.environ["BYTEPS_LOCAL_RANK"] = "0" + os.environ["BYTEPS_LOCAL_SIZE"] = "1" + return type(name, bases, dict) + + @classmethod + def launch_bps(cls, func): + def wrapper(*args, **kwargs): + def run(env): + subprocess.check_call(args=["bpslaunch"], shell=True, + stdout=sys.stdout, stderr=sys.stderr, + env=env) + + print("bps init") + scheduler = threading.Thread(target=run, + args=(cls.SCHEDULER_ENV,)) + server = threading.Thread(target=run, args=(cls.SERVER_ENV,)) + scheduler.daemon = True + server.daemon = True + scheduler.start() + server.start() + + bps.init() + func(*args, **kwargs) + bps.shutdown() + + scheduler.join() + server.join() + print("bps shutdown") + time.sleep(2) + + return wrapper diff --git a/tests/run_byteps_test.sh b/tests/run_byteps_test.sh index ad88b2ebe..a0d125ca6 100755 --- a/tests/run_byteps_test.sh +++ b/tests/run_byteps_test.sh @@ -1,13 +1,22 @@ #!/bin/bash -path="`dirname $0`" +path="$(dirname $0)" -export PATH=~/.local/bin:$PATH +export PATH=~/anaconda3/envs/mxnet_p36/bin:$PATH export DMLC_NUM_WORKER=1 export DMLC_NUM_SERVER=1 export DMLC_PS_ROOT_URI=127.0.0.1 export DMLC_PS_ROOT_PORT=1234 +function cleanup() { + rm -rf lr.s +} + +trap cleanup EXIT + +pkill bpslaunch +pkill python3 + echo "Launch scheduler" export DMLC_ROLE=scheduler bpslaunch & @@ -19,12 +28,11 @@ bpslaunch & export NVIDIA_VISIBLE_DEVICES=0 export DMLC_WORKER_ID=0 export DMLC_ROLE=worker +export BYTEPS_THREADPOOL_SIZE=4 export BYTEPS_FORCE_DISTRIBUTED=1 +export BYTEPS_LOG_LEVEL=WARNING -if [ "$TEST_TYPE" == "mxnet" ]; then - echo "TEST MXNET ..." - bpslaunch python $path/test_mxnet.py $@ -elif [ "$TEST_TYPE" == "keras" ]; then +if [ "$TEST_TYPE" == "keras" ]; then echo "TEST KERAS ..." python $path/test_tensorflow_keras.py $@ else diff --git a/tests/test_dithering.py b/tests/test_dithering.py new file mode 100644 index 000000000..f1fa159d0 --- /dev/null +++ b/tests/test_dithering.py @@ -0,0 +1,178 @@ +# Copyright 2020 Amazon Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import copy
+import itertools
+import unittest
+
+import byteps.mxnet as bps
+import mxnet as mx
+import mxnet.ndarray as nd
+import numpy as np
+from gluoncv.model_zoo import get_model
+from mxnet import autograd, gluon
+from numba import jit
+from parameterized import parameterized
+from tqdm import tqdm
+
+from meta_test import MetaTest
+from utils import bernoulli, fake_data
+
+
+@jit(nopython=True)
+def round_next_pow2(v):
+    v -= np.uint32(1)
+    v |= v >> np.uint32(1)
+    v |= v >> np.uint32(2)
+    v |= v >> np.uint32(4)
+    v |= v >> np.uint32(8)
+    v |= v >> np.uint32(16)
+    v += np.uint32(1)
+    return v
+
+
+def dithering(x, k, state, partition='linear', norm="max"):
+    y = x.flatten()
+    if norm == "max":
+        scale = np.max(np.abs(y))
+    elif norm == "l2":
+        scale = np.linalg.norm(y.astype(np.float64), ord=2)
+    else:
+        raise ValueError("Unsupported normalization")
+    y /= scale
+    sign = np.sign(y)
+    y = np.abs(y)
+
+    # stochastic rounding
+    if partition == 'linear':
+        y *= k
+        low = np.floor(y)
+        p = y - low  # probability of rounding up
+        y = low + bernoulli(p, state)
+        y /= k
+    elif partition == "natural":
+        y *= 2**(k-1)
+        low = round_next_pow2((np.ceil(y).astype(np.uint32))) >> 1
+        length = copy.deepcopy(low)
+        length[length == 0] = 1
+        p = (y - low) / length
+        y = low + length * bernoulli(p, state)
+        y = y.astype(np.float32)
+        y /= 2**(k-1)
+    else:
+        raise ValueError("Unsupported partition")
+
+    y *= sign
+    y *= scale
+    return y.reshape(x.shape)
+
+
+class DitheringTestCase(unittest.TestCase, metaclass=MetaTest):
+    @parameterized.expand(itertools.product([2, 4, 8], ["linear", "natural"], ["max", "l2"], np.random.randint(0, 2020, size=3).tolist()))
+    def test_dithering(self, k, ptype, ntype, seed):
+        ctx = mx.gpu(0)
+        net = get_model("resnet18_v2")
+        net.initialize(mx.init.Xavier(), ctx=ctx)
+        net.summary(nd.ones((1, 3, 224, 224), ctx=ctx))
+
+        # hyper-params
+        batch_size = 32
+        optimizer_params = {'momentum': 0, 'wd': 0,
+                            'learning_rate': 0.01}
+
+        compression_params = {
+            "compressor": "dithering",
+            "k": k,
+            "partition": ptype,
+            "normalize": ntype,
+            "seed": seed
+        }
+        print(compression_params)
+
+        trainer = bps.DistributedTrainer(net.collect_params(
+        ), "sgd", optimizer_params, compression_params=compression_params)
+
+        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
+
+        train_data = fake_data(batch_size=batch_size)
+
+        params = {}
+        rngs = {}
+        rngs_s = {}
+
+        for i, param in enumerate(trainer._params):
+            if param.grad_req != 'null':
+                params[i] = param._data[0].asnumpy()
+                rngs[i] = np.array([seed, seed], dtype=np.uint64)
+                rngs_s[i] = np.array([seed, seed], dtype=np.uint64)
+
+        for it, batch in tqdm(enumerate(train_data)):
+            data = batch[0].as_in_context(ctx)
+            label = batch[1].as_in_context(ctx)
+
+            with autograd.record():
+                output = net(data)
+                loss = loss_fn(output, label)
+
+            loss.backward()
+
+            gs = {}
+            xs = {}
+
+            for i, param in enumerate(trainer._params):
+                if param.grad_req != 'null':
+                    gs[i] = param._grad[0].asnumpy()
+                    xs[i] = param._data[0].asnumpy()
+
+            trainer.step(batch_size)
+
+            for i,
param in enumerate(trainer._params): + if param.grad_req != "null": + g = gs[i] / (batch_size * bps.size()) + c = dithering(g, k, rngs[i], ptype, ntype) + + cs = dithering(c, k, rngs_s[i], ptype, ntype) + c = cs + + params[i] -= optimizer_params["learning_rate"] * c + + np_g = c.flatten() + mx_g = param._grad[0].asnumpy().flatten() + if not np.allclose(np_g, mx_g, atol=np.finfo(np.float32).eps): + diff = np.abs(np_g - mx_g) + print("np", np_g) + print("mx", mx_g) + print("diff", diff) + print("max diff", np.max(diff)) + idx = np.nonzero(diff > 1e-5) + print("idx", idx, np_g[idx], mx_g[idx]) + input() + + cnt = 0 + tot = 0 + for i, param in enumerate(trainer._params): + if param.grad_req != "null": + x = param._data[0].asnumpy() + tot += len(x.flatten()) + if not np.allclose(params[i], x, atol=np.finfo(np.float32).eps): + diff = np.abs(x.flatten() - params[i].flatten()) + idx = np.where(diff > np.finfo(np.float32).eps) + cnt += len(idx[0]) + + assert cnt == 0, "false/tot=%d/%d=%f" % (cnt, tot, cnt/tot) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_mxnet.py b/tests/test_mxnet.py index 9bb2e5db6..7f192915b 100644 --- a/tests/test_mxnet.py +++ b/tests/test_mxnet.py @@ -1,3 +1,5 @@ +# Copyright 2020 Amazon Technologies, Inc. All Rights Reserved. +# Copyright 2019 ByteDance Technologies, Inc. All Rights Reserved. # Copyright 2018 Uber Technologies, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,45 +15,29 @@ # limitations under the License. # ============================================================================== -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +import itertools +import unittest import byteps.mxnet as bps -import itertools import mxnet as mx -import os import numpy as np -import unittest -from mxnet.base import MXNetError -from mxnet.test_utils import same + +from meta_test import MetaTest has_gpu = mx.context.num_gpus() > 0 -# MLSL supports only byte, float and double data types -mlsl_supported_types = set(['float32', 'float64']) -class MXTest: +class MXTest(unittest.TestCase, metaclass=MetaTest): """ Tests for ops in byteps.mxnet. 
""" - def _current_context(self): if has_gpu: return mx.gpu(bps.local_rank()) else: return mx.current_context() - - def filter_supported_types(self, types): - if 'MLSL_ROOT' in os.environ: - types = [t for t in types if t in mlsl_supported_types] - return types - + def test_byteps_trainer_param_order(self): - size = bps.size() - dtypes = self.filter_supported_types(['float32']) - dims = [1] - ctx = self._current_context() net = mx.gluon.nn.Sequential() # layers may be added in a random order for all workers layers = {'ones_': 1, 'zeros_': 0} @@ -65,19 +51,19 @@ def test_byteps_trainer_param_order(self): # check the result of bps_broadcast for name, init in layers.items(): weight = params[name + 'weight'].data()[0].asnumpy() - expected = np.full(shape=weight.shape, fill_value=init, dtype=weight.dtype) + expected = np.full(shape=weight.shape, + fill_value=init, dtype=weight.dtype) assert np.array_equal(weight, expected), (weight, expected) print('test_byteps_trainer_param_order passed') def test_byteps_push_pull(self): """Test that the byteps_push_pull correctly sums 1D, 2D, 3D tensors.""" - size = bps.size() - dtypes = self.filter_supported_types(['float32']) - dims = [1] + dtypes = ['float16', 'float32', 'float64'] + dims = [1, 2, 3] + count = 0 ctx = self._current_context() - count = 100 - shapes = [(), (17)] + shapes = [(), (17), (17, 17), (17, 17, 17)] for dtype, dim in itertools.product(dtypes, dims): # MXNet uses gpu_id as part of the seed, so to get identical seeds # we must set a context. @@ -85,24 +71,24 @@ def test_byteps_push_pull(self): tensor = mx.nd.random.uniform(-100, 100, shape=shapes[dim], ctx=ctx) tensor = tensor.astype(dtype) + input = tensor.asnumpy() - print("tensor before push_pull:", tensor) bps.byteps_declare_tensor("tensor_" + str(count)) bps.byteps_push_pull(tensor, name="tensor_"+str(count)) tensor.wait_to_read() - print("tensor after push_pull:", tensor) + output = tensor.asnumpy() + assert np.allclose(input, output) + count += 1 print('test_byteps_push_pull passed') - def test_byteps_push_pull_inplace(self): """Test that the byteps_push_pull correctly sums 1D, 2D, 3D tensors.""" size = bps.size() - dtypes = self.filter_supported_types(['int32', 'int64', - 'float32', 'float64']) + dtypes = ['float16', 'float32', 'float64'] dims = [1, 2, 3] + count = 0 ctx = self._current_context() - count = 200 shapes = [(), (17), (17, 17), (17, 17, 17)] for dtype, dim in itertools.product(dtypes, dims): mx.random.seed(1234, ctx=ctx) @@ -111,7 +97,7 @@ def test_byteps_push_pull_inplace(self): tensor = tensor.astype(dtype) multiplied = tensor.copy() bps.byteps_declare_tensor("tensor_" + str(count)) - bps.byteps_push_pull(tensor, name= "tensor_" + str(count)) + bps.byteps_push_pull(tensor, name="tensor_" + str(count)) max_difference = mx.nd.max(mx.nd.subtract(tensor, multiplied)) count += 1 @@ -136,54 +122,5 @@ def test_byteps_push_pull_inplace(self): print('test_byteps_push_pull_inplace passed') - def test_byteps_broadcast(self): - """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors.""" - rank = bps.rank() - size = bps.size() - - # This test does not apply if there is only one worker. 
- if size == 1: - return - - dtypes = ['int32', 'int64', - 'float32', 'float64'] - dims = [1, 2, 3] - ctx = self._current_context() - count = 300 - shapes = [(), (17), (17, 17), (17, 17, 17)] - root_ranks = list(range(size)) - for dtype, dim, root_rank in itertools.product(dtypes, dims, - root_ranks): - tensor = mx.nd.ones(shapes[dim], ctx=ctx) * rank - root_tensor = mx.nd.ones(shapes[dim], ctx=ctx) * root_rank - tensor = tensor.astype(dtype) - root_tensor = root_tensor.astype(dtype) - - broadcast_tensor = bps.broadcast(tensor, root_rank=root_rank, - name=str(count)) - if rank != root_rank: - if same(tensor.asnumpy(), root_tensor.asnumpy()): - print("broadcast", count, dtype, dim, - mx.nd.max(tensor == root_tensor)) - print("tensor", bps.rank(), tensor) - print("root_tensor", bps.rank(), root_tensor) - print("comparison", bps.rank(), tensor == root_tensor) - assert not same(tensor.asnumpy(), root_tensor.asnumpy()), \ - 'bps.broadcast modifies source tensor' - if not same(broadcast_tensor.asnumpy(), root_tensor.asnumpy()): - print("broadcast", count, dtype, dim) - print("broadcast_tensor", bps.rank(), broadcast_tensor) - print("root_tensor", bps.rank(), root_tensor) - print("comparison", bps.rank(), - broadcast_tensor == root_tensor) - assert same(broadcast_tensor.asnumpy(), root_tensor.asnumpy()), \ - 'bps.broadcast produces incorrect broadcasted tensor' - - if __name__ == '__main__': - mxtest = MXTest() - bps.init() - mxtest.test_byteps_push_pull() - mxtest.test_byteps_trainer_param_order() - #mxtest.test_byteps_broadcast() - mxtest.test_byteps_push_pull_inplace() + unittest.main() diff --git a/tests/test_onebit.py b/tests/test_onebit.py new file mode 100644 index 000000000..426e803f2 --- /dev/null +++ b/tests/test_onebit.py @@ -0,0 +1,120 @@ +# Copyright 2020 Amazon Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import itertools +import unittest + +import byteps.mxnet as bps +import mxnet as mx +import mxnet.ndarray as nd +import numpy as np +from gluoncv.model_zoo import get_model +from mxnet import autograd, gluon +from parameterized import parameterized +from tqdm import tqdm + +from meta_test import MetaTest +from utils import fake_data + + +def onebit(x, scaling): + if scaling: + l1 = np.linalg.norm(x.flatten(), 1) + sign = x < 0 + sign = -((sign << 1) - 1) + if scaling: + return l1 / len(x.flatten()) * sign + else: + return sign + + +class OnebitTestCase(unittest.TestCase, metaclass=MetaTest): + @parameterized.expand(itertools.product([True, False])) + def test_onebit(self, scaling): + bps.init() + ctx = mx.gpu(0) + net = get_model("resnet18_v2") + net.initialize(mx.init.Xavier(), ctx=ctx) + net.summary(nd.ones((1, 3, 224, 224), ctx=ctx)) + + # hyper-params + batch_size = 32 + optimizer_params = {'momentum': 0, 'wd': 0, + 'learning_rate': 0.01} + + compression_params = { + "compressor": "onebit", + "scaling": scaling, + } + + trainer = bps.DistributedTrainer(net.collect_params( + ), "sgd", optimizer_params, compression_params=compression_params) + + loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() + + train_data = fake_data(batch_size=batch_size) + + params = {} + + for i, param in enumerate(trainer._params): + if param.grad_req != 'null': + params[i] = param._data[0].asnumpy() + + for it, batch in tqdm(enumerate(train_data)): + data = batch[0].as_in_context(ctx) + label = batch[1].as_in_context(ctx) + + with autograd.record(): + output = net(data) + loss = loss_fn(output, label) + + loss.backward() + + gs = {} + xs = {} + + for i, param in enumerate(trainer._params): + if param.grad_req != 'null': + gs[i] = param._grad[0].asnumpy() + xs[i] = param._data[0].asnumpy() + + trainer.step(batch_size) + + for i, param in enumerate(trainer._params): + if param.grad_req != "null": + g = gs[i] / (batch_size * bps.size()) + c = onebit(g, scaling) + + cs = onebit(c, scaling) + c = cs + + params[i] -= optimizer_params["learning_rate"] * c + + cnt = 0 + tot = 0 + for i, param in enumerate(trainer._params): + if param.grad_req != "null": + x = param._data[0].asnumpy() + tot += len(x.flatten()) + if not np.allclose(params[i], x, atol=np.finfo(np.float32).eps): + diff = np.abs(x.flatten() - params[i].flatten()) + idx = np.where(diff > np.finfo(np.float32).eps) + cnt += len(idx[0]) + + assert cnt == 0, "false/tot=%d/%d=%f" % (cnt, tot, cnt/tot) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_randomk.py b/tests/test_randomk.py new file mode 100644 index 000000000..9c481e7d8 --- /dev/null +++ b/tests/test_randomk.py @@ -0,0 +1,128 @@ +# Copyright 2020 Amazon Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import itertools +import unittest + +import byteps.mxnet as bps +import mxnet as mx +import mxnet.ndarray as nd +import numpy as np +from gluoncv.model_zoo import get_model +from mxnet import autograd, gluon +from numba import jit +from parameterized import parameterized +from tqdm import tqdm + +from meta_test import MetaTest +from utils import fake_data, randint + + +@jit(nopython=True) +def randomk(x, k, state): + y = x.flatten() + low = np.uint64(0) + high = np.uint64(len(y)) + indices = np.array([randint(low, high, state) + for _ in range(k)], dtype=np.uint64) + vals = y[indices] + y.fill(0) + for idx, val in zip(indices, vals): + y[idx] = val + return y.reshape(x.shape) + + +class RandomkTestCase(unittest.TestCase, metaclass=MetaTest): + @parameterized.expand(itertools.product([1, 3, 5], np.random.randint(0, 2020, size=3).tolist())) + def test_randomk(self, k, seed): + ctx = mx.gpu(0) + net = get_model("resnet18_v2") + net.initialize(mx.init.Xavier(), ctx=ctx) + net.summary(nd.ones((1, 3, 224, 224), ctx=ctx)) + + # hyper-params + batch_size = 32 + optimizer_params = {'momentum': 0, 'wd': 0, + 'learning_rate': 0.01} + + compression_params = { + "compressor": "randomk", + "k": k, + "seed": seed + } + + trainer = bps.DistributedTrainer(net.collect_params( + ), "sgd", optimizer_params, compression_params=compression_params) + + loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() + + train_data = fake_data(batch_size=batch_size) + + params = {} + rngs = {} + rngs_s = {} + + for i, param in enumerate(trainer._params): + if param.grad_req != 'null': + params[i] = param._data[0].asnumpy() + rngs[i] = np.array([seed, seed], dtype=np.uint64) + rngs_s[i] = np.array([seed, seed], dtype=np.uint64) + + for it, batch in tqdm(enumerate(train_data)): + data = batch[0].as_in_context(ctx) + label = batch[1].as_in_context(ctx) + + with autograd.record(): + output = net(data) + loss = loss_fn(output, label) + + loss.backward() + + gs = {} + xs = {} + + for i, param in enumerate(trainer._params): + if param.grad_req != 'null': + gs[i] = param._grad[0].asnumpy() + xs[i] = param._data[0].asnumpy() + + trainer.step(batch_size) + + for i, param in enumerate(trainer._params): + if param.grad_req != "null": + g = gs[i] / (batch_size * bps.size()) + c = randomk(g, k, rngs[i]) + + cs = randomk(c, k, rngs_s[i]) + c = cs + + params[i] -= optimizer_params["learning_rate"] * c + + cnt = 0 + tot = 0 + for i, param in enumerate(trainer._params): + if param.grad_req != "null": + x = param._data[0].asnumpy() + tot += len(x.flatten()) + if not np.allclose(params[i], x, atol=np.finfo(np.float32).eps): + diff = np.abs(x.flatten() - params[i].flatten()) + idx = np.where(diff > np.finfo(np.float32).eps) + cnt += len(idx[0]) + + assert cnt == 0, "false/tot=%d/%d=%f" % (cnt, tot, cnt/tot) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_topk.py b/tests/test_topk.py new file mode 100644 index 000000000..2a5409b89 --- /dev/null +++ b/tests/test_topk.py @@ -0,0 +1,119 @@ +# Copyright 2020 Amazon Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import itertools +import random +import unittest + +import byteps.mxnet as bps +import mxnet as mx +import mxnet.ndarray as nd +import numpy as np +from gluoncv.model_zoo import get_model +from mxnet import autograd, gluon +from parameterized import parameterized +from tqdm import tqdm + +from meta_test import MetaTest +from utils import fake_data + + +def topk(x, k): + y = x.flatten() + indices = np.argsort(np.abs(y))[-k:][::-1] + vals = y[indices] + y.fill(0) + for idx, val in zip(indices, vals): + y[idx] = val + return y.reshape(x.shape) + + +class TopkTestCase(unittest.TestCase, metaclass=MetaTest): + @parameterized.expand(itertools.product([1, 3, 5])) + def test_topk(self, k): + ctx = mx.gpu(0) + net = get_model("resnet18_v2") + net.initialize(mx.init.Xavier(), ctx=ctx) + net.summary(nd.ones((1, 3, 224, 224), ctx=ctx)) + + # hyper-params + batch_size = 32 + optimizer_params = {'momentum': 0, 'wd': 0, + 'learning_rate': 0.01} + + compression_params = { + "compressor": "topk", + "k": k, + } + + trainer = bps.DistributedTrainer(net.collect_params( + ), "sgd", optimizer_params, compression_params=compression_params) + + loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() + + train_data = fake_data(batch_size=batch_size) + + params = {} + + for i, param in enumerate(trainer._params): + if param.grad_req != 'null': + params[i] = param._data[0].asnumpy() + + for it, batch in tqdm(enumerate(train_data)): + data = batch[0].as_in_context(ctx) + label = batch[1].as_in_context(ctx) + + with autograd.record(): + output = net(data) + loss = loss_fn(output, label) + + loss.backward() + + gs = {} + xs = {} + + for i, param in enumerate(trainer._params): + if param.grad_req != 'null': + gs[i] = param._grad[0].asnumpy() + xs[i] = param._data[0].asnumpy() + + trainer.step(batch_size) + + for i, param in enumerate(trainer._params): + if param.grad_req != "null": + g = gs[i] / (batch_size * bps.size()) + c = topk(g, k) + + cs = topk(c, k) + c = cs + + params[i] -= optimizer_params["learning_rate"] * c + + cnt = 0 + tot = 0 + for i, param in enumerate(trainer._params): + if param.grad_req != "null": + x = param._data[0].asnumpy() + tot += len(x.flatten()) + if not np.allclose(params[i], x, atol=np.finfo(np.float32).eps): + diff = np.abs(x.flatten() - params[i].flatten()) + idx = np.where(diff > np.finfo(np.float32).eps) + cnt += len(idx[0]) + + assert cnt == 0, "false/tot=%d/%d=%f" % (cnt, tot, cnt/tot) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 000000000..b3ee93fcf --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,52 @@ +import mxnet as mx +import mxnet.ndarray as nd +import numpy as np +from numba import jit + + +def fake_data(dtype="float32", batch_size=32, height=224, width=224, depth=3, num_classes=1000): + image_list = [] + label_list = [] + for _ in range(8): + image = mx.ndarray.random.normal(-1, 1, + shape=[1, depth, height, width], + dtype=dtype) + label = mx.ndarray.random.randint(0, num_classes, [1, 1]) + + images = 
mx.ndarray.repeat(image, 128, axis=0) + labels = mx.ndarray.repeat(label, 128, axis=0) + # print(labels) + image_list.append(images) + label_list.append(labels) + + images = nd.concat(*image_list, dim=0) + labels = nd.concat(*label_list, dim=0) + # print(labels) + fake_dataset = mx.gluon.data.ArrayDataset(images, labels) + + return mx.gluon.data.DataLoader(fake_dataset, batch_size=batch_size, num_workers=4, + shuffle=True, last_batch='discard') + + +@jit(nopython=True) +def xorshift128p(state): + t = state[0] + s = state[1] + state[0] = s + t ^= t << np.uint64(23) + t ^= t >> np.uint64(17) + t ^= s ^ (s >> np.uint64(26)) + state[1] = t + return int(t + s) + + +@jit(nopython=True) +def bernoulli(p, state): + t = p * np.iinfo(np.uint64).max + r = np.array([xorshift128p(state) for _ in range(len(p))], dtype=np.float32) + return r < t + + +@jit(nopython=True) +def randint(low, high, state): + return xorshift128p(state) % (high - low) + low
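Note on the test utilities above: the xorshift128p-based helpers in tests/utils.py are meant to let the NumPy reference implementations follow the same random stream as the seeded compressors, which is why the tests compare results with an exact-tolerance allclose. A minimal, self-contained sketch of that usage, mirroring the linear-partition branch of the dithering reference in tests/test_dithering.py; the function name, gradient values, and seed below are illustrative and not part of the patch.

import numpy as np

from utils import bernoulli  # xorshift128p-based helper from tests/utils.py


def linear_dithering(x, k, state):
    """Unbiased (stochastic) rounding of |x| onto k evenly spaced levels."""
    y = x.flatten()
    scale = np.max(np.abs(y))            # max-norm scaling, as in the test reference
    sign = np.sign(y)
    y = np.abs(y) / scale * k
    low = np.floor(y)
    y = low + bernoulli(y - low, state)  # round up with probability (y - low)
    return (sign * y / k * scale).reshape(x.shape)


if __name__ == "__main__":
    seed = 2020                                      # illustrative seed
    state = np.array([seed, seed], dtype=np.uint64)  # RNG state layout used by the tests
    grad = np.random.randn(4, 4).astype(np.float32)
    print(linear_dithering(grad, k=4, state=state))

Because the state array is advanced in place by bernoulli, re-running the sketch with a freshly seeded state reproduces the same quantized output, which is the property the parameterized tests rely on.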