bytedance · pleasantrabbit · Aug 13, 2020 · Feb 19, 2020 · Feb 19, 2020 · Feb 19, 2020
diff --git a/.gitignore b/.gitignore
@@ -116,3 +116,4 @@ venv.bak/
 
 # for development
 scripts/
+exps/
diff --git a/3rdparty/ps-lite b/3rdparty/ps-lite
diff --git a/byteps/common/common.cc b/byteps/common/common.cc
@@ -100,6 +100,7 @@ int GetCommandType(RequestType requestType, int d) {
   return (((m + d) * (m + d + 1)) / 2) + d;
 }
 
+#ifndef BYTEPS_BUILDING_SERVER
 ncclDataType_t getNcclDataType(DataType dtype) {
   switch (dtype) {
     case BYTEPS_FLOAT32:
@@ -121,6 +122,7 @@ ncclDataType_t getNcclDataType(DataType dtype) {
   }
   return ncclFloat32;
 }
+#endif
 
 int getDataTypeLength(int dtype) {
   switch (dtype) {

diff --git a/byteps/common/common.h b/byteps/common/common.h
@@ -31,16 +31,23 @@
 #include <vector>
 
 // Add for profiling communication events
-#include <fstream>
 #include <stdio.h>
 #include <stdlib.h>
-#include <iostream>
-#include <thread>
+
 #include <chrono>
+#include <fstream>
+#include <iostream>
 #include <queue>
+#include <thread>
 
 namespace byteps {
 namespace common {
+namespace compressor {
+struct BPSTensor;
+typedef BPSTensor tensor_t;
+class Compressor;
+class ErrorFeedback;
+}  // namespace compressor
 
 // Device ID used for CPU.
 #define CPU_DEVICE_ID (-1)
@@ -83,8 +90,10 @@ enum QueueType {
   COPYD2H,
   PCIE_REDUCE,
   COORDINATE_PUSH,
+  COMPRESS,
   PUSH,
   PULL,
+  DECOMPRESS,
   COPYH2D,
   COORDINATE_BROADCAST,
   BROADCAST,
@@ -94,10 +103,18 @@ enum QueueType {
 const int QueueNum =
     (int)QUEUE_NUM_AND_NOT_A_REAL_QUEUE_TYPE_AND_MUST_BE_THE_LAST;
 
-const std::vector<std::string> LogStrings = {
-    "COORDINATE_REDUCE",    "REDUCE",   "COPYD2H", "PCIE_REDUCE",
-    "COORDINATE_PUSH",      "PUSH",     "PULL",    "COPYH2D",
-    "COORDINATE_BROADCAST", "BROADCAST"};
+const std::vector<std::string> LogStrings = {"COORDINATE_REDUCE",
+                                             "REDUCE",
+                                             "COPYD2H",
+                                             "PCIE_REDUCE",
+                                             "COORDINATE_PUSH",
+                                             "COMPRESS",
+                                             "PUSH",
+                                             "PULL",
+                                             "DECOMPRESS",
+                                             "COPYH2D",
+                                             "COORDINATE_BROADCAST",
+                                             "BROADCAST"};
 
 class Status {
  public:
@@ -173,11 +190,17 @@ typedef struct BytePSContext {
   std::vector<void*> pcie_cpubuff;
   size_t buff_len;
   // Used for profiling communication events
-  std::queue<BPSCommTime *> comm_time;
+  std::queue<BPSCommTime*> comm_time;
   bool profile_flag = false;
   int step_cnt = 0;
   int local_rank = 0;
-  std::unordered_map<uint64_t, std::unordered_map<int, std::queue<BPSCommTime *>>> part_comm_time;
+  std::unordered_map<uint64_t,
+                     std::unordered_map<int, std::queue<BPSCommTime*>>>
+      part_comm_time;
+  // Compressor list
+  std::vector<std::shared_ptr<compressor::Compressor>> compressor_list;
+  // kwargs
+  std::unordered_map<std::string, std::string> kwargs;
 } BPSContext;
 
 class Tensor {
@@ -233,6 +256,10 @@ struct TensorTableEntry {
   std::shared_ptr<std::atomic_int> counter_ptr;
   // How many partitions
   unsigned int total_partnum = 0;
+  // Compressor
+  std::shared_ptr<compressor::Compressor> compressor;
+  // Compressed
+  std::shared_ptr<compressor::tensor_t> compressed;
 };
 using TensorTable = std::unordered_map<std::string, TensorTableEntry>;
 
@@ -250,6 +277,11 @@ ncclDataType_t getNcclDataType(DataType dtype);
 
 int getDataTypeLength(int dtype);
 
+// Rounds `size` up to the next multiple of
+// (getDataTypeLength(dtype) * getDataTypeLength(dtype)) * 8.
+// A size that is already a multiple is returned unchanged.
+inline size_t Align(size_t size, int dtype) {
+  const auto elem_len = getDataTypeLength(dtype);
+  const size_t granularity = (elem_len * elem_len) * 8;
+  const size_t remainder = size % granularity;
+  if (remainder == 0) {
+    return size;
+  }
+  return size + (granularity - remainder);
+}
 }  // namespace common
 }  // namespace byteps
 

diff --git a/byteps/common/compressor/common.h b/byteps/common/compressor/common.h
@@ -0,0 +1,84 @@
+// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef BYTEPS_COMPRESSOR_COMMON_H
+#define BYTEPS_COMPRESSOR_COMMON_H
+
+#include <unordered_map>
+
+namespace byteps {
+namespace common {
+namespace compressor {
+typedef char byte_t;
+/*!
+ * \brief Tensor type
+ *
+ * Plain descriptor made of a byte pointer, a size and a dtype code. The struct
+ * only stores the pointer; it defines no destructor and performs no
+ * allocation of its own.
+ */
+typedef struct BPSTensor {
+  byte_t* data = nullptr;
+  size_t size = 0;
+  int dtype = 0;
+
+  /*! \brief empty tensor: null data, zero size, dtype 0 */
+  BPSTensor() = default;
+
+  /*! \brief wrap an existing buffer; size and dtype default to 0 */
+  BPSTensor(void* data, size_t size = 0, int dtype = 0)
+      : data(static_cast<byte_t*>(data)), size(size), dtype(dtype) {}
+} tensor_t;
+
+using kwargs_t = std::unordered_map<std::string, std::string>;
+
+/*!
+ * \brief Dispatch a typed compress kernel on `dtype` and return its result
+ *
+ * Must be expanded inside a function body: each supported case `return`s the
+ * value of `func`. `dst` and `src` are reinterpreted to the element types of
+ * the selected case, and `size` is divided by the element size before being
+ * passed to `func`. Any other dtype trips BPS_CHECK.
+ *
+ * \note Macro parameters used inside expressions are parenthesized so that
+ * compound arguments (e.g. `a + b` for `size`) keep their intended meaning.
+ */
+#define COMPRESS_IMPL_SWITCH(dtype, func, dst, src, size)                     \
+  switch (dtype) {                                                            \
+    case BYTEPS_FLOAT32:                                                      \
+      return func(reinterpret_cast<uint32_t*>(dst),                           \
+                  reinterpret_cast<const float*>(src),                        \
+                  (size) / sizeof(float));                                    \
+    case BYTEPS_FLOAT64:                                                      \
+      return func(reinterpret_cast<uint64_t*>(dst),                           \
+                  reinterpret_cast<const double*>(src),                       \
+                  (size) / sizeof(double));                                   \
+    default:                                                                  \
+      BPS_CHECK(0) << "Unsupported data type:" << (dtype);                    \
+  }
+
+/*!
+ * \brief Dispatch a typed decompress kernel on `dtype` and return its result
+ *
+ * Must be expanded inside a function body: each supported case `return`s the
+ * value of `func`. `dst` and `src` are reinterpreted to the element types of
+ * the selected case and `compressed_size` is forwarded unchanged. Any other
+ * dtype trips BPS_CHECK.
+ */
+#define DECOMPRESS_IMPL_SWITCH(dtype, func, dst, src, compressed_size)      \
+  switch (dtype) {                                                          \
+    case BYTEPS_FLOAT32:                                                    \
+      return func(reinterpret_cast<float*>(dst),                            \
+                  reinterpret_cast<const uint32_t*>(src), compressed_size); \
+    case BYTEPS_FLOAT64:                                                    \
+      return func(reinterpret_cast<double*>(dst),                           \
+                  reinterpret_cast<const uint64_t*>(src), compressed_size); \
+    default:                                                                \
+      BPS_CHECK(0) << "Unsupported data type:" << (dtype);                  \
+  }
+
+/*!
+ * \brief Dispatch a typed fast-update-error kernel on `dtype` and return its
+ * result
+ *
+ * Must be expanded inside a function body: each supported case `return`s the
+ * value of `func`. `dst`, `src1` and `src2` are reinterpreted to the element
+ * types of the selected case and `compressed_size` is forwarded unchanged.
+ * Any other dtype trips BPS_CHECK.
+ */
+#define SWITCH_TO_FAST_UPDATE_ERROR_IMPL_SWITCH(dtype, func, dst, src1, src2, \
+                                                compressed_size)              \
+  switch (dtype) {                                                            \
+    case BYTEPS_FLOAT32:                                                      \
+      return func(reinterpret_cast<float*>(dst),                              \
+                  reinterpret_cast<float*>(src1),                             \
+                  reinterpret_cast<const uint32_t*>(src2), compressed_size);  \
+    case BYTEPS_FLOAT64:                                                      \
+      return func(reinterpret_cast<double*>(dst),                             \
+                  reinterpret_cast<double*>(src1),                            \
+                  reinterpret_cast<const uint64_t*>(src2), compressed_size);  \
+    default:                                                                  \
+      BPS_CHECK(0) << "Unsupported data type:" << (dtype);                    \
+  }
+
+}  // namespace compressor
+}  // namespace common
+}  // namespace byteps
+
+#endif  // BYTEPS_COMPRESSOR_COMMON_H
diff --git a/byteps/common/compressor/compressor.h b/byteps/common/compressor/compressor.h
@@ -0,0 +1,137 @@
+// Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef BYTEPS_COMPRESSOR_COMPRESSOR_H
+#define BYTEPS_COMPRESSOR_COMPRESSOR_H
+
+#include <memory>
+
+#include "../cpu_reducer.h"
+#include "common.h"
+
+namespace byteps {
+namespace common {
+namespace compressor {
+/*!
+ * \brief Compressor interface
+ * Compressor defines two universal API - Compress & Decompress
+ *
+ * \par
+ * The caller does not need to allocate additional memory to store compressed data
+ * because there is an internal buffer to store the compressed data and the
+ * pointer will be returned to the caller. Then the caller can send the returned
+ * compressed data as normal.
+ *
+ * \par
+ * There are two optional features of the compressor - error-feedback &
+ * momentum. These two features can be added to any common compressors like 1bit
+ * and topk. To be generic, these two features are also compressors, exposing
+ * the same API as Compressor. More details can be found in their own files.
+ *
+ * \par
+ * To add a new compressor, developers need to inherit this class in 'impl'
+ * directory. If a new optional feature like error-feedback is needed,
+ * developers need to use decorator pattern and add new files in the current
+ * directory. The existing implementation can be used as a reference.
+ *
+ *
+ * \sa ErrorFeedback, Momentum
+ */
+class Compressor {
+ public:
+  /*!
+   * \brief Allocate the compressor's buffer and CPU reducer
+   *
+   * \param size number of bytes allocated for the internal buffer; also kept
+   * as the original size
+   * \param dtype data type recorded for this compressor
+   *
+   * \note The initializers are listed in member declaration order, which is
+   * the order they are executed in.
+   */
+  Compressor(size_t size, DataType dtype)
+      : _buf(new byte_t[size]),
+        _size(size),
+        _dtype(dtype),
+        _cpu_reducer(new CpuReducer(nullptr)) {}
+  virtual ~Compressor() = default;
+
+  /*!
+   * \brief Compress function
+   *
+   * \note Except for error-feedback and momentum, the underlying data of the
+   * input should never be changed, because the input is still used by error
+   * feedback if it is enabled.
+   *
+   * \note Compressed data should be stored in the buffer of the compressor, so
+   * this is not an in-place operation.
+   *
+   * \param grad gradient tensor, passed by value.
+   * \return compressed tensor. It is the buffer of the compressor, which
+   * contains the compressed data. The returned size is the size of the
+   * compressed data.
+   */
+  virtual tensor_t Compress(tensor_t grad) = 0;
+
+  /*!
+   * \brief Decompress function
+   *
+   * \note For servers, decompression is not an in-place operation. The
+   * decompressed result is located in the buffer of the compressor. For
+   * workers, it is an in-place operation.
+   *
+   * \param compressed compressed tensor.
+   * \return decompressed tensor. For servers, it is the buffer of the
+   * compressor, which contains the decompressed data. For workers, its pointer
+   * is the same as the input's, while the size is the decompressed size, which
+   * is also the original size.
+   */
+  virtual tensor_t Decompress(tensor_t compressed) = 0;
+
+  /*!
+   * \brief faster version of `UpdateError` via operation fusion
+   *
+   * \par
+   * This is a helper function implemented by each compressor. If defined,
+   * `ErrorFeedback` will use this function instead of the default
+   * `UpdateError` function implemented in error_feedback.cc. If undefined, the
+   * default `UpdateError` will be used.
+   *
+   * \par
+   * Typically `UpdateError` needs to decompress and do a subtraction. But for
+   * most compressors, the step of decompression can be avoided. For example,
+   * for the topk compressor, `UpdateError` can be simplified in this way:
+   * 1. e <- p (e is the error and p is the corrected gradient)
+   * 2. zero-fill e with selected k indices
+   *
+   * Actually it is a fusion of the original decompression and subtraction. It
+   * is optional to override; the base implementation only logs a FATAL
+   * message.
+   *
+   * \param error error
+   * \param corrected gradient corrected with error
+   * \param compressed compressed gradient
+   */
+  virtual void FastUpdateError(tensor_t error, tensor_t corrected,
+                               tensor_t compressed) {
+    BPS_LOG(FATAL) << "FastUpdateError is not implemented";
+  }
+
+  /*! \brief buffer to store compressed grad */
+  std::unique_ptr<byte_t[]> _buf;
+
+  /*! \brief original size */
+  size_t _size;
+
+  /*! \brief data type passed to the constructor */
+  DataType _dtype;
+
+  /*! \brief CPU reducer */
+  std::unique_ptr<CpuReducer> _cpu_reducer;
+};
+
+}  // namespace compressor
+}  // namespace common
+}  // namespace byteps
+
+#endif  // BYTEPS_COMPRESSOR_COMPRESSOR_H