This repository has been archived by the owner on Jan 24, 2024. It is now read-only.

Bitmain Sophon Saber Ops Implementation #151

Merged

49 commits merged on Jun 28, 2018. Changes shown are from 47 commits.

Commits
21b2379
Implement print_tensor_device for BM
guangzhixie Jun 26, 2018
b558318
Update BM tensor test
guangzhixie Jun 26, 2018
99493a4
fix pooling api error
SophonTPU Jun 26, 2018
1f02e14
Update pooling test
guangzhixie Jun 26, 2018
a1e8214
Skip context init for BM
guangzhixie Jun 26, 2018
b1b9f7c
remove flush action in print
guangzhixie Jun 26, 2018
27517ca
ignore set_device for BM for now
guangzhixie Jun 26, 2018
949c4c4
Update logs for copy_from
guangzhixie Jun 26, 2018
51f0f2b
Initialize bm handle only in one place
guangzhixie Jun 26, 2018
1fe4f19
change tensor type_len
hlzy Jun 26, 2018
683969c
Return correct size for AK_BM
guangzhixie Jun 26, 2018
adcac0e
Implement conv for BM
guangzhixie Jun 26, 2018
a4ed82e
Comment out last conv test for now
guangzhixie Jun 26, 2018
d4aa3eb
Modify sync_memcpy & add bm_mem_from_device
Jun 26, 2018
19b5ace
Update BM conv params
guangzhixie Jun 27, 2018
81e33aa
Init handle in init function
guangzhixie Jun 27, 2018
630cabc
Include BM conv implementation
guangzhixie Jun 27, 2018
e1c82c4
remove unnecessary include
guangzhixie Jun 27, 2018
6905020
empty create function
guangzhixie Jun 27, 2018
59dba05
unit test for BM conv
guangzhixie Jun 27, 2018
c27573a
Update BM tensor print function
guangzhixie Jun 27, 2018
679ae3f
modify activation op, test pass
SophonTPU Jun 27, 2018
c0edd55
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
SophonTPU Jun 27, 2018
1ab43e0
tensor_test
hlzy Jun 27, 2018
80f57fb
Fix sync_memcpy functions & test_saber_buffer_BM all passes
Jun 27, 2018
a1bd3fd
Implement BM softmax
guangzhixie Jun 27, 2018
7c0a0f0
only print in DEBUG
guangzhixie Jun 27, 2018
635ff42
reduce iteration
guangzhixie Jun 27, 2018
dc155af
tensor_test_update
hlzy Jun 27, 2018
69cf433
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
hlzy Jun 27, 2018
4a9863f
Revert "reduce iteration"
guangzhixie Jun 27, 2018
4f08bea
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
hlzy Jun 27, 2018
2997faf
modify fc op, compile error
SophonTPU Jun 27, 2018
ff5039f
Update for BM softmax
guangzhixie Jun 27, 2018
ebb12b4
xRevert "modify fc op, compile error"
SophonTPU Jun 27, 2018
9846cd9
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
hlzy Jun 27, 2018
56f6122
change tensor_test_bm
hlzy Jun 27, 2018
048a61c
Merge branch 'bitmain' into tensor_test_lian
hlzy Jun 27, 2018
571e3a4
tensor test update
hlzy Jun 28, 2018
62a04c8
Add back missing files
guangzhixie Jun 28, 2018
bff601c
Add back missing files
guangzhixie Jun 28, 2018
19413c5
Implement BM scale
guangzhixie Jun 28, 2018
25fa481
pooling test
SophonTPU Jun 28, 2018
e532873
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
SophonTPU Jun 28, 2018
56271d4
Fix d2d mem copy
Jun 28, 2018
80654f2
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
SophonTPU Jun 28, 2018
c5a30a7
Add batch norm operation
guangzhixie Jun 28, 2018
b5cdc73
Implement batch norm for BM
guangzhixie Jun 28, 2018
5c6ec7f
Use template specifications instead of macro
guangzhixie Jun 28, 2018
11 changes: 11 additions & 0 deletions saber/core/context.h
@@ -17,6 +17,7 @@

#include "core/env.h"
#include "saber/saber_types.h"
#include <type_traits>

#ifdef USE_BM
#include "bmlib_runtime.h"
@@ -40,6 +41,11 @@ class Context final{
* @param compute_stream_id
*/
Context(int device_id = 0, int data_stream_id = 0, int compute_stream_id = 0){
if(std::is_same<TargetType, BM>::value){
LOG(INFO) << "context init for BM";
return;
}

CHECK_GT(devs.size(), 0) << "Env is not initialized or current target is not exit!";
if (device_id >= devs.size()){
LOG(WARNING) << "device index exceeds the number of devices, set to default device(0)!";
@@ -63,6 +69,11 @@
}

Context(const Context<TargetType>& ctx){
if(std::is_same<TargetType, BM>::value){
LOG(INFO) << "context init for BM";
return;
}

_device_id = ctx._device_id;
_data_stream_id = ctx._data_stream_id;
_compute_stream_id = ctx._compute_stream_id;
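A note on the hunk above: the std::is_same check is evaluated at runtime, so the generic initialization past the early return is still compiled for the BM target. A minimal sketch of a compile-time alternative, assuming the codebase could move to C++17 (which this 2018-era code does not require); only the selected branch is instantiated:

    if constexpr (std::is_same<TargetType, BM>::value) {
        // BM target: no device/stream bookkeeping needed.
        LOG(INFO) << "context init for BM";
    } else {
        // Generic targets: device/stream initialization as in the diff above.
        CHECK_GT(devs.size(), 0) << "Env is not initialized or current target is not exit!";
        // ...
    }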
46 changes: 34 additions & 12 deletions saber/core/impl/bm/bm_impl.cpp
@@ -37,17 +37,17 @@ namespace saber{

typedef TargetWrapper<BM, __device_target> BM_API;

//TODO: check exception
//static bm_handle_t handle = get_bm_handle();
// Init handle only once in the lifetime
static bm_handle_t handle;
Review comment (Collaborator): Is this thread-safe? Or does the HW currently only support a single thread?

static bm_status_t init_handle{bmdnn_init(&handle)};

void BM_API::get_device_count(int &count) {
BMDNN_CHECK(bm_dev_getcount(&count));
}

void BM_API::set_device(int id){
//(bm_handle_t &handle, bool bmkernel_used, int id){
BMDNN_CHECK(bm_dev_request(&handle, 0, id));
//BMDNN_CHECK(bm_dev_request(&handle, 0, id));
}

//TODO: Do we have this functionality?
@@ -78,23 +78,45 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
//BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
}

//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
// size_t count, __DtoD) {};
void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __DtoD) {
handle = get_bm_handle();
//BMDNN_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count));
LOG(INFO) << "BM sync_memcpy: device to device, finished";
};

//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
// size_t count, __HtoD) {};
void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __HtoD) {
handle = get_bm_handle();
BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src)));

#ifdef DEBUG
for(int i=0; i<10; i++)
LOG(INFO) << "HtoD src: " << *((float *)(src)+i);
#endif

LOG(INFO) << "BM sync_memcpy: host to device, finished";
};

void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __DtoH) {
handle = get_bm_handle();
//auto* dev_ptr = const_cast<bm_device_mem_t *>(src);
BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
//BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *src));
LOG(INFO) << "End sync_memcpy process";

#ifdef DEBUG
for(int i=0; i<10; i++)
LOG(INFO) << "DtoH dst: " << *((float *)(dst)+i);
#endif

LOG(INFO) << "BM sync_memcpy: device to host, finished";
};

//static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
// int src_dev, size_t count) {};
void BM_API::sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
int src_dev, size_t count) {

LOG(INFO) << "BM sync_memcpy_p2p: temporarily no used";
};


//! target wrapper
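On the reviewer's thread-safety question above: the static init_handle initializer runs once during static initialization, but the later `handle = get_bm_handle();` assignments inside the memcpy overloads are unsynchronized writes. A minimal sketch of one way to make handle initialization explicitly thread-safe, assuming only the bmdnn_init API already used in this diff:

#include <mutex>

static bm_handle_t handle;
static std::once_flag handle_once;

// All callers fetch the handle through this accessor; bmdnn_init runs
// exactly once even under concurrent first use.
static bm_handle_t& get_handle_threadsafe() {
    std::call_once(handle_once, []() {
        CHECK_EQ(bmdnn_init(&handle), BM_SUCCESS) << "Error: bmdnn_init failed";
    });
    return handle;
}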
19 changes: 16 additions & 3 deletions saber/core/target_wrapper.h
@@ -368,6 +368,15 @@ struct TargetWrapper<NV, __device_target> {
*/
template <>
struct TargetWrapper<BM, __device_target> {
// TargetWrapper<BM, __device_target> ()
// {
// CHECK_EQ(bmdnn_init(&handle),BM_SUCCESS) << "Error:bmdnn_init failed";
// }
// ~TargetWrapper<BM, __device_target> ()
// {
// CHECK_EQ(bmdnn_deinit(handle),BM_SUCCESS) << "Error:bmdnn_deinit failed";
// }

typedef void* event_t;
typedef void* stream_t;

@@ -398,22 +407,26 @@ struct TargetWrapper<BM, __device_target> {
// brief create event, empty function for bitmain target

static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __DtoD) {};
size_t count, __DtoD);

static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __HtoD) {};
size_t count, __HtoD);

static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __DtoH);

static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
int src_dev, size_t count) {};
int src_dev, size_t count);

/**
* \brief device target return currently used device id
* @return currently activated device id
*/
static int get_device_id();

// static bm_handle_t get_handler();

// bm_handle_t handle;
};

#endif //USE_BM
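A hypothetical usage sketch of the overloads declared above. The empty tag types (__HtoD, __DtoH, __DtoD) select the copy direction at compile time; this assumes they are default-constructible flag structs, per the saber convention, and that BM device pointers are passed as bm_device_mem_t*, as in bm_impl.cpp:

typedef TargetWrapper<BM, __device_target> BM_API;

bm_device_mem_t dev_mem;      // device-side descriptor (illustrative)
float host_buf[64] = {0};

// Host to device, then device back to host; the tag argument picks the overload.
BM_API::sync_memcpy(&dev_mem, 0, host_buf, 0, sizeof(host_buf), __HtoD());
BM_API::sync_memcpy(host_buf, 0, &dev_mem, 0, sizeof(host_buf), __DtoH());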
98 changes: 89 additions & 9 deletions saber/core/tensor.h
@@ -19,7 +19,7 @@
#include "core/shape.h"
#include "core/events.h"
#include "core/tensor_traits.h"

#include <typeinfo>
namespace anakin{

namespace saber{
@@ -117,20 +117,49 @@ class Tensor : public TensorBase {
/**
* \brief Constructor with allocated data ptr and entire memory shape.
*/
template <typename TargetType_t>
Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape) {
// template <typename TargetType_t>
// Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape) {
//
// CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \
// "shape dims is not matched to layout type";
// _shape = shape;
// _valid_shape = shape;
// _offset = Shape::zero(shape.dims());
// std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
// std::make_shared<Buffer<TargetType_t>>(data_ptr, shape.count() * _type_len(), id);
// BufferMemShare(_buf, buf_from_date);
// _is_subbuf = false;
// }

#ifdef USE_BM
Review comment (Collaborator): This is the problem: use template specialization here instead of controlling the original code with added macros. If USE_CPU and USE_BM are both enabled in CMake at the same time, can correct execution still be guaranteed?

/**
* \brief Constructor with allocated data ptr and entire memory shape. only for BM
*/
template <typename Dtype_s,typename TargetType_t>
Tensor(Dtype_s* data_ptr, TargetType_t target, int id, Shape shape) {
CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \
"shape dims is not matched to layout type";
_shape = shape;
_valid_shape = shape;
_offset = Shape::zero(shape.dims());

if(typeid(Dtype_s) == typeid(AK_FLOAT))
{
std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
std::make_shared<Buffer<TargetType_t>>(&bm_mem_from_system(const_cast<Dtype_s *>(data_ptr)), shape.count() * _type_len(), id);

BufferMemShare(_buf, buf_from_date);
}
else
{
std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
std::make_shared<Buffer<TargetType_t>>(data_ptr, shape.count() * _type_len(), id);

BufferMemShare(_buf, buf_from_date);
}
_is_subbuf = false;
}

#endif
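One way to act on the review suggestion above, and on the final commit ("Use template specifications instead of macro"), is to route buffer construction through a small traits struct: the primary template covers generic targets, and a BM specialization wraps the host pointer. A hedged sketch; BufferMaker is an illustrative name, not from this PR, and it assumes Buffer copies the descriptor it is given:

// Primary template: generic targets use the plain Buffer constructor.
template <typename TargetType_t, typename Dtype_s>
struct BufferMaker {
    static std::shared_ptr<Buffer<TargetType_t>> make(Dtype_s* ptr, size_t bytes, int id) {
        return std::make_shared<Buffer<TargetType_t>>(ptr, bytes, id);
    }
};

// BM specialization: wrap the host pointer via bm_mem_from_system() first.
template <typename Dtype_s>
struct BufferMaker<BM, Dtype_s> {
    static std::shared_ptr<Buffer<BM>> make(Dtype_s* ptr, size_t bytes, int id) {
        bm_device_mem_t mem = bm_mem_from_system(const_cast<Dtype_s*>(ptr));
        return std::make_shared<Buffer<BM>>(&mem, bytes, id);
    }
};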
/**
* \brief Copy constructor, shallow copy.
*/
@@ -580,7 +609,7 @@
}
CHECK_EQ(valid_size(), tensor.valid_size()) \
<< "sizes of two valid shapes must be the same";

/// get the proper process target wrapper
typedef TargetWrapper<TargetType_t> API_t;
typedef typename TargetTypeTraits<TargetType_t>::target_type target_type_t;
@@ -727,7 +756,8 @@
SaberStatus copy_from(const Tensor<NewTargetType_t, NewDataType_t, NewLayOutType_t>& tensor) {
LOG(WARNING) << "Invalid: copy_from is not allowed for current type.";
return SaberInvalidValue;
}
}

#endif

/**
@@ -942,15 +972,19 @@

#ifdef USE_BM

#ifndef BM_TENSOR_COPY
#define BM_TENSOR_COPY


template<> inline
size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
return 1;
return 4;
}

template<>
template<> inline
SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
LOG(INFO) << "BM copy_from";
LOG(INFO) << "BM copy_from X86";
CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";

auto* device_data_ptr = mutable_data();
@@ -961,16 +995,62 @@ SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor
template<>
template<> inline
SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
LOG(INFO) << "X86 copy_from";
LOG(INFO) << "X86 copy_from BM";
CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";

auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
return SaberSuccess;
}
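A hypothetical round trip through the two specializations above. fill_tensor_host_rand is assumed from saber's tensor_op helpers, and the Shape constructor call is illustrative; the shapes must match for the valid_size() checks to pass:

Shape sh(1, 3, 4, 4);
Tensor<X86, AK_FLOAT, NCHW> host_src(sh);
Tensor<X86, AK_FLOAT, NCHW> host_dst(sh);
Tensor<BM, AK_BM, NCHW> device(sh);

fill_tensor_host_rand(host_src);   // assumed helper: fills host data
device.copy_from(host_src);        // dispatches to bm_memcpy_s2d
host_dst.copy_from(device);        // dispatches to bm_memcpy_d2s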

/*
template<> inline
size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
return 4;
}

template<>
template<> inline
SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
LOG(INFO) << "BM copy_from X86";
CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";

auto* device_data_ptr = mutable_data();
BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
//BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *(bm_device_mem_t *)(mutable_data()), bm_mem_from_system(tensor.data())));
return SaberSuccess;
}

template<>
template<> inline
SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
LOG(INFO) << "X86 copy_from BM";
CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";

auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
//BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *(bm_device_mem_t *)(tensor.data())));
return SaberSuccess;
}

template<>
template<> inline
SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
LOG(INFO) << "BM copy_from BM";
CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";

auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
//BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
//BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *(bm_device_mem_t *)(tensor.data())));
return SaberSuccess;
}
*/

#endif

#endif


} //namespace saber

} //namespace anakin
36 changes: 36 additions & 0 deletions saber/core/tensor_op.cpp
@@ -413,6 +413,42 @@ void fill_tensor_device_const(Tensor<BM, AK_BM, NCHW>& tensor, float value, \
delete [] host_mem_input;
}

template <>
void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tensor, \
typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream) {

LOG(INFO) << "BM device tensor data:" << tensor.size();

/*
const bm_device_mem_t* device_data_ptr = tensor.data();
unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr);
bm_flush(get_bm_handle());
float* device_data = (float*)bm_get_global_addr(gaddr);

for (int i = 0; i < tensor.size(); ++i) {
printf("%.2f ", device_data[i]);

if ((i + 1) % (4 * tensor.width()) == 0) {
printf("\n");
}
}*/

float *host_mem = new float[tensor.size()];
auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);

for (int i = 0; i < tensor.size(); ++i) {
printf("%.2f\t", host_mem[i]);

if ((i + 1) % tensor.width() == 0){
printf("\n");
}
}
printf("\n");

delete [] host_mem;
}

#endif
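A short usage sketch for the BM specializations in this file, assuming the stream parameter is defaulted as it is for the other targets; the print path copies device memory back with bm_memcpy_d2s before formatting:

Tensor<BM, AK_BM, NCHW> t(Shape(1, 1, 2, 8));
fill_tensor_device_const(t, 1.0f);   // specialization defined earlier in this file
print_tensor_device(t);              // d2s copy into a temp host buffer, then printf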

} //namespace saber