This repository has been archived by the owner on Jan 24, 2024. It is now read-only.

Bitmain Sophon Saber Ops Implementation #151

Merged

49 commits merged on Jun 28, 2018. Changes shown are from 47 commits.

Commits
21b2379
Implement print_tensor_device for BM
guangzhixie Jun 26, 2018
b558318
Update BM tensor test
guangzhixie Jun 26, 2018
99493a4
fix pooling api error
SophonTPU Jun 26, 2018
1f02e14
Update pooling test
guangzhixie Jun 26, 2018
a1e8214
Skip context init for BM
guangzhixie Jun 26, 2018
b1b9f7c
remove flush action in print
guangzhixie Jun 26, 2018
27517ca
ignore set_device for BM for now
guangzhixie Jun 26, 2018
949c4c4
Update logs for copy_from
guangzhixie Jun 26, 2018
51f0f2b
Initialize bm handle only in one place
guangzhixie Jun 26, 2018
1fe4f19
change tensor type_len
hlzy Jun 26, 2018
683969c
Return correct size for AK_BM
guangzhixie Jun 26, 2018
adcac0e
Implement conv for BM
guangzhixie Jun 26, 2018
a4ed82e
Comment out last conv test for now
guangzhixie Jun 26, 2018
d4aa3eb
Modify sync_memcpy & add bm_mem_from_device
Jun 26, 2018
19b5ace
Update BM conv params
guangzhixie Jun 27, 2018
81e33aa
Init handle in init function
guangzhixie Jun 27, 2018
630cabc
Include BM conv implementation
guangzhixie Jun 27, 2018
e1c82c4
remove unnecessary include
guangzhixie Jun 27, 2018
6905020
empty create function
guangzhixie Jun 27, 2018
59dba05
unit test for BM conv
guangzhixie Jun 27, 2018
c27573a
Update BM tensor print function
guangzhixie Jun 27, 2018
679ae3f
modify activation op, test pass
SophonTPU Jun 27, 2018
c0edd55
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
SophonTPU Jun 27, 2018
1ab43e0
tensor_test
hlzy Jun 27, 2018
80f57fb
Fix sync_memcpy functions & test_saber_buffer_BM all passes
Jun 27, 2018
a1bd3fd
Implement BM softmax
guangzhixie Jun 27, 2018
7c0a0f0
only print in DEBUG
guangzhixie Jun 27, 2018
635ff42
reduce iteration
guangzhixie Jun 27, 2018
dc155af
tensor_test_update
hlzy Jun 27, 2018
69cf433
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
hlzy Jun 27, 2018
4a9863f
Revert "reduce iteration"
guangzhixie Jun 27, 2018
4f08bea
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
hlzy Jun 27, 2018
2997faf
modify fc op, compile error
SophonTPU Jun 27, 2018
ff5039f
Update for BM softmax
guangzhixie Jun 27, 2018
ebb12b4
xRevert "modify fc op, compile error"
SophonTPU Jun 27, 2018
9846cd9
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
hlzy Jun 27, 2018
56f6122
change tensor_test_bm
hlzy Jun 27, 2018
048a61c
Merge branch 'bitmain' into tensor_test_lian
hlzy Jun 27, 2018
571e3a4
tensor test update
hlzy Jun 28, 2018
62a04c8
Add back missing files
guangzhixie Jun 28, 2018
bff601c
Add back missing files
guangzhixie Jun 28, 2018
19413c5
Implement BM scale
guangzhixie Jun 28, 2018
25fa481
pooling test
SophonTPU Jun 28, 2018
e532873
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
SophonTPU Jun 28, 2018
56271d4
Fix d2d mem copy
Jun 28, 2018
80654f2
Merge branch 'bitmain' of https://github.com/guangzhixie/Anakin into …
SophonTPU Jun 28, 2018
c5a30a7
Add batch norm operation
guangzhixie Jun 28, 2018
b5cdc73
Implement batch norm for BM
guangzhixie Jun 28, 2018
5c6ec7f
Use template specifications instead of macro
guangzhixie Jun 28, 2018
11 changes: 11 additions & 0 deletions saber/core/context.h
@@ -17,6 +17,7 @@

#include "core/env.h"
#include "saber/saber_types.h"
#include <type_traits>

#ifdef USE_BM
#include "bmlib_runtime.h"
@@ -40,6 +41,11 @@ class Context final{
* @param compute_stream_id
*/
Context(int device_id = 0, int data_stream_id = 0, int compute_stream_id = 0){
if(std::is_same<TargetType, BM>::value){
LOG(INFO) << "context init for BM";
return;
}

CHECK_GT(devs.size(), 0) << "Env is not initialized or current target is not exit!";
if (device_id >= devs.size()){
LOG(WARNING) << "device index exceeds the number of devices, set to default device(0)!";
@@ -63,6 +69,11 @@
}

Context(const Context<TargetType>& ctx){
if(std::is_same<TargetType, BM>::value){
LOG(INFO) << "context init for BM";
return;
}

_device_id = ctx._device_id;
_data_stream_id = ctx._data_stream_id;
_compute_stream_id = ctx._compute_stream_id;
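A note on the hunk above: the std::is_same check is evaluated at runtime, so the generic initialization past the early return is still compiled for the BM target. A minimal sketch of a compile-time alternative, assuming the codebase could move to C++17 (which this 2018-era code does not require); only the selected branch is instantiated:

    if constexpr (std::is_same<TargetType, BM>::value) {
        // BM target: no device/stream bookkeeping needed.
        LOG(INFO) << "context init for BM";
    } else {
        // Generic targets: device/stream initialization as in the diff above.
        CHECK_GT(devs.size(), 0) << "Env is not initialized or current target is not exit!";
        // ...
    }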
46 changes: 34 additions & 12 deletions saber/core/impl/bm/bm_impl.cpp
@@ -37,17 +37,17 @@ namespace saber{

typedef TargetWrapper<BM, __device_target> BM_API;

//TODO: check exception
//static bm_handle_t handle = get_bm_handle();
// Init handle only once in the lifetime
static bm_handle_t handle;
Review comment (Collaborator): Is this thread-safe? Or does the HW currently only support a single thread?

static bm_status_t init_handle{bmdnn_init(&handle)};

void BM_API::get_device_count(int &count) {
BMDNN_CHECK(bm_dev_getcount(&count));
}

void BM_API::set_device(int id){
//(bm_handle_t &handle, bool bmkernel_used, int id){
BMDNN_CHECK(bm_dev_request(&handle, 0, id));
//BMDNN_CHECK(bm_dev_request(&handle, 0, id));
}

//TODO: Do we have this functionality?
@@ -78,23 +78,45 @@ void BM_API::mem_set(void* ptr, int value, size_t n){
//BMDNN_CHECK(bm_memset_device(handle, value, *pmem));
}

//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
// size_t count, __DtoD) {};
void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __DtoD) {
handle = get_bm_handle();
//BMDNN_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count));
BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count));
LOG(INFO) << "BM sync_memcpy: device to device, finished";
};

//static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
// size_t count, __HtoD) {};
void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __HtoD) {
handle = get_bm_handle();
BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src)));

#ifdef DEBUG
for(int i=0; i<10; i++)
LOG(INFO) << "HtoD src: " << *((float *)(src)+i);
#endif

LOG(INFO) << "BM sync_memcpy: host to device, finished";
};

void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __DtoH) {
handle = get_bm_handle();
//auto* dev_ptr = const_cast<bm_device_mem_t *>(src);
BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src)));
//BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *src));
LOG(INFO) << "End sync_memcpy process";

#ifdef DEBUG
for(int i=0; i<10; i++)
LOG(INFO) << "DtoH dst: " << *((float *)(dst)+i);
#endif

LOG(INFO) << "BM sync_memcpy: device to host, finished";
};

//static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
// int src_dev, size_t count) {};
void BM_API::sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
int src_dev, size_t count) {

LOG(INFO) << "BM sync_memcpy_p2p: temporarily no used";
};


//! target wrapper
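On the reviewer's thread-safety question above: the static init_handle initializer runs once during static initialization, but the later `handle = get_bm_handle();` assignments inside the memcpy overloads are unsynchronized writes. A minimal sketch of one way to make handle initialization explicitly thread-safe, assuming only the bmdnn_init API already used in this diff:

#include <mutex>

static bm_handle_t handle;
static std::once_flag handle_once;

// All callers fetch the handle through this accessor; bmdnn_init runs
// exactly once even under concurrent first use.
static bm_handle_t& get_handle_threadsafe() {
    std::call_once(handle_once, []() {
        CHECK_EQ(bmdnn_init(&handle), BM_SUCCESS) << "Error: bmdnn_init failed";
    });
    return handle;
}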
19 changes: 16 additions & 3 deletions saber/core/target_wrapper.h
@@ -368,6 +368,15 @@ struct TargetWrapper<NV, __device_target> {
*/
template <>
struct TargetWrapper<BM, __device_target> {
// TargetWrapper<BM, __device_target> ()
// {
// CHECK_EQ(bmdnn_init(&handle),BM_SUCCESS) << "Error:bmdnn_init failed";
// }
// ~TargetWrapper<BM, __device_target> ()
// {
// CHECK_EQ(bmdnn_deinit(handle),BM_SUCCESS) << "Error:bmdnn_deinit failed";
// }

typedef void* event_t;
typedef void* stream_t;

@@ -398,22 +407,26 @@ struct TargetWrapper<BM, __device_target> {
// brief create event, empty function for bitmain target

static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __DtoD) {};
size_t count, __DtoD);

static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __HtoD) {};
size_t count, __HtoD);

static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
size_t count, __DtoH);

static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
int src_dev, size_t count) {};
int src_dev, size_t count);

/**
* \brief device target return currently used device id
* @return currently activated device id
*/
static int get_device_id();

// static bm_handle_t get_handler();

// bm_handle_t handle;
};

#endif //USE_BM
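A hypothetical usage sketch of the overloads declared above. The empty tag types (__HtoD, __DtoH, __DtoD) select the copy direction at compile time; this assumes they are default-constructible flag structs, per the saber convention, and that BM device pointers are passed as bm_device_mem_t*, as in bm_impl.cpp:

typedef TargetWrapper<BM, __device_target> BM_API;

bm_device_mem_t dev_mem;      // device-side descriptor (illustrative)
float host_buf[64] = {0};

// Host to device, then device back to host; the tag argument picks the overload.
BM_API::sync_memcpy(&dev_mem, 0, host_buf, 0, sizeof(host_buf), __HtoD());
BM_API::sync_memcpy(host_buf, 0, &dev_mem, 0, sizeof(host_buf), __DtoH());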
98 changes: 89 additions & 9 deletions saber/core/tensor.h
@@ -19,7 +19,7 @@
#include "core/shape.h"
#include "core/events.h"
#include "core/tensor_traits.h"

#include <typeinfo>
namespace anakin{

namespace saber{
@@ -117,20 +117,49 @@ class Tensor : public TensorBase {
/**
* \brief Constructor with allocated data ptr and entire memory shape.
*/
template <typename TargetType_t>
Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape) {
// template <typename TargetType_t>
// Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape) {
//
// CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \
// "shape dims is not matched to layout type";
// _shape = shape;
// _valid_shape = shape;
// _offset = Shape::zero(shape.dims());
// std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
// std::make_shared<Buffer<TargetType_t>>(data_ptr, shape.count() * _type_len(), id);
// BufferMemShare(_buf, buf_from_date);
// _is_subbuf = false;
// }

#ifdef USE_BM
Review comment (Collaborator): This is the problem: use template specialization here instead of controlling the original code with added macros. If USE_CPU and USE_BM are both enabled in CMake at the same time, can correct execution still be guaranteed?

/**
* \brief Constructor with allocated data ptr and entire memory shape. only for BM
*/
template <typename Dtype_s,typename TargetType_t>
Tensor(Dtype_s* data_ptr, TargetType_t target, int id, Shape shape) {
CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \
"shape dims is not matched to layout type";
_shape = shape;
_valid_shape = shape;
_offset = Shape::zero(shape.dims());

if(typeid(Dtype_s) == typeid(AK_FLOAT))
{
std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
std::make_shared<Buffer<TargetType_t>>(&bm_mem_from_system(const_cast<Dtype_s *>(data_ptr)), shape.count() * _type_len(), id);

BufferMemShare(_buf, buf_from_date);
}
else
{
std::shared_ptr<Buffer<TargetType_t>> buf_from_date = \
std::make_shared<Buffer<TargetType_t>>(data_ptr, shape.count() * _type_len(), id);

BufferMemShare(_buf, buf_from_date);
}
_is_subbuf = false;
}

#endif
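One way to act on the review suggestion above, and on the final commit ("Use template specifications instead of macro"), is to route buffer construction through a small traits struct: the primary template covers generic targets, and a BM specialization wraps the host pointer. A hedged sketch; BufferMaker is an illustrative name, not from this PR, and it assumes Buffer copies the descriptor it is given:

// Primary template: generic targets use the plain Buffer constructor.
template <typename TargetType_t, typename Dtype_s>
struct BufferMaker {
    static std::shared_ptr<Buffer<TargetType_t>> make(Dtype_s* ptr, size_t bytes, int id) {
        return std::make_shared<Buffer<TargetType_t>>(ptr, bytes, id);
    }
};

// BM specialization: wrap the host pointer via bm_mem_from_system() first.
template <typename Dtype_s>
struct BufferMaker<BM, Dtype_s> {
    static std::shared_ptr<Buffer<BM>> make(Dtype_s* ptr, size_t bytes, int id) {
        bm_device_mem_t mem = bm_mem_from_system(const_cast<Dtype_s*>(ptr));
        return std::make_shared<Buffer<BM>>(&mem, bytes, id);
    }
};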
/**
* \brief Copy constructor, shallow copy.
*/
@@ -580,7 +609,7 @@
}
CHECK_EQ(valid_size(), tensor.valid_size()) \
<< "sizes of two valid shapes must be the same";

/// get the proper process target wrapper
typedef TargetWrapper<TargetType_t> API_t;
typedef typename TargetTypeTraits<TargetType_t>::target_type target_type_t;
@@ -727,7 +756,8 @@
SaberStatus copy_from(const Tensor<NewTargetType_t, NewDataType_t, NewLayOutType_t>& tensor) {
LOG(WARNING) << "Invalid: copy_from is not allowed for current type.";
return SaberInvalidValue;
}
}

#endif

/**
@@ -942,15 +972,19 @@

#ifdef USE_BM

#ifndef BM_TENSOR_COPY
#define BM_TENSOR_COPY


template<> inline
size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
return 1;
return 4;
}

template<>
template<> inline
SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
LOG(INFO) << "BM copy_from";
LOG(INFO) << "BM copy_from X86";
CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";

auto* device_data_ptr = mutable_data();
@@ -961,16 +995,62 @@ SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor
template<>
template<> inline
SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
LOG(INFO) << "X86 copy_from";
LOG(INFO) << "X86 copy_from BM";
CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";

auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
return SaberSuccess;
}
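A hypothetical round trip through the two specializations above. fill_tensor_host_rand is assumed from saber's tensor_op helpers, and the Shape constructor call is illustrative; the shapes must match for the valid_size() checks to pass:

Shape sh(1, 3, 4, 4);
Tensor<X86, AK_FLOAT, NCHW> host_src(sh);
Tensor<X86, AK_FLOAT, NCHW> host_dst(sh);
Tensor<BM, AK_BM, NCHW> device(sh);

fill_tensor_host_rand(host_src);   // assumed helper: fills host data
device.copy_from(host_src);        // dispatches to bm_memcpy_s2d
host_dst.copy_from(device);        // dispatches to bm_memcpy_d2s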

/*
template<> inline
size_t Tensor<BM, AK_BM, NCHW>::_type_len(){
return 4;
}

template<>
template<> inline
SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<X86, AK_FLOAT, NCHW>(const Tensor<X86, AK_FLOAT, NCHW>& tensor) {
LOG(INFO) << "BM copy_from X86";
CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";

auto* device_data_ptr = mutable_data();
BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast<float *>(tensor.data()))));
//BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *(bm_device_mem_t *)(mutable_data()), bm_mem_from_system(tensor.data())));
return SaberSuccess;
}

template<>
template<> inline
SaberStatus Tensor<X86, AK_FLOAT, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
LOG(INFO) << "X86 copy_from BM";
CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";

auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
//BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *(bm_device_mem_t *)(tensor.data())));
return SaberSuccess;
}

template<>
template<> inline
SaberStatus Tensor<BM, AK_BM, NCHW>::copy_from<BM, AK_BM, NCHW>(const Tensor<BM, AK_BM, NCHW>& tensor) {
LOG(INFO) << "BM copy_from BM";
CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same";

auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
//BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr));
//BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *(bm_device_mem_t *)(tensor.data())));
return SaberSuccess;
}
*/

#endif

#endif


} //namespace saber

} //namespace anakin
36 changes: 36 additions & 0 deletions saber/core/tensor_op.cpp
@@ -413,6 +413,42 @@ void fill_tensor_device_const(Tensor<BM, AK_BM, NCHW>& tensor, float value, \
delete [] host_mem_input;
}

template <>
void print_tensor_device<Tensor<BM, AK_BM, NCHW>>(Tensor<BM, AK_BM, NCHW>& tensor, \
typename Tensor<BM, AK_BM, NCHW>::API::stream_t stream) {

LOG(INFO) << "BM device tensor data:" << tensor.size();

/*
const bm_device_mem_t* device_data_ptr = tensor.data();
unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr);
bm_flush(get_bm_handle());
float* device_data = (float*)bm_get_global_addr(gaddr);

for (int i = 0; i < tensor.size(); ++i) {
printf("%.2f ", device_data[i]);

if ((i + 1) % (4 * tensor.width()) == 0) {
printf("\n");
}
}*/

float *host_mem = new float[tensor.size()];
auto* device_data_ptr = const_cast<bm_device_mem_t *>(tensor.data());
bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr);

for (int i = 0; i < tensor.size(); ++i) {
printf("%.2f\t", host_mem[i]);

if ((i + 1) % tensor.width() == 0){
printf("\n");
}
}
printf("\n");

delete [] host_mem;
}

#endif
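A short usage sketch for the BM specializations in this file, assuming the stream parameter is defaulted as it is for the other targets; the print path copies device memory back with bm_memcpy_d2s before formatting:

Tensor<BM, AK_BM, NCHW> t(Shape(1, 1, 2, 8));
fill_tensor_device_const(t, 1.0f);   // specialization defined earlier in this file
print_tensor_device(t);              // d2s copy into a temp host buffer, then printf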

} //namespace saber