Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] [libdevice] Add vector overloads of ConvertBFloat16ToFINTEL and ConvertFToBFloat16INTEL #14085

Merged
merged 11 commits into from
Jun 12, 2024
40 changes: 40 additions & 0 deletions libdevice/bfloat16_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#if defined(__SPIR__) || defined(__SPIRV__)

#include <CL/__spirv/spirv_ops.hpp>
#include <CL/__spirv/spirv_types.hpp>
#include <cassert>
#include <cstdint>

DEVICE_EXTERN_C_INLINE
Expand All @@ -23,4 +25,42 @@ float __devicelib_ConvertBF16ToFINTEL(const uint16_t &x) {
return __spirv_ConvertBF16ToFINTEL(x);
}

// For vector of size 1.
DEVICE_EXTERN_C_INLINE
void __devicelib_ConvertFToBF16INTELVec1(const float *src, uint16_t *dst) {
dst[0] = __spirv_ConvertFToBF16INTEL(src[0]);
}
DEVICE_EXTERN_C_INLINE
void __devicelib_ConvertBF16ToFINTELVec1(const uint16_t *src, float *dst) {
dst[0] = __spirv_ConvertBF16ToFINTEL(src[0]);
}

// Generate the conversion functions for vector of size 2, 3, 4, 8, 16.
#define GenerateConvertFunctionForVec(size) \
DEVICE_EXTERN_C_INLINE \
void __devicelib_ConvertFToBF16INTELVec##size(const float *src, \
uint16_t *dst) { \
__ocl_vec_t<float, size> x = \
*__builtin_bit_cast(const __ocl_vec_t<float, size> *, src); \
__ocl_vec_t<uint16_t, size> y = __spirv_ConvertFToBF16INTEL(x); \
*__builtin_bit_cast(__ocl_vec_t<uint16_t, size> *, dst) = y; \
} \
DEVICE_EXTERN_C_INLINE \
void __devicelib_ConvertBF16ToFINTELVec##size(const uint16_t *src, \
float *dst) { \
__ocl_vec_t<uint16_t, size> x = \
*__builtin_bit_cast(const __ocl_vec_t<uint16_t, size> *, src); \
__ocl_vec_t<float, size> y = __spirv_ConvertBF16ToFINTEL(x); \
*__builtin_bit_cast(__ocl_vec_t<float, size> *, dst) = y; \
}

// clang-format off
GenerateConvertFunctionForVec(2)
GenerateConvertFunctionForVec(3)
GenerateConvertFunctionForVec(4)
GenerateConvertFunctionForVec(8)
GenerateConvertFunctionForVec(16)
// clang-format on
#undef GenerateConvertFunctionForVec

#endif // __SPIR__ || __SPIRV__
27 changes: 27 additions & 0 deletions libdevice/fallback-bfloat16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,31 @@ __devicelib_ConvertBF16ToFINTEL(const uint16_t &a) {
return floatValue;
}

// Generate the conversion functions for vector of size 1, 2, 3, 4, 8, 16.
#define GenerateConvertFunctionForVec(size) \
DEVICE_EXTERN_C_INLINE \
void __devicelib_ConvertFToBF16INTELVec##size(const float *src, \
uint16_t *dst) { \
for (int i = 0; i < size; ++i) { \
dst[i] = __devicelib_ConvertFToBF16INTEL(src[i]); \
} \
} \
DEVICE_EXTERN_C_INLINE \
void __devicelib_ConvertBF16ToFINTELVec##size(const uint16_t *src, \
float *dst) { \
for (int i = 0; i < size; ++i) { \
dst[i] = __devicelib_ConvertBF16ToFINTEL(src[i]); \
} \
}

// clang-format off
GenerateConvertFunctionForVec(1)
GenerateConvertFunctionForVec(2)
GenerateConvertFunctionForVec(3)
GenerateConvertFunctionForVec(4)
GenerateConvertFunctionForVec(8)
GenerateConvertFunctionForVec(16)
// clang-format on
#undef GenerateConvertFunctionForVec

#endif // __SPIR__ || __SPIRV__
24 changes: 24 additions & 0 deletions llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,30 @@ SYCLDeviceLibFuncMap SDLMap = {
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertBF16ToFINTEL",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertFToBF16INTELVec1",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertBF16ToFINTELVec1",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertFToBF16INTELVec2",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertBF16ToFINTELVec2",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertFToBF16INTELVec3",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertBF16ToFINTELVec3",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertFToBF16INTELVec4",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertBF16ToFINTELVec4",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertFToBF16INTELVec8",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertBF16ToFINTELVec8",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertFToBF16INTELVec16",
DeviceLibExt::cl_intel_devicelib_bfloat16},
{"__devicelib_ConvertBF16ToFINTELVec16",
DeviceLibExt::cl_intel_devicelib_bfloat16},
};

// Each fallback device library corresponds to one bit in "require mask" which
Expand Down
10 changes: 8 additions & 2 deletions sycl/include/CL/__spirv/spirv_ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1019,10 +1019,16 @@ extern __DPCPP_SYCL_EXTERNAL void
__spirv_ocl_prefetch(const __attribute__((opencl_global)) char *Ptr,
size_t NumBytes) noexcept;

extern __DPCPP_SYCL_EXTERNAL uint16_t
__spirv_ConvertFToBF16INTEL(float) noexcept;
extern __DPCPP_SYCL_EXTERNAL float
__spirv_ConvertBF16ToFINTEL(uint16_t) noexcept;
extern __DPCPP_SYCL_EXTERNAL uint16_t
__spirv_ConvertFToBF16INTEL(float) noexcept;
template <int N>
extern __DPCPP_SYCL_EXTERNAL __ocl_vec_t<float, N>
__spirv_ConvertBF16ToFINTEL(__ocl_vec_t<uint16_t, N>) noexcept;
template <int N>
extern __DPCPP_SYCL_EXTERNAL __ocl_vec_t<uint16_t, N>
__spirv_ConvertFToBF16INTEL(__ocl_vec_t<float, N>) noexcept;

__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL
__SYCL_EXPORT __ocl_vec_t<uint32_t, 4>
Expand Down
70 changes: 70 additions & 0 deletions sycl/include/sycl/ext/oneapi/bfloat16.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,30 @@ extern "C" __DPCPP_SYCL_EXTERNAL uint16_t
__devicelib_ConvertFToBF16INTEL(const float &) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL float
__devicelib_ConvertBF16ToFINTEL(const uint16_t &) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertFToBF16INTELVec1(const float *, uint16_t *) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertBF16ToFINTELVec1(const uint16_t *, float *) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertFToBF16INTELVec2(const float *, uint16_t *) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertBF16ToFINTELVec2(const uint16_t *, float *) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertFToBF16INTELVec3(const float *, uint16_t *) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertBF16ToFINTELVec3(const uint16_t *, float *) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertFToBF16INTELVec4(const float *, uint16_t *) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertBF16ToFINTELVec4(const uint16_t *, float *) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertFToBF16INTELVec8(const float *, uint16_t *) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertBF16ToFINTELVec8(const uint16_t *, float *) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertFToBF16INTELVec16(const float *, uint16_t *) noexcept;
extern "C" __DPCPP_SYCL_EXTERNAL void
__devicelib_ConvertBF16ToFINTELVec16(const uint16_t *, float *) noexcept;

namespace sycl {
inline namespace _V1 {
Expand All @@ -30,6 +54,52 @@ using Bfloat16StorageT = uint16_t;
Bfloat16StorageT bfloat16ToBits(const bfloat16 &Value);
bfloat16 bitsToBfloat16(const Bfloat16StorageT Value);

template <int N> void BF16VecToFloatVec(const bfloat16 src[N], float dst[N]) {
#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__))
const uint16_t *src_i16 = sycl::bit_cast<const uint16_t *>(src);
if constexpr (N == 1)
__devicelib_ConvertBF16ToFINTELVec1(src_i16, dst);
else if constexpr (N == 2)
__devicelib_ConvertBF16ToFINTELVec2(src_i16, dst);
else if constexpr (N == 3)
__devicelib_ConvertBF16ToFINTELVec3(src_i16, dst);
else if constexpr (N == 4)
__devicelib_ConvertBF16ToFINTELVec4(src_i16, dst);
else if constexpr (N == 8)
__devicelib_ConvertBF16ToFINTELVec8(src_i16, dst);
else if constexpr (N == 16)
__devicelib_ConvertBF16ToFINTELVec16(src_i16, dst);
#else
for (int i = 0; i < N; ++i) {
dst[i] = (float)src[i];
}
#endif
}

template <int N> void FloatVecToBF16Vec(float src[N], bfloat16 dst[N]) {
#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__))
uint16_t *dst_i16 = sycl::bit_cast<uint16_t *>(dst);
if constexpr (N == 1)
__devicelib_ConvertFToBF16INTELVec1(src, dst_i16);
else if constexpr (N == 2)
__devicelib_ConvertFToBF16INTELVec2(src, dst_i16);
else if constexpr (N == 3)
__devicelib_ConvertFToBF16INTELVec3(src, dst_i16);
else if constexpr (N == 4)
__devicelib_ConvertFToBF16INTELVec4(src, dst_i16);
else if constexpr (N == 8)
__devicelib_ConvertFToBF16INTELVec8(src, dst_i16);
else if constexpr (N == 16)
__devicelib_ConvertFToBF16INTELVec16(src, dst_i16);
#else
for (int i = 0; i < N; ++i) {
// No need to cast as bfloat16 has a assignment op overload that takes
// a float.
dst[i] = src[i];
}
#endif
}

// sycl::vec support
namespace bf16 {
#ifdef __SYCL_DEVICE_ONLY__
Expand Down
84 changes: 84 additions & 0 deletions sycl/test-e2e/BFloat16/bfloat16_conversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
#include <iostream>
#include <sycl/detail/core.hpp>

#include <sycl/ext/oneapi/bfloat16.hpp>

using namespace sycl;
using bfloat16 = sycl::ext::oneapi::bfloat16;

template <typename T> T calculate(T a, T b) {
sycl::ext::oneapi::bfloat16 x = -a;
Expand Down Expand Up @@ -55,6 +58,82 @@ template <typename T> int test_host() {
return 1;
}

int test_host_vector_conversions() {
bool Passed = true;
std::cout << "float[4] -> bfloat16[4] -> float[4] conversion on host..."
<< std::flush;

float FloatArray[4] = {1.0f, 2.0f, 3.0f, 4.0f};

// float[4] -> bfloat16[4]
bfloat16 BFloatArray[4];
sycl::ext::oneapi::detail::FloatVecToBF16Vec<4>(FloatArray, BFloatArray);

// bfloat16[4] -> float[4]
float NewFloatArray[4];
sycl::ext::oneapi::detail::BF16VecToFloatVec<4>(BFloatArray, NewFloatArray);

// Check results.
for (int i = 0; i < 4; ++i)
Passed &= (FloatArray[i] == NewFloatArray[i]);

if (Passed)
std::cout << "passed\n";
else
std::cout << "failed\n";

return !Passed;
}

int test_device_vector_conversions(queue Q) {
int err = 0;
buffer<int> err_buf(&err, 1);

std::cout << "float[4] -> bfloat16[4] conversion on device..." << std::flush;
// Convert float array to bfloat16 array
Q.submit([&](handler &CGH) {
accessor<int, 1, access::mode::write, target::device> ERR(err_buf, CGH);
CGH.single_task([=]() {
float FloatArray[4] = {1.0f, -1.0f, 0.0f, 2.0f};
bfloat16 BF16Array[4];
sycl::ext::oneapi::detail::FloatVecToBF16Vec<4>(FloatArray, BF16Array);
for (int i = 0; i < 4; i++) {
if (FloatArray[i] != (float)BF16Array[i]) {
ERR[0] = 1;
}
}
});
}).wait();

if (err)
std::cout << "failed\n";
else
std::cout << "passed\n";

std::cout << "bfloat16[4] -> float[4] conversion on device..." << std::flush;
// Convert bfloat16 array back to float array
Q.submit([&](handler &CGH) {
accessor<int, 1, access::mode::write, target::device> ERR(err_buf, CGH);
CGH.single_task([=]() {
bfloat16 BF16Array[3] = {1.0f, 0.0f, -1.0f};
float FloatArray[3];
sycl::ext::oneapi::detail::BF16VecToFloatVec<4>(BF16Array, FloatArray);
for (int i = 0; i < 3; i++) {
if (FloatArray[i] != (float)BF16Array[i]) {
ERR[0] = 1;
}
}
});
}).wait();

if (err)
std::cout << "failed\n";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if there is a future change in the source that causes this test to fail someday, it'll be more work for whoever catches that ticket. Make the failure output more explicit. At minimum output "failed on host" and "failed on device". And if you know of anything else that might be useful to some unlucky co-worker, maybe include that too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. I'll improve the test output.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, after looking at the test again, the test will currently output something like:

    bfloat16[4] -> float[4] conversion on device...  passed/failed

or

float[4] -> bfloat16[4] -> float[4] conversion on host.... passed/failed

for device and host respectively. There's a std::cout before queue.submit() which will output the type of conversion being performed along where the conversion takes place (host/device).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this go to std::cerr instead of std::cout?

else
std::cout << "passed\n";

return err;
}

int main() {
queue Q;
int result;
Expand All @@ -63,6 +142,11 @@ int main() {
if (Q.get_device().has(aspect::fp16))
result |= test_device<sycl::half>(Q);
result |= test_device<float>(Q);

// Test vector BF16 -> float conversion and vice versa.
result |= test_host_vector_conversions();
result |= test_device_vector_conversions(Q);

if (result)
std::cout << "FAIL\n";
else
Expand Down
Loading
Loading