// bswap.cc
#include "node_api.h"
#include <cstdint>
#include <cstddef>
///// Min compiler support and bswap intrins used in scalar region
#if defined(__GNUC__) // GCC, clang
# ifdef __clang__
#   if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 4)
#     error("Requires clang >= 3.4")
#   endif
# else
#   if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)
#     error("Requires GCC >= 4.8")
#   endif
# endif
# define BSWAP_INTRINSIC_2(x) __builtin_bswap16(x)
# define BSWAP_INTRINSIC_4(x) __builtin_bswap32(x)
# define BSWAP_INTRINSIC_8(x) __builtin_bswap64(x)
#elif defined(_MSC_VER)
# define BSWAP_INTRINSIC_2(x) _byteswap_ushort(x)
# define BSWAP_INTRINSIC_4(x) _byteswap_ulong(x)
# define BSWAP_INTRINSIC_8(x) _byteswap_uint64(x)
#endif // _MSC_VER
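// Each intrinsic reverses the byte order of its operand, e.g.
// BSWAP_INTRINSIC_4(0x11223344u) == 0x44332211u.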
using std::uint8_t;
using std::uint16_t;
using std::uint32_t;
using std::uint64_t;
using std::size_t;
using std::uintptr_t;
#include "x86.h"
#include "neon.h"
static inline void swap(uint16_t* val) { *val = BSWAP_INTRINSIC_2(*val); }
static inline void swap(uint32_t* val) { *val = BSWAP_INTRINSIC_4(*val); }
static inline void swap(uint64_t* val) { *val = BSWAP_INTRINSIC_8(*val); }
template<typename STYPE, class VTYPE>
static void shuffle(STYPE* data, size_t elemLength) {
  size_t elemIdx = 0;
  constexpr size_t vectSize = VTYPE::size();

  // 1. Head: Scalar until aligned to SIMD register. Not optimized for buffers
  // < VTYPE::size() bytes, where we could use narrower vectors than the
  // widest supported by the ISA.
  while ((reinterpret_cast<uintptr_t>(data + elemIdx) % vectSize) && elemIdx < elemLength) {
    swap(&data[elemIdx++]);
  }

  VTYPE mask = VTYPE::template getMask<STYPE>();

  // 2. Main body: vectors unrolled by 8
  constexpr size_t elemsPerVec = vectSize / sizeof(STYPE);
  constexpr size_t elemsPerIter = 8 * elemsPerVec;
  while (elemIdx + elemsPerIter < elemLength) {
    VTYPE::swap(&data[elemIdx + 0 * elemsPerVec], mask);
    VTYPE::swap(&data[elemIdx + 1 * elemsPerVec], mask);
    VTYPE::swap(&data[elemIdx + 2 * elemsPerVec], mask);
    VTYPE::swap(&data[elemIdx + 3 * elemsPerVec], mask);
    VTYPE::swap(&data[elemIdx + 4 * elemsPerVec], mask);
    VTYPE::swap(&data[elemIdx + 5 * elemsPerVec], mask);
    VTYPE::swap(&data[elemIdx + 6 * elemsPerVec], mask);
    VTYPE::swap(&data[elemIdx + 7 * elemsPerVec], mask);
    elemIdx += elemsPerIter;
  }

  // 3. Tail A: vectors without unrolling
  while (elemIdx + elemsPerVec < elemLength) {
    VTYPE::swap(data + elemIdx, mask);
    elemIdx += elemsPerVec;
  }

  // 4. Tail B: scalar until end
  while (elemIdx < elemLength) swap(&data[elemIdx++]);
}
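// Illustrative numbers for the loop above: with uint16_t elements and 16-byte
// SSE registers, elemsPerVec = 16 / 2 = 8 and elemsPerIter = 64, so each
// unrolled iteration in step 2 swaps 64 elements (128 bytes); any remainder
// drains through the single-vector tail (3) and the scalar tail (4).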
template <class VTYPE>
napi_value flipBytes(napi_env env, napi_callback_info info) {
  // TODO(perf): consider this to warm up the wider registers:
  // asm volatile("vxorps ymm0, ymm0, ymm0" : : "ymm0")
  napi_status status;
  napi_value args[1];
  size_t argc = 1;
  status = napi_get_cb_info(env, info, &argc, args, nullptr, nullptr);
  if (status != napi_ok) goto error;
  if (argc < 1) {
    napi_throw_error(env, NULL, "Expected typed array");
    return nullptr;
  }

  bool isTypedArray;
  status = napi_is_typedarray(env, args[0], &isTypedArray);
  if (status != napi_ok) goto error;
  if (!isTypedArray) {
    napi_throw_error(env, NULL, "Expected typed array");
    return nullptr;
  }

  napi_typedarray_type type;
  size_t elemLength;
  void* data;
  status = napi_get_typedarray_info(env, args[0], &type, &elemLength, &data, nullptr, nullptr);
  if (status != napi_ok) goto error;

  switch (type) {
    case napi_int16_array:
    case napi_uint16_array:
      shuffle<uint16_t, VTYPE>(reinterpret_cast<uint16_t*>(data), elemLength);
      return nullptr;
    case napi_int32_array:
    case napi_uint32_array:
    case napi_float32_array:
      shuffle<uint32_t, VTYPE>(reinterpret_cast<uint32_t*>(data), elemLength);
      return nullptr;
    case napi_float64_array:
#if (NAPI_VERSION > 5)
    case napi_bigint64_array:
    case napi_biguint64_array:
#endif // (NAPI_VERSION > 5)
      shuffle<uint64_t, VTYPE>(reinterpret_cast<uint64_t*>(data), elemLength);
      return nullptr;
    case napi_int8_array:
    case napi_uint8_array:
    case napi_uint8_clamped_array:
    default:
      // Byte swapping is a no-op for 1-byte element types.
      return nullptr;
  }

error:
  const napi_extended_error_info* error_info = nullptr;
  napi_get_last_error_info(env, &error_info);
  const char* err_message = error_info->error_message;
  bool is_pending;
  napi_is_exception_pending(env, &is_pending);
  if (!is_pending) {
    const char* message = err_message == nullptr ? "empty error message" : err_message;
    napi_throw_error(env, nullptr, message);
  }
  return nullptr;
}
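// Example usage from JavaScript (module path is hypothetical):
//   const { bswap, ise } = require('./build/Release/bswap.node');
//   const a = new Uint32Array([0x11223344]);
//   bswap(a);         // swaps in place; a[0] === 0x44332211
//   console.log(ise); // instruction set chosen at load time, e.g. "AVX2"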
#define NAPI_CALL(env, call)                                  \
  do {                                                        \
    napi_status status = (call);                              \
    if (status != napi_ok) {                                  \
      const napi_extended_error_info* error_info = NULL;      \
      napi_get_last_error_info((env), &error_info);           \
      const char* err_message = error_info->error_message;    \
      bool is_pending;                                        \
      napi_is_exception_pending((env), &is_pending);          \
      if (!is_pending) {                                      \
        const char* message = (err_message == NULL)           \
            ? "empty error message"                           \
            : err_message;                                    \
        napi_throw_error((env), NULL, message);               \
      }                                                       \
      return NULL;                                            \
    }                                                         \
  } while (0)
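// NAPI_CALL turns a failed napi_* call into a pending JavaScript exception
// (unless one is already pending) and returns NULL from the enclosing
// function, so it is only usable where NULL is a valid napi_value return.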
napi_value Init(napi_env env, napi_value exports) {
  napi_value fn;
  napi_value ise;

  // MSVC doesn't have any equivalent to -march=native, but it will emit
  // instructions from any instruction set when intrinsics are used. This lets
  // us set EnableEnhancedInstructionSet to a low/compatible value while using
  // run-time dispatch to pick a faster version.
#ifdef _MSC_VER
  // Warning: keep the ternary inside the napi_create_function call; hoisting
  // it out tanks performance.
# ifdef BSWAP_USE_AVX512
  NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH,
      supportsAVX512BW() ? flipBytes<Vec512> :
      supportsAVX2() ? flipBytes<Vec256> : flipBytes<Vec128>, NULL, &fn));
  NAPI_CALL(env, napi_create_string_latin1(env,
      supportsAVX512BW() ? "AVX512" : supportsAVX2() ? "AVX2" : "SSSE3",
      NAPI_AUTO_LENGTH, &ise));
# else
  NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH,
      supportsAVX2() ? flipBytes<Vec256> : flipBytes<Vec128>, NULL, &fn));
  NAPI_CALL(env, napi_create_string_latin1(env,
      supportsAVX2() ? "AVX2" : "SSSE3", NAPI_AUTO_LENGTH, &ise));
# endif // BSWAP_USE_AVX512
#else
  // GNU-compatible compilers have -march=native, and refuse to emit
  // instructions from an instruction set less than the -m flags allow.
# if defined(__AVX512BW__) && defined(BSWAP_USE_AVX512)
  // Disabled by default because it is slower than AVX2.
  NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH, flipBytes<Vec512>, NULL, &fn));
  NAPI_CALL(env, napi_create_string_latin1(env, "AVX512", NAPI_AUTO_LENGTH, &ise));
# elif defined(__AVX2__)
  NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH, flipBytes<Vec256>, NULL, &fn));
  NAPI_CALL(env, napi_create_string_latin1(env, "AVX2", NAPI_AUTO_LENGTH, &ise));
# elif defined(__SSSE3__)
  NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH, flipBytes<Vec128>, NULL, &fn));
  NAPI_CALL(env, napi_create_string_latin1(env, "SSSE3", NAPI_AUTO_LENGTH, &ise));
# elif defined(__ARM_NEON)
  NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH, flipBytes<VecNeon>, NULL, &fn));
  NAPI_CALL(env, napi_create_string_latin1(env, "NEON", NAPI_AUTO_LENGTH, &ise));
# endif
#endif
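  // Note: the dispatch above assumes one of the instruction-set branches
  // matched; on a target with none of them, fn and ise are left uninitialized.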
  NAPI_CALL(env, napi_set_named_property(env, exports, "bswap", fn));
  NAPI_CALL(env, napi_set_named_property(env, exports, "ise", ise));
  return exports;
}
NAPI_MODULE(bswap, Init)