//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/CPP/type_traits.h" // cpp::always_false
#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif //__ARM_NEON

namespace LIBC_NAMESPACE::aarch64 {
LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);
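
// kNeon lets callers pick the NEON paths below at compile time. A minimal
// dispatch sketch (the surrounding function is hypothetical, not part of this
// file):
//
//   if constexpr (aarch64::kNeon)
//     return aarch64::Bcmp<32>::block(p1, p2);
//   else
//     return generic::Bcmp<uint64_t>::block(p1, p2);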
namespace neon {

struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;
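  // 'dc zva' zeroes the whole 64-byte block containing the given address (the
  // low address bits are ignored), and it can only write zeros, which is why
  // the value parameter of block() is unused: callers are expected to pass a
  // block-aligned dst and only use this path for zeroing.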
  LIBC_INLINE static void block(Ptr dst, uint8_t) {
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};

LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and the
  // block size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}
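
// A runtime-dispatch sketch (illustrative only; names such as 'aligned_dst'
// are hypothetical and this is not the actual memset dispatch in LLVM libc):
// when the CPU reports 64-byte ZVA blocks, large aligned zeroing can take the
// cache-line path, otherwise fall back to regular vector stores.
//
//   if (value == 0 && count >= BzeroCacheLine::SIZE && hasZva())
//     return BzeroCacheLine::loop_and_tail(aligned_dst, 0, count);
//   generic::Memset<generic_v512>::loop_and_tail(dst, value, count);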
} // namespace neon
///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to 4
      // bytes in two steps. First, calculate the saturated move value when
      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
      // a single 32 bit nonzero value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }
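
  // Compares the trailing SIZE bytes of each buffer; requires count >= SIZE.
  // The loads may overlap bytes already examined by an earlier block() call.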
  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }
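
  // Compares the first and last SIZE bytes of each buffer with potentially
  // overlapping loads, so a single call covers any count in [SIZE, 2 * SIZE]
  // without a loop.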
  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // anbocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t anbocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t anbocpdq_reduced = vqmovn_u64(anbocpdq);
      return vmaxv_u32(anbocpdq_reduced);
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};
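
// A size-dispatch sketch showing how these blocks compose into a bcmp entry
// point (function name and thresholds are illustrative, not the actual
// dispatch used by LLVM libc; counts below 16 would be handled by generic
// scalar paths):
//
//   LIBC_INLINE BcmpReturnType bcmp_sketch(CPtr p1, CPtr p2, size_t count) {
//     if (count <= 32)
//       return aarch64::Bcmp<16>::head_tail(p1, p2, count); // 16 <= count
//     if (count <= 64)
//       return aarch64::Bcmp<32>::head_tail(p1, p2, count);
//     return aarch64::Bcmp<32>::loop_and_tail(p1, p2, count);
//   }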
} // namespace LIBC_NAMESPACE::aarch64

namespace LIBC_NAMESPACE::generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
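// Both operands below are zero-extended 16-bit values, so their int32_t
// difference cannot overflow, and loading big-endian makes the numeric order
// match the lexicographic byte order that memcmp requires.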
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
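// Unlike the uint16_t case, a 32-bit difference can overflow int32_t, so cmp
// performs an explicit three-way comparison of the big-endian values instead
// of a subtraction.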
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
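// Note: although the element type is a NEON vector, cmp below walks the 16
// bytes as two scalar big-endian uint64_t words and resolves the first
// mismatch with cmp_neq_uint64_t; a plausible rationale (an assumption, not
// documented here) is that producing the ordering result in general-purpose
// registers avoids a SIMD-to-GPR transfer.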
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}
} // namespace LIBC_NAMESPACE::generic

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H