Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix x86 SIMD byte shift intrinsics #1168

Merged
merged 1 commit into from
May 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 40 additions & 32 deletions crates/core_arch/src/x86/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2585,44 +2585,52 @@ pub unsafe fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
// Computes the shuffle index for output byte `i` of a 256-bit left byte-shift
// performed per 128-bit lane (`_mm256_bslli_epi128`). The shuffle operates on
// the concatenation [zero(32 bytes), a(32 bytes)], so index 0 selects a zero
// byte and index 32 + j selects a[j].
//
// The immediate is masked to 8 bits first (matching hardware pslldq behavior);
// a shift of more than 15 zeroes the whole lane.
const fn mask(shift: i32, i: u32) -> u32 {
    let s = (shift as u32) & 0xff;
    // Byte `i` receives a[i - s] only when the source stays inside the
    // same 16-byte lane; everything shifted in from below is zero.
    if s <= 15 && i % 16 >= s {
        32 + (i - s)
    } else {
        0
    }
}
let a = a.as_i8x32();
let zero = _mm256_setzero_si256().as_i8x32();
let r: i8x32 = simd_shuffle32!(
zero,
a,
<const IMM8: i32> [
32 - (IMM8 as u32 & 0xff),
33 - (IMM8 as u32 & 0xff),
34 - (IMM8 as u32 & 0xff),
35 - (IMM8 as u32 & 0xff),
36 - (IMM8 as u32 & 0xff),
37 - (IMM8 as u32 & 0xff),
38 - (IMM8 as u32 & 0xff),
39 - (IMM8 as u32 & 0xff),
40 - (IMM8 as u32 & 0xff),
41 - (IMM8 as u32 & 0xff),
42 - (IMM8 as u32 & 0xff),
43 - (IMM8 as u32 & 0xff),
44 - (IMM8 as u32 & 0xff),
45 - (IMM8 as u32 & 0xff),
46 - (IMM8 as u32 & 0xff),
47 - (IMM8 as u32 & 0xff),
48 - (IMM8 as u32 & 0xff) - 16,
49 - (IMM8 as u32 & 0xff) - 16,
50 - (IMM8 as u32 & 0xff) - 16,
51 - (IMM8 as u32 & 0xff) - 16,
52 - (IMM8 as u32 & 0xff) - 16,
53 - (IMM8 as u32 & 0xff) - 16,
54 - (IMM8 as u32 & 0xff) - 16,
55 - (IMM8 as u32 & 0xff) - 16,
56 - (IMM8 as u32 & 0xff) - 16,
57 - (IMM8 as u32 & 0xff) - 16,
58 - (IMM8 as u32 & 0xff) - 16,
59 - (IMM8 as u32 & 0xff) - 16,
60 - (IMM8 as u32 & 0xff) - 16,
61 - (IMM8 as u32 & 0xff) - 16,
62 - (IMM8 as u32 & 0xff) - 16,
63 - (IMM8 as u32 & 0xff) - 16,
mask(IMM8, 0),
mask(IMM8, 1),
mask(IMM8, 2),
mask(IMM8, 3),
mask(IMM8, 4),
mask(IMM8, 5),
mask(IMM8, 6),
mask(IMM8, 7),
mask(IMM8, 8),
mask(IMM8, 9),
mask(IMM8, 10),
mask(IMM8, 11),
mask(IMM8, 12),
mask(IMM8, 13),
mask(IMM8, 14),
mask(IMM8, 15),
mask(IMM8, 16),
mask(IMM8, 17),
mask(IMM8, 18),
mask(IMM8, 19),
mask(IMM8, 20),
mask(IMM8, 21),
mask(IMM8, 22),
mask(IMM8, 23),
mask(IMM8, 24),
mask(IMM8, 25),
mask(IMM8, 26),
mask(IMM8, 27),
mask(IMM8, 28),
mask(IMM8, 29),
mask(IMM8, 30),
mask(IMM8, 31),
],
);
transmute(r)
Expand Down
136 changes: 72 additions & 64 deletions crates/core_arch/src/x86/avx512bw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8873,76 +8873,84 @@ pub unsafe fn _mm_maskz_cvtepu8_epi16(k: __mmask8, a: __m128i) -> __m128i {
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_bslli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
static_assert_imm8!(IMM8);
// Shuffle-index helper for `_mm512_bslli_epi128`: a byte shift left applied
// independently to each 128-bit lane of a 512-bit vector. The shuffle source
// is [zero(64 bytes), a(64 bytes)], so index 64 + j picks a[j] and any index
// below 64 picks a zero byte.
//
// The immediate is reduced modulo 256 up front (hardware ignores upper bits);
// shifts above 15 clear the entire lane.
const fn mask(shift: i32, i: u32) -> u32 {
    let s = (shift as u32) & 0xff;
    // Within a lane, output byte `i` takes a[i - s]; positions whose source
    // would cross the lane boundary are filled with zero instead.
    if s <= 15 && i % 16 >= s {
        64 + (i - s)
    } else {
        0
    }
}
let a = a.as_i8x64();
let zero = _mm512_setzero_si512().as_i8x64();
let r: i8x64 = simd_shuffle64!(
zero,
a,
<const IMM8: i32> [
64 - (IMM8 as u32 & 0xff),
65 - (IMM8 as u32 & 0xff),
66 - (IMM8 as u32 & 0xff),
67 - (IMM8 as u32 & 0xff),
68 - (IMM8 as u32 & 0xff),
69 - (IMM8 as u32 & 0xff),
70 - (IMM8 as u32 & 0xff),
71 - (IMM8 as u32 & 0xff),
72 - (IMM8 as u32 & 0xff),
73 - (IMM8 as u32 & 0xff),
74 - (IMM8 as u32 & 0xff),
75 - (IMM8 as u32 & 0xff),
76 - (IMM8 as u32 & 0xff),
77 - (IMM8 as u32 & 0xff),
78 - (IMM8 as u32 & 0xff),
79 - (IMM8 as u32 & 0xff),
80 - (IMM8 as u32 & 0xff) - 16,
81 - (IMM8 as u32 & 0xff) - 16,
82 - (IMM8 as u32 & 0xff) - 16,
83 - (IMM8 as u32 & 0xff) - 16,
84 - (IMM8 as u32 & 0xff) - 16,
85 - (IMM8 as u32 & 0xff) - 16,
86 - (IMM8 as u32 & 0xff) - 16,
87 - (IMM8 as u32 & 0xff) - 16,
88 - (IMM8 as u32 & 0xff) - 16,
89 - (IMM8 as u32 & 0xff) - 16,
90 - (IMM8 as u32 & 0xff) - 16,
91 - (IMM8 as u32 & 0xff) - 16,
92 - (IMM8 as u32 & 0xff) - 16,
93 - (IMM8 as u32 & 0xff) - 16,
94 - (IMM8 as u32 & 0xff) - 16,
95 - (IMM8 as u32 & 0xff) - 16,
96 - (IMM8 as u32 & 0xff) - 32,
97 - (IMM8 as u32 & 0xff) - 32,
98 - (IMM8 as u32 & 0xff) - 32,
99 - (IMM8 as u32 & 0xff) - 32,
100 - (IMM8 as u32 & 0xff) - 32,
101 - (IMM8 as u32 & 0xff) - 32,
102 - (IMM8 as u32 & 0xff) - 32,
103 - (IMM8 as u32 & 0xff) - 32,
104 - (IMM8 as u32 & 0xff) - 32,
105 - (IMM8 as u32 & 0xff) - 32,
106 - (IMM8 as u32 & 0xff) - 32,
107 - (IMM8 as u32 & 0xff) - 32,
108 - (IMM8 as u32 & 0xff) - 32,
109 - (IMM8 as u32 & 0xff) - 32,
110 - (IMM8 as u32 & 0xff) - 32,
111 - (IMM8 as u32 & 0xff) - 32,
112 - (IMM8 as u32 & 0xff) - 48,
113 - (IMM8 as u32 & 0xff) - 48,
114 - (IMM8 as u32 & 0xff) - 48,
115 - (IMM8 as u32 & 0xff) - 48,
116 - (IMM8 as u32 & 0xff) - 48,
117 - (IMM8 as u32 & 0xff) - 48,
118 - (IMM8 as u32 & 0xff) - 48,
119 - (IMM8 as u32 & 0xff) - 48,
120 - (IMM8 as u32 & 0xff) - 48,
121 - (IMM8 as u32 & 0xff) - 48,
122 - (IMM8 as u32 & 0xff) - 48,
123 - (IMM8 as u32 & 0xff) - 48,
124 - (IMM8 as u32 & 0xff) - 48,
125 - (IMM8 as u32 & 0xff) - 48,
126 - (IMM8 as u32 & 0xff) - 48,
127 - (IMM8 as u32 & 0xff) - 48,
mask(IMM8, 0),
mask(IMM8, 1),
mask(IMM8, 2),
mask(IMM8, 3),
mask(IMM8, 4),
mask(IMM8, 5),
mask(IMM8, 6),
mask(IMM8, 7),
mask(IMM8, 8),
mask(IMM8, 9),
mask(IMM8, 10),
mask(IMM8, 11),
mask(IMM8, 12),
mask(IMM8, 13),
mask(IMM8, 14),
mask(IMM8, 15),
mask(IMM8, 16),
mask(IMM8, 17),
mask(IMM8, 18),
mask(IMM8, 19),
mask(IMM8, 20),
mask(IMM8, 21),
mask(IMM8, 22),
mask(IMM8, 23),
mask(IMM8, 24),
mask(IMM8, 25),
mask(IMM8, 26),
mask(IMM8, 27),
mask(IMM8, 28),
mask(IMM8, 29),
mask(IMM8, 30),
mask(IMM8, 31),
mask(IMM8, 32),
mask(IMM8, 33),
mask(IMM8, 34),
mask(IMM8, 35),
mask(IMM8, 36),
mask(IMM8, 37),
mask(IMM8, 38),
mask(IMM8, 39),
mask(IMM8, 40),
mask(IMM8, 41),
mask(IMM8, 42),
mask(IMM8, 43),
mask(IMM8, 44),
mask(IMM8, 45),
mask(IMM8, 46),
mask(IMM8, 47),
mask(IMM8, 48),
mask(IMM8, 49),
mask(IMM8, 50),
mask(IMM8, 51),
mask(IMM8, 52),
mask(IMM8, 53),
mask(IMM8, 54),
mask(IMM8, 55),
mask(IMM8, 56),
mask(IMM8, 57),
mask(IMM8, 58),
mask(IMM8, 59),
mask(IMM8, 60),
mask(IMM8, 61),
mask(IMM8, 62),
mask(IMM8, 63),
],
);
transmute(r)
Expand Down
5 changes: 3 additions & 2 deletions crates/core_arch/src/x86/sse2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -425,10 +425,11 @@ pub unsafe fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
#[target_feature(enable = "sse2")]
unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
// Shuffle-index helper for `_mm_slli_si128` (128-bit byte shift left).
// The shuffle source is [zero(16 bytes), a(16 bytes)]: index i (< 16) selects
// a zero byte, index 16 + j selects a[j].
//
// Fix: the shift immediate must be masked to its low 8 bits before the range
// check, matching hardware pslldq semantics for out-of-range immediates.
// (This span interleaved the removed and added diff lines; this is the
// coherent fixed version.)
const fn mask(shift: i32, i: u32) -> u32 {
    let shift = shift as u32 & 0xff;
    if shift > 15 {
        // Shifting by 16 or more bytes zeroes the vector: select only zeros.
        i
    } else {
        // Output byte i takes a[i - shift]; a[j] lives at shuffle index 16 + j.
        16 - shift + i
    }
}
let zero = _mm_set1_epi8(0).as_i8x16();
Expand Down