Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve emulation of AVX2 min/max 64-bit #99

Merged
merged 5 commits into from
Nov 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/bench-compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then
fi
compare=$(realpath .bench/google-benchmark/tools/compare.py)

meson setup --warnlevel 0 --buildtype release builddir-${branch}
meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir-${branch}
cd builddir-${branch}
ninja
$compare filters ./benchexe $1 $2
2 changes: 1 addition & 1 deletion scripts/branch-compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ build_branch() {
fi
fi
cd $dir_name
meson setup --warnlevel 0 --buildtype release builddir
meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir
cd builddir
ninja
cd ../../
Expand Down
29 changes: 18 additions & 11 deletions src/avx2-64bit-qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct avx2_vector<int64_t> {
#else
static constexpr int network_sort_threshold = 64;
#endif
static constexpr int partition_unroll_factor = 4;
static constexpr int partition_unroll_factor = 8;

using swizzle_ops = avx2_64bit_swizzle_ops;

Expand Down Expand Up @@ -89,12 +89,15 @@ struct avx2_vector<int64_t> {
{
return _mm256_xor_si256(x, y);
}
// Lane-wise signed 64-bit "greater than" comparison.
// Forwards to _mm256_cmpgt_epi64 and returns its result unchanged as the
// comparison mask for x > y.
static opmask_t gt(reg_t x, reg_t y)
{
    const opmask_t x_above_y = _mm256_cmpgt_epi64(x, y);
    return x_above_y;
}
// Lane-wise signed 64-bit "greater than or equal" comparison, built as
// (x == y) OR (x > y) because only eq() and _mm256_cmpgt_epi64 are used here.
//
// NOTE(review): this listing is a rendered diff without +/- markers, so two
// versions of the final return statement appear back to back. The hunk header
// (-89,12 +89,15) is consistent with the two-line castpd/or_pd return being
// the removed form and the or_si256 return being its replacement; do not read
// them as sequential statements.
static opmask_t ge(reg_t x, reg_t y)
{
opmask_t equal = eq(x, y);
opmask_t greater = _mm256_cmpgt_epi64(x, y);
// Presumably the pre-change form: OR the two masks in the double domain,
// casting to __m256d and back.
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(equal),
_mm256_castsi256_pd(greater)));
// Presumably the post-change form: OR the two masks directly as integers.
return _mm256_or_si256(equal, greater);
}
static opmask_t eq(reg_t x, reg_t y)
{
Expand Down Expand Up @@ -221,7 +224,7 @@ struct avx2_vector<uint64_t> {
#else
static constexpr int network_sort_threshold = 64;
#endif
static constexpr int partition_unroll_factor = 4;
static constexpr int partition_unroll_factor = 8;

using swizzle_ops = avx2_64bit_swizzle_ops;

Expand Down Expand Up @@ -258,17 +261,21 @@ struct avx2_vector<uint64_t> {
return _mm256_i64gather_epi64(
(long long int const *)base, index, scale);
}
// Lane-wise "greater than" comparison for 64-bit lanes holding unsigned
// values. Both operands have their top bit flipped (XOR with
// 0x8000000000000000) before the signed compare: flipping the top bit maps
// unsigned ordering onto signed ordering, so _mm256_cmpgt_epi64 then yields
// the unsigned x > y mask.
static opmask_t gt(reg_t x, reg_t y)
{
    const __m256i sign_bit = _mm256_set1_epi64x(0x8000000000000000);
    const __m256i x_biased = _mm256_xor_si256(x, sign_bit);
    const __m256i y_biased = _mm256_xor_si256(y, sign_bit);
    return _mm256_cmpgt_epi64(x_biased, y_biased);
}
// Lane-wise "greater than or equal" comparison for 64-bit lanes holding
// unsigned values, built as (x == y) OR (biased x > biased y). The bias flips
// the top bit of every lane so that the signed _mm256_cmpgt_epi64 orders the
// operands as unsigned numbers.
//
// NOTE(review): this listing is a rendered diff without +/- markers, so old
// and new lines are interleaved. The hunk header (-258,17 +261,21) is
// consistent with the add_epi64 pair and the castpd/or_pd return being the
// removed lines, and the xor_si256 pair and the or_si256 return being their
// replacements. Do not read them as sequential statements: applied one after
// the other, the add and the xor of 0x8000000000000000 would cancel out.
static opmask_t ge(reg_t x, reg_t y)
{
opmask_t equal = eq(x, y);

const __m256i offset = _mm256_set1_epi64x(0x8000000000000000);
// Presumably the pre-change bias: adding 2^63 (mod 2^64) flips only the top bit.
x = _mm256_add_epi64(x, offset);
y = _mm256_add_epi64(y, offset);

// Presumably the post-change bias: XOR flips the same top bit.
x = _mm256_xor_si256(x, offset);
y = _mm256_xor_si256(y, offset);
opmask_t greater = _mm256_cmpgt_epi64(x, y);
// Presumably the pre-change return: OR the masks in the double domain.
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(equal),
_mm256_castsi256_pd(greater)));
// Presumably the post-change return: OR the masks directly as integers.
return _mm256_or_si256(equal, greater);
}
static opmask_t eq(reg_t x, reg_t y)
{
Expand Down Expand Up @@ -380,7 +387,7 @@ struct avx2_vector<double> {
#else
static constexpr int network_sort_threshold = 64;
#endif
static constexpr int partition_unroll_factor = 4;
static constexpr int partition_unroll_factor = 8;

using swizzle_ops = avx2_64bit_swizzle_ops;

Expand Down
4 changes: 2 additions & 2 deletions src/avx2-emu-funcs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ typename avx2_vector<T>::reg_t avx2_emu_max(typename avx2_vector<T>::reg_t x,
typename avx2_vector<T>::reg_t y)
{
using vtype = avx2_vector<T>;
typename vtype::opmask_t nlt = vtype::ge(x, y);
typename vtype::opmask_t nlt = vtype::gt(x, y);
return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(y),
_mm256_castsi256_pd(x),
_mm256_castsi256_pd(nlt)));
Expand All @@ -284,7 +284,7 @@ typename avx2_vector<T>::reg_t avx2_emu_min(typename avx2_vector<T>::reg_t x,
typename avx2_vector<T>::reg_t y)
{
using vtype = avx2_vector<T>;
typename vtype::opmask_t nlt = vtype::ge(x, y);
typename vtype::opmask_t nlt = vtype::gt(x, y);
return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(x),
_mm256_castsi256_pd(y),
_mm256_castsi256_pd(nlt)));
Expand Down
Loading