diff --git a/cmake/VWFlags.cmake b/cmake/VWFlags.cmake index 1b25f1e2cbf..4805eb9f51c 100644 --- a/cmake/VWFlags.cmake +++ b/cmake/VWFlags.cmake @@ -18,16 +18,13 @@ if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64") endif() endif() -# TODO: Hide this behind some compile flag. -set(LINUX_AVX512_FLAGS -mfma -mavx512f -mavx512bw -mavx512vl -mavx512vpopcntdq) - # Add -ffast-math for speed, remove for testability. # no-stack-check is added to mitigate stack alignment issue on Catalina where there is a bug with aligning stack-check instructions, and stack-check became default option set(LINUX_RELEASE_CONFIG -fno-strict-aliasing ${LINUX_X86_64_OPT_FLAGS} -fno-stack-check -fomit-frame-pointer) set(LINUX_DEBUG_CONFIG -fno-stack-check) -#Use default visiblity on UNIX otherwise a lot of the C++ symbols end up for exported and interpose'able -set(VW_LINUX_FLAGS $<$:${LINUX_DEBUG_CONFIG}> $<$:${LINUX_RELEASE_CONFIG}> $<$:${LINUX_RELEASE_CONFIG}> ${LINUX_AVX512_FLAGS}) +# Use default visiblity on UNIX otherwise a lot of the C++ symbols end up for exported and interpose'able +set(VW_LINUX_FLAGS $<$:${LINUX_DEBUG_CONFIG}> $<$:${LINUX_RELEASE_CONFIG}> $<$:${LINUX_RELEASE_CONFIG}>) set(VW_WIN_FLAGS /MP /Zc:__cplusplus) # Turn on warnings diff --git a/test/benchmarks/standalone/benchmark_text_input.cc b/test/benchmarks/standalone/benchmark_text_input.cc index 1457f49e10d..c6adea66dd2 100644 --- a/test/benchmarks/standalone/benchmark_text_input.cc +++ b/test/benchmarks/standalone/benchmark_text_input.cc @@ -292,8 +292,7 @@ BENCHMARK_CAPTURE(benchmark_multi_predict, cb_las_300actions_20features_1thread, ->UseRealTime() ->Unit(benchmark::kMillisecond); -BENCHMARK_CAPTURE(benchmark_multi_predict, cb_las_300actions, - gen_cb_examples(1, 50, 10, 300, 5, 5, 20, 10, false), +BENCHMARK_CAPTURE(benchmark_multi_predict, cb_las_300actions, gen_cb_examples(1, 50, 10, 300, 5, 5, 20, 10, false), "--cb_explore_adf --large_action_space -q :: --max_actions 20 --quiet") ->MinTime(15.0) ->UseRealTime() @@ -313,8 +312,7 @@ BENCHMARK_CAPTURE(benchmark_multi_predict, cb_las_300actions_plaincb, ->UseRealTime() ->Unit(benchmark::kMillisecond); -BENCHMARK_CAPTURE(benchmark_multi_predict, cb_las_500actions, - gen_cb_examples(1, 50, 10, 500, 5, 5, 20, 10, false), +BENCHMARK_CAPTURE(benchmark_multi_predict, cb_las_500actions, gen_cb_examples(1, 50, 10, 500, 5, 5, 20, 10, false), "--cb_explore_adf --large_action_space -q :: --max_actions 20 --quiet") ->MinTime(15.0) ->UseRealTime() diff --git a/test/unit_test/CMakeLists.txt b/test/unit_test/CMakeLists.txt index 57e575dab88..f9aed5ac7ba 100644 --- a/test/unit_test/CMakeLists.txt +++ b/test/unit_test/CMakeLists.txt @@ -62,6 +62,9 @@ add_executable(vw-unit-test.out weights_test.cc ) +# TODO: Hide behind some compile flag. +set_source_files_properties( cb_las_one_pass_svd_test.cc PROPERTIES COMPILE_FLAGS "-mfma -mavx512f -mavx512bw -mavx512vl -mavx512vpopcntdq" ) + # Add the include directories from vw target for testing target_link_libraries(vw-unit-test.out PRIVATE vw_core Boost::unit_test_framework) target_include_directories(vw-unit-test.out PRIVATE $) diff --git a/test/unit_test/cb_las_one_pass_svd_test.cc b/test/unit_test/cb_las_one_pass_svd_test.cc index 6476da6a60d..992ae66ca5d 100644 --- a/test/unit_test/cb_las_one_pass_svd_test.cc +++ b/test/unit_test/cb_las_one_pass_svd_test.cc @@ -2,6 +2,7 @@ // individual contributors. All rights reserved. Released under a BSD (revised) // license as described in the file LICENSE. +#include "reductions/cb/details/large_action/compute_dot_prod_simd.h" #include "reductions/cb/details/large_action_space.h" #include "test_common.h" #include "vw/core/qr_decomposition.h" @@ -91,11 +92,11 @@ BOOST_AUTO_TEST_CASE(check_AO_linear_combination_of_actions) vws.push_back(vw_rs); - auto* vw_zero_threads_zs = VW::initialize("--cb_explore_adf --large_action_space --max_actions " + std::to_string(d) + + auto* vw_zs = VW::initialize("--cb_explore_adf --large_action_space --max_actions " + std::to_string(d) + " --quiet --random_seed 0 --noconstant", nullptr, false, nullptr, nullptr); - vws.push_back(vw_zero_threads_zs); + vws.push_back(vw_zs); for (auto* vw_ptr : vws) { @@ -174,6 +175,43 @@ BOOST_AUTO_TEST_CASE(check_AO_linear_combination_of_actions) } } -// TODO: Add tests for vectorized implementation. +// TODO: Hide behind some compile flag. +BOOST_AUTO_TEST_CASE(compute16_has_same_results_with_16_compute1) +{ + constexpr uint64_t column_index = 666; + constexpr uint64_t seed = 233; + constexpr uint64_t weights_len = (1 << 18); + constexpr uint64_t weights_mask = weights_len - 1; + constexpr uint64_t offset = 4; + + constexpr int num_features = 16; + std::vector feature_indices(num_features); + std::vector feature_values(num_features); + for (int i = 0; i < num_features; ++i) + { + feature_indices[i] = rand() % weights_len; + feature_values[i] = rand() / static_cast(RAND_MAX); + } + + std::vector expected_results(num_features, 0.f); + for (int i = 0; i < num_features; ++i) + { + VW::cb_explore_adf::compute1( + feature_values[i], feature_indices[i], offset, weights_mask, column_index, seed, expected_results[i]); + } + + __m512 sums = _mm512_setzero_ps(); + const __m512i column_indices = _mm512_set1_epi64(column_index); + const __m512i seeds = _mm512_set1_epi64(seed); + const __m512i weights_masks = _mm512_set1_epi64(weights_mask); + const __m512i offsets = _mm512_set1_epi64(offset); + const __m512 values = _mm512_loadu_ps(&feature_values[0]); + const __m512i indices1 = _mm512_loadu_si512(&feature_indices[0]); + const __m512i indices2 = _mm512_loadu_si512(&feature_indices[8]); + VW::cb_explore_adf::compute16(values, indices1, indices2, offsets, weights_masks, column_indices, seeds, sums); + + const float* results = reinterpret_cast(&sums); + for (int i = 0; i < num_features; ++i) { BOOST_CHECK_CLOSE(results[i], expected_results[i], FLOAT_TOL); } +} BOOST_AUTO_TEST_SUITE_END() diff --git a/vowpalwabbit/core/CMakeLists.txt b/vowpalwabbit/core/CMakeLists.txt index 0715ab3abd7..d8b8f4564f8 100644 --- a/vowpalwabbit/core/CMakeLists.txt +++ b/vowpalwabbit/core/CMakeLists.txt @@ -352,6 +352,9 @@ set(vw_core_sources src/vw_validate.cc ) +# TODO: Hide behind some compile flag. +set_source_files_properties( src/reductions/cb/details/large_action/one_pass_svd_impl.cc PROPERTIES COMPILE_FLAGS "-mfma -mavx512f -mavx512bw -mavx512vl -mavx512vpopcntdq" ) + vw_add_library( NAME "core" TYPE "STATIC_ONLY" diff --git a/vowpalwabbit/core/src/reductions/cb/details/large_action/one_pass_svd_impl.cc b/vowpalwabbit/core/src/reductions/cb/details/large_action/one_pass_svd_impl.cc index 50e8043e31d..971e6ef91a9 100644 --- a/vowpalwabbit/core/src/reductions/cb/details/large_action/one_pass_svd_impl.cc +++ b/vowpalwabbit/core/src/reductions/cb/details/large_action/one_pass_svd_impl.cc @@ -137,7 +137,7 @@ void one_pass_svd_impl::generate_AOmega(const multi_ex& examples, const std::vec #ifdef __linux__ // Only works for linux for now. // TODO: Expose this parameter. - bool use_simd = false; + bool use_simd = true; auto compute_dot_prod = use_simd ? compute_dot_prod_simd : compute_dot_prod_scalar; #else // TODO: Make simd work with msvc. diff --git a/vowpalwabbit/core/src/reductions/cb/details/large_action_space.h b/vowpalwabbit/core/src/reductions/cb/details/large_action_space.h index 45cf78467b1..e42eed9af89 100644 --- a/vowpalwabbit/core/src/reductions/cb/details/large_action_space.h +++ b/vowpalwabbit/core/src/reductions/cb/details/large_action_space.h @@ -12,7 +12,7 @@ #include "vw/core/v_array.h" #include "vw/core/vw_fwd.h" -// Eigen explicit vectorization does not work with smaller ALIGN_BYTES. +// Eigen explicit vectorization does not work with AVX512 when using smaller MAX_ALIGN_BYTES. For more info: // https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html#TopicPreprocessorDirectivesPerformance #define EIGEN_MAX_ALIGN_BYTES 32