From 97cb9a6e810e4941c4889de909420f1c8be2ec93 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Mon, 16 Dec 2024 13:08:06 +0200 Subject: [PATCH 01/14] temp commit --- icicle/backend/cpu/include/ntt_cpu.h | 141 +++++++++++++-------------- icicle/tests/test_field_api.cpp | 31 +++--- 2 files changed, 84 insertions(+), 88 deletions(-) diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index 7d7e2d386..1225c0d96 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -79,42 +79,76 @@ namespace ntt_cpu { eIcicleError NttCpu::run() { copy_and_reorder_if_needed(input, ntt_data.elements); - if (ntt_data.is_parallel) { - if (ntt_data.logn > HIERARCHY_1) { - for (uint32_t hierarchy_1_layer_idx = 0; hierarchy_1_layer_idx < 2; hierarchy_1_layer_idx++) { - const uint32_t sunbtt_plus_batch_logn = - ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[hierarchy_1_layer_idx] + - uint32_t(log2(ntt_data.config.batch_size)); - const uint32_t log_nof_hierarchy_1_subntts_todo_in_parallel = - sunbtt_plus_batch_logn < HIERARCHY_1 ? HIERARCHY_1 - sunbtt_plus_batch_logn : 0; - const uint32_t nof_hierarchy_1_subntts_todo_in_parallel = 1 << log_nof_hierarchy_1_subntts_todo_in_parallel; - const uint32_t log_nof_subntts_chunks = - ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1 - hierarchy_1_layer_idx] - - log_nof_hierarchy_1_subntts_todo_in_parallel; - const uint32_t nof_subntts_chunks = 1 << log_nof_subntts_chunks; - for (uint32_t hierarchy_1_subntts_chunck_idx = 0; hierarchy_1_subntts_chunck_idx < nof_subntts_chunks; - hierarchy_1_subntts_chunck_idx++) { - for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; - hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; - hierarchy_1_subntt_idx_in_chunck++) { - hierarchy1_push_tasks( - hierarchy_1_layer_idx, hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + - hierarchy_1_subntt_idx_in_chunck); - } - handle_pushed_tasks(hierarchy_1_layer_idx); - } - if (hierarchy_1_layer_idx == 0) { hierarchy_1_reorder(); } - } - reorder_output(); - } else { - hierarchy1_push_tasks(0, 0); - handle_pushed_tasks(0); - } - } else { + if (!ntt_data.is_parallel) { if (ntt_data.direction == NTTDir::kForward && ntt_data.config.coset_gen != S::one()) { coset_mul(); } NttTask task; task.set_data(ntt_data); task.execute(); + } else { + uint32_t nof_hierarchy_1_layers = ntt_data.logn > HIERARCHY_1? 2 : 1; + for (uint32_t hierarchy_1_layer_idx = 0; hierarchy_1_layer_idx < nof_hierarchy_1_layers; hierarchy_1_layer_idx++) { + const uint32_t sunbtt_plus_batch_logn = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[hierarchy_1_layer_idx] + uint32_t(log2(ntt_data.config.batch_size)); + const uint32_t log_nof_hierarchy_1_subntts_todo_in_parallel = ((sunbtt_plus_batch_logn < HIERARCHY_1) && (ntt_data.logn > HIERARCHY_1)) ? HIERARCHY_1 - sunbtt_plus_batch_logn : 0; + const uint32_t nof_hierarchy_1_subntts_todo_in_parallel = 1 << log_nof_hierarchy_1_subntts_todo_in_parallel; + const uint32_t log_nof_subntts_chunks = (ntt_data.logn > HIERARCHY_1) ? ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1 - hierarchy_1_layer_idx] - log_nof_hierarchy_1_subntts_todo_in_parallel : 0; + const uint32_t nof_subntts_chunks = 1 << log_nof_subntts_chunks; + + for (uint32_t hierarchy_1_subntts_chunck_idx = 0; hierarchy_1_subntts_chunck_idx < nof_subntts_chunks; hierarchy_1_subntts_chunck_idx++) { + + uint32_t nof_hierarchy_0_layers = (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2] != 0) ? 3 : (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1] != 0) ? 2 : 1; + for (uint32_t hierarchy_0_layer_idx = 0; hierarchy_0_layer_idx < nof_hierarchy_0_layers; hierarchy_0_layer_idx++) { + uint64_t nof_blocks; + uint64_t nof_subntts; + if (hierarchy_0_layer_idx == 0) { + nof_blocks = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2]; + nof_subntts = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]; + } else if (hierarchy_0_layer_idx == 1) { + nof_blocks = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2]; + nof_subntts = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0]; + } else { + nof_blocks = 1 << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0] + ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]); + nof_subntts = 1; + } + //openmp tasks collapse + for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { + for (uint32_t hierarchy_0_block_idx = 0; hierarchy_0_block_idx < (nof_blocks); hierarchy_0_block_idx++) { + for (uint32_t hierarchy_0_subntt_idx = 0; hierarchy_0_subntt_idx < (nof_subntts); hierarchy_0_subntt_idx++) { + NttTask task; + NttTaskCoordinates ntt_task_coordinates; + task.set_data(ntt_data); + ntt_task_coordinates.hierarchy_1_layer_idx = hierarchy_1_layer_idx; // this is line 119 + ntt_task_coordinates.hierarchy_1_subntt_idx = hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck; + ntt_task_coordinates.hierarchy_0_layer_idx = hierarchy_0_layer_idx; + ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx; + ntt_task_coordinates.hierarchy_0_subntt_idx = hierarchy_0_subntt_idx; + ntt_task_coordinates.reorder = false; + task.set_coordinates(&ntt_task_coordinates); + task.execute(); + } + } + } + if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // all ntt tasks in hierarchy 1 are pushed, now push reorder task so that the data + // is in the correct order for the next hierarchy 1 layer + //openmp + for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { + NttTask task; + NttTaskCoordinates ntt_task_coordinates; + task.set_data(ntt_data); + ntt_task_coordinates.hierarchy_1_layer_idx = hierarchy_1_layer_idx; + ntt_task_coordinates.hierarchy_1_subntt_idx = hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck; + ntt_task_coordinates.hierarchy_0_layer_idx = nof_hierarchy_0_layers; + ntt_task_coordinates.hierarchy_0_block_idx = 0; + ntt_task_coordinates.hierarchy_0_subntt_idx = 0; + ntt_task_coordinates.reorder = true; + task.set_coordinates(&ntt_task_coordinates); + task.execute(); + } + } + } + } + if ((ntt_data.logn > HIERARCHY_1) && (hierarchy_1_layer_idx == 0)) { hierarchy_1_reorder(); } + } + if (ntt_data.logn > HIERARCHY_1){ reorder_output(); } } if (ntt_data.direction == NTTDir::kInverse && ntt_data.config.coset_gen != S::one()) { coset_mul(); } @@ -355,45 +389,6 @@ namespace ntt_cpu { return eIcicleError::UNKNOWN_ERROR; // Handle case where no task manager is available } - uint32_t nof_hierarchy_0_layers = - (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2] != 0) ? 3 - : (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1] != 0) ? 2 - : 1; - uint64_t nof_blocks; - uint64_t nof_subntts; - for (uint32_t hierarchy_0_layer_idx = 0; hierarchy_0_layer_idx < nof_hierarchy_0_layers; hierarchy_0_layer_idx++) { - if (hierarchy_0_layer_idx == 0) { - nof_blocks = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2]; - nof_subntts = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]; - } else if (hierarchy_0_layer_idx == 1) { - nof_blocks = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2]; - nof_subntts = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0]; - } else { - nof_blocks = 1 - << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0] + - ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]); - nof_subntts = 1; - } - for (uint32_t hierarchy_0_block_idx = 0; hierarchy_0_block_idx < (nof_blocks); hierarchy_0_block_idx++) { - for (uint32_t hierarchy_0_subntt_idx = 0; hierarchy_0_subntt_idx < (nof_subntts); hierarchy_0_subntt_idx++) { - if (hierarchy_0_layer_idx == 0) { - NttTaskCoordinates* ntt_task_coordinates = ntt_tasks_manager->get_slot_for_next_task_coordinates(); - ntt_task_coordinates->hierarchy_1_layer_idx = hierarchy_1_layer_idx; - ntt_task_coordinates->hierarchy_1_subntt_idx = hierarchy_1_subntt_idx; - ntt_task_coordinates->hierarchy_0_layer_idx = hierarchy_0_layer_idx; - ntt_task_coordinates->hierarchy_0_block_idx = hierarchy_0_block_idx; - ntt_task_coordinates->hierarchy_0_subntt_idx = hierarchy_0_subntt_idx; - ntt_task_coordinates->reorder = false; - } else { - ntt_tasks_manager->nof_pending_tasks++; - } - } - } - } - if (nof_hierarchy_0_layers > 1) { // all ntt tasks in hierarchy 1 are pushed, now push reorder task so that the data - // is in the correct order for the next hierarchy 1 layer - ntt_tasks_manager->nof_pending_tasks++; - } return eIcicleError::SUCCESS; } @@ -430,7 +425,7 @@ namespace ntt_cpu { } else { task_slot->set_data(ntt_data); } - NttTaskCoordinates* next_task_c_ptr = ntt_tasks_manager->get_available_task(); + NttTaskCoordinates next_task_c_ptr = ntt_tasks_manager->get_available_task(); task_slot->set_coordinates(next_task_c_ptr); task_slot->dispatch(); } else { @@ -438,7 +433,7 @@ namespace ntt_cpu { task_slot = tasks_manager->get_completed_task(); if (ntt_tasks_manager->handle_completed(task_slot, nof_subntts_l1)) { continue; } if (ntt_tasks_manager->available_tasks()) { - NttTaskCoordinates* next_task_c_ptr = ntt_tasks_manager->get_available_task(); + NttTaskCoordinates next_task_c_ptr = ntt_tasks_manager->get_available_task(); task_slot->set_coordinates(next_task_c_ptr); task_slot->dispatch(); } else { diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 3414811a7..3ea241148 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -813,22 +813,23 @@ TEST_F(FieldApiTestBase, polynomialDivision) TYPED_TEST(FieldApiTest, ntt) { // Randomize configuration - const bool inplace = rand_uint_32b(0, 1); - const int logn = rand_uint_32b(3, 17); + for(int logn=3; logn<25; logn++){ + const bool inplace = 0; + // const int logn = rand_uint_32b(3, 17); const uint64_t N = 1 << logn; - const int log_ntt_domain_size = logn + 1; - const int log_batch_size = rand_uint_32b(0, 2); + const int log_ntt_domain_size = logn; + const int log_batch_size = 0; const int batch_size = 1 << log_batch_size; - const int _ordering = rand_uint_32b(0, 3); + const int _ordering = 0; const Ordering ordering = static_cast(_ordering); - bool columns_batch; - if (logn == 7 || logn < 4) { - columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) - } else { - columns_batch = rand_uint_32b(0, 1); - } - const NTTDir dir = static_cast(rand_uint_32b(0, 1)); // 0: forward, 1: inverse - const int log_coset_stride = rand_uint_32b(0, 2); + bool columns_batch = false; + // if (logn == 7 || logn < 4) { + // columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) + // } else { + // columns_batch = rand_uint_32b(0, 1); + // } + const NTTDir dir = static_cast(0); // 0: forward, 1: inverse + const int log_coset_stride = 0; scalar_t coset_gen; if (log_coset_stride) { coset_gen = scalar_t::omega(logn + log_coset_stride); @@ -836,7 +837,7 @@ TYPED_TEST(FieldApiTest, ntt) coset_gen = scalar_t::one(); } - ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "LOGN = " << logn; ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; ICICLE_LOG_DEBUG << "inplace = " << inplace; @@ -901,7 +902,7 @@ TYPED_TEST(FieldApiTest, ntt) run(IcicleTestBase::reference_device(), out_ref.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); run(IcicleTestBase::main_device(), out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); -} +}} #endif // NTT // define program From 10e0974b78d00a930b59f007e3864e0e171a4e69 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Wed, 18 Dec 2024 17:30:08 +0200 Subject: [PATCH 02/14] parallel with omp, temp commit --- icicle/CMakeLists.txt | 8 +++++++- icicle/backend/cpu/include/ntt_cpu.h | 16 +++++++++++++--- icicle/backend/cpu/include/ntt_data.h | 6 +++--- icicle/cmake/curve.cmake | 2 +- icicle/cmake/field.cmake | 2 +- icicle/cmake/hash.cmake | 2 +- icicle/tests/CMakeLists.txt | 16 ++++++++-------- 7 files changed, 34 insertions(+), 18 deletions(-) diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 64f2479a9..ab9685daa 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -79,13 +79,19 @@ if(SANITIZE) set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} -fsanitize=address") endif() +# Find and include OpenMP +set(OpenMP_ROOT $ENV{OPENMP_ROOT}) +find_package(OpenMP REQUIRED) + # device API library add_library(icicle_device SHARED src/device_api.cpp src/runtime.cpp src/config_extension.cpp ) -target_link_libraries(icicle_device PUBLIC dl) +target_link_libraries(icicle_device PUBLIC dl OpenMP::OpenMP_CXX) +message(STATUS "OpenMP CXX Flags: ${OpenMP_CXX_FLAGS}") + include_directories(include) # Define the install directory (default is /usr/local) diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index 1225c0d96..23b9ac8ef 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -5,6 +5,7 @@ #include "ntt_utils.h" #include #include +#include #ifdef CURVE_ID #include "icicle/curves/curve_config.h" @@ -109,14 +110,14 @@ namespace ntt_cpu { nof_blocks = 1 << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0] + ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]); nof_subntts = 1; } - //openmp tasks collapse for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { + #pragma omp parallel for collapse(2) schedule(dynamic) for (uint32_t hierarchy_0_block_idx = 0; hierarchy_0_block_idx < (nof_blocks); hierarchy_0_block_idx++) { for (uint32_t hierarchy_0_subntt_idx = 0; hierarchy_0_subntt_idx < (nof_subntts); hierarchy_0_subntt_idx++) { NttTask task; NttTaskCoordinates ntt_task_coordinates; task.set_data(ntt_data); - ntt_task_coordinates.hierarchy_1_layer_idx = hierarchy_1_layer_idx; // this is line 119 + ntt_task_coordinates.hierarchy_1_layer_idx = hierarchy_1_layer_idx; ntt_task_coordinates.hierarchy_1_subntt_idx = hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck; ntt_task_coordinates.hierarchy_0_layer_idx = hierarchy_0_layer_idx; ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx; @@ -124,12 +125,20 @@ namespace ntt_cpu { ntt_task_coordinates.reorder = false; task.set_coordinates(&ntt_task_coordinates); task.execute(); + // ntt_task_coordinates.hierarchy_1_layer_idx = hierarchy_1_layer_idx; + // ntt_task_coordinates.hierarchy_1_subntt_idx = hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck; + // ntt_task_coordinates.hierarchy_0_layer_idx = hierarchy_0_layer_idx; + // ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx+1; + // ntt_task_coordinates.hierarchy_0_subntt_idx = hierarchy_0_subntt_idx; + // ntt_task_coordinates.reorder = false; + // task.set_coordinates(&ntt_task_coordinates); + // task.execute(); } } } if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // all ntt tasks in hierarchy 1 are pushed, now push reorder task so that the data // is in the correct order for the next hierarchy 1 layer - //openmp + #pragma omp parallel for for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { NttTask task; NttTaskCoordinates ntt_task_coordinates; @@ -290,6 +299,7 @@ namespace ntt_cpu { ntt_data.config.columns_batch ? ntt_data.elements + batch : ntt_data.elements + batch * ntt_data.size; E* cur_temp_elements = ntt_data.config.columns_batch ? temp_elements.get() + batch : temp_elements.get() + batch * ntt_data.size; + #pragma omp parallel for collapse(2) for (uint32_t sntt_idx = 0; sntt_idx < nof_sntts; sntt_idx++) { for (uint32_t elem = 0; elem < sntt_size; elem++) { cur_temp_elements[stride * (sntt_idx * sntt_size + elem)] = diff --git a/icicle/backend/cpu/include/ntt_data.h b/icicle/backend/cpu/include/ntt_data.h index 924325e3c..dde1a7ce5 100644 --- a/icicle/backend/cpu/include/ntt_data.h +++ b/icicle/backend/cpu/include/ntt_data.h @@ -10,7 +10,7 @@ #include #include -#define HIERARCHY_1 22 +#define HIERARCHY_1 25 namespace ntt_cpu { @@ -27,8 +27,8 @@ namespace ntt_cpu { constexpr uint32_t layers_sub_logn[31][3] = { {0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0}, {4, 0, 0}, {5, 0, 0}, {3, 3, 0}, {4, 3, 0}, {4, 4, 0}, {5, 4, 0}, {5, 5, 0}, {4, 4, 3}, {4, 4, 4}, {5, 4, 4}, {5, 5, 4}, {5, 5, 5}, - {5, 5, 6}, {5, 5, 7}, {5, 5, 8}, {5, 5, 9}, {5, 5, 10}, {5, 5, 11}, {5, 5, 12}, {12, 11, 0}, - {12, 12, 0}, {13, 12, 0}, {13, 13, 0}, {14, 13, 0}, {14, 14, 0}, {15, 14, 0}, {15, 15, 0}}; + {5, 5, 6}, {5, 5, 7}, {5, 8, 5}, {5, 5, 9}, {5, 5, 10}, {5, 5, 11}, {5, 5, 12}, {5, 5, 13}, + {5, 5, 14}, {5, 5, 15}, {13, 13, 0}, {14, 13, 0}, {14, 14, 0}, {15, 14, 0}, {15, 15, 0}}; /** * @brief Represents the log sizes of sub-NTTs in the NTT computation hierarchy. diff --git a/icicle/cmake/curve.cmake b/icicle/cmake/curve.cmake index c82d1b90b..e14d8d2b8 100644 --- a/icicle/cmake/curve.cmake +++ b/icicle/cmake/curve.cmake @@ -58,7 +58,7 @@ function(setup_curve_target CURVE CURVE_INDEX FEATURES_STRING) # Add additional feature handling calls here set_target_properties(icicle_curve PROPERTIES OUTPUT_NAME "icicle_curve_${CURVE}") - target_link_libraries(icicle_curve PUBLIC icicle_device icicle_field pthread) + target_link_libraries(icicle_curve PUBLIC icicle_device icicle_field pthread OpenMP::OpenMP_CXX) # Ensure CURVE is defined in the cache for backends to see set(CURVE "${CURVE}" CACHE STRING "") diff --git a/icicle/cmake/field.cmake b/icicle/cmake/field.cmake index 953c0d6fc..3c263d97b 100644 --- a/icicle/cmake/field.cmake +++ b/icicle/cmake/field.cmake @@ -56,7 +56,7 @@ function(setup_field_target FIELD FIELD_INDEX FEATURES_STRING) # Add additional feature handling calls here set_target_properties(icicle_field PROPERTIES OUTPUT_NAME "icicle_field_${FIELD}") - target_link_libraries(icicle_field PUBLIC icicle_device pthread) + target_link_libraries(icicle_field PUBLIC icicle_device pthread OpenMP::OpenMP_CXX) # Ensure FIELD is defined in the cache for backends to see set(FIELD "${FIELD}" CACHE STRING "") diff --git a/icicle/cmake/hash.cmake b/icicle/cmake/hash.cmake index 6d43c3e03..df18e8eae 100644 --- a/icicle/cmake/hash.cmake +++ b/icicle/cmake/hash.cmake @@ -10,7 +10,7 @@ function(setup_hash_target) src/hash/merkle_c_api.cpp ) - target_link_libraries(icicle_hash PUBLIC icicle_device) + target_link_libraries(icicle_hash PUBLIC icicle_device OpenMP::OpenMP_CXX) install(TARGETS icicle_hash RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/" diff --git a/icicle/tests/CMakeLists.txt b/icicle/tests/CMakeLists.txt index 066fe0803..1b1c0d6ba 100644 --- a/icicle/tests/CMakeLists.txt +++ b/icicle/tests/CMakeLists.txt @@ -25,20 +25,20 @@ enable_testing() # device API test add_executable(test_device_api test_device_api.cpp) target_include_directories(test_device_api PRIVATE ${CMAKE_SOURCE_DIR}/include/) -target_link_libraries(test_device_api PRIVATE GTest::gtest_main icicle_device) +target_link_libraries(test_device_api PRIVATE GTest::gtest_main icicle_device OpenMP::OpenMP_CXX) gtest_discover_tests(test_device_api) #field API test if (FIELD) add_executable(test_field_api test_field_api.cpp) target_include_directories(test_field_api PRIVATE ${CMAKE_SOURCE_DIR}/include/) - target_link_libraries(test_field_api PRIVATE GTest::gtest_main icicle_device icicle_field) + target_link_libraries(test_field_api PRIVATE GTest::gtest_main icicle_device icicle_field OpenMP::OpenMP_CXX) gtest_discover_tests(test_field_api) if (NTT) add_executable(test_polynomial_api test_polynomial_api.cpp) target_include_directories(test_polynomial_api PRIVATE ${CMAKE_SOURCE_DIR}/include/) - target_link_libraries(test_polynomial_api PRIVATE GTest::gtest_main icicle_field) + target_link_libraries(test_polynomial_api PRIVATE GTest::gtest_main icicle_field OpenMP::OpenMP_CXX) gtest_discover_tests(test_polynomial_api) endif() endif() @@ -47,11 +47,11 @@ endif() if (CURVE) add_executable(test_curve_api test_curve_api.cpp) target_include_directories(test_curve_api PRIVATE ${CMAKE_SOURCE_DIR}/include/) - target_link_libraries(test_curve_api PRIVATE GTest::gtest_main icicle_device icicle_field icicle_curve) + target_link_libraries(test_curve_api PRIVATE GTest::gtest_main icicle_device icicle_field icicle_curve OpenMP::OpenMP_CXX) gtest_discover_tests(test_curve_api) if (NTT) - target_link_libraries(test_polynomial_api PRIVATE GTest::gtest_main icicle_curve) + target_link_libraries(test_polynomial_api PRIVATE GTest::gtest_main icicle_curve OpenMP::OpenMP_CXX) endif() endif() @@ -59,12 +59,12 @@ endif() if (HASH) add_executable(test_hash_api test_hash_api.cpp) target_include_directories(test_hash_api PRIVATE ${CMAKE_SOURCE_DIR}/include/) - target_link_libraries(test_hash_api PRIVATE GTest::gtest_main icicle_device icicle_hash) + target_link_libraries(test_hash_api PRIVATE GTest::gtest_main icicle_device icicle_hash OpenMP::OpenMP_CXX) gtest_discover_tests(test_hash_api) if (POSEIDON AND (FIELD OR CURVE)) - target_link_libraries(test_hash_api PRIVATE icicle_field) + target_link_libraries(test_hash_api PRIVATE icicle_field OpenMP::OpenMP_CXX) endif() if (POSEIDON2 AND (FIELD OR CURVE)) - target_link_libraries(test_hash_api PRIVATE icicle_field) + target_link_libraries(test_hash_api PRIVATE icicle_field OpenMP::OpenMP_CXX) endif() endif() From 00ac6d41b85dc80102021d3c4c6e39a413c96ede Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Wed, 18 Dec 2024 18:22:24 +0200 Subject: [PATCH 03/14] =?UTF-8?q?separate=20code=20for=20 task; task.set_data(ntt_data); task.execute(); + } else if (__builtin_expect((ntt_data.logn <= HIERARCHY_1),1)){ + uint32_t nof_hierarchy_0_layers = (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][2] != 0) ? 3 : (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][1] != 0) ? 2 : 1; + for (uint32_t hierarchy_0_layer_idx = 0; hierarchy_0_layer_idx < nof_hierarchy_0_layers; hierarchy_0_layer_idx++) { + uint64_t nof_blocks; + uint64_t nof_subntts; + if (hierarchy_0_layer_idx == 0) { + nof_blocks = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][2]; + nof_subntts = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][1]; + } else if (hierarchy_0_layer_idx == 1) { + nof_blocks = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][2]; + nof_subntts = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][0]; + } else { + nof_blocks = 1 << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][0] + ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][1]); + nof_subntts = 1; + } + #pragma omp parallel for collapse(2) schedule(dynamic) + for (uint32_t hierarchy_0_block_idx = 0; hierarchy_0_block_idx < (nof_blocks); hierarchy_0_block_idx++) { + for (uint32_t hierarchy_0_subntt_idx = 0; hierarchy_0_subntt_idx < (nof_subntts); hierarchy_0_subntt_idx++) { + NttTask task; + NttTaskCoordinates ntt_task_coordinates; + task.set_data(ntt_data); + ntt_task_coordinates.hierarchy_1_layer_idx = 0; + ntt_task_coordinates.hierarchy_1_subntt_idx = 0; + ntt_task_coordinates.hierarchy_0_layer_idx = hierarchy_0_layer_idx; + ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx; + ntt_task_coordinates.hierarchy_0_subntt_idx = hierarchy_0_subntt_idx; + ntt_task_coordinates.reorder = false; + task.set_coordinates(&ntt_task_coordinates); + task.execute(); + // ntt_task_coordinates.hierarchy_1_layer_idx = 0; + // ntt_task_coordinates.hierarchy_1_subntt_idx = 0; + // ntt_task_coordinates.hierarchy_0_layer_idx = hierarchy_0_layer_idx; + // ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx+1; + // ntt_task_coordinates.hierarchy_0_subntt_idx = hierarchy_0_subntt_idx; + // ntt_task_coordinates.reorder = false; + // task.set_coordinates(&ntt_task_coordinates); + // task.execute(); + } + } + if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // all ntt tasks in hierarchy 1 are pushed, now push reorder task so that the data + // is in the correct order for the next hierarchy 1 layer + NttTask task; + NttTaskCoordinates ntt_task_coordinates; + task.set_data(ntt_data); + ntt_task_coordinates.hierarchy_1_layer_idx = 0; + ntt_task_coordinates.hierarchy_1_subntt_idx = 0; + ntt_task_coordinates.hierarchy_0_layer_idx = nof_hierarchy_0_layers; + ntt_task_coordinates.hierarchy_0_block_idx = 0; + ntt_task_coordinates.hierarchy_0_subntt_idx = 0; + ntt_task_coordinates.reorder = true; + task.set_coordinates(&ntt_task_coordinates); + task.execute(); + } + } } else { - uint32_t nof_hierarchy_1_layers = ntt_data.logn > HIERARCHY_1? 2 : 1; - for (uint32_t hierarchy_1_layer_idx = 0; hierarchy_1_layer_idx < nof_hierarchy_1_layers; hierarchy_1_layer_idx++) { + for (uint32_t hierarchy_1_layer_idx = 0; hierarchy_1_layer_idx < 2; hierarchy_1_layer_idx++) { const uint32_t sunbtt_plus_batch_logn = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[hierarchy_1_layer_idx] + uint32_t(log2(ntt_data.config.batch_size)); - const uint32_t log_nof_hierarchy_1_subntts_todo_in_parallel = ((sunbtt_plus_batch_logn < HIERARCHY_1) && (ntt_data.logn > HIERARCHY_1)) ? HIERARCHY_1 - sunbtt_plus_batch_logn : 0; + const uint32_t log_nof_hierarchy_1_subntts_todo_in_parallel = (sunbtt_plus_batch_logn < HIERARCHY_1) ? HIERARCHY_1 - sunbtt_plus_batch_logn : 0; const uint32_t nof_hierarchy_1_subntts_todo_in_parallel = 1 << log_nof_hierarchy_1_subntts_todo_in_parallel; - const uint32_t log_nof_subntts_chunks = (ntt_data.logn > HIERARCHY_1) ? ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1 - hierarchy_1_layer_idx] - log_nof_hierarchy_1_subntts_todo_in_parallel : 0; + const uint32_t log_nof_subntts_chunks = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1 - hierarchy_1_layer_idx] - log_nof_hierarchy_1_subntts_todo_in_parallel; const uint32_t nof_subntts_chunks = 1 << log_nof_subntts_chunks; for (uint32_t hierarchy_1_subntts_chunck_idx = 0; hierarchy_1_subntts_chunck_idx < nof_subntts_chunks; hierarchy_1_subntts_chunck_idx++) { @@ -110,9 +163,9 @@ namespace ntt_cpu { nof_blocks = 1 << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0] + ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]); nof_subntts = 1; } + #pragma omp parallel for collapse(3) schedule(dynamic, 512) for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { - #pragma omp parallel for collapse(2) schedule(dynamic) - for (uint32_t hierarchy_0_block_idx = 0; hierarchy_0_block_idx < (nof_blocks); hierarchy_0_block_idx++) { + for (uint32_t hierarchy_0_block_idx = 0; hierarchy_0_block_idx < (nof_blocks); hierarchy_0_block_idx+=2) { for (uint32_t hierarchy_0_subntt_idx = 0; hierarchy_0_subntt_idx < (nof_subntts); hierarchy_0_subntt_idx++) { NttTask task; NttTaskCoordinates ntt_task_coordinates; @@ -125,14 +178,14 @@ namespace ntt_cpu { ntt_task_coordinates.reorder = false; task.set_coordinates(&ntt_task_coordinates); task.execute(); - // ntt_task_coordinates.hierarchy_1_layer_idx = hierarchy_1_layer_idx; - // ntt_task_coordinates.hierarchy_1_subntt_idx = hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck; - // ntt_task_coordinates.hierarchy_0_layer_idx = hierarchy_0_layer_idx; - // ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx+1; - // ntt_task_coordinates.hierarchy_0_subntt_idx = hierarchy_0_subntt_idx; - // ntt_task_coordinates.reorder = false; - // task.set_coordinates(&ntt_task_coordinates); - // task.execute(); + ntt_task_coordinates.hierarchy_1_layer_idx = hierarchy_1_layer_idx; + ntt_task_coordinates.hierarchy_1_subntt_idx = hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck; + ntt_task_coordinates.hierarchy_0_layer_idx = hierarchy_0_layer_idx; + ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx+1; + ntt_task_coordinates.hierarchy_0_subntt_idx = hierarchy_0_subntt_idx; + ntt_task_coordinates.reorder = false; + task.set_coordinates(&ntt_task_coordinates); + task.execute(); } } } @@ -155,9 +208,9 @@ namespace ntt_cpu { } } } - if ((ntt_data.logn > HIERARCHY_1) && (hierarchy_1_layer_idx == 0)) { hierarchy_1_reorder(); } + if (hierarchy_1_layer_idx == 0) { hierarchy_1_reorder(); } } - if (ntt_data.logn > HIERARCHY_1){ reorder_output(); } + reorder_output(); } if (ntt_data.direction == NTTDir::kInverse && ntt_data.config.coset_gen != S::one()) { coset_mul(); } From f46e3c92896bb7f61f108d0b250c0a6d75b3fa87 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Thu, 19 Dec 2024 14:57:10 +0200 Subject: [PATCH 04/14] Removed redundant code such as ntt_tasks_manager. NttTask no longer inherits from TaskBase, simplifying the code structure. --- icicle/backend/cpu/include/ntt_cpu.h | 169 +------ icicle/backend/cpu/include/ntt_data.h | 2 +- icicle/backend/cpu/include/ntt_task.h | 190 ++++---- .../backend/cpu/include/ntt_tasks_manager.h | 418 ------------------ icicle/backend/cpu/include/ntt_utils.h | 22 +- 5 files changed, 125 insertions(+), 676 deletions(-) delete mode 100644 icicle/backend/cpu/include/ntt_tasks_manager.h diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index 1196a9e19..c825b1069 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -1,7 +1,7 @@ #pragma once #include "icicle/errors.h" #include "icicle/utils/log.h" -#include "ntt_tasks_manager.h" +#include "ntt_task.h" #include "ntt_utils.h" #include #include @@ -42,18 +42,12 @@ namespace ntt_cpu { const E* input; NttData ntt_data; - // Parallel-specific members - std::optional> ntt_tasks_manager; - std::unique_ptr>> tasks_manager; - bool compute_if_is_parallel(uint32_t logn, const NTTConfig& config); void coset_mul(); void reorder_by_bit_reverse(); void copy_and_reorder_if_needed(const E* input, E* output); // Parallel-specific methods - eIcicleError hierarchy1_push_tasks(uint32_t hierarchy_1_layer_idx, uint32_t hierarchy_1_subntt_idx); - eIcicleError handle_pushed_tasks(uint32_t hierarchy_1_layer_idx); void hierarchy_1_reorder(); eIcicleError reorder_output(); @@ -66,13 +60,7 @@ namespace ntt_cpu { */ template NttCpu::NttCpu(uint32_t logn, NTTDir direction, const NTTConfig& config, const E* input, E* output) - : input(input), ntt_data(logn, output, config, direction, compute_if_is_parallel(logn, config)), - ntt_tasks_manager( - ntt_data.is_parallel ? std::optional>(std::in_place, ntt_data.ntt_sub_hierarchies, logn) - : std::nullopt), - tasks_manager( - ntt_data.is_parallel ? std::make_unique>>(std::thread::hardware_concurrency() - 1) - : nullptr) + : input(input), ntt_data(logn, output, config, direction, compute_if_is_parallel(logn, config)) { } @@ -82,8 +70,8 @@ namespace ntt_cpu { copy_and_reorder_if_needed(input, ntt_data.elements); if (!ntt_data.is_parallel) { if (ntt_data.direction == NTTDir::kForward && ntt_data.config.coset_gen != S::one()) { coset_mul(); } - NttTask task; - task.set_data(ntt_data); + NttTaskCoordinates ntt_task_coordinates(0, 0, 0, 0, 0, false); + NttTask task(ntt_task_coordinates, ntt_data); task.execute(); } else if (__builtin_expect((ntt_data.logn <= HIERARCHY_1),1)){ uint32_t nof_hierarchy_0_layers = (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][2] != 0) ? 3 : (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][1] != 0) ? 2 : 1; @@ -103,39 +91,15 @@ namespace ntt_cpu { #pragma omp parallel for collapse(2) schedule(dynamic) for (uint32_t hierarchy_0_block_idx = 0; hierarchy_0_block_idx < (nof_blocks); hierarchy_0_block_idx++) { for (uint32_t hierarchy_0_subntt_idx = 0; hierarchy_0_subntt_idx < (nof_subntts); hierarchy_0_subntt_idx++) { - NttTask task; - NttTaskCoordinates ntt_task_coordinates; - task.set_data(ntt_data); - ntt_task_coordinates.hierarchy_1_layer_idx = 0; - ntt_task_coordinates.hierarchy_1_subntt_idx = 0; - ntt_task_coordinates.hierarchy_0_layer_idx = hierarchy_0_layer_idx; - ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx; - ntt_task_coordinates.hierarchy_0_subntt_idx = hierarchy_0_subntt_idx; - ntt_task_coordinates.reorder = false; - task.set_coordinates(&ntt_task_coordinates); + NttTaskCoordinates ntt_task_coordinates(0, 0, hierarchy_0_layer_idx, hierarchy_0_block_idx, hierarchy_0_subntt_idx, false); + NttTask task(ntt_task_coordinates, ntt_data); task.execute(); - // ntt_task_coordinates.hierarchy_1_layer_idx = 0; - // ntt_task_coordinates.hierarchy_1_subntt_idx = 0; - // ntt_task_coordinates.hierarchy_0_layer_idx = hierarchy_0_layer_idx; - // ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx+1; - // ntt_task_coordinates.hierarchy_0_subntt_idx = hierarchy_0_subntt_idx; - // ntt_task_coordinates.reorder = false; - // task.set_coordinates(&ntt_task_coordinates); - // task.execute(); } } if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // all ntt tasks in hierarchy 1 are pushed, now push reorder task so that the data // is in the correct order for the next hierarchy 1 layer - NttTask task; - NttTaskCoordinates ntt_task_coordinates; - task.set_data(ntt_data); - ntt_task_coordinates.hierarchy_1_layer_idx = 0; - ntt_task_coordinates.hierarchy_1_subntt_idx = 0; - ntt_task_coordinates.hierarchy_0_layer_idx = nof_hierarchy_0_layers; - ntt_task_coordinates.hierarchy_0_block_idx = 0; - ntt_task_coordinates.hierarchy_0_subntt_idx = 0; - ntt_task_coordinates.reorder = true; - task.set_coordinates(&ntt_task_coordinates); + NttTaskCoordinates ntt_task_coordinates(0, 0, hierarchy_0_layer_idx, 0, 0, true); + NttTask task(ntt_task_coordinates, ntt_data); task.execute(); } } @@ -167,25 +131,13 @@ namespace ntt_cpu { for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { for (uint32_t hierarchy_0_block_idx = 0; hierarchy_0_block_idx < (nof_blocks); hierarchy_0_block_idx+=2) { for (uint32_t hierarchy_0_subntt_idx = 0; hierarchy_0_subntt_idx < (nof_subntts); hierarchy_0_subntt_idx++) { - NttTask task; - NttTaskCoordinates ntt_task_coordinates; - task.set_data(ntt_data); - ntt_task_coordinates.hierarchy_1_layer_idx = hierarchy_1_layer_idx; - ntt_task_coordinates.hierarchy_1_subntt_idx = hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck; - ntt_task_coordinates.hierarchy_0_layer_idx = hierarchy_0_layer_idx; - ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx; - ntt_task_coordinates.hierarchy_0_subntt_idx = hierarchy_0_subntt_idx; - ntt_task_coordinates.reorder = false; - task.set_coordinates(&ntt_task_coordinates); + NttTaskCoordinates ntt_task_coordinates(hierarchy_1_layer_idx, hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck, hierarchy_0_layer_idx, hierarchy_0_block_idx, hierarchy_0_subntt_idx, false); + NttTask task(ntt_task_coordinates, ntt_data); task.execute(); - ntt_task_coordinates.hierarchy_1_layer_idx = hierarchy_1_layer_idx; - ntt_task_coordinates.hierarchy_1_subntt_idx = hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck; - ntt_task_coordinates.hierarchy_0_layer_idx = hierarchy_0_layer_idx; ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx+1; - ntt_task_coordinates.hierarchy_0_subntt_idx = hierarchy_0_subntt_idx; - ntt_task_coordinates.reorder = false; - task.set_coordinates(&ntt_task_coordinates); - task.execute(); + // task.set_coordinates(ntt_task_coordinates); + NttTask task_with_elements_in_the_same_cachline(ntt_task_coordinates, ntt_data); + task_with_elements_in_the_same_cachline.execute(); } } } @@ -193,16 +145,8 @@ namespace ntt_cpu { // is in the correct order for the next hierarchy 1 layer #pragma omp parallel for for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { - NttTask task; - NttTaskCoordinates ntt_task_coordinates; - task.set_data(ntt_data); - ntt_task_coordinates.hierarchy_1_layer_idx = hierarchy_1_layer_idx; - ntt_task_coordinates.hierarchy_1_subntt_idx = hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck; - ntt_task_coordinates.hierarchy_0_layer_idx = nof_hierarchy_0_layers; - ntt_task_coordinates.hierarchy_0_block_idx = 0; - ntt_task_coordinates.hierarchy_0_subntt_idx = 0; - ntt_task_coordinates.reorder = true; - task.set_coordinates(&ntt_task_coordinates); + NttTaskCoordinates ntt_task_coordinates(hierarchy_1_layer_idx, hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck, nof_hierarchy_0_layers, 0, 0, true); + NttTask task(ntt_task_coordinates, ntt_data); task.execute(); } } @@ -432,89 +376,6 @@ namespace ntt_cpu { } } } - - /** - * @brief Schedules tasks for the first hierarchy layer of the NTT computation. - * - * This function organizes and pushes tasks corresponding to a specific hierarchy 1 layer - * and sub-NTT index into the task manager. It calculates the number of blocks and sub-NTTs - * based on the layer indices and logs, then schedules tasks accordingly. If multiple hierarchy 0 - * layers are involved, it also schedules a reorder task after processing. - * - * @param hierarchy_1_layer_idx Index of the current hierarchy 1 layer. - * @param hierarchy_1_subntt_idx Index of the sub-NTT within the hierarchy 1 layer. - * @return eIcicleError Returns SUCCESS if tasks are successfully scheduled, or an error code otherwise. - */ - template - eIcicleError NttCpu::hierarchy1_push_tasks(uint32_t hierarchy_1_layer_idx, uint32_t hierarchy_1_subntt_idx) - { - if (!ntt_tasks_manager) { - return eIcicleError::UNKNOWN_ERROR; // Handle case where no task manager is available - } - - return eIcicleError::SUCCESS; - } - - /** - * @brief Manages the execution and completion of scheduled tasks. - * - * This function handles the lifecycle of tasks at a given hierarchy level. It retrieves - * available tasks from the task manager, dispatches them for execution, and processes - * their completion. The function ensures that all tasks are executed and dependencies - * are correctly managed, including idle states for waiting tasks. - * - * @param hierarchy_1_layer_idx Index of the current hierarchy 1 layer being processed. - * @return eIcicleError Returns SUCCESS if all tasks are successfully handled, or an error code otherwise. - */ - template - eIcicleError NttCpu::handle_pushed_tasks(uint32_t hierarchy_1_layer_idx) - { - if (!ntt_tasks_manager) { return eIcicleError::UNKNOWN_ERROR; } - - NttTask* task_slot = nullptr; - // std::deque completed_tasks_list; - - uint32_t nof_subntts_l1 = 1 - << ((ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0]) + - (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1])); - while (ntt_tasks_manager->tasks_to_do()) { - // There are tasks that are available or waiting - - if (ntt_tasks_manager->available_tasks()) { - // Task is available to dispatch - task_slot = tasks_manager->get_idle_or_completed_task(); - if (task_slot->is_completed()) { - if (ntt_tasks_manager->handle_completed(task_slot, nof_subntts_l1)) { continue; } - } else { - task_slot->set_data(ntt_data); - } - NttTaskCoordinates next_task_c_ptr = ntt_tasks_manager->get_available_task(); - task_slot->set_coordinates(next_task_c_ptr); - task_slot->dispatch(); - } else { - // Wait for available tasks - task_slot = tasks_manager->get_completed_task(); - if (ntt_tasks_manager->handle_completed(task_slot, nof_subntts_l1)) { continue; } - if (ntt_tasks_manager->available_tasks()) { - NttTaskCoordinates next_task_c_ptr = ntt_tasks_manager->get_available_task(); - task_slot->set_coordinates(next_task_c_ptr); - task_slot->dispatch(); - } else { - task_slot->set_idle(); - } - } - } - while (true) { - task_slot = tasks_manager->get_completed_task(); - if (task_slot == nullptr) { - break; - } else { - task_slot->set_idle(); - } - } - return eIcicleError::SUCCESS; - } - /** * @brief Determines if the NTT computation should be parallelized. * diff --git a/icicle/backend/cpu/include/ntt_data.h b/icicle/backend/cpu/include/ntt_data.h index dde1a7ce5..7bd0ea27f 100644 --- a/icicle/backend/cpu/include/ntt_data.h +++ b/icicle/backend/cpu/include/ntt_data.h @@ -27,7 +27,7 @@ namespace ntt_cpu { constexpr uint32_t layers_sub_logn[31][3] = { {0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0}, {4, 0, 0}, {5, 0, 0}, {3, 3, 0}, {4, 3, 0}, {4, 4, 0}, {5, 4, 0}, {5, 5, 0}, {4, 4, 3}, {4, 4, 4}, {5, 4, 4}, {5, 5, 4}, {5, 5, 5}, - {5, 5, 6}, {5, 5, 7}, {5, 8, 5}, {5, 5, 9}, {5, 5, 10}, {5, 5, 11}, {5, 5, 12}, {5, 5, 13}, + {5, 5, 6}, {5, 5, 7}, {5, 5, 8}, {5, 5, 9}, {5, 5, 10}, {5, 5, 11}, {5, 5, 12}, {5, 5, 13}, {5, 5, 14}, {5, 5, 15}, {13, 13, 0}, {14, 13, 0}, {14, 14, 0}, {15, 14, 0}, {15, 15, 0}}; /** diff --git a/icicle/backend/cpu/include/ntt_task.h b/icicle/backend/cpu/include/ntt_task.h index c0157ffa4..b3483b098 100644 --- a/icicle/backend/cpu/include/ntt_task.h +++ b/icicle/backend/cpu/include/ntt_task.h @@ -17,34 +17,20 @@ namespace ntt_cpu { * for a given sub-NTT or reordering the output if required. * * @method void execute() Executes the task, either performing the NTT computation or reordering the output. - * @method NttTaskCoordinates get_coordinates() const Returns the task's coordinates. * @method void set_coordinates(NttTaskParams params) Sets the task parameters. * @method void set_data(NttData& data) Sets the NTT data for the task. */ template - class NttTask : public TaskBase + class NttTask { public: - NttTask() : ntt_data(nullptr) {} - + NttTask(const NttTaskCoordinates coords, NttData& data) + : ntt_task_coordinates(coords), ntt_data(&data) + {} void execute(); - NttTaskCoordinates* get_coordinates() const - { - return ntt_task_coordinates; - } // Returns a pointer to the `NttTaskCoordinates` structure that specifies the task's position within the NTT - // computation hierarchy. - void set_coordinates(NttTaskCoordinates* task_c_ptr) - { - ntt_task_coordinates = task_c_ptr; - } // Assigns a pointer to a `NttTaskCoordinates` structure, which specifies the task's position within the NTT - // computation hierarchy. - void set_data(NttData& data) - { - ntt_data = &data; - } // Sets the `NttData` structure that contains all necessary data and configurations required. private: - NttTaskCoordinates* ntt_task_coordinates = nullptr; + NttTaskCoordinates ntt_task_coordinates; NttData* ntt_data = nullptr; eIcicleError reorder_and_refactor_if_needed(); void apply_coset_multiplication(E* current_elements, const std::vector& index_in_mem, const S* twiddles); @@ -57,7 +43,7 @@ namespace ntt_cpu { void reorder_by_bit_reverse(); void reorder_by_bit_reverse_non_parallel(); void refactor_output_hierarchy_0(); - uint64_t idx_in_mem(NttTaskCoordinates* ntt_task_coordinates, uint32_t element); + uint64_t idx_in_mem(NttTaskCoordinates ntt_task_coordinates, uint32_t element); }; //////////////////////////// NttTask Implementation //////////////////////////// @@ -73,7 +59,7 @@ namespace ntt_cpu { template void NttTask::execute() { - if (!ntt_data->is_parallel || !ntt_task_coordinates->reorder) { + if (!ntt_data->is_parallel || !ntt_task_coordinates.reorder) { hierarchy_0_cpu_ntt(); } else { // if all hierarchy_0_subntts are done, and at least 2 layers in hierarchy 0 - reorder the subntt's output @@ -104,22 +90,22 @@ namespace ntt_cpu { E* elements = ntt_data->elements + row_batch * ntt_data->size; bool is_only_hierarchy_0 = ntt_data->ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[0] == 0; const bool refactor_pre_hierarchy_1_next_layer = - (!is_only_hierarchy_0) && (ntt_task_coordinates->hierarchy_1_layer_idx == 0); + (!is_only_hierarchy_0) && (ntt_task_coordinates.hierarchy_1_layer_idx == 0); uint64_t size = (is_only_hierarchy_0) ? ntt_data->size - : 1 << ntt_data->ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx]; + : 1 << ntt_data->ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]; uint64_t temp_output_size = ntt_data->config.columns_batch ? size * ntt_data->config.batch_size : size; auto temp_output = std::make_unique(temp_output_size); uint64_t new_idx = 0; uint32_t subntt_idx; uint32_t element; uint32_t s0 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][0]; + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][0]; uint32_t s1 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][1]; + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][1]; uint32_t s2 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][2]; + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][2]; uint32_t p0, p1, p2; const uint32_t stride = ntt_data->config.columns_batch ? ntt_data->config.batch_size : 1; uint32_t rep = ntt_data->config.columns_batch ? ntt_data->config.batch_size : 1; @@ -127,9 +113,9 @@ namespace ntt_cpu { const S* twiddles = CpuNttDomain::s_ntt_domain.get_twiddles(); E* hierarchy_1_subntt_output = elements + - stride * (ntt_task_coordinates->hierarchy_1_subntt_idx + stride * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies - .hierarchy_1_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx]); // input + subntt_idx + .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx // * subntt_size for (uint32_t col_batch = 0; col_batch < columns_batch_reps; ++col_batch) { E* current_elements = @@ -151,10 +137,10 @@ namespace ntt_cpu { if (refactor_pre_hierarchy_1_next_layer) { tw_idx = (ntt_data->direction == NTTDir::kForward) ? ((CpuNttDomain::s_ntt_domain.get_max_size() >> ntt_data->logn) * - ntt_task_coordinates->hierarchy_1_subntt_idx * new_idx) + ntt_task_coordinates.hierarchy_1_subntt_idx * new_idx) : CpuNttDomain::s_ntt_domain.get_max_size() - ((CpuNttDomain::s_ntt_domain.get_max_size() >> ntt_data->logn) * - ntt_task_coordinates->hierarchy_1_subntt_idx * new_idx); + ntt_task_coordinates.hierarchy_1_subntt_idx * new_idx); current_temp_output[stride * new_idx] = current_elements[stride * i] * twiddles[tw_idx]; } else { current_temp_output[stride * new_idx] = current_elements[stride * i]; @@ -187,15 +173,15 @@ namespace ntt_cpu { E* current_elements, const std::vector& index_in_mem, const S* twiddles) { uint32_t current_subntt_size = - 1 << ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx] - [ntt_task_coordinates->hierarchy_0_layer_idx]; + 1 << ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] + [ntt_task_coordinates.hierarchy_0_layer_idx]; uint32_t subntt_idx; uint32_t s0 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][0]; + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][0]; uint32_t s1 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][1]; + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][1]; uint32_t s2 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][2]; + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][2]; uint32_t p0, p1, p2; for (uint32_t i = 0; i < current_subntt_size; i++) { uint64_t new_idx = i; @@ -239,8 +225,8 @@ namespace ntt_cpu { { const uint32_t subntt_size_log = ntt_data->is_parallel - ? ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx] - [ntt_task_coordinates->hierarchy_0_layer_idx] + ? ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] + [ntt_task_coordinates.hierarchy_0_layer_idx] : ntt_data->logn; switch (subntt_size_log) { case 3: @@ -259,9 +245,9 @@ namespace ntt_cpu { } if ( - ntt_data->is_parallel && ntt_task_coordinates->hierarchy_0_layer_idx != 2 && - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx] - [ntt_task_coordinates->hierarchy_0_layer_idx + 1] != 0) { + ntt_data->is_parallel && ntt_task_coordinates.hierarchy_0_layer_idx != 2 && + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] + [ntt_task_coordinates.hierarchy_0_layer_idx + 1] != 0) { refactor_output_hierarchy_0(); } return eIcicleError::SUCCESS; @@ -296,21 +282,21 @@ namespace ntt_cpu { index_in_mem[i] = stride * idx_in_mem(ntt_task_coordinates, i); } last_layer = - (ntt_task_coordinates->hierarchy_1_layer_idx == 1 || + (ntt_task_coordinates.hierarchy_1_layer_idx == 1 || (ntt_data->ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1] == 0)) && - (ntt_task_coordinates->hierarchy_0_layer_idx == 2 || - (ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx] - [ntt_task_coordinates->hierarchy_0_layer_idx + 1] == + (ntt_task_coordinates.hierarchy_0_layer_idx == 2 || + (ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] + [ntt_task_coordinates.hierarchy_0_layer_idx + 1] == 0)); const bool first_layer = - ntt_task_coordinates->hierarchy_1_layer_idx == 0 && ntt_task_coordinates->hierarchy_0_layer_idx == 0; + ntt_task_coordinates.hierarchy_1_layer_idx == 0 && ntt_task_coordinates.hierarchy_0_layer_idx == 0; need_to_apply_coset_multiplication = first_layer && ntt_data->config.coset_gen != S::one() && ntt_data->direction == NTTDir::kForward; subntt_elements = ntt_data->elements + - offset * (ntt_task_coordinates->hierarchy_1_subntt_idx + offset * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies - .hierarchy_1_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx]); // input + subntt_idx + .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx // * // subntt_size } else { subntt_elements = ntt_data->elements; @@ -402,21 +388,21 @@ namespace ntt_cpu { index_in_mem[i] = stride * idx_in_mem(ntt_task_coordinates, i); } last_layer = - (ntt_task_coordinates->hierarchy_1_layer_idx == 1 || + (ntt_task_coordinates.hierarchy_1_layer_idx == 1 || (ntt_data->ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1] == 0)) && - (ntt_task_coordinates->hierarchy_0_layer_idx == 2 || - (ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx] - [ntt_task_coordinates->hierarchy_0_layer_idx + 1] == + (ntt_task_coordinates.hierarchy_0_layer_idx == 2 || + (ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] + [ntt_task_coordinates.hierarchy_0_layer_idx + 1] == 0)); const bool first_layer = - ntt_task_coordinates->hierarchy_1_layer_idx == 0 && ntt_task_coordinates->hierarchy_0_layer_idx == 0; + ntt_task_coordinates.hierarchy_1_layer_idx == 0 && ntt_task_coordinates.hierarchy_0_layer_idx == 0; need_to_apply_coset_multiplication = first_layer && ntt_data->config.coset_gen != S::one() && ntt_data->direction == NTTDir::kForward; subntt_elements = ntt_data->elements + - offset * (ntt_task_coordinates->hierarchy_1_subntt_idx + offset * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies - .hierarchy_1_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx]); // input + subntt_idx + .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx // * // subntt_size } else { subntt_elements = ntt_data->elements; @@ -595,21 +581,21 @@ namespace ntt_cpu { index_in_mem[i] = stride * idx_in_mem(ntt_task_coordinates, i); } last_layer = - (ntt_task_coordinates->hierarchy_1_layer_idx == 1 || + (ntt_task_coordinates.hierarchy_1_layer_idx == 1 || (ntt_data->ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1] == 0)) && - (ntt_task_coordinates->hierarchy_0_layer_idx == 2 || - (ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx] - [ntt_task_coordinates->hierarchy_0_layer_idx + 1] == + (ntt_task_coordinates.hierarchy_0_layer_idx == 2 || + (ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] + [ntt_task_coordinates.hierarchy_0_layer_idx + 1] == 0)); const bool first_layer = - ntt_task_coordinates->hierarchy_1_layer_idx == 0 && ntt_task_coordinates->hierarchy_0_layer_idx == 0; + ntt_task_coordinates.hierarchy_1_layer_idx == 0 && ntt_task_coordinates.hierarchy_0_layer_idx == 0; need_to_apply_coset_multiplication = first_layer && ntt_data->config.coset_gen != S::one() && ntt_data->direction == NTTDir::kForward; subntt_elements = ntt_data->elements + - offset * (ntt_task_coordinates->hierarchy_1_subntt_idx + offset * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies - .hierarchy_1_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx]); // input + subntt_idx + .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx // * // subntt_size } else { subntt_elements = ntt_data->elements; @@ -1268,26 +1254,26 @@ namespace ntt_cpu { if (ntt_data->is_parallel) { last_layer = - (ntt_task_coordinates->hierarchy_1_layer_idx == 1 || + (ntt_task_coordinates.hierarchy_1_layer_idx == 1 || (ntt_data->ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1] == 0)) && - (ntt_task_coordinates->hierarchy_0_layer_idx == 2 || - (ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx] - [ntt_task_coordinates->hierarchy_0_layer_idx + 1] == + (ntt_task_coordinates.hierarchy_0_layer_idx == 2 || + (ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] + [ntt_task_coordinates.hierarchy_0_layer_idx + 1] == 0)); const bool first_layer = - ntt_task_coordinates->hierarchy_1_layer_idx == 0 && ntt_task_coordinates->hierarchy_0_layer_idx == 0; + ntt_task_coordinates.hierarchy_1_layer_idx == 0 && ntt_task_coordinates.hierarchy_0_layer_idx == 0; need_to_apply_coset_multiplication = first_layer && ntt_data->config.coset_gen != S::one() && ntt_data->direction == NTTDir::kForward; subntt_elements = ntt_data->elements + - offset * (ntt_task_coordinates->hierarchy_1_subntt_idx + offset * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies - .hierarchy_1_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx]); // input + subntt_idx + .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx // * subntt_size subntt_size_log = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx] - [ntt_task_coordinates->hierarchy_0_layer_idx]; + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] + [ntt_task_coordinates.hierarchy_0_layer_idx]; subntt_size = 1 << subntt_size_log; } else { subntt_elements = ntt_data->elements; @@ -1355,16 +1341,16 @@ namespace ntt_cpu { uint32_t offset = ntt_data->config.columns_batch ? ntt_data->config.batch_size : 1; E* subntt_elements = ntt_data->elements + - offset * (ntt_task_coordinates->hierarchy_1_subntt_idx + offset * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies - .hierarchy_1_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx]); // input + subntt_idx * + .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx * // subntt_size uint64_t subntt_size = - 1 << ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx] - [ntt_task_coordinates->hierarchy_0_layer_idx]; + 1 << ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] + [ntt_task_coordinates.hierarchy_0_layer_idx]; uint32_t subntt_log_size = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx] - [ntt_task_coordinates->hierarchy_0_layer_idx]; + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] + [ntt_task_coordinates.hierarchy_0_layer_idx]; uint64_t original_size = (1 << ntt_data->logn); uint32_t stride = ntt_data->config.columns_batch ? ntt_data->config.batch_size : 1; for (uint32_t batch = 0; batch < ntt_data->config.batch_size; ++batch) { @@ -1419,45 +1405,45 @@ namespace ntt_cpu { void NttTask::refactor_output_hierarchy_0() { uint32_t hierarchy_0_subntt_size = - 1 << ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx] - [ntt_task_coordinates->hierarchy_0_layer_idx]; + 1 << ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] + [ntt_task_coordinates.hierarchy_0_layer_idx]; uint32_t hierarchy_0_nof_subntts = 1 << ntt_data->ntt_sub_hierarchies - .hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][0]; // only relevant for layer 1 + .hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][0]; // only relevant for layer 1 uint32_t i, j, i_0; - uint32_t ntt_size = ntt_task_coordinates->hierarchy_0_layer_idx == 0 + uint32_t ntt_size = ntt_task_coordinates.hierarchy_0_layer_idx == 0 ? 1 << (ntt_data->ntt_sub_hierarchies - .hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][0] + + .hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][0] + ntt_data->ntt_sub_hierarchies - .hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][1]) + .hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][1]) : 1 << (ntt_data->ntt_sub_hierarchies - .hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][0] + + .hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][0] + ntt_data->ntt_sub_hierarchies - .hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][1] + + .hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][1] + ntt_data->ntt_sub_hierarchies - .hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][2]); + .hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][2]); uint32_t stride = ntt_data->config.columns_batch ? ntt_data->config.batch_size : 1; uint64_t original_size = (1 << ntt_data->logn); const S* twiddles = CpuNttDomain::s_ntt_domain.get_twiddles(); for (uint32_t batch = 0; batch < ntt_data->config.batch_size; ++batch) { E* hierarchy_1_subntt_elements = ntt_data->elements + - stride * (ntt_task_coordinates->hierarchy_1_subntt_idx + stride * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies - .hierarchy_1_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx]); // input + subntt_idx + .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx // * subntt_size E* elements_of_current_batch = ntt_data->config.columns_batch ? hierarchy_1_subntt_elements + batch : hierarchy_1_subntt_elements + batch * original_size; for (uint32_t elem = 0; elem < hierarchy_0_subntt_size; elem++) { uint64_t elem_mem_idx = stride * idx_in_mem(ntt_task_coordinates, elem); - i = (ntt_task_coordinates->hierarchy_0_layer_idx == 0) + i = (ntt_task_coordinates.hierarchy_0_layer_idx == 0) ? elem - : elem * hierarchy_0_nof_subntts + ntt_task_coordinates->hierarchy_0_subntt_idx; - j = (ntt_task_coordinates->hierarchy_0_layer_idx == 0) ? ntt_task_coordinates->hierarchy_0_subntt_idx - : ntt_task_coordinates->hierarchy_0_block_idx; + : elem * hierarchy_0_nof_subntts + ntt_task_coordinates.hierarchy_0_subntt_idx; + j = (ntt_task_coordinates.hierarchy_0_layer_idx == 0) ? ntt_task_coordinates.hierarchy_0_subntt_idx + : ntt_task_coordinates.hierarchy_0_block_idx; uint64_t tw_idx = (ntt_data->direction == NTTDir::kForward) ? ((CpuNttDomain::s_ntt_domain.get_max_size() / ntt_size) * j * i) : CpuNttDomain::s_ntt_domain.get_max_size() - @@ -1484,24 +1470,24 @@ namespace ntt_cpu { */ template - uint64_t NttTask::idx_in_mem(NttTaskCoordinates* ntt_task_coordinates, uint32_t element_idx) + uint64_t NttTask::idx_in_mem(NttTaskCoordinates ntt_task_coordinates, uint32_t element_idx) { uint32_t s0 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][0]; + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][0]; uint32_t s1 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][1]; + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][1]; uint32_t s2 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates->hierarchy_1_layer_idx][2]; - switch (ntt_task_coordinates->hierarchy_0_layer_idx) { + ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][2]; + switch (ntt_task_coordinates.hierarchy_0_layer_idx) { case 0: - return ntt_task_coordinates->hierarchy_0_block_idx + - ((ntt_task_coordinates->hierarchy_0_subntt_idx + (element_idx << s1)) << s2); + return ntt_task_coordinates.hierarchy_0_block_idx + + ((ntt_task_coordinates.hierarchy_0_subntt_idx + (element_idx << s1)) << s2); case 1: - return ntt_task_coordinates->hierarchy_0_block_idx + - ((element_idx + (ntt_task_coordinates->hierarchy_0_subntt_idx << s1)) << s2); + return ntt_task_coordinates.hierarchy_0_block_idx + + ((element_idx + (ntt_task_coordinates.hierarchy_0_subntt_idx << s1)) << s2); case 2: - return ((ntt_task_coordinates->hierarchy_0_block_idx << (s1 + s2)) & ((1 << (s0 + s1 + s2)) - 1)) + - (((ntt_task_coordinates->hierarchy_0_block_idx << (s1 + s2)) >> (s0 + s1 + s2)) << s2) + element_idx; + return ((ntt_task_coordinates.hierarchy_0_block_idx << (s1 + s2)) & ((1 << (s0 + s1 + s2)) - 1)) + + (((ntt_task_coordinates.hierarchy_0_block_idx << (s1 + s2)) >> (s0 + s1 + s2)) << s2) + element_idx; default: ICICLE_ASSERT(false) << "Unsupported layer"; } diff --git a/icicle/backend/cpu/include/ntt_tasks_manager.h b/icicle/backend/cpu/include/ntt_tasks_manager.h deleted file mode 100644 index fd165059f..000000000 --- a/icicle/backend/cpu/include/ntt_tasks_manager.h +++ /dev/null @@ -1,418 +0,0 @@ -#pragma once -#include "icicle/utils/log.h" -#include "ntt_task.h" -#include - -using namespace field_config; -using namespace icicle; -namespace ntt_cpu { - - /** - * @brief Manages task dependency counters for NTT computation, tracking readiness of tasks to execute. - * - * This class tracks and manages counters for tasks within the NTT hierarchy, determining when tasks are ready to - * execute based on the completion of their dependencies. - * - * @param hierarchy_1_layer_idx Index of the hierarchy_1 layer this counter set belongs to. - * @param nof_hierarchy_0_layers Number of hierarchy_0 layers in the current hierarchy_1 layer. - * @param dependent_subntt_count Number of counters pointing to each hierarchy_0 layer. - * @param hierarchy_0_counters A 3D vector of uint32_t - counters for groups of sub-NTTs in hierarchy_0 layers. - * @param hierarchy_1_counters A vector of shared pointers to counters for each sub-NTT in hierarchy_1 layers, used to - * signal when an hierarchy_1_subntt is ready for reordering. - * - * @method TasksDependenciesCounters(NttSubHierarchies ntt_sub_hierarchies, uint32_t hierarchy_1_layer_idx) - * Constructor that initializes the counters based on NTT structure. - * @method bool decrement_counter(NttTaskCoordinates ntt_task_coordinates) Decrements the counter for a given task and - * returns true if the task is ready to execute. - * @method uint32_t get_dependent_subntt_count(uint32_t hierarchy_0_layer_idx) Returns the number of counters pointing - * to the given hierarchy_0 layer. - * @method uint32_t get_nof_hierarchy_0_layers() Returns the number of hierarchy_0 layers in the current hierarchy_1 - * layer. - */ - class TasksDependenciesCounters - { - public: - // Constructor that initializes the counters - TasksDependenciesCounters(const NttSubHierarchies& ntt_sub_hierarchies, uint32_t hierarchy_1_layer_idx); - - // Function to decrement the counter for a given task and check if it is ready to execute. if so, return true - bool decrement_counter(NttTaskCoordinates ntt_task_coordinates); - uint32_t get_dependent_subntt_count(uint32_t hierarchy_0_layer_idx) - { - return dependent_subntt_count[hierarchy_0_layer_idx]; - } - uint32_t get_nof_hierarchy_0_layers() { return nof_hierarchy_0_layers; } - - private: - uint32_t hierarchy_1_layer_idx; // Index of the current hierarchy 1 layer. - uint32_t nof_hierarchy_0_layers; // Number of hierarchy 0 layers in the current hierarchy 1 layer. - std::vector dependent_subntt_count; // Number of subntt that are getting available together when a group - // of hierarchy_0_subntts from previous layer are done - - std::vector>> - hierarchy_0_counters; // 3D vector of counters for groups of sub-NTTs in hierarchy 0 layers: - // hierarchy_0_counters[hierarchy_1_subntt_idx][hierarchy_0_layer_idx][hierarchy_0_counter_idx] - - // One counter for each hierarchy_1_subntt to signal the end of the hierarchy_1_subntt. each hierarchy_0_subntt of - // last hierarchy_0_layer will decrement this counter when it finishes and when it reaches 0, the hierarchy_1_subntt - // is ready to reorder - std::vector hierarchy_1_counters; // hierarchy_1_counters[hierarchy_1_subntt_idx] - }; - - /** - * @brief Manages tasks for the NTT computation, handling task scheduling and dependency management. - * - * The NttTasksManager is responsible for adding tasks, updating task dependencies, - * and determining the readiness of tasks for execution. This class ensures that - * tasks are executed in the correct order based on their dependencies within the NTT hierarchy. - * - */ - template - class NttTasksManager - { - public: - NttTasksManager(const NttSubHierarchies& ntt_sub_logn_ref, uint32_t logn); - - // Add a new task to the ntt_task_manager - eIcicleError push_task(const NttTaskCoordinates& ntt_task_coordinates); - - // Set a task as completed and update dependencies - bool handle_completed(NttTask* completed_task, uint32_t nof_subntts_l1); - NttTaskCoordinates* get_slot_for_next_task_coordinates(); - - bool tasks_to_do() const; - bool available_tasks() const; - NttTaskCoordinates* get_available_task(); - uint32_t nof_pending_tasks = 0; // the current count of tasks that are pending execution - - private: - const uint32_t logn; // log of the NTT size - const NttSubHierarchies& ntt_sub_hierarchies; // Reference to NttSubHierarchies - std::vector counters; // Dependencies counters by layer - std::vector task_buffer; // Buffer holding task coordinates for pending tasks - size_t head; // Head index for the task buffer (used in circular buffer implementation) - size_t tail; // Tail index for the task buffer (used in circular buffer implementation) - - bool is_full() const; - bool is_empty() const; - void increment(size_t& index); - void decrement(size_t& index); - }; - - //////////////////////////// TasksDependenciesCounters Implementation //////////////////////////// - - /** - * @brief Constructs a TasksDependenciesCounters instance with specified NTT sub-logarithms and hierarchy layer index. - * - * Initializes dependency counters based on the provided NTT structure and hierarchy layer. It sets up - * counters for each sub-NTT in hierarchy 1 and initializes counters for hierarchy 0 layers. - * - * @param ntt_sub_hierarchies The structure containing logarithmic sizes of sub-NTTs. - * @param hierarchy_1_layer_idx The index of the current hierarchy 1 layer. - */ - TasksDependenciesCounters::TasksDependenciesCounters( - const NttSubHierarchies& ntt_sub_hierarchies, uint32_t hierarchy_1_layer_idx) - : hierarchy_0_counters( - 1 << ntt_sub_hierarchies.hierarchy_1_layers_sub_logn - [1 - hierarchy_1_layer_idx]), // nof_hierarchy_1_subntts = - // hierarchy_1_layers_sub_logn[1-hierarchy_1_layer_idx]. - hierarchy_1_counters(1 << ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1 - hierarchy_1_layer_idx]) - { - nof_hierarchy_0_layers = ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2] - ? 3 - : (ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1] ? 2 : 1); - dependent_subntt_count.resize(nof_hierarchy_0_layers); - dependent_subntt_count[0] = 1; - uint32_t l1_counter_size; - uint32_t l2_counter_size; - uint32_t l1_nof_counters; - uint32_t l2_nof_counters; - if (nof_hierarchy_0_layers > 1) { - // Initialize counters for layer 1 - N2 counters initialized with N1. - dependent_subntt_count[1] = 1 << ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0]; - l1_nof_counters = 1 << ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2]; - l1_counter_size = 1 << ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]; - } - if (nof_hierarchy_0_layers > 2) { - // Initialize counters for layer 2 - N0 counters initialized with N2. - dependent_subntt_count[2] = 1 << ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]; - l2_nof_counters = 1 << ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0]; - l2_counter_size = 1 << ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2]; - } - - for (uint32_t hierarchy_1_subntt_idx = 0; - hierarchy_1_subntt_idx < (1 << ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1 - hierarchy_1_layer_idx]); - ++hierarchy_1_subntt_idx) { - hierarchy_0_counters[hierarchy_1_subntt_idx].resize(3); // 3 possible layers (0, 1, 2) - // Initialize counters for layer 0 - 1 counter1 initialized with 0. - hierarchy_0_counters[hierarchy_1_subntt_idx][0].resize(1); - hierarchy_0_counters[hierarchy_1_subntt_idx][0][0] = - 0; //[hierarchy_1_subntt_idx][hierarchy_0_layer_idx][hierarchy_0_counter_idx] - - if (nof_hierarchy_0_layers > 1) { - // Initialize counters for layer 1 - N2 counters initialized with N1. - hierarchy_0_counters[hierarchy_1_subntt_idx][1].resize(l1_nof_counters); - for (uint32_t counter_idx = 0; counter_idx < l1_nof_counters; ++counter_idx) { - hierarchy_0_counters[hierarchy_1_subntt_idx][1][counter_idx] = l1_counter_size; - } - } - if (nof_hierarchy_0_layers > 2) { - // Initialize counters for layer 2 - N0 counters initialized with N2. - hierarchy_0_counters[hierarchy_1_subntt_idx][2].resize(l2_nof_counters); - for (uint32_t counter_idx = 0; counter_idx < l2_nof_counters; ++counter_idx) { - hierarchy_0_counters[hierarchy_1_subntt_idx][2][counter_idx] = l2_counter_size; - } - } - // Initialize hierarchy_1_counters with N0 * N1 - uint32_t hierarchy_1_counter_size = - nof_hierarchy_0_layers == 3 ? (1 << ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0]) * - (1 << ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]) - : nof_hierarchy_0_layers == 2 ? (1 << ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0]) - : 0; - hierarchy_1_counters[hierarchy_1_subntt_idx] = hierarchy_1_counter_size; - } - } - - /** - * @brief Decrements the dependency counter for a given task and checks if the dependent task is ready to execute. - * - * This function decrements the counter associated with a task in hierarchy 0 or the global counter in hierarchy 1. - * If the counter reaches zero, it indicates that the dependent task is now ready to be executed. - * - * @param task_c The coordinates of the task whose counter is to be decremented. - * @return `true` if the dependent task is ready to execute, `false` otherwise. - */ - bool TasksDependenciesCounters::decrement_counter(NttTaskCoordinates task_c) - { - if (nof_hierarchy_0_layers == 1) { return false; } - if (task_c.hierarchy_0_layer_idx < nof_hierarchy_0_layers - 1) { - // Extract the coordinates from the task - uint32_t counter_group_idx = - task_c.hierarchy_0_layer_idx == 0 ? task_c.hierarchy_0_block_idx : - /*task_c.hierarchy_0_layer_idx==1*/ task_c.hierarchy_0_subntt_idx; - - uint32_t& counter_ref = - hierarchy_0_counters[task_c.hierarchy_1_subntt_idx][task_c.hierarchy_0_layer_idx + 1][counter_group_idx]; - counter_ref--; - - if (counter_ref == 0) { return true; } - } else { - // Decrement the counter for the given hierarchy_1_subntt_idx - uint32_t& hierarchy_1_counter_ref = hierarchy_1_counters[task_c.hierarchy_1_subntt_idx]; - hierarchy_1_counter_ref--; - - if (hierarchy_1_counter_ref == 0) { return true; } - } - return false; - } - - //////////////////////////// NttTasksManager Implementation //////////////////////////// - - /** - * @brief Constructs the task manager for a given problem size. - * @param logn The log2(size) of the NTT problem. - */ - template - NttTasksManager::NttTasksManager(const NttSubHierarchies& ntt_sub_logn_ref, uint32_t logn) - : logn(logn), ntt_sub_hierarchies(ntt_sub_logn_ref), - counters(logn > HIERARCHY_1 ? 2 : 1, TasksDependenciesCounters(ntt_sub_logn_ref, 0)), - task_buffer(1 << (logn)), // Pre-allocate buffer - head(0), tail(0), nof_pending_tasks(0) - - { - if (logn > HIERARCHY_1) { counters[1] = TasksDependenciesCounters(ntt_sub_logn_ref, 1); } - } - - /** - * @brief Checks if the task buffer is full. - * - * Determines whether the task buffer has reached its maximum capacity. - * - * @return `true` if the buffer is full, `false` otherwise. - */ - template - bool NttTasksManager::is_full() const - { - return (tail + 1) % (1 << (logn)) == head; - } - - /** - * @brief Checks if the task buffer is empty. - * - * Determines whether there are no tasks in the task buffer and no pending tasks. - * - * @return `true` if the buffer is empty and there are no pending tasks, `false` otherwise. - */ - template - bool NttTasksManager::is_empty() const - { - return head == tail && nof_pending_tasks == 0; - } - - /** - * @brief Increments a buffer index in a circular manner. - * - * Advances the given index to the next position in the circular buffer, wrapping around if necessary. - * - * @param index Reference to the index to be incremented. - */ - template - void NttTasksManager::increment(size_t& index) - { - index = (index + 1) % (1 << (logn)); - } - - /** - * @brief Decrements a buffer index in a circular manner. - * - * Moves the given index to the previous position in the circular buffer, wrapping around if necessary. - * - * @param index Reference to the index to be decremented. - */ - template - void NttTasksManager::decrement(size_t& index) - { - index = (index - 1) % (1 << (logn)); - } - - /** - * @brief Adds a new task to the task manager. - * - * Inserts a new task into the task buffer if there is available space. This task will be managed - * by the task manager, which will handle its execution based on dependency resolution. - * - * @param ntt_task_coordinates Task coordinates specifying the task's position in the hierarchy. - * @return `eIcicleError::SUCCESS` if the task was successfully added, otherwise an appropriate error code. - */ - template - eIcicleError NttTasksManager::push_task(const NttTaskCoordinates& ntt_task_coordinates) - { - if (is_full()) { return eIcicleError::OUT_OF_MEMORY; } - task_buffer[tail] = ntt_task_coordinates; - increment(tail); - return eIcicleError::SUCCESS; - } - - /** - * @brief Retrieves a pointer to a slot for the next task coordinates. - * - * Provides access to a slot in the task buffer where new task coordinates can be assigned. - * @return Pointer to `NttTaskCoordinates` if a slot is available, `nullptr` otherwise. - */ - template - NttTaskCoordinates* NttTasksManager::get_slot_for_next_task_coordinates() - { - if (is_full()) { return nullptr; } - NttTaskCoordinates* task = &task_buffer[tail]; - increment(tail); - return task; - } - - /** - * @brief Retrieves the next available task ready for execution. - * - * Fetches the next task from the available task buffer that is ready to be executed based on dependency - * resolution. If no tasks are available, returns `nullptr`. - * - * @return Pointer to `NttTaskCoordinates` of the available task, or `nullptr` if none are available. - */ - template - NttTaskCoordinates* NttTasksManager::get_available_task() - { - if (head == tail) { - // No available tasks - return nullptr; - } - NttTaskCoordinates* task = &task_buffer[head]; - increment(head); - return task; - } - - /** - * @brief Checks if there are tasks remaining to be processed. - * - * Determines whether there are any tasks left to execute or pending dependencies. - * - * @return `true` if there are tasks to do, `false` otherwise. - */ - template - bool NttTasksManager::tasks_to_do() const - { - return head != tail || nof_pending_tasks != 0; - } - - /** - * @brief Checks if there are available tasks ready for execution. - * - * Determines whether there are any tasks in the buffer that are ready to be executed. - * - * @return `true` if there are available tasks, `false` otherwise. - */ - template - bool NttTasksManager::available_tasks() const - { - return head != tail; - } - - /** - * @brief Marks a task as completed and updates dependencies. - * - * This function should be called when a task has finished execution. It decrements the relevant - * dependency counters and, if dependencies are resolved, dispatches dependent tasks for execution or adds them to the - * task buffer. - * - * @param completed_task Pointer to the completed task. - * @param nof_subntts_l1 Number of sub-NTTs in the second layer of hierarchy 1. - * @return `true` if a dependent task was dispatched as a result of this completion, `false` otherwise. - */ - template - bool NttTasksManager::handle_completed(NttTask* completed_task, uint32_t nof_subntts_l1) - { - bool task_dispatched = false; - NttTaskCoordinates task_c = *completed_task->get_coordinates(); - uint32_t nof_hierarchy_0_layers = counters[task_c.hierarchy_1_layer_idx].get_nof_hierarchy_0_layers(); - // Update dependencies in counters - if (counters[task_c.hierarchy_1_layer_idx].decrement_counter(task_c)) { - if (task_c.hierarchy_0_layer_idx < nof_hierarchy_0_layers - 1) { - NttTaskCoordinates* next_task_c_ptr = nullptr; - uint32_t nof_new_ready_tasks = - (task_c.hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1) - ? 1 - : counters[task_c.hierarchy_1_layer_idx].get_dependent_subntt_count(task_c.hierarchy_0_layer_idx + 1); - uint32_t stride = nof_subntts_l1 / nof_new_ready_tasks; - - for (uint32_t i = 0; i < nof_new_ready_tasks; i++) { - next_task_c_ptr = get_slot_for_next_task_coordinates(); - next_task_c_ptr->hierarchy_1_layer_idx = task_c.hierarchy_1_layer_idx; - next_task_c_ptr->hierarchy_1_subntt_idx = task_c.hierarchy_1_subntt_idx; - next_task_c_ptr->hierarchy_0_layer_idx = task_c.hierarchy_0_layer_idx + 1; - next_task_c_ptr->hierarchy_0_block_idx = (task_c.hierarchy_0_layer_idx == 0) - ? task_c.hierarchy_0_block_idx - : task_c.hierarchy_0_subntt_idx + stride * i; - next_task_c_ptr->hierarchy_0_subntt_idx = (task_c.hierarchy_0_layer_idx == 0) ? i : 0; - if (i == 0) { - completed_task->set_coordinates(get_available_task()); - completed_task->dispatch(); - task_dispatched = true; - } - nof_pending_tasks--; - } - } else { - // Reorder the output - NttTaskCoordinates* next_task_c_ptr = nullptr; - next_task_c_ptr = get_slot_for_next_task_coordinates(); - next_task_c_ptr->hierarchy_1_layer_idx = task_c.hierarchy_1_layer_idx; - next_task_c_ptr->hierarchy_1_subntt_idx = task_c.hierarchy_1_subntt_idx; - next_task_c_ptr->hierarchy_0_layer_idx = nof_hierarchy_0_layers; - next_task_c_ptr->hierarchy_0_block_idx = 0; - next_task_c_ptr->hierarchy_0_subntt_idx = 0; - next_task_c_ptr->reorder = true; - completed_task->set_coordinates(get_available_task()); - completed_task->dispatch(); - task_dispatched = true; - nof_pending_tasks--; - } - } - return task_dispatched; - } -} // namespace ntt_cpu diff --git a/icicle/backend/cpu/include/ntt_utils.h b/icicle/backend/cpu/include/ntt_utils.h index 0751b2404..57128b072 100644 --- a/icicle/backend/cpu/include/ntt_utils.h +++ b/icicle/backend/cpu/include/ntt_utils.h @@ -34,9 +34,29 @@ namespace ntt_cpu { hierarchy_1_subntt_idx == other.hierarchy_1_subntt_idx && hierarchy_0_layer_idx == other.hierarchy_0_layer_idx && hierarchy_0_block_idx == other.hierarchy_0_block_idx && - hierarchy_0_subntt_idx == other.hierarchy_0_subntt_idx; + hierarchy_0_subntt_idx == other.hierarchy_0_subntt_idx && + reorder == other.reorder; } + + // Default constructor + NttTaskCoordinates() = default; + + // Constructor with parameters + NttTaskCoordinates(uint32_t h1_layer_idx, + uint32_t h1_subntt_idx, + uint32_t h0_layer_idx, + uint32_t h0_block_idx, + uint32_t h0_subntt_idx, + bool reorder_flag = false) + : hierarchy_1_layer_idx(h1_layer_idx), + hierarchy_1_subntt_idx(h1_subntt_idx), + hierarchy_0_layer_idx(h0_layer_idx), + hierarchy_0_block_idx(h0_block_idx), + hierarchy_0_subntt_idx(h0_subntt_idx), + reorder(reorder_flag) + {} }; + uint64_t bit_reverse(uint64_t i, uint32_t logn) { From 70319adbe83774974f33f0c9436a605872fb430a Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Sun, 22 Dec 2024 08:00:41 +0200 Subject: [PATCH 05/14] smaller size done parallel, __builtin_expect, reorder_by_bit_reverse with omp parallel for --- icicle/backend/cpu/include/ntt_cpu.h | 2 +- icicle/backend/cpu/include/ntt_task.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index c825b1069..920e57c2a 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -396,7 +396,7 @@ namespace ntt_cpu { } else { uint32_t scalar_size = sizeof(S); // for small scalars, the threshold for when it is faster to use parallel NTT is higher - if ((scalar_size >= 32 && (logn + log_batch_size) <= 13) || (scalar_size < 32 && (logn + log_batch_size) <= 16)) { + if ((scalar_size >= 32 && (logn + log_batch_size) <= 11) || (scalar_size < 32 && (logn + log_batch_size) <= 16)) { return false; } } diff --git a/icicle/backend/cpu/include/ntt_task.h b/icicle/backend/cpu/include/ntt_task.h index b3483b098..11d41191e 100644 --- a/icicle/backend/cpu/include/ntt_task.h +++ b/icicle/backend/cpu/include/ntt_task.h @@ -59,7 +59,7 @@ namespace ntt_cpu { template void NttTask::execute() { - if (!ntt_data->is_parallel || !ntt_task_coordinates.reorder) { + if (__builtin_expect((!ntt_data->is_parallel || !ntt_task_coordinates.reorder),1)) { hierarchy_0_cpu_ntt(); } else { // if all hierarchy_0_subntts are done, and at least 2 layers in hierarchy 0 - reorder the subntt's output @@ -1359,6 +1359,7 @@ namespace ntt_cpu { uint64_t rev; uint64_t i_mem_idx; uint64_t rev_mem_idx; + #pragma omp parallel for for (uint64_t i = 0; i < subntt_size; ++i) { // rev = NttUtils::bit_reverse(i, subntt_log_size); rev = bit_reverse(i, subntt_log_size); From 2b33cde878708bd151083b7049eb73ab016a6c18 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Wed, 25 Dec 2024 18:12:51 +0200 Subject: [PATCH 06/14] fix H1 bug --- icicle/backend/cpu/include/ntt_cpu.h | 6 ++++-- icicle/tests/test_field_api.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index 920e57c2a..d595f1747 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -127,10 +127,12 @@ namespace ntt_cpu { nof_blocks = 1 << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0] + ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]); nof_subntts = 1; } - #pragma omp parallel for collapse(3) schedule(dynamic, 512) + nof_blocks = nof_blocks>>1; + #pragma omp parallel for collapse(3) schedule(dynamic) for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { - for (uint32_t hierarchy_0_block_idx = 0; hierarchy_0_block_idx < (nof_blocks); hierarchy_0_block_idx+=2) { + for (uint32_t hierarchy_0_block_idx_half = 0; hierarchy_0_block_idx_half < (nof_blocks); hierarchy_0_block_idx_half++) { for (uint32_t hierarchy_0_subntt_idx = 0; hierarchy_0_subntt_idx < (nof_subntts); hierarchy_0_subntt_idx++) { + uint32_t hierarchy_0_block_idx = hierarchy_0_block_idx_half<<1; NttTaskCoordinates ntt_task_coordinates(hierarchy_1_layer_idx, hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck, hierarchy_0_layer_idx, hierarchy_0_block_idx, hierarchy_0_subntt_idx, false); NttTask task(ntt_task_coordinates, ntt_data); task.execute(); diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 3ea241148..65aa7e371 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -813,7 +813,7 @@ TEST_F(FieldApiTestBase, polynomialDivision) TYPED_TEST(FieldApiTest, ntt) { // Randomize configuration - for(int logn=3; logn<25; logn++){ + for(int logn=3; logn<26; logn++){ const bool inplace = 0; // const int logn = rand_uint_32b(3, 17); const uint64_t N = 1 << logn; @@ -837,7 +837,7 @@ TYPED_TEST(FieldApiTest, ntt) coset_gen = scalar_t::one(); } - ICICLE_LOG_DEBUG << "LOGN = " << logn; + ICICLE_LOG_DEBUG << "logn = " << logn; ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; ICICLE_LOG_DEBUG << "inplace = " << inplace; @@ -885,7 +885,7 @@ TYPED_TEST(FieldApiTest, ntt) ICICLE_CHECK(ntt(d_in, N, dir, config, d_out)); } } - END_TIMER(NTT_sync, oss.str().c_str(), measure); + END_TIMER_AVERAGE(NTT_sync, oss.str().c_str(), measure, iters); if (inplace) { ICICLE_CHECK(icicle_copy_to_host_async(out, d_in, total_size * sizeof(TypeParam), config.stream)); @@ -900,7 +900,7 @@ TYPED_TEST(FieldApiTest, ntt) }; run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); // warmup run(IcicleTestBase::reference_device(), out_ref.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); - run(IcicleTestBase::main_device(), out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); + run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); }} #endif // NTT From 76d1b9f04fb639addeaf04e7ecd15e845dd03c55 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Thu, 26 Dec 2024 16:24:24 +0200 Subject: [PATCH 07/14] temp commit for working on mac --- icicle/CMakeLists.txt | 16 ++++++++-------- icicle/tests/test_field_api.cpp | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index ab9685daa..2c33aadf9 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -10,14 +10,14 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) find_program(CLANG_COMPILER clang++) find_program(CLANG_C_COMPILER clang) -if(CLANG_COMPILER AND CLANG_C_COMPILER) - set(CMAKE_CXX_COMPILER ${CLANG_COMPILER} CACHE STRING "Clang++ compiler" FORCE) - set(CMAKE_C_COMPILER ${CLANG_C_COMPILER} CACHE STRING "Clang compiler" FORCE) - message(STATUS "Using Clang++ as the C++ compiler: ${CLANG_COMPILER}") - message(STATUS "Using Clang as the C compiler: ${CLANG_C_COMPILER}") -else() - message(WARNING "ICICLE CPU works best with clang++ and clang. Defaulting to ${CLANG_COMPILER}") -endif() +# if(CLANG_COMPILER AND CLANG_C_COMPILER) +# set(CMAKE_CXX_COMPILER ${CLANG_COMPILER} CACHE STRING "Clang++ compiler" FORCE) +# set(CMAKE_C_COMPILER ${CLANG_C_COMPILER} CACHE STRING "Clang compiler" FORCE) +# message(STATUS "Using Clang++ as the C++ compiler: ${CLANG_COMPILER}") +# message(STATUS "Using Clang as the C compiler: ${CLANG_C_COMPILER}") +# else() +# message(WARNING "ICICLE CPU works best with clang++ and clang. Defaulting to ${CLANG_COMPILER}") +# endif() diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 65aa7e371..78a73d426 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -898,10 +898,10 @@ TYPED_TEST(FieldApiTest, ntt) ICICLE_CHECK(icicle_destroy_stream(stream)); ICICLE_CHECK(ntt_release_domain()); }; - run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); // warmup + // run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); // warmup run(IcicleTestBase::reference_device(), out_ref.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); - run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); + // run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); + // ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); }} #endif // NTT From b1cc063eb159f5034c9294c5e2622be11b7c7fe0 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Sun, 29 Dec 2024 15:53:43 +0200 Subject: [PATCH 08/14] allocate buffer memory for reordering in large ntts. fixed coset --- icicle/CMakeLists.txt | 16 +++--- icicle/backend/cpu/include/ntt_cpu.h | 44 ++++++++++----- icicle/backend/cpu/include/ntt_data.h | 2 +- icicle/backend/cpu/include/ntt_task.h | 78 --------------------------- icicle/tests/test_field_api.cpp | 40 ++++++++++---- 5 files changed, 68 insertions(+), 112 deletions(-) diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 2c33aadf9..ab9685daa 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -10,14 +10,14 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) find_program(CLANG_COMPILER clang++) find_program(CLANG_C_COMPILER clang) -# if(CLANG_COMPILER AND CLANG_C_COMPILER) -# set(CMAKE_CXX_COMPILER ${CLANG_COMPILER} CACHE STRING "Clang++ compiler" FORCE) -# set(CMAKE_C_COMPILER ${CLANG_C_COMPILER} CACHE STRING "Clang compiler" FORCE) -# message(STATUS "Using Clang++ as the C++ compiler: ${CLANG_COMPILER}") -# message(STATUS "Using Clang as the C compiler: ${CLANG_C_COMPILER}") -# else() -# message(WARNING "ICICLE CPU works best with clang++ and clang. Defaulting to ${CLANG_COMPILER}") -# endif() +if(CLANG_COMPILER AND CLANG_C_COMPILER) + set(CMAKE_CXX_COMPILER ${CLANG_COMPILER} CACHE STRING "Clang++ compiler" FORCE) + set(CMAKE_C_COMPILER ${CLANG_C_COMPILER} CACHE STRING "Clang compiler" FORCE) + message(STATUS "Using Clang++ as the C++ compiler: ${CLANG_COMPILER}") + message(STATUS "Using Clang as the C compiler: ${CLANG_C_COMPILER}") +else() + message(WARNING "ICICLE CPU works best with clang++ and clang. Defaulting to ${CLANG_COMPILER}") +endif() diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index d595f1747..0d0ccff26 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -40,6 +40,8 @@ namespace ntt_cpu { private: const E* input; + E* output; + std::unique_ptr temp_elements; NttData ntt_data; bool compute_if_is_parallel(uint32_t logn, const NTTConfig& config); @@ -60,16 +62,20 @@ namespace ntt_cpu { */ template NttCpu::NttCpu(uint32_t logn, NTTDir direction, const NTTConfig& config, const E* input, E* output) - : input(input), ntt_data(logn, output, config, direction, compute_if_is_parallel(logn, config)) + : input(input), output(output), ntt_data(logn, output, config, direction, compute_if_is_parallel(logn, config)) { + if (logn > HIERARCHY_1) { + // Allocate temporary storage to handle reordering + temp_elements = std::make_unique(ntt_data.size * config.batch_size); + } } template eIcicleError NttCpu::run() { copy_and_reorder_if_needed(input, ntt_data.elements); + if (ntt_data.direction == NTTDir::kForward && ntt_data.config.coset_gen != S::one()) { coset_mul(); } if (!ntt_data.is_parallel) { - if (ntt_data.direction == NTTDir::kForward && ntt_data.config.coset_gen != S::one()) { coset_mul(); } NttTaskCoordinates ntt_task_coordinates(0, 0, 0, 0, 0, false); NttTask task(ntt_task_coordinates, ntt_data); task.execute(); @@ -190,7 +196,11 @@ namespace ntt_cpu { E* temp_output = output; std::unique_ptr temp_storage; if (input == output) { - // Allocate temporary storage to handle in-place reordering + if (!(logn > HIERARCHY_1 || bit_rev)) { + // no reordering needed, and input and output are the same + return; + } + // Allocate temporary storage to handle in-place reordering, can't be done inplace when input and output are the same temp_storage = std::make_unique(total_memory_size); temp_output = temp_storage.get(); } @@ -254,7 +264,8 @@ namespace ntt_cpu { for (uint32_t batch = 0; batch < ntt_data.config.batch_size; ++batch) { E* current_elements = ntt_data.config.columns_batch ? ntt_data.elements + batch : ntt_data.elements + batch * ntt_data.size; - + + #pragma omp parallel for for (uint64_t i = 1; i < ntt_data.size; ++i) { uint64_t idx = i; @@ -292,7 +303,6 @@ namespace ntt_cpu { const uint32_t stride = ntt_data.config.columns_batch ? ntt_data.config.batch_size : 1; const uint64_t temp_elements_size = ntt_data.size * ntt_data.config.batch_size; - auto temp_elements = std::make_unique(temp_elements_size); for (uint32_t batch = 0; batch < ntt_data.config.batch_size; ++batch) { E* cur_layer_output = ntt_data.config.columns_batch ? ntt_data.elements + batch : ntt_data.elements + batch * ntt_data.size; @@ -306,7 +316,13 @@ namespace ntt_cpu { } } } - std::copy(temp_elements.get(), temp_elements.get() + temp_elements_size, ntt_data.elements); + // printf("[hierarchy_1_reorder] output = %p\n", (void*)output); + // printf("[hierarchy_1_reorder] ntt_data.elements = %p\n", (void*)ntt_data.elements); + // printf("[hierarchy_1_reorder] temp_elements.get = %p\n", (void*)temp_elements.get()); + + ntt_data.elements = temp_elements.get(); + // printf("hierarchy_1_reorder OK\n"); + // printf("[hierarchy_1_reorder] ntt_data.elements = %p\n", (void*)ntt_data.elements); } /** @@ -320,6 +336,11 @@ namespace ntt_cpu { template eIcicleError NttCpu::reorder_output() { + // printf("reorder_output.....\n"); + // printf("[hierarchy_1_reorder] output = %p\n", (void*)output); + // printf("[hierarchy_1_reorder] ntt_data.elements = %p\n", (void*)ntt_data.elements); + // printf("[hierarchy_1_reorder] temp_elements.get = %p\n", (void*)temp_elements.get()); + uint32_t columns_batch_reps = ntt_data.config.columns_batch ? ntt_data.config.batch_size : 1; uint32_t rows_batch_reps = ntt_data.config.columns_batch ? 1 : ntt_data.config.batch_size; uint32_t s0 = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[0]; @@ -328,9 +349,6 @@ namespace ntt_cpu { for (uint32_t row_batch = 0; row_batch < rows_batch_reps; ++row_batch) { // if columns_batch=false, then elements pointer is shifted by batch*size E* elements = ntt_data.elements + row_batch * ntt_data.size; - uint64_t temp_output_size = - ntt_data.config.columns_batch ? ntt_data.size * ntt_data.config.batch_size : ntt_data.size; - auto temp_output = std::make_unique(temp_output_size); uint64_t new_idx = 0; uint32_t subntt_idx; uint32_t element; @@ -339,7 +357,7 @@ namespace ntt_cpu { ntt_data.config.columns_batch ? elements + col_batch : elements; // if columns_batch=true, then elements pointer is shifted by 1 for each batch - E* current_temp_output = ntt_data.config.columns_batch ? temp_output.get() + col_batch : temp_output.get(); + E* current_temp_output = ntt_data.config.columns_batch ? output + col_batch : output; for (uint64_t i = 0; i < ntt_data.size; i++) { subntt_idx = i >> s1; element = i & ((1 << s1) - 1); @@ -347,11 +365,9 @@ namespace ntt_cpu { current_temp_output[stride * new_idx] = current_elements[stride * i]; } } - std::copy( - temp_output.get(), temp_output.get() + temp_output_size, - elements); // columns_batch=false: for each row in the batch, copy the reordered elements back to the elements - // array } + ntt_data.elements = output; + // printf("reorder_output OK\n"); return eIcicleError::SUCCESS; } diff --git a/icicle/backend/cpu/include/ntt_data.h b/icicle/backend/cpu/include/ntt_data.h index 7bd0ea27f..ad0b938ea 100644 --- a/icicle/backend/cpu/include/ntt_data.h +++ b/icicle/backend/cpu/include/ntt_data.h @@ -75,7 +75,7 @@ namespace ntt_cpu { const uint32_t logn; // log of the original NTT size. const uint32_t size; // Size of the original NTT problem. const NttSubHierarchies ntt_sub_hierarchies; // Log sizes of sub-NTTs based on the original NTT log size. - E* const elements; // Pointer to the output elements array. + E* elements; // Pointer to the output elements array. const NTTConfig& config; // Configuration settings for the NTT computation. const NTTDir direction; // Direction of the NTT computation (forward or inverse). const bool is_parallel; // Flag indicating if the NTT computation is parallel. diff --git a/icicle/backend/cpu/include/ntt_task.h b/icicle/backend/cpu/include/ntt_task.h index 11d41191e..be5b84981 100644 --- a/icicle/backend/cpu/include/ntt_task.h +++ b/icicle/backend/cpu/include/ntt_task.h @@ -33,7 +33,6 @@ namespace ntt_cpu { NttTaskCoordinates ntt_task_coordinates; NttData* ntt_data = nullptr; eIcicleError reorder_and_refactor_if_needed(); - void apply_coset_multiplication(E* current_elements, const std::vector& index_in_mem, const S* twiddles); eIcicleError hierarchy_0_cpu_ntt(); void ntt8win(); void ntt16win(); @@ -155,58 +154,6 @@ namespace ntt_cpu { return eIcicleError::SUCCESS; } - /** - * @brief Applies coset multiplication to the current elements of the NTT computation. - * - * This function multiplies the current elements with the appropriate coset factors based on - * their indices. It handles both predefined and arbitrary coset multiplications depending - * on the availability of coset stride information. - * - * @param current_elements Pointer to the array of current elements being processed. - * @param index_in_mem Vector containing the memory indices of the elements to be multiplied. - * @param twiddles Pointer to the array of twiddle factors used for multiplication. - * - * @return void - */ - template - void NttTask::apply_coset_multiplication( - E* current_elements, const std::vector& index_in_mem, const S* twiddles) - { - uint32_t current_subntt_size = - 1 << ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] - [ntt_task_coordinates.hierarchy_0_layer_idx]; - uint32_t subntt_idx; - uint32_t s0 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][0]; - uint32_t s1 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][1]; - uint32_t s2 = - ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx][2]; - uint32_t p0, p1, p2; - for (uint32_t i = 0; i < current_subntt_size; i++) { - uint64_t new_idx = i; - uint64_t idx = idx_in_mem(ntt_task_coordinates, i); // don't need to multiply by stride here - // Adjust the index if reorder logic was applied on the input - if (ntt_data->logn > HIERARCHY_1) { - uint32_t cur_ntt_log_size = ntt_data->ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[0]; - uint32_t next_ntt_log_size = ntt_data->ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1]; - uint32_t subntt_idx = index_in_mem[i] >> cur_ntt_log_size; - uint32_t element = index_in_mem[i] & ((1 << cur_ntt_log_size) - 1); - idx = subntt_idx + (element << next_ntt_log_size); - } - // Apply coset multiplication based on the available coset information - if (ntt_data->arbitrary_coset) { - current_elements[index_in_mem[new_idx]] = - current_elements[index_in_mem[new_idx]] * ntt_data->arbitrary_coset[idx]; - } else { - uint32_t twiddle_idx = ntt_data->coset_stride * idx; - twiddle_idx = ntt_data->direction == NTTDir::kForward - ? twiddle_idx - : CpuNttDomain::s_ntt_domain.get_max_size() - twiddle_idx; - current_elements[index_in_mem[new_idx]] = current_elements[index_in_mem[new_idx]] * twiddles[twiddle_idx]; - } - } - } /** * @brief Executes the NTT on a sub-NTT at the hierarchy_0 level. @@ -269,7 +216,6 @@ namespace ntt_cpu { E* subntt_elements; E T; bool last_layer = true; - bool need_to_apply_coset_multiplication = false; std::vector index_in_mem(8); uint32_t offset = ntt_data->config.columns_batch ? ntt_data->config.batch_size : 1; const S* twiddles = ntt_data->direction == NTTDir::kForward @@ -290,8 +236,6 @@ namespace ntt_cpu { 0)); const bool first_layer = ntt_task_coordinates.hierarchy_1_layer_idx == 0 && ntt_task_coordinates.hierarchy_0_layer_idx == 0; - need_to_apply_coset_multiplication = - first_layer && ntt_data->config.coset_gen != S::one() && ntt_data->direction == NTTDir::kForward; subntt_elements = ntt_data->elements + offset * (ntt_task_coordinates.hierarchy_1_subntt_idx @@ -309,10 +253,6 @@ namespace ntt_cpu { E* current_elements = ntt_data->config.columns_batch ? subntt_elements + batch : subntt_elements + batch * (ntt_data->size); - if (need_to_apply_coset_multiplication) { - apply_coset_multiplication(current_elements, index_in_mem, CpuNttDomain::s_ntt_domain.get_twiddles()); - } - T = current_elements[index_in_mem[3]] - current_elements[index_in_mem[7]]; current_elements[index_in_mem[7]] = current_elements[index_in_mem[3]] + current_elements[index_in_mem[7]]; current_elements[index_in_mem[3]] = current_elements[index_in_mem[1]] - current_elements[index_in_mem[5]]; @@ -375,7 +315,6 @@ namespace ntt_cpu { E* subntt_elements; E T; bool last_layer = true; - bool need_to_apply_coset_multiplication = false; std::vector index_in_mem(16); uint32_t offset = ntt_data->config.columns_batch ? ntt_data->config.batch_size : 1; const S* twiddles = ntt_data->direction == NTTDir::kForward @@ -396,8 +335,6 @@ namespace ntt_cpu { 0)); const bool first_layer = ntt_task_coordinates.hierarchy_1_layer_idx == 0 && ntt_task_coordinates.hierarchy_0_layer_idx == 0; - need_to_apply_coset_multiplication = - first_layer && ntt_data->config.coset_gen != S::one() && ntt_data->direction == NTTDir::kForward; subntt_elements = ntt_data->elements + offset * (ntt_task_coordinates.hierarchy_1_subntt_idx @@ -415,10 +352,6 @@ namespace ntt_cpu { E* current_elements = ntt_data->config.columns_batch ? subntt_elements + batch : subntt_elements + batch * (ntt_data->size); - if (need_to_apply_coset_multiplication) { - apply_coset_multiplication(current_elements, index_in_mem, CpuNttDomain::s_ntt_domain.get_twiddles()); - } - T = current_elements[index_in_mem[0]] + current_elements[index_in_mem[8]]; current_elements[index_in_mem[0]] = current_elements[index_in_mem[0]] - current_elements[index_in_mem[8]]; current_elements[index_in_mem[8]] = current_elements[index_in_mem[4]] + current_elements[index_in_mem[12]]; @@ -568,7 +501,6 @@ namespace ntt_cpu { std::vector temp_0(46); std::vector temp_1(46); bool last_layer = true; - bool need_to_apply_coset_multiplication = false; std::vector index_in_mem(32); uint32_t offset = ntt_data->config.columns_batch ? ntt_data->config.batch_size : 1; const S* twiddles = ntt_data->direction == NTTDir::kForward @@ -589,8 +521,6 @@ namespace ntt_cpu { 0)); const bool first_layer = ntt_task_coordinates.hierarchy_1_layer_idx == 0 && ntt_task_coordinates.hierarchy_0_layer_idx == 0; - need_to_apply_coset_multiplication = - first_layer && ntt_data->config.coset_gen != S::one() && ntt_data->direction == NTTDir::kForward; subntt_elements = ntt_data->elements + offset * (ntt_task_coordinates.hierarchy_1_subntt_idx @@ -608,10 +538,6 @@ namespace ntt_cpu { E* current_elements = ntt_data->config.columns_batch ? subntt_elements + batch : subntt_elements + batch * (ntt_data->size); - if (need_to_apply_coset_multiplication) { - apply_coset_multiplication(current_elements, index_in_mem, CpuNttDomain::s_ntt_domain.get_twiddles()); - } - /* Stage s00 */ temp_0[0] = current_elements[index_in_mem[0]]; temp_0[1] = current_elements[index_in_mem[2]]; @@ -1245,7 +1171,6 @@ namespace ntt_cpu { E* subntt_elements; E T; bool last_layer = true; - bool need_to_apply_coset_multiplication = false; uint32_t offset = ntt_data->config.columns_batch ? ntt_data->config.batch_size : 1; const S* twiddles = CpuNttDomain::s_ntt_domain.get_twiddles(); uint32_t stride = ntt_data->config.columns_batch ? ntt_data->config.batch_size : 1; @@ -1262,8 +1187,6 @@ namespace ntt_cpu { 0)); const bool first_layer = ntt_task_coordinates.hierarchy_1_layer_idx == 0 && ntt_task_coordinates.hierarchy_0_layer_idx == 0; - need_to_apply_coset_multiplication = - first_layer && ntt_data->config.coset_gen != S::one() && ntt_data->direction == NTTDir::kForward; subntt_elements = ntt_data->elements + @@ -1288,7 +1211,6 @@ namespace ntt_cpu { for (uint32_t batch = 0; batch < ntt_data->config.batch_size; ++batch) { E* current_elements = ntt_data->config.columns_batch ? subntt_elements + batch : subntt_elements + batch * (ntt_data->size); - if (need_to_apply_coset_multiplication) { apply_coset_multiplication(current_elements, index_in_mem, twiddles); } for (uint32_t len = 2; len <= subntt_size; len <<= 1) { uint32_t half_len = len / 2; diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 78a73d426..49ff3bd4f 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -813,23 +813,41 @@ TEST_F(FieldApiTestBase, polynomialDivision) TYPED_TEST(FieldApiTest, ntt) { // Randomize configuration - for(int logn=3; logn<26; logn++){ - const bool inplace = 0; + for (int log_coset_stride=0; log_coset_stride<3; log_coset_stride++){ + ICICLE_LOG_INFO << "log_coset_stride = " << log_coset_stride; + for (int _dir=0; _dir<2; _dir++){ + ICICLE_LOG_INFO << "_dir = " << _dir; + for (int columns_batch=1; columns_batch<2; columns_batch++){ + ICICLE_LOG_INFO << "columns_batch = " << columns_batch; + for (int _ordering=0; _ordering<4; _ordering++){ + ICICLE_LOG_INFO << "_ordering = " << _ordering; + for (int inplace=0; inplace<2; inplace++){ + ICICLE_LOG_INFO << "inplace = " << inplace; + for(int log_batch_size=0; log_batch_size<10; log_batch_size++){ + ICICLE_LOG_INFO << "log_batch_size = " << log_batch_size; + int max_logn = 27 - log_batch_size; + for(int logn=3; logn(_ordering); - bool columns_batch = false; + // bool columns_batch = false; // if (logn == 7 || logn < 4) { // columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) // } else { // columns_batch = rand_uint_32b(0, 1); // } - const NTTDir dir = static_cast(0); // 0: forward, 1: inverse - const int log_coset_stride = 0; + if (columns_batch) { + if (logn == 7 || logn < 4) { + continue; + } + } + const NTTDir dir = static_cast(_dir); // 0: forward, 1: inverse + // const int log_coset_stride = 0; scalar_t coset_gen; if (log_coset_stride) { coset_gen = scalar_t::omega(logn + log_coset_stride); @@ -898,11 +916,11 @@ TYPED_TEST(FieldApiTest, ntt) ICICLE_CHECK(icicle_destroy_stream(stream)); ICICLE_CHECK(ntt_release_domain()); }; - // run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); // warmup + run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); // warmup run(IcicleTestBase::reference_device(), out_ref.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); - // run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); - // ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); -}} + run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); +}}}}}}}} #endif // NTT // define program From ebce824487a251ce6fb43ab6893512aa294859c7 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Mon, 30 Dec 2024 13:57:48 +0200 Subject: [PATCH 09/14] ntt_cpu with openMP implementation --- icicle/backend/cpu/include/cpu_ntt_main.h | 6 +-- icicle/backend/cpu/include/ntt_cpu.h | 22 +--------- icicle/backend/cpu/include/ntt_data.h | 10 ++--- icicle/backend/cpu/include/ntt_task.h | 31 ++++++-------- icicle/cmake/curve.cmake | 2 +- icicle/cmake/field.cmake | 2 +- icicle/cmake/hash.cmake | 2 +- icicle/tests/CMakeLists.txt | 16 +++---- icicle/tests/test_field_api.cpp | 51 +++++++---------------- 9 files changed, 47 insertions(+), 95 deletions(-) diff --git a/icicle/backend/cpu/include/cpu_ntt_main.h b/icicle/backend/cpu/include/cpu_ntt_main.h index 827ea81d0..bd2207189 100644 --- a/icicle/backend/cpu/include/cpu_ntt_main.h +++ b/icicle/backend/cpu/include/cpu_ntt_main.h @@ -14,16 +14,14 @@ using namespace icicle; * hierarchy, and memory management for efficient computation. * * The NTT problem is given at a specific size and is divided into subproblems to enable - * parallel solving of independent tasks, ensuring that the number of problems solved - * simultaneously does not exceed cache size. The original problem is divided into hierarchies + * parallel solving of independent tasks. The original problem is divided into hierarchies * of subproblems. Beyond a certain size, the problem is divided into two layers of sub-NTTs in * hierarchy 1. Within hierarchy 1, the problem is further divided into 1-3 layers of sub-NTTs * belonging to hierarchy 0. The division into hierarchies and the sizes of the sub-NTTs are * determined by the original problem size. * * The sub-NTTs within hierarchy 0 are the units of work that are assigned to individual threads. - * The overall computation is executed in a multi-threaded fashion, with the degree of parallelism - * determined by the number of available hardware cores. + * The overall computation is executed in a multi-threaded fashion. * * @param device The device on which the NTT is being performed. * @param input Pointer to the input data. diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index 0d0ccff26..d48f1583b 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -48,8 +48,6 @@ namespace ntt_cpu { void coset_mul(); void reorder_by_bit_reverse(); void copy_and_reorder_if_needed(const E* input, E* output); - - // Parallel-specific methods void hierarchy_1_reorder(); eIcicleError reorder_output(); @@ -102,8 +100,7 @@ namespace ntt_cpu { task.execute(); } } - if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // all ntt tasks in hierarchy 1 are pushed, now push reorder task so that the data - // is in the correct order for the next hierarchy 1 layer + if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // All NTT tasks in hierarchy 1 have been executed; now executing the reorder task NttTaskCoordinates ntt_task_coordinates(0, 0, hierarchy_0_layer_idx, 0, 0, true); NttTask task(ntt_task_coordinates, ntt_data); task.execute(); @@ -143,14 +140,12 @@ namespace ntt_cpu { NttTask task(ntt_task_coordinates, ntt_data); task.execute(); ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx+1; - // task.set_coordinates(ntt_task_coordinates); NttTask task_with_elements_in_the_same_cachline(ntt_task_coordinates, ntt_data); task_with_elements_in_the_same_cachline.execute(); } } } - if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // all ntt tasks in hierarchy 1 are pushed, now push reorder task so that the data - // is in the correct order for the next hierarchy 1 layer + if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // All NTT tasks in hierarchy 1 have been executed; now executing the reorder task #pragma omp parallel for for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { NttTaskCoordinates ntt_task_coordinates(hierarchy_1_layer_idx, hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck, nof_hierarchy_0_layers, 0, 0, true); @@ -230,7 +225,6 @@ namespace ntt_cpu { E* output_batch = ntt_data.config.columns_batch ? (temp_output + batch) : (temp_output + batch * ntt_data.size); for (uint64_t i = 0; i < ntt_data.size; ++i) { - // uint64_t rev = NttUtils::bit_reverse(i, logn); uint64_t rev = bit_reverse(i, logn); output_batch[stride * i] = input_batch[stride * rev]; } @@ -316,13 +310,7 @@ namespace ntt_cpu { } } } - // printf("[hierarchy_1_reorder] output = %p\n", (void*)output); - // printf("[hierarchy_1_reorder] ntt_data.elements = %p\n", (void*)ntt_data.elements); - // printf("[hierarchy_1_reorder] temp_elements.get = %p\n", (void*)temp_elements.get()); - ntt_data.elements = temp_elements.get(); - // printf("hierarchy_1_reorder OK\n"); - // printf("[hierarchy_1_reorder] ntt_data.elements = %p\n", (void*)ntt_data.elements); } /** @@ -336,11 +324,6 @@ namespace ntt_cpu { template eIcicleError NttCpu::reorder_output() { - // printf("reorder_output.....\n"); - // printf("[hierarchy_1_reorder] output = %p\n", (void*)output); - // printf("[hierarchy_1_reorder] ntt_data.elements = %p\n", (void*)ntt_data.elements); - // printf("[hierarchy_1_reorder] temp_elements.get = %p\n", (void*)temp_elements.get()); - uint32_t columns_batch_reps = ntt_data.config.columns_batch ? ntt_data.config.batch_size : 1; uint32_t rows_batch_reps = ntt_data.config.columns_batch ? 1 : ntt_data.config.batch_size; uint32_t s0 = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[0]; @@ -367,7 +350,6 @@ namespace ntt_cpu { } } ntt_data.elements = output; - // printf("reorder_output OK\n"); return eIcicleError::SUCCESS; } diff --git a/icicle/backend/cpu/include/ntt_data.h b/icicle/backend/cpu/include/ntt_data.h index ad0b938ea..75de47840 100644 --- a/icicle/backend/cpu/include/ntt_data.h +++ b/icicle/backend/cpu/include/ntt_data.h @@ -25,9 +25,9 @@ namespace ntt_cpu { * layer, 13 for the second, and 0 for the third. */ constexpr uint32_t layers_sub_logn[31][3] = { - {0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0}, {4, 0, 0}, {5, 0, 0}, {3, 3, 0}, {4, 3, 0}, - {4, 4, 0}, {5, 4, 0}, {5, 5, 0}, {4, 4, 3}, {4, 4, 4}, {5, 4, 4}, {5, 5, 4}, {5, 5, 5}, - {5, 5, 6}, {5, 5, 7}, {5, 5, 8}, {5, 5, 9}, {5, 5, 10}, {5, 5, 11}, {5, 5, 12}, {5, 5, 13}, + {0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0}, {4, 0, 0}, {5, 0, 0}, {3, 3, 0}, {4, 3, 0}, + {4, 4, 0}, {5, 4, 0}, {5, 5, 0}, {4, 4, 3}, {4, 4, 4}, {5, 4, 4}, {5, 5, 4}, {5, 5, 5}, + {5, 5, 6}, {5, 5, 7}, {5, 5, 8}, {5, 5, 9}, {5, 5, 10}, {5, 5, 11}, {5, 5, 12}, {5, 5, 13}, {5, 5, 14}, {5, 5, 15}, {13, 13, 0}, {14, 13, 0}, {14, 14, 0}, {15, 14, 0}, {15, 15, 0}}; /** @@ -36,8 +36,6 @@ namespace ntt_cpu { * This struct stores the log sizes of the sub-NTTs for both hierarchy_0 and hierarchy_1 layers, * based on the overall log size (`logn`) of the NTT problem. * - * @param logn The log size of the entire NTT problem. - * @param size The size of the NTT problem, calculated as `1 << logn`. * @param hierarchy_0_layers_sub_logn Log sizes of sub-NTTs for hierarchy_0 layers. * @param hierarchy_1_layers_sub_logn Log sizes of sub-NTTs for hierarchy_1 layers. * @@ -75,7 +73,7 @@ namespace ntt_cpu { const uint32_t logn; // log of the original NTT size. const uint32_t size; // Size of the original NTT problem. const NttSubHierarchies ntt_sub_hierarchies; // Log sizes of sub-NTTs based on the original NTT log size. - E* elements; // Pointer to the output elements array. + E* elements; // Pointer to the elements array. const NTTConfig& config; // Configuration settings for the NTT computation. const NTTDir direction; // Direction of the NTT computation (forward or inverse). const bool is_parallel; // Flag indicating if the NTT computation is parallel. diff --git a/icicle/backend/cpu/include/ntt_task.h b/icicle/backend/cpu/include/ntt_task.h index be5b84981..e86436f40 100644 --- a/icicle/backend/cpu/include/ntt_task.h +++ b/icicle/backend/cpu/include/ntt_task.h @@ -17,16 +17,12 @@ namespace ntt_cpu { * for a given sub-NTT or reordering the output if required. * * @method void execute() Executes the task, either performing the NTT computation or reordering the output. - * @method void set_coordinates(NttTaskParams params) Sets the task parameters. - * @method void set_data(NttData& data) Sets the NTT data for the task. */ template class NttTask { public: - NttTask(const NttTaskCoordinates coords, NttData& data) - : ntt_task_coordinates(coords), ntt_data(&data) - {} + NttTask(const NttTaskCoordinates coords, NttData& data) : ntt_task_coordinates(coords), ntt_data(&data) {} void execute(); private: @@ -58,7 +54,7 @@ namespace ntt_cpu { template void NttTask::execute() { - if (__builtin_expect((!ntt_data->is_parallel || !ntt_task_coordinates.reorder),1)) { + if (__builtin_expect((!ntt_data->is_parallel || !ntt_task_coordinates.reorder), 1)) { hierarchy_0_cpu_ntt(); } else { // if all hierarchy_0_subntts are done, and at least 2 layers in hierarchy 0 - reorder the subntt's output @@ -115,7 +111,7 @@ namespace ntt_cpu { stride * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx - // * subntt_size + // * subntt_size for (uint32_t col_batch = 0; col_batch < columns_batch_reps; ++col_batch) { E* current_elements = ntt_data->config.columns_batch @@ -154,7 +150,6 @@ namespace ntt_cpu { return eIcicleError::SUCCESS; } - /** * @brief Executes the NTT on a sub-NTT at the hierarchy_0 level. * @@ -241,7 +236,7 @@ namespace ntt_cpu { offset * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx - // * // subntt_size + // * // subntt_size } else { subntt_elements = ntt_data->elements; for (uint32_t i = 0; i < 8; i++) { @@ -340,7 +335,7 @@ namespace ntt_cpu { offset * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx - // * // subntt_size + // * // subntt_size } else { subntt_elements = ntt_data->elements; for (uint32_t i = 0; i < 16; i++) { @@ -526,7 +521,7 @@ namespace ntt_cpu { offset * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx - // * // subntt_size + // * // subntt_size } else { subntt_elements = ntt_data->elements; for (uint32_t i = 0; i < 32; i++) { @@ -1193,7 +1188,7 @@ namespace ntt_cpu { offset * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx - // * subntt_size + // * subntt_size subntt_size_log = ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] [ntt_task_coordinates.hierarchy_0_layer_idx]; @@ -1266,7 +1261,7 @@ namespace ntt_cpu { offset * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx * - // subntt_size + // subntt_size uint64_t subntt_size = 1 << ntt_data->ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx] [ntt_task_coordinates.hierarchy_0_layer_idx]; @@ -1281,16 +1276,14 @@ namespace ntt_cpu { uint64_t rev; uint64_t i_mem_idx; uint64_t rev_mem_idx; - #pragma omp parallel for +#pragma omp parallel for for (uint64_t i = 0; i < subntt_size; ++i) { - // rev = NttUtils::bit_reverse(i, subntt_log_size); rev = bit_reverse(i, subntt_log_size); i_mem_idx = idx_in_mem(ntt_task_coordinates, i); rev_mem_idx = idx_in_mem(ntt_task_coordinates, rev); if (i < rev) { if (i_mem_idx < ntt_data->size && rev_mem_idx < ntt_data->size) { // Ensure indices - // are - // within bounds + // are within bounds std::swap(current_elements[stride * i_mem_idx], current_elements[stride * rev_mem_idx]); } else { // Handle out-of-bounds error @@ -1356,7 +1349,7 @@ namespace ntt_cpu { stride * (ntt_task_coordinates.hierarchy_1_subntt_idx << ntt_data->ntt_sub_hierarchies .hierarchy_1_layers_sub_logn[ntt_task_coordinates.hierarchy_1_layer_idx]); // input + subntt_idx - // * subntt_size + // * subntt_size E* elements_of_current_batch = ntt_data->config.columns_batch ? hierarchy_1_subntt_elements + batch : hierarchy_1_subntt_elements + batch * original_size; @@ -1366,7 +1359,7 @@ namespace ntt_cpu { ? elem : elem * hierarchy_0_nof_subntts + ntt_task_coordinates.hierarchy_0_subntt_idx; j = (ntt_task_coordinates.hierarchy_0_layer_idx == 0) ? ntt_task_coordinates.hierarchy_0_subntt_idx - : ntt_task_coordinates.hierarchy_0_block_idx; + : ntt_task_coordinates.hierarchy_0_block_idx; uint64_t tw_idx = (ntt_data->direction == NTTDir::kForward) ? ((CpuNttDomain::s_ntt_domain.get_max_size() / ntt_size) * j * i) : CpuNttDomain::s_ntt_domain.get_max_size() - diff --git a/icicle/cmake/curve.cmake b/icicle/cmake/curve.cmake index e14d8d2b8..c82d1b90b 100644 --- a/icicle/cmake/curve.cmake +++ b/icicle/cmake/curve.cmake @@ -58,7 +58,7 @@ function(setup_curve_target CURVE CURVE_INDEX FEATURES_STRING) # Add additional feature handling calls here set_target_properties(icicle_curve PROPERTIES OUTPUT_NAME "icicle_curve_${CURVE}") - target_link_libraries(icicle_curve PUBLIC icicle_device icicle_field pthread OpenMP::OpenMP_CXX) + target_link_libraries(icicle_curve PUBLIC icicle_device icicle_field pthread) # Ensure CURVE is defined in the cache for backends to see set(CURVE "${CURVE}" CACHE STRING "") diff --git a/icicle/cmake/field.cmake b/icicle/cmake/field.cmake index 3c263d97b..953c0d6fc 100644 --- a/icicle/cmake/field.cmake +++ b/icicle/cmake/field.cmake @@ -56,7 +56,7 @@ function(setup_field_target FIELD FIELD_INDEX FEATURES_STRING) # Add additional feature handling calls here set_target_properties(icicle_field PROPERTIES OUTPUT_NAME "icicle_field_${FIELD}") - target_link_libraries(icicle_field PUBLIC icicle_device pthread OpenMP::OpenMP_CXX) + target_link_libraries(icicle_field PUBLIC icicle_device pthread) # Ensure FIELD is defined in the cache for backends to see set(FIELD "${FIELD}" CACHE STRING "") diff --git a/icicle/cmake/hash.cmake b/icicle/cmake/hash.cmake index df18e8eae..6d43c3e03 100644 --- a/icicle/cmake/hash.cmake +++ b/icicle/cmake/hash.cmake @@ -10,7 +10,7 @@ function(setup_hash_target) src/hash/merkle_c_api.cpp ) - target_link_libraries(icicle_hash PUBLIC icicle_device OpenMP::OpenMP_CXX) + target_link_libraries(icicle_hash PUBLIC icicle_device) install(TARGETS icicle_hash RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/" diff --git a/icicle/tests/CMakeLists.txt b/icicle/tests/CMakeLists.txt index 1b1c0d6ba..066fe0803 100644 --- a/icicle/tests/CMakeLists.txt +++ b/icicle/tests/CMakeLists.txt @@ -25,20 +25,20 @@ enable_testing() # device API test add_executable(test_device_api test_device_api.cpp) target_include_directories(test_device_api PRIVATE ${CMAKE_SOURCE_DIR}/include/) -target_link_libraries(test_device_api PRIVATE GTest::gtest_main icicle_device OpenMP::OpenMP_CXX) +target_link_libraries(test_device_api PRIVATE GTest::gtest_main icicle_device) gtest_discover_tests(test_device_api) #field API test if (FIELD) add_executable(test_field_api test_field_api.cpp) target_include_directories(test_field_api PRIVATE ${CMAKE_SOURCE_DIR}/include/) - target_link_libraries(test_field_api PRIVATE GTest::gtest_main icicle_device icicle_field OpenMP::OpenMP_CXX) + target_link_libraries(test_field_api PRIVATE GTest::gtest_main icicle_device icicle_field) gtest_discover_tests(test_field_api) if (NTT) add_executable(test_polynomial_api test_polynomial_api.cpp) target_include_directories(test_polynomial_api PRIVATE ${CMAKE_SOURCE_DIR}/include/) - target_link_libraries(test_polynomial_api PRIVATE GTest::gtest_main icicle_field OpenMP::OpenMP_CXX) + target_link_libraries(test_polynomial_api PRIVATE GTest::gtest_main icicle_field) gtest_discover_tests(test_polynomial_api) endif() endif() @@ -47,11 +47,11 @@ endif() if (CURVE) add_executable(test_curve_api test_curve_api.cpp) target_include_directories(test_curve_api PRIVATE ${CMAKE_SOURCE_DIR}/include/) - target_link_libraries(test_curve_api PRIVATE GTest::gtest_main icicle_device icicle_field icicle_curve OpenMP::OpenMP_CXX) + target_link_libraries(test_curve_api PRIVATE GTest::gtest_main icicle_device icicle_field icicle_curve) gtest_discover_tests(test_curve_api) if (NTT) - target_link_libraries(test_polynomial_api PRIVATE GTest::gtest_main icicle_curve OpenMP::OpenMP_CXX) + target_link_libraries(test_polynomial_api PRIVATE GTest::gtest_main icicle_curve) endif() endif() @@ -59,12 +59,12 @@ endif() if (HASH) add_executable(test_hash_api test_hash_api.cpp) target_include_directories(test_hash_api PRIVATE ${CMAKE_SOURCE_DIR}/include/) - target_link_libraries(test_hash_api PRIVATE GTest::gtest_main icicle_device icicle_hash OpenMP::OpenMP_CXX) + target_link_libraries(test_hash_api PRIVATE GTest::gtest_main icicle_device icicle_hash) gtest_discover_tests(test_hash_api) if (POSEIDON AND (FIELD OR CURVE)) - target_link_libraries(test_hash_api PRIVATE icicle_field OpenMP::OpenMP_CXX) + target_link_libraries(test_hash_api PRIVATE icicle_field) endif() if (POSEIDON2 AND (FIELD OR CURVE)) - target_link_libraries(test_hash_api PRIVATE icicle_field OpenMP::OpenMP_CXX) + target_link_libraries(test_hash_api PRIVATE icicle_field) endif() endif() diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 49ff3bd4f..3414811a7 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -813,41 +813,22 @@ TEST_F(FieldApiTestBase, polynomialDivision) TYPED_TEST(FieldApiTest, ntt) { // Randomize configuration - for (int log_coset_stride=0; log_coset_stride<3; log_coset_stride++){ - ICICLE_LOG_INFO << "log_coset_stride = " << log_coset_stride; - for (int _dir=0; _dir<2; _dir++){ - ICICLE_LOG_INFO << "_dir = " << _dir; - for (int columns_batch=1; columns_batch<2; columns_batch++){ - ICICLE_LOG_INFO << "columns_batch = " << columns_batch; - for (int _ordering=0; _ordering<4; _ordering++){ - ICICLE_LOG_INFO << "_ordering = " << _ordering; - for (int inplace=0; inplace<2; inplace++){ - ICICLE_LOG_INFO << "inplace = " << inplace; - for(int log_batch_size=0; log_batch_size<10; log_batch_size++){ - ICICLE_LOG_INFO << "log_batch_size = " << log_batch_size; - int max_logn = 27 - log_batch_size; - for(int logn=3; logn(_ordering); - // bool columns_batch = false; - // if (logn == 7 || logn < 4) { - // columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) - // } else { - // columns_batch = rand_uint_32b(0, 1); - // } - if (columns_batch) { - if (logn == 7 || logn < 4) { - continue; - } + bool columns_batch; + if (logn == 7 || logn < 4) { + columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) + } else { + columns_batch = rand_uint_32b(0, 1); } - const NTTDir dir = static_cast(_dir); // 0: forward, 1: inverse - // const int log_coset_stride = 0; + const NTTDir dir = static_cast(rand_uint_32b(0, 1)); // 0: forward, 1: inverse + const int log_coset_stride = rand_uint_32b(0, 2); scalar_t coset_gen; if (log_coset_stride) { coset_gen = scalar_t::omega(logn + log_coset_stride); @@ -855,7 +836,7 @@ TYPED_TEST(FieldApiTest, ntt) coset_gen = scalar_t::one(); } - ICICLE_LOG_DEBUG << "logn = " << logn; + ICICLE_LOG_DEBUG << "N = " << N; ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; ICICLE_LOG_DEBUG << "inplace = " << inplace; @@ -903,7 +884,7 @@ TYPED_TEST(FieldApiTest, ntt) ICICLE_CHECK(ntt(d_in, N, dir, config, d_out)); } } - END_TIMER_AVERAGE(NTT_sync, oss.str().c_str(), measure, iters); + END_TIMER(NTT_sync, oss.str().c_str(), measure); if (inplace) { ICICLE_CHECK(icicle_copy_to_host_async(out, d_in, total_size * sizeof(TypeParam), config.stream)); @@ -918,9 +899,9 @@ TYPED_TEST(FieldApiTest, ntt) }; run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); // warmup run(IcicleTestBase::reference_device(), out_ref.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); - run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); + run(IcicleTestBase::main_device(), out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); -}}}}}}}} +} #endif // NTT // define program From 61a36986ffd5905607ff0ab854cdc1643756f68f Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Mon, 30 Dec 2024 18:30:53 +0200 Subject: [PATCH 10/14] temp commit taskflow --- icicle/CMakeLists.txt | 7 +-- icicle/backend/cpu/CMakeLists.txt | 20 +++++++ icicle/backend/cpu/include/ntt_cpu.h | 37 ++++++++---- icicle/backend/cpu/include/ntt_task.h | 2 +- icicle/tests/test_field_api.cpp | 84 ++++++++++++++++++++++----- 5 files changed, 116 insertions(+), 34 deletions(-) diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index ab9685daa..2a32d70d3 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -79,18 +79,13 @@ if(SANITIZE) set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} -fsanitize=address") endif() -# Find and include OpenMP -set(OpenMP_ROOT $ENV{OPENMP_ROOT}) -find_package(OpenMP REQUIRED) - # device API library add_library(icicle_device SHARED src/device_api.cpp src/runtime.cpp src/config_extension.cpp ) -target_link_libraries(icicle_device PUBLIC dl OpenMP::OpenMP_CXX) -message(STATUS "OpenMP CXX Flags: ${OpenMP_CXX_FLAGS}") +target_link_libraries(icicle_device PUBLIC dl) include_directories(include) diff --git a/icicle/backend/cpu/CMakeLists.txt b/icicle/backend/cpu/CMakeLists.txt index ab460bb5a..6aab97329 100644 --- a/icicle/backend/cpu/CMakeLists.txt +++ b/icicle/backend/cpu/CMakeLists.txt @@ -1,5 +1,25 @@ cmake_minimum_required(VERSION 3.18) +message(STATUS "Fetching Taskflow v3.8.0 (CPU backend)") +include(FetchContent) +FetchContent_Declare( + Taskflow + GIT_REPOSITORY https://github.com/taskflow/taskflow.git + GIT_TAG v3.8.0 + GIT_SHALLOW TRUE +) +# Disable unnecessary components +set(TF_BUILD_BENCHMARKS OFF CACHE BOOL "Disable Taskflow benchmarks" FORCE) +set(TF_BUILD_PROFILER OFF CACHE BOOL "Disable Taskflow profiler" FORCE) +set(TF_BUILD_CUDA OFF CACHE BOOL "Disable Taskflow CUDA support" FORCE) +set(TF_BUILD_SYCL OFF CACHE BOOL "Disable Taskflow SYCL support" FORCE) +set(TF_BUILD_TESTS OFF CACHE BOOL "Disable Taskflow tests" FORCE) +set(TF_BUILD_EXAMPLES OFF CACHE BOOL "Disable Taskflow examples" FORCE) + +FetchContent_MakeAvailable(Taskflow) +# Use icicle_device as interface for TaskFlow headers +target_include_directories(icicle_device INTERFACE ${Taskflow_SOURCE_DIR}) + # CPU backend is built directly into icicle library target_sources(icicle_device PRIVATE src/cpu_device_api.cpp) diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index d48f1583b..03b871c1d 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -3,6 +3,7 @@ #include "icicle/utils/log.h" #include "ntt_task.h" #include "ntt_utils.h" +#include #include #include #include @@ -78,6 +79,8 @@ namespace ntt_cpu { NttTask task(ntt_task_coordinates, ntt_data); task.execute(); } else if (__builtin_expect((ntt_data.logn <= HIERARCHY_1),1)){ + tf::Taskflow taskflow; + tf::Executor executor; uint32_t nof_hierarchy_0_layers = (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][2] != 0) ? 3 : (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][1] != 0) ? 2 : 1; for (uint32_t hierarchy_0_layer_idx = 0; hierarchy_0_layer_idx < nof_hierarchy_0_layers; hierarchy_0_layer_idx++) { uint64_t nof_blocks; @@ -92,14 +95,26 @@ namespace ntt_cpu { nof_blocks = 1 << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][0] + ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][1]); nof_subntts = 1; } - #pragma omp parallel for collapse(2) schedule(dynamic) - for (uint32_t hierarchy_0_block_idx = 0; hierarchy_0_block_idx < (nof_blocks); hierarchy_0_block_idx++) { - for (uint32_t hierarchy_0_subntt_idx = 0; hierarchy_0_subntt_idx < (nof_subntts); hierarchy_0_subntt_idx++) { - NttTaskCoordinates ntt_task_coordinates(0, 0, hierarchy_0_layer_idx, hierarchy_0_block_idx, hierarchy_0_subntt_idx, false); - NttTask task(ntt_task_coordinates, ntt_data); - task.execute(); - } + // #pragma omp parallel for collapse(2) schedule(dynamic) + size_t num_chunks = std::thread::hardware_concurrency(); // Adjust based on the number of threads + // size_t num_chunks = 2; // Adjust based on the number of threads + size_t chunk_size = (nof_blocks*nof_subntts + num_chunks - 1) / num_chunks; + + for (size_t i = 0; i < num_chunks; ++i) { + size_t start_index = i * chunk_size; + size_t end_index = std::min(start_index + chunk_size, nof_blocks*nof_subntts); + taskflow.emplace([&, hierarchy_0_layer_idx, start_index, end_index, nof_subntts]() { + for (uint32_t j = start_index; j < (end_index); j++) { + uint32_t hierarchy_0_block_idx = j / nof_subntts; + uint32_t hierarchy_0_subntt_idx = j % nof_subntts; + NttTaskCoordinates ntt_task_coordinates(0, 0, hierarchy_0_layer_idx, hierarchy_0_block_idx, hierarchy_0_subntt_idx, false); + NttTask task(ntt_task_coordinates, ntt_data); + task.execute(); + } + }); } + executor.run(taskflow).wait(); + taskflow.clear(); if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // All NTT tasks in hierarchy 1 have been executed; now executing the reorder task NttTaskCoordinates ntt_task_coordinates(0, 0, hierarchy_0_layer_idx, 0, 0, true); NttTask task(ntt_task_coordinates, ntt_data); @@ -131,7 +146,7 @@ namespace ntt_cpu { nof_subntts = 1; } nof_blocks = nof_blocks>>1; - #pragma omp parallel for collapse(3) schedule(dynamic) + // #pragma omp parallel for collapse(3) schedule(dynamic) for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { for (uint32_t hierarchy_0_block_idx_half = 0; hierarchy_0_block_idx_half < (nof_blocks); hierarchy_0_block_idx_half++) { for (uint32_t hierarchy_0_subntt_idx = 0; hierarchy_0_subntt_idx < (nof_subntts); hierarchy_0_subntt_idx++) { @@ -146,7 +161,7 @@ namespace ntt_cpu { } } if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // All NTT tasks in hierarchy 1 have been executed; now executing the reorder task - #pragma omp parallel for + // #pragma omp parallel for for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { NttTaskCoordinates ntt_task_coordinates(hierarchy_1_layer_idx, hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck, nof_hierarchy_0_layers, 0, 0, true); NttTask task(ntt_task_coordinates, ntt_data); @@ -259,7 +274,7 @@ namespace ntt_cpu { E* current_elements = ntt_data.config.columns_batch ? ntt_data.elements + batch : ntt_data.elements + batch * ntt_data.size; - #pragma omp parallel for + // #pragma omp parallel for for (uint64_t i = 1; i < ntt_data.size; ++i) { uint64_t idx = i; @@ -302,7 +317,7 @@ namespace ntt_cpu { ntt_data.config.columns_batch ? ntt_data.elements + batch : ntt_data.elements + batch * ntt_data.size; E* cur_temp_elements = ntt_data.config.columns_batch ? temp_elements.get() + batch : temp_elements.get() + batch * ntt_data.size; - #pragma omp parallel for collapse(2) + // #pragma omp parallel for collapse(2) for (uint32_t sntt_idx = 0; sntt_idx < nof_sntts; sntt_idx++) { for (uint32_t elem = 0; elem < sntt_size; elem++) { cur_temp_elements[stride * (sntt_idx * sntt_size + elem)] = diff --git a/icicle/backend/cpu/include/ntt_task.h b/icicle/backend/cpu/include/ntt_task.h index e86436f40..315ac6696 100644 --- a/icicle/backend/cpu/include/ntt_task.h +++ b/icicle/backend/cpu/include/ntt_task.h @@ -1276,7 +1276,7 @@ namespace ntt_cpu { uint64_t rev; uint64_t i_mem_idx; uint64_t rev_mem_idx; -#pragma omp parallel for +// #pragma omp parallel for for (uint64_t i = 0; i < subntt_size; ++i) { rev = bit_reverse(i, subntt_log_size); i_mem_idx = idx_in_mem(ntt_task_coordinates, i); diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 3414811a7..73142534d 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -16,6 +16,8 @@ #include "icicle/program/returning_value_program.h" #include "../../icicle/backend/cpu/include/cpu_program_executor.h" #include "test_base.h" +#include +#include using namespace field_config; using namespace icicle; @@ -813,22 +815,23 @@ TEST_F(FieldApiTestBase, polynomialDivision) TYPED_TEST(FieldApiTest, ntt) { // Randomize configuration - const bool inplace = rand_uint_32b(0, 1); - const int logn = rand_uint_32b(3, 17); + for (int logn=3; logn<26; logn++){ + const bool inplace = 0; + // const int logn = 3; const uint64_t N = 1 << logn; - const int log_ntt_domain_size = logn + 1; - const int log_batch_size = rand_uint_32b(0, 2); + const int log_ntt_domain_size = logn; + const int log_batch_size = 0; const int batch_size = 1 << log_batch_size; - const int _ordering = rand_uint_32b(0, 3); + const int _ordering = 0; const Ordering ordering = static_cast(_ordering); - bool columns_batch; - if (logn == 7 || logn < 4) { - columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) - } else { - columns_batch = rand_uint_32b(0, 1); - } - const NTTDir dir = static_cast(rand_uint_32b(0, 1)); // 0: forward, 1: inverse - const int log_coset_stride = rand_uint_32b(0, 2); + bool columns_batch = false; + // if (logn == 7 || logn < 4) { + // columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) + // } else { + // columns_batch = rand_uint_32b(0, 1); + // } + const NTTDir dir = static_cast(0); // 0: forward, 1: inverse + const int log_coset_stride = 0; scalar_t coset_gen; if (log_coset_stride) { coset_gen = scalar_t::omega(logn + log_coset_stride); @@ -846,6 +849,9 @@ TYPED_TEST(FieldApiTest, ntt) const int total_size = N * batch_size; auto scalars = std::make_unique(total_size); TypeParam::rand_host_many(scalars.get(), total_size); + // for (int i = 0; i < total_size; i++) { + // scalars[i] = scalar_t::from(1); + // } auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); @@ -884,7 +890,7 @@ TYPED_TEST(FieldApiTest, ntt) ICICLE_CHECK(ntt(d_in, N, dir, config, d_out)); } } - END_TIMER(NTT_sync, oss.str().c_str(), measure); + END_TIMER_AVERAGE(NTT_sync, oss.str().c_str(), measure, iters); if (inplace) { ICICLE_CHECK(icicle_copy_to_host_async(out, d_in, total_size * sizeof(TypeParam), config.stream)); @@ -899,9 +905,9 @@ TYPED_TEST(FieldApiTest, ntt) }; run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); // warmup run(IcicleTestBase::reference_device(), out_ref.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); - run(IcicleTestBase::main_device(), out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); + run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); -} +}} #endif // NTT // define program @@ -1054,6 +1060,52 @@ TEST_F(FieldApiTestBase, CpuProgramExecutorReturningVal) ASSERT_EQ(0, memcmp(out_element_wise.get(), out_vec_ops.get(), total_size * sizeof(scalar_t))); } +TEST_F(FieldApiTestBase, Taskflow) +{ + constexpr size_t N = 1 << 22; + auto vec1 = std::make_unique(N); + auto vec2 = std::make_unique(N); + auto resultSerial = std::make_unique(N); + auto resultParallel = std::make_unique(N); + scalar_t::rand_host_many(vec1.get(), N); + scalar_t::rand_host_many(vec2.get(), N); + + // Measure time for Serial computation + START_TIMER(Serial) + for (size_t j = 0; j < N; ++j) { + resultSerial[j] = vec1[j] * vec2[j]; + } + auto end = std::chrono::high_resolution_clock::now(); + END_TIMER(Serial, "Serial computation completed in ", true); + + // Measure time for parallel computation + START_TIMER(Parallel) + + tf::Taskflow taskflow; + tf::Executor executor; + + // Number of chunks for parallel processing + size_t num_chunks = (std::thread::hardware_concurrency())<<8; // Adjust based on the number of threads + size_t chunk_size = (N + num_chunks - 1) / num_chunks; + + for (size_t i = 0; i < num_chunks; ++i) { + size_t start_index = i * chunk_size; + size_t end_index = std::min(start_index + chunk_size, N); + + taskflow.emplace([&, start_index, end_index]() { + for (size_t j = start_index; j < end_index; ++j) { + resultParallel[j] = vec1[j] * vec2[j]; + } + }); + } + + executor.run(taskflow).wait(); + + END_TIMER(Parallel, "Parallel computation completed in ", true); + + ASSERT_EQ(0, memcmp(resultSerial.get(), resultParallel.get(), N * sizeof(scalar_t))); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); From 881c1ab9c3318bb741f40f67d9b5206e20af1f6d Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Tue, 31 Dec 2024 11:51:49 +0200 Subject: [PATCH 11/14] Optimized NTT parallel implementation on CPU with Taskflow --- icicle/backend/cpu/include/ntt_cpu.h | 44 ++++++++------ icicle/backend/cpu/include/ntt_data.h | 4 +- icicle/backend/cpu/include/ntt_task.h | 1 - icicle/tests/test_field_api.cpp | 82 ++++++--------------------- 4 files changed, 46 insertions(+), 85 deletions(-) diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index 03b871c1d..452e62f7c 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -95,9 +95,7 @@ namespace ntt_cpu { nof_blocks = 1 << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][0] + ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][1]); nof_subntts = 1; } - // #pragma omp parallel for collapse(2) schedule(dynamic) - size_t num_chunks = std::thread::hardware_concurrency(); // Adjust based on the number of threads - // size_t num_chunks = 2; // Adjust based on the number of threads + size_t num_chunks = (std::thread::hardware_concurrency())<<1; // Adjust based on the number of threads size_t chunk_size = (nof_blocks*nof_subntts + num_chunks - 1) / num_chunks; for (size_t i = 0; i < num_chunks; ++i) { @@ -122,6 +120,8 @@ namespace ntt_cpu { } } } else { + tf::Taskflow taskflow; + tf::Executor executor; for (uint32_t hierarchy_1_layer_idx = 0; hierarchy_1_layer_idx < 2; hierarchy_1_layer_idx++) { const uint32_t sunbtt_plus_batch_logn = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[hierarchy_1_layer_idx] + uint32_t(log2(ntt_data.config.batch_size)); const uint32_t log_nof_hierarchy_1_subntts_todo_in_parallel = (sunbtt_plus_batch_logn < HIERARCHY_1) ? HIERARCHY_1 - sunbtt_plus_batch_logn : 0; @@ -129,7 +129,7 @@ namespace ntt_cpu { const uint32_t log_nof_subntts_chunks = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1 - hierarchy_1_layer_idx] - log_nof_hierarchy_1_subntts_todo_in_parallel; const uint32_t nof_subntts_chunks = 1 << log_nof_subntts_chunks; - for (uint32_t hierarchy_1_subntts_chunck_idx = 0; hierarchy_1_subntts_chunck_idx < nof_subntts_chunks; hierarchy_1_subntts_chunck_idx++) { + for (uint32_t hierarchy_1_subntts_chunk_idx = 0; hierarchy_1_subntts_chunk_idx < nof_subntts_chunks; hierarchy_1_subntts_chunk_idx++) { uint32_t nof_hierarchy_0_layers = (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2] != 0) ? 3 : (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1] != 0) ? 2 : 1; for (uint32_t hierarchy_0_layer_idx = 0; hierarchy_0_layer_idx < nof_hierarchy_0_layers; hierarchy_0_layer_idx++) { @@ -146,27 +146,39 @@ namespace ntt_cpu { nof_subntts = 1; } nof_blocks = nof_blocks>>1; - // #pragma omp parallel for collapse(3) schedule(dynamic) - for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { - for (uint32_t hierarchy_0_block_idx_half = 0; hierarchy_0_block_idx_half < (nof_blocks); hierarchy_0_block_idx_half++) { - for (uint32_t hierarchy_0_subntt_idx = 0; hierarchy_0_subntt_idx < (nof_subntts); hierarchy_0_subntt_idx++) { + size_t num_chunks = (std::thread::hardware_concurrency())<<5; // Adjust based on the number of threads + size_t chunk_size = (nof_blocks*nof_subntts*nof_hierarchy_1_subntts_todo_in_parallel + num_chunks - 1) / num_chunks; + for (size_t i = 0; i < num_chunks; ++i) { + size_t start_index = i * chunk_size; + size_t end_index = std::min(start_index + chunk_size, nof_blocks*nof_subntts*nof_hierarchy_1_subntts_todo_in_parallel); + taskflow.emplace([this, start_index, end_index, nof_blocks, hierarchy_1_layer_idx, hierarchy_1_subntts_chunk_idx, hierarchy_0_layer_idx, + nof_subntts, nof_hierarchy_1_subntts_todo_in_parallel]() { + for (uint32_t j = start_index; j < (end_index); j++) { + uint32_t hierarchy_1_subntt_idx_in_chunk = j / (nof_subntts*nof_blocks); + uint32_t hierarchy_0_block_idx_half = (j / nof_subntts) % nof_blocks; uint32_t hierarchy_0_block_idx = hierarchy_0_block_idx_half<<1; - NttTaskCoordinates ntt_task_coordinates(hierarchy_1_layer_idx, hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck, hierarchy_0_layer_idx, hierarchy_0_block_idx, hierarchy_0_subntt_idx, false); + uint32_t hierarchy_0_subntt_idx = j % nof_subntts; + NttTaskCoordinates ntt_task_coordinates(hierarchy_1_layer_idx, hierarchy_1_subntts_chunk_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunk, hierarchy_0_layer_idx, hierarchy_0_block_idx, hierarchy_0_subntt_idx, false); NttTask task(ntt_task_coordinates, ntt_data); task.execute(); ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx+1; NttTask task_with_elements_in_the_same_cachline(ntt_task_coordinates, ntt_data); task_with_elements_in_the_same_cachline.execute(); } - } + }); } + executor.run(taskflow).wait(); + taskflow.clear(); if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // All NTT tasks in hierarchy 1 have been executed; now executing the reorder task - // #pragma omp parallel for - for (uint32_t hierarchy_1_subntt_idx_in_chunck = 0; hierarchy_1_subntt_idx_in_chunck < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunck++) { - NttTaskCoordinates ntt_task_coordinates(hierarchy_1_layer_idx, hierarchy_1_subntts_chunck_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunck, nof_hierarchy_0_layers, 0, 0, true); - NttTask task(ntt_task_coordinates, ntt_data); - task.execute(); - } + taskflow.emplace([this, hierarchy_1_layer_idx, hierarchy_1_subntts_chunk_idx, nof_hierarchy_1_subntts_todo_in_parallel, nof_hierarchy_0_layers]() { + for (uint32_t hierarchy_1_subntt_idx_in_chunk = 0; hierarchy_1_subntt_idx_in_chunk < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunk++) { + NttTaskCoordinates ntt_task_coordinates(hierarchy_1_layer_idx, hierarchy_1_subntts_chunk_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunk, nof_hierarchy_0_layers, 0, 0, true); + NttTask task(ntt_task_coordinates, ntt_data); + task.execute(); + } + }); + executor.run(taskflow).wait(); + taskflow.clear(); } } } diff --git a/icicle/backend/cpu/include/ntt_data.h b/icicle/backend/cpu/include/ntt_data.h index 75de47840..318be19bb 100644 --- a/icicle/backend/cpu/include/ntt_data.h +++ b/icicle/backend/cpu/include/ntt_data.h @@ -10,7 +10,7 @@ #include #include -#define HIERARCHY_1 25 +#define HIERARCHY_1 26 namespace ntt_cpu { @@ -28,7 +28,7 @@ namespace ntt_cpu { {0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0}, {4, 0, 0}, {5, 0, 0}, {3, 3, 0}, {4, 3, 0}, {4, 4, 0}, {5, 4, 0}, {5, 5, 0}, {4, 4, 3}, {4, 4, 4}, {5, 4, 4}, {5, 5, 4}, {5, 5, 5}, {5, 5, 6}, {5, 5, 7}, {5, 5, 8}, {5, 5, 9}, {5, 5, 10}, {5, 5, 11}, {5, 5, 12}, {5, 5, 13}, - {5, 5, 14}, {5, 5, 15}, {13, 13, 0}, {14, 13, 0}, {14, 14, 0}, {15, 14, 0}, {15, 15, 0}}; + {5, 5, 14}, {5, 5, 15}, {5, 5, 16}, {14, 13, 0}, {14, 14, 0}, {15, 14, 0}, {15, 15, 0}}; /** * @brief Represents the log sizes of sub-NTTs in the NTT computation hierarchy. diff --git a/icicle/backend/cpu/include/ntt_task.h b/icicle/backend/cpu/include/ntt_task.h index 315ac6696..f3aaa72d6 100644 --- a/icicle/backend/cpu/include/ntt_task.h +++ b/icicle/backend/cpu/include/ntt_task.h @@ -1276,7 +1276,6 @@ namespace ntt_cpu { uint64_t rev; uint64_t i_mem_idx; uint64_t rev_mem_idx; -// #pragma omp parallel for for (uint64_t i = 0; i < subntt_size; ++i) { rev = bit_reverse(i, subntt_log_size); i_mem_idx = idx_in_mem(ntt_task_coordinates, i); diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 73142534d..d3491ec41 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -815,23 +815,22 @@ TEST_F(FieldApiTestBase, polynomialDivision) TYPED_TEST(FieldApiTest, ntt) { // Randomize configuration - for (int logn=3; logn<26; logn++){ - const bool inplace = 0; - // const int logn = 3; + const bool inplace = rand_uint_32b(0, 1); + const int logn = rand_uint_32b(3, 17); const uint64_t N = 1 << logn; - const int log_ntt_domain_size = logn; - const int log_batch_size = 0; + const int log_ntt_domain_size = logn + 1; + const int log_batch_size = rand_uint_32b(0, 2); const int batch_size = 1 << log_batch_size; - const int _ordering = 0; + const int _ordering = rand_uint_32b(0, 3); const Ordering ordering = static_cast(_ordering); - bool columns_batch = false; - // if (logn == 7 || logn < 4) { - // columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) - // } else { - // columns_batch = rand_uint_32b(0, 1); - // } - const NTTDir dir = static_cast(0); // 0: forward, 1: inverse - const int log_coset_stride = 0; + bool columns_batch; + if (logn == 7 || logn < 4) { + columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) + } else { + columns_batch = rand_uint_32b(0, 1); + } + const NTTDir dir = static_cast(rand_uint_32b(0, 1)); // 0: forward, 1: inverse + const int log_coset_stride = rand_uint_32b(0, 2); scalar_t coset_gen; if (log_coset_stride) { coset_gen = scalar_t::omega(logn + log_coset_stride); @@ -849,9 +848,6 @@ TYPED_TEST(FieldApiTest, ntt) const int total_size = N * batch_size; auto scalars = std::make_unique(total_size); TypeParam::rand_host_many(scalars.get(), total_size); - // for (int i = 0; i < total_size; i++) { - // scalars[i] = scalar_t::from(1); - // } auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); @@ -890,7 +886,7 @@ TYPED_TEST(FieldApiTest, ntt) ICICLE_CHECK(ntt(d_in, N, dir, config, d_out)); } } - END_TIMER_AVERAGE(NTT_sync, oss.str().c_str(), measure, iters); + END_TIMER(NTT_sync, oss.str().c_str(), measure); if (inplace) { ICICLE_CHECK(icicle_copy_to_host_async(out, d_in, total_size * sizeof(TypeParam), config.stream)); @@ -905,9 +901,9 @@ TYPED_TEST(FieldApiTest, ntt) }; run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); // warmup run(IcicleTestBase::reference_device(), out_ref.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); - run(IcicleTestBase::main_device(), out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); + run(IcicleTestBase::main_device(), out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); -}} +} #endif // NTT // define program @@ -1060,52 +1056,6 @@ TEST_F(FieldApiTestBase, CpuProgramExecutorReturningVal) ASSERT_EQ(0, memcmp(out_element_wise.get(), out_vec_ops.get(), total_size * sizeof(scalar_t))); } -TEST_F(FieldApiTestBase, Taskflow) -{ - constexpr size_t N = 1 << 22; - auto vec1 = std::make_unique(N); - auto vec2 = std::make_unique(N); - auto resultSerial = std::make_unique(N); - auto resultParallel = std::make_unique(N); - scalar_t::rand_host_many(vec1.get(), N); - scalar_t::rand_host_many(vec2.get(), N); - - // Measure time for Serial computation - START_TIMER(Serial) - for (size_t j = 0; j < N; ++j) { - resultSerial[j] = vec1[j] * vec2[j]; - } - auto end = std::chrono::high_resolution_clock::now(); - END_TIMER(Serial, "Serial computation completed in ", true); - - // Measure time for parallel computation - START_TIMER(Parallel) - - tf::Taskflow taskflow; - tf::Executor executor; - - // Number of chunks for parallel processing - size_t num_chunks = (std::thread::hardware_concurrency())<<8; // Adjust based on the number of threads - size_t chunk_size = (N + num_chunks - 1) / num_chunks; - - for (size_t i = 0; i < num_chunks; ++i) { - size_t start_index = i * chunk_size; - size_t end_index = std::min(start_index + chunk_size, N); - - taskflow.emplace([&, start_index, end_index]() { - for (size_t j = start_index; j < end_index; ++j) { - resultParallel[j] = vec1[j] * vec2[j]; - } - }); - } - - executor.run(taskflow).wait(); - - END_TIMER(Parallel, "Parallel computation completed in ", true); - - ASSERT_EQ(0, memcmp(resultSerial.get(), resultParallel.get(), N * sizeof(scalar_t))); -} - int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); From 991e00c301be59d8e659f1476f29656ccc2621e9 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Tue, 31 Dec 2024 11:56:13 +0200 Subject: [PATCH 12/14] format --- icicle/backend/cpu/include/ntt_cpu.h | 118 +++++++++++++++++-------- icicle/backend/cpu/include/ntt_data.h | 8 +- icicle/backend/cpu/include/ntt_utils.h | 29 +++--- 3 files changed, 96 insertions(+), 59 deletions(-) diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index 452e62f7c..1195664b4 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -63,10 +63,10 @@ namespace ntt_cpu { NttCpu::NttCpu(uint32_t logn, NTTDir direction, const NTTConfig& config, const E* input, E* output) : input(input), output(output), ntt_data(logn, output, config, direction, compute_if_is_parallel(logn, config)) { - if (logn > HIERARCHY_1) { - // Allocate temporary storage to handle reordering - temp_elements = std::make_unique(ntt_data.size * config.batch_size); - } + if (logn > HIERARCHY_1) { + // Allocate temporary storage to handle reordering + temp_elements = std::make_unique(ntt_data.size * config.batch_size); + } } template @@ -78,11 +78,14 @@ namespace ntt_cpu { NttTaskCoordinates ntt_task_coordinates(0, 0, 0, 0, 0, false); NttTask task(ntt_task_coordinates, ntt_data); task.execute(); - } else if (__builtin_expect((ntt_data.logn <= HIERARCHY_1),1)){ + } else if (__builtin_expect((ntt_data.logn <= HIERARCHY_1), 1)) { tf::Taskflow taskflow; tf::Executor executor; - uint32_t nof_hierarchy_0_layers = (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][2] != 0) ? 3 : (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][1] != 0) ? 2 : 1; - for (uint32_t hierarchy_0_layer_idx = 0; hierarchy_0_layer_idx < nof_hierarchy_0_layers; hierarchy_0_layer_idx++) { + uint32_t nof_hierarchy_0_layers = (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][2] != 0) ? 3 + : (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][1] != 0) ? 2 + : 1; + for (uint32_t hierarchy_0_layer_idx = 0; hierarchy_0_layer_idx < nof_hierarchy_0_layers; + hierarchy_0_layer_idx++) { uint64_t nof_blocks; uint64_t nof_subntts; if (hierarchy_0_layer_idx == 0) { @@ -92,28 +95,35 @@ namespace ntt_cpu { nof_blocks = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][2]; nof_subntts = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][0]; } else { - nof_blocks = 1 << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][0] + ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][1]); + nof_blocks = 1 + << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][0] + + ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[0][1]); nof_subntts = 1; } - size_t num_chunks = (std::thread::hardware_concurrency())<<1; // Adjust based on the number of threads - size_t chunk_size = (nof_blocks*nof_subntts + num_chunks - 1) / num_chunks; + size_t num_chunks = (std::thread::hardware_concurrency()) << 1; // Adjust based on the number of threads + size_t chunk_size = (nof_blocks * nof_subntts + num_chunks - 1) / num_chunks; for (size_t i = 0; i < num_chunks; ++i) { size_t start_index = i * chunk_size; - size_t end_index = std::min(start_index + chunk_size, nof_blocks*nof_subntts); + size_t end_index = std::min(start_index + chunk_size, nof_blocks * nof_subntts); taskflow.emplace([&, hierarchy_0_layer_idx, start_index, end_index, nof_subntts]() { for (uint32_t j = start_index; j < (end_index); j++) { uint32_t hierarchy_0_block_idx = j / nof_subntts; uint32_t hierarchy_0_subntt_idx = j % nof_subntts; - NttTaskCoordinates ntt_task_coordinates(0, 0, hierarchy_0_layer_idx, hierarchy_0_block_idx, hierarchy_0_subntt_idx, false); + NttTaskCoordinates ntt_task_coordinates( + 0, 0, hierarchy_0_layer_idx, hierarchy_0_block_idx, hierarchy_0_subntt_idx, false); NttTask task(ntt_task_coordinates, ntt_data); task.execute(); } }); } executor.run(taskflow).wait(); - taskflow.clear(); - if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // All NTT tasks in hierarchy 1 have been executed; now executing the reorder task + taskflow.clear(); + if ((hierarchy_0_layer_idx != 0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // All NTT tasks in + // hierarchy 1 have + // been executed; + // now executing + // the reorder task NttTaskCoordinates ntt_task_coordinates(0, 0, hierarchy_0_layer_idx, 0, 0, true); NttTask task(ntt_task_coordinates, ntt_data); task.execute(); @@ -123,16 +133,25 @@ namespace ntt_cpu { tf::Taskflow taskflow; tf::Executor executor; for (uint32_t hierarchy_1_layer_idx = 0; hierarchy_1_layer_idx < 2; hierarchy_1_layer_idx++) { - const uint32_t sunbtt_plus_batch_logn = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[hierarchy_1_layer_idx] + uint32_t(log2(ntt_data.config.batch_size)); - const uint32_t log_nof_hierarchy_1_subntts_todo_in_parallel = (sunbtt_plus_batch_logn < HIERARCHY_1) ? HIERARCHY_1 - sunbtt_plus_batch_logn : 0; + const uint32_t sunbtt_plus_batch_logn = + ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[hierarchy_1_layer_idx] + + uint32_t(log2(ntt_data.config.batch_size)); + const uint32_t log_nof_hierarchy_1_subntts_todo_in_parallel = + (sunbtt_plus_batch_logn < HIERARCHY_1) ? HIERARCHY_1 - sunbtt_plus_batch_logn : 0; const uint32_t nof_hierarchy_1_subntts_todo_in_parallel = 1 << log_nof_hierarchy_1_subntts_todo_in_parallel; - const uint32_t log_nof_subntts_chunks = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1 - hierarchy_1_layer_idx] - log_nof_hierarchy_1_subntts_todo_in_parallel; + const uint32_t log_nof_subntts_chunks = + ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1 - hierarchy_1_layer_idx] - + log_nof_hierarchy_1_subntts_todo_in_parallel; const uint32_t nof_subntts_chunks = 1 << log_nof_subntts_chunks; - - for (uint32_t hierarchy_1_subntts_chunk_idx = 0; hierarchy_1_subntts_chunk_idx < nof_subntts_chunks; hierarchy_1_subntts_chunk_idx++) { - - uint32_t nof_hierarchy_0_layers = (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2] != 0) ? 3 : (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1] != 0) ? 2 : 1; - for (uint32_t hierarchy_0_layer_idx = 0; hierarchy_0_layer_idx < nof_hierarchy_0_layers; hierarchy_0_layer_idx++) { + + for (uint32_t hierarchy_1_subntts_chunk_idx = 0; hierarchy_1_subntts_chunk_idx < nof_subntts_chunks; + hierarchy_1_subntts_chunk_idx++) { + uint32_t nof_hierarchy_0_layers = + (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2] != 0) ? 3 + : (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1] != 0) ? 2 + : 1; + for (uint32_t hierarchy_0_layer_idx = 0; hierarchy_0_layer_idx < nof_hierarchy_0_layers; + hierarchy_0_layer_idx++) { uint64_t nof_blocks; uint64_t nof_subntts; if (hierarchy_0_layer_idx == 0) { @@ -142,26 +161,35 @@ namespace ntt_cpu { nof_blocks = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][2]; nof_subntts = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0]; } else { - nof_blocks = 1 << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0] + ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]); + nof_blocks = 1 + << (ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][0] + + ntt_data.ntt_sub_hierarchies.hierarchy_0_layers_sub_logn[hierarchy_1_layer_idx][1]); nof_subntts = 1; } - nof_blocks = nof_blocks>>1; - size_t num_chunks = (std::thread::hardware_concurrency())<<5; // Adjust based on the number of threads - size_t chunk_size = (nof_blocks*nof_subntts*nof_hierarchy_1_subntts_todo_in_parallel + num_chunks - 1) / num_chunks; + nof_blocks = nof_blocks >> 1; + size_t num_chunks = (std::thread::hardware_concurrency()) << 5; // Adjust based on the number of threads + size_t chunk_size = + (nof_blocks * nof_subntts * nof_hierarchy_1_subntts_todo_in_parallel + num_chunks - 1) / num_chunks; for (size_t i = 0; i < num_chunks; ++i) { size_t start_index = i * chunk_size; - size_t end_index = std::min(start_index + chunk_size, nof_blocks*nof_subntts*nof_hierarchy_1_subntts_todo_in_parallel); - taskflow.emplace([this, start_index, end_index, nof_blocks, hierarchy_1_layer_idx, hierarchy_1_subntts_chunk_idx, hierarchy_0_layer_idx, - nof_subntts, nof_hierarchy_1_subntts_todo_in_parallel]() { + size_t end_index = + std::min(start_index + chunk_size, nof_blocks * nof_subntts * nof_hierarchy_1_subntts_todo_in_parallel); + taskflow.emplace([this, start_index, end_index, nof_blocks, hierarchy_1_layer_idx, + hierarchy_1_subntts_chunk_idx, hierarchy_0_layer_idx, nof_subntts, + nof_hierarchy_1_subntts_todo_in_parallel]() { for (uint32_t j = start_index; j < (end_index); j++) { - uint32_t hierarchy_1_subntt_idx_in_chunk = j / (nof_subntts*nof_blocks); + uint32_t hierarchy_1_subntt_idx_in_chunk = j / (nof_subntts * nof_blocks); uint32_t hierarchy_0_block_idx_half = (j / nof_subntts) % nof_blocks; - uint32_t hierarchy_0_block_idx = hierarchy_0_block_idx_half<<1; + uint32_t hierarchy_0_block_idx = hierarchy_0_block_idx_half << 1; uint32_t hierarchy_0_subntt_idx = j % nof_subntts; - NttTaskCoordinates ntt_task_coordinates(hierarchy_1_layer_idx, hierarchy_1_subntts_chunk_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunk, hierarchy_0_layer_idx, hierarchy_0_block_idx, hierarchy_0_subntt_idx, false); + NttTaskCoordinates ntt_task_coordinates( + hierarchy_1_layer_idx, + hierarchy_1_subntts_chunk_idx * nof_hierarchy_1_subntts_todo_in_parallel + + hierarchy_1_subntt_idx_in_chunk, + hierarchy_0_layer_idx, hierarchy_0_block_idx, hierarchy_0_subntt_idx, false); NttTask task(ntt_task_coordinates, ntt_data); task.execute(); - ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx+1; + ntt_task_coordinates.hierarchy_0_block_idx = hierarchy_0_block_idx + 1; NttTask task_with_elements_in_the_same_cachline(ntt_task_coordinates, ntt_data); task_with_elements_in_the_same_cachline.execute(); } @@ -169,10 +197,21 @@ namespace ntt_cpu { } executor.run(taskflow).wait(); taskflow.clear(); - if ((hierarchy_0_layer_idx !=0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // All NTT tasks in hierarchy 1 have been executed; now executing the reorder task - taskflow.emplace([this, hierarchy_1_layer_idx, hierarchy_1_subntts_chunk_idx, nof_hierarchy_1_subntts_todo_in_parallel, nof_hierarchy_0_layers]() { - for (uint32_t hierarchy_1_subntt_idx_in_chunk = 0; hierarchy_1_subntt_idx_in_chunk < nof_hierarchy_1_subntts_todo_in_parallel; hierarchy_1_subntt_idx_in_chunk++) { - NttTaskCoordinates ntt_task_coordinates(hierarchy_1_layer_idx, hierarchy_1_subntts_chunk_idx * nof_hierarchy_1_subntts_todo_in_parallel + hierarchy_1_subntt_idx_in_chunk, nof_hierarchy_0_layers, 0, 0, true); + if ( + (hierarchy_0_layer_idx != 0) && + (hierarchy_0_layer_idx == + nof_hierarchy_0_layers - + 1)) { // All NTT tasks in hierarchy 1 have been executed; now executing the reorder task + taskflow.emplace([this, hierarchy_1_layer_idx, hierarchy_1_subntts_chunk_idx, + nof_hierarchy_1_subntts_todo_in_parallel, nof_hierarchy_0_layers]() { + for (uint32_t hierarchy_1_subntt_idx_in_chunk = 0; + hierarchy_1_subntt_idx_in_chunk < nof_hierarchy_1_subntts_todo_in_parallel; + hierarchy_1_subntt_idx_in_chunk++) { + NttTaskCoordinates ntt_task_coordinates( + hierarchy_1_layer_idx, + hierarchy_1_subntts_chunk_idx * nof_hierarchy_1_subntts_todo_in_parallel + + hierarchy_1_subntt_idx_in_chunk, + nof_hierarchy_0_layers, 0, 0, true); NttTask task(ntt_task_coordinates, ntt_data); task.execute(); } @@ -222,7 +261,8 @@ namespace ntt_cpu { // no reordering needed, and input and output are the same return; } - // Allocate temporary storage to handle in-place reordering, can't be done inplace when input and output are the same + // Allocate temporary storage to handle in-place reordering, can't be done inplace when input and output are the + // same temp_storage = std::make_unique(total_memory_size); temp_output = temp_storage.get(); } @@ -285,7 +325,7 @@ namespace ntt_cpu { for (uint32_t batch = 0; batch < ntt_data.config.batch_size; ++batch) { E* current_elements = ntt_data.config.columns_batch ? ntt_data.elements + batch : ntt_data.elements + batch * ntt_data.size; - + // #pragma omp parallel for for (uint64_t i = 1; i < ntt_data.size; ++i) { uint64_t idx = i; diff --git a/icicle/backend/cpu/include/ntt_data.h b/icicle/backend/cpu/include/ntt_data.h index 318be19bb..3eb830be7 100644 --- a/icicle/backend/cpu/include/ntt_data.h +++ b/icicle/backend/cpu/include/ntt_data.h @@ -25,10 +25,10 @@ namespace ntt_cpu { * layer, 13 for the second, and 0 for the third. */ constexpr uint32_t layers_sub_logn[31][3] = { - {0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0}, {4, 0, 0}, {5, 0, 0}, {3, 3, 0}, {4, 3, 0}, - {4, 4, 0}, {5, 4, 0}, {5, 5, 0}, {4, 4, 3}, {4, 4, 4}, {5, 4, 4}, {5, 5, 4}, {5, 5, 5}, - {5, 5, 6}, {5, 5, 7}, {5, 5, 8}, {5, 5, 9}, {5, 5, 10}, {5, 5, 11}, {5, 5, 12}, {5, 5, 13}, - {5, 5, 14}, {5, 5, 15}, {5, 5, 16}, {14, 13, 0}, {14, 14, 0}, {15, 14, 0}, {15, 15, 0}}; + {0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0}, {4, 0, 0}, {5, 0, 0}, {3, 3, 0}, {4, 3, 0}, + {4, 4, 0}, {5, 4, 0}, {5, 5, 0}, {4, 4, 3}, {4, 4, 4}, {5, 4, 4}, {5, 5, 4}, {5, 5, 5}, + {5, 5, 6}, {5, 5, 7}, {5, 5, 8}, {5, 5, 9}, {5, 5, 10}, {5, 5, 11}, {5, 5, 12}, {5, 5, 13}, + {5, 5, 14}, {5, 5, 15}, {5, 5, 16}, {14, 13, 0}, {14, 14, 0}, {15, 14, 0}, {15, 15, 0}}; /** * @brief Represents the log sizes of sub-NTTs in the NTT computation hierarchy. diff --git a/icicle/backend/cpu/include/ntt_utils.h b/icicle/backend/cpu/include/ntt_utils.h index 57128b072..e84df813a 100644 --- a/icicle/backend/cpu/include/ntt_utils.h +++ b/icicle/backend/cpu/include/ntt_utils.h @@ -34,29 +34,26 @@ namespace ntt_cpu { hierarchy_1_subntt_idx == other.hierarchy_1_subntt_idx && hierarchy_0_layer_idx == other.hierarchy_0_layer_idx && hierarchy_0_block_idx == other.hierarchy_0_block_idx && - hierarchy_0_subntt_idx == other.hierarchy_0_subntt_idx && - reorder == other.reorder; + hierarchy_0_subntt_idx == other.hierarchy_0_subntt_idx && reorder == other.reorder; } // Default constructor NttTaskCoordinates() = default; // Constructor with parameters - NttTaskCoordinates(uint32_t h1_layer_idx, - uint32_t h1_subntt_idx, - uint32_t h0_layer_idx, - uint32_t h0_block_idx, - uint32_t h0_subntt_idx, - bool reorder_flag = false) - : hierarchy_1_layer_idx(h1_layer_idx), - hierarchy_1_subntt_idx(h1_subntt_idx), - hierarchy_0_layer_idx(h0_layer_idx), - hierarchy_0_block_idx(h0_block_idx), - hierarchy_0_subntt_idx(h0_subntt_idx), - reorder(reorder_flag) - {} + NttTaskCoordinates( + uint32_t h1_layer_idx, + uint32_t h1_subntt_idx, + uint32_t h0_layer_idx, + uint32_t h0_block_idx, + uint32_t h0_subntt_idx, + bool reorder_flag = false) + : hierarchy_1_layer_idx(h1_layer_idx), hierarchy_1_subntt_idx(h1_subntt_idx), + hierarchy_0_layer_idx(h0_layer_idx), hierarchy_0_block_idx(h0_block_idx), + hierarchy_0_subntt_idx(h0_subntt_idx), reorder(reorder_flag) + { + } }; - uint64_t bit_reverse(uint64_t i, uint32_t logn) { From 63c0abd533b42fdc82a2b82b34c6591232637c4c Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Tue, 31 Dec 2024 16:51:08 +0200 Subject: [PATCH 13/14] Parallelize reorder and coset operations with TaskFlow. Remove redundant includes. Fix review comments. --- icicle/backend/cpu/include/cpu_ntt_domain.h | 8 -- icicle/backend/cpu/include/cpu_ntt_main.h | 3 - icicle/backend/cpu/include/ntt_cpu.h | 89 +++++++++++++-------- icicle/backend/cpu/include/ntt_data.h | 5 -- icicle/backend/cpu/include/ntt_task.h | 2 - icicle/backend/cpu/include/ntt_utils.h | 3 - icicle/tests/test_field_api.cpp | 2 - 7 files changed, 55 insertions(+), 57 deletions(-) diff --git a/icicle/backend/cpu/include/cpu_ntt_domain.h b/icicle/backend/cpu/include/cpu_ntt_domain.h index 3cfe88018..7d3c0d413 100644 --- a/icicle/backend/cpu/include/cpu_ntt_domain.h +++ b/icicle/backend/cpu/include/cpu_ntt_domain.h @@ -1,22 +1,14 @@ #pragma once #include "icicle/backend/ntt_backend.h" #include "icicle/errors.h" -#include "icicle/runtime.h" #include "icicle/utils/log.h" -#include "icicle/fields/field_config.h" -#include "icicle/vec_ops.h" -#include -#include -#include #include -#include #include #include #include #include -using namespace field_config; using namespace icicle; namespace ntt_cpu { diff --git a/icicle/backend/cpu/include/cpu_ntt_main.h b/icicle/backend/cpu/include/cpu_ntt_main.h index bd2207189..7a00fb5d5 100644 --- a/icicle/backend/cpu/include/cpu_ntt_main.h +++ b/icicle/backend/cpu/include/cpu_ntt_main.h @@ -1,9 +1,6 @@ #pragma once -#include "icicle/utils/log.h" #include "ntt_cpu.h" -#include -using namespace field_config; using namespace icicle; /** diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index 1195664b4..1a4b4ecd3 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -1,12 +1,9 @@ #pragma once #include "icicle/errors.h" -#include "icicle/utils/log.h" #include "ntt_task.h" #include "ntt_utils.h" #include #include -#include -#include #ifdef CURVE_ID #include "icicle/curves/curve_config.h" @@ -105,7 +102,7 @@ namespace ntt_cpu { for (size_t i = 0; i < num_chunks; ++i) { size_t start_index = i * chunk_size; - size_t end_index = std::min(start_index + chunk_size, nof_blocks * nof_subntts); + size_t end_index = std::min(start_index + chunk_size, static_cast(nof_blocks * nof_subntts)); taskflow.emplace([&, hierarchy_0_layer_idx, start_index, end_index, nof_subntts]() { for (uint32_t j = start_index; j < (end_index); j++) { uint32_t hierarchy_0_block_idx = j / nof_subntts; @@ -117,7 +114,7 @@ namespace ntt_cpu { } }); } - executor.run(taskflow).wait(); + executor.run(taskflow).wait(); // TODO: Explore using task dependencies to optimize parallel execution taskflow.clear(); if ((hierarchy_0_layer_idx != 0) && (hierarchy_0_layer_idx == nof_hierarchy_0_layers - 1)) { // All NTT tasks in // hierarchy 1 have @@ -173,7 +170,7 @@ namespace ntt_cpu { for (size_t i = 0; i < num_chunks; ++i) { size_t start_index = i * chunk_size; size_t end_index = - std::min(start_index + chunk_size, nof_blocks * nof_subntts * nof_hierarchy_1_subntts_todo_in_parallel); + std::min(start_index + chunk_size, static_cast(nof_blocks * nof_subntts * nof_hierarchy_1_subntts_todo_in_parallel)); taskflow.emplace([this, start_index, end_index, nof_blocks, hierarchy_1_layer_idx, hierarchy_1_subntts_chunk_idx, hierarchy_0_layer_idx, nof_subntts, nof_hierarchy_1_subntts_todo_in_parallel]() { @@ -321,35 +318,48 @@ namespace ntt_cpu { uint32_t batch_stride = ntt_data.config.columns_batch ? ntt_data.config.batch_size : 1; const bool needs_reorder_input = ntt_data.direction == NTTDir::kForward && (ntt_data.logn > HIERARCHY_1); const S* twiddles = CpuNttDomain::s_ntt_domain.get_twiddles(); + tf::Executor executor; + tf::Taskflow taskflow; for (uint32_t batch = 0; batch < ntt_data.config.batch_size; ++batch) { E* current_elements = ntt_data.config.columns_batch ? ntt_data.elements + batch : ntt_data.elements + batch * ntt_data.size; - // #pragma omp parallel for - for (uint64_t i = 1; i < ntt_data.size; ++i) { - uint64_t idx = i; - - // Adjust the index if reorder logic was applied on the input - if (needs_reorder_input) { - uint32_t cur_ntt_log_size = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[0]; - uint32_t next_ntt_log_size = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1]; - uint32_t subntt_idx = i >> cur_ntt_log_size; - uint32_t element = i & ((1 << cur_ntt_log_size) - 1); - idx = subntt_idx + (element << next_ntt_log_size); - } + size_t num_chunks = (std::thread::hardware_concurrency()) << 1; // Adjust based on the number of threads + size_t chunk_size = (ntt_data.size + num_chunks - 1) / num_chunks; + + for (size_t cunk = 0; cunk < num_chunks; ++cunk) { + size_t start_index = cunk * chunk_size; + size_t end_index = std::min(start_index + chunk_size, static_cast(ntt_data.size)); + taskflow.emplace([&, needs_reorder_input, twiddles, current_elements, batch_stride, + start_index, end_index]() { + for (uint64_t i = start_index; i < end_index; i++) { + uint64_t idx = i; + + // Adjust the index if reorder logic was applied on the input + if (needs_reorder_input) { + uint32_t cur_ntt_log_size = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[0]; + uint32_t next_ntt_log_size = ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[1]; + uint32_t subntt_idx = i >> cur_ntt_log_size; + uint32_t element = i & ((1 << cur_ntt_log_size) - 1); + idx = subntt_idx + (element << next_ntt_log_size); + } - // Apply coset multiplication based on the available coset information - if (ntt_data.arbitrary_coset) { - current_elements[batch_stride * i] = current_elements[batch_stride * i] * ntt_data.arbitrary_coset[idx]; - } else { - uint32_t twiddle_idx = ntt_data.coset_stride * idx; - twiddle_idx = ntt_data.direction == NTTDir::kForward - ? twiddle_idx - : CpuNttDomain::s_ntt_domain.get_max_size() - twiddle_idx; - current_elements[batch_stride * i] = current_elements[batch_stride * i] * twiddles[twiddle_idx]; - } + // Apply coset multiplication based on the available coset information + if (ntt_data.arbitrary_coset) { + current_elements[batch_stride * i] = current_elements[batch_stride * i] * ntt_data.arbitrary_coset[idx]; + } else { + uint32_t twiddle_idx = ntt_data.coset_stride * idx; + twiddle_idx = ntt_data.direction == NTTDir::kForward + ? twiddle_idx + : CpuNttDomain::s_ntt_domain.get_max_size() - twiddle_idx; + current_elements[batch_stride * i] = current_elements[batch_stride * i] * twiddles[twiddle_idx]; + } + } + }); } + executor.run(taskflow).wait(); + taskflow.clear(); } } @@ -363,19 +373,30 @@ namespace ntt_cpu { const uint32_t nof_sntts = 1 << ntt_data.ntt_sub_hierarchies.hierarchy_1_layers_sub_logn[0]; const uint32_t stride = ntt_data.config.columns_batch ? ntt_data.config.batch_size : 1; const uint64_t temp_elements_size = ntt_data.size * ntt_data.config.batch_size; + tf::Executor executor; + tf::Taskflow taskflow; for (uint32_t batch = 0; batch < ntt_data.config.batch_size; ++batch) { E* cur_layer_output = ntt_data.config.columns_batch ? ntt_data.elements + batch : ntt_data.elements + batch * ntt_data.size; E* cur_temp_elements = ntt_data.config.columns_batch ? temp_elements.get() + batch : temp_elements.get() + batch * ntt_data.size; - // #pragma omp parallel for collapse(2) - for (uint32_t sntt_idx = 0; sntt_idx < nof_sntts; sntt_idx++) { - for (uint32_t elem = 0; elem < sntt_size; elem++) { - cur_temp_elements[stride * (sntt_idx * sntt_size + elem)] = - cur_layer_output[stride * (elem * nof_sntts + sntt_idx)]; - } + size_t num_chunks = std::thread::hardware_concurrency(); // Adjust based on the number of threads + size_t chunk_size = (static_cast(nof_sntts * sntt_size) + num_chunks - 1) / num_chunks; + for (size_t chunk = 0; chunk < num_chunks; ++chunk) { + size_t start_index = chunk * chunk_size; + size_t end_index = std::min(start_index + chunk_size, static_cast(nof_sntts * sntt_size)); + taskflow.emplace([this, stride, cur_layer_output, cur_temp_elements, nof_sntts, sntt_size, start_index, end_index]() { + for (size_t j = start_index; j < end_index; ++j) { + uint32_t sntt_idx = j / sntt_size; + uint32_t elem = j % sntt_size; + cur_temp_elements[stride * (sntt_idx * sntt_size + elem)] = + cur_layer_output[stride * (elem * nof_sntts + sntt_idx)]; + } + }); } + executor.run(taskflow).wait(); + taskflow.clear(); } ntt_data.elements = temp_elements.get(); } diff --git a/icicle/backend/cpu/include/ntt_data.h b/icicle/backend/cpu/include/ntt_data.h index 3eb830be7..68a05fa17 100644 --- a/icicle/backend/cpu/include/ntt_data.h +++ b/icicle/backend/cpu/include/ntt_data.h @@ -1,13 +1,8 @@ #pragma once -#include "icicle/utils/log.h" -#include "tasks_manager.h" #include "cpu_ntt_domain.h" -// #include <_types/_uint32_t.h> -#include #include #include -#include #include #define HIERARCHY_1 26 diff --git a/icicle/backend/cpu/include/ntt_task.h b/icicle/backend/cpu/include/ntt_task.h index f3aaa72d6..78473e792 100644 --- a/icicle/backend/cpu/include/ntt_task.h +++ b/icicle/backend/cpu/include/ntt_task.h @@ -4,9 +4,7 @@ #include "ntt_utils.h" #include "ntt_data.h" #include -#include -using namespace field_config; using namespace icicle; namespace ntt_cpu { diff --git a/icicle/backend/cpu/include/ntt_utils.h b/icicle/backend/cpu/include/ntt_utils.h index e84df813a..982e02894 100644 --- a/icicle/backend/cpu/include/ntt_utils.h +++ b/icicle/backend/cpu/include/ntt_utils.h @@ -1,10 +1,7 @@ #pragma once -#include "icicle/fields/field_config.h" -// #include <_types/_uint32_t.h> #include -using namespace field_config; using namespace icicle; namespace ntt_cpu { diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index d3491ec41..3414811a7 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -16,8 +16,6 @@ #include "icicle/program/returning_value_program.h" #include "../../icicle/backend/cpu/include/cpu_program_executor.h" #include "test_base.h" -#include -#include using namespace field_config; using namespace icicle; From 9cebf3db1cad7edd9b8568a278c25079d5382ac6 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Tue, 31 Dec 2024 16:52:06 +0200 Subject: [PATCH 14/14] format --- icicle/backend/cpu/include/ntt_cpu.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h index 1a4b4ecd3..b6198d44e 100644 --- a/icicle/backend/cpu/include/ntt_cpu.h +++ b/icicle/backend/cpu/include/ntt_cpu.h @@ -169,8 +169,9 @@ namespace ntt_cpu { (nof_blocks * nof_subntts * nof_hierarchy_1_subntts_todo_in_parallel + num_chunks - 1) / num_chunks; for (size_t i = 0; i < num_chunks; ++i) { size_t start_index = i * chunk_size; - size_t end_index = - std::min(start_index + chunk_size, static_cast(nof_blocks * nof_subntts * nof_hierarchy_1_subntts_todo_in_parallel)); + size_t end_index = std::min( + start_index + chunk_size, + static_cast(nof_blocks * nof_subntts * nof_hierarchy_1_subntts_todo_in_parallel)); taskflow.emplace([this, start_index, end_index, nof_blocks, hierarchy_1_layer_idx, hierarchy_1_subntts_chunk_idx, hierarchy_0_layer_idx, nof_subntts, nof_hierarchy_1_subntts_todo_in_parallel]() { @@ -331,8 +332,7 @@ namespace ntt_cpu { for (size_t cunk = 0; cunk < num_chunks; ++cunk) { size_t start_index = cunk * chunk_size; size_t end_index = std::min(start_index + chunk_size, static_cast(ntt_data.size)); - taskflow.emplace([&, needs_reorder_input, twiddles, current_elements, batch_stride, - start_index, end_index]() { + taskflow.emplace([&, needs_reorder_input, twiddles, current_elements, batch_stride, start_index, end_index]() { for (uint64_t i = start_index; i < end_index; i++) { uint64_t idx = i; @@ -382,18 +382,19 @@ namespace ntt_cpu { E* cur_temp_elements = ntt_data.config.columns_batch ? temp_elements.get() + batch : temp_elements.get() + batch * ntt_data.size; size_t num_chunks = std::thread::hardware_concurrency(); // Adjust based on the number of threads - size_t chunk_size = (static_cast(nof_sntts * sntt_size) + num_chunks - 1) / num_chunks; + size_t chunk_size = (static_cast(nof_sntts * sntt_size) + num_chunks - 1) / num_chunks; for (size_t chunk = 0; chunk < num_chunks; ++chunk) { size_t start_index = chunk * chunk_size; size_t end_index = std::min(start_index + chunk_size, static_cast(nof_sntts * sntt_size)); - taskflow.emplace([this, stride, cur_layer_output, cur_temp_elements, nof_sntts, sntt_size, start_index, end_index]() { - for (size_t j = start_index; j < end_index; ++j) { + taskflow.emplace( + [this, stride, cur_layer_output, cur_temp_elements, nof_sntts, sntt_size, start_index, end_index]() { + for (size_t j = start_index; j < end_index; ++j) { uint32_t sntt_idx = j / sntt_size; uint32_t elem = j % sntt_size; cur_temp_elements[stride * (sntt_idx * sntt_size + elem)] = cur_layer_output[stride * (elem * nof_sntts + sntt_idx)]; - } - }); + } + }); } executor.run(taskflow).wait(); taskflow.clear();