Skip to content

Commit

Permalink
Optimized NTT parallel implementation on CPU with Taskflow (#714)
Browse files Browse the repository at this point in the history
  • Loading branch information
ShanieWinitz authored Jan 1, 2025
1 parent 287dcd2 commit 0ed301f
Show file tree
Hide file tree
Showing 9 changed files with 355 additions and 851 deletions.
1 change: 1 addition & 0 deletions icicle/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ add_library(icicle_device SHARED
src/config_extension.cpp
)
# Link the platform's dynamic-loading library. CMAKE_DL_LIBS holds the name of
# the library that provides dlopen/dlclose on the current platform and is empty
# where no separate library is required, so it is intentionally left unquoted
# (an empty value must expand to no argument at all).
target_link_libraries(icicle_device PUBLIC ${CMAKE_DL_LIBS})

include_directories(include)

# Define the install directory (default is /usr/local)
Expand Down
20 changes: 20 additions & 0 deletions icicle/backend/cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,25 @@
cmake_minimum_required(VERSION 3.18)

# Fetch Taskflow v3.8.0 at configure time and expose its headers to consumers
# of icicle_device. The first configure needs network access to clone the repo.
message(STATUS "Fetching Taskflow v3.8.0 (CPU backend)")
include(FetchContent)
FetchContent_Declare(
  Taskflow
  GIT_REPOSITORY https://github.com/taskflow/taskflow.git
  GIT_TAG v3.8.0
  GIT_SHALLOW TRUE
)

# Default Taskflow's optional components to OFF. These must be set before
# FetchContent_MakeAvailable() so they are already in the cache when Taskflow's
# own CMakeLists.txt is processed. FORCE is deliberately not used: a value the
# user passed explicitly (e.g. -DTF_BUILD_TESTS=ON) is left untouched.
set(TF_BUILD_BENCHMARKS OFF CACHE BOOL "Disable Taskflow benchmarks")
set(TF_BUILD_PROFILER OFF CACHE BOOL "Disable Taskflow profiler")
set(TF_BUILD_CUDA OFF CACHE BOOL "Disable Taskflow CUDA support")
set(TF_BUILD_SYCL OFF CACHE BOOL "Disable Taskflow SYCL support")
set(TF_BUILD_TESTS OFF CACHE BOOL "Disable Taskflow tests")
set(TF_BUILD_EXAMPLES OFF CACHE BOOL "Disable Taskflow examples")

FetchContent_MakeAvailable(Taskflow)

# Use icicle_device as interface for Taskflow headers.
# FetchContent reports the checkout location as <lowercaseName>_SOURCE_DIR, so
# the lowercase spelling is the documented variable. The path is quoted in case
# it contains spaces, and wrapped in BUILD_INTERFACE because it points into the
# build tree and must not leak into an installed/exported target interface.
target_include_directories(icicle_device
  INTERFACE
    "$<BUILD_INTERFACE:${taskflow_SOURCE_DIR}>"
)

# CPU backend is built directly into icicle library

target_sources(icicle_device PRIVATE src/cpu_device_api.cpp)
Expand Down
8 changes: 0 additions & 8 deletions icicle/backend/cpu/include/cpu_ntt_domain.h
Original file line number Diff line number Diff line change
@@ -1,22 +1,14 @@
#pragma once
#include "icicle/backend/ntt_backend.h"
#include "icicle/errors.h"
#include "icicle/runtime.h"
#include "icicle/utils/log.h"
#include "icicle/fields/field_config.h"
#include "icicle/vec_ops.h"

#include <thread>
#include <vector>
#include <chrono>
#include <algorithm>
#include <iostream>
#include <cmath>
#include <cstdint>
#include <memory>
#include <mutex>

using namespace field_config;
using namespace icicle;
namespace ntt_cpu {

Expand Down
9 changes: 2 additions & 7 deletions icicle/backend/cpu/include/cpu_ntt_main.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
#pragma once
#include "icicle/utils/log.h"
#include "ntt_cpu.h"
#include <iostream>

using namespace field_config;
using namespace icicle;

/**
Expand All @@ -14,16 +11,14 @@ using namespace icicle;
* hierarchy, and memory management for efficient computation.
*
* The NTT problem is given at a specific size and is divided into subproblems to enable
* parallel solving of independent tasks, ensuring that the number of problems solved
* simultaneously does not exceed cache size. The original problem is divided into hierarchies
* parallel solving of independent tasks. The original problem is divided into hierarchies
* of subproblems. Beyond a certain size, the problem is divided into two layers of sub-NTTs in
* hierarchy 1. Within hierarchy 1, the problem is further divided into 1-3 layers of sub-NTTs
* belonging to hierarchy 0. The division into hierarchies and the sizes of the sub-NTTs are
* determined by the original problem size.
*
* The sub-NTTs within hierarchy 0 are the units of work that are assigned to individual threads.
* The overall computation is executed in a multi-threaded fashion, with the degree of parallelism
* determined by the number of available hardware cores.
* The overall computation is executed in a multi-threaded fashion.
*
* @param device The device on which the NTT is being performed.
* @param input Pointer to the input data.
Expand Down
430 changes: 219 additions & 211 deletions icicle/backend/cpu/include/ntt_cpu.h

Large diffs are not rendered by default.

19 changes: 6 additions & 13 deletions icicle/backend/cpu/include/ntt_data.h
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
#pragma once
#include "icicle/utils/log.h"
#include "tasks_manager.h"
#include "cpu_ntt_domain.h"

// #include <_types/_uint32_t.h>
#include <csetjmp>
#include <sys/types.h>
#include <deque>
#include <functional>
#include <unordered_map>

#define HIERARCHY_1 22
#define HIERARCHY_1 26

namespace ntt_cpu {

Expand All @@ -25,19 +20,17 @@ namespace ntt_cpu {
* layer, 13 for the second, and 0 for the third.
*/
constexpr uint32_t layers_sub_logn[31][3] = {
{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0}, {4, 0, 0}, {5, 0, 0}, {3, 3, 0}, {4, 3, 0},
{4, 4, 0}, {5, 4, 0}, {5, 5, 0}, {4, 4, 3}, {4, 4, 4}, {5, 4, 4}, {5, 5, 4}, {5, 5, 5},
{5, 5, 6}, {5, 5, 7}, {5, 5, 8}, {5, 5, 9}, {5, 5, 10}, {5, 5, 11}, {5, 5, 12}, {12, 11, 0},
{12, 12, 0}, {13, 12, 0}, {13, 13, 0}, {14, 13, 0}, {14, 14, 0}, {15, 14, 0}, {15, 15, 0}};
{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0}, {4, 0, 0}, {5, 0, 0}, {3, 3, 0}, {4, 3, 0},
{4, 4, 0}, {5, 4, 0}, {5, 5, 0}, {4, 4, 3}, {4, 4, 4}, {5, 4, 4}, {5, 5, 4}, {5, 5, 5},
{5, 5, 6}, {5, 5, 7}, {5, 5, 8}, {5, 5, 9}, {5, 5, 10}, {5, 5, 11}, {5, 5, 12}, {5, 5, 13},
{5, 5, 14}, {5, 5, 15}, {5, 5, 16}, {14, 13, 0}, {14, 14, 0}, {15, 14, 0}, {15, 15, 0}};

/**
* @brief Represents the log sizes of sub-NTTs in the NTT computation hierarchy.
*
* This struct stores the log sizes of the sub-NTTs for both hierarchy_0 and hierarchy_1 layers,
* based on the overall log size (`logn`) of the NTT problem.
*
* @param logn The log size of the entire NTT problem.
* @param size The size of the NTT problem, calculated as `1 << logn`.
* @param hierarchy_0_layers_sub_logn Log sizes of sub-NTTs for hierarchy_0 layers.
* @param hierarchy_1_layers_sub_logn Log sizes of sub-NTTs for hierarchy_1 layers.
*
Expand Down Expand Up @@ -75,7 +68,7 @@ namespace ntt_cpu {
const uint32_t logn; // log of the original NTT size.
const uint32_t size; // Size of the original NTT problem.
const NttSubHierarchies ntt_sub_hierarchies; // Log sizes of sub-NTTs based on the original NTT log size.
E* const elements; // Pointer to the output elements array.
E* elements; // Pointer to the elements array.
const NTTConfig<S>& config; // Configuration settings for the NTT computation.
const NTTDir direction; // Direction of the NTT computation (forward or inverse).
const bool is_parallel; // Flag indicating if the NTT computation is parallel.
Expand Down
Loading

0 comments on commit 0ed301f

Please sign in to comment.