diff --git a/docs/build_api.py b/docs/build_api.py index a4c63dd..8d2c626 100644 --- a/docs/build_api.py +++ b/docs/build_api.py @@ -154,6 +154,7 @@ def build_index_page(groups): "all", "any", "count", + "dot", ], "Mathematical": [ ("abs", "abs(const V&)"), diff --git a/docs/guides.rst b/docs/guides.rst index 72c2ead..e1a39fd 100644 --- a/docs/guides.rst +++ b/docs/guides.rst @@ -5,5 +5,7 @@ Guides guides/introduction.rst guides/promotion.rst - guides/prelude.rst + guides/memory.rst + guides/accuracy.rst guides/constant.rst + guides/example.md diff --git a/docs/guides/accuracy.md b/docs/guides/accuracy.md new file mode 100644 index 0000000..da7bb53 --- /dev/null +++ b/docs/guides/accuracy.md @@ -0,0 +1,112 @@ +# Accuracy Level + +For certain operations, there might be alternative versions available that provide better performance at the cost of lower accuracy. +In other words, they are faster but also have a small error. + +## Fast Math + +For several operations in single precision (float32), there are "fast math" versions. These functions are faster since they are hardware accelerated, but are not IEEE compliant for all inputs. + +To use this functionality, use the `fast_*` functions from Kernel Float. + +```cpp +kf::vec x = {1.0f, 2.0f, 3.0f, 4.0f}; + +// Sine +kf::vec a = kf::fast_sin(x); + +// Square root +kf::vec b = kf::fast_sqrt(x); + +// Reciprocal `1/x` +kf::vec c = kf::fast_rcp(x); + +// Division `a/b` +kf::vec d = kf::fast_div(a, b); +``` + +These functions are only functional for 32-bit and 16-bit floats. +For other input types, the operation falls back to the regular version. + +## Approximate Math + +For 16-bit floats, several approximate functions are provided. +These use approximations (typically low-degree polynomials) to calculate rough estimates of the functions. +This can be very fast but also less accurate. + + +To use this functionality, use the `approx_*` functions from Kernel Float. 
For other input types, the operation falls back to the `fast_*` variant. + +```cpp +kf::vec x = {1.0, 2.0, 3.0, 4.0}; + +// Sine +kf::vec a = kf::approx_sin(x); + +// Square root +kf::vec b = kf::approx_sqrt(x); + +// Reciprocal `1/x` +kf::vec c = kf::approx_rcp(x); + +// Division `a/b` +kf::vec d = kf::approx_div(a, b); +``` + +You can adjust the degree of approximation by supplying an integer template parameter: + + +```cpp +// Sine approximation with polynomial of degree 1 +kf::vec a = kf::approx_sin<1>(x); + +// Polynomial of degree 2 +kf::vec a = kf::approx_sin<2>(x); + +// Polynomial of degree 3 +kf::vec a = kf::approx_sin<3>(x); +``` + +## Tuning Accuracy Level + +Many functions in Kernel Float accept an additional Accuracy option as a template parameter. +This allows you to tune the accuracy level without changing the function name. + +There are four possible values for this parameter: + +- `kf::accurate_policy`: Use the most accurate version of the function available. +- `kf::fast_policy`: Use the "fast math" version. +- `kf::approx_policy`: Use the approximate version with degree `N`. +- `kf::default_policy`: Use a global default policy (see the next section). + +For example, consider this code: + +```cpp +kf::vec input = {1.0f, 2.0f}; + +// Use the default policy +kf::vec a = kf::cos(input); + +// Use the default policy +kf::vec b = kf::cos(input); + +// Use the most accurate policy +kf::vec c = kf::cos(input); + +// Use the fastest policy +kf::vec d = kf::cos(input); + +// Use the approximate policy +kf::vec e = kf::cos>(input); + +// You can use aliases to define your own policy +using my_own_policy = kf::fast_policy; +kf::vec f = kf::cos(input); +``` + +## Setting `default_policy` + +By default, `kf::default_policy` is set to `kf::accurate_policy`. + +Set the preprocessor option `KERNEL_FLOAT_FAST_MATH=1` to change the default policy to `kf::fast_policy`. +This will use fast math for all functions and data types that support it. 
diff --git a/docs/guides/example.md b/docs/guides/example.md new file mode 100644 index 0000000..62a80da --- /dev/null +++ b/docs/guides/example.md @@ -0,0 +1,112 @@ +# Full CUDA example + +This page explains a CUDA program that estimates the value of pi using Kernel Float. + + +## Overview + +The program calculates Pi by generating random points within a unit square and counting how many fall inside the unit circle inscribed within that square. The ratio of points inside the circle to the total number of points approximates Pi/4. + +The kernel is shown below: + + +```c++ +namespace kf = kernel_float; + +using float_type = float; +static constexpr int VECTOR_SIZE = 4; + +__global__ void calculate_pi_kernel(int nx, int ny, int* global_count) { + int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + int thread_y = blockIdx.y * blockDim.y + threadIdx.y; + + kf::vec xi = thread_x * VECTOR_SIZE + kf::range(); + kf::vec yi = thread_y; + + kf::vec xf = kf::cast(xi) / float_type(nx); + kf::vec yf = kf::cast(yi) / float_type(ny); + + kf::vec dist_squared = xf * xf + yf * yf; + kf::vec dist = kf::sqrt(dist_squared); + + int n = kf::count(dist <= float_type(1)); + + if (n > 0) atomicAdd(global_count, n); +} +``` + + +## Code Explanation + +Let's go through the code step by step. + +```cpp +// Alias `kernel_float` as `kf` +namespace kf = kernel_float; +``` + +This creates an alias for `kernel_float`. + +```cpp +// Define the float type and vector size +using float_type = float; +static constexpr int VECTOR_SIZE = 4; +``` + +Define `float_type` as an alias for `float` to make it easy to change precision if needed. +The vector size is set to 4, meaning each thread will process 4 data points. + +```cpp +__global__ void calculate_pi_kernel(int nx, int ny, int* global_count) { +``` + +The CUDA kernel. There are `nx` points along the x axis and `ny` points along the y axis. 
+
+```cpp
+    int thread_x = blockIdx.x * blockDim.x + threadIdx.x;
+    int thread_y = blockIdx.y * blockDim.y + threadIdx.y;
+```
+
+Compute the global x- and y-index of this thread.
+
+```cpp
+    kf::vec xi = thread_x * VECTOR_SIZE + kf::range();
+    kf::vec yi = thread_y;
+```
+
+Compute the points that this thread will process.
+The x coordinates start at `thread_x * VECTOR_SIZE` and are then offset by the vector `[0, 1, 2, ..., VECTOR_SIZE-1]`.
+The y coordinates are all `thread_y`.
+
+```cpp
+    kf::vec xf = kf::cast(xi) / float_type(nx);
+    kf::vec yf = kf::cast(yi) / float_type(ny);
+```
+
+Divide `xi` and `yi` by `nx` and `ny` to normalize them to the `[0, 1]` range.
+
+```cpp
+    kf::vec dist_squared = xf * xf + yf * yf;
+```
+
+Compute the squared distance from the origin (0, 0) to each point from `xf`, `yf`.
+
+```cpp
+    kf::vec dist = kf::sqrt(dist_squared);
+```
+
+Take the element-wise square root.
+
+```cpp
+    int n = kf::count(dist <= float_type(1));
+```
+
+Count the number of points in the unit circle (i.e., for which the distance is at most 1).
+The expression `dist <= 1` returns a vector of booleans and `kf::count` counts the number of `true` values.
+
+```cpp
+atomicAdd(global_count, n);
+```
+
+Add `n` to the `global_count` variable.
+This must be done using an atomic operation since multiple threads will write this variable simultaneously. diff --git a/docs/guides/introduction.md b/docs/guides/introduction.md index e6a4434..31f44e4 100644 --- a/docs/guides/introduction.md +++ b/docs/guides/introduction.md @@ -1,56 +1,192 @@ -Getting started -=============== +# Getting started -Kernel Float is a header-only library that makes it easy to work with vector types and low-precision floating-point types, mainly focusing on CUDA kernel code. +**Kernel Float** is a header-only library that makes it easy to work with vector types and low-precision floating-point types, mainly focusing on CUDA kernel code. 
-Installation ------------- +## Installation -The easiest way to use the library is get the single header file from github: +The easiest way to use the library is to get the single header file from GitHub: ```bash wget https://raw.githubusercontent.com/KernelTuner/kernel_float/main/single_include/kernel_float.h ``` -Next, include this file into your program. -It is conventient to define a namespace alias `kf` to shorten the full name `kernel_float`. +Next, include this file in your program. It is convenient to define a namespace alias `kf` to shorten the full name `kernel_float`: - -```C++ +```cpp #include "kernel_float.h" namespace kf = kernel_float; ``` +## Vector types -Example C++ code ----------------- +Kernel Float essentially offers a single data type `kernel_float::vec` that stores `N` elements of type `T`. The simplest way to initialize a vector is using list-initialization: -Kernel Float essentially offers a single data-type `kernel_float::vec` that stores `N` elements of type `T`. -This type can be initialized normally using list-initialization (e.g., `{a, b, c}`) and elements can be accessed using the `[]` operator. -Operation overload is available to perform binary operations (such as `+`, `*`, and `&`), where the optimal intrinsic for the available types is selected automatically. +```cpp +kf::vec my_vector = {1.0f, 2.0f, 3.0f, 4.0f}; +``` -Many mathetical functions (like `log`, `sin`, `cos`) are also available, see the [API reference](../api) for the full list of functions. -In some cases, certain operations might not be natively supported by the platform for the some floating-point type. -In these cases, Kernel Float falls back to performing the operations in 32 bit precision. 
+It is also possible to automatically derive the type using `make_vec`: -The code below shows a very simple example of how to use Kernel Float: +```cpp +// The type will be vec +auto a = kf::make_vec(1.0, 2.0, 3.0); -```C++ -#include "kernel_float.h" -namespace kf = kernel_float; +// The type will be vec +auto b = kf::make_vec(7, 7); + +// The type will be vec +auto c = kf::make_vec(true, true, false, true); + +// This does not compile! +auto d = kf::make_vec(); +``` + +There are also many helper methods available to generate vectors; see the [API reference](../api). Some examples are `range`, `fill`, `ones`, and `zeros`. + +```cpp +// Generates [0, 1, 2, 3] +kf::vec a = kf::range(); + +// Generates [42.0, 42.0, 42.0, 42.0] +kf::vec b = kf::fill<4>(42.0); + +// Generates [0, 0, 0, 0] +kf::vec c = kf::zeros(); + +// Generates [true, true, true, true] +kf::vec d = kf::ones(); +``` + +You can also use the `*_like` functions to generate a vector based on another vector: + +```cpp +// Generates [1.0, 2.0, 3.0, 4.0] +kf::vec a = {1.0f, 2.0f, 3.0f, 4.0f}; + +// Generates [0.0, 0.0, 0.0, 0.0] +kf::vec b = kf::zeros_like(a); + +// Generates [1.0, 1.0, 1.0, 1.0] +kf::vec c = kf::ones_like(a); +``` + +## Accessing elements + +Accessing elements can be done using the regular `[]` operator. 
+ +```cpp +// Generate [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] +kf::vec a = kf::range(); + +// Returns 2.0 +float x = a[2]; + +// Set element at 2 to 42.0 +a[2] = 42.0; + +// Returns 42.0 +float y = a[2]; +``` + +You can get a pointer to the vector buffer by calling `data`: + +```cpp +// Generate vector +kf::vec v = {1.0f, 2.0f, 3.0f, 4.0f}; + +float* address = v.data(); + +// Set element at 0 to element at 1 +address[0] = address[1]; +``` -int main() { - using Type = float; - const int N = 8; +Iteration can be done by using a regular for-loop: - kf::vec i = kf::range(); - kf::vec x = kf::cast(i); - kf::vec y = x * kf::sin(x); - Type result = kf::sum(y); - printf("result=%f", double(result)); +```cpp +kf::vec vector = {1.0f, 2.0f, 3.0f, 4.0f}; - return EXIT_SUCCESS; +for (float x : vector) { + printf("x=%f\n", x); } ``` -Notice how easy it would be to change the floating-point type `Type` or the vector length `N` without affecting the rest of the code. +## Operator overloading + +The **arithmetic** operators `+`, `-`, `*`, `/`, and `%` are overloaded to perform element-wise operations. + +```cpp +// Generate [1.0f, 2.0f, 3.0f] +kf::vec a = {1.0f, 2.0f, 3.0f}; + +// Generate [1.0f, 1.0f, 1.0f] +kf::vec b = kf::ones(); + +// Add them together to create [2.0f, 3.0f, 4.0f] +kf::vec c = a + b; +``` + +The **comparison** operators `<`, `>`, `==`, `!=`, `<=`, `>=` are overloaded to perform element-wise operations. Note that the returned value is a vector containing 0s (`false`) and 1s (`true`). The element type and vector length will match the inputs. + +```cpp +// Generate doubles +kf::vec a = {4.0, -100.0, 0.0, 0.5, -3.0}; + +// Generate zeros +kf::vec zeros = kf::zeros_like(a); + +// Generates [false, true, false, false, true] +kf::vec result = a < zeros; +``` + +The **logical** operators `&&` and `||` are NOT overloaded. This is because there is no method to simulate the short-circuiting behavior. 
Instead, the operators `!` (not), `&` (and), `|` (or), and `^` (xor) are overloaded to behave as logical operators. + +```cpp +// Generate doubles +kf::vec a = {4.0, -100.0, 0.0, 0.5, -3.0}; + +// Generate zeros and ones +kf::vec zeros = kf::zeros_like(a); +kf::vec ones = kf::ones_like(a); + +// Generates [false, false, true, true, false] +kf::vec result = (a >= zeros) & (a <= ones); + +// Using `&&` instead of `&` results in a compilation error! +// kf::vec fail = (a >= zeros) && (a <= ones); +``` + +If the two inputs of a binary operator do not match (either element type and/or vector length), Kernel Float will automatically perform **type promotion** (described on the page [Type Promotion](promotion)). This allows our example to be simplified to just: + +```cpp +// Generate doubles +kf::vec a = {4.0, -100.0, 0.0, 0.5, -3.0}; + +// Generates [false, false, true, true, false] +kf::vec result = (a >= 0.0) & (a <= 1.0); +``` + +## Mathematical functions + +Many mathematical functions (like `log`, `sin`, `cos`) are also available; see the [API reference](../api) for the full list of functions. These always work element-wise: + +```cpp +// Input vector +kf::vec x = {0.0f, 1.0f, 2.0f, 3.0f}; + +// Gives [0.0, 0.84147098, 0.9092974, 0.14112001] +kf::vec a = kf::sin(x); + +// Gives [1.0, 0.54030231, -0.41614684, -0.9899925] +kf::vec b = kf::cos(x); + +// Gives [0.0, 1.0, 1.4142135, 1.7320508] +kf::vec c = kf::sqrt(x); + +// Gives [1.0, 2.7182818, 7.3890561, 20.085537] +kf::vec d = kf::exp(x); + +// Gives [0, 0, 0, 0] +kf::vec e = kf::isnan(x); +``` + +In some cases, certain operations might not be natively supported by the platform for some floating-point types. In these cases, Kernel Float falls back to performing the operations in 32-bit precision. 
diff --git a/docs/guides/memory.md b/docs/guides/memory.md new file mode 100644 index 0000000..2ddc7a2 --- /dev/null +++ b/docs/guides/memory.md @@ -0,0 +1,112 @@ +# Memory Operations + +A common problem in vector programming is that a "simple" pointer (such as `float*`) is provided, but you would like to read a vector of elements. This page describes solutions to that problem. + +## Using `read`/`write` + +The simplest solution is to use `read` and `write` to access `N` consecutive elements. + +```cpp +std::vector buffer = {1.0f, 2.0f, 3.0f, 4.0f}; + +float* pointer = buffer.data(); + +// Read 2 elements: buffer[0], buffer[1] +kf::vec a = kf::read<2>(pointer); + +// Read 3 elements: buffer[1], buffer[2], buffer[3] +kf::vec b = kf::read<3>(pointer + 1); + +// Write 2 elements: buffer[0], buffer[1] +kf::write(pointer, kf::vec(100.0f, 200.0f)); + +// Write 3 elements: buffer[1], buffer[2], buffer[3] +kf::write(pointer + 1, kf::vec(0.0f, 0.0f, 0.0f)); +``` + +## Using `read_aligned`/`write_aligned` + +For small data types, it can be highly beneficial to use **aligned** memory operations. +These allow the compiler to read/write the elements more efficiently but require that the accessed pointer is **aligned** to a certain vector size. + +```cpp +// Read 2 elements: buffer[0], buffer[1] +kf::vec a = kf::read_aligned<2>(pointer); + +// Read 2 elements: buffer[2], buffer[3] +kf::vec b = kf::read_aligned<2>(pointer + 2); + +// This is not allowed! `pointer+1` is not aligned to 2 elements! +// kf::vec b = kf::read_aligned<2>(pointer + 1); + +// Write 2 elements: buffer[0], buffer[1] +kf::write_aligned<2>(pointer, kf::vec(100.0f, 200.0f)); + +// Again, this is not allowed! `pointer+1` is not aligned to 3 elements! +// kf::write_aligned<3>(pointer + 1, kf::vec(0.0f, 0.0f, 0.0f)); +``` + +Note that Kernel Float does **not** check the alignment, not at compile-time and not at runtime. 
+Using an unaligned address results in **undefined behavior**: either a runtime crash, miscompilation, or invalid results. + +## Using `vec_ptr` + +`kf::vec_ptr` is a data type that wraps a regular `T*` (or `const T*`) pointer and allows easy accessing of elements as a vector using aligned memory operations. + +For example, given `vec_ptr x`, reading `x[10]` returns a vector containing `{buffer[40], buffer[41], buffer[42], buffer[43]}`. +Each index advances by `N` elements because you're working with vectors of size `N`. + +The code below shows an example: + +```cpp +std::vector buffer = {/* some data */}; +float* pointer = buffer.data(); + +kf::vec_ptr x = kf::assert_aligned<2>(pointer); + +// Get the elements {buffer[20], buffer[21]} +kf::vec a = x[10]; + +// Set the elements at buffer[20] and buffer[21]. +x(10) = kf::make_vec(42.0f, 42.0f); +``` + +In this example: + +* `kf::vec_ptr` wraps the `float*` pointer, allowing vectorized access to the data. +* `x[10]` accesses the elements at positions `20` and `21` (`10 * 2` and `10 * 2 + 1`). +* `x(10)` provides a way to write to the data at the same positions. + +### Handling Different Storage Types + +A notable feature of `vec_ptr` is its ability to interact with data stored in a different underlying type than the one you operate with. +This is achieved by specifying a third template argument `U` in `vec_ptr`. + +The `vec_ptr` class automatically handles the necessary type casting between `U` and `T`. +A cast from `U` to `T` is automatically inserted after each read and from `T` to `U` before each write. + +For example, suppose your data is stored in half precision, but you want to read and write it using double precision. 
+ + +```cpp +std::vector buffer = {/* some data */}; +half* pointer = buffer.data(); + +// Create vec_ptr +kf::vec_ptr x = kf::assert_aligned<2>(pointer); + +// Interact with the data in double precision +kf::vec_ptr y = x; + +// Get the elements {buffer[20], buffer[21]} in double precision +kf::vec a = y[10]; + +// Set the elements at buffer[20] and buffer[21]. +y(10) = kf::make_vec(42.0, 42.0); +``` + +In this example: + +* `x` is a `vec_ptr` pointing to the data stored in half precision. +* `y` is a `vec_ptr` that allows you to interact with the data as double precision, even though it is stored as half. + diff --git a/docs/guides/prelude.md b/docs/guides/prelude.md deleted file mode 100644 index 449dc88..0000000 --- a/docs/guides/prelude.md +++ /dev/null @@ -1,42 +0,0 @@ -Using `kernel_float::prelude` -=== - -When working with Kernel Float, you'll find that you need to prefix every function and type with the `kernel_float::...` prefix. -This can be a bit cumbersome. -It's strongly discouraged not to dump the entire `kernel_float` namespace into the global namespace (with `using namespace kernel_float`) since -many symbols in Kernel Float may clash with global symbols, causing conflicts and issues. - -To work around this, the library provides a handy `kernel_float::prelude` namespace. This namespace contains a variety of useful type and function aliases that won't conflict with global symbols. - -To make use of it, use the following code: - - -```C++ -#include "kernel_float.h" -using namespace kernel_float::prelude; - -// You can now use aliases like `kf`, `kvec`, `kint`, etc. -``` - -The prelude defines many aliases, include the following: - -| Prelude name | Full name | -|---|---| -| `kf` | `kernel_float` | -| `kvec` | `kernel_float::vec` | -| `into_kvec(v)` | `kernel_float::into_vec(v)` | -| `make_kvec(a, b, ...)` | `kernel_float::make_vec(a, b, ...)` | -| `kvec2`, `kvec3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... 
| -| `kint` | `kernel_float::vec` | -| `kint2`, `kint3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... | -| `klong` | `kernel_float::vec` | -| `klong2`, `klong3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... | -| `kbfloat16x` | `kernel_float::vec` | -| `kbfloat16x2`, `kbfloat16x3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... | -| `khalf` | `kernel_half::vec` | -| `khalf2`, `khalf3`, ... | `kernel_half::vec`, `kernel_half::vec`, ... | -| `kfloat` | `kernel_float::vec` | -| `kfloat2`, `kfloat3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... | -| `kdouble` | `kernel_float::vec` | -| `kdouble2`, `kdouble3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... | -| ... | ... | diff --git a/docs/guides/promotion.md b/docs/guides/promotion.md new file mode 100644 index 0000000..338ddb6 --- /dev/null +++ b/docs/guides/promotion.md @@ -0,0 +1,127 @@ +# Casting and Type Promotion + +This section is on data type casting and implicit type promotion. + +## Type Casting + +You can use the function `cast` to explicitly change the element type of a vector: + +```cpp +// a = [0, 1, 2, 3] +kf::vec a = kf::range(); + +// b = [0, 1, 2, 3] +kf::vec b = kf::cast(a); + +// c = [0.0f, 1.0f, 2.0f, 3.0f] +kf::vec c = kf::cast(a); + +// d = [false, true, true, true] +kf::vec d = kf::cast(a); +``` + +Kernel Float allows **implicit** type conversions between vector types **only** for **widening** casts (i.e., converting a smaller type into a larger type). 
+ +```cpp +// a = [0, 1, 2, 3] +kf::vec a = kf::range(); + +// This is allowed: int -> long +kf::vec b = a; + +// This is also allowed: int -> float +kf::vec c = a; +``` + +Kernel Float does **not** perform implicit conversion for narrowing casts; this requires an explicit cast: + +```cpp +// a = [0, 1, 2, 3] +kf::vec a = kf::range(); + +// This is NOT allowed, use `kf::cast(a)` +kf::vec b = a; // ERROR + +// This is NOT allowed, use `kf::cast(a)` +kf::vec c = a; // ERROR + +// This is NOT allowed, use `kf::cast(a)` +kf::vec d = a; // ERROR +``` + +Alternatively, it is possible to use `kf::cast_to` on the left side of the assignment to perform a cast: + +```cpp +// a = [0, 1, 2, 3] +kf::vec a = kf::range(); + +// Define b +kf::vec b; + +// This is equivalent to `b = kf::cast(a)` +kf::cast_to(b) = a; +``` + +## Type Promotion + +When performing operations between vectors of different types or sizes, Kernel Float automatically promotes types to ensure compatibility. This process is known as **type promotion**. + + + +Consider the following example. What should the type of `c` be in this case? + +```cpp +kf::vec a = {1.0f, 2.0f, 3.0f, 4.0f}; +kf::vec b = {10.0, 20.0, 30.0, 40.0}; + +kf::vec c = a + b; +``` + +Another example is the following snippet. Again, what should the type of `c` be? + +```cpp +kf::vec x = {1.0f, 2.0f, 3.0f, 4.0f}; +int factor = 2; + +kf::vec c = x * factor; +``` + +### How Type Promotion Works +Type promotion in Kernel Float unifies arguments for binary (and ternary) operations through the following steps: + +* **Vectorization**: Each non-vector argument is converted into a vector using the `into_vec` function. + +* **Length Unification**: All arguments must have the same length N or length 1. Vectors of length 1 are broadcasted to match length N. + +* **Type Unification**: The element types are promoted to a common type based on the following promotion rules. 
+ +### Promotion Rules +The rules for element type promotion in Kernel Float are slightly different from standard C++. Here's a summary: + +* **Boolean Types**: If one of the types is bool, the result type is the other type. + +* **Floating-Point and Integer**: If one type is floating-point and the other is an integer (signed or unsigned), the result is the floating-point type. + +* **Floating-Point Types**: If both are floating-point types, the larger (wider) type is chosen. +Exception: Combining half and bfloat16 results in float. + +* **Integer Types**: If both are integers of the same signedness (both signed or both unsigned), the larger type is chosen. +Combining a signed integer and an unsigned integer is not allowed. + +### Overview + +The following table summarizes the type promotion rules. The labels used are: + +- `b`: boolean +- `iN`: signed integer of `N` bits (e.g., `int`, `long`) +- `uN`: unsigned integer of `N` bits (e.g., `unsigned int`, `size_t`) +- `fN`: floating-point type of `N` bits (e.g., `float`, `double`) +- `bf16`: bfloat16 floating-point format. + +```{csv-table} Type Promotion Rules. +:file: promotion_table.csv +``` + +.. csv-table:: Type Promotion Rules. + :file: promotion_table.csv + diff --git a/docs/guides/promotion.rst b/docs/guides/promotion.rst deleted file mode 100644 index cf48d9e..0000000 --- a/docs/guides/promotion.rst +++ /dev/null @@ -1,34 +0,0 @@ -Type Promotion -============== - -For operations that involve two (or more) input arguments, ``kernel_float`` will first convert the inputs into a common type before applying the operation. -For example, when adding ``vec`` to ``vec``, both arguments must first be converted into a ``vec``. - -This procedure is called "type promotion" and is implemented as follows. -Initially, every argument is transformed into a vector using the ``into_vec`` function -Next, all arguments must have length ``N`` or length ``1``, where vectors of length ``1`` are repeated to become length ``N``. 
-Finally, the vector element types are promoted into a shared type. - -The rules for element type promotion in ``kernel_float`` are slightly different than in regular C++. -In a nutshell, for two element types, the promotion rules can be summarized as follows: - -* If one of the types is ``bool``, the result is the other type. -* If one type is a floating-point and the other is an integer (signed or unsigned), the outcome is the floating-point type. -* If both are floating-point types, the largest of the two is chosen. An exception is combining ``half`` and ``bfloat16``, which results in ``float``. -* If both types are integer types of the same signedness, the largest of the two is chosen. -* Combining a signed integer and unsigned integer type is not allowed. - -Overview --------- - -The type promotion rules are shown in the table below. -The labels are as follows: - -* ``b``: boolean -* ``iN``: signed integer of ``N`` bits (e.g., ``int``, ``long``) -* ``uN``: unsigned integer of ``N`` bits (e.g., ``unsigned int``, ``size_t``) -* ``fN``: floating-point type of ``N`` bits (e.g., ``float``, ``double``) -* ``bf16``: bfloat16 floating-point format. - -.. csv-table:: Type Promotion Rules. 
- :file: promotion_table.csv diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 145f80b..239fba5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,2 +1,3 @@ +add_subdirectory(pi) add_subdirectory(vector_add) add_subdirectory(vector_add_tiling) diff --git a/examples/pi/CMakeLists.txt b/examples/pi/CMakeLists.txt new file mode 100644 index 0000000..6a767d3 --- /dev/null +++ b/examples/pi/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.17) + +set (PROJECT_NAME kernel_float_pi) +project(${PROJECT_NAME} LANGUAGES CXX CUDA) +set (CMAKE_CXX_STANDARD 17) + +add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu") +target_link_libraries(${PROJECT_NAME} kernel_float) +set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80") + +find_package(CUDA REQUIRED) +target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE}) diff --git a/examples/pi/main.cu b/examples/pi/main.cu new file mode 100644 index 0000000..58edd8b --- /dev/null +++ b/examples/pi/main.cu @@ -0,0 +1,111 @@ +#include +#include + +#include "kernel_float.h" + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t __err = call; \ + if (__err != cudaSuccess) { \ + fprintf( \ + stderr, \ + "CUDA error at %s:%d code=%d(%s) \"%s\" \n", \ + __FILE__, \ + __LINE__, \ + __err, \ + cudaGetErrorString(__err), \ + #call); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// Alias `kernel_float` as `kf` +namespace kf = kernel_float; + +// Define the float type and vector size +using float_type = float; +static constexpr int VECTOR_SIZE = 4; + +__global__ void calculate_pi_kernel(int nx, int ny, int* global_count) { + // Calculate the global x and y indices for this thread within the grid + int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + int thread_y = blockIdx.y * blockDim.y + threadIdx.y; + + // Calculate the x and y coordinates as integers. + // The x coordinates are: [thread_x * VECTOR_SIZE, thread_x * VECTOR_SIZE + 1, ...] 
+ // The y coordinates are: [thread_y, thread_y, ...] + kf::vec xi = thread_x * VECTOR_SIZE + kf::range(); + kf::vec yi = thread_y; + + // Normalize the integers to values between 0 and 1. + kf::vec xf = kf::cast(xi) / float_type(nx); + kf::vec yf = kf::cast(yi) / float_type(ny); + + // Compute the squared distance to the origin and then take the + // square root to get the distance to the origin. + kf::vec dist_squared = xf * xf + yf * yf; + kf::vec dist = kf::sqrt(dist_squared); + + // Count the number of points within the unit circle. + // The expression `dist <= 1` returns a boolean vector + // and `kf::count` counts how many elements are `true`. + int n = kf::count(dist <= float_type(1)); + + // Atomically add 'n' to 'global_count' + atomicAdd(global_count, n); +} + +double calculate_pi(int nx, int ny) { + // Allocate memory on the device (GPU) for 'global_count' to accumulate the count of points inside the circle + int* d_global_count; + CUDA_CHECK(cudaMalloc(&d_global_count, sizeof(int))); + + // Initialize the device memory to zero + CUDA_CHECK(cudaMemset(d_global_count, 0, sizeof(int))); + + // Each thread processes 'VECTOR_SIZE' points in the x-direction + int num_threads_x = (nx + VECTOR_SIZE - 1) / VECTOR_SIZE; + + // Define the dimensions of each thread block (number of threads per block) + dim3 block_size(16, 16); // Each block contains 16 threads in x and y directions + + // Calculate the number of blocks needed in the grid to cover all threads + dim3 grid_size( + (num_threads_x + block_size.x - 1) / block_size.x, // Number of blocks in x-direction + (ny + block_size.y - 1) / block_size.y // Number of blocks in y-direction + ); + + // Launch the kernel on the GPU with the calculated grid and block dimensions + calculate_pi_kernel<<>>(nx, ny, d_global_count); + + // Check for any errors during kernel launch (asynchronous) + CUDA_CHECK(cudaGetLastError()); + + // Wait for the kernel to finish executing and check for errors (synchronization point) + 
CUDA_CHECK(cudaDeviceSynchronize()); + + // Copy the result from device memory back to host memory + int h_global_count = 0; // Host variable to store the count + CUDA_CHECK(cudaMemcpy(&h_global_count, d_global_count, sizeof(int), cudaMemcpyDeviceToHost)); + + // Free the allocated device memory + CUDA_CHECK(cudaFree(d_global_count)); + + // Calculate the estimated value of Pi using the ratio of points inside the circle to the total points + int total_points = nx * ny; + double pi_estimate = 4.0 * (double(h_global_count) / total_points); + + return pi_estimate; +} + +int main() { + CUDA_CHECK(cudaSetDevice(0)); + + for (int n = 1; n <= 16384; n *= 2) { + double pi = calculate_pi(n, n); + + printf("nx=%d ny=%d pi=%f\n", n, n, pi); + } + + return EXIT_SUCCESS; +} diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index 48a9a45..705cf8c 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -4,7 +4,7 @@ #include #include "kernel_float.h" -using namespace kernel_float::prelude; +namespace kf = kernel_float; void cuda_check(cudaError_t code) { if (code != cudaSuccess) { diff --git a/examples/vector_add_tiling/main.cu b/examples/vector_add_tiling/main.cu index af7aa94..ddd28ab 100644 --- a/examples/vector_add_tiling/main.cu +++ b/examples/vector_add_tiling/main.cu @@ -5,7 +5,7 @@ #include "kernel_float.h" #include "kernel_float/tiling.h" -using namespace kernel_float::prelude; +namespace kf = kernel_float; void cuda_check(cudaError_t code) { if (code != cudaSuccess) { diff --git a/include/kernel_float.h b/include/kernel_float.h index 53df30b..ee098fc 100644 --- a/include/kernel_float.h +++ b/include/kernel_float.h @@ -4,13 +4,13 @@ #include "kernel_float/base.h" #include "kernel_float/bf16.h" #include "kernel_float/binops.h" +#include "kernel_float/constant.h" #include "kernel_float/conversion.h" #include "kernel_float/fp16.h" #include "kernel_float/iterate.h" #include "kernel_float/macros.h" #include "kernel_float/memory.h" 
#include "kernel_float/meta.h" -#include "kernel_float/prelude.h" #include "kernel_float/reduce.h" #include "kernel_float/triops.h" #include "kernel_float/unops.h" diff --git a/include/kernel_float/apply.h b/include/kernel_float/apply.h index cf63787..3a7e02c 100644 --- a/include/kernel_float/apply.h +++ b/include/kernel_float/apply.h @@ -118,43 +118,66 @@ broadcast_like(const V& input, const R& other) { namespace detail { -template -struct apply_recur_impl; - template struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { - apply_recur_impl::call(fun, result, inputs...); + KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) { +#pragma unroll + for (size_t i = 0; i < N; i++) { + output[i] = fun(args[i]...); + } } }; -template -struct apply_recur_impl { - static constexpr size_t K = round_up_to_power_of_two(N) / 2; +template +struct apply_fastmath_impl: apply_impl {}; +} // namespace detail - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { - apply_impl::call(fun, result, inputs...); - apply_impl::call(fun, result + K, (inputs + K)...); - } +struct accurate_policy { + template + using type = detail::apply_impl; }; -template<> -struct apply_recur_impl<0> { - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {} +struct fast_policy { + template + using type = detail::apply_fastmath_impl; }; -template<> -struct apply_recur_impl<1> { - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... 
inputs) { - result[0] = fun(inputs[0]...); +#ifdef KERNEL_FLOAT_POLICY +using default_policy = KERNEL_FLOAT_POLICY; +#else +using default_policy = accurate_policy; +#endif + +namespace detail { + +template +struct map_policy_impl { + static constexpr size_t packet_size = preferred_vector_size::value; + static constexpr size_t remainder = N % packet_size; + + KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) { + if constexpr (N / packet_size > 0) { +#pragma unroll + for (size_t i = 0; i < N - remainder; i += packet_size) { + Policy::template type::call( + fun, + output + i, + (args + i)...); + } + } + + if constexpr (remainder > 0) { +#pragma unroll + for (size_t i = N - remainder; i < N; i++) { + Policy::template type::call(fun, output + i, (args + i)...); + } + } } }; template -struct apply_fastmath_impl: apply_impl {}; +using map_impl = map_policy_impl; + } // namespace detail template @@ -171,40 +194,13 @@ using map_type = * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] * ``` */ -template +template KERNEL_FLOAT_INLINE map_type map(F fun, const Args&... args) { using Output = result_t...>; using E = broadcast_vector_extent_type; - vector_storage result; - - // Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using apply_impl = detail::apply_fastmath_impl...>; -#else - using apply_impl = detail::apply_impl...>; -#endif - - apply_impl::call( - fun, - result.data(), - (detail::broadcast_impl, vector_extent_type, E>::call( - into_vector_storage(args)) - .data())...); - - return result; -} - -/** - * Apply the function `F` to each element from the vector `input` and return the results as a new vector. This - * uses fast-math if available for the given function `F`, otherwise this function behaves like `map`. - */ -template -KERNEL_FLOAT_INLINE map_type fast_map(F fun, const Args&... 
args) { - using Output = result_t...>; - using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; - detail::apply_fastmath_impl...>::call( + detail::map_policy_impl, Output, vector_value_type...>::call( fun, result.data(), (detail::broadcast_impl, vector_extent_type, E>::call( diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h index e32ac41..403bceb 100644 --- a/include/kernel_float/base.h +++ b/include/kernel_float/base.h @@ -45,7 +45,6 @@ struct alignas(Alignment) aligned_array { }; template - struct aligned_array { KERNEL_FLOAT_INLINE T* data() { @@ -85,10 +84,12 @@ struct extent; template struct extent { - static constexpr size_t value = N; - static constexpr size_t size = N; + static constexpr size_t number_of_elements = N; }; +template +static constexpr size_t extent_size = E::number_of_elements; + namespace detail { // Indicates that elements of type `T` offer less precision than floats, thus operations // on elements of type `T` can be performed by upcasting them to ` float`. 
@@ -215,7 +216,7 @@ KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulon KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) -template> +template> struct vector; template @@ -224,11 +225,16 @@ struct into_vector_impl> { using extent_type = E; KERNEL_FLOAT_INLINE - static vector_storage call(const vector& input) { + static vector_storage> call(const vector& input) { return input.storage(); } }; +template +struct preferred_vector_size { + static constexpr size_t value = 1; +}; + template struct vector_traits; @@ -247,13 +253,13 @@ template using vector_extent_type = typename into_vector_impl::extent_type; template -static constexpr size_t vector_extent = vector_extent_type::value; +static constexpr size_t vector_size = extent_size>; template using into_vector_type = vector, vector_extent_type>; template -using vector_storage_type = vector_storage, vector_extent>; +using vector_storage_type = vector_storage, vector_size>; template using promoted_vector_value_type = promote_t...>; diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index 84d2f4c..7db292c 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -11,6 +11,12 @@ #include "vector.h" namespace kernel_float { + +template<> +struct preferred_vector_size<__nv_bfloat16> { + static constexpr size_t value = 2; +}; + KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) @@ -217,58 +223,10 @@ KERNEL_FLOAT_BF16_CAST( #endif using bfloat16 = __nv_bfloat16; +KERNEL_FLOAT_VECTOR_ALIAS(bfloat16x, __nv_bfloat16) //KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16) //KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16) -#if KERNEL_FLOAT_CUDA_ARCH >= 800 -namespace detail { -template<> -struct dot_impl<__nv_bfloat16, 0> { - KERNEL_FLOAT_INLINE - static __nv_bfloat16 
call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - return __nv_bfloat16(0); - } -}; - -template<> -struct dot_impl<__nv_bfloat16, 1> { - KERNEL_FLOAT_INLINE - static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - return __hmul(left[0], right[0]); - } -}; - -template -struct dot_impl<__nv_bfloat16, N> { - static_assert(N >= 2, "internal error"); - - KERNEL_FLOAT_INLINE - static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - __nv_bfloat162 first_a = {left[0], left[1]}; - __nv_bfloat162 first_b = {right[0], right[1]}; - __nv_bfloat162 accum = __hmul2(first_a, first_b); - -#pragma unroll - for (size_t i = 2; i + 1 < N; i += 2) { - __nv_bfloat162 a = {left[i], left[i + 1]}; - __nv_bfloat162 b = {right[i], right[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __nv_bfloat16 result = __hadd(accum.x, accum.y); - - if (N % 2 != 0) { - __nv_bfloat16 a = left[N - 1]; - __nv_bfloat16 b = right[N - 1]; - result = __hfma(a, b, result); - } - - return result; - } -}; -} // namespace detail -#endif - } // namespace kernel_float #if KERNEL_FLOAT_FP16_AVAILABLE diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index 82c375b..eb958f1 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -50,16 +50,9 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co using O = result_t; using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; -// Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using apply_impl = detail::apply_fastmath_impl; -#else - using apply_impl = detail::apply_impl; -#endif - - apply_impl::call( + detail::map_impl, O, T, T>::call( fun, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( @@ -304,20 +297,17 @@ struct apply_fastmath_impl, N, T, T, T> { T rhs_rcp[N]; // Fast way to perform division is to multiply by the reciprocal - 
apply_fastmath_impl, N, T, T, T>::call({}, rhs_rcp, rhs); + apply_fastmath_impl, N, T, T>::call({}, rhs_rcp, rhs); apply_fastmath_impl, N, T, T, T>::call({}, result, lhs, rhs_rcp); } }; #if KERNEL_FLOAT_IS_DEVICE -template -struct apply_fastmath_impl, N, float, float, float> { +template<> +struct apply_fastmath_impl, 1, float, float, float> { KERNEL_FLOAT_INLINE static void call(ops::divide fun, float* result, const float* lhs, const float* rhs) { -#pragma unroll - for (size_t i = 0; i < N; i++) { - result[i] = __fdividef(lhs[i], rhs[i]); - } + *result = __fdividef(*lhs, *rhs); } }; #endif @@ -327,9 +317,9 @@ template> KERNEL_FLOAT_INLINE zip_common_type, T, T> fast_divide(const L& left, const R& right) { using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; - detail::apply_fastmath_impl, E::value, T, T, T>::call( + detail::map_policy_impl, extent_size, T, T, T>::call( ops::divide {}, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( diff --git a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h index 3e56eaa..8538be7 100644 --- a/include/kernel_float/conversion.h +++ b/include/kernel_float/conversion.h @@ -14,10 +14,10 @@ namespace detail { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { using F = ops::cast; - vector_storage intermediate; - detail::apply_impl::call(F {}, intermediate.data(), input.data()); + vector_storage> intermediate; + detail::map_impl, T2, T>::call(F {}, intermediate.data(), input.data()); return detail::broadcast_impl::call(intermediate); } }; @@ -26,7 +26,7 @@ struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { return input; } }; @@ -35,7 +35,7 @@ struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static 
vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { return detail::broadcast_impl::call(input); } }; @@ -44,11 +44,11 @@ struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { using F = ops::cast; - vector_storage result; - detail::apply_impl::call(F {}, result.data(), input.data()); + vector_storage> result; + detail::map_impl, T2, T>::call(F {}, result.data(), input.data()); return result; } }; diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index ac760e9..0b62d9b 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -9,6 +9,12 @@ #include "vector.h" namespace kernel_float { + +template<> +struct preferred_vector_size<__half> { + static constexpr size_t value = 2; +}; + KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) @@ -165,58 +171,10 @@ KERNEL_FLOAT_FP16_CAST(unsigned long, __ull2half_rn(input), (unsigned long)(__ha KERNEL_FLOAT_FP16_CAST(unsigned long long, __ull2half_rn(input), __half2ull_rz(input)); using half = __half; +KERNEL_FLOAT_VECTOR_ALIAS(half, __half) //KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) //KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) -#if KERNEL_FLOAT_IS_DEVICE -namespace detail { -template<> -struct dot_impl<__half, 0> { - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - return __half(0); - } -}; - -template<> -struct dot_impl<__half, 1> { - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - return __hmul(left[0], right[0]); - } -}; - -template -struct dot_impl<__half, N> { - static_assert(N >= 2, "internal error"); - - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - __half2 first_a = {left[0], left[1]}; - __half2 first_b = 
{right[0], right[1]}; - __half2 accum = __hmul2(first_a, first_b); - -#pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __half2 a = {left[i], left[i + 1]}; - __half2 b = {right[i], right[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __half result = __hadd(accum.x, accum.y); - - if (N % 2 != 0) { - __half a = left[N - 1]; - __half b = right[N - 1]; - result = __hfma(a, b, result); - } - - return result; - } -}; -} // namespace detail -#endif - } // namespace kernel_float #endif diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index fd5e43d..2c7cc4a 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -22,7 +22,7 @@ KERNEL_FLOAT_INLINE void for_each(V&& input, F fun) { auto storage = into_vector_storage(input); #pragma unroll - for (size_t i = 0; i < vector_extent; i++) { + for (size_t i = 0; i < vector_size; i++) { fun(storage.data()[i]); } } @@ -86,7 +86,7 @@ KERNEL_FLOAT_INLINE vector> range() { */ template KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { - return range, vector_extent>(); + return range, vector_size>(); } /** @@ -111,11 +111,11 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { */ template KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return range>(); + return range>(); } namespace detail { -template, size_t N = vector_extent> +template, size_t N = vector_size> struct flatten_impl { using value_type = typename flatten_impl::value_type; static constexpr size_t size = N * flatten_impl::size; @@ -177,7 +177,7 @@ KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { namespace detail { template> struct concat_base_impl { - static constexpr size_t size = vector_extent; + static constexpr size_t size = vector_size; KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { vector_storage storage = into_vector_storage(input); @@ -294,7 +294,7 @@ using select_type = vector, extent>>; template KERNEL_FLOAT_INLINE select_type 
select(const V& input, const Is&... indices) { using T = vector_value_type; - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; static constexpr size_t M = concat_size; vector_storage index_set; diff --git a/include/kernel_float/macros.h b/include/kernel_float/macros.h index 621cdea..4eee8f1 100644 --- a/include/kernel_float/macros.h +++ b/include/kernel_float/macros.h @@ -63,8 +63,8 @@ #define KERNEL_FLOAT_MAX_ALIGNMENT (32) -#ifndef KERNEL_FLOAT_FAST_MATH -#define KERNEL_FLOAT_FAST_MATH (0) +#if KERNEL_FLOAT_FAST_MATH +#define KERNEL_FLOAT_POLICY ::kernel_float::fast_policy; #endif #endif //KERNEL_FLOAT_MACROS_H diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h index 66308b6..d76ed1e 100644 --- a/include/kernel_float/memory.h +++ b/include/kernel_float/memory.h @@ -38,7 +38,7 @@ struct copy_impl> { */ template> KERNEL_FLOAT_INLINE vector read(const T* ptr, const I& indices, const M& mask = true) { - return detail::copy_impl::load( + return detail::copy_impl>::load( ptr, convert_storage(indices, E()).data(), convert_storage(mask, E()).data()); @@ -65,7 +65,7 @@ template< typename M = bool, typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE void write(T* ptr, const I& indices, const V& values, const M& mask = true) { - return detail::copy_impl::store( + return detail::copy_impl>::store( ptr, convert_storage(values, E()).data(), convert_storage(indices, E()).data(), @@ -102,7 +102,7 @@ KERNEL_FLOAT_INLINE vector> read(const T* ptr) { */ template KERNEL_FLOAT_INLINE void write(T* ptr, const V& values) { - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; write(ptr, range(), values); } @@ -277,7 +277,7 @@ KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { */ template KERNEL_FLOAT_INLINE void write_aligned(T* ptr, const V& values) { - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; static constexpr size_t alignment = 
detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); return detail::copy_aligned_impl::store( diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index 8ec790d..c859265 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -2,6 +2,7 @@ #define KERNEL_FLOAT_REDUCE_H #include "binops.h" +#include "triops.h" namespace kernel_float { namespace detail { @@ -23,7 +24,7 @@ struct reduce_recur_impl { template KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { vector_storage temp; - apply_impl::call(fun, temp.data(), input, input + K); + map_impl::call(fun, temp.data(), input, input + K); if constexpr (N < 2 * K) { #pragma unroll @@ -73,7 +74,7 @@ struct reduce_recur_impl<2> { */ template KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) { - return detail::reduce_impl, vector_value_type>::call( + return detail::reduce_impl, vector_value_type>::call( fun, into_vector_storage(input).data()); } @@ -177,14 +178,38 @@ template struct dot_impl { KERNEL_FLOAT_INLINE static T call(const T* left, const T* right) { - vector_storage intermediate; - detail::apply_impl, N, T, T, T>::call( - ops::multiply(), - intermediate.data(), - left, - right); - - return detail::reduce_impl, N, T>::call(ops::add(), intermediate.data()); + static constexpr size_t K = preferred_vector_size::value; + T result = {}; + + if constexpr (N / K > 0) { + T accum[K] = {T {}}; + apply_impl, K, T, T, T>::call({}, accum, left, right); + +#pragma unroll + for (size_t i = 1; i < N / K; i++) { + apply_impl, K, T, T, T, T>::call( + ops::fma {}, + accum, + left + i * K, + right + i * K, + accum); + } + + result = reduce_impl, K, T>::call({}, accum); + } + + if constexpr (N % K > 0) { + for (size_t i = N - N % K; i < N; i++) { + apply_impl, 1, T, T, T, T>::call( + {}, + &result, + left + i, + right + i, + &result); + } + } + + return result; } }; } // namespace detail @@ -203,7 +228,7 @@ struct dot_impl { template> KERNEL_FLOAT_INLINE T 
dot(const L& left, const R& right) { using E = broadcast_vector_extent_type; - return detail::dot_impl::call( + return detail::dot_impl>::call( convert_storage(left, E {}).data(), convert_storage(right, E {}).data()); } @@ -273,7 +298,7 @@ struct magnitude_impl { */ template> KERNEL_FLOAT_INLINE T mag(const V& input) { - return detail::magnitude_impl>::call(into_vector_storage(input).data()); + return detail::magnitude_impl>::call(into_vector_storage(input).data()); } } // namespace kernel_float diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h index 44f6db2..12cca59 100644 --- a/include/kernel_float/triops.h +++ b/include/kernel_float/triops.h @@ -39,9 +39,9 @@ template< typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) { using F = ops::conditional; - vector_storage result; + vector_storage> result; - detail::apply_impl::call( + detail::map_impl, T, bool, T, T>::call( F {}, result.data(), detail::convert_impl, vector_extent_type, bool, E>::call( @@ -92,11 +92,25 @@ namespace ops { template struct fma { KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) { - return a * b + c; + return ops::add {}(ops::multiply {}(a, b), c); } }; +} // namespace ops + +namespace detail { +template +struct apply_impl, N, T, T, T, T> { + KERNEL_FLOAT_INLINE + static void call(ops::fma, T* output, const T* a, const T* b, const T* c) { + T temp[N]; + apply_impl, N, T, T, T>::call({}, temp, a, b); + apply_impl, N, T, T, T>::call({}, output, temp, c); + } +}; +} // namespace detail #if KERNEL_FLOAT_IS_DEVICE +namespace ops { template<> struct fma { KERNEL_FLOAT_INLINE float operator()(float a, float b, float c) { @@ -110,11 +124,11 @@ struct fma { return __fma_rn(a, b, c); } }; -#endif } // namespace ops +#endif /** - * Computes the result of `a * b + c`. This is done in a single operation if possible. + * Computes the result of `a * b + c`. 
This is done in a single operation if possible for the given vector type. */ template< typename A, @@ -124,9 +138,9 @@ template< typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { using F = ops::fma; - vector_storage result; + vector_storage> result; - detail::apply_impl::call( + detail::map_impl, T, T, T, T>::call( F {}, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h index 70d629c..fce130e 100644 --- a/include/kernel_float/unops.h +++ b/include/kernel_float/unops.h @@ -85,10 +85,10 @@ KERNEL_FLOAT_INLINE vector> cast(const V& input) { } #define KERNEL_FLOAT_DEFINE_UNARY_FUN(NAME) \ - template \ + template \ KERNEL_FLOAT_INLINE vector, vector_extent_type> NAME(const V& input) { \ using F = ops::NAME>; \ - return map(F {}, input); \ + return ::kernel_float::map(F {}, input); \ } #define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ @@ -166,7 +166,7 @@ KERNEL_FLOAT_DEFINE_UNARY_MATH(ilogb) KERNEL_FLOAT_DEFINE_UNARY_MATH(lgamma) KERNEL_FLOAT_DEFINE_UNARY_MATH(log) KERNEL_FLOAT_DEFINE_UNARY_MATH(log10) -KERNEL_FLOAT_DEFINE_UNARY_MATH(logb) +KERNEL_FLOAT_DEFINE_UNARY_MATH(log2) KERNEL_FLOAT_DEFINE_UNARY_MATH(nearbyint) KERNEL_FLOAT_DEFINE_UNARY_MATH(normcdf) KERNEL_FLOAT_DEFINE_UNARY_MATH(rcbrt) @@ -193,12 +193,11 @@ KERNEL_FLOAT_DEFINE_UNARY_STRUCT(rcp, 1.0 / input, 1.0f / input) KERNEL_FLOAT_DEFINE_UNARY_FUN(rcp) -#define KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ - template \ - KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ - const V& input) { \ - using F = ops::NAME>; \ - return fast_map(F {}, input); \ +#define KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ + template \ + KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ + const V& input) { \ + return ::kernel_float::map(ops::NAME> {}, input); \ } KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(exp) @@ -209,17 +208,17 @@ 
KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(rsqrt) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(sin) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(cos) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan) +KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(exp2) +KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(log2) #if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(T, F, FAST_FUN) \ namespace detail { \ - template \ - struct apply_fastmath_impl, N, T, T> { \ + template<> \ + struct apply_fastmath_impl, 1, T, T> { \ KERNEL_FLOAT_INLINE static void call(ops::F, T* result, const T* inputs) { \ - for (size_t i = 0; i < N; i++) { \ - result[i] = FAST_FUN(inputs[i]); \ - } \ + *result = FAST_FUN(*inputs); \ } \ }; \ } @@ -227,26 +226,28 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan) KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, exp, __expf) KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, log, __logf) -#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR) \ +#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR, REG) \ namespace detail { \ - template \ - struct apply_fastmath_impl, N, T, T> { \ + template<> \ + struct apply_fastmath_impl, 1, T, T> { \ KERNEL_FLOAT_INLINE static void call(ops::F fun, T* result, const T* inputs) { \ - for (size_t i = 0; i < N; i++) { \ - asm(INSTR, : "=f"(result[i]) : "f"(inputs[i])); \ - } \ + asm(INSTR : "=" REG(*result) : REG(*inputs)); \ } \ }; \ } -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1;", "d") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1;", "d") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, 
"sin.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1;", "f") + +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, exp2, "ex2.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, log2, "lg2.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, tanh, "tanh.approx.f32 %0, %1;", "f") #endif diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index bf9395b..e52fa02 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -54,7 +54,7 @@ struct vector: public S { typename A, typename B, typename... Rest, - typename = enable_if_t> + typename = enable_if_t>> KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... rest) : storage_type {T(a), T(b), T(rest)...} {} @@ -63,7 +63,7 @@ struct vector: public S { */ KERNEL_FLOAT_INLINE static constexpr size_t size() { - return E::size; + return extent_size; } /** @@ -276,6 +276,21 @@ struct vector: public S { KERNEL_FLOAT_INLINE void for_each(F fun) const { return kernel_float::for_each(*this, std::move(fun)); } + + /** + * Returns the result of `*this + lhs * rhs`. + * + * The operation is performed using a single `kernel_float::fma` call, which may be faster then perform + * the addition and multiplication separately. 
+ */ + template< + typename L, + typename R, + typename T2 = promote_t, vector_value_type>, + typename E2 = broadcast_extent, vector_extent_type>> + KERNEL_FLOAT_INLINE vector fma(const L& lhs, const R& rhs) const { + return ::kernel_float::fma(lhs, rhs, *this); + } }; /** @@ -308,6 +323,21 @@ template using vec7 = vec; template using vec8 = vec; // clang-format on +#define KERNEL_FLOAT_VECTOR_ALIAS(NAME, T) \ + template \ + using NAME##1 = vec; \ + using NAME##2 = vec; \ + using NAME##3 = vec; \ + using NAME##4 = vec; \ + using NAME##5 = vec; \ + using NAME##6 = vec; \ + using NAME##7 = vec; \ + using NAME##8 = vec; + +KERNEL_FLOAT_VECTOR_ALIAS(int, int) +KERNEL_FLOAT_VECTOR_ALIAS(float, float) +KERNEL_FLOAT_VECTOR_ALIAS(double, double) + /** * Create a vector from a variable number of input values. * diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 42a5814..a79da18 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,8 +16,8 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2024-05-17 10:55:41.948281 -// git hash: 41246ab6db9fcc24639342c439e606ba143ee346 +// date: 2024-09-23 14:12:25.024358 +// git hash: 3a88b56a57cce5e1f3365aa6e8efb76a14f7f865 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -85,8 +85,8 @@ #define KERNEL_FLOAT_MAX_ALIGNMENT (32) -#ifndef KERNEL_FLOAT_FAST_MATH -#define KERNEL_FLOAT_FAST_MATH (0) +#if KERNEL_FLOAT_FAST_MATH +#define KERNEL_FLOAT_POLICY ::kernel_float::fast_policy; #endif #endif //KERNEL_FLOAT_MACROS_H @@ -424,7 +424,6 @@ struct alignas(Alignment) aligned_array { }; template - struct aligned_array { KERNEL_FLOAT_INLINE T* data() { @@ -464,10 +463,12 @@ struct extent; template struct extent { - static constexpr size_t value = N; - static constexpr size_t size = N; + static constexpr size_t number_of_elements = N; }; +template +static constexpr size_t extent_size = E::number_of_elements; + namespace detail { // Indicates that elements of type `T` offer less precision than floats, thus operations // on elements of type `T` can be performed by upcasting them to ` float`. 
@@ -594,7 +595,7 @@ KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulon KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) -template> +template> struct vector; template @@ -603,11 +604,16 @@ struct into_vector_impl> { using extent_type = E; KERNEL_FLOAT_INLINE - static vector_storage call(const vector& input) { + static vector_storage> call(const vector& input) { return input.storage(); } }; +template +struct preferred_vector_size { + static constexpr size_t value = 1; +}; + template struct vector_traits; @@ -626,13 +632,13 @@ template using vector_extent_type = typename into_vector_impl::extent_type; template -static constexpr size_t vector_extent = vector_extent_type::value; +static constexpr size_t vector_size = extent_size>; template using into_vector_type = vector, vector_extent_type>; template -using vector_storage_type = vector_storage, vector_extent>; +using vector_storage_type = vector_storage, vector_size>; template using promoted_vector_value_type = promote_t...>; @@ -765,43 +771,66 @@ broadcast_like(const V& input, const R& other) { namespace detail { -template -struct apply_recur_impl; - template struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { - apply_recur_impl::call(fun, result, inputs...); + KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) { +#pragma unroll + for (size_t i = 0; i < N; i++) { + output[i] = fun(args[i]...); + } } }; -template -struct apply_recur_impl { - static constexpr size_t K = round_up_to_power_of_two(N) / 2; +template +struct apply_fastmath_impl: apply_impl {}; +} // namespace detail - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... 
inputs) { - apply_impl::call(fun, result, inputs...); - apply_impl::call(fun, result + K, (inputs + K)...); - } +struct accurate_policy { + template + using type = detail::apply_impl; }; -template<> -struct apply_recur_impl<0> { - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {} +struct fast_policy { + template + using type = detail::apply_fastmath_impl; }; -template<> -struct apply_recur_impl<1> { - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { - result[0] = fun(inputs[0]...); +#ifdef KERNEL_FLOAT_POLICY +using default_policy = KERNEL_FLOAT_POLICY; +#else +using default_policy = accurate_policy; +#endif + +namespace detail { + +template +struct map_policy_impl { + static constexpr size_t packet_size = preferred_vector_size::value; + static constexpr size_t remainder = N % packet_size; + + KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) { + if constexpr (N / packet_size > 0) { +#pragma unroll + for (size_t i = 0; i < N - remainder; i += packet_size) { + Policy::template type::call( + fun, + output + i, + (args + i)...); + } + } + + if constexpr (remainder > 0) { +#pragma unroll + for (size_t i = N - remainder; i < N; i++) { + Policy::template type::call(fun, output + i, (args + i)...); + } + } } }; template -struct apply_fastmath_impl: apply_impl {}; +using map_impl = map_policy_impl; + } // namespace detail template @@ -818,40 +847,13 @@ using map_type = * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] * ``` */ -template +template KERNEL_FLOAT_INLINE map_type map(F fun, const Args&... 
args) { using Output = result_t...>; using E = broadcast_vector_extent_type; - vector_storage result; - - // Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using apply_impl = detail::apply_fastmath_impl...>; -#else - using apply_impl = detail::apply_impl...>; -#endif - - apply_impl::call( - fun, - result.data(), - (detail::broadcast_impl, vector_extent_type, E>::call( - into_vector_storage(args)) - .data())...); - - return result; -} - -/** - * Apply the function `F` to each element from the vector `input` and return the results as a new vector. This - * uses fast-math if available for the given function `F`, otherwise this function behaves like `map`. - */ -template -KERNEL_FLOAT_INLINE map_type fast_map(F fun, const Args&... args) { - using Output = result_t...>; - using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; - detail::apply_fastmath_impl...>::call( + detail::map_policy_impl, Output, vector_value_type...>::call( fun, result.data(), (detail::broadcast_impl, vector_extent_type, E>::call( @@ -1200,10 +1202,10 @@ KERNEL_FLOAT_INLINE vector> cast(const V& input) { } #define KERNEL_FLOAT_DEFINE_UNARY_FUN(NAME) \ - template \ + template \ KERNEL_FLOAT_INLINE vector, vector_extent_type> NAME(const V& input) { \ using F = ops::NAME>; \ - return map(F {}, input); \ + return ::kernel_float::map(F {}, input); \ } #define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ @@ -1281,7 +1283,7 @@ KERNEL_FLOAT_DEFINE_UNARY_MATH(ilogb) KERNEL_FLOAT_DEFINE_UNARY_MATH(lgamma) KERNEL_FLOAT_DEFINE_UNARY_MATH(log) KERNEL_FLOAT_DEFINE_UNARY_MATH(log10) -KERNEL_FLOAT_DEFINE_UNARY_MATH(logb) +KERNEL_FLOAT_DEFINE_UNARY_MATH(log2) KERNEL_FLOAT_DEFINE_UNARY_MATH(nearbyint) KERNEL_FLOAT_DEFINE_UNARY_MATH(normcdf) KERNEL_FLOAT_DEFINE_UNARY_MATH(rcbrt) @@ -1308,12 +1310,11 @@ KERNEL_FLOAT_DEFINE_UNARY_STRUCT(rcp, 1.0 / input, 1.0f / input) KERNEL_FLOAT_DEFINE_UNARY_FUN(rcp) -#define 
KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ - template \ - KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ - const V& input) { \ - using F = ops::NAME>; \ - return fast_map(F {}, input); \ +#define KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ + template \ + KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ + const V& input) { \ + return ::kernel_float::map(ops::NAME> {}, input); \ } KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(exp) @@ -1324,17 +1325,17 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(rsqrt) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(sin) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(cos) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan) +KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(exp2) +KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(log2) #if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(T, F, FAST_FUN) \ namespace detail { \ - template \ - struct apply_fastmath_impl, N, T, T> { \ + template<> \ + struct apply_fastmath_impl, 1, T, T> { \ KERNEL_FLOAT_INLINE static void call(ops::F, T* result, const T* inputs) { \ - for (size_t i = 0; i < N; i++) { \ - result[i] = FAST_FUN(inputs[i]); \ - } \ + *result = FAST_FUN(*inputs); \ } \ }; \ } @@ -1342,26 +1343,28 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan) KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, exp, __expf) KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, log, __logf) -#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR) \ +#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR, REG) \ namespace detail { \ - template \ - struct apply_fastmath_impl, N, T, T> { \ + template<> \ + struct apply_fastmath_impl, 1, T, T> { \ KERNEL_FLOAT_INLINE static void call(ops::F fun, T* result, const T* inputs) { \ - for (size_t i = 0; i < N; i++) { \ - asm(INSTR, : "=f"(result[i]) : "f"(inputs[i])); \ - } \ + asm(INSTR : "=" REG(*result) : REG(*inputs)); \ } \ }; \ } -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, 
%1") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1;", "d") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1;", "d") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1;", "f") + +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, exp2, "ex2.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, log2, "lg2.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, tanh, "tanh.approx.f32 %0, %1;", "f") #endif @@ -1384,10 +1387,10 @@ namespace detail { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { using F = ops::cast; - vector_storage intermediate; - detail::apply_impl::call(F {}, intermediate.data(), input.data()); + vector_storage> intermediate; + detail::map_impl, T2, T>::call(F {}, intermediate.data(), input.data()); return detail::broadcast_impl::call(intermediate); } }; @@ -1396,7 +1399,7 @@ struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { return input; } }; @@ -1405,7 +1408,7 @@ 
struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { return detail::broadcast_impl::call(input); } }; @@ -1414,11 +1417,11 @@ struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { using F = ops::cast; - vector_storage result; - detail::apply_impl::call(F {}, result.data(), input.data()); + vector_storage> result; + detail::map_impl, T2, T>::call(F {}, result.data(), input.data()); return result; } }; @@ -1637,16 +1640,9 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co using O = result_t; using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; -// Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using apply_impl = detail::apply_fastmath_impl; -#else - using apply_impl = detail::apply_impl; -#endif - - apply_impl::call( + detail::map_impl, O, T, T>::call( fun, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( @@ -1890,20 +1886,18 @@ struct apply_fastmath_impl, N, T, T, T> { call(ops::divide fun, T* result, const T* lhs, const T* rhs) { T rhs_rcp[N]; - apply_fastmath_impl, N, T, T, T>::call({}, rhs_rcp, rhs); + // Fast way to perform division is to multiply by the reciprocal + apply_fastmath_impl, N, T, T>::call({}, rhs_rcp, rhs); apply_fastmath_impl, N, T, T, T>::call({}, result, lhs, rhs_rcp); } }; #if KERNEL_FLOAT_IS_DEVICE -template -struct apply_fastmath_impl, N, float, float, float> { +template<> +struct apply_fastmath_impl, 1, float, float, float> { KERNEL_FLOAT_INLINE static void call(ops::divide fun, float* result, const float* lhs, const float* rhs) { -#pragma unroll - for (size_t i = 0; i < N; i++) { - result[i] = __fdividef(lhs[i], rhs[i]); - } + *result = __fdividef(*lhs, *rhs); 
} }; #endif @@ -1913,9 +1907,9 @@ template> KERNEL_FLOAT_INLINE zip_common_type, T, T> fast_divide(const L& left, const R& right) { using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; - detail::apply_fastmath_impl, E::value, T, T, T>::call( + detail::map_policy_impl, extent_size, T, T, T>::call( ops::divide {}, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( @@ -2133,7 +2127,7 @@ KERNEL_FLOAT_INLINE void for_each(V&& input, F fun) { auto storage = into_vector_storage(input); #pragma unroll - for (size_t i = 0; i < vector_extent; i++) { + for (size_t i = 0; i < vector_size; i++) { fun(storage.data()[i]); } } @@ -2197,7 +2191,7 @@ KERNEL_FLOAT_INLINE vector> range() { */ template KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { - return range, vector_extent>(); + return range, vector_size>(); } /** @@ -2222,11 +2216,11 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { */ template KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return range>(); + return range>(); } namespace detail { -template, size_t N = vector_extent> +template, size_t N = vector_size> struct flatten_impl { using value_type = typename flatten_impl::value_type; static constexpr size_t size = N * flatten_impl::size; @@ -2288,7 +2282,7 @@ KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { namespace detail { template> struct concat_base_impl { - static constexpr size_t size = vector_extent; + static constexpr size_t size = vector_size; KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { vector_storage storage = into_vector_storage(input); @@ -2405,7 +2399,7 @@ using select_type = vector, extent>>; template KERNEL_FLOAT_INLINE select_type select(const V& input, const Is&... 
indices) { using T = vector_value_type; - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; static constexpr size_t M = concat_size; vector_storage index_set; @@ -2467,7 +2461,7 @@ struct copy_impl> { */ template> KERNEL_FLOAT_INLINE vector read(const T* ptr, const I& indices, const M& mask = true) { - return detail::copy_impl::load( + return detail::copy_impl>::load( ptr, convert_storage(indices, E()).data(), convert_storage(mask, E()).data()); @@ -2494,7 +2488,7 @@ template< typename M = bool, typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE void write(T* ptr, const I& indices, const V& values, const M& mask = true) { - return detail::copy_impl::store( + return detail::copy_impl>::store( ptr, convert_storage(values, E()).data(), convert_storage(indices, E()).data(), @@ -2531,7 +2525,7 @@ KERNEL_FLOAT_INLINE vector> read(const T* ptr) { */ template KERNEL_FLOAT_INLINE void write(T* ptr, const V& values) { - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; write(ptr, range(), values); } @@ -2706,7 +2700,7 @@ KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { */ template KERNEL_FLOAT_INLINE void write_aligned(T* ptr, const V& values) { - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; static constexpr size_t alignment = detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); return detail::copy_aligned_impl::store( @@ -3034,11 +3028,173 @@ vec_ptr(const T*) -> vec_ptr; } // namespace kernel_float #endif //KERNEL_FLOAT_MEMORY_H +#ifndef KERNEL_FLOAT_TRIOPS_H +#define KERNEL_FLOAT_TRIOPS_H + + + + +namespace kernel_float { + +namespace ops { +template +struct conditional { + KERNEL_FLOAT_INLINE T operator()(bool cond, T true_value, T false_value) { + if (cond) { + return true_value; + } else { + return false_value; + } + } +}; +} // namespace ops + +/** + * Return elements chosen from `true_values` and `false_values` depending on 
`cond`. + * + * This function broadcasts all arguments to the same size and then promotes the values of `true_values` and + * `false_values` into the same type. Next, it casts the values of `cond` to booleans and returns a vector where + * the values are taken from `true_values` where the condition is true and `false_values` otherwise. + * + * @param cond The condition used for selection. + * @param true_values The vector of values to choose from when the condition is true. + * @param false_values The vector of values to choose from when the condition is false. + * @return A vector containing selected elements as per the condition. + */ +template< + typename C, + typename L, + typename R, + typename T = promoted_vector_value_type, + typename E = broadcast_vector_extent_type> +KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) { + using F = ops::conditional; + vector_storage> result; + + detail::map_impl, T, bool, T, T>::call( + F {}, + result.data(), + detail::convert_impl, vector_extent_type, bool, E>::call( + into_vector_storage(cond)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(true_values)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(false_values)) + .data()); + + return result; +} + +/** + * Selects elements from `true_values` depending on `cond`. + * + * This function returns a vector where the values are taken from `true_values` where `cond` is `true` and `0` where + * `cond is `false`. + * + * @param cond The condition used for selection. + * @param true_values The vector of values to choose from when the condition is true. + * @return A vector containing selected elements as per the condition. 
+ */ +template< + typename C, + typename L, + typename T = vector_value_type, + typename E = broadcast_vector_extent_type> +KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values) { + vector> false_values = T {}; + return where(cond, true_values, false_values); +} + +/** + * Returns a vector having the value `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. + * + * @param cond The condition used for selection. + * @return A vector containing elements as per the condition. + */ +template> +KERNEL_FLOAT_INLINE vector where(const C& cond) { + return cast(cast(cond)); +} + +namespace ops { +template +struct fma { + KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) { + return ops::add {}(ops::multiply {}(a, b), c); + } +}; +} // namespace ops + +namespace detail { +template +struct apply_impl, N, T, T, T, T> { + KERNEL_FLOAT_INLINE + static void call(ops::fma, T* output, const T* a, const T* b, const T* c) { + T temp[N]; + apply_impl, N, T, T, T>::call({}, temp, a, b); + apply_impl, N, T, T, T>::call({}, output, temp, c); + } +}; +} // namespace detail + +#if KERNEL_FLOAT_IS_DEVICE +namespace ops { +template<> +struct fma { + KERNEL_FLOAT_INLINE float operator()(float a, float b, float c) { + return __fmaf_rn(a, b, c); + } +}; + +template<> +struct fma { + KERNEL_FLOAT_INLINE double operator()(double a, double b, double c) { + return __fma_rn(a, b, c); + } +}; +} // namespace ops +#endif + +/** + * Computes the result of `a * b + c`. This is done in a single operation if possible for the given vector type. 
+ */ +template< + typename A, + typename B, + typename C, + typename T = promoted_vector_value_type, + typename E = broadcast_vector_extent_type> +KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { + using F = ops::fma; + vector_storage> result; + + detail::map_impl, T, T, T, T>::call( + F {}, + result.data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(a)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(b)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(c)) + .data()); + + return result; +} + +} // namespace kernel_float + +#endif //KERNEL_FLOAT_TRIOPS_H #ifndef KERNEL_FLOAT_REDUCE_H #define KERNEL_FLOAT_REDUCE_H + namespace kernel_float { namespace detail { @@ -3059,7 +3215,7 @@ struct reduce_recur_impl { template KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { vector_storage temp; - apply_impl::call(fun, temp.data(), input, input + K); + map_impl::call(fun, temp.data(), input, input + K); if constexpr (N < 2 * K) { #pragma unroll @@ -3109,7 +3265,7 @@ struct reduce_recur_impl<2> { */ template KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) { - return detail::reduce_impl, vector_value_type>::call( + return detail::reduce_impl, vector_value_type>::call( fun, into_vector_storage(input).data()); } @@ -3213,14 +3369,38 @@ template struct dot_impl { KERNEL_FLOAT_INLINE static T call(const T* left, const T* right) { - vector_storage intermediate; - detail::apply_impl, N, T, T, T>::call( - ops::multiply(), - intermediate.data(), - left, - right); + static constexpr size_t K = preferred_vector_size::value; + T result = {}; + + if constexpr (N / K > 0) { + T accum[K] = {T {}}; + apply_impl, K, T, T, T>::call({}, accum, left, right); + +#pragma unroll + for (size_t i = 1; i < N / K; i++) { + apply_impl, K, T, T, T, T>::call( + ops::fma {}, + accum, + left + i * K, + right + i * K, + accum); + } + + result = 
reduce_impl, K, T>::call({}, accum); + } - return detail::reduce_impl, N, T>::call(ops::add(), intermediate.data()); + if constexpr (N % K > 0) { + for (size_t i = N - N % K; i < N; i++) { + apply_impl, 1, T, T, T, T>::call( + {}, + &result, + left + i, + right + i, + &result); + } + } + + return result; } }; } // namespace detail @@ -3239,7 +3419,7 @@ struct dot_impl { template> KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { using E = broadcast_vector_extent_type; - return detail::dot_impl::call( + return detail::dot_impl>::call( convert_storage(left, E {}).data(), convert_storage(right, E {}).data()); } @@ -3309,158 +3489,11 @@ struct magnitude_impl { */ template> KERNEL_FLOAT_INLINE T mag(const V& input) { - return detail::magnitude_impl>::call(into_vector_storage(input).data()); + return detail::magnitude_impl>::call(into_vector_storage(input).data()); } } // namespace kernel_float #endif //KERNEL_FLOAT_REDUCE_H -#ifndef KERNEL_FLOAT_TRIOPS_H -#define KERNEL_FLOAT_TRIOPS_H - - - - -namespace kernel_float { - -namespace ops { -template -struct conditional { - KERNEL_FLOAT_INLINE T operator()(bool cond, T true_value, T false_value) { - if (cond) { - return true_value; - } else { - return false_value; - } - } -}; -} // namespace ops - -/** - * Return elements chosen from `true_values` and `false_values` depending on `cond`. - * - * This function broadcasts all arguments to the same size and then promotes the values of `true_values` and - * `false_values` into the same type. Next, it casts the values of `cond` to booleans and returns a vector where - * the values are taken from `true_values` where the condition is true and `false_values` otherwise. - * - * @param cond The condition used for selection. - * @param true_values The vector of values to choose from when the condition is true. - * @param false_values The vector of values to choose from when the condition is false. - * @return A vector containing selected elements as per the condition. 
- */ -template< - typename C, - typename L, - typename R, - typename T = promoted_vector_value_type, - typename E = broadcast_vector_extent_type> -KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) { - using F = ops::conditional; - vector_storage result; - - detail::apply_impl::call( - F {}, - result.data(), - detail::convert_impl, vector_extent_type, bool, E>::call( - into_vector_storage(cond)) - .data(), - detail::convert_impl, vector_extent_type, T, E>::call( - into_vector_storage(true_values)) - .data(), - detail::convert_impl, vector_extent_type, T, E>::call( - into_vector_storage(false_values)) - .data()); - - return result; -} - -/** - * Selects elements from `true_values` depending on `cond`. - * - * This function returns a vector where the values are taken from `true_values` where `cond` is `true` and `0` where - * `cond is `false`. - * - * @param cond The condition used for selection. - * @param true_values The vector of values to choose from when the condition is true. - * @return A vector containing selected elements as per the condition. - */ -template< - typename C, - typename L, - typename T = vector_value_type, - typename E = broadcast_vector_extent_type> -KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values) { - vector> false_values = T {}; - return where(cond, true_values, false_values); -} - -/** - * Returns a vector having the value `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. - * - * @param cond The condition used for selection. - * @return A vector containing elements as per the condition. 
- */ -template> -KERNEL_FLOAT_INLINE vector where(const C& cond) { - return cast(cast(cond)); -} - -namespace ops { -template -struct fma { - KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) { - return a * b + c; - } -}; - -#if KERNEL_FLOAT_IS_DEVICE -template<> -struct fma { - KERNEL_FLOAT_INLINE float operator()(float a, float b, float c) { - return __fmaf_rn(a, b, c); - } -}; - -template<> -struct fma { - KERNEL_FLOAT_INLINE double operator()(double a, double b, double c) { - return __fma_rn(a, b, c); - } -}; -#endif -} // namespace ops - -/** - * Computes the result of `a * b + c`. This is done in a single operation if possible. - */ -template< - typename A, - typename B, - typename C, - typename T = promoted_vector_value_type, - typename E = broadcast_vector_extent_type> -KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { - using F = ops::fma; - vector_storage result; - - detail::apply_impl::call( - F {}, - result.data(), - detail::convert_impl, vector_extent_type, T, E>::call( - into_vector_storage(a)) - .data(), - detail::convert_impl, vector_extent_type, T, E>::call( - into_vector_storage(b)) - .data(), - detail::convert_impl, vector_extent_type, T, E>::call( - into_vector_storage(c)) - .data()); - - return result; -} - -} // namespace kernel_float - -#endif //KERNEL_FLOAT_TRIOPS_H #ifndef KERNEL_FLOAT_VECTOR_H #define KERNEL_FLOAT_VECTOR_H @@ -3517,7 +3550,7 @@ struct vector: public S { typename A, typename B, typename... Rest, - typename = enable_if_t> + typename = enable_if_t>> KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... 
rest) : storage_type {T(a), T(b), T(rest)...} {} @@ -3526,7 +3559,7 @@ struct vector: public S { */ KERNEL_FLOAT_INLINE static constexpr size_t size() { - return E::size; + return extent_size; } /** @@ -3739,6 +3772,21 @@ struct vector: public S { KERNEL_FLOAT_INLINE void for_each(F fun) const { return kernel_float::for_each(*this, std::move(fun)); } + + /** + * Returns the result of `*this + lhs * rhs`. + * + * The operation is performed using a single `kernel_float::fma` call, which may be faster then perform + * the addition and multiplication separately. + */ + template< + typename L, + typename R, + typename T2 = promote_t, vector_value_type>, + typename E2 = broadcast_extent, vector_extent_type>> + KERNEL_FLOAT_INLINE vector fma(const L& lhs, const R& rhs) const { + return ::kernel_float::fma(lhs, rhs, *this); + } }; /** @@ -3771,6 +3819,21 @@ template using vec7 = vec; template using vec8 = vec; // clang-format on +#define KERNEL_FLOAT_VECTOR_ALIAS(NAME, T) \ + template \ + using NAME##1 = vec; \ + using NAME##2 = vec; \ + using NAME##3 = vec; \ + using NAME##4 = vec; \ + using NAME##5 = vec; \ + using NAME##6 = vec; \ + using NAME##7 = vec; \ + using NAME##8 = vec; + +KERNEL_FLOAT_VECTOR_ALIAS(int, int) +KERNEL_FLOAT_VECTOR_ALIAS(float, float) +KERNEL_FLOAT_VECTOR_ALIAS(double, double) + /** * Create a vector from a variable number of input values. * @@ -3816,6 +3879,12 @@ vec(Args&&... 
args) -> vec, sizeof...(Args)>; namespace kernel_float { + +template<> +struct preferred_vector_size<__half> { + static constexpr size_t value = 2; +}; + KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) @@ -3972,58 +4041,10 @@ KERNEL_FLOAT_FP16_CAST(unsigned long, __ull2half_rn(input), (unsigned long)(__ha KERNEL_FLOAT_FP16_CAST(unsigned long long, __ull2half_rn(input), __half2ull_rz(input)); using half = __half; +KERNEL_FLOAT_VECTOR_ALIAS(half, __half) //KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) //KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) -#if KERNEL_FLOAT_IS_DEVICE -namespace detail { -template<> -struct dot_impl<__half, 0> { - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - return __half(0); - } -}; - -template<> -struct dot_impl<__half, 1> { - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - return __hmul(left[0], right[0]); - } -}; - -template -struct dot_impl<__half, N> { - static_assert(N >= 2, "internal error"); - - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - __half2 first_a = {left[0], left[1]}; - __half2 first_b = {right[0], right[1]}; - __half2 accum = __hmul2(first_a, first_b); - -#pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __half2 a = {left[i], left[i + 1]}; - __half2 b = {right[i], right[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __half result = __hadd(accum.x, accum.y); - - if (N % 2 != 0) { - __half a = left[N - 1]; - __half b = right[N - 1]; - result = __hfma(a, b, result); - } - - return result; - } -}; -} // namespace detail -#endif - } // namespace kernel_float #endif @@ -4042,6 +4063,12 @@ struct dot_impl<__half, N> { namespace kernel_float { + +template<> +struct preferred_vector_size<__nv_bfloat16> { + static constexpr size_t value = 2; +}; + KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__nv_bfloat16) 
KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) @@ -4248,58 +4275,10 @@ KERNEL_FLOAT_BF16_CAST( #endif using bfloat16 = __nv_bfloat16; +KERNEL_FLOAT_VECTOR_ALIAS(bfloat16x, __nv_bfloat16) //KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16) //KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16) -#if KERNEL_FLOAT_CUDA_ARCH >= 800 -namespace detail { -template<> -struct dot_impl<__nv_bfloat16, 0> { - KERNEL_FLOAT_INLINE - static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - return __nv_bfloat16(0); - } -}; - -template<> -struct dot_impl<__nv_bfloat16, 1> { - KERNEL_FLOAT_INLINE - static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - return __hmul(left[0], right[0]); - } -}; - -template -struct dot_impl<__nv_bfloat16, N> { - static_assert(N >= 2, "internal error"); - - KERNEL_FLOAT_INLINE - static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - __nv_bfloat162 first_a = {left[0], left[1]}; - __nv_bfloat162 first_b = {right[0], right[1]}; - __nv_bfloat162 accum = __hmul2(first_a, first_b); - -#pragma unroll - for (size_t i = 2; i + 1 < N; i += 2) { - __nv_bfloat162 a = {left[i], left[i + 1]}; - __nv_bfloat162 b = {right[i], right[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __nv_bfloat16 result = __hadd(accum.x, accum.y); - - if (N % 2 != 0) { - __nv_bfloat16 a = left[N - 1]; - __nv_bfloat16 b = right[N - 1]; - result = __hfma(a, b, result); - } - - return result; - } -}; -} // namespace detail -#endif - } // namespace kernel_float #if KERNEL_FLOAT_FP16_AVAILABLE