diff --git a/llvm/projects/CMakeLists.txt b/llvm/projects/CMakeLists.txt index d00a1a056c55..0a52fddeb72a 100644 --- a/llvm/projects/CMakeLists.txt +++ b/llvm/projects/CMakeLists.txt @@ -11,6 +11,7 @@ foreach(entry ${entries}) (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/libunwind) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/test-suite) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/parallel-libs) AND + (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/sycl) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/llvm-spirv) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/openmp) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/debuginfo-tests)) @@ -43,6 +44,7 @@ endif() add_llvm_external_project(dragonegg) add_llvm_external_project(parallel-libs) add_llvm_external_project(openmp) +add_llvm_external_project(sycl) add_llvm_external_project(llvm-spirv) if(LLVM_INCLUDE_TESTS) diff --git a/sycl/.clang-tidy b/sycl/.clang-tidy new file mode 100644 index 000000000000..0af3553a0cad --- /dev/null +++ b/sycl/.clang-tidy @@ -0,0 +1 @@ +Checks: '-*,clang-analyzer-*,clang-diagnostic-*,cppcoreguidelines-*,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-bounds-constant-array-index,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-cppcoreguidelines-pro-type-member-init,google-*,-cppcoreguidelines-pro-type-union-access,-google-build-using-namespace,-google-explicit-constructor,-google-runtime-references,misc-*,-misc-macro-parentheses,-misc-unused-parameters' diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt new file mode 100644 index 000000000000..9e1fe82ebc10 --- /dev/null +++ b/sycl/CMakeLists.txt @@ -0,0 +1,147 @@ +cmake_minimum_required(VERSION 3.2) + +project(sycl-solution) +# Requirements +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +if(MSVC) + set_property(GLOBAL PROPERTY USE_FOLDERS ON) +endif() + +# Get clang's version +include(VersionFromVCS) 
+set(PACKAGE_VERSION "${LLVM_PACKAGE_VERSION}") + +# If CLANG_VERSION_* is specified, use it, if not use LLVM_VERSION_*. +if(NOT DEFINED CLANG_VERSION_MAJOR) + set(CLANG_VERSION_MAJOR ${LLVM_VERSION_MAJOR}) +endif() +if(NOT DEFINED CLANG_VERSION_MINOR) + set(CLANG_VERSION_MINOR ${LLVM_VERSION_MINOR}) +endif() +if(NOT DEFINED CLANG_VERSION_PATCHLEVEL) + set(CLANG_VERSION_PATCHLEVEL ${LLVM_VERSION_PATCH}) +endif() +# Unlike PACKAGE_VERSION, CLANG_VERSION does not include LLVM_VERSION_SUFFIX. +set(CLANG_VERSION "${CLANG_VERSION_MAJOR}.${CLANG_VERSION_MINOR}.${CLANG_VERSION_PATCHLEVEL}") + +set ( LLVM_INST_INC_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include" ) + +find_package(OpenCL REQUIRED) + +include_directories(${OpenCL_INCLUDE_DIRS}) +link_libraries(OpenCL) + +# Copy SYCL headers +set(sycl_inc_dir ${CMAKE_CURRENT_SOURCE_DIR}/include/CL) +set(dst_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include/CL) +add_custom_target(sycl-headers ALL +COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir} ${dst_dir} +COMMENT "Copying SYCL headers ...") + +# Main library + +set(sourceRootPath "${CMAKE_CURRENT_SOURCE_DIR}/source") +set(includeRootPath "${CMAKE_CURRENT_SOURCE_DIR}/include") + +set(SYCLLibrary sycl) + +#To-Do: +#1. Figure out why CMP0057 has to be set. Should have been taken care of earlier in the build +#2. 
Use AddLLVM to modify the build and access config options
+#cmake_policy(SET CMP0057 NEW)
+#include(AddLLVM)
+set(LLVM_BUILD_LIBRARY_DIRS "${LLVM_BINARY_DIR}/lib/")
+
+set(SYCL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+set(SYCL_TESTS_BINARY_DIR ${SYCL_BINARY_DIR}/test)
+
+set(CLANG_IN_BUILD "${LLVM_BINARY_DIR}/bin/clang")
+
+set(LLVM_TOOLS_DIR "${LLVM_BINARY_DIR}/bin/")
+
+set(SYCL_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/include/")
+set(OPENCL_INCLUDE "${OpenCL_INCLUDE_DIRS}")
+
+add_library("${SYCLLibrary}" SHARED
+  "${includeRootPath}/CL/sycl.hpp"
+  "${sourceRootPath}/detail/common.cpp"
+  "${sourceRootPath}/detail/device_info.cpp"
+  "${sourceRootPath}/detail/event_impl.cpp"
+  "${sourceRootPath}/detail/force_device.cpp"
+  "${sourceRootPath}/detail/helpers.cpp"
+  "${sourceRootPath}/detail/kernel_impl.cpp"
+  "${sourceRootPath}/detail/kernel_info.cpp"
+  "${sourceRootPath}/detail/platform_host.cpp"
+  "${sourceRootPath}/detail/platform_opencl.cpp"
+  "${sourceRootPath}/detail/platform_info.cpp"
+  "${sourceRootPath}/detail/program_impl.cpp"
+  "${sourceRootPath}/detail/program_manager/program_manager.cpp"
+  "${sourceRootPath}/detail/queue_impl.cpp"
+  "${sourceRootPath}/detail/scheduler/commands.cpp"
+  "${sourceRootPath}/detail/scheduler/printers.cpp"
+  "${sourceRootPath}/detail/scheduler/scheduler.cpp"
+  "${sourceRootPath}/context.cpp"
+  "${sourceRootPath}/device.cpp"
+  "${sourceRootPath}/device_selector.cpp"
+  "${sourceRootPath}/event.cpp"
+  "${sourceRootPath}/exception.cpp"
+  "${sourceRootPath}/kernel.cpp"
+  "${sourceRootPath}/platform.cpp"
+  "${sourceRootPath}/queue.cpp"
+  "${sourceRootPath}/spirv_ops.cpp"
+)
+
+target_include_directories("${SYCLLibrary}" PUBLIC "${includeRootPath}")
+
+target_link_libraries("${SYCLLibrary}" "${OpenCL_LIBRARIES}")
+set_target_properties("${SYCLLibrary}" PROPERTIES LINKER_LANGUAGE CXX)
+
+# Workaround for bug in GCC version 5.
+# More information https://bugs.launchpad.net/ubuntu/+source/gcc-5/+bug/1568899
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+    CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0 AND
+    CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0)
+  target_link_libraries("${SYCLLibrary}" gcc_s gcc)
+endif()
+
+install(TARGETS "${SYCLLibrary}" DESTINATION "lib" COMPONENT ${SYCLLibrary})
+install(DIRECTORY "${includeRootPath}/." DESTINATION "${LLVM_INST_INC_DIRECTORY}" COMPONENT sycl_headers)
+
+add_subdirectory( test )
+add_subdirectory( tools )
+
+set(manifest_list)
+set( DEPLOY_LIST
+  sycl
+  ocl_lib
+  ocl_headers
+  sycl_headers
+  clang
+  clang-offload-wrapper
+  clang-offload-bundler
+  llc
+  llvm-as
+  llvm-dis
+  llvm-spirv
+  llvm-link
+  opt
+)
+
+foreach( comp ${DEPLOY_LIST} )
+
+  message( STATUS "Adding component ${comp} to deploy")
+
+  set (manifest ${CMAKE_CURRENT_BINARY_DIR}/install_manifest_${comp}.txt)
+  add_custom_command(OUTPUT ${manifest}
+    COMMAND "${CMAKE_COMMAND}"
+    "-DCMAKE_INSTALL_COMPONENT=${comp}"
+    -P "${CMAKE_BINARY_DIR}/cmake_install.cmake"
+    COMMENT "Deploying component ${comp}"
+    USES_TERMINAL VERBATIM)
+  list(APPEND manifest_list ${manifest})
+endforeach()
+
+add_custom_target(deploy DEPENDS ${manifest_list})
diff --git a/sycl/LICENSE.TXT b/sycl/LICENSE.TXT
new file mode 100644
index 000000000000..461398bab7a7
--- /dev/null
+++ b/sycl/LICENSE.TXT
@@ -0,0 +1,68 @@
+==============================================================================
+LLVM Release License
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2003-2018 University of Illinois at Urbana-Champaign.
+All rights reserved.
+ +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. + +============================================================================== +Copyrights and Licenses for Third Party Software Distributed with LLVM: +============================================================================== +The LLVM software contains code written by third parties. Such software will +have its own individual LICENSE.TXT file in the directory in which it appears. 
+This file will describe the copyrights, license, and restrictions which apply +to that code. + +The disclaimer of warranty in the University of Illinois Open Source License +applies to all code in the LLVM Distribution, and nothing in any of the +other licenses gives permission to use the names of the LLVM Team or the +University of Illinois to endorse or promote products derived from this +Software. + +The following pieces of software have additional or alternate copyrights, +licenses, and/or restrictions: + +Program Directory +------- --------- +Google Test llvm/utils/unittest/googletest +OpenBSD regex llvm/lib/Support/{reg*, COPYRIGHT.regex} +pyyaml tests llvm/test/YAMLParser/{*.data, LICENSE.TXT} +ARM contributions llvm/lib/Target/ARM/LICENSE.TXT +md5 contributions llvm/lib/Support/MD5.cpp llvm/include/llvm/Support/MD5.h diff --git a/sycl/LICENSE2.TXT b/sycl/LICENSE2.TXT new file mode 100644 index 000000000000..f9dc50615d7e --- /dev/null +++ b/sycl/LICENSE2.TXT @@ -0,0 +1,219 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + diff --git a/sycl/doc/GetStartedWithSYCLCompiler.md b/sycl/doc/GetStartedWithSYCLCompiler.md new file mode 100644 index 000000000000..b5ee86c3bd1c --- /dev/null +++ b/sycl/doc/GetStartedWithSYCLCompiler.md @@ -0,0 +1,190 @@ +# Overview + +The SYCL* Compiler compiles C++\-based SYCL source files with code for both CPU and a wide range of compute accelerators. The compiler uses Khronos* OpenCL™ API to offload computations to accelerators. 
+ +# Before You Begin + +Software requirements: + +Installing OpenCL 2.1 compatible software stack: +1. OpenCL headers: + + a. Download the OpenCL headers from [github.com/KhronosGroup/OpenCL-Headers](https://github.com/KhronosGroup/OpenCL-Headers) to your local machine. e.g. `/usr/local/include/CL` with environment var `$OPENCL_HEADERS`. +2. OpenCL runtime for CPU and GPU: + + a. OpenCL runtime for GPU: follow instructions on [github.com/intel/compute-runtime/releases](https://github.com/intel/compute-runtime/releases) to install. + + b. OpenCL runtime for CPU: follow instructions under section "Intel® CPU Runtime for OpenCL. Applications 18.1 for Linux* OS (64bit only)" on [https://software.intel.com/en-us/articles/opencl-drivers#cpu-section](https://software.intel.com/en-us/articles/opencl-drivers#cpu-section) and click on orange "Download" button to download & install. + +# Build the SYCL compiler + +Download the LLVM* repository with SYCL support to your local machine folder e.g. `$HOME/sycl` (assuming environment var `$SYCL_HOME`) folder using following command: + +``` +git clone https://github.com/intel/llvm -b sycl $HOME/sycl +``` + +Follow regular LLVM build instructions under: [llvm.org/docs/CMake.html](https://llvm.org/docs/CMake.html). To build SYCL runtime use modified CMake command below: + +``` +mkdir $SYCL_HOME/build +cd $SYCL_HOME/build +cmake -DCMAKE_BUILD_TYPE=Release -DOpenCL_INCLUDE_DIR=$OPENCL_HEADERS -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_EXTERNAL_PROJECTS="sycl;llvm-spirv" -DLLVM_EXTERNAL_SYCL_SOURCE_DIR=$SYCL_HOME/sycl -DLLVM_EXTERNAL_LLVM_SPIRV_SOURCE_DIR=$SYCL_HOME/llvm-spirv -DLLVM_TOOL_SYCL_BUILD=ON -DLLVM_TOOL_LLVM_SPIRV_BUILD=ON $SYCL_HOME/llvm +make -j`nproc` check-all +``` + +After the build completed, the SYCL compiler/include/libraries can be found under `$SYCL_HOME/build` directory. + +# Creating a simple SYCL program + +A simple SYCL program consists of following parts: +1. Header section +2. Allocating buffer for data +3. 
Creating SYCL queue +4. Submitting command group to SYCL queue which includes the kernel +5. Wait for the queue to complete the work +6. Use buffer accessor to retrieve the result on the device and verify the data +7. The end + +Creating a file `simple-sycl-app.cpp` with the following C++ SYCL code in it: + +``` + +#include + +int main() { + // Creating buffer of 4 ints to be used inside the kernel code + cl::sycl::buffer Buffer(4); + + // Creating SYCL queue + cl::sycl::queue Queue; + + // Size of index space for kernel + cl::sycl::range<1> NumOfWorkItems{Buffer.get_count()}; + + // Submitting command group(work) to queue + Queue.submit([&](cl::sycl::handler &cgh) { + // Getting write only access to the buffer on a device + auto Accessor = Buffer.get_access(cgh); + // Executing kernel + cgh.parallel_for( + NumOfWorkItems, [=](cl::sycl::id<1> WIid) { + // Fill buffer with indexes + Accessor[WIid] = (cl::sycl::cl_int)WIid.get(0); + }); + }); + + // Getting read only access to the buffer on the host. + // Implicit barrier waiting for queue to complete the work. + const auto HostAccessor = Buffer.get_access(); + + // Check the results + bool MismatchFound = false; + for (size_t I = 0; I < Buffer.get_count(); ++I) { + if (HostAccessor[I] != I) { + std::cout << "The result is incorrect for element: " << I + << " , expected: " << I << " , got: " << HostAccessor[I] + << std::endl; + MismatchFound = true; + } + } + + if (!MismatchFound) { + std::cout << "The results are correct!" << std::endl; + } + + return MismatchFound; +} + +``` + +# Build and Test a simple SYCL program +The SYCL Compiler supports two types of compilation: + +1. Simplified one step that compiles to binary directly + + ``` + clang++ -std=c++11 -fsycl simple-sycl-app.cpp -o simple-sycl-app -lsycl -lOpenCL + ``` + +2. Manual two steps compilation that compiles device (to SPIR-V) and host code separately (to binary) + + a. 
Compile the device code from the C++ file into the SPIR-V file: + + ``` + clang++ --sycl -Xclang -fsycl-int-header=simple-sycl-app-int-header.h -c simple-sycl-app.cpp -o kernel.spv + # NOTE: The section "-Xclang -fsycl-int-header=simple-sycl-app-int-header.h" + # generates `integration header` file. + # This file must be included for the host side compilation. + # NOTE: The output file name must be kernel.spv + ``` + + b. Compile host code from the same C++ file into an executable: + + ``` + clang++ -std=c++11 -include simple-sycl-app-int-header.h simple-sycl-app.cpp -o simple-sycl-app -lsycl -lOpenCL + # NOTE: The section "-include simple-sycl-app-int-header.h" includes + # integration header file, which is produced by the device compiler. + ``` + +This `simple-sycl-app` application doesn't specify SYCL device for execution, so SYCL runtime will first try to execute on OpenCL GPU device first, if OpenCL GPU device is not found, it will try to run OpenCL CPU device; and if OpenCL CPU device is also not available, SYCL runtime will run on SYCL host device. + +To run the `simple-sycl-app`: + + LD_LIBRARY_PATH=$SYCL_HOME/build/lib ./simple-sycl-app + The results are correct! + +NOTE: SYCL developer can specify SYCL device for execution using device selectors (e.g. `cl::sycl::cpu_selector`, `cl::sycl::gpu_selector`) as explained in following section [Code the program for a specific GPU](#code-the-program-for-a-specific-gpu). + +# Code the program for a specific GPU + +To specify OpenCL device SYCL provides the abstract `cl::sycl::device_selector` class which the can be used to define how the runtime should select the best device. + +The method `cl::sycl::device_selector::operator()` of the SYCL `cl::sycl::device_selector` is an abstract member function which takes a reference to a SYCL device and returns an integer score. This abstract member function can be implemented in a derived class to provide a logic for selecting a SYCL device. 
SYCL runtime uses the device for with the highest score is returned. Such object can be passed to `cl::sycl::queue` and `cl::sycl::device` constructors. + +The example below illustrates how to use `cl::sycl::device_selector` to create device and queue objects bound to Intel GPU device: + +``` +#include + +int main() { + class NEOGPUDeviceSelector : public cl::sycl::device_selector { + public: + int operator()(const cl::sycl::device &Device) const override { + using namespace cl::sycl::info; + + const std::string DeviceName = Device.get_info(); + const std::string DeviceVendor = Device.get_info(); + + return Device.is_gpu() && DeviceName.find("HD Graphics NEO") ? 1 : -1; + } + }; + + NEOGPUDeviceSelector Selector; + try { + cl::sycl::queue Queue(Selector); + cl::sycl::device Device(Selector); + } catch (cl::sycl::invalid_parameter_error &E) { + std::cout << E.what() << std::endl; + } +} + +``` + + +# Known Issues or Limitations + +- SYCL device compiler fails if the same kernel was used in different translation units. +- SYCL host device is not fully supported. +- SYCL works only with OpenCL implementations supporting out-of-order queues. +- `math.h` header is conflicting with SYCL headers. Please use `cmath` as a workaround for now like below: + +``` +//#include // conflicting +#include +``` + +# Find More + +SYCL 1.2.1 specification: [www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf](https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) + diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp new file mode 100644 index 000000000000..3197620f41e6 --- /dev/null +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -0,0 +1,150 @@ +//==---------- spirv_ops.hpp --- SPIRV operations -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +namespace cl { +namespace __spirv { + +#ifdef __SYCL_DEVICE_ONLY__ + +template +extern OpTypeEvent * +OpGroupAsyncCopy(int32_t Scope, __local dataT *Dest, __global dataT *Src, + size_t NumElements, size_t Stride, OpTypeEvent *E) noexcept; + +template +extern OpTypeEvent * +OpGroupAsyncCopy(int32_t Scope, __global dataT *Dest, __local dataT *Src, + size_t NumElements, size_t Stride, OpTypeEvent *E) noexcept; + +#define OpGroupAsyncCopyGlobalToLocal OpGroupAsyncCopy +#define OpGroupAsyncCopyLocalToGlobal OpGroupAsyncCopy + +// Atomic SPIR-V builtins +#define __SPIRV_ATOMIC_LOAD(AS, Type) \ + extern Type OpAtomicLoad(AS Type *P, Scope S, MemorySemantics O); +#define __SPIRV_ATOMIC_STORE(AS, Type) \ + extern void OpAtomicStore(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_EXCHANGE(AS, Type) \ + extern Type OpAtomicExchange(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_CMP_EXCHANGE(AS, Type) \ + extern Type OpAtomicCompareExchange(AS Type *P, Scope S, MemorySemantics E, \ + MemorySemantics U, Type V, Type C); +#define __SPIRV_ATOMIC_IADD(AS, Type) \ + extern Type OpAtomicIAdd(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_ISUB(AS, Type) \ + extern Type OpAtomicISub(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_SMIN(AS, Type) \ + extern Type OpAtomicSMin(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_UMIN(AS, Type) \ + extern Type OpAtomicUMin(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_SMAX(AS, Type) \ + extern Type OpAtomicSMax(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_UMAX(AS, Type) \ + extern Type OpAtomicUMax(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_AND(AS, Type) \ + extern Type OpAtomicAnd(AS Type *P, 
Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_OR(AS, Type) \ + extern Type OpAtomicOr(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_XOR(AS, Type) \ + extern Type OpAtomicXor(AS Type *P, Scope S, MemorySemantics O, Type V); + +#define __SPIRV_ATOMIC_FLOAT(AS, Type) \ + __SPIRV_ATOMIC_LOAD(AS, Type) \ + __SPIRV_ATOMIC_STORE(AS, Type) \ + __SPIRV_ATOMIC_EXCHANGE(AS, Type) + +#define __SPIRV_ATOMIC_BASE(AS, Type) \ + __SPIRV_ATOMIC_FLOAT(AS, Type) \ + __SPIRV_ATOMIC_CMP_EXCHANGE(AS, Type) \ + __SPIRV_ATOMIC_IADD(AS, Type) \ + __SPIRV_ATOMIC_ISUB(AS, Type) \ + __SPIRV_ATOMIC_AND(AS, Type) \ + __SPIRV_ATOMIC_OR(AS, Type) \ + __SPIRV_ATOMIC_XOR(AS, Type) + +#define __SPIRV_ATOMIC_SIGNED(AS, Type) \ + __SPIRV_ATOMIC_BASE(AS, Type) \ + __SPIRV_ATOMIC_SMIN(AS, Type) \ + __SPIRV_ATOMIC_SMAX(AS, Type) + +#define __SPIRV_ATOMIC_UNSIGNED(AS, Type) \ + __SPIRV_ATOMIC_BASE(AS, Type) \ + __SPIRV_ATOMIC_UMIN(AS, Type) \ + __SPIRV_ATOMIC_UMAX(AS, Type) + +// Helper atomic operations which select correct signed/unsigned version +// of atomic min/max based on the signed-ness of the type +#define __SPIRV_ATOMIC_MINMAX(AS, Op) \ + template \ + typename std::enable_if::value, T>::type OpAtomic##Op( \ + AS T *Ptr, Scope Scope, MemorySemantics Semantics, T Value) { \ + return OpAtomicS##Op(Ptr, Scope, Semantics, Value); \ + } \ + template \ + typename std::enable_if::value, T>::type OpAtomic##Op( \ + AS T *Ptr, Scope Scope, MemorySemantics Semantics, T Value) { \ + return OpAtomicU##Op(Ptr, Scope, Semantics, Value); \ + } + +#define __SPIRV_ATOMICS(macro, Arg) macro(__global, Arg) macro(__local, Arg) + +__SPIRV_ATOMICS(__SPIRV_ATOMIC_FLOAT, float) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_SIGNED, int) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_SIGNED, long) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_SIGNED, long long) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned int) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned 
long long) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Min) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Max) + +#else + +template +extern OpTypeEvent * +OpGroupAsyncCopyGlobalToLocal(int32_t Scope, dataT *Dest, dataT *Src, + size_t NumElements, size_t Stride, + OpTypeEvent *E) noexcept { + for (int i = 0; i < NumElements; i++) { + Dest[i] = Src[i * Stride]; + } + // A real instance of the class is not needed, return dummy pointer. + return nullptr; +} + +template +extern OpTypeEvent * +OpGroupAsyncCopyLocalToGlobal(int32_t Scope, dataT *Dest, dataT *Src, + size_t NumElements, size_t Stride, + OpTypeEvent *E) noexcept { + for (int i = 0; i < NumElements; i++) { + Dest[i * Stride] = Src[i]; + } + // A real instance of the class is not needed, return dummy pointer. + return nullptr; +} + +#endif // __SYCL_DEVICE_ONLY__ + +extern void OpControlBarrier(Scope Execution, Scope Memory, + uint32_t Semantics) noexcept; + +extern void OpMemoryBarrier(Scope Memory, uint32_t Semantics) noexcept; + +extern void OpGroupWaitEvents(int32_t Scope, uint32_t NumEvents, + OpTypeEvent ** WaitEvents) noexcept; + +} // namespace __spirv +} // namespace cl diff --git a/sycl/include/CL/__spirv/spirv_types.hpp b/sycl/include/CL/__spirv/spirv_types.hpp new file mode 100644 index 000000000000..8e2d6bfa357b --- /dev/null +++ b/sycl/include/CL/__spirv/spirv_types.hpp @@ -0,0 +1,48 @@ +//===----------- spirv_types.hpp --- SPIRV types -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace cl { +namespace __spirv { + +// TODO: include the header file with SPIR-V declarations from SPIRV-Headers +// project. 
+enum Scope { + CrossDevice = 0, + Device = 1, + Workgroup = 2, + Subgroup = 3, + Invocation = 4, +}; + +enum MemorySemantics { + None = 0x0, + Acquire = 0x2, + Release = 0x4, + AcquireRelease = 0x8, + SequentiallyConsistent = 0x10, + UniformMemory = 0x40, + SubgroupMemory = 0x80, + WorkgroupMemory = 0x100, + CrossWorkgroupMemory = 0x200, + AtomicCounterMemory = 0x400, + ImageMemory = 0x800, +}; + +// This class does not have definition, it is only predeclared here. +// The pointers to this class objects can be passed to or returned from +// SPIRV built-in functions. +// Only in such cases the class is recognized as SPIRV type OpTypeEvent. +class OpTypeEvent; + +} // namespace __spirv +} // namespace cl diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp new file mode 100644 index 000000000000..131d1863ba6e --- /dev/null +++ b/sycl/include/CL/sycl.hpp @@ -0,0 +1,47 @@ +//==------------ sycl.hpp - SYCL standard header file ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Do not include RT only function implementations for device code as it leads +// to problem. Should be finally fixed when we introduce library. +#ifndef __SYCL_DEVICE_ONLY__ +// The following files are supposed to be included after all SYCL classes +// processed. 
+#include +#include +#include +#endif //__SYCL_DEVICE_ONLY__ diff --git a/sycl/include/CL/sycl/access/access.hpp b/sycl/include/CL/sycl/access/access.hpp new file mode 100644 index 000000000000..2c844e22285a --- /dev/null +++ b/sycl/include/CL/sycl/access/access.hpp @@ -0,0 +1,159 @@ +//==---------------- access.hpp --- SYCL access ----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#pragma once + +namespace cl { +namespace sycl { +namespace access { + +enum class target { + global_buffer = 2014, + constant_buffer, + local, + image, + host_buffer, + host_image, + image_array +}; + +enum class mode { + read = 1024, + write, + read_write, + discard_write, + discard_read_write, + atomic +}; + +enum class fence_space { + local_space, + global_space, + global_and_local +}; + +enum class placeholder { false_t, true_t }; + +enum class address_space : int { + private_space = 0, + global_space, + constant_space, + local_space +}; + +} // namespace access + +namespace detail { + +constexpr bool isTargetHostAccess(access::target T) { + return T == access::target::host_buffer || T == access::target::host_image; +} + +constexpr bool modeNeedsOldData(access::mode m) { + return m == access::mode::read || m == access::mode::write || + m == access::mode::read_write || m == access::mode::atomic; +} + +constexpr bool modeWritesNewData(access::mode m) { + return m != access::mode::read; +} + +#ifdef __SYCL_DEVICE_ONLY__ +#define SYCL_GLOBAL_AS __global +#define SYCL_LOCAL_AS __local +#define SYCL_CONSTANT_AS __constant +#define SYCL_PRIVATE_AS __private +#else +#define SYCL_GLOBAL_AS +#define SYCL_LOCAL_AS +#define SYCL_CONSTANT_AS +#define SYCL_PRIVATE_AS +#endif + +template +struct DeviceValueType; + +template +struct DeviceValueType { + using type = 
SYCL_GLOBAL_AS dataT; +}; + +template +struct DeviceValueType { + using type = SYCL_CONSTANT_AS dataT; +}; + +template +struct DeviceValueType { + using type = SYCL_LOCAL_AS dataT; +}; + +template +struct DeviceValueType { + using type = dataT; +}; + +template +struct PtrValueType; + +template +struct PtrValueType { + using type = SYCL_PRIVATE_AS ElementType; +}; + +template +struct PtrValueType { + using type = SYCL_GLOBAL_AS ElementType; +}; + +template +struct PtrValueType { + using type = SYCL_CONSTANT_AS ElementType; +}; + +template +struct PtrValueType { + using type = SYCL_LOCAL_AS ElementType; +}; + +template +struct remove_AS { + typedef T type; +}; + +#ifdef __SYCL_DEVICE_ONLY__ +template +struct remove_AS { + typedef T type; +}; + +template +struct remove_AS { + typedef T type; +}; + +template +struct remove_AS { + typedef T type; +}; + +template +struct remove_AS { + typedef T type; +}; +#endif + +#undef SYCL_GLOBAL_AS +#undef SYCL_LOCAL_AS +#undef SYCL_CONSTANT_AS +#undef SYCL_PRIVATE_AS + +} // namespace detail + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp new file mode 100644 index 000000000000..0f865ccd40aa --- /dev/null +++ b/sycl/include/CL/sycl/accessor.hpp @@ -0,0 +1,848 @@ +//==--------- accessor.hpp --- SYCL accessor -------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +// TODO: 4.3.2 Implement common reference semantics +namespace detail { + +template +class accessor_base; + +template +class subscript_obj { + using accessor_t = accessor_base; + + // TODO: Remove reference here as subscript_obj, can potentially outlive + // the accessor. There is no spec-defined usecase, so leave it for now. + const accessor_t &accRef; + cl::sycl::id ids; + +public: + subscript_obj(const accessor_t &acc, cl::sycl::id &indexes) + : accRef(acc), ids(indexes) {} + + INLINE_IF_DEVICE subscript_obj + operator[](size_t index) { + ids[accessorDim - dimensions] = index; + return subscript_obj(accRef, ids); + } +}; + +template +class subscript_obj { + using accessor_t = accessor_base; + + const accessor_t &accRef; + cl::sycl::id ids; + +public: + subscript_obj(const accessor_t &acc, cl::sycl::id &indexes) + : accRef(acc), ids(indexes) {} + + INLINE_IF_DEVICE dataT &operator[](size_t index) { + ids[accessorDim - 1] = index; + return accRef.__impl()->Data[getOffsetForId( + accRef.__impl()->Range, ids, accRef.__impl()->Offset)]; + } +}; + +template +class subscript_obj { + using accessor_t = accessor_base; + + const accessor_t &accRef; + cl::sycl::id ids; + +public: + subscript_obj(const accessor_t &acc, cl::sycl::id &indexes) + : accRef(acc), ids(indexes) {} + + INLINE_IF_DEVICE typename detail::remove_AS::type + operator[](size_t index) { + ids[accessorDim - 1] = index; + return accRef.__impl()->Data[getOffsetForId( + accRef.__impl()->Range, ids, accRef.__impl()->Offset)]; + } +}; + +/// Specializations of accessor_impl define data fields for accessor. +/// There is no default implementation for the class. This class is +/// not a root of the class hierarchy, because it should be +/// initialized at the bottom of the hierarchy. 
+template +struct accessor_impl; + +#define SYCL_ACCESSOR_IMPL(CONDITION) \ + template \ + struct accessor_impl::type> + +/// Implementation of host accessor providing access to a single element. +/// Available when (dimensions == 0). +SYCL_ACCESSOR_IMPL(isTargetHostAccess(accessTarget) && dimensions == 0) { + dataT *Data; + accessor_impl(dataT *Data) : Data(Data) {} + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return 1; } +}; + +/// Implementation of host accessor. +/// Available when (dimensions > 0). +SYCL_ACCESSOR_IMPL(isTargetHostAccess(accessTarget) && dimensions > 0) { + dataT *Data; + range Range; + id Offset; + + accessor_impl(dataT *Data, range Range, + id Offset = {}) + : Data(Data), Range(Range), Offset(Offset) {} + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return Range.size(); } +}; + +/// Implementation of device (kernel) accessor providing access to a single +/// element. Available only when (dimensions == 0). +/// There is no way to tell at compile time if this accessor will be used +/// on OpenCL device or on host. So, the class should fit both variants. +SYCL_ACCESSOR_IMPL(!isTargetHostAccess(accessTarget) && + accessTarget != access::target::local && + dimensions == 0) { + // This field must be the first to guarantee that it's safe to use + // reinterpret casting while setting kernel arguments in order to get cl_mem + // value from the buffer regardless of the accessor's dimensionality. +#ifndef __SYCL_DEVICE_ONLY__ + detail::buffer_impl *m_Buf = nullptr; + +#else + char padding[sizeof(detail::buffer_impl *)]; +#endif // __SYCL_DEVICE_ONLY__ + + dataT *Data; + + // Device accessors must be associated with a command group handler. + // The handler though can be nullptr at the creation point if the + // accessor is a placeholder accessor. 
+ accessor_impl(dataT *Data, handler *Handler = nullptr) + : Data(Data) + {} + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return 1; } + + static_assert( + std::is_same::type, + dataT>::value, + "The type should have been adjusted before propagating through " + "class hierarchy"); +}; + +/// Implementation of device (kernel) accessor. There is no way to +/// tell at compile time if this accessor will be used on OpenCL +/// device or on host. So, the class should fit both variants. +/// Available only when (dimensions > 0). +SYCL_ACCESSOR_IMPL(!isTargetHostAccess(accessTarget) && + accessTarget != access::target::local && + dimensions > 0) { + // This field must be the first to guarantee that it's safe to use + // reinterpret casting while setting kernel arguments in order to get cl_mem + // value from the buffer regardless of the accessor's dimensionality. +#ifndef __SYCL_DEVICE_ONLY__ + detail::buffer_impl *m_Buf = nullptr; +#else + char padding[sizeof(detail::buffer_impl *)]; +#endif // __SYCL_DEVICE_ONLY__ + + dataT *Data; + range Range; + id Offset; + + // Device accessors must be associated with a command group handler. + // The handler though can be nullptr at the creation point if the + // accessor is a placeholder accessor. + accessor_impl(dataT *Data, range Range, + handler *Handler = nullptr, id Offset = {}) + : Data(Data), Range(Range), Offset(Offset) + {} + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return Range.size(); } + + static_assert( + std::is_same::type, + dataT>::value, + "The type should have been adjusted before propagating through " + "class hierarchy"); +}; + +/// Implementation of local accessor providing access to a single element. +/// Available only when (dimensions == 0). 
+SYCL_ACCESSOR_IMPL(accessTarget == access::target::local && + dimensions == 0) { + // This field must be the first to guarantee that it's safe to use + // reinterpret casting while setting kernel arguments in order to get size + // value from the accessor regardless of its dimensionality. + size_t ByteSize; + +#ifndef __SYCL_DEVICE_ONLY__ + shared_ptr_class> dataBuf; +#else + char padding[sizeof(shared_ptr_class>)]; +#endif + + dataT *Data; + + accessor_impl(handler * Handler) + : ByteSize(sizeof(dataT)) + { +#ifndef __SYCL_DEVICE_ONLY__ + assert(Handler != nullptr && "Handler is nullptr"); + if (Handler->is_host()) { + dataBuf = std::make_shared>(1); + Data = dataBuf->data(); + } +#endif + } + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return 1; } + + static_assert( + std::is_same::type, + dataT>::value, + "The type should have been adjusted before propagating through " + "class hierarchy"); +}; + +/// Implementation of local accessor. +/// Available only when (dimensions > 0). +SYCL_ACCESSOR_IMPL(accessTarget == access::target::local && + dimensions > 0) { + // This field must be the first to guarantee that it's safe to use + // reinterpret casting while setting kernel arguments in order to get size + // value from the accessor regardless of its dimensionality. + size_t ByteSize; + +#ifndef __SYCL_DEVICE_ONLY__ + shared_ptr_class> dataBuf; +#else + char padding[sizeof(shared_ptr_class>)]; +#endif + + dataT *Data; + range Range; + // TODO delete it when accessor class was remade + // Offset field is not need for local accessor, but this field is now used + // in the inheritance hierarchy. Getting rid of this field will cause + // duplication and complication of the code even more. 
+ id Offset; + + accessor_impl(range Range, handler * Handler) : Range(Range), + ByteSize(Range.size() * sizeof(dataT)) + { +#ifndef __SYCL_DEVICE_ONLY__ + assert(Handler != nullptr && "Handler is nullptr"); + if (Handler->is_host()) { + dataBuf = std::make_shared>(Range.size()); + Data = dataBuf->data(); + } +#endif + } + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return Range.size(); } + + static_assert( + std::is_same::type, + dataT>::value, + "The type should have been adjusted before propagating through " + "class hierarchy"); +}; + +/// Base class for all accessor specializations. +template +class accessor_base { +protected: + template + friend class subscript_obj; + friend class ::cl::sycl::simple_scheduler::Node; + friend class ::cl::sycl::simple_scheduler::Scheduler; + using _ImplT = + accessor_impl; + + INLINE_IF_DEVICE const _ImplT *__impl() const { + return reinterpret_cast(this); + } + + INLINE_IF_DEVICE _ImplT *__impl() { return reinterpret_cast<_ImplT *>(this); } + + static_assert( + std::is_same::type, + dataT>::value, + "The type should have been adjusted before propagating through " + "class hierarchy"); +}; + +// The macro is used to conditionally define methods of accessor class +// by wrapping them into a structure that is non-empty only if the +// condition is met. +#define SYCL_ACCESSOR_SUBCLASS(TAG, PARENT, CONDITION) \ + template \ + struct TAG : ::cl::sycl::detail::PARENT {}; \ + \ + template \ + struct TAG::type> \ + : ::cl::sycl::detail::PARENT + +SYCL_ACCESSOR_SUBCLASS(accessor_common, accessor_base, true /* always */) { + // Returns true if the current accessor is a placeholder accessor. + INLINE_IF_DEVICE constexpr bool is_placeholder() const { + return isPlaceholder == access::placeholder::true_t; + } + + // Returns the size of the accessed memory in bytes. 
+ INLINE_IF_DEVICE size_t get_size() const { return this->get_count() * sizeof(dataT); } + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return this->__impl()->get_count(); } + + template INLINE_IF_DEVICE + typename std::enable_if<(Dimensions > 0), range>::type + get_range() const { return this->__impl()->Range; } + + template INLINE_IF_DEVICE + typename std::enable_if<(Dimensions > 0), id>::type + get_offset() const { return this->__impl()->Offset; } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_opdata_w, accessor_common, + (accessMode == access::mode::write || + accessMode == access::mode::read_write || + accessMode == access::mode::discard_write || + accessMode == access::mode::discard_read_write) && + dimensions == 0) { + INLINE_IF_DEVICE operator dataT &() const { + return this->__impl()->Data[0]; + } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_wn, accessor_opdata_w, + (accessMode == access::mode::write || + accessMode == access::mode::read_write || + accessMode == access::mode::discard_write || + accessMode == access::mode::discard_read_write) && + dimensions > 0) { + dataT &operator[](id index) const { + return this->__impl()->Data[getOffsetForId( + this->get_range(), index, this->get_offset())]; + } + + subscript_obj + INLINE_IF_DEVICE operator[](size_t index) const { + id ids; + ids[0] = index; + return subscript_obj(*this, ids); + } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_w, accessor_subscript_wn, + (accessMode == access::mode::write || + accessMode == access::mode::read_write || + accessMode == access::mode::discard_write || + accessMode == access::mode::discard_read_write) && + dimensions == 1) { + // The tricky part here is that there is no function overloading + // between different scopes in C++. That is, operator[] defined in a + // child class hides any operator[] defined in any of the parent + // classes. 
That's why operator[] defined in accessor_subscript_wn + // is not visible here and we have to define + // operator[](id) once again. + INLINE_IF_DEVICE dataT &operator[](id index) const { + return this->operator[]( + getOffsetForId(this->get_range(), index, this->get_offset())); + } + INLINE_IF_DEVICE dataT &operator[](size_t index) const { + return this->__impl()->Data[index]; + } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_opdata_r, accessor_subscript_w, + accessMode == access::mode::read && dimensions == 0) { + using PureType = typename detail::remove_AS::type; + operator PureType() const { + return this->__impl()->Data[0]; + } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_rn, accessor_opdata_r, + accessMode == access::mode::read && dimensions > 0) { + typename detail::remove_AS::type + operator[](id index) const { + return this->__impl()->Data[getOffsetForId( + this->get_range(), index, this->get_offset())]; + } + + subscript_obj + operator[](size_t index) const { + id ids; + ids[0] = index; + return subscript_obj(*this, ids); + } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_r, accessor_subscript_rn, + accessMode == access::mode::read && dimensions == 1) { + typename detail::remove_AS::type + operator[](id index) const { + return this->operator[]( + getOffsetForId(this->get_range(), index, this->get_offset())); + } + typename detail::remove_AS::type + operator[](size_t index) const { + return this->__impl()->Data[index]; + } +}; + +template struct getAddressSpace { + constexpr static cl::sycl::access::address_space value = + cl::sycl::access::address_space::global_space; +}; + +template <> struct getAddressSpace { + constexpr static cl::sycl::access::address_space value = + cl::sycl::access::address_space::local_space; +}; + +// Available when: accessMode == access::mode::atomic && dimensions == 0 +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_atomic_eq0, accessor_subscript_r, + accessMode == access::mode::atomic && dimensions == 0) { + using PureType = typename 
detail::remove_AS::type; + constexpr static access::address_space addressSpace = + getAddressSpace::value; + operator atomic() const { + return atomic( + multi_ptr(&(this->__impl()->Data[0]))); + } +}; + +// Available when: accessMode == access::mode::atomic && dimensions > 0 +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_atomic_gt0, + accessor_subscript_atomic_eq0, + accessMode == access::mode::atomic && dimensions > 0) { + using PureType = typename detail::remove_AS::type; + constexpr static access::address_space addressSpace = + getAddressSpace::value; + atomic operator[](id index) const { + return atomic( + multi_ptr(&(this->__impl()->Data[getOffsetForId( + this->__impl()->Range, index, this->__impl()->Offset)]))); + } +}; + +// Available only when: accessMode == access::mode::atomic && dimensions == 1 +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_atomic_eq1, + accessor_subscript_atomic_gt0, + accessMode == access::mode::atomic && dimensions == 1) { + using PureType = typename detail::remove_AS::type; + constexpr static access::address_space addressSpace = + getAddressSpace::value; + atomic operator[](size_t index) const { + return atomic( + multi_ptr(&(this->__impl()->Data[index]))); + } +}; + +// TODO: +// /* Available only when: dimensions > 1 */ +// __unspecified__ &operator[](size_t index) const; + +SYCL_ACCESSOR_SUBCLASS(accessor_pointer, accessor_subscript_atomic_eq1, true) { + /* Available only when: accessTarget == access::target::host_buffer */ + template ::type, + access::target AccessTarget = accessTarget> + typename std::enable_if<(AccessTarget == access::target::host_buffer), + dataT *>::type + get_pointer() const { + return this->__impl()->Data; + } + /* Available only when: accessTarget == access::target::global_buffer */ + template ::type, + access::target AccessTarget = accessTarget> + typename std::enable_if<(AccessTarget == access::target::global_buffer), + global_ptr>::type + get_pointer() const { + return global_ptr(this->__impl()->Data); + } + 
/* Available only when: accessTarget == access::target::constant_buffer */ + template ::type, + access::target AccessTarget = accessTarget> + typename std::enable_if<(AccessTarget == access::target::constant_buffer), + constant_ptr>::type + get_pointer() const { + return constant_ptr(this->__impl()->Data); + } + /* Available only when: accessTarget == access::target::local */ + template ::type, + access::target AccessTarget = accessTarget> + typename std::enable_if<(AccessTarget == access::target::local), + local_ptr>::type + get_pointer() const { + return local_ptr(this->__impl()->Data); + } +}; + +} // namespace detail + +// +// Actual definition of sycl::accessor class. +// +template +class accessor + : public detail::accessor_pointer< + typename detail::DeviceValueType::type, + dimensions, accessMode, accessTarget, isPlaceholder> { + using _ValueType = + typename detail::DeviceValueType::type; + using _ImplT = detail::accessor_impl<_ValueType, dimensions, accessMode, + accessTarget, isPlaceholder>; + + // Make sure Impl field is the first in the class, so that it is + // safe to reinterpret a pointer to accessor as a pointer to the + // implementation. 
+ _ImplT __impl; + + INLINE_IF_DEVICE void __init(_ValueType *Ptr, range Range, + id Offset) { + __impl.Data = Ptr; + __impl.Range = Range; + __impl.Offset = Offset; + } + +public: + using value_type = dataT; + using reference = dataT &; + using const_reference = const dataT &; + + // buffer accessor ctor #1 + // accessor(buffer &); + // + // Available only when: + // ((isPlaceholder == access::placeholder::false_t && + // accessTarget == access::target::host_buffer) || + // (isPlaceholder == access::placeholder::true_t && + // (accessTarget == access::target::global_buffer|| + // accessTarget == access::target::constant_buffer))) && + // dimensions == 0 + template + accessor(typename std::enable_if< + (((IsPlaceholder == access::placeholder::false_t && + AccessTarget == access::target::host_buffer) || + (IsPlaceholder == access::placeholder::true_t && + (AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer))) && + Dimensions == 0), + buffer>::type &bufferRef) + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (AccessTarget == access::target::host_buffer) { + if (BufImpl->OpenCLInterop) { + throw cl::sycl::runtime_error( + "Host access to interoperability buffer is not allowed"); + } else { + simple_scheduler::Scheduler::getInstance() + .copyBack(*BufImpl); + } + } + if (BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + } + + // buffer accessor ctor #2: + // accessor(buffer &, handler &); + // + // Available only when: + // isPlaceholder == access::placeholder::false_t && + // (accessTarget == access::target::global_buffer || + // accessTarget == access::target::constant_buffer) && + // dimensions == 0 + template + accessor(typename std::enable_if< + (IsPlaceholder == access::placeholder::false_t && + 
(AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer) && + Dimensions == 0), + buffer>::type &bufferRef, + handler &commandGroupHandlerRef) +#ifdef __SYCL_DEVICE_ONLY__ + ; // This ctor can't be used in device code, so no need to define it. +#else // !__SYCL_DEVICE_ONLY__ + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr, + detail::getSyclObjImpl(bufferRef)->Range, + &commandGroupHandlerRef) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + commandGroupHandlerRef.AddBufDep(*BufImpl); + __impl.m_Buf = BufImpl.get(); + } +#endif // !__SYCL_DEVICE_ONLY__ + + // buffer accessor ctor #3: + // accessor(buffer &); + // + // Available only when: + // ((isPlaceholder == access::placeholder::false_t && + // accessTarget == access::target::host_buffer) || + // (isPlaceholder == access::placeholder::true_t && + // (accessTarget == access::target::global_buffer || + // accessTarget == access::target::constant_buffer))) && + // dimensions > 0) + template + accessor(typename std::enable_if< + (((IsPlaceholder == access::placeholder::false_t && + AccessTarget == access::target::host_buffer) || + (IsPlaceholder == access::placeholder::true_t && + (AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer))) && + Dimensions > 0), + buffer>::type &bufferRef) + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr, + detail::getSyclObjImpl(bufferRef)->Range) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (AccessTarget == access::target::host_buffer) { + if (BufImpl->OpenCLInterop) { + throw cl::sycl::runtime_error( + "Host access to interoperability buffer is not allowed"); + } else { + simple_scheduler::Scheduler::getInstance() + .copyBack(*BufImpl); + } + } + if 
(BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + } + + // buffer ctor #4: + // accessor(buffer &, handler &); + // + // Available only when: + // isPlaceholder == access::placeholder::false_t && + // (accessTarget == access::target::global_buffer || + // accessTarget == access::target::constant_buffer) && + // dimensions > 0 + template + accessor(typename std::enable_if< + (IsPlaceholder == access::placeholder::false_t && + (AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer) && + Dimensions > 0), + buffer>::type &bufferRef, + handler &commandGroupHandlerRef) +#ifdef __SYCL_DEVICE_ONLY__ + ; // This ctor can't be used in device code, so no need to define it. +#else + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr, + detail::getSyclObjImpl(bufferRef)->Range, + &commandGroupHandlerRef) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + commandGroupHandlerRef.AddBufDep(*BufImpl); + __impl.m_Buf = BufImpl.get(); + } +#endif + + // accessor ctor #5: + // accessor(buffer &, range Range, id Offset = {}); + // + // Available only when: + // (isPlaceholder == access::placeholder::false_t && + // accessTarget == access::target::host_buffer) || + // (isPlaceholder == access::placeholder::true_t && + // (accessTarget == access::target::global_buffer || + // accessTarget == access::target::constant_buffer) && + // dimensions > 0) + template + accessor(typename std::enable_if< + ((IsPlaceholder == access::placeholder::false_t && + AccessTarget == access::target::host_buffer) || + (IsPlaceholder == access::placeholder::true_t && + (AccessTarget == 
access::target::global_buffer || + AccessTarget == access::target::constant_buffer) && + Dimensions > 0)), + buffer>::type &bufferRef, + range Range, + id Offset = {} + ) +#ifdef __SYCL_DEVICE_ONLY__ + ; // This ctor can't be used in device code, so no need to define it. +#else // !__SYCL_DEVICE_ONLY__ + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr, Range, Offset) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (AccessTarget == access::target::host_buffer) { + if (BufImpl->OpenCLInterop) { + throw cl::sycl::runtime_error( + "Host access to interoperability buffer is not allowed"); + } else { + simple_scheduler::Scheduler::getInstance() + .copyBack(*BufImpl); + } + } + if (BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + } +#endif // !__SYCL_DEVICE_ONLY__ + + // buffer ctor #6: + // accessor(buffer &, handler &, range Range, id Offset); + // + // Available only when: + // isPlaceholder == access::placeholder::false_t && + // (accessTarget == access::target::global_buffer || + // accessTarget == access::target::constant_buffer) && + // dimensions > 0 + template + accessor(typename std::enable_if< + (IsPlaceholder == access::placeholder::false_t && + (AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer) && + Dimensions > 0), + buffer>::type &bufferRef, + handler &commandGroupHandlerRef, + range Range, + id Offset = {} + ) +#ifdef __SYCL_DEVICE_ONLY__ + ; // This ctor can't be used in device code, so no need to define it. 
+#else // !__SYCL_DEVICE_ONLY__ + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr, Range, + &commandGroupHandlerRef, Offset) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + commandGroupHandlerRef.AddBufDep(*BufImpl); + __impl.m_Buf = BufImpl.get(); + } +#endif // !__SYCL_DEVICE_ONLY__ + + // TODO: + // local accessor ctor #1 + // accessor(handler &); + // Available only when: + // AccessTarget == access::target::local && Dimensions == 0 + // + // template + // accessor(typename std::enable_if<(AccessTarget == access::target::local && + // Dimensions == 0), handler>::type &commandGroupHandlerRef); + + + // local accessor ctor #2 + // accessor(range allocationSize, handler &); + // Available only when: + // AccessTarget == access::target::local && Dimensions => 0 + template + accessor(typename std::enable_if<(AccessTarget == access::target::local && + Dimensions > 0), + range>::type allocationSize, + handler &commandGroupHandlerRef) + : __impl(allocationSize, &commandGroupHandlerRef) {} +}; + +} // namespace sycl +} // namespace cl + +#undef SYCL_ACCESSOR_IMPL +#undef SYCL_ACCESSOR_SUBCLASS + +//TODO hash for accessor diff --git a/sycl/include/CL/sycl/atomic.hpp b/sycl/include/CL/sycl/atomic.hpp new file mode 100644 index 000000000000..ab06af923330 --- /dev/null +++ b/sycl/include/CL/sycl/atomic.hpp @@ -0,0 +1,353 @@ +//==---------------- atomic.hpp - SYCL atomics -----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#ifdef __SYCL_DEVICE_ONLY__ +#include +#else +#include +#include +#endif +#include + +#define STATIC_ASSERT_NOT_FLOAT(T) \ + static_assert(!std::is_same::value, \ + "SYCL atomic function not available for float type") + +namespace cl { +namespace sycl { + +enum class memory_order : int { relaxed }; + +// Forward declaration +template +class multi_ptr; + +namespace detail { + +using memory_order = cl::sycl::memory_order; + +template struct IsValidAtomicType { + static constexpr bool value = + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value); +}; + +template struct IsValidAtomicAddressSpace { + static constexpr bool value = (AS == access::address_space::global_space || + AS == access::address_space::local_space); +}; + +// Type trait to translate a cl::sycl::access::address_space to +// a SPIR-V memory scope +template struct GetSpirvMemoryScope {}; +template <> struct GetSpirvMemoryScope { + static constexpr auto scope = cl::__spirv::Scope::Device; +}; +template <> struct GetSpirvMemoryScope { + static constexpr auto scope = ::cl::__spirv::Scope::Workgroup; +}; + +// Translate the cl::sycl::memory_order to a SPIR-V builtin order +static inline ::cl::__spirv::MemorySemantics +getSpirvMemorySemantics(memory_order Order) { + return ::cl::__spirv::MemorySemantics::None; +} + +} // namespace detail +} // namespace sycl +} // namespace cl + +#ifndef __SYCL_DEVICE_ONLY__ +// host implementation of SYCL atomics +namespace cl { +namespace sycl { +namespace detail { +// Translate cl::sycl::memory_order or cl::__spirv::MemorySemantics +// into std::memory_order +// Only relaxed memory semantics are supported currently +static inline std::memory_order +getStdMemoryOrder(::cl::__spirv::MemorySemantics MS) { + return std::memory_order_relaxed; +} 
+static inline std::memory_order getStdMemoryOrder(::cl::sycl::memory_order MS) { + return std::memory_order_relaxed; +} +} // namespace detail +} // namespace sycl + +// std::atomic version of atomic SPIR-V builtins +namespace __spirv { + +template +void OpAtomicStore(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + Ptr->store(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +T OpAtomicLoad(std::atomic *Ptr, Scope S, MemorySemantics MS) { + return Ptr->load(::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +T OpAtomicExchange(std::atomic* Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->exchange(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicIAdd(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->fetch_add(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicISub(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->fetch_sub(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicAnd(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->fetch_and(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicOr(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->fetch_or(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicXor(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->fetch_xor(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicMin(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + std::memory_order MemoryOrder = ::cl::sycl::detail::getStdMemoryOrder(MS); + T Val = Ptr->load(MemoryOrder); + while (V < Val) { + if (Ptr->compare_exchange_strong(Val, V, MemoryOrder, MemoryOrder)) + break; + Val = Ptr->load(MemoryOrder); + } + return Val; +} + +template +extern T OpAtomicMax(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + std::memory_order MemoryOrder = 
::cl::sycl::detail::getStdMemoryOrder(MS); + T Val = Ptr->load(MemoryOrder); + while (V > Val) { + if (Ptr->compare_exchange_strong(Val, V, MemoryOrder, MemoryOrder)) + break; + Val = Ptr->load(MemoryOrder); + } + return Val; +} + +} // namespace __spirv +} // namespace cl +#endif // !defined(__SYCL_DEVICE_ONLY__) + +namespace cl { +namespace sycl { + +template +class atomic { + static_assert(detail::IsValidAtomicType::value, + "Invalid SYCL atomic type. Valid types are: int, " + "unsigned int, long, unsigned long, long long, unsigned " + "long long, float"); + static_assert(detail::IsValidAtomicAddressSpace::value, + "Invalid SYCL atomic address_space. Valid address spaces are: " + "global_space, local_space"); + static constexpr auto SpirvScope = + detail::GetSpirvMemoryScope::scope; + +public: + template +#ifdef __SYCL_DEVICE_ONLY__ + atomic(multi_ptr ptr) + : Ptr(ptr.get()) +#else + atomic(multi_ptr ptr) + : Ptr(reinterpret_cast *>(ptr.get())) +#endif + { + static_assert(sizeof(T) == sizeof(pointerT), + "T and pointerT must be same size"); + } + + void store(T Operand, memory_order Order = memory_order::relaxed) volatile { + ::cl::__spirv::OpAtomicStore( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T load(memory_order Order = memory_order::relaxed) volatile { + return ::cl::__spirv::OpAtomicLoad(Ptr, SpirvScope, + detail::getSpirvMemorySemantics(Order)); + } + + T exchange(T Operand, memory_order Order = memory_order::relaxed) volatile { + return ::cl::__spirv::OpAtomicExchange( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + bool compare_exchange_strong( + T &Expected, T Desired, memory_order SuccessOrder = memory_order::relaxed, + memory_order FailOrder = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); +#ifdef __SYCL_DEVICE_ONLY__ + T Value = ::cl::__spirv::OpAtomicCompareExchange( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(SuccessOrder), + 
detail::getSpirvMemorySemantics(FailOrder), Desired, Expected); + return (Value == Desired); +#else + return Ptr->compare_exchange_strong(Expected, Desired, + detail::getStdMemoryOrder(SuccessOrder), + detail::getStdMemoryOrder(FailOrder)); +#endif + } + + T fetch_add(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicIAdd( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_sub(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicISub( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_and(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicAnd( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_or(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicOr( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_xor(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicXor( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_min(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicMin( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_max(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicMax( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + +private: +#ifdef __SYCL_DEVICE_ONLY__ + typename detail::PtrValueType::type *Ptr; +#else + std::atomic *Ptr; +#endif +}; + +template +void atomic_store(atomic Object, T Operand, + 
memory_order MemoryOrder = memory_order::relaxed) { + Object.store(Operand, MemoryOrder); +} + +template +T atomic_load(atomic Object, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.load(MemoryOrder); +} + +template +T atomic_exchange(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.exchange(Operand, MemoryOrder); +} + +// TODO: When CTS atomic tests are fixed remove this API +template +bool atomic_compare_exchange_strong( + atomic Object, T *Expected, T Desired, + memory_order SuccessOrder = memory_order::relaxed, + memory_order FailOrder = memory_order::relaxed) { + return Object.compare_exchange_strong(*Expected, Desired, SuccessOrder, + FailOrder); +} + +template +bool atomic_compare_exchange_strong( + atomic Object, T &Expected, T Desired, + memory_order SuccessOrder = memory_order::relaxed, + memory_order FailOrder = memory_order::relaxed) { + return Object.compare_exchange_strong(Expected, Desired, SuccessOrder, + FailOrder); +} + +template +T atomic_fetch_add(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_add(Operand, MemoryOrder); +} + +template +T atomic_fetch_sub(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_sub(Operand, MemoryOrder); +} + +template +T atomic_fetch_and(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_and(Operand, MemoryOrder); +} + +template +T atomic_fetch_or(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_or(Operand, MemoryOrder); +} + +template +T atomic_fetch_xor(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_xor(Operand, MemoryOrder); +} + +template +T atomic_fetch_min(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_min(Operand, MemoryOrder); 
+} + +template +T atomic_fetch_max(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_max(Operand, MemoryOrder); +} + +} // namespace sycl +} // namespace cl + +#undef STATIC_ASSERT_NOT_FLOAT diff --git a/sycl/include/CL/sycl/buffer.hpp b/sycl/include/CL/sycl/buffer.hpp new file mode 100644 index 000000000000..5cd1e5109ced --- /dev/null +++ b/sycl/include/CL/sycl/buffer.hpp @@ -0,0 +1,199 @@ +//==----------- buffer.hpp --- SYCL buffer ---------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +// TODO: 4.3.4 Properties + +namespace cl { +namespace sycl { +class handler; +class queue; +template class range; + +template > +class buffer { +public: + using value_type = T; + using reference = value_type &; + using const_reference = const value_type &; + using allocator_type = AllocatorT; + + buffer(const range &bufferRange, + const property_list &propList = {}) { + impl = std::make_shared>( + bufferRange, propList); + } + + // buffer(const range &bufferRange, AllocatorT allocator, + // const property_list &propList = {}) { + // impl = std::make_shared(bufferRange, allocator, + // propList); + // } + + buffer(T *hostData, const range &bufferRange, + const property_list &propList = {}) { + impl = std::make_shared>( + hostData, bufferRange, propList); + } + + // buffer(T *hostData, const range &bufferRange, + // AllocatorT allocator, const property_list &propList = {}) { + // impl = std::make_shared(hostData, bufferRange, + // allocator, propList); + // } + + buffer(const T *hostData, const range &bufferRange, + const property_list &propList = {}) { + impl = std::make_shared>( + hostData, bufferRange, propList); + } + + // buffer(const T 
*hostData, const range &bufferRange, + // AllocatorT allocator, const property_list &propList = {}) { + // impl = std::make_shared(hostData, bufferRange, + // allocator, propList); + // } + + // buffer(const shared_ptr_class &hostData, + // const range &bufferRange, AllocatorT allocator, + // const property_list &propList = {}) { + // impl = std::make_shared(hostData, bufferRange, + // allocator, propList); + // } + + buffer(const shared_ptr_class &hostData, + const range &bufferRange, + const property_list &propList = {}) { + impl = std::make_shared>( + hostData, bufferRange, propList); + } + + // template + // buffer(InputIterator first, InputIterator last, AllocatorT allocator, + // const property_list &propList = {}) { + // impl = std::make_shared(first, last, allocator, + // propList); + // } + + template > + buffer(InputIterator first, InputIterator last, + const property_list &propList = {}) { + impl = std::make_shared>( + first, last, propList); + } + + // buffer(buffer b, const id + // &baseIndex, const range &subRange) { + // impl = std::make_shared(b, baseIndex, subRange); + // } + + template > + buffer(cl_mem MemObject, const context &SyclContext, + event AvailableEvent = {}) { + impl = std::make_shared>( + MemObject, SyclContext, AvailableEvent); + } + + buffer(const buffer &rhs) = default; + + buffer(buffer &&rhs) = default; + + buffer &operator=(const buffer &rhs) = default; + + buffer &operator=(buffer &&rhs) = default; + + ~buffer() = default; + + bool operator==(const buffer &rhs) const { return impl == rhs.impl; } + + bool operator!=(const buffer &rhs) const { return !(*this == rhs); } + + /* -- common interface members -- */ + + /* -- property interface members -- */ + + range get_range() const { return impl->get_range(); } + + size_t get_count() const { return impl->get_count(); } + + size_t get_size() const { return impl->get_size(); } + + AllocatorT get_allocator() const { return impl->get_allocator(); } + + template + accessor + 
get_access(handler &commandGroupHandler) { + return impl->template get_access(*this, commandGroupHandler); + } + + template + accessor + get_access() { + return impl->template get_access(*this); + } + + // template accessor get_access( handler &commandGroupHandler, + // range accessRange, id accessOffset = {}) { + // return impl->get_access(commandGroupHandler, accessRange, + // accessOffset); + // } + + // template + // accessor get_access( range accessRange, + // id accessOffset = {}) { + // return impl->get_access(accessRange, accessOffset); + // } + + template + void set_final_data(Destination finalData = nullptr) { + impl->set_final_data(finalData); + } + + // void set_write_back(bool flag = true) { return impl->set_write_back(flag); + // } + + // bool is_sub_buffer() const { return impl->is_sub_buffer(); } + + // template + // buffer + // reinterpret(range reinterpretRange) const { + // return impl->reinterpret((reinterpretRange)); + // } + +private: + shared_ptr_class> impl; + template + friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject); +}; +} // namespace sycl +} // namespace cl + +namespace std { +template +struct hash> { + size_t + operator()(const cl::sycl::buffer &b) const { + return hash>>()( + cl::sycl::detail::getSyclObjImpl(b)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/context.hpp b/sycl/include/CL/sycl/context.hpp new file mode 100644 index 000000000000..a53933042a83 --- /dev/null +++ b/sycl/include/CL/sycl/context.hpp @@ -0,0 +1,80 @@ +//==---------------- context.hpp - SYCL context ----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include +// 4.6.2 Context class + +namespace cl { +namespace sycl { +class context { +public: + explicit context(const async_handler &asyncHandler = {}) + : context(default_selector().select_device(), asyncHandler) {} + + context(const device &dev, async_handler asyncHandler = {}) + : context(vector_class(1, dev), asyncHandler) {} + + context(const platform &plt, async_handler asyncHandler = {}) + : context(plt.get_devices(), asyncHandler) {} + + context(const vector_class &deviceList, + async_handler asyncHandler = {}); + + context(cl_context clContext, async_handler asyncHandler = {}); + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + context(const context &rhs) = default; + + context(context &&rhs) = default; + + context &operator=(const context &rhs) = default; + + context &operator=(context &&rhs) = default; + + bool operator==(const context &rhs) const { return impl == rhs.impl; } + + bool operator!=(const context &rhs) const { return !(*this == rhs); } + + cl_context get() const { return impl->get(); } + + bool is_host() const { return impl->is_host(); } + + platform get_platform() const { return impl->get_platform(); } + + vector_class get_devices() const { return impl->get_devices(); } + +private: + std::shared_ptr impl; + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); +}; + +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::context &c) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(c)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/detail/array.hpp b/sycl/include/CL/sycl/detail/array.hpp new file mode 100644 index 000000000000..b8cf259dd0bd --- /dev/null +++ b/sycl/include/CL/sycl/detail/array.hpp @@ 
-0,0 +1,117 @@ +//==-------- array.hpp --- SYCL common iteration object ---------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +namespace cl { +namespace sycl { +template struct id; +template class range; +namespace detail { + +template class array { +public: + INLINE_IF_DEVICE array() : common_array{0} {} + + /* The following constructor is only available in the array struct + * specialization where: dimensions==1 */ + template INLINE_IF_DEVICE + array(typename std::enable_if<(N == 1), size_t>::type dim0) + : common_array{dim0} {} + + /* The following constructor is only available in the array struct + * specialization where: dimensions==2 */ + template INLINE_IF_DEVICE + array(typename std::enable_if<(N == 2), size_t>::type dim0, size_t dim1) + : common_array{dim0, dim1} {} + + /* The following constructor is only available in the array struct + * specialization where: dimensions==3 */ + template INLINE_IF_DEVICE + array(typename std::enable_if<(N == 3), size_t>::type dim0, size_t dim1, + size_t dim2) + : common_array{dim0, dim1, dim2} {} + + // Conversion operators to derived classes + INLINE_IF_DEVICE operator cl::sycl::id() const { + cl::sycl::id result; + for (int i = 0; i < dimensions; ++i) { + result[i] = common_array[i]; + } + return result; + } + + INLINE_IF_DEVICE operator cl::sycl::range() const { + cl::sycl::range result; + for (int i = 0; i < dimensions; ++i) { + result[i] = common_array[i]; + } + return result; + } + + INLINE_IF_DEVICE size_t get(int dimension) const { + check_dimension(dimension); + return common_array[dimension]; + } + + INLINE_IF_DEVICE size_t &operator[](int dimension) { + check_dimension(dimension); + return common_array[dimension]; + } + + INLINE_IF_DEVICE 
size_t operator[](int dimension) const { + check_dimension(dimension); + return common_array[dimension]; + } + + INLINE_IF_DEVICE array(const array &rhs) = default; + INLINE_IF_DEVICE array(array &&rhs) = default; + INLINE_IF_DEVICE array &operator=(const array &rhs) = default; + INLINE_IF_DEVICE array &operator=(array &&rhs) = default; + + // Returns true iff all elements in 'this' are equal to + // the corresponding elements in 'rhs'. + INLINE_IF_DEVICE bool operator==(const array &rhs) const { + for (int i = 0; i < dimensions; ++i) { + if (this->common_array[i] != rhs.common_array[i]) { + return false; + } + } + return true; + } + + // Returns true iff there is at least one element in 'this' + // which is not equal to the corresponding element in 'rhs'. + INLINE_IF_DEVICE bool operator!=(const array &rhs) const { + for (int i = 0; i < dimensions; ++i) { + if (this->common_array[i] != rhs.common_array[i]) { + return true; + } + } + return false; + } + +protected: + size_t common_array[dimensions]; + ALWAYS_INLINE void check_dimension(int dimension) const { +#ifndef __SYCL_DEVICE_ONLY__ + if (dimension >= dimensions || dimension < 0) { + throw cl::sycl::invalid_parameter_error("Index out of range"); + } +#endif + } +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/buffer_impl.hpp b/sycl/include/CL/sycl/detail/buffer_impl.hpp new file mode 100644 index 000000000000..f763c06fe4c9 --- /dev/null +++ b/sycl/include/CL/sycl/detail/buffer_impl.hpp @@ -0,0 +1,516 @@ +//==---------- buffer_impl.hpp --- SYCL buffer ----------------*- C++-*---==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cl { +namespace sycl { +using QueueImplPtr = std::shared_ptr; +using EventImplPtr = std::shared_ptr; +// Forward declarations +template +class accessor; +template class buffer; +class handler; +class queue; +template class id; +template class range; +template using buffer_allocator = std::allocator; +namespace detail { +template > +class buffer_impl { +public: + buffer_impl(const range &bufferRange, + const property_list &propList = {}) + : buffer_impl((T *)nullptr, bufferRange, propList) {} + + buffer_impl(T *hostData, const range &bufferRange, + const property_list &propList = {}) + : Range(bufferRange), Props(propList) { + if (Props.has_property()) { + BufPtr = hostData; + } else { + BufData.resize(get_size()); + BufPtr = reinterpret_cast(BufData.data()); + if (hostData != nullptr) { + set_final_data(hostData); + std::copy(hostData, hostData + get_count(), BufPtr); + } + } + } + + // TODO temporary solution for allowing initialisation with const data + buffer_impl(const T *hostData, const range &bufferRange, + const property_list &propList = {}) + : Range(bufferRange), Props(propList) { + if (Props.has_property()) { + // TODO make this buffer read only + BufPtr = const_cast(hostData); + } else { + BufData.resize(get_size()); + BufPtr = reinterpret_cast(BufData.data()); + if (hostData != nullptr) { + std::copy(hostData, hostData + get_count(), BufPtr); + } + } + } + + buffer_impl(const shared_ptr_class &hostData, + const range &bufferRange, + const property_list &propList = {}) + : Range(bufferRange), Props(propList) { + if (Props.has_property()) { + BufPtr = hostData.get(); + } else { + BufData.resize(get_size()); + BufPtr = reinterpret_cast(BufData.data()); + if (hostData.get() != nullptr) { + 
weak_ptr_class hostDataWeak = hostData; + set_final_data(hostDataWeak); + std::copy(hostData.get(), hostData.get() + get_count(), BufPtr); + } + } + } + + template > + buffer_impl(InputIterator first, InputIterator last, + const property_list &propList = {}) + : Range(range<1>(std::distance(first, last))), Props(propList) { + if (Props.has_property()) { + BufPtr = &*first; + } else { + BufData.resize(get_size()); + BufPtr = reinterpret_cast(BufData.data()); + std::copy(first, last, BufPtr); + } + } + + template > + buffer_impl(cl_mem MemObject, const context &SyclContext, + event AvailableEvent = {}) + : OpenCLInterop(true), AvailableEvent(AvailableEvent) { + if (SyclContext.is_host()) + throw cl::sycl::invalid_parameter_error( + "Creation of interoperability buffer using host context is not " + "allowed"); + + CHECK_OCL_CODE(clGetMemObjectInfo(MemObject, CL_MEM_CONTEXT, + sizeof(OpenCLContext), &OpenCLContext, nullptr)); + if (SyclContext.get() != OpenCLContext) + throw cl::sycl::invalid_parameter_error( + "Input context must be the same as the context of cl_mem"); + OCLState.Mem = MemObject; + CHECK_OCL_CODE(clRetainMemObject(MemObject)); + } + + range get_range() const { return Range; } + + size_t get_count() const { return Range.size(); } + + size_t get_size() const { return get_count() * sizeof(T); } + + ~buffer_impl() { + if (!OpenCLInterop) + // TODO. Use node instead? + simple_scheduler::Scheduler::getInstance() + .copyBack( + *this); + + if (uploadData != nullptr) { + uploadData(); + } + + // TODO. Use node instead? 
+ simple_scheduler::Scheduler::getInstance().removeBuffer(*this); + + if (OpenCLInterop) + CHECK_OCL_CODE_NO_EXC(clReleaseMemObject(OCLState.Mem)); + } + + void set_final_data(std::nullptr_t) { uploadData = nullptr; } + + void set_final_data(weak_ptr_class final_data) { + if (OpenCLInterop) + throw cl::sycl::runtime_error( + "set_final_data could not be used with interoperability buffer"); + uploadData = [this, final_data]() { + if (auto finalData = final_data.lock()) { + std::copy(BufPtr, BufPtr + get_count(), finalData.get()); + } + }; + } + + template void set_final_data(Destination final_data) { + if (OpenCLInterop) + throw cl::sycl::runtime_error( + "set_final_data could not be used with interoperability buffer"); + static_assert(!std::is_const::value, + "Сan not write in a constant Destination. Destination should " + "not be const."); + uploadData = [this, final_data]() mutable { + std::copy(BufPtr, BufPtr + get_count(), final_data); + }; + } + + template + accessor + get_access(buffer &Buffer, + handler &commandGroupHandler) { + return accessor( + Buffer, commandGroupHandler); + } + + template + accessor + get_access(buffer &Buffer) { + return accessor(Buffer); + } + +public: + void moveMemoryTo(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event); + + void fill(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, const void *Pattern, size_t PatternSize, + int Dim, size_t *Offset, size_t *Range); + + void copy(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, simple_scheduler::BufferReqPtr SrcReq, + const int DimSrc, const size_t *const SrcRange, + const size_t *const SrcOffset, const size_t *const DestOffset, + const size_t SizeTySrc, const size_t SizeSrc, + const size_t *const BuffSrcRange); + + size_t convertSycl2OCLMode(cl::sycl::access::mode mode); + + bool isValidAccessToMem(cl::sycl::access::mode AccessMode); + + void allocate(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, cl::sycl::access::mode 
mode); + + cl_mem getOpenCLMem() const; + +private: + // There are internal structures in this section. + enum DeviceMemoryState { + DMS_NULL, // No data were transferred between host and device. + DMS_COPIED, // Data were copied from host to device. + DMS_MODIFIED, // Data in device memory were modified. + DMS_HOST // Use host pointer for device memory + }; + // Contains the latest virtual state of buffer during commands enqueueing. + // TODO: Need to find better solution, at least make state for each device. + struct OpenCLMemState { + QueueImplPtr Queue; + cl_mem Mem = nullptr; + }; + +private: + // This field must be the first to guarantee that it's safe to use + // reinterpret casting while setting kernel arguments in order to get cl_mem + // value from the buffer regardless of its dimensionality. + OpenCLMemState OCLState; + bool OpenCLInterop = false; + event AvailableEvent; + cl_context OpenCLContext = nullptr; + T *BufPtr = nullptr; + vector_class BufData; + // TODO: enable support of cl_mem objects from multiple contexts + // TODO: at the current moment, using a buffer on multiple devices + // or on a device and a host simultaneously is not supported (the + // implementation is incorrect). 
+ range Range; + property_list Props; + std::function uploadData = nullptr; + template + friend class cl::sycl::accessor; +}; + +template +void buffer_impl::fill( + QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, const void *Pattern, size_t PatternSize, int Dim, + size_t *OffsetArr, size_t *RangeArr) { + + assert(dimensions == 1 && + "OpenCL doesn't support multidimensional fill method."); + assert(!Queue->is_host() && "Host case is handled in other place."); + + size_t Offset = OffsetArr[0]; + size_t Size = RangeArr[0] * PatternSize; + + cl::sycl::context Context = Queue->get_context(); + + OCLState.Queue = std::move(Queue); + Event->setIsHostEvent(false); + + cl_event &BufEvent = Event->getHandleRef(); + std::vector CLEvents = + detail::getOrWaitEvents(std::move(DepEvents), Context); + + cl_command_queue CommandQueue = OCLState.Queue->get(); + cl_int Error = clEnqueueFillBuffer( + CommandQueue, OCLState.Mem, Pattern, PatternSize, Offset, Size, + CLEvents.size(), CLEvents.data(), &BufEvent); + + CHECK_OCL_CODE(Error); + CHECK_OCL_CODE(clReleaseCommandQueue(CommandQueue)); +} + +template +void buffer_impl::copy( + QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, simple_scheduler::BufferReqPtr SrcReq, const int DimSrc, + const size_t *const SrcRange, const size_t *const SrcOffset, + const size_t *const DestOffset, const size_t SizeTySrc, + const size_t SizeSrc, const size_t *const BuffSrcRange) { + assert(!Queue->is_host() && "Host case is handled in other place."); + + size_t *BuffDestRange = &get_range()[0]; + size_t SizeTyDest = sizeof(T); + const int DimDest = dimensions; + + cl::sycl::context Context = Queue->get_context(); + + cl_event &BufEvent = Event->getHandleRef(); + std::vector CLEvents = + detail::getOrWaitEvents(std::move(DepEvents), Context); + cl_int Error; + + cl_command_queue CommandQueue = Queue->get(); + if (1 == DimSrc && 1 == DimDest) { + Error = clEnqueueCopyBuffer(CommandQueue, SrcReq->getCLMemObject(), + 
OCLState.Mem, SrcOffset[0], DestOffset[0], + SizeSrc * SizeTySrc, CLEvents.size(), + CLEvents.data(), &BufEvent); + } else { + size_t SrcOrigin[3] = {SrcOffset[0] * SizeTySrc, + (1 == DimSrc) ? 0 : SrcOffset[1], + (3 == DimSrc) ? SrcOffset[2] : 0}; + size_t DstOrigin[3] = {DestOffset[0] * SizeTyDest, + (1 == DimDest) ? 0 : DestOffset[1], + (3 == DimDest) ? DestOffset[2] : 0}; + size_t Region[3] = {SrcRange[0] * SizeTySrc, + (1 == DimSrc) ? 1 : SrcRange[1], + (3 == DimSrc) ? SrcRange[2] : 1}; + size_t SrcRowPitch = (1 == DimSrc) ? 0 : SizeTySrc * BuffSrcRange[0]; + size_t SrcSlicePitch = + (3 == DimSrc) ? SizeTySrc * BuffSrcRange[0] * BuffSrcRange[1] : 0; + size_t DstRowPitch = (1 == DimSrc) ? 0 : SizeTyDest * BuffDestRange[0]; + size_t DstSlicePitch = + (3 == DimSrc) ? SizeTyDest * BuffDestRange[0] * BuffDestRange[1] : 0; + + Error = clEnqueueCopyBufferRect( + CommandQueue, SrcReq->getCLMemObject(), OCLState.Mem, SrcOrigin, + DstOrigin, Region, SrcRowPitch, SrcSlicePitch, DstRowPitch, + DstSlicePitch, CLEvents.size(), CLEvents.data(), &BufEvent); + } + CHECK_OCL_CODE(Error); + CHECK_OCL_CODE(clReleaseCommandQueue(CommandQueue)); + OCLState.Queue = std::move(Queue); + Event->setIsHostEvent(false); +} + +template +void buffer_impl::moveMemoryTo( + QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event) { + + cl::sycl::context Context = Queue->get_context(); + + if (OpenCLInterop && (Context.get() != OpenCLContext)) + throw cl::sycl::runtime_error( + "Interoperability buffer could not be used in a context other than the " + "context associated with the OpenCL memory object."); + + // TODO: Move all implementation specific commands to separate file? + // TODO: Make allocation in separate command? + + // Special case, move to "user host" + // TODO: Check discuss if "user host" and "host device" are the same. 
+ if ((Queue->is_host()) && (OCLState.Queue->is_host())) { + detail::waitEvents(DepEvents); + Event->setIsHostEvent(true); + OCLState.Queue = std::move(Queue); + return; + } + + assert(OCLState.Queue->get_context() != Context || + OCLState.Queue->get_device() != Queue->get_device() && + "Attempt to move to the same env"); + + // Copy from OCL device to host device. + if (!OCLState.Queue->is_host() && Queue->is_host()) { + const size_t ByteSize = get_size(); + + std::vector CLEvents = + detail::getOrWaitEvents(std::move(DepEvents), Context); + + // TODO: Handle different situations with host PTR. + // Enqueue copying from OCL buffer to host. + cl_event &ReadBufEvent = Event->getHandleRef(); + cl_int Error = clEnqueueReadBuffer( + OCLState.Queue->getHandleRef(), OCLState.Mem, + /*blocking_read=*/CL_FALSE, /*offset=*/0, ByteSize, BufPtr, + CLEvents.size(), CLEvents.data(), &ReadBufEvent); + CHECK_OCL_CODE(Error); + + Event->setIsHostEvent(false); + + OCLState.Queue = std::move(Queue); + OCLState.Mem = nullptr; + return; + } + // Copy from host to OCL device. + if (OCLState.Queue->is_host() && !Queue->is_host()) { + const size_t ByteSize = get_size(); + cl_int Error; + cl_mem Mem = clCreateBuffer(Context.get(), CL_MEM_READ_WRITE, ByteSize, + /*host_ptr=*/nullptr, &Error); + CHECK_OCL_CODE(Error); + + OCLState.Queue = std::move(Queue); + OCLState.Mem = Mem; + + // Just exit if nothing to read from host. + if (nullptr == BufPtr) { + return; + } + std::vector CLEvents = + detail::getOrWaitEvents(std::move(DepEvents), Context); + cl_event &WriteBufEvent = Event->getHandleRef(); + // Enqueue copying from host to new OCL buffer. 
+ Error = + clEnqueueWriteBuffer(OCLState.Queue->getHandleRef(), Mem, + /*blocking_write=*/CL_FALSE, /*offset=*/0, + ByteSize, BufPtr, CLEvents.size(), CLEvents.data(), + &WriteBufEvent); // replace &WriteBufEvent to NULL + CHECK_OCL_CODE(Error); + Event->setIsHostEvent(false); + + return; + } + + assert(0 && "Not handled"); +} + +template +size_t buffer_impl::convertSycl2OCLMode( + cl::sycl::access::mode mode) { + switch (mode) { + case cl::sycl::access::mode::read: + return CL_MEM_READ_ONLY; + case cl::sycl::access::mode::write: + return CL_MEM_WRITE_ONLY; + case cl::sycl::access::mode::read_write: + case cl::sycl::access::mode::atomic: + return CL_MEM_READ_WRITE; + default: + assert(0 && "Unhandled conversion from Sycl access mode to OCL one."); + return 0; + } +} + +template +bool buffer_impl::isValidAccessToMem( + cl::sycl::access::mode AccessMode) { + cl_mem_flags Flags; + assert(OCLState.Mem != nullptr && + "OpenCL memory associated with the buffer is null"); + CHECK_OCL_CODE(clGetMemObjectInfo(OCLState.Mem, CL_MEM_FLAGS, sizeof(Flags), + &Flags, nullptr)); + if (((Flags & CL_MEM_READ_WRITE) == 0) && + ((convertSycl2OCLMode(AccessMode) & Flags) == 0)) + return false; + return true; +} + +template +void buffer_impl::allocate( + QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, cl::sycl::access::mode mode) { + + detail::waitEvents(DepEvents); + + cl::sycl::context Context = Queue->get_context(); + + if (OpenCLInterop && (Context.get() != OpenCLContext)) + throw cl::sycl::runtime_error( + "Interoperability buffer could not be used in a context other than the " + "context associated with the OpenCL memory object."); + + if (OpenCLInterop) { + AvailableEvent.wait(); + OCLState.Queue = std::move(Queue); + Event->setIsHostEvent(true); + return; + } + + if (!Queue->is_host()) { + size_t ByteSize = get_size(); + cl_int Error; + + cl_mem Mem = clCreateBuffer(Context.get(), convertSycl2OCLMode(mode), + ByteSize, nullptr, &Error); + CHECK_OCL_CODE(Error); 
+ + cl_event &WriteBufEvent = Event->getHandleRef(); + Error = clEnqueueWriteBuffer(Queue->getHandleRef(), Mem, + /*blocking_write=*/CL_FALSE, /*offset=*/0, + ByteSize, BufPtr, /*num_of_events=*/0, + /*dep_list=*/nullptr, &WriteBufEvent); + CHECK_OCL_CODE(Error); + + OCLState.Queue = std::move(Queue); + OCLState.Mem = Mem; + + Event->setIsHostEvent(false); + + return; + } + if (Queue->is_host()) { + Event->setIsHostEvent(true); + OCLState.Queue = std::move(Queue); + return; + } + assert(0 && "Unhandled Alloca"); +} + +template +cl_mem buffer_impl::getOpenCLMem() const { + assert(nullptr != OCLState.Mem); + return OCLState.Mem; +} + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/common.hpp b/sycl/include/CL/sycl/detail/common.hpp new file mode 100644 index 000000000000..7241f7fe1211 --- /dev/null +++ b/sycl/include/CL/sycl/detail/common.hpp @@ -0,0 +1,118 @@ +//==---------- common.hpp ----- Common declarations ------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +// Suppress a compiler warning about undefined CL_TARGET_OPENCL_VERSION +// Khronos ICD supports only latest OpenCL version +#define CL_TARGET_OPENCL_VERSION 220 +#include +#include +#include +#include + +const char *stringifyErrorCode(cl_int error); + +#define OCL_CODE_TO_STR(code) \ + std::string(std::to_string(code) + " (" + stringifyErrorCode(code) + ")") + +#define STRINGIFY_LINE_HELP(s) #s +#define STRINGIFY_LINE(s) STRINGIFY_LINE_HELP(s) + +#define OCL_ERROR_REPORT \ + "OpenCL API failed. 
" __FILE__ \ + ":" STRINGIFY_LINE(__LINE__) ": " \ + "OpenCL API returns: " + +#ifndef SYCL_SUPPRESS_OCL_ERROR_REPORT +#include +#define REPORT_OCL_ERR_TO_STREAM(code) \ + if (code != CL_SUCCESS) { \ + std::cerr << OCL_ERROR_REPORT << OCL_CODE_TO_STR(code) << std::endl; \ + } +#endif + +#ifndef SYCL_SUPPRESS_EXCEPTIONS +#include + +#define REPORT_OCL_ERR_TO_EXC(code, exc) \ + if (code != CL_SUCCESS) { \ + std::string errorMessage(OCL_ERROR_REPORT + OCL_CODE_TO_STR(code)); \ + std::cerr << errorMessage << std::endl; \ + throw exc(errorMessage.c_str(), (code)); \ + } +#define REPORT_OCL_ERR_TO_EXC_THROW(code, exc) REPORT_OCL_ERR_TO_EXC(code, exc) +#define REPORT_OCL_ERR_TO_EXC_BASE(code) \ + REPORT_OCL_ERR_TO_EXC(code, cl::sycl::runtime_error) +#else +#define REPORT_OCL_ERR_TO_EXC_BASE(code) REPORT_OCL_ERR_TO_STREAM(code) +#endif + +#ifdef SYCL_SUPPRESS_OCL_ERROR_REPORT +#define CHECK_OCL_CODE(X) (void)(X) +#define CHECK_OCL_CODE_THROW(X, EXC) (void)(X) +#define CHECK_OCL_CODE_NO_EXC(X) (void)(X) +#else +#define CHECK_OCL_CODE(X) REPORT_OCL_ERR_TO_EXC_BASE(X) +#define CHECK_OCL_CODE_THROW(X, EXC) REPORT_OCL_ERR_TO_EXC_THROW(X, EXC) +#define CHECK_OCL_CODE_NO_EXC(X) REPORT_OCL_ERR_TO_STREAM(X) +#endif + +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +#if __has_attribute(always_inline) +#define ALWAYS_INLINE __attribute__((always_inline)) +#else +#define ALWAYS_INLINE +#endif + +// TODO this macro is introduced to workaround SPIRV translator problem with +// dropping linkonce_odr attribute leading to duplicated symbol errors in +// the bitcode linker for functions defined in the headers. Remove once fixed. +#ifdef __SYCL_DEVICE_ONLY__ +#define INLINE_IF_DEVICE ALWAYS_INLINE +#else +#define INLINE_IF_DEVICE +#endif // __SYCL_DEVICE_ONLY__ + + +namespace cl { +namespace sycl { +namespace detail { +// Helper function for extracting implementation from SYCL's interface objects. +// Note! 
This function relies on the fact that all SYCL interface classes +// contain "impl" field that points to implementation object. "impl" field +// should be accessible from this function. +template decltype(T::impl) getSyclObjImpl(const T &SyclObject) { + return SyclObject.impl; +} + +// Helper function for creation SYCL interface objects from implementations. +// Note! This function relies on the fact that all SYCL interface classes +// contain "impl" field that points to implementation object. "impl" field +// should be accessible from this function. +template T createSyclObjFromImpl(decltype(T::impl) ImplObj) { + return T(ImplObj); +} + +#ifdef __SYCL_DEVICE_ONLY__ +// The flag type for passing flag arguments to barrier(), mem_fence(), +// read_mem_fence(), and write_mem_fence() functions. +typedef uint cl_mem_fence_flags; + +const cl_mem_fence_flags CLK_LOCAL_MEM_FENCE = 0x01; +const cl_mem_fence_flags CLK_GLOBAL_MEM_FENCE = 0x02; +const cl_mem_fence_flags CLK_CHANNEL_MEM_FENCE = 0x04; +#endif // __SYCL_DEVICE_ONLY__ + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/common_info.hpp b/sycl/include/CL/sycl/detail/common_info.hpp new file mode 100644 index 000000000000..636dd9a42819 --- /dev/null +++ b/sycl/include/CL/sycl/detail/common_info.hpp @@ -0,0 +1,22 @@ +//==------- common_info.hpp ----- Common SYCL info methods------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace cl { +namespace sycl { +namespace detail { + +vector_class split_string(const string_class &str, + char delimeter); + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/context_host.hpp b/sycl/include/CL/sycl/detail/context_host.hpp new file mode 100644 index 000000000000..0731a22c54f8 --- /dev/null +++ b/sycl/include/CL/sycl/detail/context_host.hpp @@ -0,0 +1,46 @@ +//==------------- context_host.hpp - SYCL host context ---------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +// 4.6.2 Context class + +namespace cl { +namespace sycl { +namespace detail { +class context_host : public context_impl { +public: + context_host(const device &rhs, async_handler asyncHandler) + : context_impl(asyncHandler), dev(rhs) {} + + cl_context get() const override { + throw invalid_object_error("This instance of context is a host instance"); + } + + bool is_host() const override { return true; } + + platform get_platform() const override { return platform(); } + + vector_class get_devices() const override { + return vector_class(1, dev); + } + + template + typename info::param_traits::return_type get_info() const; +private: + device dev; +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/context_impl.hpp b/sycl/include/CL/sycl/detail/context_impl.hpp new file mode 100644 index 000000000000..087d09779f7a --- /dev/null +++ b/sycl/include/CL/sycl/detail/context_impl.hpp @@ -0,0 +1,84 @@ +//==---------------- context.hpp - SYCL context ----------------------------==// 
+// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +// 4.6.2 Context class + +namespace cl { +namespace sycl { +// Forward declaration +class platform; +class device; +namespace detail { +template struct get_context_info_cl { + using RetType = + typename info::param_traits::return_type; + + static RetType _(cl_context ctx) { + RetType Result = 0; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetContextInfo(ctx, cl_context_info(param), sizeof(Result), + &Result, nullptr)); + return Result; + } +}; + +class context_impl { +public: + context_impl(async_handler asyncHandler) : m_AsyncHandler(asyncHandler) {} + + template + inline typename info::param_traits::return_type + get_info() const; + + const async_handler& get_async_handler() const { return m_AsyncHandler; } + + virtual cl_context get() const = 0; + + virtual bool is_host() const = 0; + + virtual platform get_platform() const = 0; + + virtual vector_class get_devices() const = 0; + + virtual ~context_impl() = default; + +private: + async_handler m_AsyncHandler; +}; +template <> +inline typename info::param_traits::return_type +context_impl::get_info() const { + if (is_host()) { + return 0; + } + return get_context_info_cl::_(this->get()); +} +template <> +inline typename info::param_traits::return_type +context_impl::get_info() const { + return get_platform(); +} +template <> +inline typename info::param_traits::return_type +context_impl::get_info() const { + return get_devices(); +} + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/context_opencl.hpp b/sycl/include/CL/sycl/detail/context_opencl.hpp new file mode 100644 index 000000000000..e5755902e29b 
--- /dev/null +++ b/sycl/include/CL/sycl/detail/context_opencl.hpp @@ -0,0 +1,90 @@ +//==------------ context_opencl.hpp - SYCL OpenCL context ------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +// 4.6.2 Context class + +namespace cl { +namespace sycl { +// Forward declaration +class platform; +namespace detail { +class context_opencl : public context_impl { +public: + context_opencl(const vector_class devices, + async_handler asyncHandler) + : context_impl(asyncHandler) { + dev_list = devices; + plt = dev_list[0].get_platform(); + vector_class dev_ids; + for (const auto &d : dev_list) + dev_ids.push_back(d.get()); + cl_int error; + id = clCreateContext(0, dev_ids.size(), dev_ids.data(), 0, 0, &error); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(error); + } + + context_opencl(cl_context clContext, async_handler asyncHandler) + : context_impl(asyncHandler) { + id = clContext; + vector_class dev_ids; + size_t devicesBuffer = 0; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE( + clGetContextInfo(id, CL_CONTEXT_DEVICES, 0, nullptr, &devicesBuffer)); + dev_ids.resize(devicesBuffer / sizeof(cl_device_id)); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetContextInfo(id, CL_CONTEXT_DEVICES, devicesBuffer, + &dev_ids[0], nullptr)); + + for (auto dev : dev_ids) { + dev_list.emplace_back(dev); + } + // TODO What if dev_list if empty? 
dev_list[0].get_platform() + plt = platform(dev_list[0].get_platform()); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clRetainContext(id)); + } + + cl_context get() const override { + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clRetainContext(id)); + return id; + } + + bool is_host() const override { return false; } + + platform get_platform() const override { return plt; } + + vector_class get_devices() const override { return dev_list; } + + ~context_opencl() { + // TODO replace CHECK_OCL_CODE_NO_EXC to CHECK_OCL_CODE and + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE_NO_EXC(clReleaseContext(id)); + } + // TODO: implement param traits + // template + // typename param_traits::type get_info() const; +private: + vector_class dev_list; + cl_context id; + platform plt; +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/device_host.hpp b/sycl/include/CL/sycl/detail/device_host.hpp new file mode 100644 index 000000000000..6ce172ac435e --- /dev/null +++ b/sycl/include/CL/sycl/detail/device_host.hpp @@ -0,0 +1,66 @@ +//==--------------- device_host.hpp - SYCL host device --------------------== // +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +// TODO: 4.6.4 Partitioning into multiple SYCL devices +// TODO: 4.6.4.2 Device information descriptors +// TODO: Make code thread-safe +class device_host : public device_impl { +public: + device_host() = default; + cl_device_id get() const override { + throw invalid_object_error("This instance of device is a host instance"); + } + + bool is_host() const override { return true; } + + bool is_cpu() const override { return false; } + + bool is_gpu() const override { return false; } + + bool is_accelerator() const override { return false; } + + platform get_platform() const override { return platform(); } + + bool has_extension(const string_class &extension_name) const override { + // TODO: implement extension management; + return false; + } + + vector_class create_sub_devices(size_t nbSubDev) const { + // TODO: implement host device partitioning + throw runtime_error( + "Partitioning to subdevices of the host device is not implemented yet"); + } + + vector_class + create_sub_devices(const vector_class &counts) const { + // TODO: implement host device partitioning + throw runtime_error( + "Partitioning to subdevices of the host device is not implemented yet"); + } + + vector_class + create_sub_devices(info::partition_affinity_domain affinityDomain) const { + // TODO: implement host device partitioning + throw runtime_error( + "Partitioning to subdevices of the host device is not implemented yet"); + } +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/device_impl.hpp b/sycl/include/CL/sycl/detail/device_impl.hpp new file mode 100644 index 000000000000..671a9ed187d7 --- /dev/null +++ b/sycl/include/CL/sycl/detail/device_impl.hpp @@ -0,0 +1,83 @@ +//==----------------- device_impl.hpp - SYCL device ------------------------==// 
+// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +namespace cl { +namespace sycl { + +// Forward declaration +class platform; + +namespace detail { +// TODO: 4.6.4 Partitioning into multiple SYCL devices +// TODO: 4.6.4.2 Device information descriptors +// TODO: Make code thread-safe +class device_impl { +public: + virtual ~device_impl() = default; + + virtual cl_device_id get() const = 0; + + virtual bool is_host() const = 0; + + virtual bool is_cpu() const = 0; + + virtual bool is_gpu() const = 0; + + virtual bool is_accelerator() const = 0; + + virtual platform get_platform() const = 0; + + virtual vector_class create_sub_devices(size_t nbSubDev) const = 0; + + virtual vector_class + create_sub_devices(const vector_class &counts) const = 0; + + virtual vector_class + create_sub_devices(info::partition_affinity_domain affinityDomain) const = 0; + + static vector_class + get_devices(info::device_type deviceType = info::device_type::all); + + template + typename info::param_traits::return_type + get_info() const { + if (is_host()) { + return get_device_info_host(); + } + return get_device_info_cl< + typename info::param_traits::return_type, + param>::_(this->get()); + } + + bool is_partition_supported(info::partition_property Prop) const { + auto SupportedProperties = get_info(); + return std::find(SupportedProperties.begin(), SupportedProperties.end(), + Prop) != SupportedProperties.end(); + } + + bool + is_affinity_supported(info::partition_affinity_domain AffinityDomain) const { + auto SupportedDomains = + get_info(); + return std::find(SupportedDomains.begin(), SupportedDomains.end(), + AffinityDomain) != SupportedDomains.end(); + } + + virtual bool has_extension(const string_class &extension_name) const = 
0; +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/device_info.hpp b/sycl/include/CL/sycl/detail/device_info.hpp new file mode 100644 index 000000000000..dc581c31160f --- /dev/null +++ b/sycl/include/CL/sycl/detail/device_info.hpp @@ -0,0 +1,481 @@ +//==-------- device_info.hpp - SYCL device info methods --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +vector_class read_fp_bitfield(cl_device_fp_config bits); + +vector_class +read_domain_bitfield(cl_device_affinity_domain bits); + +vector_class +read_execution_bitfield(cl_device_exec_capabilities bits); + +// Mapping expected SYCL return types to those returned by OpenCL calls +template struct sycl_to_ocl { using type = T; }; + +template <> struct sycl_to_ocl { using type = cl_bool; }; + +template <> struct sycl_to_ocl { using type = cl_device_id; }; + +template <> struct sycl_to_ocl { using type = cl_platform_id; }; + +// Mapping fp_config device info types to the values used to check fp support +template struct check_fp_support {}; + +template <> struct check_fp_support { + static const info::device value = info::device::native_vector_width_half; +}; + +template <> struct check_fp_support { + static const info::device value = info::device::native_vector_width_double; +}; + +// Structs for emulating function template partial specialization +// Default template for the general case +template struct get_device_info_cl { + static T _(cl_device_id dev) { + typename sycl_to_ocl::type result; + CHECK_OCL_CODE(clGetDeviceInfo(dev, (cl_device_info)param, sizeof(result), + &result, NULL)); + return T(result); + } +}; + +// Specialization for 
string return type, variable OpenCL return size +template struct get_device_info_cl { + static string_class _(cl_device_id dev) { + size_t resultSize; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)param, 0, NULL, &resultSize)); + if (resultSize == 0) { + return string_class(); + } + unique_ptr_class result(new char[resultSize]); + CHECK_OCL_CODE(clGetDeviceInfo(dev, (cl_device_info)param, resultSize, + result.get(), NULL)); + return string_class(result.get()); + } +}; + +// Specialization for id return type +template struct get_device_info_cl, param> { + static id<3> _(cl_device_id dev) { + size_t result[3]; + CHECK_OCL_CODE(clGetDeviceInfo(dev, (cl_device_info)param, sizeof(result), + &result, NULL)); + return id<3>(result[0], result[1], result[2]); + } +}; + +// Specialization for fp_config types, checks the corresponding fp type support +template +struct get_device_info_cl, param> { + static vector_class _(cl_device_id dev) { + // Check if fp type is supported + if (!get_device_info_cl< + typename info::param_traits< + info::device, check_fp_support::value>::return_type, + check_fp_support::value>::_(dev)) { + return {}; + } + cl_device_fp_config result; + CHECK_OCL_CODE(clGetDeviceInfo(dev, (cl_device_info)param, sizeof(result), + &result, NULL)); + return read_fp_bitfield(result); + } +}; + +// Specialization for single_fp_config, no type support check required +template <> +struct get_device_info_cl, + info::device::single_fp_config> { + static vector_class _(cl_device_id dev) { + cl_device_fp_config result; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)info::device::single_fp_config, + sizeof(result), &result, NULL)); + return read_fp_bitfield(result); + } +}; + +// Specialization for queue_profiling, OpenCL returns a bitfield +template <> struct get_device_info_cl { + static bool _(cl_device_id dev) { + cl_command_queue_properties result; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)info::device::queue_profiling, + 
sizeof(result), &result, NULL)); + return (result & CL_QUEUE_PROFILING_ENABLE); + } +}; + +// Specialization for exec_capabilities, OpenCL returns a bitfield +template <> +struct get_device_info_cl, + info::device::execution_capabilities> { + static vector_class _(cl_device_id dev) { + cl_device_exec_capabilities result; + CHECK_OCL_CODE(clGetDeviceInfo( + dev, (cl_device_info)info::device::execution_capabilities, + sizeof(result), &result, NULL)); + return read_execution_bitfield(result); + } +}; + +// Specialization for built in kernels, splits the string returned by OpenCL +template <> +struct get_device_info_cl, + info::device::built_in_kernels> { + static vector_class _(cl_device_id dev) { + string_class result = + get_device_info_cl::_( + dev); + return split_string(result, ';'); + } +}; + +// Specialization for extensions, splits the string returned by OpenCL +template <> +struct get_device_info_cl, + info::device::extensions> { + static vector_class _(cl_device_id dev) { + string_class result = + get_device_info_cl::_(dev); + return split_string(result, ' '); + } +}; + +// Specialization for partition properties, variable OpenCL return size +template <> +struct get_device_info_cl, + info::device::partition_properties> { + static vector_class _(cl_device_id dev) { + size_t resultSize; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)info::device::partition_properties, + 0, NULL, &resultSize)); + size_t arrayLength = resultSize / sizeof(cl_device_partition_property); + if (arrayLength == 0) { + return {}; + } + unique_ptr_class arrayResult( + new cl_device_partition_property[arrayLength]); + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)info::device::partition_properties, + resultSize, arrayResult.get(), NULL)); + + vector_class result; + for (size_t i = 0; i < arrayLength - 1; ++i) { + result.push_back(info::partition_property(arrayResult[i])); + } + return result; + } +}; + +// Specialization for partition affinity domains, OpenCL returns a 
bitfield +template <> +struct get_device_info_cl, + info::device::partition_affinity_domains> { + static vector_class _(cl_device_id dev) { + cl_device_affinity_domain result; + CHECK_OCL_CODE(clGetDeviceInfo( + dev, (cl_device_info)info::device::partition_affinity_domains, + sizeof(result), &result, NULL)); + return read_domain_bitfield(result); + } +}; + +// Specialization for partition type affinity domain, OpenCL can return other +// partition properties instead +template <> +struct get_device_info_cl { + static info::partition_affinity_domain _(cl_device_id dev) { + size_t resultSize; + CHECK_OCL_CODE(clGetDeviceInfo( + dev, (cl_device_info)info::device::partition_type_affinity_domain, 0, + NULL, &resultSize)); + if (resultSize != 1) { + return info::partition_affinity_domain::not_applicable; + } + cl_device_partition_property result; + CHECK_OCL_CODE(clGetDeviceInfo( + dev, (cl_device_info)info::device::partition_type_affinity_domain, + sizeof(result), &result, NULL)); + if (result == CL_DEVICE_AFFINITY_DOMAIN_NUMA || + result == CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE || + result == CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE || + result == CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE || + result == CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE) { + return info::partition_affinity_domain(result); + } + + return info::partition_affinity_domain::not_applicable; + } +}; + +// Specialization for partition type +template <> +struct get_device_info_cl { + static info::partition_property _(cl_device_id dev) { + size_t resultSize; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, CL_DEVICE_PARTITION_TYPE, 0, NULL, &resultSize)); + if (!resultSize) + return info::partition_property::no_partition; + + size_t arrayLength = resultSize / sizeof(cl_device_partition_property); + + unique_ptr_class arrayResult( + new cl_device_partition_property[arrayLength]); + CHECK_OCL_CODE(clGetDeviceInfo(dev, CL_DEVICE_PARTITION_TYPE, resultSize, + arrayResult.get(), NULL)); + if (!arrayResult[0]) + return 
info::partition_property::no_partition; + return info::partition_property(arrayResult[0]); + } +}; + +// Specialization for parent device +template +struct get_device_info_cl { + static T _(cl_device_id dev) { + typename sycl_to_ocl::type result; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)info::device::parent_device, + sizeof(result), &result, NULL)); + if (result == nullptr) + throw invalid_object_error( + "No parent for device because it is not a subdevice"); + return T(result); + } +}; + +// SYCL host device information + +// Default template is disabled, all possible instantiations are +// specified explicitly. +template +typename info::param_traits::return_type +get_device_info_host() = delete; + +template <> info::device_type get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> id<3> get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +cl_uint get_native_vector_width(size_t idx); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> cl_ulong get_device_info_host(); + +template <> cl_ulong get_device_info_host(); + +template <> bool 
get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> +info::global_mem_cache_type +get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_ulong get_device_info_host(); + +template <> +cl_ulong get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> +info::local_mem_type get_device_info_host(); + +template <> cl_ulong get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> +size_t get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> platform get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> +vector_class get_device_info_host(); + 
+template <> size_t get_device_info_host(); + +template <> +bool get_device_info_host(); + +template <> device get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> +info::partition_property +get_device_info_host(); + +template <> +info::partition_affinity_domain +get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> +bool get_device_info_host< + info::device::sub_group_independent_forward_progress>(); + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/device_opencl.hpp b/sycl/include/CL/sycl/detail/device_opencl.hpp new file mode 100644 index 000000000000..a61206b8b760 --- /dev/null +++ b/sycl/include/CL/sycl/detail/device_opencl.hpp @@ -0,0 +1,143 @@ +//==------------ device_opencl.hpp - SYCL OpenCL device --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +class device_selector; + +namespace cl { +namespace sycl { +namespace detail { +// TODO: 4.6.4 Partitioning into multiple SYCL devices +// TODO: 4.6.4.2 Device information descriptors +// TODO: Make code thread-safe +class device_opencl : public device_impl { +public: + /** Constructs a device class instance using cl device_id of the OpenCL + * device. 
*/ + explicit device_opencl(cl_device_id deviceId) { + id = deviceId; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE( + clGetDeviceInfo(id, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, 0)); + cl_device_id parent; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetDeviceInfo(id, CL_DEVICE_PARENT_DEVICE, + sizeof(cl_device_id), &parent, nullptr)); + isRootDevice = (nullptr == parent); + if (!isRootDevice) { + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clRetainDevice(id)); + } + } + + ~device_opencl() { + if (!isRootDevice) { + // TODO replace CHECK_OCL_CODE_NO_EXC to CHECK_OCL_CODE and + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE_NO_EXC(clReleaseDevice(id)); + } + } + + cl_device_id get() const override { + if (!isRootDevice) { + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clRetainDevice(id)); + } + return id; + } + + bool is_host() const override { return false; } + + bool is_cpu() const override { return (type == CL_DEVICE_TYPE_CPU); } + + bool is_gpu() const override { return (type == CL_DEVICE_TYPE_GPU); } + + bool is_accelerator() const override { + return (type == CL_DEVICE_TYPE_ACCELERATOR); + } + + platform get_platform() const override { + cl_platform_id plt_id; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE( + clGetDeviceInfo(id, CL_DEVICE_PLATFORM, sizeof(plt_id), &plt_id, 0)); + return platform(plt_id); + } + + bool has_extension(const string_class &extension_name) const override { + string_class all_extension_names = + get_device_info_cl::_(id); + return (all_extension_names.find(extension_name) != std::string::npos); + } + + vector_class + create_sub_devices(const cl_device_partition_property *Properties, + size_t SubDevicesCount) const { + vector_class 
SubDevices(SubDevicesCount); + cl_uint ReturnedSubDevices; + CHECK_OCL_CODE(clCreateSubDevices(id, Properties, SubDevicesCount, + SubDevices.data(), &ReturnedSubDevices)); + return vector_class(SubDevices.begin(), SubDevices.end()); + } + + vector_class create_sub_devices(size_t ComputeUnits) const { + if (!is_partition_supported(info::partition_property::partition_equally)) { + throw cl::sycl::feature_not_supported(); + } + size_t SubDevicesCount = + get_info() / ComputeUnits; + const cl_device_partition_property Properties[3] = { + CL_DEVICE_PARTITION_EQUALLY, (cl_device_partition_property)ComputeUnits, + 0}; + return create_sub_devices(Properties, SubDevicesCount); + } + + vector_class + create_sub_devices(const vector_class &Counts) const { + if (!is_partition_supported( + info::partition_property::partition_by_counts)) { + throw cl::sycl::feature_not_supported(); + } + static const cl_device_partition_property P[] = { + CL_DEVICE_PARTITION_BY_COUNTS, CL_DEVICE_PARTITION_BY_COUNTS_LIST_END, + 0}; + vector_class Properties(P, P + 3); + Properties.insert(Properties.begin() + 1, Counts.begin(), Counts.end()); + return create_sub_devices(Properties.data(), Counts.size()); + } + + vector_class + create_sub_devices(info::partition_affinity_domain AffinityDomain) const { + if (!is_partition_supported( + info::partition_property::partition_by_affinity_domain) || + !is_affinity_supported(AffinityDomain)) { + throw cl::sycl::feature_not_supported(); + } + const cl_device_partition_property Properties[3] = { + CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + (cl_device_partition_property)AffinityDomain, 0}; + size_t SubDevicesCount = + get_info(); + return create_sub_devices(Properties, SubDevicesCount); + } + +private: + cl_device_id id = 0; + cl_device_type type = 0; + bool isRootDevice = false; +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/event_impl.hpp b/sycl/include/CL/sycl/detail/event_impl.hpp new file mode 
100644 index 000000000000..833cc335bb32 --- /dev/null +++ b/sycl/include/CL/sycl/detail/event_impl.hpp @@ -0,0 +1,59 @@ +//==---------------- event_impl.hpp - SYCL event ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +namespace cl { +namespace sycl { +namespace detail { + +class event_impl { +public: + event_impl() = default; + event_impl(cl_event CLEvent, const context &SyclContext); + + // Threat all devices that don't support interoperability as host devices to + // avoid attempts to call method get on such events. + bool is_host() const; + + cl_event get() const; + + // Self is needed in order to pass shared_ptr to Scheduler. + void wait(std::shared_ptr Self) const; + + template + typename info::param_traits::return_type + get_profiling_info() const; + + template + typename info::param_traits::return_type get_info() const; + + ~event_impl(); + + void waitInternal() const; + + cl_event &getHandleRef(); + + void setIsHostEvent(bool Value); + +private: + cl_event m_Event = nullptr; + bool m_OpenCLInterop = false; + bool m_HostEvent = true; +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/event_info.hpp b/sycl/include/CL/sycl/detail/event_info.hpp new file mode 100644 index 000000000000..56725642d80e --- /dev/null +++ b/sycl/include/CL/sycl/detail/event_info.hpp @@ -0,0 +1,45 @@ +//==---------------- event_info.hpp - SYCL event ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace cl { +namespace sycl { +namespace detail { + +template struct get_event_profiling_info_cl { + using RetType = + typename info::param_traits::return_type; + + static RetType _(cl_event Event) { + RetType Result = 0; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetEventProfilingInfo(Event, cl_profiling_info(Param), + sizeof(Result), &Result, nullptr)); + return Result; + } +}; + +template struct get_event_info_cl { + using RetType = typename info::param_traits::return_type; + + static RetType _(cl_event Event) { + RetType Result = (RetType)0; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetEventInfo(Event, cl_profiling_info(Param), + sizeof(Result), &Result, nullptr)); + return Result; + } +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/helpers.hpp b/sycl/include/CL/sycl/detail/helpers.hpp new file mode 100644 index 000000000000..f8e95977eee4 --- /dev/null +++ b/sycl/include/CL/sycl/detail/helpers.hpp @@ -0,0 +1,72 @@ +//==---------------- helpers.hpp - SYCL helpers ----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +namespace cl { +namespace sycl { +class context; +class event; +template class item; +template class group; +template class range; +template class id; +template class nd_item; +namespace detail { + +// The function returns list of events that can be passed to OpenCL API as +// dependency list and waits for others. 
+std::vector +getOrWaitEvents(std::vector DepEvents, + cl::sycl::context Context); + +void waitEvents(std::vector DepEvents); + +struct Builder { + Builder() = delete; + template + static group createGroup(const cl::sycl::range &G, + const cl::sycl::range &L, + const cl::sycl::id &I) { + return cl::sycl::group(G, L, I); + } + + template + static item createItem( + typename std::enable_if<(with_offset == true), + const cl::sycl::range>::type &R, + const cl::sycl::id &I, const cl::sycl::id &O) { + return cl::sycl::item(R, I, O); + } + + template + static item createItem( + typename std::enable_if<(with_offset == false), + const cl::sycl::range>::type &R, + const cl::sycl::id &I) { + return cl::sycl::item(R, I); + } + + template + static nd_item + createNDItem(const cl::sycl::item &GL, + const cl::sycl::item &L, + const cl::sycl::group &GR) { + return cl::sycl::nd_item(GL, L, GR); + } +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/image_impl.hpp b/sycl/include/CL/sycl/detail/image_impl.hpp new file mode 100644 index 000000000000..a91eaf78ce53 --- /dev/null +++ b/sycl/include/CL/sycl/detail/image_impl.hpp @@ -0,0 +1,160 @@ +//==------------ image_impl.hpp --------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +namespace cl { +namespace sycl { + +enum class image_channel_order : unsigned int { + a, + r, + rx, + rg, + rgx, + ra, + rgb, + rgbx, + rgba, + argb, + bgra, + intensity, + luminance, + abgr +}; + +enum class image_channel_type : unsigned int { + snorm_int8, + snorm_int16, + unorm_int8, + unorm_int16, + unorm_short_565, + unorm_short_555, + unorm_int_101010, + signed_int8, + signed_int16, + signed_int32, + unsigned_int8, + unsigned_int16, + unsigned_int32, + fp16, + fp32 +}; + +namespace detail { + +template class image_impl { +public: + image_impl(image_channel_order order, image_channel_type type, + const range &range, + const property_list &propList) { + assert(!"Not implemented"); + } + + //image_impl(image_channel_order order, image_channel_type type, + //const range &range, AllocatorT allocator, + //const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + // image_impl(image_channel_order order, image_channel_type type, + // const range &range, const range &pitch, + // const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + // image_impl(image_channel_order order, image_channel_type type, + // const range &range, const range &pitch, + // AllocatorT allocator, const property_list &propList = {}); + + //image_impl(void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const property_list &propList = {}); + + //image_impl(void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //AllocatorT allocator, const property_list &propList = {}); + + //image_impl(const void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const property_list &propList = {}); + + //image_impl(const void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + 
//AllocatorT allocator, const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + // image_impl(void *hostPointer, image_channel_order order, image_channel_type + // type, + // const range &range, range &pitch, + // const property_list &propList = {}) {assert(!"Not implemented");} + + /* Available only when: dimensions > 1 */ + // image_impl(void *hostPointer, image_channel_order order, image_channel_type + // type, + // const range &range, range &pitch, + // AllocatorT allocator, const property_list &propList = {}) {assert(!"Not + // implemented");} + + //image_impl(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const property_list &propList = {}); + + //image_impl(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //AllocatorT allocator, const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + // image_impl(shared_ptr_class &hostPointer, image_channel_order order, + // image_channel_type type, const range &range, + // const range &pitch, const property_list &propList = {}) + // {assert(!"Not implemented");} + + /* Available only when: dimensions > 1 */ + // image_impl(shared_ptr_class &hostPointer, image_channel_order order, + // image_channel_type type, const range &range, + // const range &pitch, AllocatorT allocator, + // const property_list &propList = {}) {assert(!"Not implemented");} + + //image_impl(cl_mem clMemObject, const context &syclContext, + //event availableEvent = {}); + + /* -- property interface members -- */ + + range get_range() const { assert(!"Not implemented"); } + + /* Available only when: dimensions > 1 */ + range get_pitch() const { assert(!"Not implemented"); } + + size_t get_size() const { assert(!"Not implemented"); return 0;} + + size_t get_count() const { assert(!"Not implemented"); return 0; } + + AllocatorT get_allocator() const { assert(!"Not 
implemented"); } + + template + accessor + get_access(handler &commandGroupHandler) { + assert(!"Not implemented"); + } + + template + accessor + get_access() { + assert(!"Not implemented"); + } + + // template + // void set_final_data(Destination finalData = std::nullptr); + + void set_write_back(bool flag) { assert(!"Not implemented"); } +}; + +} // namespace detail + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/kernel_desc.hpp b/sycl/include/CL/sycl/detail/kernel_desc.hpp new file mode 100644 index 000000000000..25862ab7dea3 --- /dev/null +++ b/sycl/include/CL/sycl/detail/kernel_desc.hpp @@ -0,0 +1,43 @@ +//==----------------------- kernel_desc.hpp --------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===//// + +#pragma once + +#include + +namespace cl { +namespace sycl { +namespace detail { + +// kernel parameter kinds +enum class kernel_param_kind_t { + kind_accessor, + kind_std_layout, // standard layout object parameters + kind_sampler +}; + +// describes a kernel parameter +struct kernel_param_desc_t { + // parameter kind + kernel_param_kind_t kind; + // kind == kind_std_layout + // parameter size in bytes (includes padding for structs) + // kind == kind_accessor + // access target; possible access targets are defined in access/access.hpp + int info; + // offset of the captured value of the parameter in the lambda or function + // object + int offset; +}; + +template struct KernelInfo; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/kernel_impl.hpp b/sycl/include/CL/sycl/detail/kernel_impl.hpp new file mode 100644 index 000000000000..ea1a9ca0a91e --- /dev/null +++ b/sycl/include/CL/sycl/detail/kernel_impl.hpp @@ -0,0 +1,128 @@ +//==------- 
kernel_impl.hpp --- SYCL kernel implementation -----------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cl { +namespace sycl { +// Forward declaration +class program; + +namespace detail { +class program_impl; + +class kernel_impl { +public: + kernel_impl(cl_kernel ClKernel, const context &SyclContext); + + kernel_impl(cl_kernel ClKernel, const context &SyclContext, + std::shared_ptr ProgramImpl) + : ClKernel(ClKernel), Context(SyclContext), ProgramImpl(ProgramImpl) {} + + // Host kernel constructor + kernel_impl(const context &SyclContext, + std::shared_ptr ProgramImpl) + : Context(SyclContext), ProgramImpl(ProgramImpl) {} + + ~kernel_impl() { + // TODO replace CHECK_OCL_CODE_NO_EXC to CHECK_OCL_CODE and + // TODO catch an exception and put it to list of asynchronous exceptions + if (!is_host()) { + CHECK_OCL_CODE_NO_EXC(clReleaseKernel(ClKernel)); + } + } + + cl_kernel get() const { + if (is_host()) { + throw invalid_object_error("This instance of kernel is a host instance"); + } + CHECK_OCL_CODE(clRetainKernel(ClKernel)); + return ClKernel; + } + + bool is_host() const { return Context.is_host(); } + + context get_context() const { return Context; } + + program get_program() const; + + template + typename info::param_traits::return_type + get_info() const { + if (is_host()) { + // TODO implement + assert(0 && "Not implemented"); + } + return get_kernel_info_cl< + typename info::param_traits::return_type, + param>::_(this->get()); + } + + template + typename info::param_traits::return_type + get_work_group_info(const device &Device) const { + if (is_host()) { + return get_kernel_work_group_info_host(Device); + } + return get_kernel_work_group_info_cl< + 
typename info::param_traits::return_type, + param>::_(this->get(), Device.get()); + } + + template + typename info::param_traits::return_type + get_sub_group_info(const device &Device) const { + if (is_host()) { + throw runtime_error("Sub-group feature is not supported on HOST device."); + } + return get_kernel_sub_group_info_cl< + typename info::param_traits::return_type, + param>::_(this->get(), Device.get()); + } + + template + typename info::param_traits::return_type + get_sub_group_info( + const device &Device, + typename info::param_traits::input_type + Value) const { + if (is_host()) { + throw runtime_error("Sub-group feature is not supported on HOST device."); + } + return get_kernel_sub_group_info_with_input_cl< + typename info::param_traits::return_type, + param, + typename info::param_traits::input_type>::_(this->get(), + Device.get(), Value); + } + +private: + cl_kernel ClKernel; + context Context; + std::shared_ptr ProgramImpl; +}; + +template <> context kernel_impl::get_info() const; + +template <> program kernel_impl::get_info() const; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/kernel_info.hpp b/sycl/include/CL/sycl/detail/kernel_info.hpp new file mode 100644 index 000000000000..cbae1fb42edb --- /dev/null +++ b/sycl/include/CL/sycl/detail/kernel_info.hpp @@ -0,0 +1,161 @@ +//==-------- kernel_info.hpp - SYCL kernel info methods --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +// OpenCL kernel information methods +template struct get_kernel_info_cl {}; + +template struct get_kernel_info_cl { + static string_class _(cl_kernel ClKernel) { + size_t ResultSize; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelInfo(ClKernel, cl_kernel_info(Param), 0, nullptr, + &ResultSize)); + if (ResultSize == 0) { + return ""; + } + string_class Result(ResultSize, ' '); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelInfo(ClKernel, cl_kernel_info(Param), ResultSize, + &Result[0], nullptr)); + return Result; + } +}; + +template struct get_kernel_info_cl { + static cl_uint _(cl_kernel ClKernel) { + cl_uint Result; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelInfo(ClKernel, cl_kernel_info(Param), + sizeof(cl_uint), &Result, nullptr)); + return Result; + } +}; + +// OpenCL kernel work-group methods + +template +struct get_kernel_work_group_info_cl { + static T _(cl_kernel ClKernel, cl_device_id ClDevice) { + T Result; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelWorkGroupInfo(ClKernel, ClDevice, + cl_kernel_work_group_info(Param), + sizeof(T), &Result, nullptr)); + return Result; + } +}; + +template +struct get_kernel_work_group_info_cl, Param> { + static cl::sycl::range<3> _(cl_kernel ClKernel, cl_device_id ClDevice) { + size_t Result[3]; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelWorkGroupInfo( + ClKernel, ClDevice, cl_kernel_work_group_info(Param), + sizeof(size_t) * 3, Result, nullptr)); + return cl::sycl::range<3>(Result[0], Result[1], Result[2]); + } +}; + +template +typename 
info::param_traits::return_type +get_kernel_work_group_info_host(const cl::sycl::device &Device); + +template <> +cl::sycl::range<3> +get_kernel_work_group_info_host( + const cl::sycl::device &Device); + +template <> +size_t +get_kernel_work_group_info_host( + const cl::sycl::device &Device); + +template <> +cl::sycl::range<3> get_kernel_work_group_info_host< + info::kernel_work_group::compile_work_group_size>( + const cl::sycl::device &Device); + +template <> +size_t get_kernel_work_group_info_host< + info::kernel_work_group::preferred_work_group_size_multiple>( + const cl::sycl::device &Device); + +template <> +cl_ulong +get_kernel_work_group_info_host( + const cl::sycl::device &Device); + +// OpenCL kernel sub-group methods + +template +struct get_kernel_sub_group_info_cl { + static TOut _(cl_kernel ClKernel, cl_device_id ClDevice) { + TOut Result; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelSubGroupInfo( + ClKernel, ClDevice, cl_kernel_sub_group_info(Param), 0, nullptr, + sizeof(TOut), &Result, nullptr)); + return Result; + } +}; + +template +struct get_kernel_sub_group_info_with_input_cl { + static TOut _(cl_kernel ClKernel, cl_device_id ClDevice, TIn In) { + TOut Result; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelSubGroupInfo( + ClKernel, ClDevice, cl_kernel_sub_group_info(Param), sizeof(TIn), &In, + sizeof(TOut), &Result, nullptr)); + return Result; + } +}; + +template +struct get_kernel_sub_group_info_with_input_cl, Param, + size_t> { + static cl::sycl::range<3> _(cl_kernel ClKernel, cl_device_id ClDevice, + size_t In) { + size_t Result[3]; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelSubGroupInfo( + ClKernel, ClDevice, cl_kernel_sub_group_info(Param), sizeof(size_t), + &In, sizeof(size_t) * 3, Result, nullptr)); + return cl::sycl::range<3>(Result[0], Result[1], Result[2]); + } +}; + 
+template +struct get_kernel_sub_group_info_with_input_cl> { + static size_t _(cl_kernel ClKernel, cl_device_id ClDevice, + cl::sycl::range<3> In) { + size_t Input[3] = {In[0], In[1], In[2]}; + size_t Result; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelSubGroupInfo( + ClKernel, ClDevice, cl_kernel_sub_group_info(Param), sizeof(size_t) * 3, + Input, sizeof(size_t), &Result, nullptr)); + return Result; + } +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/platform_host.hpp b/sycl/include/CL/sycl/detail/platform_host.hpp new file mode 100644 index 000000000000..ceed82b0a5bd --- /dev/null +++ b/sycl/include/CL/sycl/detail/platform_host.hpp @@ -0,0 +1,41 @@ +//==------------ platform_host.hpp - SYCL host platform --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include + +// 4.6.2 Platform class for host platform +namespace cl { +namespace sycl { + +// Forward declaration +class device; + +namespace detail { +// TODO: implement extension management +// TODO: implement parameters treatment + +class platform_host : public platform_impl { +public: + vector_class get_devices( + info::device_type dev_type = info::device_type::all) const override; + + bool has_extension(const string_class &extension_name) const override { + return false; + } + + cl_platform_id get() const override { + throw invalid_object_error("This instance of platform is a host instance"); + } + + bool is_host() const override { return true; } +}; // class platform_host +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/platform_impl.hpp b/sycl/include/CL/sycl/detail/platform_impl.hpp new file mode 100644 index 000000000000..2da2488a1f38 --- /dev/null +++ b/sycl/include/CL/sycl/detail/platform_impl.hpp @@ -0,0 +1,57 @@ +//==-------------- platform_impl.hpp - SYCL platform -----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +// 4.6.2 Platform class +namespace cl { +namespace sycl { + +// Forward declaration +class device_selector; +class device; + +namespace detail { + +class platform_impl { +public: + platform_impl() = default; + + explicit platform_impl(const device_selector &); + + virtual bool has_extension(const string_class &extension_name) const = 0; + + virtual vector_class + get_devices(info::device_type = info::device_type::all) const = 0; + + template + typename info::param_traits::return_type + get_info() const { + if (is_host()) { + return get_platform_info_host(); + } + return get_platform_info_cl< + typename info::param_traits::return_type, + param>::_(this->get()); + } + + virtual bool is_host() const = 0; + + virtual cl_platform_id get() const = 0; + + virtual ~platform_impl() = default; +}; // class platform_impl + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/platform_info.hpp b/sycl/include/CL/sycl/detail/platform_info.hpp new file mode 100644 index 000000000000..04d97f712c94 --- /dev/null +++ b/sycl/include/CL/sycl/detail/platform_info.hpp @@ -0,0 +1,68 @@ +//==------ platform_info.hpp - SYCL platform info methods ------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +// OpenCL platform information methods +template struct get_platform_info_cl {}; + +template +struct get_platform_info_cl { + static string_class _(cl_platform_id plt) { + size_t resultSize; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE( + clGetPlatformInfo(plt, cl_platform_info(param), 0, NULL, &resultSize)); + if (resultSize == 0) { + return ""; + } + unique_ptr_class result(new char[resultSize]); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetPlatformInfo(plt, cl_platform_info(param), resultSize, + result.get(), NULL)); + return result.get(); + } +}; + +template <> +struct get_platform_info_cl, + info::platform::extensions> { + static vector_class _(cl_platform_id plt) { + string_class result = + get_platform_info_cl::_(plt); + return split_string(result, ' '); + } +}; + +// Host platform information methods +template +typename info::param_traits::return_type +get_platform_info_host() = delete; + +template <> string_class get_platform_info_host(); + +template <> string_class get_platform_info_host(); + +template <> string_class get_platform_info_host(); + +template <> string_class get_platform_info_host(); + +template <> +vector_class get_platform_info_host(); + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/platform_opencl.hpp b/sycl/include/CL/sycl/detail/platform_opencl.hpp new file mode 100644 index 000000000000..47aaf1d459d5 --- /dev/null +++ b/sycl/include/CL/sycl/detail/platform_opencl.hpp @@ -0,0 +1,45 @@ +//==-------- platform_opencl.hpp - SYCL OpenCL platform --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include + +// 4.6.2 Platform class for opencl platform +namespace cl { +namespace sycl { + +// Forward declaration +class device_selector; +class device; + +namespace detail { +// TODO: implement parameters treatment +class platform_opencl : public platform_impl { +public: + platform_opencl(cl_platform_id platform_id) : id(platform_id) {} + + vector_class get_devices( + info::device_type deviceType = info::device_type::all) const override; + + bool has_extension(const string_class &extension_name) const override { + string_class all_extension_names = + get_platform_info_cl::_(id); + return (all_extension_names.find(extension_name) != std::string::npos); + } + + cl_platform_id get() const override { return id; } + + bool is_host() const override { return false; } + +private: + cl_platform_id id = 0; +}; // class platform_opencl +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/program_impl.hpp b/sycl/include/CL/sycl/detail/program_impl.hpp new file mode 100644 index 000000000000..d10f1c636bee --- /dev/null +++ b/sycl/include/CL/sycl/detail/program_impl.hpp @@ -0,0 +1,378 @@ +//==----- program_impl.hpp --- SYCL program implementation -----------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cl { +namespace sycl { + +enum class program_state { none, compiled, linked }; + +namespace detail { + +class program_impl { +public: + program_impl() = delete; + + explicit program_impl(const context &Context) + : program_impl(Context, Context.get_devices()) {} + + program_impl(const context &Context, vector_class DeviceList) + : Context(Context), Devices(DeviceList) {} + + program_impl(vector_class> ProgramList, + string_class LinkOptions = "") + : State(program_state::linked), LinkOptions(LinkOptions) { + // Verify arguments + if (ProgramList.empty()) { + throw runtime_error("Non-empty vector of programs expected"); + } + Context = ProgramList[0]->Context; + Devices = ProgramList[0]->Devices; + for (const auto &Prg : ProgramList) { + Prg->throw_if_state_is_not(program_state::compiled); + if (Prg->Context != Context) { + throw invalid_object_error( + "Not all programs are associated with the same context"); + } + if (Prg->Devices != Devices) { + throw invalid_object_error( + "Not all programs are associated with the same devices"); + } + } + + if (!is_host()) { + vector_class ClDevices(get_cl_devices()); + vector_class ClPrograms; + for (const auto &Prg : ProgramList) { + ClPrograms.push_back(Prg->ClProgram); + } + cl_int Err; + ClProgram = + clLinkProgram(Context.get(), ClDevices.size(), ClDevices.data(), + LinkOptions.c_str(), ProgramList.size(), + ClPrograms.data(), nullptr, nullptr, &Err); + CHECK_OCL_CODE_THROW(Err, compile_program_error); + } + } + + program_impl(const context &Context, cl_program ClProgram) + : ClProgram(ClProgram), Context(Context) { + // TODO it's unclear how to handle getting compile, link and build options + // in this case + // TODO handle the case when cl_program build is in progress + cl_uint NumDevices; + 
CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_NUM_DEVICES, + sizeof(cl_uint), &NumDevices, nullptr)); + vector_class ClDevices(NumDevices); + CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_DEVICES, + sizeof(cl_device_id) * NumDevices, + ClDevices.data(), nullptr)); + Devices = vector_class(ClDevices.begin(), ClDevices.end()); + // TODO check build for each device instead + cl_program_binary_type BinaryType; + CHECK_OCL_CODE(clGetProgramBuildInfo( + ClProgram, Devices[0].get(), CL_PROGRAM_BINARY_TYPE, + sizeof(cl_program_binary_type), &BinaryType, nullptr)); + switch (BinaryType) { + case CL_PROGRAM_BINARY_TYPE_NONE: + State = program_state::none; + break; + case CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT: + State = program_state::compiled; + break; + case CL_PROGRAM_BINARY_TYPE_LIBRARY: + case CL_PROGRAM_BINARY_TYPE_EXECUTABLE: + State = program_state::linked; + } + CHECK_OCL_CODE(clRetainProgram(ClProgram)); + } + + program_impl(const context &Context, cl_kernel ClKernel) + : program_impl( + Context, + ProgramManager::getInstance().getClProgramFromClKernel(ClKernel)) {} + + ~program_impl() { + // TODO replace CHECK_OCL_CODE_NO_EXC to CHECK_OCL_CODE and + // catch an exception and put it to list of asynchronous exceptions + if (!is_host() && ClProgram != nullptr) { + CHECK_OCL_CODE_NO_EXC(clReleaseProgram(ClProgram)); + } + } + + cl_program get() const { + throw_if_state_is(program_state::none); + if (is_host()) { + throw invalid_object_error("This instance of program is a host instance"); + } + CHECK_OCL_CODE(clRetainProgram(ClProgram)); + return ClProgram; + } + + bool is_host() const { return Context.is_host(); } + + template + void compile_with_kernel_type(string_class CompileOptions = "") { + throw_if_state_is_not(program_state::none); + // TODO Check for existence of kernel + if (!is_host()) { + create_cl_program_with_il(); + compile(CompileOptions); + } + State = program_state::compiled; + } + + void compile_with_source(string_class KernelSource, 
+ string_class CompileOptions = "") { + throw_if_state_is_not(program_state::none); + // TODO should it throw if it's host? + if (!is_host()) { + create_cl_program_with_source(KernelSource); + compile(CompileOptions); + } + State = program_state::compiled; + } + + template + void build_with_kernel_type(string_class BuildOptions = "") { + throw_if_state_is_not(program_state::none); + // TODO Check for existence of kernel + if (!is_host()) { + create_cl_program_with_il(); + build(BuildOptions); + } + State = program_state::linked; + } + + void build_with_source(string_class KernelSource, + string_class BuildOptions = "") { + throw_if_state_is_not(program_state::none); + // TODO should it throw if it's host? + if (!is_host()) { + create_cl_program_with_source(KernelSource); + build(BuildOptions); + } + State = program_state::linked; + } + + void link(string_class LinkOptions = "") { + throw_if_state_is_not(program_state::compiled); + if (!is_host()) { + vector_class ClDevices(get_cl_devices()); + cl_int Err; + ClProgram = clLinkProgram(Context.get(), ClDevices.size(), + ClDevices.data(), LinkOptions.c_str(), 1, + &ClProgram, nullptr, nullptr, &Err); + CHECK_OCL_CODE_THROW(Err, compile_program_error); + LinkOptions = LinkOptions; + } + State = program_state::linked; + } + + template + bool has_kernel() const +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else + { + throw_if_state_is(program_state::none); + if (is_host()) { + return true; + } + return has_cl_kernel(KernelInfo::getName()); + } +#endif + + bool has_kernel(string_class KernelName) const { + throw_if_state_is(program_state::none); + if (is_host()) { + return false; + } + return has_cl_kernel(KernelName); + } + + template + kernel get_kernel(std::shared_ptr PtrToSelf) const +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else + { + throw_if_state_is(program_state::none); + if (is_host()) { + return createSyclObjFromImpl( + std::make_shared(Context, PtrToSelf)); + } + return createSyclObjFromImpl(std::make_shared( + 
get_cl_kernel(KernelInfo::getName()), Context, PtrToSelf)); + } +#endif + + kernel get_kernel(string_class KernelName, + std::shared_ptr PtrToSelf) const { + throw_if_state_is(program_state::none); + if (is_host()) { + throw invalid_object_error("This instance of program is a host instance"); + } + return createSyclObjFromImpl(std::make_shared( + get_cl_kernel(KernelName), Context, PtrToSelf)); + } + + template + typename info::param_traits::return_type + get_info() const; + + vector_class> get_binaries() const { + throw_if_state_is(program_state::none); + vector_class BinarySizes(Devices.size()); + CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_BINARY_SIZES, + sizeof(size_t) * BinarySizes.size(), + BinarySizes.data(), nullptr)); + + vector_class> Result; + vector_class Pointers; + for (size_t I = 0; I < BinarySizes.size(); ++I) { + Result.emplace_back(BinarySizes[I]); + Pointers.push_back(Result[I].data()); + } + CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_BINARIES, + sizeof(char *) * Pointers.size(), + Pointers.data(), nullptr)); + return Result; + } + + context get_context() const { return Context; } + + vector_class get_devices() const { return Devices; } + + string_class get_compile_options() const { return CompileOptions; } + + string_class get_link_options() const { return LinkOptions; } + + string_class get_build_options() const { return BuildOptions; } + + program_state get_state() const { return State; } + +private: + void create_cl_program_with_il() { + assert(!ClProgram && "This program already has an encapsulated cl_program"); + ClProgram = ProgramManager::getInstance().getBuiltOpenCLProgram(Context); + } + + void create_cl_program_with_source(const string_class &Source) { + assert(!ClProgram && "This program already has an encapsulated cl_program"); + cl_int Err; + const char *Src = Source.c_str(); + size_t Size = Source.size(); + ClProgram = clCreateProgramWithSource(Context.get(), 1, &Src, &Size, &Err); + CHECK_OCL_CODE(Err); + } + 
+ void compile(const string_class &Options) { + vector_class ClDevices(get_cl_devices()); + // TODO make the exception message more descriptive + if (clCompileProgram(ClProgram, ClDevices.size(), ClDevices.data(), + Options.c_str(), 0, nullptr, nullptr, nullptr, + nullptr) != CL_SUCCESS) { + throw compile_program_error("Program compilation error"); + } + CompileOptions = Options; + } + + void build(const string_class &Options) { + vector_class ClDevices(get_cl_devices()); + // TODO make the exception message more descriptive + if (clBuildProgram(ClProgram, ClDevices.size(), ClDevices.data(), + Options.c_str(), nullptr, nullptr) != CL_SUCCESS) { + throw compile_program_error("Program build error"); + } + BuildOptions = Options; + } + + vector_class get_cl_devices() const { + vector_class ClDevices; + for (const auto &Device : Devices) { + ClDevices.push_back(Device.get()); + } + return ClDevices; + } + + bool has_cl_kernel(const string_class &KernelName) const { + size_t Size; + CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_KERNEL_NAMES, 0, + nullptr, &Size)); + string_class ClResult(Size, ' '); + CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_KERNEL_NAMES, + ClResult.size(), &ClResult[0], nullptr)); + // Get rid of the null terminator + ClResult.pop_back(); + vector_class KernelNames(split_string(ClResult, ';')); + for (const auto &Name : KernelNames) { + if (Name == KernelName) { + return true; + } + } + return false; + } + + cl_kernel get_cl_kernel(const string_class &KernelName) const { + cl_int Err; + cl_kernel ClKernel = clCreateKernel(ClProgram, KernelName.c_str(), &Err); + if (Err == CL_INVALID_KERNEL_NAME) { + throw invalid_object_error( + "This instance of program does not contain the kernel requested"); + } + CHECK_OCL_CODE(Err); + return ClKernel; + } + + void throw_if_state_is(program_state State) const { + if (this->State == State) { + throw invalid_object_error("Invalid program state"); + } + } + + void 
throw_if_state_is_not(program_state State) const { + if (this->State != State) { + throw invalid_object_error("Invalid program state"); + } + } + + cl_program ClProgram = nullptr; + program_state State = program_state::none; + context Context; + vector_class Devices; + string_class CompileOptions; + string_class LinkOptions; + string_class BuildOptions; +}; + +template <> +cl_uint program_impl::get_info() const; + +template <> context program_impl::get_info() const; + +template <> +vector_class program_impl::get_info() const; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/program_manager/program_manager.hpp b/sycl/include/CL/sycl/detail/program_manager/program_manager.hpp new file mode 100644 index 000000000000..56376d7848f8 --- /dev/null +++ b/sycl/include/CL/sycl/detail/program_manager/program_manager.hpp @@ -0,0 +1,86 @@ +//==------ program_manager.hpp --- SYCL program manager---------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include +#include + +/// This struct is a record of the device image information +struct __tgt_device_image { + void *ImageStart; // Pointer to the target code start + void *ImageEnd; // Pointer to the target code end +}; + +/// This struct is a record of all the host code that may be offloaded to a +/// target. +struct __tgt_bin_desc { + int32_t NumDeviceImages; // Number of device types supported + __tgt_device_image *DeviceImages; // Array of device images (1 per dev. type) +}; + +// +++ Entry points referenced by the offload wrapper object { + +/// Executed as a part of current module's (.exe, .dll) static initialization. +/// Registers device executable images with the runtime. 
+extern "C" void __tgt_register_lib(__tgt_bin_desc *desc); + +/// Executed as a part of current module's (.exe, .dll) static +/// de-initialization. +/// Unregisters device executable images with the runtime. +extern "C" void __tgt_unregister_lib(__tgt_bin_desc *desc); + +// +++ } + +namespace cl { +namespace sycl { +class context; +namespace detail { + +// Provides single loading and building OpenCL programs with unique contexts +// that is necessary for no interoperability cases with lambda. +class ProgramManager { +public: + static ProgramManager &getInstance(); + cl_program getBuiltOpenCLProgram(const context &Context); + cl_kernel getOrCreateKernel(const context &Context, const char *KernelName); + cl_program getClProgramFromClKernel(cl_kernel ClKernel); + + void setDeviceImages(__tgt_bin_desc *_DeviceImages) { + // TODO thread-unsafe, see comments in __tgt_register_lib + DeviceImages = _DeviceImages; + } + +private: + const vector_class getSpirvSource(); + void build(cl_program &ClProgram, const string_class &Options = "", + std::vector ClDevices = std::vector()); + + struct ContextLess { + bool operator()(const context &LHS, const context &RHS) const; + }; + + ProgramManager() : DeviceImages(nullptr) {} + ~ProgramManager() = default; + ProgramManager(ProgramManager const &) = delete; + ProgramManager &operator=(ProgramManager const &) = delete; + + unique_ptr_class> m_SpirvSource; + std::map m_CachedSpirvPrograms; + std::map> m_CachedKernels; + + /// Device executable images available in this module (.exe or .dll). 
+ __tgt_bin_desc *DeviceImages; +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/queue_impl.hpp b/sycl/include/CL/sycl/detail/queue_impl.hpp new file mode 100644 index 000000000000..1c2782bbd53a --- /dev/null +++ b/sycl/include/CL/sycl/detail/queue_impl.hpp @@ -0,0 +1,170 @@ +//==------------------ queue_impl.hpp - SYCL queue -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +class queue_impl { +public: + queue_impl(const device &SyclDevice, async_handler AsyncHandler, + const property_list &PropList) + : m_Device(SyclDevice), m_Context(m_Device), m_AsyncHandler(AsyncHandler), + m_PropList(PropList), m_HostQueue(m_Device.is_host()) { + m_OpenCLInterop = !m_HostQueue; + if (!m_HostQueue) { + cl_command_queue_properties CreationFlags = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + + if (m_PropList.has_property()) { + CreationFlags |= CL_QUEUE_PROFILING_ENABLE; + } + + cl_int Error = CL_SUCCESS; +#ifdef CL_VERSION_2_0 + vector_class CreationFlagProperties = { + CL_QUEUE_PROPERTIES, CreationFlags, 0}; + m_CommandQueue = clCreateCommandQueueWithProperties( + m_Context.get(), m_Device.get(), CreationFlagProperties.data(), + &Error); +#else + m_CommandQueue = clCreateCommandQueue(m_Context.get(), m_Device.get(), + CreationFlags, &Error); +#endif + CHECK_OCL_CODE(Error); + // TODO catch an exception and put it to list of asynchronous exceptions + } + } + + queue_impl(cl_command_queue CLQueue, const context &SyclContext, + const async_handler &AsyncHandler) + : m_Context(SyclContext), m_AsyncHandler(AsyncHandler), + m_CommandQueue(CLQueue), 
m_OpenCLInterop(true), m_HostQueue(false) { + + cl_device_id CLDevice = nullptr; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetCommandQueueInfo(m_CommandQueue, CL_QUEUE_DEVICE, + sizeof(CLDevice), &CLDevice, nullptr)); + m_Device = device(CLDevice); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clRetainCommandQueue(m_CommandQueue)); + } + + ~queue_impl() { + if (m_OpenCLInterop) { + CHECK_OCL_CODE_NO_EXC(clReleaseCommandQueue(m_CommandQueue)); + } + } + + cl_command_queue get() { + if (m_OpenCLInterop) { + CHECK_OCL_CODE(clRetainCommandQueue(m_CommandQueue)); + return m_CommandQueue; + } + throw invalid_object_error( + "This instance of queue doesn't support OpenCL interoperability"); + } + + context get_context() const { return m_Context; } + + device get_device() const { return m_Device; } + + bool is_host() const { return m_HostQueue; } + + template + typename info::param_traits::return_type get_info() const; + + template event submit(T cgf, std::shared_ptr self, + std::shared_ptr second_queue) { + event Event; + try { + Event = submit_impl(cgf, self); + } catch (...) { + m_Exceptions.push_back(std::current_exception()); + Event = second_queue->submit(cgf, second_queue); + } + return Event; + } + + template event submit(T cgf, std::shared_ptr self) { + event Event; + try { + Event = submit_impl(cgf, self); + } catch(...) { + m_Exceptions.push_back(std::current_exception()); + } + return Event; + } + + void wait() { + // TODO: Make thread safe. 
+ for (auto &evnt : m_Events) + evnt.wait(); + m_Events.clear(); + } + + exception_list getExceptionList() const { return m_Exceptions; } + + void wait_and_throw() { + wait(); + throw_asynchronous(); + } + + void throw_asynchronous() { + if (m_AsyncHandler && m_Exceptions.size()) { + m_AsyncHandler(m_Exceptions); + } + m_Exceptions.clear(); + } + + cl_command_queue &getHandleRef() { return m_CommandQueue; } + + template bool has_property() const { + return m_PropList.has_property(); + } + + template propertyT get_property() const { + return m_PropList.get_property(); + } + +private: + template + event submit_impl(T cgf, std::shared_ptr self) { + handler Handler(std::move(self), m_HostQueue); + cgf(Handler); + event Event = Handler.finalize(); + // TODO: Make thread safe. + m_Events.push_back(Event); + return Event; + } + + device m_Device; + context m_Context; + vector_class m_Events; + exception_list m_Exceptions; + async_handler m_AsyncHandler; + property_list m_PropList; + + cl_command_queue m_CommandQueue = nullptr; + bool m_OpenCLInterop = false; + bool m_HostQueue = false; +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/commands.cpp b/sycl/include/CL/sycl/detail/scheduler/commands.cpp new file mode 100644 index 000000000000..09f7f09342fc --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/commands.cpp @@ -0,0 +1,161 @@ +//==----------- commands.cpp -----------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace csd = cl::sycl::detail; + +namespace cl { +namespace sycl { +namespace simple_scheduler { + +template +const Dst *getParamAddress(const Src *ptr, uint64_t Offset) { + return reinterpret_cast((const char *)ptr + Offset); +} + +template +void ExecuteKernelCommand< + KernelType, Dimensions, RangeType, KernelArgType, + SingleTask>::executeKernel(std::vector DepEvents, + EventImplPtr Event) { + if (m_Queue->is_host()) { + detail::waitEvents(DepEvents); + Event->setIsHostEvent(true); + return runOnHost(); + } + + if (!m_ClKernel) { + m_ClKernel = detail::ProgramManager::getInstance().getOrCreateKernel( + m_Queue->get_context(), m_KernelName.c_str()); + } + + if (m_KernelArgs != nullptr) { + for (unsigned I = 0; I < m_KernelArgsNum; ++I) { + switch (m_KernelArgs[I].kind) { + case csd::kernel_param_kind_t::kind_std_layout: { + const void *Ptr = + getParamAddress(&m_HostKernel, m_KernelArgs[I].offset); + CHECK_OCL_CODE( + clSetKernelArg(m_ClKernel, I, m_KernelArgs[I].info, Ptr)); + break; + } + case csd::kernel_param_kind_t::kind_accessor: { + switch (static_cast(m_KernelArgs[I].info)) { + case cl::sycl::access::target::global_buffer: + case cl::sycl::access::target::constant_buffer: { + auto *Ptr = + *(getParamAddress *>( + &m_HostKernel, m_KernelArgs[I].offset)); + cl_mem CLBuf = Ptr->getOpenCLMem(); + CHECK_OCL_CODE(clSetKernelArg(m_ClKernel, I, sizeof(cl_mem), &CLBuf)); + break; + } + case cl::sycl::access::target::local: { + auto *Ptr = + getParamAddress(&m_HostKernel, m_KernelArgs[I].offset); + CHECK_OCL_CODE(clSetKernelArg(m_ClKernel, I, *Ptr, nullptr)); + break; + } + // TODO handle these cases + case cl::sycl::access::target::image: + case cl::sycl::access::target::host_buffer: + case cl::sycl::access::target::host_image: + case cl::sycl::access::target::image_array: + assert(0); + } + 
break; + } + // TODO implement + case csd::kernel_param_kind_t::kind_sampler: + assert(0); + } + } + } + for (const auto &Arg : m_InteropArgs) { + if (Arg.m_Ptr.get() != nullptr) { + CHECK_OCL_CODE(clSetKernelArg(m_ClKernel, Arg.m_ArgIndex, Arg.m_Size, + Arg.m_Ptr.get())); + } else { + cl_mem CLBuf = Arg.m_BufReq->getCLMemObject(); + CHECK_OCL_CODE( + clSetKernelArg(m_ClKernel, Arg.m_ArgIndex, sizeof(cl_mem), &CLBuf)); + } + } + + std::vector CLEvents = + detail::getOrWaitEvents(std::move(DepEvents), m_Queue->get_context()); + cl_event &CLEvent = Event->getHandleRef(); + CLEvent = runEnqueueNDRangeKernel(m_Queue->getHandleRef(), m_ClKernel, + std::move(CLEvents)); + Event->setIsHostEvent(false); +} + +template +template +typename std::enable_if>::value, + cl_event>::type +ExecuteKernelCommand< + KernelType, Dimensions, RangeType, KernelArgType, + SingleTask>::runEnqueueNDRangeKernel(cl_command_queue &EnvQueue, + cl_kernel &Kernel, + std::vector CLEvents) { + size_t GlobalWorkSize[Dimensions]; + size_t GlobalWorkOffset[Dimensions]; + for (int I = 0; I < Dimensions; I++) { + GlobalWorkSize[I] = m_WorkItemsRange[I]; + GlobalWorkOffset[I] = m_WorkItemsOffset[I]; + } + cl_event CLEvent; + cl_int error = clEnqueueNDRangeKernel( + EnvQueue, Kernel, Dimensions, GlobalWorkOffset, GlobalWorkSize, nullptr, + CLEvents.size(), CLEvents.data(), &CLEvent); + CHECK_OCL_CODE(error); + return CLEvent; +} + +template +template +typename std::enable_if>::value, + cl_event>::type +ExecuteKernelCommand< + KernelType, Dimensions, RangeType, KernelArgType, + SingleTask>::runEnqueueNDRangeKernel(cl_command_queue &EnvQueue, + cl_kernel &Kernel, + std::vector CLEvents) { + size_t GlobalWorkSize[Dimensions]; + size_t LocalWorkSize[Dimensions]; + size_t GlobalWorkOffset[Dimensions]; + for (int I = 0; I < Dimensions; I++) { + GlobalWorkSize[I] = m_WorkItemsRange.get_global_range()[I]; + LocalWorkSize[I] = m_WorkItemsRange.get_local_range()[I]; + GlobalWorkOffset[I] = 
m_WorkItemsRange.get_offset()[I]; + } + cl_event CLEvent; + cl_int Err = clEnqueueNDRangeKernel( + EnvQueue, Kernel, Dimensions, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize, CLEvents.size(), CLEvents.data(), &CLEvent); + CHECK_OCL_CODE(Err); + return CLEvent; +} + +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/commands.h b/sycl/include/CL/sycl/detail/scheduler/commands.h new file mode 100644 index 000000000000..ab038e6e37dd --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/commands.h @@ -0,0 +1,400 @@ +//==----------- commands.h -------------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +class queue_impl; +} +namespace simple_scheduler { +using QueueImplPtr = std::shared_ptr; +using EventImplPtr = std::shared_ptr; +namespace csd = cl::sycl::detail; + +class Command { +public: + enum CommandType { RUN_KERNEL, MOVE_MEMORY, ALLOCA, COPY, FILL }; + + Command(CommandType Type, QueueImplPtr Queue); + + CommandType getType() const { return m_Type; } + + size_t getID() const { return m_ID; } + + void addDep(std::shared_ptr Dep, BufferReqPtr Buf) { + m_Deps.emplace_back(std::move(Dep), std::move(Buf)); + } + + void addInteropArg(InteropArg Arg) { m_InteropArgs.push_back(Arg); } + + cl::sycl::event enqueue(std::vector DepEvents) { + bool Expected = false; + if (m_Enqueued.compare_exchange_strong(Expected, true)) { + enqueueImp(std::move(DepEvents), detail::getSyclObjImpl(m_Event)); + } + return m_Event; + } + + bool isEnqueued() const { return m_Enqueued; 
} + + virtual void dump() const = 0; + + virtual void print(std::ostream &Stream) const = 0; + + virtual void printDot(std::ostream &Stream) const = 0; + + QueueImplPtr getQueue() const { return m_Queue; } + + cl::sycl::event getEvent() const { return m_Event; } + + std::shared_ptr getDepCommandForReqBuf(const BufferReqPtr &Buf) { + for (const auto &Dep : m_Deps) { + if (Dep.second->isSame(Buf)) { + return Dep.first; + } + } + return nullptr; + } + + cl::sycl::access::mode getAccessModeForReqBuf(const BufferReqPtr &Buf) const { + for (const auto &Dep : m_Deps) { + if (Dep.second->isSame(Buf)) { + return Dep.second->getAccessModeType(); + } + } + throw cl::sycl::runtime_error("Buffer not found."); + } + + void replaceDepCommandForReqBuf(const BufferReqPtr &Buf, + std::shared_ptr NewCommand) { + for (auto &Dep : m_Deps) { + if (Dep.second->isSame(Buf)) { + Dep.first = std::move(NewCommand); + return; + } + } + throw cl::sycl::runtime_error("Buffer not found."); + } + + std::vector, BufferReqPtr>> + getDependencies() { + return m_Deps; + } + + void removeAllDeps() { m_Deps.clear(); } + + virtual ~Command() = default; + +private: + virtual void enqueueImp(std::vector DepEvents, + EventImplPtr Event) = 0; + + CommandType m_Type; + size_t m_ID; + cl::sycl::event m_Event; + std::atomic m_Enqueued; + +protected: + QueueImplPtr m_Queue; + std::vector, BufferReqPtr>> m_Deps; + std::vector m_InteropArgs; +}; + +using CommandPtr = std::shared_ptr; + +class MemMoveCommand : public Command { +public: + MemMoveCommand(BufferReqPtr Buf, QueueImplPtr SrcQueue, QueueImplPtr DstQueue, + cl::sycl::access::mode mode) + : Command(Command::MOVE_MEMORY, std::move(DstQueue)), + m_Buf(std::move(Buf)), m_AccessMode(mode), + m_SrcQueue(std::move(SrcQueue)) {} + + access::mode getAccessModeType() const { return m_Buf->getAccessModeType(); } + void printDot(std::ostream &Stream) const override; + void print(std::ostream &Stream) const override; + void dump() const override { print(std::cout); } 
+ +private: + void enqueueImp(std::vector DepEvents, + EventImplPtr Event) override; + BufferReqPtr m_Buf = nullptr; + cl::sycl::access::mode m_AccessMode; + QueueImplPtr m_SrcQueue; +}; + +class AllocaCommand : public Command { +public: + AllocaCommand(BufferReqPtr Buf, QueueImplPtr Queue, + cl::sycl::access::mode mode) + : Command(Command::ALLOCA, std::move(Queue)), m_Buf(std::move(Buf)), + m_AccessMode(mode) {} + + access::mode getAccessModeType() const { return m_Buf->getAccessModeType(); } + void printDot(std::ostream &Stream) const override; + void print(std::ostream &Stream) const override; + void dump() const override { print(std::cout); } + +private: + void enqueueImp(std::vector DepEvents, + EventImplPtr Event) override; + BufferReqPtr m_Buf = nullptr; + cl::sycl::access::mode m_AccessMode; +}; + +template +class ExecuteKernelCommand : public Command { +public: + ExecuteKernelCommand(KernelType &HostKernel, const std::string KernelName, + const unsigned int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + RangeType workItemsRange, QueueImplPtr Queue, + cl_kernel ClKernel, id workItemOffset = {}) + : Command(Command::RUN_KERNEL, std::move(Queue)), + m_KernelName(KernelName), m_KernelArgsNum(KernelArgsNum), + m_KernelArgs(KernelArgs), m_WorkItemsRange(workItemsRange), + m_WorkItemsOffset(workItemOffset), m_HostKernel(HostKernel), + m_ClKernel(ClKernel) {} + + void printDot(std::ostream &Stream) const override; + void print(std::ostream &Stream) const override; + void dump() const override { print(std::cout); } + +private: + cl_kernel createKernel(const std::string &KernelName, + cl_program Program) const; + + template ::type> + void runOnHost() { + m_HostKernel(); + } + + template + typename std::enable_if< + (STask == false) && (Dims > 0 && Dims < 4) && + std::is_same>::value && + std::is_same>::value, + void>::type + runOnHost() { + const size_t ZMax = (Dims > 2) ? m_WorkItemsRange[2] : 1; + const size_t YMax = (Dims > 1) ? 
m_WorkItemsRange[1] : 1; + size_t XYZ[3]; + for (XYZ[2] = 0; XYZ[2] < ZMax; ++XYZ[2]) { + for (XYZ[1] = 0; XYZ[1] < YMax; ++XYZ[1]) { + for (XYZ[0] = 0; XYZ[0] < m_WorkItemsRange[0]; ++XYZ[0]) { + id ID; + for (int I = 0; I < Dims; ++I) { + ID[I] = XYZ[I]; + } + m_HostKernel(ID); + } + } + } + } + + template + typename std::enable_if< + (STask == false) && (Dims > 0 && Dims < 4) && + std::is_same>::value && + (std::is_same>::value || + std::is_same>::value), + void>::type + runOnHost() { + const size_t ZMax = (Dims > 2) ? m_WorkItemsRange[2] : 1; + const size_t YMax = (Dims > 1) ? m_WorkItemsRange[1] : 1; + size_t XYZ[3]; + for (XYZ[2] = 0; XYZ[2] < ZMax; ++XYZ[2]) { + for (XYZ[1] = 0; XYZ[1] < YMax; ++XYZ[1]) { + for (XYZ[0] = 0; XYZ[0] < m_WorkItemsRange[0]; ++XYZ[0]) { + id ID; + range Range; + for (int I = 0; I < Dims; ++I) { + ID[I] = XYZ[I]; + Range[I] = m_WorkItemsRange[I]; + } + item Item = + detail::Builder::createItem(Range, ID); + m_HostKernel(Item); + } + } + } + } + + template + typename std::enable_if< + (STask == false) && (Dims > 0 && Dims < 4) && + std::is_same>::value, + void>::type + runOnHost() { + // TODO add offset logic + + const id<3> GlobalSize{ + m_WorkItemsRange.get_global_range()[0], + ((Dims > 1) ? m_WorkItemsRange.get_global_range()[1] : 1), + ((Dims > 2) ? m_WorkItemsRange.get_global_range()[2] : 1)}; + const id<3> LocalSize{ + m_WorkItemsRange.get_local_range()[0], + ((Dims > 1) ? m_WorkItemsRange.get_local_range()[1] : 1), + ((Dims > 2) ? 
m_WorkItemsRange.get_local_range()[2] : 1)}; + id<3> GroupSize; + for (int I = 0; I < 3; ++I) { + GroupSize[I] = GlobalSize[I] / LocalSize[I]; + } + + size_t GlobalXYZ[3]; + for (GlobalXYZ[2] = 0; GlobalXYZ[2] < GroupSize[2]; ++GlobalXYZ[2]) { + for (GlobalXYZ[1] = 0; GlobalXYZ[1] < GroupSize[1]; ++GlobalXYZ[1]) { + for (GlobalXYZ[0] = 0; GlobalXYZ[0] < GroupSize[0]; ++GlobalXYZ[0]) { + id ID; + for (int I = 0; I < Dims; ++I) { + ID[I] = GlobalXYZ[I]; + } + group Group = detail::Builder::createGroup( + m_WorkItemsRange.get_global_range(), + m_WorkItemsRange.get_local_range(), ID); + size_t LocalXYZ[3]; + for (LocalXYZ[2] = 0; LocalXYZ[2] < LocalSize[2]; ++LocalXYZ[2]) { + for (LocalXYZ[1] = 0; LocalXYZ[1] < LocalSize[1]; ++LocalXYZ[1]) { + for (LocalXYZ[0] = 0; LocalXYZ[0] < LocalSize[0]; ++LocalXYZ[0]) { + id GlobalID; + id LocalID; + for (int I = 0; I < Dims; ++I) { + GlobalID[I] = GlobalXYZ[I] * LocalSize[I] + LocalXYZ[I]; + LocalID[I] = LocalXYZ[I]; + } + const item GlobalItem = + detail::Builder::createItem( + m_WorkItemsRange.get_global_range(), GlobalID, + m_WorkItemsRange.get_offset()); + const item LocalItem = + detail::Builder::createItem( + m_WorkItemsRange.get_local_range(), LocalID); + nd_item NDItem = detail::Builder::createNDItem( + GlobalItem, LocalItem, Group); + m_HostKernel(NDItem); + } + } + } + } + } + } + } + + void executeKernel(std::vector DepEvents, + EventImplPtr Event); + + void enqueueImp(std::vector DepEvents, + EventImplPtr Event) override { + executeKernel(std::move(DepEvents), std::move(Event)); + } + + template + typename std::enable_if>::value, + cl_event>::type + runEnqueueNDRangeKernel(cl_command_queue &EnvQueue, cl_kernel &Kernel, + std::vector CLEvents); + + template + typename std::enable_if>::value, + cl_event>::type + runEnqueueNDRangeKernel(cl_command_queue &EnvQueue, cl_kernel &Kernel, + std::vector CLEvents); + + std::string m_KernelName; + const unsigned int m_KernelArgsNum; + const detail::kernel_param_desc_t 
*m_KernelArgs; + RangeType m_WorkItemsRange; + id m_WorkItemsOffset; + KernelType m_HostKernel; + cl_kernel m_ClKernel; +}; + +template class FillCommand : public Command { +public: + FillCommand(BufferReqPtr Buf, T Pattern, QueueImplPtr Queue, range Range, + id Offset) + : Command(Command::FILL, std::move(Queue)), m_Buf(std::move(Buf)), + m_Pattern(std::move(Pattern)), m_Offset(std::move(Offset)), + m_Range(std::move(Range)) {} + + access::mode getAccessModeType() const { return m_Buf->getAccessModeType(); } + void printDot(std::ostream &Stream) const override; + void print(std::ostream &Stream) const override; + void dump() const override { print(std::cout); } + +private: + void enqueueImp(std::vector DepEvents, EventImplPtr Event) { + assert(nullptr != m_Buf && "Buf is nullptr"); + m_Buf->fill(m_Queue, std::move(DepEvents), std::move(Event), &m_Pattern, + sizeof(T), Dim, &m_Offset[0], &m_Range[0]); + } + BufferReqPtr m_Buf = nullptr; + T m_Pattern; + id m_Offset; + range m_Range; +}; + +template class CopyCommand : public Command { +public: + CopyCommand(BufferReqPtr BufSrc, BufferReqPtr BufDest, QueueImplPtr Queue, + range SrcRange, id SrcOffset, + id DestOffset, size_t SizeTySrc, size_t SizeSrc, + range BuffSrcRange) + : Command(Command::COPY, std::move(Queue)), m_BufSrc(std::move(BufSrc)), + m_BufDest(std::move(BufDest)), m_SrcRange(std::move(SrcRange)), + m_SrcOffset(std::move(SrcOffset)), m_DestOffset(std::move(DestOffset)), + m_SizeTySrc(SizeTySrc), m_SizeSrc(SizeSrc), + m_BuffSrcRange(BuffSrcRange) {} + + access::mode getAccessModeType() const { + return m_BufDest->getAccessModeType(); + } + void printDot(std::ostream &Stream) const override; + void print(std::ostream &Stream) const override; + void dump() const override { print(std::cout); } + +private: + void enqueueImp(std::vector DepEvents, EventImplPtr Event) { + assert(nullptr != m_BufSrc && "m_BufSrc is nullptr"); + assert(nullptr != m_BufDest && "m_BufDest is nullptr"); + m_BufDest->copy(m_Queue, 
std::move(DepEvents), std::move(Event), m_BufSrc, + DimSrc, &m_SrcRange[0], &m_SrcOffset[0], &m_DestOffset[0], + m_SizeTySrc, m_SizeSrc, &m_BuffSrcRange[0]); + } + BufferReqPtr m_BufSrc = nullptr; + BufferReqPtr m_BufDest = nullptr; + range m_SrcRange; + id m_SrcOffset; + id m_DestOffset; + size_t m_SizeTySrc; + size_t m_SizeSrc; + range m_BuffSrcRange; +}; + +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/printers.cpp b/sycl/include/CL/sycl/detail/scheduler/printers.cpp new file mode 100644 index 000000000000..660f470a8365 --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/printers.cpp @@ -0,0 +1,202 @@ +//==----------- printers.cpp -----------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +namespace cl { +namespace sycl { +namespace simple_scheduler { + +static std::string accessMode2String(cl::sycl::access::mode Type) { + switch (Type) { + case access::mode::write: + return "write"; + case access::mode::read: + return "read"; + case access::mode::read_write: + return "read_write"; + default: + return "unhandled"; + } +} + +static std::string +getDeviceTypeString(const cl::sycl::device &Device, + access::target Target = access::target::global_buffer) { + if (access::target::host_buffer == Target) { + return "User host."; + } + if (Device.is_cpu()) { + return "CPU"; + } + if (Device.is_gpu()) { + return "GPU"; + } + if (Device.is_accelerator()) { + return "ACC"; + } + if (Device.is_host()) { + return "HOST"; + } + return ""; +} + +static std::string +getColor(const cl::sycl::device &Device, + access::target Target = access::target::global_buffer) { + if 
(access::target::host_buffer == Target) { + return "#FFDEAD"; // navajowhite1 + } + if (Device.is_cpu()) { + return "#00BFFF"; // deepskyblue1 + } + if (Device.is_gpu()) { + return "#00FF7F"; // green + } + if (Device.is_accelerator()) { + return "#FF0000"; // red + } + if (Device.is_host()) { + return "#FFBBFF"; // plum1 + } + return ""; +} + +template +void ExecuteKernelCommand::printDot(std::ostream &Stream) const { + const std::string CommandColor = getColor(m_Queue->get_device()); + + Stream << "\"" << this << "\" [style=filled, label=\""; + + Stream << "ID = " << getID() << " ; "; + Stream << "RUN_KERNEL " + << "\\n" + << m_KernelName << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << "\\n"; + + Stream << "\", fillcolor=\"" << CommandColor << "\"];" << std::endl; + + for (const auto &Dep : m_Deps) { + const auto &Buf = Dep.second; + Stream << " \"" << this << "\" -> \"" << Dep.first << "\" [ label=\""; + Stream << accessMode2String(Buf->getAccessModeType()) << "\" ];"; + Stream << std::endl; + } +} + +template +void ExecuteKernelCommand::print(std::ostream &Stream) const { + Stream << "ID = " << getID() << " ; "; + Stream << "RUN_KERNEL " << m_KernelName << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << std::endl; + Stream << " Dependency:" << std::endl; + + for (const auto &Dep : m_Deps) { + const auto &Command = Dep.first; + const auto &Buf = Dep.second; + Stream << " Dep on buf " << Buf->getUniqID() << " "; + Stream << accessMode2String(Buf->getAccessModeType()); + Stream << " from Command ID = " << Command->getID() << std::endl; + } +} + +template +void FillCommand::printDot(std::ostream &Stream) const { + const std::string CommandColor = getColor(m_Queue->get_device()); + + Stream << "\"" << this << "\" [style=filled, label=\""; + + Stream << "ID = " << getID() << " ; "; + Stream << "Fill " + << "\\n" + << " Buf : " << m_Buf->getUniqID() << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << "\\n"; + + 
Stream << "\", fillcolor=\"" << CommandColor << "\"];" << std::endl; + + for (const auto &Dep : m_Deps) { + const auto &Buf = Dep.second; + Stream << " \"" << this << "\" -> \"" << Dep.first << "\" [ label=\""; + Stream << accessMode2String(Buf->getAccessModeType()) << "\" ];"; + Stream << std::endl; + } +} + +template +void FillCommand::print(std::ostream &Stream) const { + Stream << "ID = " << getID() << " ; "; + Stream << "Fill " + << " Buf : " << m_Buf->getUniqID() << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << std::endl; + Stream << " Dependency:" << std::endl; + + for (const auto &Dep : m_Deps) { + const auto &Command = Dep.first; + const auto &Buf = Dep.second; + Stream << " Dep on buf " << Buf->getUniqID() << " "; + Stream << accessMode2String(Buf->getAccessModeType()); + Stream << " from Command ID = " << Command->getID() << std::endl; + } +} + +template +void CopyCommand::printDot(std::ostream &Stream) const { + const std::string CommandColor = getColor(m_Queue->get_device()); + + Stream << "\"" << this << "\" [style=filled, label=\""; + + Stream << "ID = " << getID() << " ; "; + Stream << "Copy " + << "\\n" + << " Buf : " << m_BufSrc->getUniqID() << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << "\\n"; + Stream << " To Buf : " << m_BufDest->getUniqID(); + + Stream << "\", fillcolor=\"" << CommandColor << "\"];" << std::endl; + + for (const auto &Dep : m_Deps) { + const auto &Buf = Dep.second; + Stream << " \"" << this << "\" -> \"" << Dep.first << "\" [ label=\""; + Stream << accessMode2String(Buf->getAccessModeType()) << "\" ];"; + Stream << std::endl; + } +} + +template +void CopyCommand::print(std::ostream &Stream) const { + Stream << "ID = " << getID() << " ; "; + Stream << "Copy " + << " Buf : " << m_BufSrc->getUniqID() << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << std::endl; + Stream << " Buf : " << m_BufDest->getUniqID(); + Stream << " Dependency:" << std::endl; + + for (const auto 
&Dep : m_Deps) { + const auto &Command = Dep.first; + const auto &Buf = Dep.second; + Stream << " Dep on buf " << Buf->getUniqID() << " "; + Stream << accessMode2String(Buf->getAccessModeType()); + Stream << " from Command ID = " << Command->getID() << std::endl; + } +} + +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/requirements.h b/sycl/include/CL/sycl/detail/scheduler/requirements.h new file mode 100644 index 000000000000..7e36271b8220 --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/requirements.h @@ -0,0 +1,169 @@ +//==----------- requirements.h ---------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +template class buffer_impl; +} // namespace detail + +namespace detail { +class queue_impl; +class event_impl; +} // namespace detail +namespace simple_scheduler { + +using QueueImplPtr = std::shared_ptr; +using EventImplPtr = std::shared_ptr; + +class BufferRequirement; +using BufferReqPtr = std::shared_ptr; + +class BufferRequirement { +public: + BufferRequirement(void *UniqID, access::mode AccessMode, + access::target TargetType) + : m_UniqID(UniqID), m_AccessMode(AccessMode), m_TargetType(TargetType) {} + + virtual ~BufferRequirement() = default; + + bool isBigger(const std::shared_ptr &RHS) const { + return m_UniqID > RHS->m_UniqID; + } + + bool isSame(const std::shared_ptr &RHS) const { + return m_UniqID == RHS->m_UniqID; + } + + void *getUniqID() const { return m_UniqID; } + + access::mode getAccessModeType() const { return m_AccessMode; } + + virtual cl_mem getCLMemObject() = 0; + + virtual void 
allocate(QueueImplPtr Queue, + std::vector DepEvents, + EventImplPtr Event) = 0; + + virtual void moveMemoryTo(QueueImplPtr Queue, + std::vector DepEvents, + EventImplPtr Event) = 0; + + virtual void fill(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, void *Pattern, size_t PatternSize, + int Dim, size_t *Offset, size_t *Range) = 0; + + virtual void copy(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, BufferReqPtr SrcReq, const int DimSrc, + const size_t *const SrcRange, const size_t *const SrcOffset, + const size_t *const DestOffset, const size_t SizeTySrc, + const size_t SizeSrc, const size_t *const BuffSrcRange) = 0; + + access::target getTargetType() const { return m_TargetType; } + + void addAccessMode(const access::mode AccessMode) { + if (access::mode::read == m_AccessMode && + access::mode::read != AccessMode) { + m_AccessMode = access::mode::read_write; + } else if (access::mode::write == m_AccessMode && + (AccessMode != access::mode::write && + AccessMode != access::mode::discard_write)) { + m_AccessMode = access::mode::read_write; + } + } + +protected: + void *m_UniqID; + access::mode m_AccessMode; + access::target m_TargetType; +}; + +template +class BufferStorage : public BufferRequirement { +public: + BufferStorage( + typename cl::sycl::detail::buffer_impl &Buffer) + : BufferRequirement(&Buffer, Mode, Target), m_Buffer(&Buffer) {} + + void allocate(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event) override { + assert(m_Buffer != nullptr && "BufferStorage::m_Buffer is nullptr"); + m_Buffer->allocate(std::move(Queue), std::move(DepEvents), std::move(Event), + Mode); + } + + void moveMemoryTo(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event) override { + assert(m_Buffer != nullptr && "BufferStorage::m_Buffer is nullptr"); + m_Buffer->moveMemoryTo(std::move(Queue), std::move(DepEvents), + std::move(Event)); + } + + void fill(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, 
void *Pattern, size_t PatternSize, int Dim, + size_t *Offset, size_t *Range) override { + assert(m_Buffer != nullptr && "BufferStorage::m_Buffer is nullptr"); + m_Buffer->fill(std::move(Queue), std::move(DepEvents), std::move(Event), + std::move(Pattern), PatternSize, Dim, Offset, Range); + } + + void copy(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, BufferReqPtr SrcReq, const int DimSrc, + const size_t *const SrcRange, const size_t *const SrcOffset, + const size_t *const DestOffset, const size_t SizeTySrc, + const size_t SizeSrc, const size_t *const BuffSrcRange) override { + assert(m_Buffer != nullptr && "BufferStorage::m_Buffer is nullptr"); + assert(SrcReq != nullptr && "BufferStorage::SrcReq is nullptr"); + + m_Buffer->copy(std::move(Queue), std::move(DepEvents), std::move(Event), + std::move(SrcReq), DimSrc, SrcRange, SrcOffset, DestOffset, + SizeTySrc, SizeSrc, BuffSrcRange); + } + + cl_mem getCLMemObject() override { + assert(m_Buffer != nullptr && "BufferStorage::m_Buffer is nullptr"); + return m_Buffer->getOpenCLMem(); + } + +private: + cl::sycl::detail::buffer_impl *m_Buffer = nullptr; +}; + +struct classcomp { + bool operator()(const BufferReqPtr &LHS, const BufferReqPtr &RHS) const { + return LHS->isBigger(RHS); + } +}; + +// Represents a call of set_arg made in the SYCL application +struct InteropArg { + shared_ptr_class m_Ptr; + size_t m_Size; + int m_ArgIndex; + BufferReqPtr m_BufReq; + + InteropArg(shared_ptr_class Ptr, size_t Size, int ArgIndex, + BufferReqPtr BufReq) + : m_Ptr(Ptr), m_Size(Size), m_ArgIndex(ArgIndex), m_BufReq(BufReq) {} +}; + +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/scheduler.cpp b/sycl/include/CL/sycl/detail/scheduler/scheduler.cpp new file mode 100644 index 000000000000..dd12594823d1 --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/scheduler.cpp @@ -0,0 +1,303 @@ +//==----------- scheduler.cpp 
----------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace simple_scheduler { + +template +static BufferReqPtr +getReqForBuffer(const std::set &BufReqs, + const detail::buffer_impl &Buf) { + for (const auto &Req : BufReqs) { + if (Req->getUniqID() == &Buf) { + return Req; + } + } + return nullptr; +} + +// Adds a buffer requirement for this node. +template +void Node::addBufRequirement( + detail::buffer_impl &Buf) { + BufferReqPtr Req = getReqForBuffer(m_Bufs, Buf); + + // Check if there is requirement for the same buffer already. + if (nullptr != Req) { + Req->addAccessMode(Mode); + } else { + BufferReqPtr BufStor = std::make_shared< + BufferStorage>(Buf); + m_Bufs.insert(BufStor); + } +} + +// Adds an accessor requirement for this node. +template +void Node::addAccRequirement( + accessor &&Acc, + int argIndex) { + detail::buffer_impl *buf = + Acc.template accessor_base::__impl() + ->m_Buf; + addBufRequirement(*buf); + addInteropArg(nullptr, buf->get_size(), argIndex, + getReqForBuffer(m_Bufs, *buf)); +} + +// Adds a kernel to this node, maps to single task. +template +void Node::addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, cl_kernel ClKernel) { + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = + std::make_shared, id<1>, + /*SingleTask=*/true>>( + KernelFunc, KernelName, KernelArgsNum, KernelArgs, range<1>(1), + m_Queue, ClKernel); +} + +// Adds kernel to this node, maps on range parallel for. 
+template +void Node::addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, range NumWorkItems, + cl_kernel ClKernel) { + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = + std::make_shared, KernelArgType>>( + KernelFunc, KernelName, KernelArgsNum, KernelArgs, NumWorkItems, + m_Queue, ClKernel); +} + +// Adds kernel to this node, maps to range parallel for with offset. +template +void Node::addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, range NumWorkItems, + id WorkItemOffset, cl_kernel ClKernel) { + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = + std::make_shared, KernelArgType>>( + KernelFunc, KernelName, KernelArgsNum, KernelArgs, NumWorkItems, + m_Queue, ClKernel, WorkItemOffset); +} +// Adds kernel to this node, maps on nd_range parallel for. 
+template +void Node::addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, nd_range ExecutionRange, + cl_kernel ClKernel) { + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = std::make_shared, nd_item>>( + KernelFunc, KernelName, KernelArgsNum, KernelArgs, ExecutionRange, + m_Queue, ClKernel); +} + +// Adds explicit memory operation to this node, maps on handler fill method +template +void Node::addExplicitMemOp( + accessor &Dest, T Src) { + auto *DestBase = Dest.template accessor_base::__impl(); + assert(DestBase != nullptr && + "Accessor should have an initialized accessor_base"); + detail::buffer_impl *Buf = DestBase->m_Buf; + + range Range = DestBase->Range; + id Offset = DestBase->Offset; + + BufferReqPtr Req = getReqForBuffer(m_Bufs, *Buf); + assert(Buf != nullptr && "Accessor should have an initialized buffer_impl"); + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = std::make_shared>(Req, Src, m_Queue, + Range, Offset); +} + +// Adds explicit memory operation to this node, maps on handler copy method +template +void Node::addExplicitMemOp( + accessor Src, + accessor Dest) { + auto *SrcBase = Src.template accessor_base::__impl(); + assert(SrcBase != nullptr && + "Accessor should have an initialized accessor_base"); + auto *DestBase = + Dest.template accessor_base::__impl(); + assert(DestBase != nullptr && + "Accessor should have an initialized accessor_base"); + + detail::buffer_impl *SrcBuf = SrcBase->m_Buf; + assert(SrcBuf != nullptr && + "Accessor should have an initialized buffer_impl"); + detail::buffer_impl *DestBuf = DestBase->m_Buf; + assert(DestBuf != nullptr && + "Accessor should have an initialized buffer_impl"); + + range SrcRange = SrcBase->Range; + id SrcOffset = SrcBase->Offset; + id DestOffset = DestBase->Offset; + + range BuffSrcRange = SrcBase->m_Buf->get_range(); + + BufferReqPtr 
SrcReq = getReqForBuffer(m_Bufs, *SrcBuf); + BufferReqPtr DestReq = getReqForBuffer(m_Bufs, *DestBuf); + + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = std::make_shared>( + SrcReq, DestReq, m_Queue, SrcRange, SrcOffset, DestOffset, sizeof(T_src), + SrcBase->get_count(), BuffSrcRange); +} + +// Updates host data of the specified accessor +template +void Scheduler::updateHost( + accessor &Acc, + cl::sycl::event &Event) { + auto *AccBase = Acc.template accessor_base::__impl(); + assert(AccBase != nullptr && + "Accessor should have an initialized accessor_base"); + detail::buffer_impl *Buf = AccBase->m_Buf; + + updateHost(*Buf, Event); +} + +template +void Scheduler::copyBack(detail::buffer_impl &Buf) { + cl::sycl::event Event; + updateHost(Buf, Event); + detail::getSyclObjImpl(Event)->waitInternal(); +} + +// Updates host data of the specified buffer_impl +template +void Scheduler::updateHost(detail::buffer_impl &Buf, + cl::sycl::event &Event) { + CommandPtr UpdateHostCmd; + BufferReqPtr BufStor = + std::make_shared>( + Buf); + + if (0 == m_BuffersEvolution.count(BufStor)) { + return; + } + + // TODO: Find a better way to say that we need copy to HOST, just nullptr? + cl::sycl::device HostDevice; + UpdateHostCmd = std::make_shared( + BufStor, m_BuffersEvolution[BufStor].back()->getQueue(), + detail::getSyclObjImpl(cl::sycl::queue(HostDevice)), + cl::sycl::access::mode::read_write); + + // Add dependency if there was operations with the buffer already. 
+ UpdateHostCmd->addDep(m_BuffersEvolution[BufStor].back(), BufStor); + + m_BuffersEvolution[BufStor].push_back(UpdateHostCmd); + Event = EnqueueCommand(std::move(UpdateHostCmd)); +} + +template +void Scheduler::removeBuffer( + detail::buffer_impl &Buf) { + BufferReqPtr BufStor = std::make_shared< + BufferStorage>(Buf); + + if (0 == m_BuffersEvolution.count(BufStor)) { + return; + } + + for (auto Cmd : m_BuffersEvolution[BufStor]) { + Cmd->removeAllDeps(); + } + + m_BuffersEvolution.erase(BufStor); +} + +static bool cmdsHaveEqualCxtAndDev(const CommandPtr &LHS, + const CommandPtr &RHS) { + return LHS->getQueue()->get_device() == RHS->getQueue()->get_device() && + LHS->getQueue()->get_context() == LHS->getQueue()->get_context(); +} + +// Adds new node to graph, creating an Alloca and MemMove commands if +// needed. +inline cl::sycl::event Scheduler::addNode(Node NewNode) { + // Process global buffers. + CommandPtr Cmd = NewNode.getKernel(); + for (auto Buf : NewNode.getRequirements()) { + // If it's the first command for buffer - insert alloca command. + if (m_BuffersEvolution[Buf].empty()) { + CommandPtr AllocaCmd = + std::make_shared(Buf, std::move(NewNode.getQueue()), + cl::sycl::access::mode::read_write); + m_BuffersEvolution[Buf].push_back(AllocaCmd); + } + // If targets of previous and new command differ - insert memmove command. + if (!cmdsHaveEqualCxtAndDev(m_BuffersEvolution[Buf].back(), Cmd)) { + CommandPtr MemMoveCmd = std::make_shared( + Buf, std::move(m_BuffersEvolution[Buf].back()->getQueue()), + std::move(NewNode.getQueue()), cl::sycl::access::mode::read_write); + MemMoveCmd->addDep(m_BuffersEvolution[Buf].back(), Buf); + m_BuffersEvolution[Buf].push_back(MemMoveCmd); + } + // Finally insert command to the buffer evolution vector. 
+ Cmd->addDep(m_BuffersEvolution[Buf].back(), Buf); + m_BuffersEvolution[Buf].push_back(Cmd); + } + // Process arguments set via interoperability interface + for (auto Arg : NewNode.getInteropArgs()) { + Cmd->addInteropArg(Arg); + } + // If the kernel has no requirements, store the event + if (NewNode.getRequirements().empty()) { + m_EventsWithoutRequirements.push_back( + detail::getSyclObjImpl(Cmd->getEvent())); + } + return EnqueueCommand(Cmd); +} +//} +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/scheduler.h b/sycl/include/CL/sycl/detail/scheduler/scheduler.h new file mode 100644 index 000000000000..66b61c3c868c --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/scheduler.h @@ -0,0 +1,233 @@ +//==----------- scheduler.h ------------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +// Forward declaration +template +class accessor; + +namespace detail { +class queue_impl; +} +using QueueImplPtr = std::shared_ptr; + +namespace simple_scheduler { + +class Node { +public: + Node(QueueImplPtr Queue) : m_Queue(std::move(Queue)) {} + + Node(Node &&RHS) + : m_Bufs(std::move(RHS.m_Bufs)), + m_InteropArgs(std::move(RHS.m_InteropArgs)), + m_Kernel(std::move(RHS.m_Kernel)), m_Queue(std::move(RHS.m_Queue)), + m_NextOCLIndex(RHS.m_NextOCLIndex) {} + + // Adds a buffer requirement for this node. + template + void addBufRequirement(detail::buffer_impl &Buf); + + // Adds an accessor requirement for this node. 
+ template + void addAccRequirement(accessor &&Acc, + int argIndex); + + // Adds a kernel to this node, maps to single task. + template + void addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, cl_kernel ClKernel = nullptr); + + // Adds kernel to this node, maps on range parallel for. + template + void addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, range NumWorkItems, + cl_kernel ClKernel = nullptr); + + // Adds kernel to this node, maps on range parallel for with offset. + template + void addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, range NumWorkItems, + id WorkItemOffset, cl_kernel ClKernel = nullptr); + + // Adds kernel to this node, maps on nd_range parallel for. + template + void addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, nd_range ExecutionRange, + cl_kernel ClKernel = nullptr); + + // Adds explicit memory operation to this node, maps on handler fill method + template + void addExplicitMemOp(accessor &Dest, + T Src); + + // Adds explicit memory operation to this node, maps on handler copy method + template < + typename T_src, int dim_src, access::mode mode_src, + access::target tgt_src, typename T_dest, int dim_dest, + access::mode mode_dest, access::target tgt_dest, + access::placeholder isPlaceholder_src = access::placeholder::false_t, + access::placeholder isPlaceholder_dest = access::placeholder::false_t> + void addExplicitMemOp( + accessor Src, + accessor Dest); + + std::set &getRequirements() { return m_Bufs; } + + void addInteropArg(shared_ptr_class Ptr, size_t Size, int ArgIndex, + BufferReqPtr BufReq = nullptr); + + std::vector &getInteropArgs() { return m_InteropArgs; } + + 
CommandPtr getKernel() { return m_Kernel; } + + QueueImplPtr getQueue() { return m_Queue; } + +private: + // Contains buffer requirements for this node. + std::set m_Bufs; + // Contains arguments set via interoperability methods + std::vector m_InteropArgs; + // Represent execute kernel command. + CommandPtr m_Kernel; + + // SYCL queue for current command group. + QueueImplPtr m_Queue; + + // WORKAROUND. Id for mapping OpenCL buffer to OpenCL kernel argument. + size_t m_NextOCLIndex = 0; +}; + +class Scheduler { +public: + // Adds copying of the specified buffer_impl and waits for completion. + template + void copyBack(detail::buffer_impl &Buf); + + // Updates host data of the specified buffer_impl + template + void updateHost(detail::buffer_impl &Buf, + cl::sycl::event &Event); + + // Updates host data of the specified accessor + template + void updateHost(accessor &Acc, + cl::sycl::event &Event); + + // Frees the specified buffer_impl. + template + void removeBuffer(detail::buffer_impl &Buf); + + // Waits for the event passed. + void waitForEvent(EventImplPtr Event); + + // Adds new node to graph, creating an Alloca and MemMove commands if + // needed. 
+ cl::sycl::event addNode(Node NewNode); + + void print(std::ostream &Stream) const; + void printDot(std::ostream &Stream) const; + void dump() const { print(std::cout); } + + void dumpGraph() const { + std::fstream GraphDot("graph.dot", std::ios::out); + printDot(GraphDot); + } + + void dumpGraphForCommand(CommandPtr Cmd) const; + + void optimize() { parallelReadOpt(); } + + // Converts the following: + // + // ========= ========= ========= + // | kernel1 |<-| kernel2 |<--| kernel3 | + // | write A | | read A | | read A | + // ========= ========= ========= + // + // to: --------------------------- + // \/ | + // ========= ========= ========= + // | kernel1 |<-| kernel2 | | kernel3 | + // | write A | | read A | | read A | + // ========= ========= ========= + // + void parallelReadOpt(); + + static Scheduler &getInstance() { + static Scheduler instance; + return instance; + } + + enum DumpOptions { Text = 0, WholeGraph = 1, RunGraph = 2 }; + bool getDumpFlagValue(DumpOptions DumpOption); + +protected: + // TODO: Add releasing of OpenCL buffers. + + void enqueueAndWaitForCommand(CommandPtr Cmd); + + // Enqueues Cmd command and all its dependencies. + cl::sycl::event EnqueueCommand(CommandPtr Cmd); + + cl::sycl::event dispatch(CommandPtr Cmd); + + // Recursively generates dot records for the command passed and all that the + // command depends on. + void printGraphForCommand(CommandPtr Cmd, std::ostream &Stream) const; + +private: + Scheduler(); + ~Scheduler(); + std::array m_DumpOptions; + // Buffer that represents evolution of buffers - actions that is added + // for each buffer. + std::map, classcomp> m_BuffersEvolution; + // Events for tracking execution of kernels without requirements + std::vector m_EventsWithoutRequirements; + // TODO: At some point of time we should remove already processed commands. + // But we have to be sure that nobody will references them(thru events). 
+ + Scheduler(Scheduler const &) = delete; + Scheduler &operator=(Scheduler const &) = delete; +}; + +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/device.hpp b/sycl/include/CL/sycl/device.hpp new file mode 100644 index 000000000000..9b660d357f93 --- /dev/null +++ b/sycl/include/CL/sycl/device.hpp @@ -0,0 +1,116 @@ +//==------------------- device.hpp - SYCL device ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +// Forward declarations +class device_selector; + +// TODO: 4.6.4 Partitioning into multiple SYCL devices +// TODO: 4.6.4.2 Device information descriptors +// TODO: Make code thread-safe +class device { +public: + device(); + + explicit device(cl_device_id deviceId); + + explicit device(const device_selector &deviceSelector); + + bool operator==(const device &rhs) const { return impl == rhs.impl; } + + bool operator!=(const device &rhs) const { return !(*this == rhs); } + + device(const device &rhs) = default; + + device(device &&rhs) = default; + + device &operator=(const device &rhs) = default; + + device &operator=(device &&rhs) = default; + + cl_device_id get() const { return impl->get(); } + + bool is_host() const { return impl->is_host(); } + + bool is_cpu() const { return impl->is_cpu(); } + + bool is_gpu() const { return impl->is_gpu(); } + + bool is_accelerator() const { return impl->is_accelerator(); } + + platform get_platform() const { return impl->get_platform(); } + + // Available only when prop == info::partition_property::partition_equally + template + typename std::enable_if<(prop == info::partition_property::partition_equally), + 
vector_class>::type + create_sub_devices(size_t ComputeUnits) const { + return impl->create_sub_devices(ComputeUnits); + } + + // Available only when prop == info::partition_property::partition_by_counts + template + typename std::enable_if<(prop == + info::partition_property::partition_by_counts), + vector_class>::type + create_sub_devices(const vector_class &Counts) const { + return impl->create_sub_devices(Counts); + } + + // Available only when prop == + // info::partition_property::partition_by_affinity_domain + template + typename std::enable_if< + (prop == info::partition_property::partition_by_affinity_domain), + vector_class>::type + create_sub_devices(info::partition_affinity_domain AffinityDomain) const { + return impl->create_sub_devices(AffinityDomain); + } + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + bool has_extension(const string_class &extension_name) const { + return impl->has_extension(extension_name); + } + + static vector_class + get_devices(info::device_type deviceType = info::device_type::all); + +private: + std::shared_ptr impl; + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); +}; + +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::device &d) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(d)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/device_event.hpp b/sycl/include/CL/sycl/device_event.hpp new file mode 100644 index 000000000000..9a057d39e999 --- /dev/null +++ b/sycl/include/CL/sycl/device_event.hpp @@ -0,0 +1,37 @@ +//==---------- device_event.hpp --- SYCL device event ---------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace cl { +namespace sycl { + +class device_event { +private: + cl::__spirv::OpTypeEvent *m_Event; + +public: + device_event(const device_event &rhs) = default; + device_event(device_event &&rhs) = default; + device_event &operator=(const device_event &rhs) = default; + device_event &operator=(device_event &&rhs) = default; + + device_event(cl::__spirv::OpTypeEvent *Event) : m_Event(Event) {} + + void wait() { + cl::__spirv::OpGroupWaitEvents(cl::__spirv::Scope::Workgroup, 1, + &m_Event); + } +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/device_selector.hpp b/sycl/include/CL/sycl/device_selector.hpp new file mode 100644 index 000000000000..1e70a55d44b4 --- /dev/null +++ b/sycl/include/CL/sycl/device_selector.hpp @@ -0,0 +1,55 @@ +//==------ device_selector.hpp - SYCL device selector ---------*- C++ --*---==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +// 4.6.1 Device selection class + +namespace cl { +namespace sycl { + +// Forward declarations +class device; + +class device_selector { +public: + virtual ~device_selector() = default; + + device select_device() const; + + virtual int operator()(const device &device) const = 0; +}; + +class default_selector : public device_selector { +public: + int operator()(const device &dev) const override; +}; + +class gpu_selector : public device_selector { +public: + int operator()(const device &dev) const override; +}; + +class cpu_selector : public device_selector { +public: + int operator()(const device &dev) const override; +}; + +class accelerator_selector : public device_selector { +public: + int operator()(const device &dev) const override; +}; + +class host_selector : public device_selector { +public: + int operator()(const device &dev) const override; +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/event.hpp b/sycl/include/CL/sycl/event.hpp new file mode 100644 index 000000000000..3ac11194bbf5 --- /dev/null +++ b/sycl/include/CL/sycl/event.hpp @@ -0,0 +1,83 @@ +//==---------------- event.hpp --- SYCL event ------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +namespace cl { +namespace sycl { +// Forward declaration +class context; +class event { +public: + event(); + + event(cl_event clEvent, const context &syclContext); + + event(const event &rhs) = default; + + event(event &&rhs) = default; + + event &operator=(const event &rhs) = default; + + event &operator=(event &&rhs) = default; + + bool operator==(const event &rhs) const; + + bool operator!=(const event &rhs) const; + + cl_event get(); + + bool is_host() const; + + void wait() const; + + // vector_class get_wait_list(); + + // static void wait(const vector_class &eventList); + + // void wait_and_throw(); + + // static void wait_and_throw(const vector_class &eventList); + + template + typename info::param_traits::return_type get_info() const; + + template + typename info::param_traits::return_type + get_profiling_info() const; + +private: + event(std::shared_ptr event_impl); + + std::shared_ptr impl; + + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); + + template + friend T detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); +}; + +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::event &e) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(e)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/exception.hpp b/sycl/include/CL/sycl/exception.hpp new file mode 100644 index 000000000000..67dd1d3242e3 --- /dev/null +++ b/sycl/include/CL/sycl/exception.hpp @@ -0,0 +1,115 @@ +//==---------------- exception.hpp - SYCL exception ------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +// 4.9.2 Exception Class Interface + +#include +#include +#include + +namespace cl { +namespace sycl { + +class context; + +struct exception { + exception() = default; + + const char *what() const noexcept { return msg.c_str(); } + bool has_context() const; + context get_context() const; + cl_int get_cl_code() const; + +private: + std::string msg = "Message not specified"; + cl_int cl_err = CL_SUCCESS; + shared_ptr_class Context; + +protected: + exception(const char *msg, int cl_err = CL_SUCCESS, + shared_ptr_class Context = nullptr) + : msg(std::string(msg) + " " + + ((cl_err == CL_SUCCESS) ? "" : OCL_CODE_TO_STR(cl_err))), + cl_err(cl_err), Context(Context) {} +}; + +class exception_list { + using list_t = vector_class; + list_t list; + +public: + using value_type = exception_ptr_class; + using reference = value_type &; + using const_reference = const value_type &; + using size_type = ::size_t; + using iterator = list_t::const_iterator; + using const_iterator = list_t::const_iterator; + + ::size_t size() const { return list.size(); } + + void clear() noexcept { + list.clear(); + } + + void push_back(const_reference value) { + list.push_back(value); + } + + void push_back(value_type&& value) { + list.push_back(std::move(value)); + } + + /** first asynchronous exception */ + iterator begin() const { return list.begin(); } + /** refer to past-the-end last asynchronous exception */ + iterator end() const { return list.end(); } + + bool operator==(const exception_list &rhs) const { return list == rhs.list; } + + bool operator!=(const exception_list &rhs) const { return !(*this == rhs); } +}; + +using async_handler = function_class; + +class runtime_error : public exception { +public: + runtime_error(const char *str, cl_int err = CL_SUCCESS) + : exception(str, err) {} +}; +class kernel_error : public runtime_error { + using runtime_error::runtime_error; +}; +class 
accessor_error : public runtime_error {}; +class nd_range_error : public runtime_error {}; +class event_error : public runtime_error {}; +class invalid_parameter_error : public runtime_error { + using runtime_error::runtime_error; +}; +class device_error : public exception { +public: + device_error(const char *str, cl_int err = CL_SUCCESS) + : exception(str, err) {} + device_error() : device_error("") {} +}; +class compile_program_error : public device_error { + using device_error::device_error; +}; +class link_program_error : public device_error {}; +class invalid_object_error : public device_error { + using device_error::device_error; +}; +class memory_allocation_error : public device_error {}; +class platform_error : public device_error {}; +class profiling_error : public device_error {}; +class feature_not_supported : public device_error {}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/group.hpp b/sycl/include/CL/sycl/group.hpp new file mode 100644 index 000000000000..969efaa104b5 --- /dev/null +++ b/sycl/include/CL/sycl/group.hpp @@ -0,0 +1,195 @@ +//==-------------- group.hpp --- SYCL work group ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +class Builder; +} // namespace detail + +template class group { +public: + group() = delete; + + id get_id() const { return index; } + + size_t get_id(int dimension) const { return index[dimension]; } + + range get_global_range() const { return globalRange; } + + size_t get_global_range(int dimension) const { + return globalRange[dimension]; + } + + range get_local_range() const { return localRange; } + + size_t get_local_range(int dimension) const { return localRange[dimension]; } + + range get_group_range() const { return localRange; } + + size_t get_group_range(int dimension) const { return localRange[dimension]; } + + size_t operator[](int dimension) const { return index[dimension]; } + + template + typename std::enable_if<(dims == 1), size_t>::type get_linear() const { + range groupNum = globalRange / localRange; + return index[0]; + } + + template + typename std::enable_if<(dims == 2), size_t>::type get_linear() const { + range groupNum = globalRange / localRange; + return index[1] * groupNum[0] + index[0]; + } + + template + typename std::enable_if<(dims == 3), size_t>::type get_linear() const { + range groupNum = globalRange / localRange; + return (index[2] * groupNum[1] * groupNum[0]) + (index[1] * groupNum[0]) + + index[0]; + } + + // template + // void parallel_for_work_item(workItemFunctionT func) const; + + // template + // void parallel_for_work_item(range flexibleRange, + // workItemFunctionT func) const; + + /// Executes a work-group mem-fence with memory ordering on the local address + /// space, global address space or both based on the value of \p accessSpace. 
+ template + void mem_fence(typename std::enable_if< + accessMode == access::mode::read || + accessMode == access::mode::write || + accessMode == access::mode::read_write, + access::fence_space>::type accessSpace = + access::fence_space::global_and_local) const { + uint32_t flags = ::cl::__spirv::MemorySemantics::SequentiallyConsistent; + switch (accessSpace) { + case access::fence_space::global_space: + flags |= cl::__spirv::MemorySemantics::CrossWorkgroupMemory; + break; + case access::fence_space::local_space: + flags |= cl::__spirv::MemorySemantics::WorkgroupMemory; + break; + case access::fence_space::global_and_local: + default: + flags |= cl::__spirv::MemorySemantics::CrossWorkgroupMemory | + cl::__spirv::MemorySemantics::WorkgroupMemory; + break; + } + // TODO: currently, there is no good way in SPIRV to set the memory + // barrier only for load operations or only for store operations. + // The full read-and-write barrier is used and the template parameter + // 'accessMode' is ignored for now. Either SPIRV or SYCL spec may be + // changed to address this discrepancy between SPIRV and SYCL, + // or if we decide that 'accessMode' is the important feature then + // we can fix this later, for example, by using OpenCL 1.2 functions + // read_mem_fence() and write_mem_fence(). 
+ cl::__spirv::OpMemoryBarrier(cl::__spirv::Scope::Workgroup, flags); + } + + template + device_event async_work_group_copy(local_ptr dest, + global_ptr src, + size_t numElements) const { + cl::__spirv::OpTypeEvent *e = + cl::__spirv::OpGroupAsyncCopyGlobalToLocal( + cl::__spirv::Scope::Workgroup, + dest.get(), src.get(), numElements, 1, 0); + return device_event(e); + } + + template + device_event async_work_group_copy(global_ptr dest, + local_ptr src, + size_t numElements) const { + cl::__spirv::OpTypeEvent *e = + cl::__spirv::OpGroupAsyncCopyLocalToGlobal( + cl::__spirv::Scope::Workgroup, + dest.get(), src.get(), numElements, 1, 0); + return device_event(e); + } + + template + device_event async_work_group_copy(local_ptr dest, + global_ptr src, + size_t numElements, + size_t srcStride) const { + cl::__spirv::OpTypeEvent *e = + cl::__spirv::OpGroupAsyncCopyGlobalToLocal( + cl::__spirv::Scope::Workgroup, + dest.get(), src.get(), numElements, srcStride, 0); + return device_event(e); + } + + template + device_event async_work_group_copy(global_ptr dest, + local_ptr src, + size_t numElements, + size_t destStride) const { + cl::__spirv::OpTypeEvent *e = + cl::__spirv::OpGroupAsyncCopyLocalToGlobal( + cl::__spirv::Scope::Workgroup, + dest.get(), src.get(), numElements, destStride, 0); + return device_event(e); + } + + template + void wait_for(eventTN... Events) const { + waitForHelper(Events...); + } + + bool operator==(const group &rhs) const { + return (rhs.globalRange == this->globalRange) && + (rhs.localRange == this->localRange) && (rhs.index == this->index); + } + + bool operator!=(const group &rhs) const { + return !((*this) == rhs); + } + +private: + range globalRange; + range localRange; + id index; + + void waitForHelper() const {} + + void waitForHelper(device_event Event) const { + Event.wait(); + } + + template + void waitForHelper(T E, Ts... 
Es) const { + waitForHelper(E); + waitForHelper(Es...); + } + +protected: + friend class detail::Builder; + group(const range &G, const range &L, + const id &I) + : globalRange(G), localRange(L), index(I) {} +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp new file mode 100644 index 000000000000..cfd909279619 --- /dev/null +++ b/sycl/include/CL/sycl/handler.hpp @@ -0,0 +1,691 @@ +//==-------- handler.hpp --- SYCL command group handler --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#ifdef __SYCL_DEVICE_ONLY__ +size_t get_global_size(uint dimindx); +size_t get_local_size(uint dimindx); +size_t get_global_id(uint dimindx); +size_t get_local_id(uint dimindx); +size_t get_global_offset(uint dimindx); +size_t get_group_id(uint dimindx); +#endif + +template +class __copy; + +template +class __update_host; + +template +class __fill; + +namespace cl { +namespace sycl { +// Forward declaration +class queue; + +template +class accessor; +template class buffer; +namespace detail { +class queue_impl; +template +class accessor_impl; + +template class buffer_impl; +// Type inference of first arg from a lambda +// auto fun = [&](item a) { a; }; +// lambda_arg_type value; # value type is item + +// Templated static declaration of a function whose single parameter is a +// pointer to a member function of type 'Func'. The member function must have +// 'RetType' return type, single argument of type 'Arg' and be declared with +// the 'const' qualifier. 
+template +static Arg member_ptr_helper(RetType (Func::*)(Arg) const); + +// Non-const version of the above template to match functors whose 'operator()' +// is declared w/o the 'const' qualifier. +template +static Arg member_ptr_helper(RetType (Func::*)(Arg)); + +template +decltype(member_ptr_helper(&F::operator())) argument_helper(F); + +template +using lambda_arg_type = decltype(argument_helper(std::declval())); + +} // namespace detail + +template +class accessor_base; + +template +class accessor; + +// 4.8.3 Command group handler class +class handler { + template + friend class accessor; + + template + friend class detail::accessor_impl; + + template + friend class detail::buffer_impl; + + friend class detail::queue_impl; + +protected: + simple_scheduler::Node m_Node; + bool isHost = false; + unique_ptr_class m_Finalized; + // TODO: Obtain is host information from Queue when we split queue_impl + // interface and implementation. + handler(std::shared_ptr Queue, bool host) + : m_Node(std::move(Queue)), isHost(host) {} + + event finalize() { + if (!m_Finalized) { + event *Event = + new event(simple_scheduler::Scheduler::getInstance().addNode( + std::move(m_Node))); + m_Finalized.reset(Event); + } + return *m_Finalized.get(); + } + + ~handler() = default; + + bool is_host() { return isHost; } + + template + void AddBufDep(detail::buffer_impl &Buf) { + m_Node.addBufRequirement(Buf); + } + + template + void setArgsHelper(int ArgIndex, T &&Arg, Ts &&... 
Args) { + set_arg(ArgIndex, std::move(Arg)); + setArgsHelper(++ArgIndex, std::move(Args)...); + } + + void setArgsHelper(int ArgIndex) {} + + template + void setArgHelper(int argIndex, accessor &&arg) { + m_Node.addAccRequirement(std::move(arg), argIndex); + } + + template void setArgHelper(int argIndex, T &&arg) { + using Type = typename std::remove_reference::type; + shared_ptr_class Ptr = std::make_shared(std::move(arg)); + m_Node.addInteropArg(Ptr, sizeof(T), argIndex); + } + + // TODO: implement when sampler class is ready + // void setArgHelper(int argIndex, sampler &&arg) {} + + void verifySyclKernelInvoc(const kernel &SyclKernel) { + if (is_host()) { + throw invalid_object_error( + "This kernel invocation method cannot be used on the host"); + } + if (SyclKernel.is_host()) { + throw invalid_object_error("Invalid kernel type, OpenCL expected"); + } + } + + // This dummy functor is passed to Node::addKernel in SYCL kernel + // parallel_for invocation with range. + template struct DummyFunctor { + void operator()(id) {} + }; + + // Method provides unified getting of the range from an accessor, because + // 1 dimension accessor has no get_range method according to the SYCL + // specification + template + struct getAccessorRangeHelper { + static range + getAccessorRange(const accessor &Acc) { + return Acc.get_range(); + } + }; + + template + struct getAccessorRangeHelper { + static range<1> + getAccessorRange(const accessor &Acc) { + return range<1>(Acc.get_count()); + } + }; + +public: + handler(const handler &) = delete; + handler(handler &&) = delete; + handler &operator=(const handler &) = delete; + handler &operator=(handler &&) = delete; + + // template + // void require(accessor acc); + + // OpenCL interoperability interface + template void set_arg(int argIndex, T &&arg) { + setArgHelper(argIndex, std::move(arg)); + } + + template void set_args(Ts &&... 
args) { + setArgsHelper(0, std::move(args)...); + } + +#ifdef __SYCL_DEVICE_ONLY__ + template + __attribute__((sycl_kernel)) void kernel_single_task(KernelType kernelFunc) { + kernelFunc(); + } +#endif + + // Kernel dispatch API + // Kernel is represented as a lambda. + template + void single_task(KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_single_task(kernelFunc); +#else + using KI = cl::sycl::detail::KernelInfo; + m_Node.addKernel(KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc)); +#endif + } + + // Kernel is represented as a functor - simply redirect to the lambda-based + // form of invocation, setting kernel name type to the functor type. + template + void single_task(KernelFunctorType KernelFunctor) { + single_task(KernelFunctor); + } + +#ifdef __SYCL_DEVICE_ONLY__ + template + __attribute__((sycl_kernel)) void kernel_parallel_for( + typename std::enable_if, + id>::value && + (dimensions > 0 && dimensions < 4), + KernelType>::type kernelFunc) { + id global_id; + for (int i = 0; i < dimensions; ++i) { + global_id[i] = get_global_id(i); + } + kernelFunc(global_id); + } + + template + __attribute__((sycl_kernel)) void kernel_parallel_for( + typename std::enable_if, + item>::value && + (dimensions > 0 && dimensions < 4), + KernelType>::type kernelFunc) { + id global_id; + range global_size; + for (int i = 0; i < dimensions; ++i) { + global_id[i] = get_global_id(i); + global_size[i] = get_global_size(i); + } + item Item = + detail::Builder::createItem(global_size, global_id); + kernelFunc(Item); + } + + template + __attribute__((sycl_kernel)) void kernel_parallel_for( + typename std::enable_if, + nd_item>::value && + (dimensions > 0 && dimensions < 4), + KernelType>::type kernelFunc) { + range global_size; + range local_size; + id group_id; + id global_id; + id local_id; + id global_offset; + + for (int i = 0; i < dimensions; ++i) { + global_size[i] = get_global_size(i); + local_size[i] = get_local_size(i); + 
group_id[i] = get_group_id(i); + global_id[i] = get_global_id(i); + local_id[i] = get_local_id(i); + global_offset[i] = get_global_offset(i); + } + + group Group = detail::Builder::createGroup( + global_size, local_size, group_id); + item globalItem = + detail::Builder::createItem(global_size, global_id, + global_offset); + item localItem = + detail::Builder::createItem(local_size, local_id); + nd_item Nd_item = + detail::Builder::createNDItem(globalItem, localItem, Group); + + kernelFunc(Nd_item); + } +#endif + + template + void parallel_for(range numWorkItems, KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + using KI = cl::sycl::detail::KernelInfo; + m_Node + .addKernel>( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), numWorkItems); +#endif + } + + // The version for a functor kernel. + template + void parallel_for(range numWorkItems, KernelType kernelFunc) { + parallel_for(numWorkItems, kernelFunc); + } + + // The version with an offset + template + void parallel_for(range numWorkItems, + id workItemOffset, KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + using KI = cl::sycl::detail::KernelInfo; + m_Node + .addKernel>( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), numWorkItems, workItemOffset); +#endif + } + + template + void parallel_for(nd_range executionRange, + KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + using KI = cl::sycl::detail::KernelInfo; + m_Node.addKernel( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), executionRange); +#endif + } + + // The version for a functor kernel. 
+ template + void parallel_for(nd_range executionRange, + KernelType kernelFunc) { + + parallel_for(executionRange, + kernelFunc); + } + + // template + // void parallel_for_work_group(range numWorkGroups, + // WorkgroupFunctionType kernelFunc); + + // template + // void parallel_for_work_group(range numWorkGroups, + // range workGroupSize, + // WorkgroupFunctionType kernelFunc); + + // The kernel invocation methods below have no functors and cannot be + // called on host. + // TODO current workaround passes dummy functors to Node::addKernel. + // A better way of adding kernels to scheduler if they cannot be run on host + // would be preferrable. + void single_task(kernel syclKernel) { + verifySyclKernelInvoc(syclKernel); + std::function DummyLambda = []() {}; + m_Node.addKernel(syclKernel.get_info(), 0, + nullptr, std::move(DummyLambda), syclKernel.get()); + } + + template + void parallel_for(range numWorkItems, kernel syclKernel) { + verifySyclKernelInvoc(syclKernel); + m_Node.addKernel, dimensions, id>( + syclKernel.get_info(), 0, nullptr, + DummyFunctor(), numWorkItems, syclKernel.get()); + } + + template + void parallel_for(range numWorkItems, + id workItemOffset, kernel syclKernel) { + verifySyclKernelInvoc(syclKernel); + m_Node.addKernel, dimensions, id>( + syclKernel.get_info(), 0, nullptr, + DummyFunctor(), numWorkItems, workItemOffset, + syclKernel.get()); + } + + template + void parallel_for(nd_range ndRange, kernel syclKernel) { + verifySyclKernelInvoc(syclKernel); + m_Node.addKernel( + syclKernel.get_info(), 0, nullptr, + [](nd_item) {}, ndRange, syclKernel.get()); + } + + // Note: the kernel invocation methods below are only planned to be added + // to the spec as of v1.2.1 rev. 3, despite already being present in SYCL + // conformance tests. 
+ + template + void single_task(kernel syclKernel, KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_single_task(kernelFunc); +#else + cl_kernel clKernel = nullptr; + if (!is_host()) { + clKernel = syclKernel.get(); + } + using KI = cl::sycl::detail::KernelInfo; + m_Node.addKernel(KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), clKernel); +#endif + } + + // The version for a functor kernel. + template + void single_task(kernel syclKernel, KernelType kernelFunc) { + single_task(syclKernel, kernelFunc); + } + + template + void parallel_for(range numWorkItems, kernel syclKernel, + KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + cl_kernel clKernel = nullptr; + if (!is_host()) { + clKernel = syclKernel.get(); + } + using KI = cl::sycl::detail::KernelInfo; + m_Node + .addKernel>( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), numWorkItems, clKernel); +#endif + } + + // The version for a functor kernel. 
+ template + void parallel_for(range numWorkItems, kernel syclKernel, + KernelType kernelFunc) { + + parallel_for(numWorkItems, syclKernel, + kernelFunc); + } + + template + void parallel_for(range numWorkItems, + id workItemOffset, kernel syclKernel, + KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + cl_kernel clKernel = nullptr; + if (!is_host()) { + clKernel = syclKernel.get(); + } + using KI = cl::sycl::detail::KernelInfo; + m_Node + .addKernel>( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), numWorkItems, workItemOffset, clKernel); +#endif + } + + template + void parallel_for(nd_range ndRange, kernel syclKernel, + KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + cl_kernel clKernel = nullptr; + if (!is_host()) { + clKernel = syclKernel.get(); + } + using KI = cl::sycl::detail::KernelInfo; + m_Node.addKernel( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), ndRange, clKernel); +#endif + } + + // The version for a functor kernel. 
+ template + void parallel_for(nd_range ndRange, kernel syclKernel, + KernelType kernelFunc) { + parallel_for(ndRange, syclKernel, + kernelFunc); + } + + // template + // void parallel_for_work_group(range num_work_groups, kernel + // syclKernel, WorkgroupFunctionType kernelFunc); + + // template + // void parallel_for_work_group(range num_work_groups, + // range work_group_size, kernel syclKernel, WorkgroupFunctionType + // kernelFunc); + + // Explicit copy operations API + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + copy(accessor src, + shared_ptr_class dest) { + range Range = + getAccessorRangeHelper::getAccessorRange(src); + // TODO use buffer_allocator when it is possible + buffer> Buffer( + (shared_ptr_class)dest, Range, + {property::buffer::use_host_ptr()}); + accessor + DestAcc(Buffer, *this); + copy(src, DestAcc); + } + + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + copy(shared_ptr_class src, + accessor dest) { + range Range = + getAccessorRangeHelper::getAccessorRange(dest); + // TODO use buffer_allocator when it is possible + buffer> Buffer( + (shared_ptr_class)src, Range, + {property::buffer::use_host_ptr()}); + accessor + SrcAcc(Buffer, *this); + copy(SrcAcc, dest); + } + + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + copy(accessor src, T_dest *dest) { + range Range = + getAccessorRangeHelper::getAccessorRange(src); + // TODO use buffer_allocator when it is possible + buffer> Buffer( + (T_src *)dest, Range, {property::buffer::use_host_ptr()}); + accessor + DestAcc(Buffer, *this); + copy(src, DestAcc); + } + + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + copy(const T_src *src, accessor dest) { + 
range Range = + getAccessorRangeHelper::getAccessorRange(dest); + // TODO use buffer_allocator when it is possible + buffer> Buffer( + (T_dest *)src, Range, {property::buffer::use_host_ptr()}); + accessor + SrcAcc(Buffer, *this); + copy(SrcAcc, dest); + } + + template < + typename T_src, int dim_src, access::mode mode_src, + access::target tgt_src, typename T_dest, int dim_dest, + access::mode mode_dest, access::target tgt_dest, + access::placeholder isPlaceholder_src = access::placeholder::false_t, + access::placeholder isPlaceholder_dest = access::placeholder::false_t> + typename std::enable_if<((tgt_src == access::target::global_buffer || + tgt_src == access::target::constant_buffer) && + (tgt_dest == access::target::global_buffer || + tgt_dest == access::target::constant_buffer)), + void>::type + copy(accessor src, + accessor + dest) { + if (isHost) { + range Range = + getAccessorRangeHelper::getAccessorRange(src); + parallel_for< + class __copy< + T_src, dim_src, mode_src, tgt_src, T_dest, dim_dest, mode_dest, + tgt_dest, isPlaceholder_src, isPlaceholder_dest> + >(Range, [=](id Index) { + dest[Index] = src[Index]; + }); + } else { +#ifndef __SYCL_DEVICE_ONLY__ + m_Node.addExplicitMemOp<>(src, dest); +#endif + } + finalize(); + // force wait. 
+ } + + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + update_host(accessor acc) { +#ifndef __SYCL_DEVICE_ONLY__ + assert(!m_Finalized && "The final event of this handler must not be set."); + event *Event = new event; + simple_scheduler::Scheduler::getInstance().updateHost(acc, *Event); + m_Finalized.reset(Event); +#endif + } + + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + fill(accessor dest, const T &src) { + // TODO add check:T must be an integral scalar value or a SYCL vector type + if (!isHost && dim == 1) { +#ifndef __SYCL_DEVICE_ONLY__ + m_Node.addExplicitMemOp<>(dest, src); +#endif + } else { + // TODO multidimensional case with offset is not supported. + // Fix it when parallel_for with offset is implemented + range Range = + getAccessorRangeHelper::getAccessorRange(dest); + parallel_for>(Range, + [=](id Index) { + dest[Index] = src; + }); + } + } +}; +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/id.hpp b/sycl/include/CL/sycl/id.hpp new file mode 100644 index 000000000000..326e0021f789 --- /dev/null +++ b/sycl/include/CL/sycl/id.hpp @@ -0,0 +1,571 @@ +//==----------- id.hpp --- SYCL iteration id -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace cl { +namespace sycl { +template class range; +template struct id : public detail::array { +public: + using base = detail::array; + INLINE_IF_DEVICE id() = default; + + /* The following constructor is only available in the id struct + * specialization where: dimensions==1 */ + template + id(typename std::enable_if<(N == 1), size_t>::type dim0) : base(dim0) {} + + template + id(typename std::enable_if<(N == 1), const range &>::type + range_size) + : base(range_size.get(0)) {} + + template + id(typename std::enable_if<(N == 1), const item &>::type item) + : base(item.get_id(0)) {} + + /* The following constructor is only available in the id struct + * specialization where: dimensions==2 */ + template + id(typename std::enable_if<(N == 2), size_t>::type dim0, size_t dim1) + : base(dim0, dim1) {} + + template + id(typename std::enable_if<(N == 2), const range &>::type + range_size) + : base(range_size.get(0), range_size.get(1)) {} + + template + id(typename std::enable_if<(N == 2), const item &>::type item) + : base(item.get_id(0), item.get_id(1)) {} + + /* The following constructor is only available in the id struct + * specialization where: dimensions==3 */ + template + id(typename std::enable_if<(N == 3), size_t>::type dim0, size_t dim1, + size_t dim2) + : base(dim0, dim1, dim2) {} + + template + id(typename std::enable_if<(N == 3), const range &>::type + range_size) + : base(range_size.get(0), range_size.get(1), range_size.get(2)) {} + + template + id(typename std::enable_if<(N == 3), const item &>::type item) + : base(item.get_id(0), item.get_id(1), item.get_id(2)) {} + + explicit operator range() const { + range result; + for (int i = 0; i < dimensions; ++i) { + result[i] = this->get(i); + } + return result; + } + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, &&, ||, <, >, <=, >= + id operator+(const id &rhs) const { + 
id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] + rhs.common_array[i]; + } + return result; + } + id operator-(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] - rhs.common_array[i]; + } + return result; + } + id operator*(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] * rhs.common_array[i]; + } + return result; + } + id operator/(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] / rhs.common_array[i]; + } + return result; + } + id operator%(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] % rhs.common_array[i]; + } + return result; + } + id operator<<(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] << rhs.common_array[i]; + } + return result; + } + id operator>>(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >> rhs.common_array[i]; + } + return result; + } + id operator&(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] & rhs.common_array[i]; + } + return result; + } + id operator|(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] | rhs.common_array[i]; + } + return result; + } + id operator^(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] ^ rhs.common_array[i]; + } + return result; + } + id operator&&(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] && 
rhs.common_array[i]; + } + return result; + } + id operator||(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] || rhs.common_array[i]; + } + return result; + } + id operator<(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] < rhs.common_array[i]; + } + return result; + } + id operator>(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] > rhs.common_array[i]; + } + return result; + } + id operator<=(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] <= rhs.common_array[i]; + } + return result; + } + id operator>=(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >= rhs.common_array[i]; + } + return result; + } + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, &&, ||, <, >, <=, >= + id operator+(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] + rhs; + } + return result; + } + id operator-(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] - rhs; + } + return result; + } + id operator*(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] * rhs; + } + return result; + } + id operator/(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] / rhs; + } + return result; + } + id operator%(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] % rhs; + } + return result; + } + id operator<<(const size_t &rhs) const { + id 
result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] << rhs; + } + return result; + } + id operator>>(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >> rhs; + } + return result; + } + id operator&(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] & rhs; + } + return result; + } + id operator|(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] | rhs; + } + return result; + } + id operator^(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] ^ rhs; + } + return result; + } + id operator&&(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] && rhs; + } + return result; + } + id operator||(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] || rhs; + } + return result; + } + id operator<(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] < rhs; + } + return result; + } + id operator>(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] > rhs; + } + return result; + } + id operator<=(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] <= rhs; + } + return result; + } + id operator>=(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >= rhs; + } + return result; + } + + // OP is: +=, -=, *=, /=, %=, <<=, >>=, &=, |=, ^= + id &operator+=(const id &rhs) { 
+ for (int i = 0; i < dimensions; ++i) { + this->common_array[i] += rhs[i]; + } + return *this; + } + id &operator-=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] -= rhs.common_array[i]; + } + return *this; + } + id &operator*=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] *= rhs.common_array[i]; + } + return *this; + } + id &operator/=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] /= rhs.common_array[i]; + } + return *this; + } + id &operator%=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] %= rhs.common_array[i]; + } + return *this; + } + id &operator<<=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] <<= rhs.common_array[i]; + } + return *this; + } + id &operator>>=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] >>= rhs.common_array[i]; + } + return *this; + } + id &operator&=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] &= rhs.common_array[i]; + } + return *this; + } + id &operator|=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] |= rhs.common_array[i]; + } + return *this; + } + id &operator^=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] ^= rhs.common_array[i]; + } + return *this; + } + + // OP is: +=, -=, *=, /=, %=, <<=, >>=, &=, |=, ^= + id &operator+=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] += rhs; + } + return *this; + } + id &operator-=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] -= rhs; + } + return *this; + } + id &operator*=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] *= rhs; + } + return *this; + } + id &operator/=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] /= rhs; + 
} + return *this; + } + id &operator%=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] %= rhs; + } + return *this; + } + id &operator<<=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] <<= rhs; + } + return *this; + } + id &operator>>=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] >>= rhs; + } + return *this; + } + id &operator&=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] &= rhs; + } + return *this; + } + id &operator|=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] |= rhs; + } + return *this; + } + id &operator^=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] ^= rhs; + } + return *this; + } + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, <, >, <=, >=, &&, || + friend id operator+(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs + rhs.common_array[i]; + } + return result; + } + friend id operator-(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs - rhs.common_array[i]; + } + return result; + } + friend id operator*(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs * rhs.common_array[i]; + } + return result; + } + friend id operator/(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs / rhs.common_array[i]; + } + return result; + } + friend id operator%(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs % rhs.common_array[i]; + } + return result; + } + friend id operator<<(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + 
result.common_array[i] = lhs << rhs.common_array[i]; + } + return result; + } + friend id operator>>(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs >> rhs.common_array[i]; + } + return result; + } + friend id operator&(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs & rhs.common_array[i]; + } + return result; + } + friend id operator|(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs | rhs.common_array[i]; + } + return result; + } + friend id operator^(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs ^ rhs.common_array[i]; + } + return result; + } + friend id operator<(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs < rhs.common_array[i]; + } + return result; + } + friend id operator>(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs > rhs.common_array[i]; + } + return result; + } + friend id operator<=(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs <= rhs.common_array[i]; + } + return result; + } + friend id operator>=(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs >= rhs.common_array[i]; + } + return result; + } + friend id operator&&(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs && rhs.common_array[i]; + } + return result; + } + friend id operator||(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs || rhs.common_array[i]; + } + 
return result; + } +}; + +namespace detail { +template INLINE_IF_DEVICE +size_t getOffsetForId(range Range, id Id, + id Offset) { + size_t offset = 0; + for (int i = 0; i < dimensions; ++i) + offset = offset * Range[i] + Offset[i] + Id[i]; + return offset; +} +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/image.hpp b/sycl/include/CL/sycl/image.hpp new file mode 100644 index 000000000000..fdbcdd1723a0 --- /dev/null +++ b/sycl/include/CL/sycl/image.hpp @@ -0,0 +1,158 @@ +//==------------ image.hpp -------------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace cl { +namespace sycl { + +using byte = unsigned char; + +using image_allocator = std::allocator; + +template class range; + +template +class image { +public: + image(image_channel_order order, image_channel_type type, + const range &range, const property_list &propList = {}) { + impl = std::make_shared>( + order, type, range, propList); + } + + //image(image_channel_order order, image_channel_type type, + //const range &range, AllocatorT allocator, + //const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(image_channel_order order, image_channel_type type, + //const range &range, const range &pitch, + //const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(image_channel_order order, image_channel_type type, + //const range &range, const range &pitch, + //AllocatorT allocator, const property_list &propList = {}); + + //image(void *hostPointer, image_channel_order order, image_channel_type type, + //const range &range, const property_list &propList = {}); + + //image(void *hostPointer, image_channel_order 
order, image_channel_type type, + //const range &range, AllocatorT allocator, + //const property_list &propList = {}); + + //image(const void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const property_list &propList = {}); + + //image(const void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //AllocatorT allocator, const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(void *hostPointer, image_channel_order order, image_channel_type type, + //const range &range, range &pitch, + //const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(void *hostPointer, image_channel_order order, image_channel_type type, + //const range &range, range &pitch, + //AllocatorT allocator, const property_list &propList = {}); + + //image(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const property_list &propList = {}); + + //image(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //AllocatorT allocator, const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const range &pitch, const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const range &pitch, AllocatorT allocator, + //const property_list &propList = {}); + + image(cl_mem clMemObject, const context &syclContext, + event availableEvent = {}); + + image(const image &rhs) = default; + + image(image &&rhs) = default; + + image &operator=(const image &rhs) = default; + + image &operator=(image &&rhs) = default; + + ~image() = default; + + bool 
operator==(const image &rhs) const { return impl == rhs.impl; } + + bool operator!=(const image &rhs) const { return !(*this == rhs); } + + /* -- common interface members -- */ + + /* -- property interface members -- */ + + range get_range() const { return impl->get_range(); } + + /* Available only when: dimensions > 1 */ + range get_pitch() const { return impl->get_pitch(); } + + size_t get_size() const { return impl->get_size(); } + + size_t get_count() const { return impl->get_count(); } + + AllocatorT get_allocator() const { return impl->get_allocator(); } + + template + accessor + get_access(handler &commandGroupHandler) { + return impl->template get_access(); + } + + template + accessor + get_access() { + return impl->template get_access(); + } + + //template + //void set_final_data(Destination finalData = std::nullptr); + + void set_write_back(bool flag = true) { impl->set_write_back(flag); } + +private: + shared_ptr_class> impl; + template + friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject); +}; + +} // namespace sycl +} // namespace cl + +namespace std { +template +struct hash> { + size_t operator()(const cl::sycl::image &i) const { + return hash>>()(i.impl); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp new file mode 100644 index 000000000000..264f3f4340ee --- /dev/null +++ b/sycl/include/CL/sycl/info/info_desc.hpp @@ -0,0 +1,382 @@ +//==------- info_desc.hpp - SYCL information descriptors -------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace cl { +namespace sycl { + +class program; +class device; +class platform; + +namespace info { + +// Information descriptors +// A.1 Platform information descriptors +enum class platform : cl_platform_info { + profile = CL_PLATFORM_PROFILE, + version = CL_PLATFORM_VERSION, + name = CL_PLATFORM_NAME, + vendor = CL_PLATFORM_VENDOR, + extensions = CL_PLATFORM_EXTENSIONS +}; + +// A.2 Context information desctiptors +enum class context : cl_context_info { + reference_count = CL_CONTEXT_REFERENCE_COUNT, + platform = CL_CONTEXT_PLATFORM, + devices = CL_CONTEXT_DEVICES, +}; + +// A.3 Device information descriptors +enum class device : cl_device_info { + device_type = CL_DEVICE_TYPE, + vendor_id = CL_DEVICE_VENDOR_ID, + max_compute_units = CL_DEVICE_MAX_COMPUTE_UNITS, + max_work_item_dimensions = CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, + max_work_item_sizes = CL_DEVICE_MAX_WORK_ITEM_SIZES, + max_work_group_size = CL_DEVICE_MAX_WORK_GROUP_SIZE, + + preferred_vector_width_char = CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, + preferred_vector_width_short = CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, + preferred_vector_width_int = CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, + preferred_vector_width_long = CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, + preferred_vector_width_float = CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, + preferred_vector_width_double = CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, + preferred_vector_width_half = CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, + + native_vector_width_char = CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, + native_vector_width_short = CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, + native_vector_width_int = CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, + native_vector_width_long = CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, + native_vector_width_float = CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, + native_vector_width_double = CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, + 
native_vector_width_half = CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, + + max_clock_frequency = CL_DEVICE_MAX_CLOCK_FREQUENCY, + address_bits = CL_DEVICE_ADDRESS_BITS, + max_mem_alloc_size = CL_DEVICE_MAX_MEM_ALLOC_SIZE, + image_support = CL_DEVICE_IMAGE_SUPPORT, + max_read_image_args = CL_DEVICE_MAX_READ_IMAGE_ARGS, + max_write_image_args = CL_DEVICE_MAX_WRITE_IMAGE_ARGS, + image2d_max_width = CL_DEVICE_IMAGE2D_MAX_WIDTH, + image2d_max_height = CL_DEVICE_IMAGE2D_MAX_HEIGHT, + image3d_max_width = CL_DEVICE_IMAGE3D_MAX_WIDTH, + image3d_max_height = CL_DEVICE_IMAGE3D_MAX_HEIGHT, + image3d_max_depth = CL_DEVICE_IMAGE3D_MAX_DEPTH, + image_max_buffer_size = CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, + image_max_array_size = CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, + max_samplers = CL_DEVICE_MAX_SAMPLERS, + max_parameter_size = CL_DEVICE_MAX_PARAMETER_SIZE, + mem_base_addr_align = CL_DEVICE_MEM_BASE_ADDR_ALIGN, + half_fp_config = CL_DEVICE_HALF_FP_CONFIG, + single_fp_config = CL_DEVICE_SINGLE_FP_CONFIG, + double_fp_config = CL_DEVICE_DOUBLE_FP_CONFIG, + global_mem_cache_type = CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, + global_mem_cache_line_size = CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, + global_mem_cache_size = CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, + global_mem_size = CL_DEVICE_GLOBAL_MEM_SIZE, + max_constant_buffer_size = CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, + max_constant_args = CL_DEVICE_MAX_CONSTANT_ARGS, + local_mem_type = CL_DEVICE_LOCAL_MEM_TYPE, + local_mem_size = CL_DEVICE_LOCAL_MEM_SIZE, + error_correction_support = CL_DEVICE_ERROR_CORRECTION_SUPPORT, + host_unified_memory = CL_DEVICE_HOST_UNIFIED_MEMORY, + profiling_timer_resolution = CL_DEVICE_PROFILING_TIMER_RESOLUTION, + is_endian_little = CL_DEVICE_ENDIAN_LITTLE, + is_available = CL_DEVICE_AVAILABLE, + is_compiler_available = CL_DEVICE_COMPILER_AVAILABLE, + is_linker_available = CL_DEVICE_LINKER_AVAILABLE, + execution_capabilities = CL_DEVICE_EXECUTION_CAPABILITIES, + queue_profiling = CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES, + built_in_kernels = 
CL_DEVICE_BUILT_IN_KERNELS, + platform = CL_DEVICE_PLATFORM, + name = CL_DEVICE_NAME, + vendor = CL_DEVICE_VENDOR, + driver_version = CL_DRIVER_VERSION, + profile = CL_DEVICE_PROFILE, + version = CL_DEVICE_VERSION, + opencl_c_version = CL_DEVICE_OPENCL_C_VERSION, + extensions = CL_DEVICE_EXTENSIONS, + printf_buffer_size = CL_DEVICE_PRINTF_BUFFER_SIZE, + preferred_interop_user_sync = CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, + parent_device = CL_DEVICE_PARENT_DEVICE, + partition_max_sub_devices = CL_DEVICE_PARTITION_MAX_SUB_DEVICES, + partition_properties = CL_DEVICE_PARTITION_PROPERTIES, + partition_affinity_domains = CL_DEVICE_PARTITION_AFFINITY_DOMAIN, + partition_type_affinity_domain = CL_DEVICE_PARTITION_TYPE, + reference_count = CL_DEVICE_REFERENCE_COUNT, + max_num_sub_groups = CL_DEVICE_MAX_NUM_SUB_GROUPS, + sub_group_independent_forward_progress = + CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, + partition_type_property +}; + +enum class device_type : cl_device_type { + cpu = CL_DEVICE_TYPE_CPU, + gpu = CL_DEVICE_TYPE_GPU, + accelerator = CL_DEVICE_TYPE_ACCELERATOR, + custom = CL_DEVICE_TYPE_CUSTOM, + automatic, + host, + all = CL_DEVICE_TYPE_ALL +}; + +enum class partition_property : cl_device_partition_property { + partition_equally = CL_DEVICE_PARTITION_EQUALLY, + partition_by_counts = CL_DEVICE_PARTITION_BY_COUNTS, + partition_by_affinity_domain = CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + no_partition +}; + +enum class partition_affinity_domain : cl_device_affinity_domain { + not_applicable = 0, + numa = CL_DEVICE_AFFINITY_DOMAIN_NUMA, + L4_cache = CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE, + L3_cache = CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE, + L2_cache = CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE, + L1_cache = CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE, + next_partitionable = CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE +}; + +enum class local_mem_type : int { none, local, global }; + +enum class fp_config : cl_device_fp_config { + denorm = CL_FP_DENORM, + inf_nan = 
CL_FP_INF_NAN, + round_to_nearest = CL_FP_ROUND_TO_NEAREST, + round_to_zero = CL_FP_ROUND_TO_ZERO, + round_to_inf = CL_FP_ROUND_TO_INF, + fma = CL_FP_FMA, + correctly_rounded_divide_sqrt, + soft_float +}; + +enum class global_mem_cache_type : int { none, read_only, write_only }; + +enum class execution_capability : unsigned int { + exec_kernel, + exec_native_kernel +}; + +// A.4 Queue information desctiptors +enum class queue : cl_command_queue_info { + context = CL_QUEUE_CONTEXT, + device = CL_QUEUE_DEVICE, + reference_count = CL_QUEUE_REFERENCE_COUNT +}; + +// A.5 Kernel information desctiptors +enum class kernel : cl_kernel_info { + function_name = CL_KERNEL_FUNCTION_NAME, + num_args = CL_KERNEL_NUM_ARGS, + context = CL_KERNEL_CONTEXT, + program = CL_KERNEL_PROGRAM, + reference_count = CL_KERNEL_REFERENCE_COUNT, + attributes = CL_KERNEL_ATTRIBUTES +}; + +enum class kernel_work_group : cl_kernel_work_group_info { + global_work_size = CL_KERNEL_GLOBAL_WORK_SIZE, + work_group_size = CL_KERNEL_WORK_GROUP_SIZE, + compile_work_group_size = CL_KERNEL_COMPILE_WORK_GROUP_SIZE, + preferred_work_group_size_multiple = + CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, + private_mem_size = CL_KERNEL_PRIVATE_MEM_SIZE +}; + +enum class kernel_sub_group : cl_kernel_sub_group_info { + max_sub_group_size_for_ndrange = CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + sub_group_count_for_ndrange = CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, + local_size_for_sub_group_count = CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, + max_num_sub_groups = CL_KERNEL_MAX_NUM_SUB_GROUPS, + compile_num_sub_groups = CL_KERNEL_COMPILE_NUM_SUB_GROUPS +}; + +// A.6 Program information desctiptors +enum class program : cl_program_info { + context = CL_PROGRAM_CONTEXT, + devices = CL_PROGRAM_DEVICES, + reference_count = CL_PROGRAM_REFERENCE_COUNT +}; + +// A.7 Event information desctiptors +enum class event : cl_event_info { + reference_count = CL_EVENT_REFERENCE_COUNT, + command_execution_status = 
CL_EVENT_COMMAND_EXECUTION_STATUS +}; + +enum class event_command_status : cl_int { + submitted = CL_SUBMITTED, + running = CL_RUNNING, + complete = CL_COMPLETE +}; + +enum class event_profiling : cl_profiling_info { + command_submit = CL_PROFILING_COMMAND_SUBMIT, + command_start = CL_PROFILING_COMMAND_START, + command_end = CL_PROFILING_COMMAND_END +}; + +// Provide an alias to the return type for each of the info parameters +template class param_traits {}; + +#define PARAM_TRAITS_SPEC(param_type, param, ret_type) \ + template <> class param_traits { \ + public: \ + using return_type = ret_type; \ + }; + +#define PARAM_TRAITS_SPEC_WITH_INPUT(param_type, param, ret_type, in_type) \ + template <> class param_traits { \ + public: \ + using return_type = ret_type; \ + using input_type = in_type; \ + }; + +PARAM_TRAITS_SPEC(device, device_type, device_type) +PARAM_TRAITS_SPEC(device, vendor_id, cl_uint) +PARAM_TRAITS_SPEC(device, max_compute_units, cl_uint) +PARAM_TRAITS_SPEC(device, max_work_item_dimensions, cl_uint) +PARAM_TRAITS_SPEC(device, max_work_item_sizes, id<3>) +PARAM_TRAITS_SPEC(device, max_work_group_size, size_t) +PARAM_TRAITS_SPEC(device, preferred_vector_width_char, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_short, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_int, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_long, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_float, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_double, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_half, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_char, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_short, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_int, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_long, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_float, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_double, cl_uint) 
+PARAM_TRAITS_SPEC(device, native_vector_width_half, cl_uint) +PARAM_TRAITS_SPEC(device, max_clock_frequency, cl_uint) +PARAM_TRAITS_SPEC(device, address_bits, cl_uint) +PARAM_TRAITS_SPEC(device, max_mem_alloc_size, cl_ulong) +PARAM_TRAITS_SPEC(device, image_support, bool) +PARAM_TRAITS_SPEC(device, max_read_image_args, cl_uint) +PARAM_TRAITS_SPEC(device, max_write_image_args, cl_uint) +PARAM_TRAITS_SPEC(device, image2d_max_width, size_t) +PARAM_TRAITS_SPEC(device, image2d_max_height, size_t) +PARAM_TRAITS_SPEC(device, image3d_max_width, size_t) +PARAM_TRAITS_SPEC(device, image3d_max_height, size_t) +PARAM_TRAITS_SPEC(device, image3d_max_depth, size_t) +PARAM_TRAITS_SPEC(device, image_max_buffer_size, size_t) +PARAM_TRAITS_SPEC(device, image_max_array_size, size_t) +PARAM_TRAITS_SPEC(device, max_samplers, cl_uint) +PARAM_TRAITS_SPEC(device, max_parameter_size, size_t) +PARAM_TRAITS_SPEC(device, mem_base_addr_align, cl_uint) +PARAM_TRAITS_SPEC(device, half_fp_config, vector_class) +PARAM_TRAITS_SPEC(device, single_fp_config, vector_class) +PARAM_TRAITS_SPEC(device, double_fp_config, vector_class) +PARAM_TRAITS_SPEC(device, global_mem_cache_type, info::global_mem_cache_type) +PARAM_TRAITS_SPEC(device, global_mem_cache_line_size, cl_uint) +PARAM_TRAITS_SPEC(device, global_mem_cache_size, cl_ulong) +PARAM_TRAITS_SPEC(device, global_mem_size, cl_ulong) +PARAM_TRAITS_SPEC(device, max_constant_buffer_size, cl_ulong) +PARAM_TRAITS_SPEC(device, max_constant_args, cl_uint) +PARAM_TRAITS_SPEC(device, local_mem_type, info::local_mem_type) +PARAM_TRAITS_SPEC(device, local_mem_size, cl_ulong) +PARAM_TRAITS_SPEC(device, error_correction_support, bool) +PARAM_TRAITS_SPEC(device, host_unified_memory, bool) +PARAM_TRAITS_SPEC(device, profiling_timer_resolution, size_t) +PARAM_TRAITS_SPEC(device, is_endian_little, bool) +PARAM_TRAITS_SPEC(device, is_available, bool) +PARAM_TRAITS_SPEC(device, is_compiler_available, bool) +PARAM_TRAITS_SPEC(device, is_linker_available, bool) 
+PARAM_TRAITS_SPEC(device, execution_capabilities, + vector_class) +PARAM_TRAITS_SPEC(device, queue_profiling, bool) +PARAM_TRAITS_SPEC(device, built_in_kernels, vector_class) +PARAM_TRAITS_SPEC(device, platform, cl::sycl::platform) +PARAM_TRAITS_SPEC(device, name, string_class) +PARAM_TRAITS_SPEC(device, vendor, string_class) +PARAM_TRAITS_SPEC(device, driver_version, string_class) +PARAM_TRAITS_SPEC(device, profile, string_class) +PARAM_TRAITS_SPEC(device, version, string_class) +PARAM_TRAITS_SPEC(device, opencl_c_version, string_class) +PARAM_TRAITS_SPEC(device, extensions, vector_class) +PARAM_TRAITS_SPEC(device, printf_buffer_size, size_t) +PARAM_TRAITS_SPEC(device, preferred_interop_user_sync, bool) +PARAM_TRAITS_SPEC(device, parent_device, cl::sycl::device) +PARAM_TRAITS_SPEC(device, partition_max_sub_devices, cl_uint) +PARAM_TRAITS_SPEC(device, partition_properties, + vector_class) +PARAM_TRAITS_SPEC(device, partition_affinity_domains, + vector_class) +PARAM_TRAITS_SPEC(device, partition_type_property, info::partition_property) +PARAM_TRAITS_SPEC(device, partition_type_affinity_domain, + info::partition_affinity_domain) +PARAM_TRAITS_SPEC(device, reference_count, cl_uint) +PARAM_TRAITS_SPEC(device, max_num_sub_groups, cl_uint) +PARAM_TRAITS_SPEC(device, sub_group_independent_forward_progress, bool) + +PARAM_TRAITS_SPEC(context, reference_count, cl_uint) +PARAM_TRAITS_SPEC(context, platform, cl::sycl::platform) +PARAM_TRAITS_SPEC(context, devices, vector_class) + +PARAM_TRAITS_SPEC(event, command_execution_status, event_command_status) +PARAM_TRAITS_SPEC(event, reference_count, cl_uint) + +PARAM_TRAITS_SPEC(event_profiling, command_submit, cl_ulong) +PARAM_TRAITS_SPEC(event_profiling, command_start, cl_ulong) +PARAM_TRAITS_SPEC(event_profiling, command_end, cl_ulong) + +PARAM_TRAITS_SPEC(kernel, function_name, string_class) +PARAM_TRAITS_SPEC(kernel, num_args, cl_uint) +PARAM_TRAITS_SPEC(kernel, reference_count, cl_uint) +PARAM_TRAITS_SPEC(kernel, 
attributes, string_class) +// Shilei: The following two traits are not covered in the current version of +// CTS (SYCL-1.2.1/master) +PARAM_TRAITS_SPEC(kernel, context, cl::sycl::context) +PARAM_TRAITS_SPEC(kernel, program, cl::sycl::program) + +PARAM_TRAITS_SPEC(kernel_work_group, compile_work_group_size, + cl::sycl::range<3>) +PARAM_TRAITS_SPEC(kernel_work_group, global_work_size, cl::sycl::range<3>) +PARAM_TRAITS_SPEC(kernel_work_group, preferred_work_group_size_multiple, size_t) +PARAM_TRAITS_SPEC(kernel_work_group, private_mem_size, cl_ulong) +PARAM_TRAITS_SPEC(kernel_work_group, work_group_size, size_t) + +PARAM_TRAITS_SPEC_WITH_INPUT(kernel_sub_group, max_sub_group_size_for_ndrange, + size_t, cl::sycl::range<3>) +PARAM_TRAITS_SPEC_WITH_INPUT(kernel_sub_group, sub_group_count_for_ndrange, + size_t, cl::sycl::range<3>) +PARAM_TRAITS_SPEC_WITH_INPUT(kernel_sub_group, local_size_for_sub_group_count, + cl::sycl::range<3>, size_t) +PARAM_TRAITS_SPEC(kernel_sub_group, max_num_sub_groups, size_t) +PARAM_TRAITS_SPEC(kernel_sub_group, compile_num_sub_groups, size_t) + +PARAM_TRAITS_SPEC(platform, profile, string_class) +PARAM_TRAITS_SPEC(platform, version, string_class) +PARAM_TRAITS_SPEC(platform, name, string_class) +PARAM_TRAITS_SPEC(platform, vendor, string_class) +PARAM_TRAITS_SPEC(platform, extensions, vector_class) + +PARAM_TRAITS_SPEC(program, context, cl::sycl::context) +PARAM_TRAITS_SPEC(program, devices, vector_class) +PARAM_TRAITS_SPEC(program, reference_count, cl_uint) + +PARAM_TRAITS_SPEC(queue, reference_count, cl_uint) +PARAM_TRAITS_SPEC(queue, context, cl::sycl::context) +PARAM_TRAITS_SPEC(queue, device, cl::sycl::device) + +#undef PARAM_TRAITS_SPEC + +} // namespace info +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/intel/sub_group.hpp b/sycl/include/CL/sycl/intel/sub_group.hpp new file mode 100644 index 000000000000..7897b0749f6a --- /dev/null +++ b/sycl/include/CL/sycl/intel/sub_group.hpp @@ -0,0 +1,428 @@ 
+//==----------- sub_group.hpp --- SYCL sub-group ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#ifdef __SYCL_DEVICE_ONLY__ +#define __NOEXCEPT noexcept +namespace cl { +namespace __spirv { +extern size_t BuiltInSubgroupLocalInvocationId() __NOEXCEPT; +extern size_t BuiltInSubgroupSize() __NOEXCEPT; +extern size_t BuiltInSubgroupMaxSize() __NOEXCEPT; +extern size_t BuiltInSubgroupId() __NOEXCEPT; +extern size_t BuiltInNumSubgroups() __NOEXCEPT; +extern size_t BuiltInNumEnqueuedSubgroups() __NOEXCEPT; +} // namespace __spirv +} // namespace cl + +// TODO: rework to use SPIRV +typedef uint uint2 __attribute__((ext_vector_type(2))); +typedef uint uint3 __attribute__((ext_vector_type(3))); +typedef uint uint4 __attribute__((ext_vector_type(4))); +typedef uint uint8 __attribute__((ext_vector_type(8))); +typedef ushort ushort2 __attribute__((ext_vector_type(2))); +typedef ushort ushort3 __attribute__((ext_vector_type(3))); +typedef ushort ushort4 __attribute__((ext_vector_type(4))); +typedef ushort ushort8 __attribute__((ext_vector_type(8))); +size_t get_sub_group_local_id(); // BuiltInSubgroupLocalInvocationId +size_t get_sub_group_size(); // BuiltInSubgroupSize +size_t get_max_sub_group_size(); // BuiltInSubgroupMaxSize +size_t get_sub_group_id(); // BuiltInSubgroupId +size_t get_num_sub_groups(); // BuiltInNumSubgroups +size_t get_enqueued_num_sub_groups(); // BuiltInNumEnqueuedSubgroups +int sub_group_any(int); +int sub_group_all(int); +int sub_group_broadcast(int x, uint sub_grou_local_id); +int sub_group_reduce_min(int x); +int sub_group_reduce_max(int x); +int sub_group_reduce_add(int x); +int sub_group_scan_exclusive_add(int x); +int sub_group_scan_exclusive_max(int x); +int 
sub_group_scan_exclusive_min(int x); +int sub_group_scan_inclusive_add(int x); +int sub_group_scan_inclusive_max(int x); +int sub_group_scan_inclusive_min(int x); +int intel_sub_group_shuffle(int data, uint c); +int intel_sub_group_shuffle_up(int prev, int cur, uint c); +int intel_sub_group_shuffle_down(int cur, int next, uint c); +int intel_sub_group_shuffle_xor(int data, uint c); +uint intel_sub_group_block_read(const __global uint *p); +uint2 intel_sub_group_block_read2(const __global uint *p); +uint4 intel_sub_group_block_read4(const __global uint *p); +uint8 intel_sub_group_block_read8(const __global uint *p); +void intel_sub_group_block_write(__global uint *p, uint data); +void intel_sub_group_block_write2(__global uint *p, uint2 data); +void intel_sub_group_block_write4(__global uint *p, uint4 data); +void intel_sub_group_block_write8(__global uint *p, uint8 data); + +ushort intel_sub_group_block_read_us(const __global ushort *p); +ushort2 intel_sub_group_block_read_us2(const __global ushort *p); +ushort4 intel_sub_group_block_read_us4(const __global ushort *p); +ushort8 intel_sub_group_block_read_us8(const __global ushort *p); +void intel_sub_group_block_write_us(__global ushort *p, ushort data); +void intel_sub_group_block_write_us2(__global ushort *p, ushort2 data); +void intel_sub_group_block_write_us4(__global ushort *p, ushort4 data); +void intel_sub_group_block_write_us8(__global ushort *p, ushort8 data); +void sub_group_barrier(cl::sycl::detail::cl_mem_fence_flags flags); + +namespace cl { +namespace sycl { +template class multi_ptr; +namespace intel { + +enum class Operation { exclusive_scan, inclusive_scan, reduce }; + +struct minimum { + Operation o; + minimum(Operation op) : o(op) {} + template T operator()(T x) { + switch (o) { + case Operation::exclusive_scan: { + return sub_group_scan_exclusive_min(x); + } + case Operation::inclusive_scan: { + return sub_group_scan_inclusive_min(x); + } + case Operation::reduce: { + return 
sub_group_reduce_min(x); + } + } + } +}; + +struct maximum { + Operation o; + maximum(Operation op) : o(op) {} + template T operator()(T x) { + switch (o) { + case Operation::exclusive_scan: { + return sub_group_scan_exclusive_max(x); + } + case Operation::inclusive_scan: { + return sub_group_scan_inclusive_max(x); + } + case Operation::reduce: { + return sub_group_reduce_max(x); + } + } + } +}; + +struct plus { + Operation o; + plus(Operation op) : o(op) {} + template T operator()(T x) { + switch (o) { + case Operation::exclusive_scan: { + return sub_group_scan_exclusive_add(x); + } + case Operation::inclusive_scan: { + return sub_group_scan_inclusive_add(x); + } + case Operation::reduce: { + return sub_group_reduce_add(x); + } + } + } +}; +struct sub_group { + /* --- common interface members --- */ + + id<1> get_local_id() const { + return get_sub_group_local_id(); //*cl::__spirv::BuiltInSubgroupLocalInvocationId(); + } + range<1> get_local_range() const { + return get_sub_group_size(); // cl::__spirv::BuiltInSubgroupSize(); + } + + range<1> get_max_local_range() const { + return get_max_sub_group_size(); // cl::__spirv::BuiltInSubgroupMaxSize(); + } + + id<1> get_group_id() const { + return get_sub_group_id(); // cl::__spirv::BuiltInSubgroupId(); + } + + size_t get_group_range() const { + return get_num_sub_groups(); // cl::__spirv::BuiltInNumSubgroups(); + } + + size_t get_uniform_group_range() const { + return get_enqueued_num_sub_groups(); // cl::__spirv::BuiltInNumEnqueuedSubgroups(); + } + + /* --- vote / ballot functions --- */ + + bool any(bool predicate) { return sub_group_any(predicate); } + + bool all(bool predicate) { return sub_group_all(predicate); } + + /* --- collectives --- */ + + template T broadcast(T x, id<1> local_id) { + return sub_group_broadcast(x, local_id.get(0)); + } + + template T reduce(T x) { + BinaryOperation o(Operation::reduce); + return o(x); + } + + template T exclusive_scan(T x) { + BinaryOperation o(Operation::exclusive_scan); 
+ return o(x); + } + + template T inclusive_scan(T x) { + BinaryOperation o(Operation::inclusive_scan); + return o(x); + } + + /* --- one - input shuffles --- */ + /* indices in [0 , sub - group size ) */ + + template T shuffle(T x, id<1> local_id) { + return intel_sub_group_shuffle(x, local_id.get(0)); + } + + template T shuffle_down(T x, uint32_t delta) { + return intel_sub_group_shuffle_down(x, x, delta); + } + + template T shuffle_up(T x, uint32_t delta) { + return intel_sub_group_shuffle_up(x, x, delta); + } + + template T shuffle_xor(T x, id<1> value) { + return intel_sub_group_shuffle_xor(x, value.get(0)); + } + + /* --- two - input shuffles --- */ + /* indices in [0 , 2* sub - group size ) */ + template T shuffle(T x, T y, id<1> local_id) { + return intel_sub_group_shuffle_down( + x, y, local_id.get(0) - get_local_id().get(0)); + } + + template T shuffle_down(T current, T next, uint32_t delta) { + return intel_sub_group_shuffle_down(current, next, delta); + } + template T shuffle_up(T previous, T current, uint32_t delta) { + return intel_sub_group_shuffle_up(previous, current, delta); + } + + /* --- sub - group load / stores --- */ + /* these can map to SIMD or block read / write hardware where available */ + + template + typename std::enable_if::type + load(const multi_ptr src) { + uint t = intel_sub_group_block_read((const __global uint *)src.get()); + return *((T *)&t); + } + + template + typename std::enable_if::type + load(const multi_ptr src) { + ushort t = + intel_sub_group_block_read_us((const __global ushort *)src.get()); + return *((T *)&t); + } + + template + typename std::enable_if::type + load(const multi_ptr src) { + uint t = intel_sub_group_block_read((const __global uint *)src.get()); + return *((T *)&t); + } + + template + typename std::enable_if::type + load(const multi_ptr src) { + uint t = intel_sub_group_block_read_us((const __global ushort *)src.get()); + return *((T *)&t); + } + + template + vec::type, N> + load(const multi_ptr src) { 
+ uint2 t = intel_sub_group_block_read2((const __global uint *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + vec::type, + N> + load(const multi_ptr src) { + ushort2 t = + intel_sub_group_block_read_us2((const __global ushort *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + vec::type, N> + load(const multi_ptr src) { + uint4 t = intel_sub_group_block_read4((const __global uint *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + vec::type, + N> + load(const multi_ptr src) { + ushort4 t = + intel_sub_group_block_read_us4((const __global ushort *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + vec::type, N> + load(const multi_ptr src) { + uint8 t = intel_sub_group_block_read8((const __global uint *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + vec::type, + N> + load(const multi_ptr src) { + ushort8 t = + intel_sub_group_block_read_us8((const __global ushort *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + void + store(multi_ptr dst, + const typename std::enable_if::type &x) { + intel_sub_group_block_write((__global uint *)dst.get(), *((uint *)&x)); + } + + template + void store( + multi_ptr dst, + const typename std::enable_if::type &x) { + intel_sub_group_block_write_us((__global ushort *)dst.get(), + *((ushort *)&x)); + } + + template + void store(multi_ptr dst, + const typename std::enable_if::type &x) { + intel_sub_group_block_write((__global uint *)dst.get(), *((uint *)&x)); + } + + template + void + store(multi_ptr dst, + const typename std::enable_if::type &x) { + intel_sub_group_block_write_us((__global ushort *)dst.get(), + *((ushort *)&x)); + } + + template + void store( + multi_ptr dst, + const vec< + typename std::enable_if::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write2((__global uint *)dst.get(), *((uint2 *)&t)); + } + template + void + store(multi_ptr dst, + 
const vec::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write_us2((__global ushort *)dst.get(), + *((ushort2 *)&t)); + } + + template + void store( + multi_ptr dst, + const vec< + typename std::enable_if::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write4((__global uint *)dst.get(), *((uint4 *)&t)); + } + + template + void + store(multi_ptr dst, + const vec::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write_us4((__global ushort *)dst.get(), + *((ushort4 *)&t)); + } + + template + void store( + multi_ptr dst, + const vec< + typename std::enable_if::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write8((__global uint *)dst.get(), *((uint8 *)&t)); + } + + template + void + store(multi_ptr dst, + const vec::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write_us8((__global ushort *)dst.get(), + *((ushort8 *)&t)); + } + + /* --- synchronization functions --- */ + void barrier(access::fence_space accessSpace = + access::fence_space::global_and_local) const { + cl::sycl::detail::cl_mem_fence_flags flags; + switch (accessSpace) { + case access::fence_space::local_space: + flags = cl::sycl::detail::CLK_LOCAL_MEM_FENCE; + break; + case access::fence_space::global_space: + flags = cl::sycl::detail::CLK_GLOBAL_MEM_FENCE; + break; + case access::fence_space::global_and_local: + default: + flags = cl::sycl::detail::CLK_LOCAL_MEM_FENCE | + cl::sycl::detail::CLK_GLOBAL_MEM_FENCE; + break; + } + ::sub_group_barrier(flags); + } + +protected: + template friend struct cl::sycl::nd_item; + sub_group() = default; +}; +} // namespace intel +} // namespace sycl +} // namespace cl +#else +#include +#endif diff --git a/sycl/include/CL/sycl/intel/sub_group_host.hpp b/sycl/include/CL/sycl/intel/sub_group_host.hpp new file mode 100644 index 000000000000..f9c43e1cbf79 --- /dev/null +++ b/sycl/include/CL/sycl/intel/sub_group_host.hpp @@ -0,0 +1,147 @@ +//==- 
sub_group_host.hpp --- SYCL sub-group for host device ---------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#ifndef __SYCL_DEVICE_ONLY__ + +namespace cl { +namespace sycl { +template class multi_ptr; +namespace intel { +struct minimum {}; +struct maximum {}; +struct plus {}; + +struct sub_group { + /* --- common interface members --- */ + + id<1> get_local_id() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + range<1> get_local_range() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + + range<1> get_max_local_range() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + + id<1> get_group_id() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + + size_t get_group_range() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + + size_t get_uniform_group_range() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + + /* --- vote / ballot functions --- */ + + bool any(bool predicate) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + bool all(bool predicate) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + /* --- collectives --- */ + + template T broadcast(T x, id<1> local_id) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template T reduce(T x) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template T exclusive_scan(T x) { + throw runtime_error("Subgroups are not supported on host device. 
"); + } + + template T inclusive_scan(T x) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + /* --- one - input shuffles --- */ + /* indices in [0 , sub - group size ) */ + + template T shuffle(T x, id<1> local_id) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template T shuffle_down(T x, uint32_t delta) { + throw runtime_error("Subgroups are not supported on host device. "); + } + template T shuffle_up(T x, uint32_t delta) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template T shuffle_xor(T x, id<1> value) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + /* --- two - input shuffles --- */ + /* indices in [0 , 2* sub - group size ) */ + template T shuffle(T x, T y, id<1> local_id) { + throw runtime_error("Subgroups are not supported on host device. "); + } + template T shuffle_down(T current, T next, uint32_t delta) { + throw runtime_error("Subgroups are not supported on host device. "); + } + template T shuffle_up(T previous, T current, uint32_t delta) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + /* --- sub - group load / stores --- */ + /* these can map to SIMD or block read / write hardware where available */ + template + T load(const multi_ptr src) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template + vec load(const multi_ptr src) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template + void store(multi_ptr dst, T &x) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template + void store(multi_ptr dst, const vec &x) { + throw runtime_error("Subgroups are not supported on host device. 
"); + } + + /* --- synchronization functions --- */ + void barrier(access::fence_space accessSpace = + access::fence_space::global_and_local) const { + throw runtime_error("Subgroups are not supported on host device. "); + } + +protected: + template friend struct cl::sycl::nd_item; + sub_group() { + throw runtime_error("Subgroups are not supported on host device. "); + } +}; +} // namespace intel +} // namespace sycl +} // namespace cl +#endif diff --git a/sycl/include/CL/sycl/item.hpp b/sycl/include/CL/sycl/item.hpp new file mode 100644 index 000000000000..a6c860d73d5d --- /dev/null +++ b/sycl/include/CL/sycl/item.hpp @@ -0,0 +1,110 @@ +//==------------ item.hpp --- SYCL iteration item --------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +class Builder; +} +template struct id; +template struct range; +template struct item { + + item() = delete; + + id get_id() const { return index; } + + size_t get_id(int dimension) const { return index[dimension]; } + + size_t &operator[](int dimension) { return index[dimension]; } + + size_t operator[](int dimension) const { return index[dimension]; } + + range get_range() const { return extent; } + + size_t get_range(int dimension) const { return extent.get(dimension); } + + // only available if with_offset is true; + template ::type> + id get_offset() const { + return offset; + } + + template + operator typename std::enable_if>::type() + const { + return item(extent, index, offset); + } + + /* The following member function is only available in the id class + * specialization where: dimensions>0 and dimensions<4 */ + template 0) && (N < 4))>::type> + size_t get_linear_id() const { + if (1 == 
dimensions) { + return index[0] - offset[0]; + } + if (2 == dimensions) { + return (index[0] - offset[0]) * extent[1] + (index[1] - offset[1]); + } + return ((index[0] - offset[0]) * extent[1] * extent[2]) + + ((index[1] - offset[1]) * extent[2]) + (index[2] - offset[2]); + } + + item(const item &rhs) = + default; + + item(item &&rhs) = default; + + item & + operator=(const item &rhs) = default; + + item & + operator=(item &&rhs) = default; + + bool operator==(const item &rhs) const { + return (rhs.index == this->index) && (rhs.extent == this->extent) && + (rhs.offset == this->offset); + } + + bool operator!=(const item &rhs) const { + return !((*this) == rhs); + } + +protected: + // For call constructor inside conversion operator + friend class item; + friend class detail::Builder; + + template + item(typename std::enable_if<(W == true), const range>::type &R, + const id &I, const id &O) + : extent(R), index(I), offset(O) {} + + template + item(typename std::enable_if<(W == false), const range>::type &R, + const id &I) + : extent(R), index(I), offset() {} + +private: + range extent; + id index; + id offset; +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/kernel.hpp b/sycl/include/CL/sycl/kernel.hpp new file mode 100644 index 000000000000..ca3d7aae19c3 --- /dev/null +++ b/sycl/include/CL/sycl/kernel.hpp @@ -0,0 +1,94 @@ +//==--------------- kernel.hpp --- SYCL kernel -----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +namespace cl { +namespace sycl { +// Forward declaration +class program; +class context; + +class kernel { + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); + template + friend T detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); + +public: + kernel(cl_kernel clKernel, const context &syclContext) + : impl(std::make_shared(clKernel, syclContext)) {} + + kernel(const kernel &rhs) = default; + + kernel(kernel &&rhs) = default; + + kernel &operator=(const kernel &rhs) = default; + + kernel &operator=(kernel &&rhs) = default; + + bool operator==(const kernel &rhs) const { return impl == rhs.impl; } + + bool operator!=(const kernel &rhs) const { return !operator==(rhs); } + + cl_kernel get() const { return impl->get(); } + + bool is_host() const { return impl->is_host(); } + + context get_context() const { return impl->get_context(); } + + program get_program() const; + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + template + typename info::param_traits::return_type + get_work_group_info(const device &dev) const { + return impl->get_work_group_info(dev); + } + + template + typename info::param_traits::return_type + get_sub_group_info(const device &dev) const { + return impl->get_sub_group_info(dev); + } + + template + typename info::param_traits::return_type + get_sub_group_info(const device &dev, + typename info::param_traits::input_type val) const { + return impl->get_sub_group_info(dev, val); + } + +private: + kernel(std::shared_ptr impl) : impl(impl) {} + + std::shared_ptr impl; +}; +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::kernel &k) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(k)); + } +}; +} // namespace std diff --git 
a/sycl/include/CL/sycl/macro.hpp b/sycl/include/CL/sycl/macro.hpp new file mode 100644 index 000000000000..526bf234c9a0 --- /dev/null +++ b/sycl/include/CL/sycl/macro.hpp @@ -0,0 +1,11 @@ +//==-------------- macro.hpp - SYCL macro header ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#define CL_SYCL_LANGUAGE_VERSION 121 diff --git a/sycl/include/CL/sycl/math.hpp b/sycl/include/CL/sycl/math.hpp new file mode 100644 index 000000000000..0b88b3de55ff --- /dev/null +++ b/sycl/include/CL/sycl/math.hpp @@ -0,0 +1,307 @@ +//==----------- math.hpp - SYCL math functions ------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include + +#ifdef __SYCL_DEVICE_ONLY__ + +#define CONCAT_HELP(a, b) a##b +#define CONCAT(a, b) CONCAT_HELP(a, b) + +#define SCALAR(type) CONCAT(CONCAT(__, type), _t) +#define VECTOR(type, len) CONCAT(CONCAT(CONCAT(__, type), len), _vec_t) + +#define MAKE_FUN_OF_1_ARG(name, ret_ty, arg_1_ty) ret_ty name(arg_1_ty); + +#define MAKE_FUN_OF_2_ARG(name, ret_ty, arg_1_ty, arg_2_ty) \ + ret_ty name(arg_1_ty, arg_2_ty); + +#define MAKE_FUN_OF_3_ARG(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) \ + ret_ty name(arg_1_ty, arg_2_ty, arg_3_ty); + +#define GEN_FUNC_OF_ONE_ARG_V(name, ret_ty, arg_1_ty) \ + MAKE_FUN_OF_1_ARG(name, VECTOR(ret_ty, 2), VECTOR(arg_1_ty, 2)) \ + MAKE_FUN_OF_1_ARG(name, VECTOR(ret_ty, 3), VECTOR(arg_1_ty, 3)) \ + MAKE_FUN_OF_1_ARG(name, VECTOR(ret_ty, 4), VECTOR(arg_1_ty, 4)) \ + MAKE_FUN_OF_1_ARG(name, VECTOR(ret_ty, 8), VECTOR(arg_1_ty, 8)) \ + MAKE_FUN_OF_1_ARG(name, VECTOR(ret_ty, 16), VECTOR(arg_1_ty, 16)) + +#define GEN_FUNC_OF_TWO_ARG_V(name, ret_ty, arg_1_ty, arg_2_ty) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 2), VECTOR(arg_1_ty, 2), \ + VECTOR(arg_2_ty, 2)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 3), VECTOR(arg_1_ty, 3), \ + VECTOR(arg_2_ty, 3)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 4), VECTOR(arg_1_ty, 4), \ + VECTOR(arg_2_ty, 4)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 8), VECTOR(arg_1_ty, 8), \ + VECTOR(arg_2_ty, 8)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 16), VECTOR(arg_1_ty, 16), \ + VECTOR(arg_2_ty, 16)) + +#define GEN_FUNC_OF_THREE_ARG_V(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) \ + MAKE_FUN_OF_3_ARG(name, VECTOR(ret_ty, 2), VECTOR(arg_1_ty, 2), \ + VECTOR(arg_2_ty, 2), VECTOR(arg_3_ty, 2)) \ + MAKE_FUN_OF_3_ARG(name, VECTOR(ret_ty, 3), VECTOR(arg_1_ty, 3), \ + VECTOR(arg_2_ty, 3), VECTOR(arg_3_ty, 3)) \ + MAKE_FUN_OF_3_ARG(name, VECTOR(ret_ty, 4), VECTOR(arg_1_ty, 4), \ + VECTOR(arg_2_ty, 4), 
VECTOR(arg_3_ty, 4)) \ + MAKE_FUN_OF_3_ARG(name, VECTOR(ret_ty, 8), VECTOR(arg_1_ty, 8), \ + VECTOR(arg_2_ty, 8), VECTOR(arg_3_ty, 8)) \ + MAKE_FUN_OF_3_ARG(name, VECTOR(ret_ty, 16), VECTOR(arg_1_ty, 16), \ + VECTOR(arg_2_ty, 16), VECTOR(arg_3_ty, 16)) + +#define GEN_FUNC_OF_ONE_ARG_S(name, ret_ty, arg_1_ty) \ + MAKE_FUN_OF_1_ARG(name, SCALAR(ret_ty), SCALAR(arg_1_ty)) + +#define GEN_FUNC_OF_TWO_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty) \ + MAKE_FUN_OF_2_ARG(name, SCALAR(ret_ty), SCALAR(arg_1_ty), SCALAR(arg_2_ty)) + +#define GEN_FUNC_OF_THREE_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) \ + MAKE_FUN_OF_3_ARG(name, SCALAR(ret_ty), SCALAR(arg_1_ty), SCALAR(arg_2_ty), \ + SCALAR(arg_3_ty)) + +#define GEN_FUNC_OF_ONE_ARG(name, ret_ty, arg_1_ty) \ + GEN_FUNC_OF_ONE_ARG_S(name, ret_ty, arg_1_ty) \ + GEN_FUNC_OF_ONE_ARG_V(name, ret_ty, arg_1_ty) + +#define GEN_FUNC_OF_TWO_ARG(name, ret_ty, arg_1_ty, arg_2_ty) \ + GEN_FUNC_OF_TWO_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty) \ + GEN_FUNC_OF_TWO_ARG_V(name, ret_ty, arg_1_ty, arg_2_ty) + +#define GEN_FUNC_OF_THREE_ARG(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) \ + GEN_FUNC_OF_THREE_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) \ + GEN_FUNC_OF_THREE_ARG_V(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) + +#define GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 2), VECTOR(arg_1_ty, 2), \ + SCALAR(arg_2_ty)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 3), VECTOR(arg_1_ty, 3), \ + SCALAR(arg_2_ty)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 4), VECTOR(arg_1_ty, 4), \ + SCALAR(arg_2_ty)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 8), VECTOR(arg_1_ty, 8), \ + SCALAR(arg_2_ty)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 16), VECTOR(arg_1_ty, 16), \ + SCALAR(arg_2_ty)) + +#define GEN_FUNC_OF_TWO_ARG_S_SECOND_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty) \ + MAKE_FUN_OF_2_ARG(name, SCALAR(ret_ty), SCALAR(arg_1_ty), SCALAR(arg_2_ty)) + +#define 
GEN_FUNC_OF_TWO_ARG_SECOND_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty) \ + GEN_FUNC_OF_TWO_ARG_S_SECOND_ARG_S(name, ret_ty, arg_1_ty) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty) + +#define GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(name, ret_ty, arg_1_ty) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, char) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, uchar) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, short) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, ushort) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, int) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, uint) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, long) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, ulong) + +namespace cl { +namespace __spirv { +/* ----------------- 4.13.3 Math functions. Device version ------------------*/ +// TODO: Enable built-in functions with 'half' parameters once 'half' data type +/// is supported by the clang +// genfloat exp (genfloat x ) +GEN_FUNC_OF_ONE_ARG(exp, float, float) +GEN_FUNC_OF_ONE_ARG(exp, double, double) +// GEN_FUNC_OF_ONE_ARG(exp, half, half) + +// genfloat fmax (genfloat x, genfloat y) +GEN_FUNC_OF_TWO_ARG(fmax, float, float, float) +GEN_FUNC_OF_TWO_ARG(fmax, double, double, double) +// GEN_FUNC_OF_TWO_ARG(fmax, half, half, half) + +// genfloat fmax (genfloat x, sgenfloat y) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, float, float, float) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, double, double, float) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, half, half, float) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, float, float, double) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, double, double, double) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, half, half, double) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, float, float, half) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, double, double, half) +// 
GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, half, half, half) + +// genfloat fmin (genfloat x, genfloat y) +GEN_FUNC_OF_TWO_ARG(fmin, float, float, float) +GEN_FUNC_OF_TWO_ARG(fmin, double, double, double) +// GEN_FUNC_OF_TWO_ARG(fmin, half, half, half) + +// genfloat fmin (genfloat x, sgenfloat y) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, float, float, float) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, double, double, float) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, half, half, float) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, float, float, double) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, double, double, double) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, half, half, double) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, float, float, half) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, double, double, half) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, half, half, half) + +// genfloat sqrt (genfloat x) +GEN_FUNC_OF_ONE_ARG(sqrt, float, float) +GEN_FUNC_OF_ONE_ARG(sqrt, double, double) +// GEN_FUNC_OF_ONE_ARG(sqrt, half, half) + +// genfloatf log (genfloatf x) +GEN_FUNC_OF_ONE_ARG(log, float, float) + +// genfloatf sin (genfloatf x) +GEN_FUNC_OF_ONE_ARG(sin, float, float) + +// genfloatf cos (genfloatf x) +GEN_FUNC_OF_ONE_ARG(cos, float, float) + +// genfloat mad (genfloat a, genfloat b, genfloat c) +GEN_FUNC_OF_THREE_ARG(mad, float, float, float, float) +GEN_FUNC_OF_THREE_ARG(mad, double, double, double, double) +// GEN_FUNC_OF_THREE_ARG_V(mad, half, half, half, half) + +// genfloatf exp (genfloatf x) +GEN_FUNC_OF_ONE_ARG(native_exp, float, float) + +// genfloatf fabs (genfloatf x) +GEN_FUNC_OF_ONE_ARG(fabs, float, float) +GEN_FUNC_OF_ONE_ARG(fabs, double, double) +// GEN_FUNC_OF_ONE_ARG(fabs, half, half) + +/* --------------- 4.13.4 Integer functions. 
Device version -----------------*/ +// geninteger max (geninteger x, geninteger y) +GEN_FUNC_OF_TWO_ARG(max, char, char, char) +GEN_FUNC_OF_TWO_ARG(max, uchar, uchar, uchar) +GEN_FUNC_OF_TWO_ARG(max, short, short, short) +GEN_FUNC_OF_TWO_ARG(max, ushort, ushort, ushort) +GEN_FUNC_OF_TWO_ARG(max, int, int, int) +GEN_FUNC_OF_TWO_ARG(max, uint, uint, uint) +GEN_FUNC_OF_TWO_ARG(max, long, long, long) +GEN_FUNC_OF_TWO_ARG(max, ulong, ulong, ulong) + +// geninteger max (geninteger x, sgeninteger y) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, char, char) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, uchar, uchar) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, short, short) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, ushort, ushort) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, int, int) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, uint, uint) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, long, long) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, ulong, ulong) + +// geninteger min (geninteger x, geninteger y) +GEN_FUNC_OF_TWO_ARG(min, char, char, char) +GEN_FUNC_OF_TWO_ARG(min, uchar, uchar, uchar) +GEN_FUNC_OF_TWO_ARG(min, short, short, short) +GEN_FUNC_OF_TWO_ARG(min, ushort, ushort, ushort) +GEN_FUNC_OF_TWO_ARG(min, int, int, int) +GEN_FUNC_OF_TWO_ARG(min, uint, uint, uint) +GEN_FUNC_OF_TWO_ARG(min, long, long, long) +GEN_FUNC_OF_TWO_ARG(min, ulong, ulong, ulong) + +// geninteger min (geninteger x, sgeninteger y) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, char, char) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, uchar, uchar) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, short, short) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, ushort, ushort) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, int, int) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, uint, uint) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, long, long) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, ulong, 
ulong) +/* --------------- 4.13.5 Common functions. Device version ------------------*/ +/* --------------- 4.13.6 Geometric Functions. Device version ---------------*/ +/* --------------- 4.13.7 Relational functions. Device version --------------*/ +} // namespace __spirv +} // namespace cl + +#undef CONCAT_HELP +#undef CONCAT +#undef SCALAR +#undef VECTOR +#undef MAKE_FUN_OF_1_ARG +#undef MAKE_FUN_OF_2_ARG +#undef MAKE_FUN_OF_3_ARG +#undef GEN_FUNC_OF_ONE_ARG_V +#undef GEN_FUNC_OF_TWO_ARG_V +#undef GEN_FUNC_OF_THREE_ARG_V +#undef GEN_FUNC_OF_ONE_ARG_S +#undef GEN_FUNC_OF_TWO_ARG_S +#undef GEN_FUNC_OF_THREE_ARG_S +#undef GEN_FUNC_OF_ONE_ARG +#undef GEN_FUNC_OF_TWO_ARG +#undef GEN_FUNC_OF_THREE_ARG +#undef GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S +#undef GEN_FUNC_OF_TWO_ARG_S_SECOND_ARG_S +#undef GEN_FUNC_OF_TWO_ARG_SECOND_ARG_S +#undef GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER +#endif // __SYCL_DEVICE_ONLY__ + +#ifdef __SYCL_DEVICE_ONLY__ +namespace __sycl_std = cl::__spirv; +#else +namespace __sycl_std = std; +#endif + +namespace cl { +namespace sycl { +template T cos(T x) { + return __sycl_std::cos(x); +} +template T exp(T x) { + return __sycl_std::exp(x); +} +template T1 fmax(T1 x, T2 y) { + return __sycl_std::fmax(x, y); +} +template T1 fmin(T1 x, T2 y) { + return __sycl_std::fmin(x, y); +} +template T log(T x) { + return __sycl_std::log(x); +} +template T mad(T a, T b, T c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __sycl_std::mad(a, b, c); +#else + return (a * b) + c; +#endif +} +template T1 max(T1 x, T2 y) { + return __sycl_std::max(x, y); +} +template T1 min(T1 x, T2 y) { + return __sycl_std::min(x, y); +} +template T sin(T x) { + return __sycl_std::sin(x); +} +template T sqrt(T x) { + return __sycl_std::sqrt(x); +} +template T fabs(T x) { + return __sycl_std::fabs(x); +} +namespace native { +template T exp(T x) { +#ifdef __SYCL_DEVICE_ONLY__ + return __sycl_std::native_exp(x); +#else + return __sycl_std::exp(x); +#endif +} +} // namespace native +namespace 
half_precision {} // namespace half_precision +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/multi_ptr.hpp b/sycl/include/CL/sycl/multi_ptr.hpp new file mode 100644 index 000000000000..357aa1cd26f5 --- /dev/null +++ b/sycl/include/CL/sycl/multi_ptr.hpp @@ -0,0 +1,335 @@ +//==------------ multi_ptr.hpp - SYCL multi_ptr class ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +namespace cl { +namespace sycl { +template class multi_ptr { +public: + using element_type = ElementType; + using difference_type = std::ptrdiff_t; + + // Implementation defined pointer and reference types that correspond to + // SYCL/OpenCL interoperability types for OpenCL C functions + using pointer_t = typename detail::PtrValueType::type *; + using const_pointer_t = + typename detail::PtrValueType::type const *; + using reference_t = typename detail::PtrValueType::type &; + using const_reference_t = + typename detail::PtrValueType::type &; + + static constexpr access::address_space address_space = Space; + + // Constructors + multi_ptr() : m_Pointer(nullptr) {} + multi_ptr(const multi_ptr &rhs) = default; + multi_ptr(multi_ptr &&) = default; + multi_ptr(pointer_t pointer) : m_Pointer(pointer) {} +#ifdef __SYCL_DEVICE_ONLY__ + multi_ptr(ElementType *pointer) + : m_Pointer(reinterpret_cast(pointer)) { + // TODO An implementation should reject an argument if the deduced + // address space is not compatible with Space. 
+ } +#endif + multi_ptr(std::nullptr_t) : m_Pointer(nullptr) {} + ~multi_ptr() = default; + + // Assignment and access operators + multi_ptr &operator=(const multi_ptr &) = default; + multi_ptr &operator=(multi_ptr &&) = default; + multi_ptr &operator=(pointer_t pointer) { + m_Pointer = pointer; + return *this; + } +#ifdef __SYCL_DEVICE_ONLY__ + multi_ptr &operator=(ElementType *pointer) { + m_Pointer = reinterpret_cast(pointer); + // TODO An implementation should reject an argument if the deduced + // address space is not compatible with Space. + } +#endif + multi_ptr &operator=(std::nullptr_t) { + m_Pointer = nullptr; + return *this; + } + ElementType &operator*() const { + return *(reinterpret_cast(m_Pointer)); + } + ElementType *operator->() const { + return reinterpret_cast(m_Pointer); + } + ElementType &operator[](difference_type index) { + return *(reinterpret_cast(m_Pointer + index)); + } + ElementType operator[](difference_type index) const { + return *(reinterpret_cast(m_Pointer + index)); + } + + // Only if Space == global_space + template ::type> + multi_ptr(accessor + Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Only if Space == local_space + template < + int dimensions, access::mode Mode, access::placeholder isPlaceholder, + access::address_space _Space = Space, + typename = typename std::enable_if< + _Space == Space && Space == access::address_space::local_space>::type> + multi_ptr(accessor + Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Only if Space == constant_space + template ::type> + multi_ptr(accessor + Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Returns the underlying OpenCL C pointer + pointer_t get() const { return m_Pointer; } + + // Implicit conversion to the underlying pointer type + operator ElementType *() const { + return reinterpret_cast(m_Pointer); + } + + // Explicit conversion to a multi_ptr + explicit operator multi_ptr() const; + + // Arithmetic operators + multi_ptr &operator++() { + 
m_Pointer += (difference_type)1; + return *this; + } + multi_ptr operator++(int) { + multi_ptr result(*this); + ++(*this); + return result; + } + multi_ptr &operator--() { + m_Pointer -= (difference_type)1; + return *this; + } + multi_ptr operator--(int) { + multi_ptr result(*this); + --(*this); + return result; + } + multi_ptr &operator+=(difference_type r) { + m_Pointer += r; + return *this; + } + multi_ptr &operator-=(difference_type r) { + m_Pointer -= r; + return *this; + } + multi_ptr operator+(difference_type r) const { + return multi_ptr(m_Pointer + r); + } + multi_ptr operator-(difference_type r) const { + return multi_ptr(m_Pointer - r); + } + + void prefetch(size_t numElements) const; + +private: + pointer_t m_Pointer; +}; + +// Specialization of multi_ptr for void +template class multi_ptr { +public: + using element_type = void; + using difference_type = std::ptrdiff_t; + + // Implementation defined pointer types that correspond to + // SYCL/OpenCL interoperability types for OpenCL C functions + using pointer_t = typename detail::PtrValueType::type *; + using const_pointer_t = + typename detail::PtrValueType::type const *; + + static constexpr access::address_space address_space = Space; + + // Constructors + multi_ptr() : m_Pointer(nullptr) {} + multi_ptr(const multi_ptr &) = default; + multi_ptr(multi_ptr &&) = default; + multi_ptr(pointer_t pointer) : m_Pointer(pointer) {} +#ifdef __SYCL_DEVICE_ONLY__ + multi_ptr(void *pointer) : m_Pointer(reinterpret_cast(pointer)) { + // TODO An implementation should reject an argument if the deduced + // address space is not compatible with Space. 
+ } +#endif + multi_ptr(std::nullptr_t) : m_Pointer(nullptr) {} + ~multi_ptr() = default; + + // Assignment operators + multi_ptr &operator=(const multi_ptr &) = default; + multi_ptr &operator=(multi_ptr &&) = default; + multi_ptr &operator=(pointer_t pointer) { + m_Pointer = pointer; + return *this; + } +#ifdef __SYCL_DEVICE_ONLY__ + multi_ptr &operator=(void *pointer) { + m_Pointer = reinterpret_cast(pointer); + // TODO An implementation should reject an argument if the deduced + // address space is not compatible with Space. + } +#endif + multi_ptr &operator=(std::nullptr_t) { + m_Pointer = nullptr; + return *this; + } + + // Only if Space == global_space + template ::type> + multi_ptr( + accessor + Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Only if Space == local_space + template < + typename ElementType, int dimensions, access::mode Mode, + access::address_space _Space = Space, + typename = typename std::enable_if< + _Space == Space && Space == access::address_space::local_space>::type> + multi_ptr( + accessor Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Only if Space == constant_space + template ::type> + multi_ptr( + accessor + Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Returns the underlying OpenCL C pointer + pointer_t get() const { return m_Pointer; } + + // Implicit conversion to the underlying pointer type + operator void *() const; + + // Explicit conversion to a multi_ptr + template + explicit operator multi_ptr() const; + +private: + pointer_t m_Pointer; +}; + +template +multi_ptr +make_ptr(typename multi_ptr::pointer_t pointer) { + return multi_ptr(pointer); +} + +#ifdef __SYCL_DEVICE_ONLY__ +// An implementation should reject an argument if the deduced address space +// is not compatible with Space. +// This is guaranteed by the c'tor. 
+template +multi_ptr make_ptr(ElementType *pointer) { + return multi_ptr(pointer); +} +#endif + +template +bool operator==(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator!=(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator<(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator>(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator<=(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator>=(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator!=(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator!=(std::nullptr_t lhs, const multi_ptr &rhs); + +template +bool operator==(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator==(std::nullptr_t lhs, const multi_ptr &rhs); + +template +bool operator>(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator>(std::nullptr_t lhs, const multi_ptr &rhs); + +template +bool operator<(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator<(std::nullptr_t lhs, const multi_ptr &rhs); + +template +bool operator>=(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator>=(std::nullptr_t lhs, const multi_ptr &rhs); + +template +bool operator<=(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator<=(std::nullptr_t lhs, const multi_ptr &rhs); +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/nd_item.hpp b/sycl/include/CL/sycl/nd_item.hpp new file mode 100644 index 000000000000..5d34652dbcae --- /dev/null +++ b/sycl/include/CL/sycl/nd_item.hpp @@ -0,0 +1,180 @@ +//==--------- nd_item.hpp --- SYCL iteration nd_item -----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +class Builder; +} +template struct nd_item { + + nd_item() = delete; + + id get_global_id() const { return globalItem.get_id(); } + + size_t get_global_id(int dimension) const { + return globalItem.get_id(dimension); + } + + size_t get_global_linear_id() const { return globalItem.get_linear_id(); } + + id get_local_id() const { return localItem.get_id(); } + + size_t get_local_id(int dimension) const { + return localItem.get_id(dimension); + } + + size_t get_local_linear_id() const { return localItem.get_linear_id(); } + + group get_group() const { return Group; } + + intel::sub_group get_sub_group() const { return intel::sub_group(); } + + size_t get_group(int dimension) const { return Group[dimension]; } + + size_t get_group_linear_id() const { return Group.get_linear(); } + + range get_group_range() const { + return Group.get_global_range() / Group.get_local_range(); + } + + size_t get_group_range(int dimension) const { + return Group.get_global_range(dimension) / Group.get_local_range(dimension); + } + + range get_global_range() const { return globalItem.get_range(); } + + size_t get_global_range(int dimension) const { + return globalItem.get_range(dimension); + } + + range get_local_range() const { return localItem.get_range(); } + + size_t get_local_range(int dimension) const { + return localItem.get_range(dimension); + } + + id get_offset() const { return globalItem.get_offset(); } + + nd_range get_nd_range() const { + return nd_range(get_global_range(), get_local_range(), + get_offset()); + } + + void barrier(access::fence_space accessSpace = + access::fence_space::global_and_local) const { + uint32_t flags = ::cl::__spirv::MemorySemantics::SequentiallyConsistent; + switch (accessSpace) { + case 
access::fence_space::global_space: + flags |= cl::__spirv::MemorySemantics::CrossWorkgroupMemory; + break; + case access::fence_space::local_space: + flags |= cl::__spirv::MemorySemantics::WorkgroupMemory; + break; + case access::fence_space::global_and_local: + default: + flags |= cl::__spirv::MemorySemantics::CrossWorkgroupMemory | + cl::__spirv::MemorySemantics::WorkgroupMemory; + break; + } + cl::__spirv::OpControlBarrier(::cl::__spirv::Scope::Workgroup, + ::cl::__spirv::Scope::Workgroup, flags); + } + + /// Executes a work-group mem-fence with memory ordering on the local address + /// space, global address space or both based on the value of \p accessSpace. + template + void + mem_fence(typename std::enable_if::type accessSpace = + access::fence_space::global_and_local) const { + Group.mem_fence(); + } + + template + device_event async_work_group_copy(local_ptr dest, + global_ptr src, + size_t numElements) const { + return Group.async_work_group_copy(dest, src, numElements); + } + + template + device_event async_work_group_copy(global_ptr dest, + local_ptr src, + size_t numElements) const { + return Group.async_work_group_copy(dest, src, numElements); + } + + template + device_event async_work_group_copy(local_ptr dest, + global_ptr src, + size_t numElements, + size_t srcStride) const { + + return Group.async_work_group_copy(dest, src, numElements, srcStride); + } + + template + device_event async_work_group_copy(global_ptr dest, + local_ptr src, + size_t numElements, + size_t destStride) const { + return Group.async_work_group_copy(dest, src, numElements, destStride); + } + + template + void wait_for(eventTN... 
events) const { + Group.wait_for(events...); + } + + nd_item(const nd_item &rhs) = default; + + nd_item(nd_item &&rhs) = default; + + nd_item &operator=(const nd_item &rhs) = default; + + nd_item &operator=(nd_item &&rhs) = default; + + bool operator==(const nd_item &rhs) const { + return (rhs.localItem == this->localItem) && + (rhs.globalItem == this->globalItem) && (rhs.Group == this->Group); + } + + bool operator!=(const nd_item &rhs) const { + return !((*this) == rhs); + } + +protected: + friend class detail::Builder; + nd_item(const item &GL, const item &L, + const group &GR) + : globalItem(GL), localItem(L), Group(GR) {} + +private: + item localItem; + item globalItem; + group Group; +}; +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/nd_range.hpp b/sycl/include/CL/sycl/nd_range.hpp new file mode 100644 index 000000000000..6520309792b8 --- /dev/null +++ b/sycl/include/CL/sycl/nd_range.hpp @@ -0,0 +1,59 @@ +//==-------- nd_range.hpp --- SYCL iteration nd_range ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +namespace cl { +namespace sycl { + +template class nd_range { + range globalSize; + range localSize; + id offset; + +public: + template + nd_range( + typename std::enable_if<((N > 0) && (N < 4)), range>::type globalSize, + range localSize, id offset = id()) + : globalSize(globalSize), localSize(localSize), offset(offset) {} + + range get_global_range() const { return globalSize; } + + range get_local_range() const { return localSize; } + + range get_group_range() const { return globalSize / localSize; } + + id get_offset() const { return offset; } + + // Common special member functions for by-value semantics + nd_range(const nd_range &rhs) = default; + nd_range(nd_range &&rhs) = default; + nd_range &operator=(const nd_range &rhs) = default; + nd_range &operator=(nd_range &&rhs) = default; + nd_range() = default; + + // Common member functions for by-value semantics + bool operator==(const nd_range &rhs) const { + return (rhs.globalSize == this->globalSize) && + (rhs.localSize == this->localSize) && (rhs.offset == this->offset); + } + + bool operator!=(const nd_range &rhs) const { + return !(*this == rhs); + } +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/platform.hpp b/sycl/include/CL/sycl/platform.hpp new file mode 100644 index 000000000000..d42ffbf5b882 --- /dev/null +++ b/sycl/include/CL/sycl/platform.hpp @@ -0,0 +1,80 @@ +//==---------------- platform.hpp - SYCL platform --------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +// 4.6.2 Platform class +#include +#include +namespace cl { +namespace sycl { + +// TODO: make code thread-safe + +// Forward declaration +class device_selector; +class device; + +class platform { +public: + platform(); + + explicit platform(cl_platform_id platform_id); + + explicit platform(const device_selector &); + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + platform(const platform &rhs) = default; + + platform(platform &&rhs) = default; + + platform &operator=(const platform &rhs) = default; + + platform &operator=(platform &&rhs) = default; + + bool operator==(const platform &rhs) const { return impl == rhs.impl; } + + bool operator!=(const platform &rhs) const { return !(*this == rhs); } + + cl_platform_id get() const { return impl->get(); } + + bool has_extension(const string_class &extension_name) const { + return impl->has_extension(extension_name); + } + + bool is_host() const { return impl->is_host(); } + + vector_class + get_devices(info::device_type dev_type = info::device_type::all) const; + + static vector_class get_platforms(); + +private: + std::shared_ptr impl; + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); +}; // class platform +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::platform &p) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(p)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/pointers.hpp b/sycl/include/CL/sycl/pointers.hpp new file mode 100644 index 000000000000..f8077cadd6a2 --- /dev/null +++ b/sycl/include/CL/sycl/pointers.hpp @@ -0,0 +1,35 @@ +//==------------ pointers.hpp - SYCL pointers classes ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under 
the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include + + +namespace cl { +namespace sycl { + +template class multi_ptr; +// Template specialization aliases for different pointer address spaces + +template +using global_ptr = multi_ptr; + +template +using local_ptr = multi_ptr; + +template +using constant_ptr = + multi_ptr; + +template +using private_ptr = + multi_ptr; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/program.hpp b/sycl/include/CL/sycl/program.hpp new file mode 100644 index 000000000000..5db53194f22e --- /dev/null +++ b/sycl/include/CL/sycl/program.hpp @@ -0,0 +1,145 @@ +//==--------------- program.hpp --- SYCL program ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +namespace cl { +namespace sycl { + +class context; +class device; +class kernel; + +class program { + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); + template + friend T detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); + +public: + program() = delete; + + explicit program(const context &context) + : impl(std::make_shared(context)) {} + + program(const context &context, vector_class deviceList) + : impl(std::make_shared(context, deviceList)) {} + + program(vector_class programList, string_class linkOptions = "") { + std::vector> impls; + for (auto &x : programList) { + impls.push_back(detail::getSyclObjImpl(x)); + } + impl = std::make_shared(impls, linkOptions); + } + + program(const context &context, cl_program clProgram) + : impl(std::make_shared(context, clProgram)) {} + + program(const program &rhs) = default; + + program(program &&rhs) = default; + + program &operator=(const program &rhs) = default; + + program &operator=(program &&rhs) = default; + + bool operator==(const program &rhs) const { return impl == rhs.impl; } + + bool operator!=(const program &rhs) const { return !operator==(rhs); } + + cl_program get() const { return impl->get(); } + + bool is_host() const { return impl->is_host(); } + + template + void compile_with_kernel_type(string_class compileOptions = "") { + impl->compile_with_kernel_type(compileOptions); + } + + void compile_with_source(string_class kernelSource, + string_class compileOptions = "") { + impl->compile_with_source(kernelSource, compileOptions); + } + + template + void build_with_kernel_type(string_class buildOptions = "") { + impl->build_with_kernel_type(buildOptions); + } + + void build_with_source(string_class kernelSource, + string_class buildOptions = "") { + impl->build_with_source(kernelSource, buildOptions); + } + + void 
link(string_class linkOptions = "") { impl->link(linkOptions); } + + template bool has_kernel() const { + return impl->has_kernel(); + } + + bool has_kernel(string_class kernelName) const { + return impl->has_kernel(kernelName); + } + + template kernel get_kernel() const { + return impl->get_kernel(impl); + } + + kernel get_kernel(string_class kernelName) const { + return impl->get_kernel(kernelName, impl); + } + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + vector_class> get_binaries() const { + return impl->get_binaries(); + } + + context get_context() const { return impl->get_context(); } + + vector_class get_devices() const { return impl->get_devices(); } + + string_class get_compile_options() const { + return impl->get_compile_options(); + } + + string_class get_link_options() const { return impl->get_link_options(); } + + string_class get_build_options() const { return impl->get_build_options(); } + + program_state get_state() const { return impl->get_state(); } + +private: + program(std::shared_ptr impl) : impl(impl) {} + + std::shared_ptr impl; +}; +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::program &prg) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(prg)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/property_list.hpp b/sycl/include/CL/sycl/property_list.hpp new file mode 100644 index 000000000000..706be7ea2a04 --- /dev/null +++ b/sycl/include/CL/sycl/property_list.hpp @@ -0,0 +1,249 @@ +//==--------- property_list.hpp --- SYCL property list ---------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace cl { +namespace sycl { +// Forward declaration +class context; + +// HOW TO ADD NEW PROPERTY INSTRUCTION: +// 1. Add forward declaration of property class. +// 2. Add new record in PropKind enum. +// 3. Use RegisterProp macro passing new record from enum and new class. +// 4. Add implementation of the new property class using detail::Prop class with +// template parameter = new record in enum as a base class. + +namespace property { + +namespace image { +class use_host_ptr; +class use_mutex; +class context_bound; +} // namespace image + +namespace buffer { +class use_host_ptr; +class use_mutex; +class context_bound; +} // namespace buffer + +namespace queue { +class enable_profiling; +} // namespace queue + +namespace detail { + +// List of all properties' IDs. +enum PropKind { + // Buffer properties + BufferUseHostPtr = 0, + BufferContextBound, + BufferUseMutex, + + // Image properties + ImageUseHostPtr, + ImageContextBound, + ImageUseMutex, + + // Queue properties + QueueEnableProfiling, + + PropKindSize +}; + +// Base class for all properties. Needed to check that user passed only +// SYCL's properties to property_list c'tor. +class PropBase {}; + +// Second base class, needed for mapping PropKind to class and vice versa. +template class Prop; + +// This class is used in property_list to hold properties. +template class PropertyHolder { +public: + void setProp(const T &Rhs) { + new (m_Mem) T(Rhs); + m_Initialized = true; + } + + const T &getProp() const { + assert(true == m_Initialized && "Property was not set!"); + return *(T *)m_Mem; + } + bool isInitialized() const { return m_Initialized; } + +private: + // Memory that is used for property allocation + unsigned char m_Mem[sizeof(T)]; + // Indicate whether property initialized or not. 
+ bool m_Initialized = false; +}; + +// This macro adds specialization of class Prop which provides possibility to +// convert PropKind to class and vice versa. +#define RegisterProp(PropKindT, Type) \ + template <> class Prop : public PropBase { \ + public: \ + static constexpr PropKind getKind() { return PropKindT; } \ + using FinalType = Type; \ + } + +// Image +RegisterProp(PropKind::ImageUseHostPtr, image::use_host_ptr); +RegisterProp(PropKind::ImageUseMutex, image::use_mutex); +RegisterProp(PropKind::ImageContextBound, image::context_bound); + +// Buffer +RegisterProp(PropKind::BufferUseHostPtr, buffer::use_host_ptr); +RegisterProp(PropKind::BufferUseMutex, buffer::use_mutex); +RegisterProp(PropKind::BufferContextBound, buffer::context_bound); + +// Queue +RegisterProp(PropKind::QueueEnableProfiling, queue::enable_profiling); + +// Sentinel, needed for automatic build of tuple in property_list. +RegisterProp(PropKind::PropKindSize, PropBase); + +// Common class for use_mutex in buffer and image namespaces. +template class UseMutexBase : public Prop { +public: + UseMutexBase(mutex_class &MutexRef) : m_MutexClass(MutexRef) {} + mutex_class *get_mutex_ptr() const { return &m_MutexClass; } + +private: + mutex_class &m_MutexClass; +}; + +// Common class for context_bound in buffer and image namespaces. 
+template class ContextBoundBase : public Prop { +public: + ContextBoundBase(cl::sycl::context Context) : m_Context(Context) {} + context get_context() const { return m_Context; } + +private: + cl::sycl::context m_Context; +}; +} // namespace detail + +namespace image { + +class use_host_ptr : public detail::Prop {}; + +class use_mutex : public detail::UseMutexBase { +public: + use_mutex(mutex_class &MutexRef) : UseMutexBase(MutexRef) {} +}; + +class context_bound + : public detail::ContextBoundBase { +public: + context_bound(cl::sycl::context Context) : ContextBoundBase(Context) {} +}; + +} // namespace image + +namespace buffer { + +class use_host_ptr : public detail::Prop {}; + +class use_mutex + : public detail::UseMutexBase { +public: + use_mutex(mutex_class &MutexRef) : UseMutexBase(MutexRef) {} +}; + +class context_bound + : public detail::ContextBoundBase { +public: + context_bound(cl::sycl::context Context) : ContextBoundBase(Context) {} +}; +} // namespace buffer + +namespace queue { +class enable_profiling + : public detail::Prop {}; +} // namespace queue + +} // namespace property + +class property_list { + + // The structs validate that all objects passed are base of PropBase class. + template struct AllProperties : std::true_type {}; + template + struct AllProperties + : std::conditional::value, + AllProperties, std::false_type>::type {}; + + template + using PropertyHolder = cl::sycl::property::detail::PropertyHolder; + template + using Property = cl::sycl::property::detail::Prop; + + // The structs build tuple type that can hold all properties. 
+ template struct DefineTupleType { + using Type = std::tuple; + }; + + template + struct BuildTupleType + : public std::conditional< + (Counter < property::detail::PropKind::PropKindSize), + BuildTupleType< + Counter + 1, Head..., + PropertyHolder::FinalType>>, + DefineTupleType>::type {}; + +public: + // C'tor initialize m_PropList with properties passed by invoking ctorHelper + // recursively + template ::value>::type> + property_list(propertyTN... Props) { + ctorHelper(Props...); + } + + template propertyT get_property() const { + static_assert((int)(propertyT::getKind()) <= + property::detail::PropKind::PropKindSize, + "Invalid option passed."); + const auto &PropHolder = std::get<(int)(propertyT::getKind())>(m_PropsList); + if (PropHolder.isInitialized()) { + return PropHolder.getProp(); + } + throw invalid_object_error(); + } + + template bool has_property() const { + return std::get<(int)(propertyT::getKind())>(m_PropsList).isInitialized(); + } + +private: + void ctorHelper() {} + + template + void ctorHelper(PropT &Prop, propertyTN... props) { + std::get<(int)(PropT::getKind())>(m_PropsList).setProp(Prop); + ctorHelper(props...); + } + + // Tuple that able to hold all the properties. + BuildTupleType<0>::Type m_PropsList; +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp new file mode 100644 index 000000000000..824230607cf3 --- /dev/null +++ b/sycl/include/CL/sycl/queue.hpp @@ -0,0 +1,122 @@ +//==-------------------- queue.hpp - SYCL queue ----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cl { +namespace sycl { + +// Forward declaration +class context; +class device; +class queue { +public: + explicit queue(const property_list &propList = {}) + : queue(default_selector(), async_handler{}, propList) {} + + queue(const async_handler &asyncHandler, const property_list &propList = {}) + : queue(default_selector(), asyncHandler, propList) {} + + queue(const device_selector &deviceSelector, + const property_list &propList = {}) + : queue(deviceSelector.select_device(), async_handler{}, propList) {} + + queue(const device_selector &deviceSelector, + const async_handler &asyncHandler, const property_list &propList = {}) + : queue(deviceSelector.select_device(), asyncHandler, propList) {} + + queue(const device &syclDevice, const property_list &propList = {}) + : queue(syclDevice, async_handler{}, propList) {} + + queue(const device &syclDevice, const async_handler &asyncHandler, + const property_list &propList = {}); + + queue(const context &syclContext, const device_selector &deviceSelector, + const property_list &propList = {}) + : queue(syclContext, deviceSelector, + detail::getSyclObjImpl(syclContext)->get_async_handler(), + propList) {} + + queue(const context &syclContext, const device_selector &deviceSelector, + const async_handler &asyncHandler, const property_list &propList = {}); + + queue(cl_command_queue clQueue, const context &syclContext, + const async_handler &asyncHandler = {}); + + queue(const queue &rhs) = default; + + queue(queue &&rhs) = default; + + queue &operator=(const queue &rhs) = default; + + queue &operator=(queue &&rhs) = default; + + bool operator==(const queue &rhs) const { return impl == rhs.impl; } + + bool operator!=(const queue &rhs) const { return !(*this == rhs); } + + cl_command_queue get() const { return impl->get(); } + + context 
get_context() const { return impl->get_context(); } + + device get_device() const { return impl->get_device(); } + + bool is_host() const { return impl->is_host(); } + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + template event submit(T cgf) { return impl->submit(cgf, impl); } + + template event submit(T cgf, queue &secondaryQueue) { + return impl->submit(cgf, impl, secondaryQueue.impl); + } + + void wait() { impl->wait(); } + + void wait_and_throw() { impl->wait_and_throw(); } + + void throw_asynchronous() { impl->throw_asynchronous(); } + + template bool has_property() const { + return impl->has_property(); + } + + template propertyT get_property() const { + return impl->get_property(); + } + +private: + std::shared_ptr impl; + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); +}; + +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::queue &q) const { + return std::hash>()( + cl::sycl::detail::getSyclObjImpl(q)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/range.hpp b/sycl/include/CL/sycl/range.hpp new file mode 100644 index 000000000000..6d7f4919f66e --- /dev/null +++ b/sycl/include/CL/sycl/range.hpp @@ -0,0 +1,544 @@ +//==----------- range.hpp --- SYCL iteration range -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +namespace cl { +namespace sycl { +template struct id; +template +class range : public detail::array { +public: + using base = detail::array; + /* The following constructor is only available in the range class + specialization where: dimensions==1 */ + template + range(typename std::enable_if<(N == 1), size_t>::type dim0) : base(dim0) {} + + /* The following constructor is only available in the range class + specialization where: dimensions==2 */ + template + range(typename std::enable_if<(N == 2), size_t>::type dim0, size_t dim1) + : base(dim0, dim1) {} + + /* The following constructor is only available in the range class + specialization where: dimensions==3 */ + template + range(typename std::enable_if<(N == 3), size_t>::type dim0, size_t dim1, + size_t dim2) : base(dim0, dim1, dim2) {} + + explicit operator id() const { + id result; + for (int i = 0; i < dimensions; ++i) { + result[i] = this->get(i); + } + return result; + } + + size_t size() const { + size_t size = 1; + for (int i = 0; i < dimensions; ++i) { + size *= this->get(i); + } + return size; + } + + range(const range &rhs) = default; + range(range &&rhs) = default; + range &operator=(const range &rhs) = default; + range &operator=(range &&rhs) = default; + range() = default; + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, &&, ||, <, >, <=, >= + range operator+(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] + rhs.common_array[i]; + } + return result; + } + range operator-(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] - rhs.common_array[i]; + } + return result; + } + range operator*(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = 
this->common_array[i] * rhs.common_array[i]; + } + return result; + } + range operator/(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] / rhs.common_array[i]; + } + return result; + } + range operator%(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] % rhs.common_array[i]; + } + return result; + } + range operator<<(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] << rhs.common_array[i]; + } + return result; + } + range operator>>(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >> rhs.common_array[i]; + } + return result; + } + range operator&(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] & rhs.common_array[i]; + } + return result; + } + range operator|(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] | rhs.common_array[i]; + } + return result; + } + range operator^(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] ^ rhs.common_array[i]; + } + return result; + } + range operator&&(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] && rhs.common_array[i]; + } + return result; + } + range operator||(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] || rhs.common_array[i]; + } + return result; + } + range operator<(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = 
this->common_array[i] < rhs.common_array[i]; + } + return result; + } + range operator>(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] > rhs.common_array[i]; + } + return result; + } + range operator<=(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] <= rhs.common_array[i]; + } + return result; + } + range operator>=(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >= rhs.common_array[i]; + } + return result; + } + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, &&, ||, <, >, <=, >= + range operator+(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] + rhs; + } + return result; + } + range operator-(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] - rhs; + } + return result; + } + range operator*(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] * rhs; + } + return result; + } + range operator/(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] / rhs; + } + return result; + } + range operator%(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] % rhs; + } + return result; + } + range operator<<(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] << rhs; + } + return result; + } + range operator>>(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >> rhs; + } 
+ return result; + } + range operator&(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] & rhs; + } + return result; + } + range operator|(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] | rhs; + } + return result; + } + range operator^(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] ^ rhs; + } + return result; + } + range operator&&(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] && rhs; + } + return result; + } + range operator||(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] || rhs; + } + return result; + } + range operator<(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] < rhs; + } + return result; + } + range operator>(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] > rhs; + } + return result; + } + range operator<=(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] <= rhs; + } + return result; + } + range operator>=(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >= rhs; + } + return result; + } + + // OP is: +=, -=, *=, /=, %=, <<=, >>=, &=, |=, ^= + range &operator+=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] += rhs[i]; + } + return *this; + } + range &operator-=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] -= rhs.common_array[i]; 
+ } + return *this; + } + range &operator*=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] *= rhs.common_array[i]; + } + return *this; + } + range &operator/=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] /= rhs.common_array[i]; + } + return *this; + } + range &operator%=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] %= rhs.common_array[i]; + } + return *this; + } + range &operator<<=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] <<= rhs.common_array[i]; + } + return *this; + } + range &operator>>=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] >>= rhs.common_array[i]; + } + return *this; + } + range &operator&=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] &= rhs.common_array[i]; + } + return *this; + } + range &operator|=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] |= rhs.common_array[i]; + } + return *this; + } + range &operator^=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] ^= rhs.common_array[i]; + } + return *this; + } + + // OP is: +=, -=, *=, /=, %=, <<=, >>=, &=, |=, ^= + range &operator+=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] += rhs; + } + return *this; + } + range &operator-=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] -= rhs; + } + return *this; + } + range &operator*=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] *= rhs; + } + return *this; + } + range &operator/=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] /= rhs; + } + return *this; + } + range &operator%=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] %= rhs; + } + return *this; + } + 
range &operator<<=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] <<= rhs; + } + return *this; + } + range &operator>>=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] >>= rhs; + } + return *this; + } + range &operator&=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] &= rhs; + } + return *this; + } + range &operator|=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] |= rhs; + } + return *this; + } + range &operator^=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] ^= rhs; + } + return *this; + } + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, <, >, <=, >=, &&, || + friend range operator+(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs + rhs.common_array[i]; + } + return result; + } + friend range operator-(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs - rhs.common_array[i]; + } + return result; + } + friend range operator*(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs * rhs.common_array[i]; + } + return result; + } + friend range operator/(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs / rhs.common_array[i]; + } + return result; + } + friend range operator%(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs % rhs.common_array[i]; + } + return result; + } + friend range operator<<(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs << rhs.common_array[i]; + } + return result; + } + friend range 
operator>>(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs >> rhs.common_array[i]; + } + return result; + } + friend range operator&(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs & rhs.common_array[i]; + } + return result; + } + friend range operator|(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs | rhs.common_array[i]; + } + return result; + } + friend range operator^(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs ^ rhs.common_array[i]; + } + return result; + } + friend range operator<(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs < rhs.common_array[i]; + } + return result; + } + friend range operator>(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs > rhs.common_array[i]; + } + return result; + } + friend range operator<=(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs <= rhs.common_array[i]; + } + return result; + } + friend range operator>=(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs >= rhs.common_array[i]; + } + return result; + } + friend range operator&&(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs && rhs.common_array[i]; + } + return result; + } + friend range operator||(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs || rhs.common_array[i]; + } + 
return result; + } +}; +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/stl.hpp b/sycl/include/CL/sycl/stl.hpp new file mode 100644 index 000000000000..5322fee53550 --- /dev/null +++ b/sycl/include/CL/sycl/stl.hpp @@ -0,0 +1,50 @@ +//==----------- stl.hpp - basic STL implementation -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +// 4.5 C++ Standard library classes required for the interface + +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { + + template < class T, class Alloc = std::allocator > + using vector_class = std::vector; + + using string_class = std::string; + + template + using function_class = std::function; + + using mutex_class = std::mutex; + + template > + using unique_ptr_class = std::unique_ptr; + + template + using shared_ptr_class = std::shared_ptr; + + template + using weak_ptr_class = std::weak_ptr; + + template + using hash_class = std::hash; + + using exception_ptr_class = std::exception_ptr; + +} // sycl +} // cl + diff --git a/sycl/include/CL/sycl/swizzles.def b/sycl/include/CL/sycl/swizzles.def new file mode 100644 index 000000000000..0c25d4d394b5 --- /dev/null +++ b/sycl/include/CL/sycl/swizzles.def @@ -0,0 +1,842 @@ +//==---------------- swizzles.def --- SYCL types ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// included to types.hppp twice, once for vec<> and once for SwizzleOp<>. + +// TODO: exclude L-Value swizzle like vec.xxxx() +#ifdef __SYCL_ACCESS +#error Undefine __SYCL_ACCESS macro. 
+#endif +#define __SYCL_ACCESS(_COND, _NAME, ...) \ + template \ + typename std::enable_if<(_COND), Swizzle<__VA_ARGS__>>::type _NAME() { \ + return __SYCL_ACCESS_RETURN; \ + } \ + template \ + typename std::enable_if<(_COND), ConstSwizzle<__VA_ARGS__>>::type _NAME() \ + const { \ + return __SYCL_ACCESS_RETURN; \ + } + +//__swizzled_vec__ XYZW_ACCESS() const; +__SYCL_ACCESS(N <= 4, x, Indexer(0)) +__SYCL_ACCESS(N == 2 || N == 3 || N == 4, y, Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, z, Indexer(2)) +__SYCL_ACCESS(N == 4, w, Indexer(3)) + +//__swizzled_vec__ RGBA_ACCESS() const; +__SYCL_ACCESS(N == 4, r, Indexer(0)) +__SYCL_ACCESS(N == 4, g, Indexer(1)) +__SYCL_ACCESS(N == 4, b, Indexer(2)) +__SYCL_ACCESS(N == 4, a, Indexer(3)) + +//__swizzled_vec__ INDEX_ACCESS() const; +__SYCL_ACCESS(N > 0, s0, Indexer(0)) +__SYCL_ACCESS(N > 1, s1, Indexer(1)) +__SYCL_ACCESS(N > 2, s2, Indexer(2)) +__SYCL_ACCESS(N > 2, s3, Indexer(3)) +__SYCL_ACCESS(N > 4, s4, Indexer(4)) +__SYCL_ACCESS(N > 4, s5, Indexer(5)) +__SYCL_ACCESS(N > 4, s6, Indexer(6)) +__SYCL_ACCESS(N > 4, s7, Indexer(7)) +__SYCL_ACCESS(N == 16, s8, Indexer(8)) +__SYCL_ACCESS(N == 16, s9, Indexer(9)) +__SYCL_ACCESS(N == 16, sA, Indexer(10)) +__SYCL_ACCESS(N == 16, sB, Indexer(11)) +__SYCL_ACCESS(N == 16, sC, Indexer(12)) +__SYCL_ACCESS(N == 16, sD, Indexer(13)) +__SYCL_ACCESS(N == 16, sE, Indexer(14)) +__SYCL_ACCESS(N == 16, sF, Indexer(15)) + +#ifdef SYCL_SIMPLE_SWIZZLES +//__swizzled_vec__ XYZW_SWIZZLE() const; +__SYCL_ACCESS(N <= 4, xx, Indexer(0), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xy, Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xz, Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, xw, Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yx, Indexer(1), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yy, Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yz, Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, yw, Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zx, Indexer(2), 
Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zy, Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zz, Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, zw, Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wx, Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wy, Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wz, Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, ww, Indexer(3), Indexer(3)) +__SYCL_ACCESS(N <= 4, xxx, Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xxy, Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xxz, Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, xxw, Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, xyx, Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xyy, Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xyz, Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, xyw, Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xzx, Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xzy, Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xzz, Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, xzw, Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, xwx, Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, xwy, Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, xwz, Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, xww, Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yxx, Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yxy, Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yxz, Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, yxw, Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yyx, Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yyy, Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yyz, Indexer(1), Indexer(1), 
Indexer(2)) +__SYCL_ACCESS(N == 4, yyw, Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yzx, Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yzy, Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yzz, Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, yzw, Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, ywx, Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, ywy, Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, ywz, Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, yww, Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zxx, Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zxy, Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zxz, Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, zxw, Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zyx, Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zyy, Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zyz, Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, zyw, Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zzx, Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zzy, Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zzz, Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, zzw, Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, zwx, Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, zwy, Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, zwz, Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, zww, Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, wxx, Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, wxy, Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, wxz, Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, wxw, Indexer(3), Indexer(0), Indexer(3)) 
+__SYCL_ACCESS(N == 4, wyx, Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, wyy, Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, wyz, Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, wyw, Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, wzx, Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, wzy, Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, wzz, Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, wzw, Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wwx, Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wwy, Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wwz, Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, www, Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N <= 4, xxxx, Indexer(0), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xxxy, Indexer(0), Indexer(0), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xxxz, Indexer(0), Indexer(0), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, xxxw, Indexer(0), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, xxyx, Indexer(0), Indexer(0), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xxyy, Indexer(0), Indexer(0), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xxyz, Indexer(0), Indexer(0), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, xxyw, Indexer(0), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xxzx, Indexer(0), Indexer(0), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xxzy, Indexer(0), Indexer(0), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xxzz, Indexer(0), Indexer(0), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, xxzw, Indexer(0), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, xxwx, Indexer(0), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, xxwy, Indexer(0), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, xxwz, Indexer(0), Indexer(0), 
Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, xxww, Indexer(0), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, xyxx, Indexer(0), Indexer(1), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xyxy, Indexer(0), Indexer(1), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xyxz, Indexer(0), Indexer(1), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, xyxw, Indexer(0), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, xyyx, Indexer(0), Indexer(1), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xyyy, Indexer(0), Indexer(1), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xyyz, Indexer(0), Indexer(1), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, xyyw, Indexer(0), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xyzx, Indexer(0), Indexer(1), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xyzy, Indexer(0), Indexer(1), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xyzz, Indexer(0), Indexer(1), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, xyzw, Indexer(0), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, xywx, Indexer(0), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, xywy, Indexer(0), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, xywz, Indexer(0), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, xyww, Indexer(0), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xzxx, Indexer(0), Indexer(2), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xzxy, Indexer(0), Indexer(2), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xzxz, Indexer(0), Indexer(2), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, xzxw, Indexer(0), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xzyx, Indexer(0), Indexer(2), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xzyy, Indexer(0), Indexer(2), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, 
xzyz, Indexer(0), Indexer(2), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, xzyw, Indexer(0), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xzzx, Indexer(0), Indexer(2), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xzzy, Indexer(0), Indexer(2), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xzzz, Indexer(0), Indexer(2), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, xzzw, Indexer(0), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, xzwx, Indexer(0), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, xzwy, Indexer(0), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, xzwz, Indexer(0), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, xzww, Indexer(0), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, xwxx, Indexer(0), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, xwxy, Indexer(0), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, xwxz, Indexer(0), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, xwxw, Indexer(0), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, xwyx, Indexer(0), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, xwyy, Indexer(0), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, xwyz, Indexer(0), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, xwyw, Indexer(0), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, xwzx, Indexer(0), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, xwzy, Indexer(0), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, xwzz, Indexer(0), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, xwzw, Indexer(0), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, xwwx, Indexer(0), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, xwwy, Indexer(0), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, xwwz, Indexer(0), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, xwww, Indexer(0), Indexer(3), 
Indexer(3), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yxxx, Indexer(1), Indexer(0), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yxxy, Indexer(1), Indexer(0), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yxxz, Indexer(1), Indexer(0), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, yxxw, Indexer(1), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yxyx, Indexer(1), Indexer(0), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yxyy, Indexer(1), Indexer(0), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yxyz, Indexer(1), Indexer(0), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, yxyw, Indexer(1), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yxzx, Indexer(1), Indexer(0), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yxzy, Indexer(1), Indexer(0), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yxzz, Indexer(1), Indexer(0), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, yxzw, Indexer(1), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, yxwx, Indexer(1), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, yxwy, Indexer(1), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, yxwz, Indexer(1), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, yxww, Indexer(1), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yyxx, Indexer(1), Indexer(1), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yyxy, Indexer(1), Indexer(1), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yyxz, Indexer(1), Indexer(1), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, yyxw, Indexer(1), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yyyx, Indexer(1), Indexer(1), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yyyy, Indexer(1), Indexer(1), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yyyz, Indexer(1), Indexer(1), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, 
yyyw, Indexer(1), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yyzx, Indexer(1), Indexer(1), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yyzy, Indexer(1), Indexer(1), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yyzz, Indexer(1), Indexer(1), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, yyzw, Indexer(1), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, yywx, Indexer(1), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, yywy, Indexer(1), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, yywz, Indexer(1), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, yyww, Indexer(1), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yzxx, Indexer(1), Indexer(2), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yzxy, Indexer(1), Indexer(2), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yzxz, Indexer(1), Indexer(2), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, yzxw, Indexer(1), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yzyx, Indexer(1), Indexer(2), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yzyy, Indexer(1), Indexer(2), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yzyz, Indexer(1), Indexer(2), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, yzyw, Indexer(1), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yzzx, Indexer(1), Indexer(2), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yzzy, Indexer(1), Indexer(2), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yzzz, Indexer(1), Indexer(2), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, yzzw, Indexer(1), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, yzwx, Indexer(1), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, yzwy, Indexer(1), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, yzwz, Indexer(1), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, yzww, 
Indexer(1), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, ywxx, Indexer(1), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, ywxy, Indexer(1), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, ywxz, Indexer(1), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, ywxw, Indexer(1), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, ywyx, Indexer(1), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, ywyy, Indexer(1), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, ywyz, Indexer(1), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, ywyw, Indexer(1), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, ywzx, Indexer(1), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, ywzy, Indexer(1), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, ywzz, Indexer(1), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, ywzw, Indexer(1), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, ywwx, Indexer(1), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, ywwy, Indexer(1), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, ywwz, Indexer(1), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, ywww, Indexer(1), Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zxxx, Indexer(2), Indexer(0), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zxxy, Indexer(2), Indexer(0), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zxxz, Indexer(2), Indexer(0), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, zxxw, Indexer(2), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zxyx, Indexer(2), Indexer(0), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zxyy, Indexer(2), Indexer(0), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zxyz, Indexer(2), Indexer(0), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, zxyw, Indexer(2), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 
4, zxzx, Indexer(2), Indexer(0), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zxzy, Indexer(2), Indexer(0), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zxzz, Indexer(2), Indexer(0), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, zxzw, Indexer(2), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, zxwx, Indexer(2), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, zxwy, Indexer(2), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, zxwz, Indexer(2), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, zxww, Indexer(2), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zyxx, Indexer(2), Indexer(1), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zyxy, Indexer(2), Indexer(1), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zyxz, Indexer(2), Indexer(1), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, zyxw, Indexer(2), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zyyx, Indexer(2), Indexer(1), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zyyy, Indexer(2), Indexer(1), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zyyz, Indexer(2), Indexer(1), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, zyyw, Indexer(2), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zyzx, Indexer(2), Indexer(1), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zyzy, Indexer(2), Indexer(1), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zyzz, Indexer(2), Indexer(1), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, zyzw, Indexer(2), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, zywx, Indexer(2), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, zywy, Indexer(2), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, zywz, Indexer(2), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, zyww, Indexer(2), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, 
zzxx, Indexer(2), Indexer(2), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zzxy, Indexer(2), Indexer(2), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zzxz, Indexer(2), Indexer(2), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, zzxw, Indexer(2), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zzyx, Indexer(2), Indexer(2), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zzyy, Indexer(2), Indexer(2), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zzyz, Indexer(2), Indexer(2), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, zzyw, Indexer(2), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zzzx, Indexer(2), Indexer(2), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zzzy, Indexer(2), Indexer(2), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zzzz, Indexer(2), Indexer(2), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, zzzw, Indexer(2), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, zzwx, Indexer(2), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, zzwy, Indexer(2), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, zzwz, Indexer(2), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, zzww, Indexer(2), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, zwxx, Indexer(2), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, zwxy, Indexer(2), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, zwxz, Indexer(2), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, zwxw, Indexer(2), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, zwyx, Indexer(2), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, zwyy, Indexer(2), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, zwyz, Indexer(2), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, zwyw, Indexer(2), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, zwzx, Indexer(2), Indexer(3), Indexer(2), Indexer(0)) 
+__SYCL_ACCESS(N == 4, zwzy, Indexer(2), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, zwzz, Indexer(2), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, zwzw, Indexer(2), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, zwwx, Indexer(2), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, zwwy, Indexer(2), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, zwwz, Indexer(2), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, zwww, Indexer(2), Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, wxxx, Indexer(3), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, wxxy, Indexer(3), Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, wxxz, Indexer(3), Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, wxxw, Indexer(3), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, wxyx, Indexer(3), Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, wxyy, Indexer(3), Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, wxyz, Indexer(3), Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, wxyw, Indexer(3), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, wxzx, Indexer(3), Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, wxzy, Indexer(3), Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, wxzz, Indexer(3), Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, wxzw, Indexer(3), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wxwx, Indexer(3), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wxwy, Indexer(3), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wxwz, Indexer(3), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, wxww, Indexer(3), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, wyxx, Indexer(3), Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, wyxy, Indexer(3), Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, wyxz, Indexer(3), Indexer(1), Indexer(0), 
Indexer(2)) +__SYCL_ACCESS(N == 4, wyxw, Indexer(3), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, wyyx, Indexer(3), Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, wyyy, Indexer(3), Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, wyyz, Indexer(3), Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, wyyw, Indexer(3), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, wyzx, Indexer(3), Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, wyzy, Indexer(3), Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, wyzz, Indexer(3), Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, wyzw, Indexer(3), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wywx, Indexer(3), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wywy, Indexer(3), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wywz, Indexer(3), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, wyww, Indexer(3), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, wzxx, Indexer(3), Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, wzxy, Indexer(3), Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, wzxz, Indexer(3), Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, wzxw, Indexer(3), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, wzyx, Indexer(3), Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, wzyy, Indexer(3), Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, wzyz, Indexer(3), Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, wzyw, Indexer(3), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, wzzx, Indexer(3), Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, wzzy, Indexer(3), Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, wzzz, Indexer(3), Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, wzzw, Indexer(3), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wzwx, Indexer(3), Indexer(2), 
Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wzwy, Indexer(3), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wzwz, Indexer(3), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, wzww, Indexer(3), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, wwxx, Indexer(3), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, wwxy, Indexer(3), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, wwxz, Indexer(3), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, wwxw, Indexer(3), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, wwyx, Indexer(3), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, wwyy, Indexer(3), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, wwyz, Indexer(3), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, wwyw, Indexer(3), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, wwzx, Indexer(3), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, wwzy, Indexer(3), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, wwzz, Indexer(3), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, wwzw, Indexer(3), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wwwx, Indexer(3), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wwwy, Indexer(3), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wwwz, Indexer(3), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, wwww, Indexer(3), Indexer(3), Indexer(3), Indexer(3)) + +//__swizzled_vec__ RGBA_SWIZZLE() const; +__SYCL_ACCESS(N == 4, rr, Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rg, Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rb, Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, ra, Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, gr, Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, gg, Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, gb, Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, ga, Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, br, Indexer(2), Indexer(0)) 
+__SYCL_ACCESS(N == 4, bg, Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, bb, Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, ba, Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, ar, Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, ag, Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, ab, Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, aa, Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, rrr, Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rrg, Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rrb, Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, rra, Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, rgr, Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, rgg, Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, rgb, Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, rga, Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, rbr, Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, rbg, Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, rbb, Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, rba, Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, rar, Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, rag, Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, rab, Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, raa, Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, grr, Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, grg, Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, grb, Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, gra, Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, ggr, Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, ggg, Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, ggb, Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, gga, Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, gbr, Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, gbg, Indexer(1), 
Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, gbb, Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, gba, Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, gar, Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, gag, Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, gab, Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, gaa, Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, brr, Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, brg, Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, brb, Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, bra, Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, bgr, Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, bgg, Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, bgb, Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, bga, Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, bbr, Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, bbg, Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, bbb, Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, bba, Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, bar, Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, bag, Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, bab, Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, baa, Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, arr, Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, arg, Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, arb, Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, ara, Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, agr, Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, agg, Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, agb, Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, aga, Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, abr, Indexer(3), Indexer(2), 
Indexer(0)) +__SYCL_ACCESS(N == 4, abg, Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, abb, Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, aba, Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, aar, Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, aag, Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, aab, Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, aaa, Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, rrrr, Indexer(0), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rrrg, Indexer(0), Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rrrb, Indexer(0), Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, rrra, Indexer(0), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, rrgr, Indexer(0), Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, rrgg, Indexer(0), Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, rrgb, Indexer(0), Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, rrga, Indexer(0), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, rrbr, Indexer(0), Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, rrbg, Indexer(0), Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, rrbb, Indexer(0), Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, rrba, Indexer(0), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, rrar, Indexer(0), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, rrag, Indexer(0), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, rrab, Indexer(0), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, rraa, Indexer(0), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, rgrr, Indexer(0), Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rgrg, Indexer(0), Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rgrb, Indexer(0), Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, rgra, Indexer(0), Indexer(1), Indexer(0), Indexer(3)) 
+__SYCL_ACCESS(N == 4, rggr, Indexer(0), Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, rggg, Indexer(0), Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, rggb, Indexer(0), Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, rgga, Indexer(0), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, rgbr, Indexer(0), Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, rgbg, Indexer(0), Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, rgbb, Indexer(0), Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, rgba, Indexer(0), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, rgar, Indexer(0), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, rgag, Indexer(0), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, rgab, Indexer(0), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, rgaa, Indexer(0), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, rbrr, Indexer(0), Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rbrg, Indexer(0), Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rbrb, Indexer(0), Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, rbra, Indexer(0), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, rbgr, Indexer(0), Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, rbgg, Indexer(0), Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, rbgb, Indexer(0), Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, rbga, Indexer(0), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, rbbr, Indexer(0), Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, rbbg, Indexer(0), Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, rbbb, Indexer(0), Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, rbba, Indexer(0), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, rbar, Indexer(0), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, rbag, Indexer(0), Indexer(2), Indexer(3), 
Indexer(1)) +__SYCL_ACCESS(N == 4, rbab, Indexer(0), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, rbaa, Indexer(0), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, rarr, Indexer(0), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rarg, Indexer(0), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rarb, Indexer(0), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, rara, Indexer(0), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, ragr, Indexer(0), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, ragg, Indexer(0), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, ragb, Indexer(0), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, raga, Indexer(0), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, rabr, Indexer(0), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, rabg, Indexer(0), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, rabb, Indexer(0), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, raba, Indexer(0), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, raar, Indexer(0), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, raag, Indexer(0), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, raab, Indexer(0), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, raaa, Indexer(0), Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, grrr, Indexer(1), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, grrg, Indexer(1), Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, grrb, Indexer(1), Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, grra, Indexer(1), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, grgr, Indexer(1), Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, grgg, Indexer(1), Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, grgb, Indexer(1), Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, grga, Indexer(1), Indexer(0), 
Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, grbr, Indexer(1), Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, grbg, Indexer(1), Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, grbb, Indexer(1), Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, grba, Indexer(1), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, grar, Indexer(1), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, grag, Indexer(1), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, grab, Indexer(1), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, graa, Indexer(1), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, ggrr, Indexer(1), Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, ggrg, Indexer(1), Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, ggrb, Indexer(1), Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, ggra, Indexer(1), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, gggr, Indexer(1), Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, gggg, Indexer(1), Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, gggb, Indexer(1), Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, ggga, Indexer(1), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, ggbr, Indexer(1), Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, ggbg, Indexer(1), Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, ggbb, Indexer(1), Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, ggba, Indexer(1), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, ggar, Indexer(1), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, ggag, Indexer(1), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, ggab, Indexer(1), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, ggaa, Indexer(1), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, gbrr, Indexer(1), Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, gbrg, Indexer(1), 
Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, gbrb, Indexer(1), Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, gbra, Indexer(1), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, gbgr, Indexer(1), Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, gbgg, Indexer(1), Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, gbgb, Indexer(1), Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, gbga, Indexer(1), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, gbbr, Indexer(1), Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, gbbg, Indexer(1), Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, gbbb, Indexer(1), Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, gbba, Indexer(1), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, gbar, Indexer(1), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, gbag, Indexer(1), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, gbab, Indexer(1), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, gbaa, Indexer(1), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, garr, Indexer(1), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, garg, Indexer(1), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, garb, Indexer(1), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, gara, Indexer(1), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, gagr, Indexer(1), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, gagg, Indexer(1), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, gagb, Indexer(1), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, gaga, Indexer(1), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, gabr, Indexer(1), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, gabg, Indexer(1), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, gabb, Indexer(1), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, gaba, 
Indexer(1), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, gaar, Indexer(1), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, gaag, Indexer(1), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, gaab, Indexer(1), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, gaaa, Indexer(1), Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, brrr, Indexer(2), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, brrg, Indexer(2), Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, brrb, Indexer(2), Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, brra, Indexer(2), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, brgr, Indexer(2), Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, brgg, Indexer(2), Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, brgb, Indexer(2), Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, brga, Indexer(2), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, brbr, Indexer(2), Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, brbg, Indexer(2), Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, brbb, Indexer(2), Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, brba, Indexer(2), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, brar, Indexer(2), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, brag, Indexer(2), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, brab, Indexer(2), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, braa, Indexer(2), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, bgrr, Indexer(2), Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, bgrg, Indexer(2), Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, bgrb, Indexer(2), Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, bgra, Indexer(2), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, bggr, Indexer(2), Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, 
bggg, Indexer(2), Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, bggb, Indexer(2), Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, bgga, Indexer(2), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, bgbr, Indexer(2), Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, bgbg, Indexer(2), Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, bgbb, Indexer(2), Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, bgba, Indexer(2), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, bgar, Indexer(2), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, bgag, Indexer(2), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, bgab, Indexer(2), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, bgaa, Indexer(2), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, bbrr, Indexer(2), Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, bbrg, Indexer(2), Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, bbrb, Indexer(2), Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, bbra, Indexer(2), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, bbgr, Indexer(2), Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, bbgg, Indexer(2), Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, bbgb, Indexer(2), Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, bbga, Indexer(2), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, bbbr, Indexer(2), Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, bbbg, Indexer(2), Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, bbbb, Indexer(2), Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, bbba, Indexer(2), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, bbar, Indexer(2), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, bbag, Indexer(2), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, bbab, Indexer(2), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 
4, bbaa, Indexer(2), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, barr, Indexer(2), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, barg, Indexer(2), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, barb, Indexer(2), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, bara, Indexer(2), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, bagr, Indexer(2), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, bagg, Indexer(2), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, bagb, Indexer(2), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, baga, Indexer(2), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, babr, Indexer(2), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, babg, Indexer(2), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, babb, Indexer(2), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, baba, Indexer(2), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, baar, Indexer(2), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, baag, Indexer(2), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, baab, Indexer(2), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, baaa, Indexer(2), Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, arrr, Indexer(3), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, arrg, Indexer(3), Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, arrb, Indexer(3), Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, arra, Indexer(3), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, argr, Indexer(3), Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, argg, Indexer(3), Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, argb, Indexer(3), Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, arga, Indexer(3), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, arbr, Indexer(3), Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N 
== 4, arbg, Indexer(3), Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, arbb, Indexer(3), Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, arba, Indexer(3), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, arar, Indexer(3), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, arag, Indexer(3), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, arab, Indexer(3), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, araa, Indexer(3), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, agrr, Indexer(3), Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, agrg, Indexer(3), Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, agrb, Indexer(3), Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, agra, Indexer(3), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, aggr, Indexer(3), Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, aggg, Indexer(3), Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, aggb, Indexer(3), Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, agga, Indexer(3), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, agbr, Indexer(3), Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, agbg, Indexer(3), Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, agbb, Indexer(3), Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, agba, Indexer(3), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, agar, Indexer(3), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, agag, Indexer(3), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, agab, Indexer(3), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, agaa, Indexer(3), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, abrr, Indexer(3), Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, abrg, Indexer(3), Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, abrb, Indexer(3), Indexer(2), Indexer(0), Indexer(2)) 
+__SYCL_ACCESS(N == 4, abra, Indexer(3), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, abgr, Indexer(3), Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, abgg, Indexer(3), Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, abgb, Indexer(3), Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, abga, Indexer(3), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, abbr, Indexer(3), Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, abbg, Indexer(3), Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, abbb, Indexer(3), Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, abba, Indexer(3), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, abar, Indexer(3), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, abag, Indexer(3), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, abab, Indexer(3), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, abaa, Indexer(3), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, aarr, Indexer(3), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, aarg, Indexer(3), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, aarb, Indexer(3), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, aara, Indexer(3), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, aagr, Indexer(3), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, aagg, Indexer(3), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, aagb, Indexer(3), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, aaga, Indexer(3), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, aabr, Indexer(3), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, aabg, Indexer(3), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, aabb, Indexer(3), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, aaba, Indexer(3), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, aaar, Indexer(3), Indexer(3), Indexer(3), 
Indexer(0)) +__SYCL_ACCESS(N == 4, aaag, Indexer(3), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, aaab, Indexer(3), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, aaaa, Indexer(3), Indexer(3), Indexer(3), Indexer(3)) + +#endif // #ifdef SYCL_SIMPLE_SWIZZLES + +//__swizzled_vec__ lo()/hi() const; +__SYCL_ACCESS(N == 2, lo, Indexer(0)) +__SYCL_ACCESS(N == 3, lo, Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, lo, Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 8, lo, Indexer(0), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 16, lo, Indexer(0), Indexer(1), Indexer(2), Indexer(3), + Indexer(4), Indexer(5), Indexer(6), Indexer(7)) +__SYCL_ACCESS(N == 2, hi, Indexer(1)) +__SYCL_ACCESS(N == 3, hi, Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, hi, Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 8, hi, Indexer(4), Indexer(5), Indexer(6), Indexer(7)) +__SYCL_ACCESS(N == 16, hi, Indexer(8), Indexer(9), Indexer(10), Indexer(11), + Indexer(12), Indexer(13), Indexer(14), Indexer(15)) +//__swizzled_vec__ odd()/even() const; +__SYCL_ACCESS(N == 2, odd, Indexer(1)) +__SYCL_ACCESS(N == 3, odd, Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, odd, Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 8, odd, Indexer(1), Indexer(3), Indexer(5), Indexer(7)) +__SYCL_ACCESS(N == 16, odd, Indexer(1), Indexer(3), Indexer(5), Indexer(7), + Indexer(9), Indexer(11), Indexer(13), Indexer(15)) +__SYCL_ACCESS(N == 2, even, Indexer(0)) +__SYCL_ACCESS(N == 3, even, Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, even, Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 8, even, Indexer(0), Indexer(2), Indexer(4), Indexer(6)) +__SYCL_ACCESS(N == 16, even, Indexer(0), Indexer(2), Indexer(4), Indexer(6), + Indexer(8), Indexer(10), Indexer(12), Indexer(14)) +#undef __SYCL_ACCESS diff --git a/sycl/include/CL/sycl/types.hpp b/sycl/include/CL/sycl/types.hpp new file mode 100644 index 000000000000..608f9537f578 --- /dev/null +++ b/sycl/include/CL/sycl/types.hpp @@ -0,0 +1,1546 @@ 
+//==---------------- types.hpp --- SYCL types ------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#ifndef __SYCL_DEVICE_ONLY__ +#include +#include +#endif // __SYCL_DEVICE_ONLY__ +// 4.10.1: Scalar data types +// 4.10.2: SYCL vector types + +namespace cl { +namespace sycl { + +enum class rounding_mode { automatic, rte, rtz, rtp, rtn }; +struct elem { + static constexpr int x = 0; + static constexpr int y = 1; + static constexpr int z = 2; + static constexpr int w = 3; + static constexpr int r = 0; + static constexpr int g = 1; + static constexpr int b = 2; + static constexpr int a = 3; + static constexpr int s0 = 0; + static constexpr int s1 = 1; + static constexpr int s2 = 2; + static constexpr int s3 = 3; + static constexpr int s4 = 4; + static constexpr int s5 = 5; + static constexpr int s6 = 6; + static constexpr int s7 = 7; + static constexpr int s8 = 8; + static constexpr int s9 = 9; + static constexpr int sA = 10; + static constexpr int sB = 11; + static constexpr int sC = 12; + static constexpr int sD = 13; + static constexpr int sE = 14; + static constexpr int sF = 15; +}; + +/** + * A signed 8-bit integer. + */ +typedef signed char schar; + +/** + * An unsigned 8-bit integer. + */ +typedef unsigned char uchar; + +/** + * An unsigned 16-bit integer. + */ +typedef unsigned short ushort; + +/** + * An unsigned 32-bit integer. + */ +typedef unsigned int uint; + +/** + * An unsigned 64-bit integer. + */ +typedef unsigned long ulong; + +/** + * An signed integer with width of at least 64-bit. + */ +typedef long long longlong; + +/** + * An unsigned integer with width of at least 64-bit. + */ +typedef unsigned long long ulonglong; + +namespace detail { + +template class OperationCurrentT, int... 
Indexes> +class SwizzleOp; + +template class BaseCLTypeConverter; + +// Element type for relational operator return value. +template +using rel_t = typename std::conditional< + sizeof(DataT) == sizeof(cl_char), cl_char, + typename std::conditional< + sizeof(DataT) == sizeof(cl_short), cl_short, + typename std::conditional< + sizeof(DataT) == sizeof(cl_int), cl_int, + typename std::conditional::type>::type>::type>::type; + +// Special type indicating that SwizzleOp should just read value from vector - +// not trying to perform any operations. Should not be called. +template class GetOp { +public: + DataT getValue(size_t Index) const; + DataT operator()(DataT LHS, DataT Rhs); +}; + +// Special type for working SwizzleOp with scalars, stores a scalar and gives +// the scalar at any index. Provides interface is compatible with SwizzleOp +// operations +template class GetScalarOp { +public: + GetScalarOp(DataT Data) : m_Data(Data) {} + DataT getValue(size_t Index) const { return m_Data; } + +private: + DataT m_Data; +}; + +template struct EqualTo { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs == Rhs) ? -1 : 0; + } +}; + +template struct NotEqualTo { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs != Rhs) ? -1 : 0; + } +}; + +template struct GreaterEqualTo { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs >= Rhs) ? -1 : 0; + } +}; + +template struct LessEqualTo { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs <= Rhs) ? -1 : 0; + } +}; + +template struct GreaterThan { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs > Rhs) ? -1 : 0; + } +}; + +template struct LessThan { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs < Rhs) ? -1 : 0; + } +}; + +template struct LogicalAnd { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs && Rhs) ? 
-1 : 0; + } +}; + +template struct LogicalOr { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs || Rhs) ? -1 : 0; + } +}; + +template struct RShift { + constexpr T operator()(const T &Lhs, const T &Rhs) const { + return Lhs >> Rhs; + } +}; + +template struct LShift { + constexpr T operator()(const T &Lhs, const T &Rhs) const { + return Lhs << Rhs; + } +}; + +} // namespace detail + +template class vec { + // This represent type of underlying value. There should be only one field + // in the class, so vec should be equal to float16 in memory. + using DataType = + typename detail::BaseCLTypeConverter::DataType; + + template + using conditional_t = typename std::conditional::type; + + static constexpr int getNumElements() { return NumElements; } + + // SizeChecker is needed for vec(const argTN &... args) ctor to validate args. + template + struct SizeChecker + : conditional_t {}; + + template + struct SizeChecker + : conditional_t, + std::false_type> {}; + +#define ALLOW_VECTOR_SIZES(num_elements) \ + template \ + struct SizeChecker, tail...> \ + : conditional_t, \ + std::false_type> {}; \ + template class T4, int... T5, \ + class... tail> \ + struct SizeChecker< \ + Counter, MaxValue, \ + detail::SwizzleOp, T2, T3, T4, T5...>, \ + tail...> \ + : conditional_t, \ + std::false_type> {}; \ + template class T4, int... T5, \ + class... tail> \ + struct SizeChecker< \ + Counter, MaxValue, \ + detail::SwizzleOp, T2, T3, T4, T5...>, \ + tail...> \ + : conditional_t, \ + std::false_type> {}; + + ALLOW_VECTOR_SIZES(1) + ALLOW_VECTOR_SIZES(2) + ALLOW_VECTOR_SIZES(3) + ALLOW_VECTOR_SIZES(4) + ALLOW_VECTOR_SIZES(8) + ALLOW_VECTOR_SIZES(16) +#undef ALLOW_VECTOR_SIZES + + template struct conjunction : std::true_type {}; + template + struct conjunction + : conditional_t, B1> {}; + + // TypeChecker is needed for vec(const argTN &... args) ctor to validate args. 
+ template + struct TypeChecker : std::is_convertible {}; +#define ALLOW_VECTOR_TYPES(num_elements) \ + template \ + struct TypeChecker, DataT_> : std::true_type {}; \ + template class T4, int... T5> \ + struct TypeChecker< \ + detail::SwizzleOp, T2, T3, T4, T5...>, DataT_> \ + : std::true_type {}; \ + template class T4, int... T5> \ + struct TypeChecker< \ + detail::SwizzleOp, T2, T3, T4, T5...>, \ + DataT_> : std::true_type {}; + + ALLOW_VECTOR_TYPES(1) + ALLOW_VECTOR_TYPES(2) + ALLOW_VECTOR_TYPES(3) + ALLOW_VECTOR_TYPES(4) + ALLOW_VECTOR_TYPES(8) + ALLOW_VECTOR_TYPES(16) +#undef ALLOW_VECTOR_TYPES + + template + using Swizzle = + detail::SwizzleOp, detail::GetOp, + detail::GetOp, Indexes...>; + + template + using ConstSwizzle = + detail::SwizzleOp, detail::GetOp, + detail::GetOp, Indexes...>; + + // Shortcuts for args validation in vec(const argTN &... args) ctor. + template + using EnableIfSuitableTypes = typename std::enable_if< + conjunction...>::value>::type; + + template + using EnableIfSuitableNumElements = typename std::enable_if< + SizeChecker<0, NumElements, argTN...>::value>::type; + +public: + using element_type = DataT; + using rel_t = detail::rel_t; + +#ifdef __SYCL_DEVICE_ONLY__ + using vector_t = DataType; +#endif + + vec() { m_Data = {0}; } + + vec(const vec &Rhs) : m_Data(Rhs.m_Data) {} + + vec(vec &&Rhs) : m_Data(std::move(Rhs.m_Data)) {} + + vec &operator=(const vec &Rhs) { + m_Data = Rhs.m_Data; + return *this; + } + + // W/o this, things like "vec = vec" doesn't work. + template + typename std::enable_if::value && + std::is_convertible::value, + vec &>::type + operator=(const vec &Rhs) { + *this = Rhs.template as(); + return *this; + } + + explicit vec(const DataT &arg) { + for (int i = 0; i < NumElements; ++i) { + setValue(i, arg); + } + } + + // Constructor from values of base type or vec of base type. Checks that + // base types are match and that the NumElements == sum of lenghts of args. 
+ template , + typename = EnableIfSuitableNumElements> + vec(const argTN &... args) { + vaargCtorHelper(0, args...); + } + + // TODO: Remove, for debug purposes only. + void dump() { +#ifndef __SYCL_DEVICE_ONLY__ + for (int I = 0; I < NumElements; ++I) { + std::cout << " " << I << ": " << m_Data.s[I] << std::endl; + } + std::cout << std::endl; +#endif // __SYCL_DEVICE_ONLY__ + } + +#ifdef __SYCL_DEVICE_ONLY__ + + template ::value && + !std::is_same::value>::type> + vec(vector_t openclVector) : m_Data(openclVector) {} + operator vector_t() const { return m_Data; } +#endif + // Available only when: NumElements == 1 + template + operator typename std::enable_if::type() const { + return m_Data; + } + size_t get_count() const { return NumElements; } + size_t get_size() const { return sizeof(m_Data); } + + // TODO: convert() for FP types. Also, check whether rounding mode handling + // is needed for integers to FP convert. + // template + // vec convert() const; + template + typename std::enable_if::value, + vec>::type + convert() const { + vec Result; + for (size_t I = 0; I < NumElements; ++I) { + Result.setValue(I, static_cast(getValue(I))); + } + return Result; + } + + template + typename std::enable_if::type + as() const { + asT Result; + *static_cast(static_cast(&Result.m_Data)) = m_Data; + return Result; + } + + template Swizzle swizzle() { + return this; + } + + template + ConstSwizzle swizzle() const { + return this; + } + + // Begin hi/lo, even/odd, xyzw, and rgba swizzles. +private: + // Indexer used in the swizzles.def + static constexpr int Indexer(int index) { return index; } + +public: +#ifdef __SYCL_ACCESS_RETURN +#error "Undefine __SYCL_ACCESS_RETURN macro" +#endif +#define __SYCL_ACCESS_RETURN this +#include "swizzles.def" +#undef __SYCL_ACCESS_RETURN + // End of hi/lo, even/odd, xyzw, and rgba swizzles. + + // TODO: make templated address space to work. 
+ // Somehow, access<> to multi_ptr<> conversion doesn't work w/o making + // address space explicitly specified. +#ifdef __SYCL_LOADSTORE +#error "Undefine __SYCL_LOADSTORE macro" +#endif +#define __SYCL_LOADSTORE(Space) \ + void load(size_t Offset, multi_ptr Ptr) { \ + m_Data = *multi_ptr(static_cast( \ + static_cast(Ptr + Offset * NumElements))); \ + } \ + void store(size_t Offset, multi_ptr Ptr) const { \ + *multi_ptr(static_cast( \ + static_cast(Ptr + Offset * NumElements))) = m_Data; \ + } + + __SYCL_LOADSTORE(access::address_space::global_space) + __SYCL_LOADSTORE(access::address_space::local_space) + __SYCL_LOADSTORE(access::address_space::constant_space) + __SYCL_LOADSTORE(access::address_space::private_space) +#undef __SYCL_LOADSTORE + +#ifdef __SYCL_BINOP +#error "Undefine __SYCL_BINOP macro" +#endif + +#ifdef __SYCL_DEVICE_ONLY__ +#define __SYCL_BINOP(BINOP, OPASSIGN) \ + vec operator BINOP(const vec &Rhs) const { \ + vec Ret; \ + Ret.m_Data = m_Data BINOP Rhs.m_Data; \ + return Ret; \ + } \ + template \ + typename std::enable_if::value && \ + std::is_fundamental::value, \ + vec>::type \ + operator BINOP(const T &Rhs) const { \ + return *this BINOP vec(static_cast(Rhs)); \ + } \ + vec &operator OPASSIGN(const vec &Rhs) { \ + *this = *this BINOP Rhs; \ + return *this; \ + } \ + template \ + typename std::enable_if::type operator OPASSIGN( \ + const DataT &Rhs) { \ + *this = *this BINOP vec(Rhs); \ + return *this; \ + } +#else // __SYCL_DEVICE_ONLY__ +#define __SYCL_BINOP(BINOP, OPASSIGN) \ + vec operator BINOP(const vec &Rhs) const { \ + vec Ret; \ + for (size_t I = 0; I < NumElements; ++I) { \ + Ret.setValue(I, (getValue(I) BINOP Rhs.getValue(I))); \ + } \ + return Ret; \ + } \ + template \ + typename std::enable_if::value && \ + std::is_fundamental::value, \ + vec>::type \ + operator BINOP(const T &Rhs) const { \ + return *this BINOP vec(static_cast(Rhs)); \ + } \ + vec &operator OPASSIGN(const vec &Rhs) { \ + *this = *this BINOP Rhs; \ + return *this; 
\ + } \ + template \ + typename std::enable_if::type operator OPASSIGN( \ + const DataT &Rhs) { \ + *this = *this BINOP vec(Rhs); \ + return *this; \ + } +#endif // __SYCL_DEVICE_ONLY__ + + __SYCL_BINOP(+, +=) + __SYCL_BINOP(-, -=) + __SYCL_BINOP(*, *=) + __SYCL_BINOP(/, /=) + + // TODO: The following OPs are available only when: DataT != cl_float && + // DataT != cl_double && DataT != cl_half + __SYCL_BINOP(%, %=) + __SYCL_BINOP(|, |=) + __SYCL_BINOP(&, &=) + __SYCL_BINOP(^, ^=) + __SYCL_BINOP(>>, >>=) + __SYCL_BINOP(<<, <<=) +#undef __SYCL_BINOP +#undef __SYCL_BINOP_HELP + + // Note: vec<>/SwizzleOp logical value is 0/-1 logic, as opposed to 0/1 logic. + // As far as CTS validation is concerned, 0/-1 logic also applies when + // NumElements is equal to one, which is somewhat inconsistent with being + // tranparent with scalar data. + // + // TODO, at least for the device: Use direct comparison on aggregate data, + // e.g., Ret.m_Data = m_Data RELLOGOP Rhs.m_Data, as opposed to looping + // around scalar operations. +#ifdef __SYCL_RELLOGOP +#error "Undefine __SYCL_RELLOGOP macro" +#endif +#define __SYCL_RELLOGOP(RELLOGOP) \ + vec operator RELLOGOP(const vec &Rhs) const { \ + vec Ret; \ + for (size_t I = 0; I < NumElements; ++I) { \ + Ret.setValue(I, -(getValue(I) RELLOGOP Rhs.getValue(I))); \ + } \ + return Ret; \ + } \ + template \ + typename std::enable_if::value && \ + std::is_fundamental::value, \ + vec>::type \ + operator RELLOGOP(const T &Rhs) const { \ + return *this RELLOGOP vec(static_cast(Rhs)); \ + } + + __SYCL_RELLOGOP(==) + __SYCL_RELLOGOP(!=) + __SYCL_RELLOGOP(>) + __SYCL_RELLOGOP(<) + __SYCL_RELLOGOP(>=) + __SYCL_RELLOGOP(<=) + // TODO: limit to integral types. 
+ __SYCL_RELLOGOP(&&) + __SYCL_RELLOGOP(||) +#undef __SYCL_RELLOGOP + +#ifdef __SYCL_UOP +#error "Undefine __SYCL_UOP macro" +#endif +#define __SYCL_UOP(UOP, OPASSIGN) \ + vec &operator UOP() { \ + *this OPASSIGN 1; \ + return *this; \ + } \ + vec operator UOP(int) { \ + vec Ret(*this); \ + *this OPASSIGN 1; \ + return Ret; \ + } + + __SYCL_UOP(++, +=) + __SYCL_UOP(--, -=) +#undef __SYCL_UOP + + template + typename std::enable_if::value, vec>::type + operator~() const { + vec Ret; + for (size_t I = 0; I < NumElements; ++I) { + Ret.setValue(I, ~getValue(I)); + } + return Ret; + } + + vec operator!() const { + vec Ret; + for (size_t I = 0; I < NumElements; ++I) { + Ret.setValue(I, !getValue(I)); + } + return Ret; + } + + // OP is: &&, || + // vec operatorOP(const vec &Rhs) const; + // vec operatorOP(const DataT &Rhs) const; + + // OP is: ==, !=, <, >, <=, >= + // vec operatorOP(const vec &Rhs) const; + // vec operatorOP(const DataT &Rhs) const; +private: + // Generic method that execute "Operation" on underlying values. + template